def finish(self):
    """Flush every staged CAT transaction and clear the stage list."""
    pending = len(self.__stage_list)
    if pending > 0:
        logger().info('cat finish, flush %d transactions', pending)
        # Complete in reverse of staging order (LIFO), as the original did
        # with the [::-1] slice.
        for txn in reversed(self.__stage_list):
            txn.complete()
        self.__stage_list = []
def track_location_set(track_id, location):
    """Persist the HDFS location for *track_id* in the track-location table.

    Writes the single cell 'location:hdfs_path' and returns the result of
    the underlying HBase put (truthy on success).
    """
    table_name = get_track_location_table()
    cells = [('location', 'hdfs_path', location)]
    ret = hbase_util.put_cols(table_name, track_id, cells)
    logger().info('write hbase rowkey[%s] -> column[%s] content[%s]',
                  track_id, 'location:hdfs_path', location)
    return ret
def kds_get(batch):
    """Fetch the raw 'kds:data' cell stored for *batch*.

    Returns the cell content, or None when the row/column is absent.
    """
    row = kds_rowkey.gen(batch)
    table = get_table_name()
    result = hbase_util.get_col(table, row, 'kds', 'data')
    logger().info('seek data for row[%s]', row)
    return result['kds:data'] if result else None
def track_batch_set(track_id, batch):
    """Record *batch* as the newest batch for *track_id*.

    Writes the single cell 'batch:newest' and returns the result of the
    underlying HBase put (truthy on success).
    """
    table_name = get_track_batch_table()
    cells = [('batch', 'newest', batch)]
    ret = hbase_util.put_cols(table_name, track_id, cells)
    logger().info('write hbase rowkey[%s] -> column[%s] content[%s]',
                  track_id, 'batch:newest', batch)
    return ret
def kds_save(batch, data):
    """Store *data* in the 'kds:data' cell for *batch*.

    Returns True when the HBase put succeeds, False otherwise.
    """
    row = kds_rowkey.gen(batch)
    table = get_table_name()
    logger().info('save data for row[%s], table[%s]', row, table)
    return bool(hbase_util.put_cols(table, row, [('kds', 'data', data)]))
def get(table, row, column_family=None, column=None):
    # Read one HBase row (optionally narrowed to a family/column) with
    # retries: each failed attempt logs the exception and re-establishes
    # the thrift connection before trying again.
    # NOTE(review): no return statement is visible in this chunk -- the
    # body appears truncated here (compare put_cols elsewhere in this
    # file, which ends with an "if not succ" log and "return succ").
    succ = False
    for i in range(retry_times):
        try:
            ret = _get(table, row, column_family, column)
            succ = True
            break
        except Exception, e:  # pylint: disable=W0703
            logger().error('exception:[%s]', str(e))
            thrift_reconn()
def put_cols(table, row, cols):  # cols: list of (column_family, column, data)
    # Retry wrapper around _put_cols: up to retry_times attempts, with the
    # thrift connection re-established after each failure.
    # NOTE(review): the trailing "if not succ ... return succ" seen in the
    # other copy of this function is missing here -- this chunk looks
    # truncated.
    succ = False
    for i in range(retry_times):
        try:
            ret = _put_cols(table, row, cols)
            succ = True
            break
        except Exception, e:  # pylint: disable=W0703
            logger().error('exception:[%s]', str(e))
            thrift_reconn()
def put_col(table, row, column_family, column, content):
    # Retry wrapper around the single-cell _put_col: up to retry_times
    # attempts, reconnecting the thrift client after each failure.
    # NOTE(review): no return is visible in this chunk -- the body appears
    # truncated (the sibling wrappers end with "return succ").
    succ = False
    for i in range(retry_times):
        try:
            ret = _put_col(table, row, column_family, column, content)
            succ = True
            break
        except Exception, e:
            logger().error('exception:[%s]', str(e))
            thrift_reconn()
def new_fun(*args, **kw):
    # Retry wrapper produced by a decorator/closure: 'fun', 'fun_name',
    # 'obj' and 'recover' come from the enclosing scope (not visible in
    # this chunk). Calls the wrapped callable up to 3 times; on failure it
    # logs the traceback, sleeps 3s, runs recover(), then re-binds the
    # (possibly re-created) attribute '__<fun_name>' from obj and retries.
    # NOTE(review): after 3 failed attempts it falls through and returns
    # None rather than re-raising.
    arg = args  # unused alias -- kept as-is (doc-only review)
    f = fun
    for i in range(3):
        try:
            return f(*args, **kw)
        except Exception, e:
            logger().warning(traceback.format_exc())
            time.sleep(3)
            recover()
            f = getattr(obj, '__' + fun_name)
            continue
def track_batch_get(track_id):
    """Look up the newest batch id recorded for *track_id*.

    Returns the 'batch:newest' cell content, or None when the row is
    absent.
    """
    table_name = get_track_batch_table()
    row = hbase_util.get(table_name, track_id,
                         column_family='batch', column='newest')
    if not row:
        logger().info('rowkey[%s] not found', track_id)
        return None
    newest = row['batch:newest']
    logger().info(
        'get hbase rowkey[%s] column[batch:newest] -> content[%s]',
        track_id, newest)
    return newest
def get(self):
    """HTTP GET entry point: dispatch to process() when FLAG().get allows.

    CAT agent state is reset before handling and finished afterwards,
    even when process() raises.
    """
    cat_helper.agent_reset()
    try:
        if not FLAG().get:
            msg = 'get method not supported'
            logger().error(msg)
            self.write(msg)
            self.set_status(404)
            return
        return self.process()
    finally:
        cat_helper.agent_finish()
def track_location_get(track_id):
    """Look up the HDFS location recorded for *track_id*.

    Returns the 'location:hdfs_path' cell content, or None when the row
    is absent.
    """
    table_name = get_track_location_table()
    row = hbase_util.get(table_name, track_id,
                         column_family='location', column='hdfs_path')
    if not row:
        logger().info('rowkey[%s] not found', track_id)
        return None
    path = row['location:hdfs_path']
    logger().info(
        'get hbase rowkey[%s] column[location:hdfs_path]-> content[%s]',
        track_id, path)
    return path
def image_exist(track_id, tpid, tpe, seq, image_type, batch=''):
    """Return True when an image cell exists for the given key.

    With no *batch* given, existence is decided by whether any batch is
    recorded for the key. With an explicit *batch*, the content cell
    itself is probed -- except for type '00' seq '004', which is always
    reported missing (behavior kept from the original; the reason is not
    visible in this file).
    """
    row_key = rowkey.gen(track_id, tpid, tpe, seq, image_type)
    logger().info('row_key[%s], batch[%s]', row_key, batch)
    if not batch:
        last_batch = batch_manager.get_last_batch(track_id, tpid, tpe,
                                                  seq, image_type)
        # was "last_batch != None" -- identity comparison is the idiom
        return last_batch is not None
    if tpe == '00' and seq == '004':
        return False
    table = get_table_name()
    ret = hbase_util.get_col(table, row_key, 'content', batch)
    return bool(ret)  # was the "not not ret" hack
def image_delete(track_id, tpid, tpe, seq, image_type, batch=''):
    """Delete the image content cell for the key (latest batch if none
    given) and drop the batch from the batch list.

    Returns False when no batch can be resolved, True otherwise.
    NOTE: the result of the HBase delete itself is not checked (as in
    the original).
    """
    row_key = rowkey.gen(track_id, tpid, tpe, seq, image_type)
    if not batch:
        batch = batch_manager.get_last_batch(track_id, tpid, tpe, seq,
                                             image_type)
        if not batch:
            logger().error('get empty batch for row_key[%s]', row_key)
            return False
    logger().debug('delete row [%s], batch[%s]', row_key, batch)
    hbase_util.delete(get_table_name(), row_key,
                      columns=['content:' + batch])
    batch_manager.del_batch(track_id, tpid, tpe, seq, image_type, batch)
    return True
def del_batch(track_id, tpid, tpe, seq, image_type, batch):
    """Remove *batch* from the key's batch list and write the list back.

    Returns the result of the HBase put of 'meta:batch_list'.
    """
    batch_list = get_batch_list(track_id, tpid, tpe, seq, image_type)
    logger().debug('old batch_list:%s', batch_list)
    batch_list = _batch_del(batch_list, batch) if batch_list else ''
    logger().debug('new batch_list:%s', batch_list)
    # Persist the updated list under the v2 row key.
    row = rowkey.gen_v2(track_id, tpid, tpe, seq, image_type)
    return hbase_util.put_cols(get_table_name(), row,
                               [('meta', 'batch_list', batch_list)])
def image_get(track_id, tpid, tpe, seq, image_type, batch=None):
    """Read the image content cell for the key.

    Falls back to the last recorded batch when *batch* is not given.
    Returns the cell content, or None when no batch or no content exists.
    """
    row_key = rowkey.gen(track_id, tpid, tpe, seq, image_type)
    if not batch:
        batch = batch_manager.get_last_batch(track_id, tpid, tpe, seq,
                                             image_type)
        if not batch:
            logger().error('get empty batch for row_key[%s]', row_key)
            return None
    table = get_table_name()
    logger().debug('row [%s], batch[%s]', row_key, batch)
    cell = hbase_util.get_col(table, row_key, 'content', batch)
    return cell['content:' + batch] if cell else None
def image_write(track_id, tpid, tpe, seq, image_type, content, task_seq,
                batch):
    """Write image metadata, then the content cell, into their tables.

    Returns False as soon as either HBase put fails, True when both
    succeed.
    """
    row_key = rowkey.gen(track_id, tpid, tpe, seq, image_type)
    meta_cols = [
        ('meta', 'type', tpe),
        ('meta', 'seq', seq),
        ('meta', 'image_type', image_type),
        ('meta', 'task_seq_' + task_seq, batch),
    ]
    if not hbase_util.put_cols(batch_manager.get_table_name(), row_key,
                               meta_cols):
        return False
    table = get_table_name()
    logger().info('write row_key[%s] to table[%s]', row_key, table)
    return bool(hbase_util.put_cols(table, row_key,
                                    [('content', batch, content)]))
def run(self):
    """Main loop: stream FLAG().raw_data through process_line().

    Exits the process (status 1) when no raw_data path is configured;
    otherwise feeds each stripped line to process_line() and finishes
    with post_process().
    """
    fpath = FLAG().raw_data
    if not fpath:
        logger().error('bad raw_data path[%s]', fpath)
        sys.exit(1)
    logger().info('raw_data path[%s]', fpath)
    # 'with' guarantees the handle is closed even if process_line raises
    # (the original leaked it). The commented-out skip-first-line
    # experiment and the unused 'cnt' counter were dead code and have
    # been removed.
    with open(fpath) as fin:
        for line in fin:
            self.process_line(line.strip())
    self.post_process()
def process_line(self, line):
    """Parse one tab-separated raw-data line and hand it to process().

    Lines that do not split into exactly 12 fields are logged, counted
    under 'error line', and skipped.
    """
    segs = line.split('\t')
    if len(segs) != 12:
        logger().warning('error line[%s]', line)
        counter.inc('error line')
        return
    # Field order mirrors the upstream dump format; dict(zip(...))
    # replaces the original twelve manual assignments.
    fields = ('content_id', 'program_id', 'pack_id', 'hits', 'cont_name',
              'cont_recomm', 'publish_time', 'cont_type',
              'cont_type_name', 'media_shape', 'keywords', 'source')
    self.process(dict(zip(fields, segs)))
def image_upload(track_id, tpid, tpe, seq, image_type, content,
                 task_seq='!!!', batch='!!!'):
    """Write an image and record its batch, timing both stages.

    Falsy task_seq/batch values are normalized to the '!!!' placeholder.
    Returns True only when both the content write and the batch update
    succeed.
    """
    task_seq = task_seq or '!!!'
    batch = batch or '!!!'
    _timer = Timer()
    _timer.stage_begin('image write')
    if not image_write(track_id, tpid, tpe, seq, image_type, content,
                       task_seq, batch):
        return False
    _timer.stage_begin('set batch')
    if not batch_manager.set_batch(track_id, tpid, tpe, seq, image_type,
                                   batch):
        return False
    _timer.finish()
    logger().debug(_timer.dump())
    return True
def image_get(self, track_id, tpid):
    """Fetch a jpg for *tpid* by running the HDFS004Store hadoop job.

    The job writes a local file './<tpid>' whose last comma-separated
    field is base64 image data; that field is decoded, the temp file is
    removed, and the jpg bytes are returned. Returns None when task info
    is missing or the job exits non-zero.
    """
    task_id = track_id.split('_')[0]
    task_seq, num_reducer = self.get_task_info(task_id)
    logger().info('task_seq:%s, num_reducer:%s', task_seq,
                  str(num_reducer))
    if not task_seq or not num_reducer:
        return None
    cmd = '/opt/hadoop-3.1.0/bin/hadoop jar ' + \
          'jar/HDFS004Store.jar kd.mapreduce.Hdfs004Store2 '
    arg = '%s %s %d %s' % (tpid, task_seq, num_reducer, './' + tpid)
    cmd = "%s %s" % (cmd, arg)
    logger().info('cmd:%s', cmd)
    # Yoda comparison kept from the original; directive fixed from the
    # malformed "disable C0122".
    ret = 0 == os.system(cmd)  # pylint: disable=C0122
    if not ret:
        return None
    # 'with' closes the handle (the original leaked it).
    with open('./' + tpid) as fin:
        data = fin.read()
    jpg = base64.b64decode(data.split(',')[-1])
    os.remove('./' + tpid)
    logger().info('jpg size:%d', len(jpg))
    return jpg
def api_v2_main():
    """Batch-upload '004' jpgs listed on stdin.

    Each stdin line is "track_id tpid batch". The image is read from the
    fixed staging directory and uploaded via image_upload(); success and
    failure are logged per tpid, and unreadable/missing files are logged
    and skipped.
    """
    import sys  # os/base64 from the original were unused and removed
    for line in sys.stdin:
        track_id, tpid, batch = line.strip().split()
        pic1 = '%s_00_004.jpg' % tpid
        pic = '/mnt/data1/mongo/jingjingg_004/%s_00_004.jpg' % tpid
        tpe = "00"
        seq = "004"
        image_type = "jpg"
        try:
            with open(pic) as f:
                content = f.read().strip()
            res = image_upload(track_id, tpid, tpe, seq, image_type,
                               content)
            if res:
                logger().info('%s is succes upload,image is %s', tpid,
                              pic1)
            else:
                logger().info('%s is failed upload,image is %s', tpid,
                              pic1)
        # was a bare "except:" -- don't swallow SystemExit/KeyboardInterrupt
        except Exception:
            logger().info("has empty pic, image is %s" % pic1)
def image_stat(track_id, tpe, seq_list, image_type):
    # Scan the meta table over all rows of *track_id* and collect, per seq
    # in *seq_list*, the tpids of rows matching *tpe*/*image_type*.
    # NOTE(review): each "bad ..." log call indexes data['meta:...'] even
    # on the branch where the guard just allowed the key to be absent --
    # that path raises KeyError instead of logging. Consider data.get().
    conn = hbase_util.get_thrift_conn()
    table_name = batch_manager.get_table_name()
    logger().info('scan table:%s', table_name)
    table = conn.table(table_name)
    # '~' sorts after the digits/underscores used in row keys, bounding
    # the scan to this track's prefix.
    start_key = '%d-%s' % (rowkey.compute_magic(track_id), track_id)
    ret = {}
    for seq in seq_list:
        ret[seq] = []
    for row_data in table.scan(row_start=start_key,
                               row_stop=start_key + '~', batch_size=300):
        row_key = row_data[0]
        data = row_data[1]
        if 'meta:type' not in data or tpe != data['meta:type']:
            logger().info('bad type[%s], rowkey[%s]', data['meta:type'],
                          row_key)
            continue
        if 'meta:seq' not in data or data['meta:seq'] not in seq_list:
            logger().info('bad seq [%s], rowkey[%s]', data['meta:seq'],
                          row_key)
            continue
        if 'meta:image_type' not in data or \
                image_type != data['meta:image_type']:
            logger().info('bad image_type[%s], rowkey[%s]',
                          data['meta:image_type'], row_key)
            continue
        if 'meta:batch_list' not in data or not data['meta:batch_list']:
            logger().info('empty batch_list for rowkey[%s]', row_key)
        # NOTE(review): in the flattened source it is ambiguous whether
        # the two lines below belong inside the empty-batch_list branch
        # (stat = rows MISSING content) or run for every surviving row;
        # no 'continue' precedes them, so fall-through is assumed here --
        # confirm against the original file.
        tpid = '_'.join(row_key.split('-')[2].split('_')[0:2])
        ret[data['meta:seq']].append(tpid)
    for key, val in ret.items():
        logger().info('stat result %s:%d', key, len(val))
    return ret
def kds_del(batch):
    """Delete the whole HBase row for *batch*; returns the delete result."""
    row = kds_rowkey.gen(batch)
    logger().info('delete rowkey[%s]', row)
    return hbase_util.delete(get_table_name(), row)
def process(self):
    # Abstract hook: subclasses must override. Reaching this default
    # implementation logs an error (message kept verbatim, typo and all)
    # and terminates the whole process with status -1.
    logger().error('not implement')
    sys.exit(-1)
def __init__(self):
    # Start the CAT monitoring client under the fixed app id "kd.hbase".
    logger().info('cat init')
    cat.init("kd.hbase")
# NOTE(review): this chunk begins mid-function -- the two indented lines
# below are the tail of a _put_cols-style helper whose 'def' line is not
# visible in this view.
    table.put(row, col_dict, wal=True)
    return True


def put_cols(table, row, cols):  # cols: (column_family, column, data) tuples
    # Retry wrapper: attempts _put_cols up to retry_times, reconnecting
    # the thrift client after each failure, and logs when every attempt
    # has failed. Returns the success flag.
    succ = False
    for i in range(retry_times):
        try:
            ret = _put_cols(table, row, cols)
            succ = True
            break
        except Exception, e:  # pylint: disable=W0703
            logger().error('exception:[%s]', str(e))
            thrift_reconn()
    if not succ:
        logger().info('put col failed %d times, table[%s], rowkey[%s]',
                      retry_times, table, row)
    return succ


def _put_col(table, row, column_family, column, content):
    # Low-level single-cell put; note 'table' is rebound from the table
    # NAME to the thrift table OBJECT.
    cf = '%s:%s' % (column_family, column)
    conn = get_thrift_conn()
    table = conn.table(table)
    table.put(row, {cf: content}, wal=True)
    return True


def put_col(table, row, column_family, column, content):
    # Retry wrapper around _put_col.
    # NOTE(review): the body is cut off after 'try:' -- this definition
    # continues past the end of this chunk.
    succ = False
    for i in range(retry_times):
        try:
def recover(self):
    # Failover hook: ask the sentinel for the current master and log the
    # switch. NOTE(review): only the lookup and the log are visible in
    # this chunk -- any actual reconnection logic is not shown here.
    master = self.__get_master_from_sentinel()
    logger().info('switch master to [%s]', master)