def _load(self):
    """Load JSON log sequences for every label tag and store their features.

    For each tag in ``self._label_tags`` the directory
    ``<self._source_path>/<tag>`` is listed and sliced with
    ``self._start:self._end``.  When ``self._max_num`` is greater than zero
    the slice is passed through ``sample`` with ``self._max_num``; otherwise
    the whole slice is used.

    Every selected file is parsed as JSON and converted with
    ``self._sequence2feature``.  On success the id (file name up to the
    first ``.`` and then up to the first ``_``), the feature and the label
    index are appended to ``self._data_ids``, ``self._feature_data`` and
    ``self._label_data``.  A file that fails at any step is reported via
    ``log`` and ``print`` and skipped.
    """
    log('[{time}] loading from {path}'.format(time=get_time(),
                                              path=self._source_path))
    for i, label_tag in enumerate(self._label_tags):
        path = os.path.join(self._source_path, label_tag)
        # NOTE(review): os.listdir returns entries in arbitrary order, so the
        # start/end window is not guaranteed to select the same files on
        # every machine -- confirm whether callers rely on a stable order.
        candidates = os.listdir(path)[self._start:self._end]
        if self._max_num > 0:
            files = sample(candidates, self._max_num)
        else:
            files = candidates
        print('start: {}, end: {}'.format(self._start, self._end))
        print(len(files))
        pbar = ProgressBar(len(files))
        for j, filename in enumerate(files):
            filepath = os.path.join(path, filename)
            try:
                with open(filepath, 'r') as f:
                    log_sequence = json.load(f)
                feature = self._sequence2feature(log_sequence)
                # basename instead of split('/') so the id is extracted
                # correctly whatever separator os.path.join used.
                data_id = os.path.basename(filepath).split('.')[0].split('_')[0]
                self._data_ids.append(data_id)
                self._feature_data.append(feature)
                self._label_data.append(i)
            except Exception:
                # Best-effort load: skip the file but keep going.  Catching
                # Exception (not a bare except) lets KeyboardInterrupt and
                # SystemExit still stop a long-running load.
                message = '[{time}] Failed to load file {filepath}'.format(
                    time=get_time(), filepath=filepath)
                log(message)
                print(message)
            pbar.updateBar(j)
def _load(self):
    """Load every JSON feature sequence under each label directory.

    For each tag in ``self._label_tags`` all entries of
    ``<self._source_path>/<tag>`` are parsed as JSON and converted with
    ``self._sequence2feature``.  On success the file name, the feature and
    the label index are appended to ``self._data_ids``,
    ``self._feature_data`` and ``self._label_data``.

    Files that are empty, truncated or otherwise not valid JSON are reported
    on stdout and skipped.
    """
    for i, label_tag in enumerate(self._label_tags):
        path = os.path.join(self._source_path, label_tag)
        print('\nLoading {} data...'.format(label_tag))
        # List the directory once so the progress bar size and the iteration
        # always agree, even if the directory changes while loading.
        filenames = os.listdir(path)
        pbar = ProgressBar(len(filenames))
        for j, filename in enumerate(filenames):
            filepath = os.path.join(path, filename)
            try:
                with open(filepath, 'rb') as f:
                    feature_seq = json.load(f)
                feature = self._sequence2feature(feature_seq)
                self._data_ids.append(os.path.basename(filepath))
                self._feature_data.append(feature)
                self._label_data.append(i)
            except (EOFError, ValueError):
                # json.load signals empty/malformed input with ValueError
                # (JSONDecodeError), never EOFError; EOFError is kept so
                # anything previously tolerated is still tolerated.
                print('[{time}] Failed to load file {filepath}'.format(
                    time=get_time(), filepath=filepath))
            pbar.updateBar(j)
def _load(self):
    """Load features for the ids listed in per-directory trigger files.

    For every label tag in ``self._label_tags`` and every directory in
    ``self._source_path_list`` the ids to load are read from JSON trigger
    files under ``<SAVE_DIR_BASE>/trigger``, named after the last path
    component of the directory (``ds``):

    * ``'pos'``: ids from ``<ds>_pos``.
    * ``'neg'``: ids in ``<ds>_total`` that are not in ``<ds>_pos``, sampled
      with ``random.seed(1)`` down to as many ids as there are positives
      (or all of them when fewer are available).
    * any other tag: ids from ``<ds>_<tag>``, used as-is.

    For ``'pos'`` and ``'neg'`` a non-zero ``self._max_num`` additionally
    caps the number of ids by random sampling.

    Each id is treated as a file name inside the directory, parsed as JSON
    and converted with ``self._sequence2feature``; the file name, feature
    and label index are appended to ``self._data_ids``,
    ``self._feature_data`` and ``self._label_data``.  Files that fail are
    logged and skipped.
    """

    def read_trigger_ids(ds, suffix):
        """Return the JSON content of the ``<ds>_<suffix>`` trigger file."""
        trigger_path = os.path.join(SAVE_DIR_BASE, 'trigger',
                                    '{}_{}'.format(ds, suffix))
        print('Trigger file: {}'.format(trigger_path))
        with open(trigger_path) as f:
            return json.load(f)

    def sample_or_all(ids, max_num):
        """Return ``ids`` unchanged when ``max_num`` is 0, else a sample."""
        if max_num == 0:
            return ids
        # Clamp so a cap larger than the population keeps everything
        # instead of raising "Sample larger than population".
        return random.sample(ids, min(max_num, len(ids)))

    log('[{time}] loading from {path}'.format(time=get_time(),
                                              path=self._source_path_list))
    print(self._label_tags)
    # Iterate over the label tags, e.g. ['neg', 'pos'].
    for i, label_tag in enumerate(self._label_tags):
        print('label_tag: {}'.format(label_tag))
        # Iterate over the date directories [ds_1, ds_2, ...].
        for source_path in self._source_path_list:
            print('Loading data from DIR: {}'.format(source_path))
            ds = os.path.basename(source_path)

            if label_tag not in ('pos', 'neg'):
                # Ids to run prediction on: taken verbatim, never sampled.
                ids = read_trigger_ids(ds, label_tag)
            else:
                # Positive sample ids (also needed to derive negatives).
                ids_pos = read_trigger_ids(ds, 'pos')
                if label_tag == 'pos':
                    ids = sample_or_all(ids_pos, self._max_num)
                else:
                    # Negative sample ids: everything that is not positive.
                    ids_total = read_trigger_ids(ds, 'total')
                    # Sort before sampling: set iteration order for strings
                    # differs between processes, which would defeat the
                    # fixed seed below.  Assumes ids are mutually orderable
                    # (they are used as file names) -- TODO confirm.
                    candidates = sorted(set(ids_total) - set(ids_pos))
                    random.seed(1)
                    ids_neg = random.sample(
                        candidates, min(len(ids_pos), len(candidates)))
                    ids = sample_or_all(ids_neg, self._max_num)

            # Iterate over the behaviour sequences and extract features.
            print('label: {}, num: {}'.format(label_tag, len(ids)))
            pbar = ProgressBar(len(ids))
            for j, filename in enumerate(ids):
                filepath = os.path.join(source_path, filename)
                try:
                    with open(filepath, 'r') as f:
                        log_sequence = json.load(f)
                    feature = self._sequence2feature(log_sequence)
                    self._data_ids.append(os.path.basename(filepath))
                    self._feature_data.append(feature)
                    self._label_data.append(i)
                except Exception as e:
                    # Best-effort load: report the file and carry on.
                    message = '[{time}] Failed to load file {filepath}'.format(
                        time=get_time(), filepath=filepath)
                    log(message)
                    print(message, e)
                pbar.updateBar(j)