def stat_news(db, slice_size=1000, _limit=None, _suffix='.count'):
    all_ids, num_ids = get_valid_ids(db, config.TABLE_NEWS, 'id')
    number_news = db.count(config.TABLE_NEWS)
    tag_map = init_stat_map(db, config.TABLE_TAG)
    category_map = init_stat_map(db, config.TABLE_CATEGORY)
    logger.info('#{:d} news in {}-{}'.format(
        number_news, str(db), config.TABLE_NEWS))
    _limit = number_news if not _limit else min(_limit, number_news)
    # Append a sentinel id so the last slice has a valid exclusive upper bound.
    all_ids.append(all_ids[-1] + 1)
    logger.info('#{:d} will be parsed'.format(_limit))
    for idx_start in xrange(0, _limit, slice_size):
        idx_end = min(idx_start + slice_size, _limit)
        id_start = all_ids[idx_start]
        id_end = all_ids[idx_end]
        query = 'SELECT tags, categories FROM {} WHERE id>={} and id<{}'.format(
            config.TABLE_NEWS, id_start, id_end)
        cur_slice = db.execute(query)
        for tags, categories in cur_slice:
            stat_item(tag_map, tags)
            stat_item(category_map, categories, _drop_tail=True)
        logger.info('#{:6d}/{:d} stated'.format(idx_end, _limit))
    tag_file = config.pjoin(config.DATA_DIR, 'stat', 'tag' + _suffix)
    category_file = config.pjoin(config.DATA_DIR, 'stat',
                                 'category' + _suffix)
    util.save_map(tag_map, tag_file, 'tag_stat')
    util.save_map(category_map, category_file, 'category_stat')
def get_segmentation_file(table_name, column_name, cut_all=True):
    sub_directory = '{}_segment'.format(table_name)
    sub_directory = pjoin(config.DATA_DIR, sub_directory)
    util.check_directory(sub_directory)
    segment_type = config.SEGMENT_FULL if cut_all else config.SEGMENT_PRECISE
    filename = '_'.join([table_name, column_name, segment_type])
    return pjoin(sub_directory, filename)
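# Hedged usage sketch (not part of the original source; the argument values
# are illustrative assumptions). With cut_all=False the precise segmenter is
# selected, yielding a path such as
# <DATA_DIR>/mp_news_segment/mp_news_content_precise.
if __name__ == '__main__':
    print get_segmentation_file('mp_news', 'content', cut_all=False)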
def test_get_path(self):
    path1_gen = config.get_path('result')
    path1_true = config.pjoin(config.DATA_DIR, 'result')
    self.assertEqual(path1_gen, path1_true)
    path2_gen = config.get_path('result', 'subdir')
    path2_true = config.pjoin(config.DATA_DIR, 'result', 'subdir')
    self.assertEqual(path2_gen, path2_true)
def test_stat_news(self):
    _suffix = '.test'
    stat_news.stat_news(self.db_local_large, _limit=1000, _suffix=_suffix)
    tag_file = config.pjoin(config.DATA_DIR, 'stat', 'tag' + _suffix)
    category_file = config.pjoin(config.DATA_DIR, 'stat',
                                 'category' + _suffix)
    self.assertTrue(exists(tag_file))
    self.assertTrue(exists(category_file))
def test_detect_region(self):
    __affine__, __thres__ = config.HESSIAN_AFFINE
    _from = config.pjoin(config.DATA_TEST, 'ppm')
    _to = config.pjoin(config.DATA_TEST)
    subprocess.call([
        config.AFFINE_DETECTOR, '-{}'.format(__affine__),
        '-i', _from, '-o', _to,
        '-thres', str(__thres__)  # subprocess arguments must be strings
    ])
def draw_result_curve(all_evaluation, _classes=None, _useable=False):
    def save_plt_fig(_path, ext='png', close=True):
        if not _path.endswith(ext):
            _path = '{}.{}'.format(_path, ext)
        util.check_directory(os.path.dirname(_path))
        plt.savefig(_path)
        if close:
            plt.close()
        logger.info('image saved to: %s', _path)

    if _classes is None:
        _classes = encoder.classes_
    if type(_classes) is not list:
        _classes = list(_classes)
    _classes.append('All')
    useable_flag = 'useable' if _useable else 'all'
    plot_groups = [['hit', 'fp', 'miss'],
                   ['precision', 'recall', 'error_rate'],
                   ['gt_class_count', 'pred_class_count']]
    sub_dir = config.pjoin(
        config.RESULT_DIR,
        'back_test_figures_{}_{}'.format(useable_flag, args.date))
    util.check_directory(sub_dir)
    n_class = len(_classes)
    n_plot_groups = len(plot_groups)
    for _ in range(n_class):
        _class = _classes[_]
        plt.title('class={}'.format(_class.encode('utf8')))
        plt.xlabel('threshold')
        for i_plot in range(n_plot_groups):
            c_count = 0
            plots = []
            plot_group = plot_groups[i_plot]
            plt.rcParams["figure.figsize"] = [24.0, 10.0]
            plt.subplot(1, n_plot_groups, i_plot + 1)
            # plt.subplots_adjust(left=1.0, right=1.0, bottom=3.0, top=3.0)
            colors = []
            for type_ in plot_group:
                m_cur = [all_evaluation[t].get(type_)[_] for t in thresholds]
                plots.append(plt.plot(thresholds, m_cur,
                                      PLOT_COLORS[c_count]))
                colors.append('{}: {}'.format(PLOT_COLORS[c_count], type_))
                c_count += 1
            plt.xlabel('\n'.join(colors))
            plt.ylabel('/'.join(plot_group))
            plt.grid(True)
            # plt.legend(plots, plot_group,
            #            loc='lower left', numpoints=1)
        _file = config.pjoin(sub_dir, '{:02d}_{}.png'.format(_ + 1, _class))
        save_plt_fig(_file)
def clean_segmentation(_file=None, _type=None, _segment_type=None):
    if not _file:
        if not _type or not _segment_type:
            logger.error('Either _file or both _type and _segment_type '
                         'must be given to locate the segmentation file.')
            return
        _filename = 'mp_news_{}_{}'.format(_type, _segment_type)
        _file = config.pjoin(config.DATA_DIR, 'mp_news_segment', _filename)
    logger.info('cleaning {}'.format(_file))
    _keys, _rows = data_util.load_news_area_info(db_local_large)
    num_rows = len(_rows)
    slice_size = num_rows // 100
    logger.info('#{} rows got from {}'.format(num_rows, str(db_local_large)))
    cleaned_file = _file + '_cleaned'
    f_cleaned = open(cleaned_file, 'wb')
    count = 0
    for _mp_id, _words in load_segmentation(_file):
        count += 1
        f_cleaned.write(str(_mp_id))
        f_cleaned.write('\t')
        f_cleaned.write((' '.join(_words)).encode('utf-8'))
        f_cleaned.write(__line_sep__)
        if count % slice_size == 0:
            logger.info('{:=6d}[{:3d}%] done.'.format(
                count, count / slice_size))
            f_cleaned.flush()
    f_cleaned.close()
def get_model_path(model_set, model_name):
    """
    Get model path by model set and model name
    :param model_set: model collection under ${FASTER_RCNN_ROOT}/data,
        e.g. 'fast' or 'faster'
    :param model_name: model name without any suffix, e.g. vgg
    :return: model path and corresponding proto file path,
        or (None, None) if no matching model is found
    """
    model_root = pjoin(config.FASTER_RCNN_DIR, 'data')
    if model_set in ('fast', 'faster'):
        model_set += '_rcnn'
    model_set_dir = pjoin(model_root, '{}_models'.format(model_set))
    models = os.listdir(model_set_dir)
    for model in models:
        if model.lower().startswith(model_name.lower()):
            model_path = pjoin(model_set_dir, model)
            proto_path = pjoin(cfg.MODELS_DIR, model_name.upper(),
                               'faster_rcnn_alt_opt', 'faster_rcnn_test.pt')
            return model_path, proto_path
    return None, None
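# Hedged usage sketch (not part of the original source; 'faster' and 'vgg'
# follow the docstring's example and may not match the models actually
# installed under ${FASTER_RCNN_ROOT}/data).
if __name__ == '__main__':
    model_path, proto_path = get_model_path('faster', 'vgg')
    print model_path, proto_path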
def load_stop_words(_stop_list='chinese', decode=None):
    _filename = '{}_stop_words.txt'.format(_stop_list)
    _stop_words_file = config.pjoin(config.DATA_DIR, 'dic', _filename)
    if decode:
        lines = [line.strip().decode('utf-8')
                 for line in open(_stop_words_file, 'rb')]
    else:
        lines = [line.rstrip() for line in open(_stop_words_file, 'rb')]
    return set(lines)
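# Hedged usage sketch (not part of the original source). Loads the default
# Chinese stop-word list as unicode strings; the data file
# <DATA_DIR>/dic/chinese_stop_words.txt is assumed to exist.
if __name__ == '__main__':
    stop_words = load_stop_words(decode=True)
    print len(stop_words)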
def walk_wrapper(path_from, path_to, _ext=None):
    path_from = os.path.abspath(path_from)
    path_to = os.path.abspath(path_to)
    if path_from[-1] != os.sep:
        path_from += os.sep
    logger.info('Walk in {}'.format(path_from))
    logger.info('Results to {}'.format(path_to))
    _prefix = os.path.commonprefix([os.path.split(path_from),
                                    os.path.split(path_to)])
    len_prefix = len(os.path.sep.join(_prefix))
    logger.info('Common prefix: {}'.format(_prefix))
    for cur_dir, dir_list, file_list in os.walk(path_from):
        dir_name = cur_dir[len(path_from):]
        to_dir = pjoin(path_to, dir_name)
        check_directory(to_dir)
        logger.info('Found directory: {}'.format(cur_dir))
        for _file in file_list:
            _from_file = pjoin(cur_dir, _file)
            if not _ext:
                _to_file = pjoin(to_dir, _file)
            else:
                _filename, _origin_ext = os.path.splitext(_file)
                _to_file = pjoin(to_dir, '{}.{}'.format(_filename, _ext))
            if os.path.exists(_to_file):
                logger.info('skip for exists: {}'.format(_to_file))
                continue
            try:
                func(_from_file, _to_file)
                logger.info('[{}] from {} to {}'.format(
                    func.func_name,
                    '${{FROM}}{}'.format(_from_file[len_prefix:]),
                    '${{TO}}{}'.format(_to_file[len_prefix:])))
            except Exception, e:
                logger.info('Failed [{}] {}, error msg: {}'.format(
                    func.func_name, _from_file, str(e)))
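# Hedged reading (assumption, not from the original source): walk_wrapper
# references `func` as a free variable, so it is presumably the inner function
# returned by the @util.walk decorator used elsewhere in this repo, roughly:
#
# def walk(func):
#     def walk_wrapper(path_from, path_to, _ext=None):
#         ...  # body as above, calling func(_from_file, _to_file)
#     return walk_wrapper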
def evaluate(train_samples, train_labels, test_samples, test_labels,
             classifier, f):
    count_values(train_labels)
    count_values(test_labels)
    classifier.fit(train_samples, train_labels)
    test_pred = classifier.predict(test_samples)
    experiment_util.calculate_result(test_labels, test_pred, f)


if __name__ == '__main__':
    util.check_directory(config.RESULT_DIR)
    args = parse_args()
    __filename = 'news_info_useable'
    news_info_file = config.pjoin(config.DATA_DIR, 'news', __filename)
    news_id_index, news_mp_index, id_map = config.News.load_info(
        news_info_file)
    mp_ids = sorted(news_mp_index.keys())
    logger.info('number of mp_ids: {}'.format(len(mp_ids)))
    if args.dry_run:
        mp_ids = random.sample(mp_ids, 200)
    logger.info('Arguments: {}'.format(args))
    experiment_manager()
def run_default():
    for _dataset in config.DATASETS:
        _from = pjoin(config.DATA_ROOT, _dataset)
        _to = pjoin(config.DATA_REGION_FEATURE_ROOT, _dataset)
        detect_region(_from, _to, __suffix__)
def evaluate(gt, pred, f=None):
    _name = 'back_test_evaluation_{}_{}_{}.txt'.format(
        args.date, args.limit, get_time_str())
    util.check_directory(config.RESULT_DIR)
    f = open(config.pjoin(config.RESULT_DIR, _name), 'wb')

    def count_values(_values):
        counter = Counter(_values)
        logger.info(range(__n_class__))
        res = [counter[_] for _ in range(__n_class__)]
        res.append(sum(res))
        logger.info(res)
        return res

    def ilog(content):
        logger.info(content)
        iwrite(f, content)

    def _evaluate(cur_gt, cur_pred):
        num_gt, num_pred = len(cur_gt), len(cur_pred)
        ilog('#gt: {}'.format(num_gt))
        ilog(range(__n_class__))
        ilog(num_samples_by_class)
        ilog('#pred: {}'.format(num_pred))
        counter_pred = count_values(cur_pred.values())
        # ilog(counter_pred)
        res = ResultHolder(hit=[0] * __n_class__,
                           fp=[0] * __n_class__,
                           miss=[0] * __n_class__)
        hit, fp, miss = res.hit, res.fp, res.miss
        for k in cur_pred:
            if k in cur_gt:
                _gt_k = cur_gt.pop(k)
                if _gt_k == cur_pred[k]:
                    hit[_gt_k] += 1
                else:
                    miss[_gt_k] += 1
                    fp[cur_pred[k]] += 1
        for v in cur_gt.values():
            miss[v] += 1
        ilog('common prediction: {}'.format(hit))
        ilog('false prediction: {}'.format(fp))
        ilog('miss prediction: {}'.format(miss))
        for _ in ('hit', 'fp', 'miss'):
            res.get(_).append(sum(res.get(_)))
        sum_hit, sum_fp, sum_miss = hit[-1], fp[-1], miss[-1]
        ilog('Total hit/false/miss: {}/{}/{}'.format(
            sum_hit, sum_fp, sum_miss))

        def calc_percentage(cur, base):
            return cur * 1.0 / base if base > 0 else -1

        res.set('precision',
                [calc_percentage(h, h + p) for h, p in zip(hit, fp)])
        res.set('recall',
                [calc_percentage(h, num)
                 for h, num in zip(hit, num_samples_by_class)])
        # Use a comprehension variable that does not shadow the enclosing
        # file handle `f`; in Python 2 the leaked variable would break the
        # later iwrite calls.
        res.set('error_rate',
                [calc_percentage(p, num)
                 for p, num in zip(fp, num_samples_by_class)])
        ilog('class precision:\n{}'.format(str(res.precision)))
        ilog('class recall:\n{}'.format(str(res.recall)))
        ilog('class error_rate:\n{}'.format(str(res.error_rate)))
        ilog('Precision: {:.2f}/{:.2f}'.format(
            calc_percentage(sum_hit, sum_hit + sum_fp),
            calc_percentage(sum_hit, num_pred)))
        ilog('Recall: {:.2f}/{:.2f}'.format(
            calc_percentage(sum_hit, num_gt),
            calc_percentage(sum_hit, num_samples)))
        res.set('gt_class_count', num_samples_by_class)
        res.set('pred_class_count', counter_pred)
        return res

    all_evaluation = {}
    ilog('=' * 80)
    num_samples = len(pred)
    ilog('= Back Test Evaluation, #samples={}'.format(num_samples))
    ilog('=' * 80)
    for only_useable in (True, False):
        cur_gt = {k: v[0] for k, v in gt.items() if v[1] == 1} \
            if only_useable else {k: v[0] for k, v in gt.items()}
        _flag = 'Useable' if only_useable else 'All'
        num_samples_by_class = count_values(cur_gt.values())
        for threshold in thresholds:
            ilog('=' * 60)
            ilog('= Back Test[{}] with Predict Thres[{}]'.format(
                _flag, threshold))
            ilog('=' * 60)
            cur_pred = {k: v[0] for k, v in pred.items()
                        if v[1] >= threshold}
            all_evaluation[threshold] = _evaluate(cur_gt.copy(), cur_pred)
        draw_result_curve(all_evaluation, _useable=only_useable)
    ilog('== All evaluation done ==')
def get_api_predict_path():
    util.check_directory(config.RESULT_DIR)
    _name = 'api_predict_{}_{}.txt'.format(args.date, args.limit)
    return config.pjoin(config.RESULT_DIR, _name)
if __name__ == '__main__':
    db_local_large = config.local_mp_online
    mp_segment_dir = config.pjoin(config.DATA_DIR, 'mp_news_segment')
    filenames = os.listdir(mp_segment_dir)
    for filename in filenames:
        segmentation_file = config.pjoin(mp_segment_dir, filename)
        clean_segmentation(segmentation_file)
def test_load_segmentation(self):
    _file = config.pjoin(config.DATA_DIR, 'test', 'test_file')
    logger.info(type(load_segmentation(_file)))
    for _line in load_segmentation(_file):
        logger.info(_line.rstrip())
@util.walk
def detect_region(_from, _to):
    """
    >> ./h_affine.ln -haraff -i img1.ppm -o img1.haraff -thres 1000
    >> ./h_affine.ln -hesaff -i img1.ppm -o img1.hesaff -thres 500
    """
    subprocess.call([
        AFFINE_DETECTOR, '-{}'.format(__affine__),
        '-i', _from, '-o', _to,
        '-thres', str(__thres__)
    ])


def run_default():
    for _dataset in config.DATASETS:
        _from = pjoin(config.DATA_PPM_ROOT, _dataset)
        _to = pjoin(config.DATA_REGION_ROOT, _dataset)
        detect_region(_from, _to, __suffix__)


if __name__ == '__main__':
    __affine__, __thres__ = config.HESSIAN_AFFINE
    __suffix__ = __affine__
    if len(sys.argv) > 1:
        detect_region(pjoin(config.DATA_PPM_ROOT, 'test'),
                      pjoin(config.DATA_REGION_ROOT, 'test'),
                      __suffix__)
    else:
        run_default()
# Fragment (presumably the tail of extract_word2vec_representation; the
# definition line is not included in this snippet).
output_file = pjoin(config.DATA_DIR, output_folder, output_filename)
print 'Will output to:', output_file
# def get_word_vector(_file, _model_name, _output_file, pooling=max):
_extract_word2vec_representation(input_file, model_file, output_file,
                                 pooling=_pooling, norm=_norm)


if __name__ == '__main__':
    (index, model) = WORD2VEC_LIST[WORD2VEC_INDEX]
    # model_filename = 'model_{}_s{}_w{}_m{}_n{}_s{}.word2vec'.format(
    #     SG_VALUES[model.sg], model.vector_size, model.window,
    #     model.min_count, model.negative, '1e3')
    model_filename = config.FORMATTER_WORD2VEC_MODEL.format(
        SG_VALUES[model.sg], model.vector_size, model.window,
        model.min_count, model.negative, model.sample)
    model_file = config.pjoin(config.DATA_WORD2VEC_DIR, model_filename)
    ID_MAP = load_news_ids()
    num_samples = len(ID_MAP)
    id_list = [0 for _ in xrange(num_samples)]
    for _id in ID_MAP:
        id_list[ID_MAP[_id]] = _id
    # train_or_test_word2vec()
    for pooling in config.POOLING_METHODS:
        for norm in config.NORM_METHODS:
            for _type in ['content']:
                # for _type in ['title', 'brief', 'content']:
                extract_word2vec_representation(_type, pooling, norm)
    # extract_word2vec_representation('title', 'max', 'root')
    # extract_word2vec_representation('title', 'max', None)
def experiment_manager():
    _type = config.TYPE_CONTENT
    _segment_type = config.SEGMENT_PRECISE
    _pooling = config.AVERAGE_POOLING
    _norm = config.L2_NORM
    _vector_size = 800
    if 'LR' in args.classifiers:
        _classifier = LogisticRegressionCV(n_jobs=8, cv=5)
    elif 'svc_linear' in args.classifiers:
        _classifier = SVC(kernel='linear', probability=True)
    else:
        _classifier = LogisticRegressionCV(n_jobs=8, cv=5)
    __sample_flag = 'sample' if args.sample else 'no_sample'
    __name = '1v1_{}_{}_{}_{}.txt'.format(
        __sample_flag, type(_classifier).__name__, date.today(), time())
    result_file = config.pjoin(config.RESULT_DIR, __name)
    logger.info('result file: {}'.format(result_file))
    f = open(result_file, 'wb')
    iwrite(f, '\nClassifier Info\n')
    iwrite(f, _classifier)
    global all_features
    all_features = load_all_feature(_type, _vector_size, _pooling, _norm,
                                    _segment_type)
    all_ids = np.array([id_map[_] for _ in mp_ids])
    all_labels = np.array([news_id_index[_].area_id for _ in all_ids])
    count_values(all_labels)
    all_labels = np.array([area_id_index[_].name for _ in all_labels])
    count_values(all_labels)
    label_encoder = get_label_encoder()
    all_labels = label_encoder.transform(all_labels)
    num_class = len(label_encoder.classes_)
    for i_class in range(num_class):
        cur_class_ids = [all_ids[_] for _ in xrange(len(all_labels))
                         if all_labels[_] == i_class]
        train_id_cur, test_id_cur = experiment_util.split_train_test_set(
            cur_class_ids)
        rest_ids_all = list(set(all_ids) - set(cur_class_ids))
        # rest_ids_all = [_ for _ in all_ids if _ not in cur_class_ids]
        if args.sample:
            random.shuffle(rest_ids_all)
            rest_ids = rest_ids_all[:len(cur_class_ids)]
        else:
            rest_ids = rest_ids_all
        train_id_rest, _ = experiment_util.split_train_test_set(rest_ids)
        test_id_rest = list(set(rest_ids_all) - set(train_id_rest))
        # test_id_rest = [_ for _ in rest_ids_all if _ not in train_id_rest]
        train_ids = train_id_cur + train_id_rest
        test_ids = test_id_cur + test_id_rest
        train_labels = [0] * len(train_id_cur) + [1] * len(train_id_rest)
        test_labels = [0] * len(test_id_cur) + [1] * len(test_id_rest)
        logger.info('Train {}/{} || Test {}/{}'.format(
            len(train_id_cur), len(train_id_rest),
            len(test_id_cur), len(test_id_rest)))
        _msg = 'Class: {}[{}]'.format(
            i_class, label_encoder.inverse_transform(i_class))
        logger.info(_msg)
        iwrite(f, _msg)
        train_samples = np.take(all_features, train_ids, axis=0)
        test_samples = np.take(all_features, test_ids, axis=0)
        evaluate(train_samples, train_labels, test_samples, test_labels,
                 _classifier, f)
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# @filename generate_id
# @author [email protected]
# @date 2016-03-16 16:17
from os import linesep

from core.data.data_util import mysql_batch_job_wrapper
from core.data.get_word_vector import SampleIterator
from core.util import LoggerUtil
from core.util import config

logger = LoggerUtil.get_logger(__file__.split('/')[-1][:-3])

sample_file = config.pjoin(config.DATA_DIR, 'mp_news_segment',
                           'mp_news_title_precise_cleaned')
id_file = config.pjoin(config.DATA_DIR, 'news', 'news_info')

if False:
    count = 0
    with open(id_file, 'wb') as f:
        for _id, _words in SampleIterator(sample_file):
            f.write(str(count))
            f.write('\t')
            f.write(str(_id))
            f.write(linesep)
            count += 1

id_map = {}
id_list = []
for line in open(id_file):
if __name__ == '__main__':
    db_local_large = config.local_mp_online
    segmentation_file = config.pjoin(config.DATA_DIR, 'mp_news_segment',
                                     'mp_news_content_precise')
    stat_tf_idf(segmentation_file)
def stat_tf_idf(_file=None, _type=None, _segment_type=None):
    """
    For a certain segmentation type of [brief/content/title], vectorize all
    the corpus text and output TF-IDF results.

    :param _file: segmentation file path; if None, it is derived from
        _type and _segment_type
    :param _type: chosen from brief/content/title
    :param _segment_type: chosen from full/precise
    :return: news_id_map, vectorizer_result, all_words, tfidf
    """
    if not _file:
        if not _type or not _segment_type:
            logger.error('Either _file or both _type and _segment_type '
                         'must be given to locate the segmentation file.')
            return
        _filename = 'mp_news_{}_{}'.format(_type, _segment_type)
        _file = config.pjoin(config.DATA_DIR, 'mp_news_segment', _filename)
    _keys, _rows = data_util.load_news_area_info(db_local_large)
    num_rows = len(_rows)
    slice_size = num_rows // 100
    logger.info('#{} rows got from {}'.format(num_rows, str(db_local_large)))
    news_id_map = {}
    count = 0
    _corpus = []
    for _mp_id, _words in load_segmentation(_file):
        news_id_map[_mp_id] = count
        count += 1
        _corpus.append(' '.join(_words))
        if count % slice_size == 0:
            logger.info('{:=6d}[{:3d}%] done.'.format(
                count, count / slice_size))
    vectorizer = CountVectorizer(decode_error='ignore')
    transformer = TfidfTransformer()
    start_time = time.time()
    vectorizer_result = vectorizer.fit_transform(_corpus)
    end_time = time.time()
    logger.info('vectorization done: {}'.format(end_time - start_time))
    all_words = vectorizer.get_feature_names()
    logger.info('#{:d} words in total'.format(len(all_words)))
    start_time = time.time()
    tfidf = transformer.fit_transform(vectorizer_result)
    end_time = time.time()
    logger.info('calc TF-IDF done: {}'.format(end_time - start_time))
    # save statistics
    _result_dir = config.pjoin(config.DATA_STAT_DIR, os.path.basename(_file))
    util.check_directory(_result_dir)
    vectorizer_file = config.pjoin(_result_dir, 'vectorization')
    util.save_sparse_csr_matrix(vectorizer_file, vectorizer_result)
    logger.info('vectorization file saved: {}'.format(vectorizer_file))
    tfidf_file = config.pjoin(_result_dir, 'tfidf')
    util.save_sparse_csr_matrix(tfidf_file, tfidf)
    logger.info('tfidf file saved: {}'.format(tfidf_file))
    words_file = config.pjoin(_result_dir, 'all_words')
    with open(words_file, 'wb') as f:
        for word in all_words:
            f.write(word.encode('utf-8'))
            f.write(os.linesep)
    logger.info('words file saved: {}'.format(words_file))
    id_map_file = config.pjoin(_result_dir, 'id_map')
    util.save_map(news_id_map, id_map_file)
    logger.info('news id map saved: {}'.format(id_map_file))
    return news_id_map, vectorizer_result, all_words, tfidf
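# Hedged usage sketch (not part of the original source): an alternative to the
# __main__ block above, letting stat_tf_idf derive the segmentation file from
# _type and _segment_type instead of an explicit path. The string values follow
# the 'mp_news_{type}_{segment_type}' naming used elsewhere in this module and
# are assumptions.
#
#     db_local_large = config.local_mp_online
#     stat_tf_idf(_type='content', _segment_type='precise')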
def load_label_encoder(_name='encoder_20_2016-04-05.pkl'):
    _file = config.pjoin(config.DATA_CLASSIFIER_DIR, 'label_encoder', _name)
    return pickle.load(open(_file, 'rb'))
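# Hedged usage sketch (not part of the original source). The pickled object is
# assumed to be a scikit-learn LabelEncoder, as suggested by the transform /
# inverse_transform / classes_ usage elsewhere in this project.
if __name__ == '__main__':
    encoder = load_label_encoder()
    print encoder.classes_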
def get_all_result_path():
    util.check_directory(config.RESULT_DIR)
    _name = 'back_test_result_{}_{}.pkl'.format(args.date, args.limit)
    return config.pjoin(config.RESULT_DIR, _name)
def run_default():
    for _dataset in DATASETS:
        _from = pjoin(DATA_ROOT, _dataset)
        _to = pjoin(DATA_PPM_ROOT, _dataset)
        convert_image(_from, _to, 'ppm')
# Imports added for completeness (assumed from the sibling scripts in this
# repo; only the config import was present in this snippet).
import _init_paths

import subprocess
import sys

from core.util import LoggerUtil
from core.util import config
from core.util import util
from core.util.config import AFFINE_EXTRACTOR, pjoin

logger = LoggerUtil.get_logger(__file__.split('/')[-1][:-3])


@util.walk
def detect_region(_from, _to):
    """
    >> ./h_affine.ln -haraff -i img1.ppm -o img1.haraff -thres 1000
    >> ./h_affine.ln -hesaff -i img1.ppm -o img1.hesaff -thres 500
    """
    subprocess.call([AFFINE_EXTRACTOR, _from])


def run_default():
    for _dataset in config.DATASETS:
        _from = pjoin(config.DATA_ROOT, _dataset)
        _to = pjoin(config.DATA_REGION_FEATURE_ROOT, _dataset)
        detect_region(_from, _to, __suffix__)


if __name__ == '__main__':
    __affine__, __thres__ = config.HESSIAN_AFFINE
    __suffix__ = __affine__
    if len(sys.argv) > 1:
        detect_region(pjoin(config.DATA_ROOT, 'test'),
                      pjoin(config.DATA_REGION_FEATURE_ROOT, 'test'),
                      __suffix__)
    else:
        run_default()
def get_label_encoder(_name='encoder_20_2016-04-05'):
    import pickle
    _file = '{}.pkl'.format(_name)
    _path = config.pjoin(config.DATA_CLASSIFIER_DIR, 'label_encoder', _file)
    return pickle.load(open(_path, 'rb'))
import _init_paths

import subprocess
import sys

from core.util import LoggerUtil
from core.util import util
from core.util.config import pjoin, DATASETS, DATA_ROOT, DATA_PPM_ROOT

logger = LoggerUtil.get_logger(__file__.split('/')[-1][:-3])


@util.walk
def convert_image(_from, _to):
    subprocess.check_call(['convert', _from, _to])


def run_default():
    for _dataset in DATASETS:
        _from = pjoin(DATA_ROOT, _dataset)
        _to = pjoin(DATA_PPM_ROOT, _dataset)
        convert_image(_from, _to, 'ppm')


if __name__ == '__main__':
    if len(sys.argv) > 1:
        convert_image(pjoin(DATA_ROOT, 'test'),
                      pjoin(DATA_PPM_ROOT, 'test'),
                      'ppm')
    else:
        run_default()