def add_features_cross_smooth_ctr(all_data):
    '''Add smoothed cross-feature CTR features to the full dataset.
    Crosses each user attribute in ['user_gender_id', 'user_age_level',
    'user_occupation_id', 'user_star_level'] with each of
    ['item_id', 'item_brand_id', 'shop_id', 'item_price_level', 'hour'].
    Join key: [feature, feature2, 'day']
    '''
    for feature in tqdm([
            'user_gender_id', 'user_age_level', 'user_occupation_id',
            'user_star_level'
    ]):
        for feature2 in tqdm([
                'item_id', 'item_brand_id', 'shop_id', 'item_price_level',
                'hour'
        ]):
            feature_path = feature_data_path + feature + '_' + feature2 + '_smooth_CTR.pkl'  # where the feature pickle is stored
            if not os.path.exists(feature_path):
                gen_features_cross_smooth_ctr()
            ctr_data = load_pickle(feature_path)
            all_data = pd.merge(all_data, ctr_data, how='left',
                                on=[feature, feature2, 'day'])
    return all_data
def compute_codes_orig_it(track_ids, maindir, clique_ids, start_idx, end_idx):
    """Computes the original features, based on Thierry and Ellis, 2012.
    Dimensionality reduction using PCA of 50, 100, and 200 components."""
    trainedpca = utils.load_pickle("models/pca_250Kexamples_900dim_nocovers.pkl")
    pca_components = [50, 100, 200]

    # Init codes
    codes = []
    for n_comp in pca_components:
        codes.append(np.ones((end_idx - start_idx, n_comp)) * np.nan)

    for i, tid in enumerate(track_ids[start_idx:end_idx]):
        path = utils.path_from_tid(maindir, tid)
        feats = utils.extract_feats(path)
        if feats is None:
            continue
        med = np.median(feats, axis=0)
        for pca_idx, n_comp in enumerate(pca_components):
            tmp = dan_tools.chromnorm(med.reshape(med.shape[0], 1)).squeeze()
            codes[pca_idx][i] = trainedpca.apply_newdata(tmp, ndims=n_comp)
        if i % 1000 == 0:
            logger.info("Computed %d of %d track(s)" % (i, end_idx - start_idx))
    res = (codes, track_ids[start_idx:end_idx], clique_ids[start_idx:end_idx])
    return res
def load_codes(codesdir, lda_idx, max_files=None):
    code_files = glob.glob(os.path.join(codesdir, "*.pk"))
    if lda_idx == 0:
        n_comp = 50
    elif lda_idx == 1:
        n_comp = 100
    elif lda_idx == 2:
        n_comp = 200
    elif lda_idx == -1:
        n_comp = 2045
    else:
        raise ValueError("invalid lda_idx: %d" % lda_idx)
    feats = np.empty((0, n_comp))
    track_ids = []
    clique_ids = []
    if max_files is not None:
        code_files = code_files[:max_files]
    for code_file in code_files:
        codes = utils.load_pickle(code_file)
        feats = np.append(feats, codes[0][lda_idx], axis=0)
        track_ids += codes[1]
        clique_ids += list(codes[2])
    track_ids = np.asarray(track_ids)
    clique_ids = np.asarray(clique_ids)
    return feats, track_ids, clique_ids
def add_app_hist_install(data):
    feature_path = feature_data_path + 'app_hist_install.pkl'
    app_hist_install = load_pickle(feature_path)
    # Normalize to average installs per preceding day *before* merging,
    # so the merged column actually carries the normalized value.
    app_hist_install['app_hist_install'] = app_hist_install[
        'app_hist_install'] / (app_hist_install['clickDay'] - 1)
    data = pd.merge(data, app_hist_install, 'left', ['appID', 'clickDay'])
    return data
def load_titles(): titles_train, titles_val, titles_test = load_pickle( os.path.join(CACHE_DIR, 'titles.pkl')) print( f'Titles: {len(titles_train)}, {len(titles_val)}, {len(titles_test)}') return titles_train, titles_val, titles_test
def __init__(self, name=None, mode='ranking', # model initialization load_weights_from=None, weights_file=None, # neural network architecture layer_sizes=None, activation='relu', dropout=0.5, freeze_embeddings=False, # error penalties for heuristic ranking objective FN=0.8, FL=0.5 if directories.CHINESE else 0.4, WL=1.0, # learning rates all_pairs_lr=0.002, top_pairs_lr=0.0002, ranking_lr=0.000002, reinforce_lr=0.000002, reward_rescaling_lr=0.00002, # which speaker and string-matching features to use pair_features=None, # mention features use_length=True, use_mention_type=True, use_position=True, use_dep_reln=False, # distance and genre features use_distance=True, use_genre=True, # averaged word embedding features use_spans=True, use_doc_embedding=True): if layer_sizes is None: layer_sizes = [1000, 500, 500] if pair_features is None: pair_features=[ # speaker features "same-speaker", "antecedent-is-mention-speaker", "mention-is-antecedent-speaker", # string-matching features "relaxed-head-match", "exact-string-match", "relaxed-string-match", ] self.load_weights_from = load_weights_from self.weights_file = weights_file self.layer_sizes = layer_sizes self.activation = activation self.dropout = dropout self.freeze_embeddings = freeze_embeddings self.FN, self.FL, self.WL = FN, FL, WL self.ranking_lr = ranking_lr self.reinforce_lr = reinforce_lr self.reward_rescaling_lr = reward_rescaling_lr self.top_pairs_lr = top_pairs_lr self.all_pairs_lr = all_pairs_lr self.use_length = use_length self.use_mention_type = use_mention_type self.use_position = use_position self.use_dep_reln = use_dep_reln self.use_distance = use_distance self.use_genre = use_genre self.use_spans = use_spans self.use_doc_embedding = use_doc_embedding if os.path.exists(directories.MISC + 'pair_feature_names.pkl'): name_mapping = utils.load_pickle(directories.MISC + 'pair_feature_names.pkl') self.active_pair_features = sorted([name_mapping[f] for f in pair_features]) self.set_name(name) self.set_mode(mode)
def deal_with_postag(data_list, mode='full'):
    if len(data_list) == 1 and mode == 'train':
        cache_postag = get_config_values('cache', 'postag_train')
    elif len(data_list) == 1 and mode == 'dev':
        cache_postag = get_config_values('cache', 'postag_dev')
    elif len(data_list) == 2 and mode == 'mix':
        cache_postag = get_config_values('cache', 'postag_mix')
    elif len(data_list) == 3 and mode == 'full':
        cache_postag = get_config_values('cache', 'postag_full')
    else:
        logger.warning('Found data format wrong when dealing with postag...')
        raise ValueError('unsupported (data_list, mode) combination')
    if not os.path.exists(cache_postag):
        logger.info("dealing with postag...")
        postag = []
        for dataset in tqdm(data_list):
            for line in dataset:
                postag.append([[
                    Converter('zh-hans').convert(word['word'].strip().replace(
                        ' ', '')), word['pos'],
                    len(word['word'])
                ] for word in line['postag']])
        save_pickle(cache_postag, postag)
    else:
        logger.info("loading postag...")
        postag = load_pickle(cache_postag)
    logger.info("postag total num: {0}".format(len(postag)))
    logger.info("postag 5: {0}".format(postag[:5]))
    return postag
def plot_imgrun_loss(p):
    """Load the models and losses so we can find the best fit.
    input: params = dict (root_dir, img_run_id, txt_run_id, img_size, kl_weight, ...)
    """
    img_root = f"{p['root_dir']}/imgruns/{p['img_run_id']}/"
    print(img_root)
    epoch_mx = 0
    # Loop over matching loss files and keep the one with the highest epoch.
    for filename in glob.glob(os.path.join(img_root, 'saved_data/losses*.pkl')):
        # filenames look like ".../losses_<epoch>.pkl"
        epochs = filename.split(sep="_")[-1][:-len(".pkl")]
        if int(epochs) > epoch_mx:
            loss_file = filename
            epoch_mx = int(epochs)
    print(loss_file)

    train, test = ut.load_pickle(
        os.path.join(img_root, f"saved_data/losses_{epoch_mx}.pkl"))
    print(len(train))
    print(len(test))
    e_train = range(0, len(train))
    e_test = range(0, len(train), int(len(train) / len(test)))
def load_bias(bias_name) -> Dict[str, np.ndarray]:
    """Load dictionary of example_id->bias where bias is a length 3 array
    of log-probabilities"""
    if bias_name == "hans":
        bias_src = config.MNLI_WORD_OVERLAP_BIAS
        if not exists(bias_src):
            raise Exception("lexical overlap bias file is not found")
        bias = utils.load_pickle(bias_src)
        for k, v in bias.items():
            # Convert from entail vs non-entail to 3-way classes by splitting
            # non-entail to neutral and contradict
            bias[k] = np.array([
                v[0] - np.log(2.),
                v[1],
                v[0] - np.log(2.),
            ])
        return bias

    if bias_name in config.BIAS_SOURCES:
        file_path = config.BIAS_SOURCES[bias_name]
        with open(file_path, "r") as hypo_file:
            all_lines = hypo_file.read()
        bias = json.loads(all_lines)
        for k, v in bias.items():
            bias[k] = np.array(v)
        return bias
    else:
        raise Exception("invalid bias name")
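# A quick sanity check on the 2-way -> 3-way conversion above: subtracting
# log(2) from the non-entail log-probability and reusing it for both the
# contradict and neutral slots preserves total probability mass. The values
# below are hypothetical:
_v = np.log([0.7, 0.3])  # _v[0] = log p(non-entail), _v[1] = log p(entail)
_three_way = np.array([_v[0] - np.log(2.), _v[1], _v[0] - np.log(2.)])
assert np.isclose(np.exp(_three_way).sum(), 1.0)  # 0.35 + 0.3 + 0.35 == 1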
def main(): """Save figures showing the pre-existing Urban Observatory network of sensors and comparisons with optimised networks using our approach. """ print("Saving Urban Observatory figures...") set_fig_style() config = get_config() lad20cd = lad20nm_to_lad20cd(config["la"]) networks_path = get_single_obj_filepath(config) networks = load_pickle(networks_path) uo_sensors = load_uo_sensors(config) figs_dir = get_figures_save_dir(config) population_groups, all_groups = get_objectives(config) oa_weights = get_weights(lad20cd, population_groups) theta, _ = get_default_optimisation_params(config) uo_sensor_dict = get_uo_sensor_dict(lad20cd, uo_sensors=uo_sensors) uo_coverage = get_uo_coverage_oa(lad20cd, uo_sensor_dict, theta, all_groups, oa_weights) fig_uo_sensor_locations(lad20cd, uo_sensors, figs_dir) fig_uo_coverage_grid(lad20cd, uo_sensors, theta, figs_dir) fig_uo_coverage_grid_diff(lad20cd, uo_sensors, theta, all_groups, networks, figs_dir) fig_uo_coverage_oa(uo_coverage, theta, all_groups, figs_dir) fig_uo_coverage_oa_diff(lad20cd, uo_coverage, theta, all_groups, networks, figs_dir)
def get_arguments():
    args = build_parser()

    # set random seed for reproducible experiments
    # reference: https://github.com/pytorch/pytorch/issues/7068
    random.seed(args.random_seed)
    numpy.random.seed(args.random_seed)
    torch.manual_seed(args.random_seed)
    torch.cuda.manual_seed(args.random_seed)
    torch.cuda.manual_seed_all(args.random_seed)
    # these flags can affect performance, select carefully
    # torch.backends.cudnn.deterministic = True
    # torch.backends.cudnn.benchmark = False

    os.makedirs(args.save_path, exist_ok=True)
    if args.train_flag:
        os.makedirs(os.path.join(args.save_path, 'training_log'),
                    exist_ok=True)
    else:
        loaded_args = load_pickle(
            os.path.join(os.path.dirname(args.model_load), 'argument.pickle'))
        args = update_arguments_for_eval(args, loaded_args)

    # cuda setting
    os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
    os.environ['CUDA_VISIBLE_DEVICES'] = ', '.join(map(str, args.gpu_no))

    with open(os.path.join(args.save_path, 'argument.txt'), 'w') as f:
        for key, value in sorted(vars(args).items()):
            f.write('%s: %s' % (key, value) + '\n')
    save_pickle(os.path.join(args.save_path, 'argument.pickle'), args)
    return args
def add_features_cross_day_ctr(all_data):
    '''Add per-day cross-feature CTR features to the full dataset.
    Crosses 'user_id' with each of ['item_id', 'item_brand_id', 'shop_id',
    'category2_label', 'item_price_level'].
    Join key: [feature, feature2, 'day']
    '''
    for feature in tqdm([
            'user_id',
    ]):
        for feature2 in tqdm([
                'item_id',
                'item_brand_id',
                'shop_id',
                'category2_label',
                'item_price_level',
        ]):
            I_alias = feature + '_' + feature2 + '_day_I'  # total clicks
            C_alias = feature + '_' + feature2 + '_day_C'  # total purchases
            feature_path = feature_data_path + feature + '_' + feature2 + '_before_day_CTR.pkl'  # where the feature pickle is stored
            if not os.path.exists(feature_path):
                gen_features_cross_day_ctr()
            ctr_data = load_pickle(feature_path)
            all_data = pd.merge(all_data, ctr_data, how='left',
                                on=[feature, feature2, 'day'])
            all_data[I_alias] = all_data[I_alias].fillna(0)
            all_data[C_alias] = all_data[C_alias].fillna(0)
    return all_data
def save_shap_val(hp_filename, filename, name, SAVE_DIR, train_data,
                  test_data, test_labels, use_gpu=True, background_length=100,
                  padding_length=512):
    hp_d = 'models/{}.pkl'.format(hp_filename)
    hp_path = utils.get_abs_path(SAVE_DIR, hp_d)
    d = utils.load_pickle(hp_path)
    model_d = 'models/{}.pkl'.format(filename)
    model_path = utils.get_abs_path(SAVE_DIR, model_d)
    model = init_model(train_data, d, model_path, use_gpu=use_gpu)
    features = 'features/{}_shap_all_features.pkl'.format(name)
    feature_path = utils.get_abs_path(SAVE_DIR, features)
    scores = 'feature_importance/{}_shap_all_scores.pkl'.format(name)
    scores_path = utils.get_abs_path(SAVE_DIR, scores)
    features_l, importance_l = get_lstm_shap(
        model,
        train_data,
        test_data,
        background_length=background_length,
        padding_length=padding_length,
        feature_path=feature_path,
        model_path=scores_path)
    utils.save_pickle(features_l, feature_path)
    utils.save_pickle(importance_l, scores_path)
def __init__(self, system_id): super().__init__(system_id) self.story_model = os.environ.get( "CWC_STORY_MODEL_" + system_id.upper(), self.model_folder + "/" + self.BEAM_MODEL_FILE) self.story_vocab = os.environ.get( "CWC_STORY_VOCAB_" + system_id.upper(), self.model_folder + "/" + self.BEAM_VOCAB_FILE) torch.manual_seed(self.torch_seed) # Load models and vocab dictionaries, init stopping symbols for generation self.st_model = load_model(self.story_model, self.use_cuda) self.st_dict = load_pickle(self.story_vocab) self.st_vocab_size = len(self.st_dict) self.st_eot_id = self.st_dict.word2idx[self.title_end] self.st_eos_id = self.st_dict.word2idx[self.story_end] self.st_sep_id = self.st_dict.word2idx[self.story_sep] # self.special_chars = [self.story_end, self.story_sep, self.title_end] self.special_chars = SPECIAL_CHARACTERS self.nlp = init_nlp_model() self.decoder = BeamSearchDecoder(self.st_model, self.beam_size, self.st_eos_id, verbosity=False, dictionary=self.st_dict)
def get_mean_final_score(model_result_paths): print('Mean score begin ...') test_unass = load_json(TEST_UNASS_PATH) # aid2coauthor = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2coauthor.pkl')) # test_pub = load_json(TEST_PUB_PATH) # aid2orgwithyear = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2orgwithyear.pkl')) # title_feature_df = pd.read_pickle(os.path.join(TEST_FEATURE_DIR, 'test-title-distance-df.pkl')) # title_feature = title_feature_df.values # org_text_process_dict = { # 'my_stopwords': set(stopwords.words('english')), # 'num_pattern': re.compile(r'\d+'), # 'remove_punctuation': str.maketrans(string.punctuation, ' '*len(string.punctuation)), # 'lemmatizer': WordNetLemmatizer(), # } result_dict_list = [load_pickle(path) for path in model_result_paths] submission = defaultdict(list) # count = 0 # problem_pids = [] for pid_with_index in tqdm.tqdm(test_unass): candidate_aids = result_dict_list[0][pid_with_index]['candidate-aids'] inner_data = np.zeros((len(candidate_aids), len(result_dict_list))) for num, result_dict in enumerate(result_dict_list): data = result_dict[pid_with_index]['result-score'] inner_data[:, num] = data final_output = np.mean(inner_data, axis=1) predict_author = candidate_aids[np.argmax(final_output)] submission[predict_author].append(pid_with_index.split('-')[0]) save_json( submission, os.path.join( FINAL_DIR, 'name-clean-2-mean-result-%d.json' % len(result_dict_list)))
def gen_feature_click_stats(update=True):
    """Generate statistics of each categorical feature's daily click counts.

    file_name: (feature)_click_day_stats.pkl
    example:
        user_id_click_day_mean  average number of clicks per day for the user
        item_id_click_day_max   the item's highest single-day click count
    features:
        'user_id_click_day_mean', 'user_id_click_day_max',
        'user_id_click_day_min', 'item_id_click_day_mean',
        'item_id_click_day_max', 'item_id_click_day_min',
        'item_brand_id_click_day_mean', 'item_brand_id_click_day_max',
        'item_brand_id_click_day_min', 'shop_id_click_day_mean',
        'shop_id_click_day_max', 'shop_id_click_day_min'
    """
    data = load_pickle(raw_data_path + 'all_data.pkl')
    stats_feature = ['user_id', 'item_id', 'item_brand_id', 'shop_id']
    for feature in tqdm(stats_feature):
        feature_path = feature_data_path + feature + '_click_day_stats.pkl'
        if os.path.exists(feature_path) and not update:
            print('found ' + feature_path)
        else:
            print('generating ' + feature_path)
            feature_stats = gen_feature_click_day_stats(data, feature)
            print(feature_stats.columns)
            dump_pickle(feature_stats, feature_path)
def __init__(self, bson_filepaths, transform=None, mode='train'): assert mode in {'train', 'valid'} self.transform = transform train_valid_data = utils.load_pickle(config.TRAIN_VALID_DATA_FILENAME) self.bson_filepaths = bson_filepaths self.dataset_index = train_valid_data[f'{mode}_index'] self.data = train_valid_data['shuffled_train_data']
def add_features_hour_ctr(all_data): for feature in tqdm([ 'user_id', 'item_id', 'item_brand_id', 'category2_label', 'category3_label', 'context_page_id', 'shop_id', 'item_sales_level_bin', 'item_price_level_bin', 'item_collected_level_bin', 'item_pv_level_bin', 'shop_review_num_level_bin', 'shop_review_positive_rate_bin', 'shop_star_level_bin', 'shop_score_service_bin', 'shop_score_delivery_bin', 'shop_score_description_bin', ]): feature_path = feature_data_path + '_2_5_' + feature + '_hour_CTR.pkl' if not os.path.exists(feature_path): gen_features_hour_ctr() ctr_data = load_pickle(feature_path) all_data = pd.merge(all_data, ctr_data, how='left', on=[feature, 'day', 'hour_bin']) return all_data
def add_features_cross_history_ctr(all_data):
    for feature in tqdm([
            'user_id',
    ]):
        for feature2 in tqdm([
                'item_id', 'item_brand_id', 'category2_label',
                'category3_label', 'shop_id', 'item_sales_level_bin',
                'item_price_level_bin'
        ]):
            I_alias = feature + '_' + feature2 + '_history_I'  # total clicks
            C_alias = feature + '_' + feature2 + '_history_C'  # total purchases
            feature_path = feature_data_path + '_2_5_' + feature + '_' + feature2 + '_before_history_CTR.pkl'  # where the feature pickle is stored
            if not os.path.exists(feature_path):
                gen_features_cross_history_ctr()
            ctr_data = load_pickle(feature_path)
            all_data = pd.merge(all_data, ctr_data, how='left',
                                on=[feature, feature2, 'day'])
            all_data[I_alias] = all_data[I_alias].fillna(0)
            all_data[C_alias] = all_data[C_alias].fillna(0)
    return all_data
def main(unused_argv):
    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True
    # Assumed definitions (the snippet references run_config and params
    # without defining them): a RunConfig wrapping the session config, and
    # an empty hyperparameter dict.
    run_config = tf.estimator.RunConfig(session_config=session_config)
    params = {}

    vocab = utils.load_pickle(FLAGS.vocab)
    caption_model = model.Captioner(vocab)
    caption_model.build_estimator(config=run_config,
                                  model_dir=FLAGS.model_dir,
                                  params=params)

    if FLAGS.mode == 'train':
        coco_data_train = utils.load_coco(FLAGS.data_dir, 'train')
        coco_data_val = utils.load_coco(FLAGS.data_dir, 'val')
        print('Successfully loaded data')
        for _ in range(FLAGS.num_epochs // FLAGS.epochs_per_eval):
            caption_model.train(captions=coco_data_train.captions,
                                features=coco_data_train.features,
                                batch_size=FLAGS.batch_size,
                                epochs=FLAGS.epochs_per_eval)
            caption_model.eval(captions=coco_data_val.captions,
                               features=coco_data_val.features,
                               batch_size=FLAGS.batch_size)
    elif FLAGS.mode == 'inference':
        assert FLAGS.predict_image is not None
        caption_model.predict(FLAGS.predict_image)
def gen_user_start_installed_cateA():
    """Count how many apps of each top-level category a user has initially
    installed.
    Join key: ['userID']
    """
    user_install = load_pickle(raw_data_path + 'user_installedapps.pkl')
    app_cate = pd.read_csv(raw_data_path + 'app_categories.csv')
    app_cate['cate_a'] = app_cate.appCategory.apply(lambda x: x // 100
                                                    if x > 100 else x)
    user_install = user_install.merge(app_cate, 'left', 'appID')
    for cate_a in tqdm(app_cate.cate_a.unique()):
        feature_path = feature_data_path + 'user_start_installed_cate_' + str(
            cate_a) + '.pkl'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
        else:
            print('generating ' + feature_path)
            user_install_cate = user_install[user_install.cate_a == cate_a][[
                'userID', 'cate_a'
            ]]
            user_install_cate.rename(
                columns={'cate_a': 'user_start_install_cate_' + str(cate_a)},
                inplace=True)
            # Count the apps per user in this category; the renamed column
            # holds the constant category id, so summing it would scale the
            # count (and always yield 0 for category 0).
            user_install_cate = user_install_cate.groupby(
                'userID', as_index=False).count()
            dump_pickle(user_install_cate, feature_path)
def test(self, path): corp = Corpus(path) bs = Bayesian() count = 0 sender_bl = load_pickle('sender_bl.pickle') # scan email and define if msg is SPAM or HAM # first check if sender occurs in sender Blacklist # then count spamicity of the word using the Bayes approach for fname, body in corp.emails(): sender = find_sender(body) if sender in sender_bl: self.tag_it(path, fname, 'SPAM') continue spamicity_list = [] count += 1 tokens = tokenize(body) # compute spamicity for each word and create list of the values for el in tokens: word_spamicity = [el, bs.word_spamicity(el)] spamicity_list.append(word_spamicity) # prepare list for Bayes spamicity_list = [ list(i) for i in set(map(tuple, spamicity_list)) ] # remove duplicates from list spamicity_list.sort(key=lambda x: abs(0.5 - x[1]), reverse=True) prediction = bs.bayes_pred( spamicity_list[:15]) # Consider only 15 'words' if prediction > 0.9 or sender in sender_bl: self.tag_it(path, fname, 'SPAM') else: self.tag_it(path, fname, 'OK')
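# `Bayesian.bayes_pred` is not shown in this snippet. A common combination
# rule for per-word spamicities p_i is the Graham/naive-Bayes formula —
# a sketch of that standard approach, not necessarily this repo's exact
# implementation:
from functools import reduce

def bayes_pred_sketch(spamicity_list):
    """Combine (word, spamicity) pairs: P = prod(p) / (prod(p) + prod(1-p))."""
    probs = [p for _, p in spamicity_list]
    prod_p = reduce(lambda a, b: a * b, probs, 1.0)
    prod_not_p = reduce(lambda a, b: a * (1.0 - b), probs, 1.0)
    return prod_p / (prod_p + prod_not_p)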
def cal_test_additional_chars(
        test_data_path,
        label_additional_chars,
        test_save_path,
):
    test_data_file_names = os.listdir(test_data_path)
    lengths = len(test_data_file_names)
    test_data_additional_chars = set()
    # new_extra_chars = set("/﹒–é/▲‧♥♡∩×『2〉×.è◆……①&")
    extra_chars = set(
        "!#$%&\()*+,-./:;<=>?@[\\]^_`{|}~!#¥%&?《》{}“”,:‘’。()·、;【】/……﹒–")
    for index in range(lengths):
        test_data_dir = os.path.join(test_data_path, str(index) + '.txt')
        with open(test_data_dir, 'r', encoding='utf-8') as f1:
            lines_text = f1.readlines()
            raw_text = ''
            for line_text in lines_text:
                raw_text += line_text
            test_data_additional_chars.update(
                re.findall(u'[^\u4e00-\u9fa5a-zA-Z0-9\*]', str(raw_text)))
    additional_chars = test_data_additional_chars.difference(
        label_additional_chars)  # drop special characters that occur in the labels
    additional_chars = additional_chars.difference(extra_chars)  # drop common punctuation
    # additional_chars = additional_chars.difference(new_extra_chars)  # drop extra punctuation
    save_pickle(additional_chars, test_save_path)  # persist as a pickle
    additional_chars = load_pickle(test_save_path)
    return additional_chars, test_data_additional_chars, label_additional_chars
def gen_user_search_time(file_name):
    '''
    Time gaps between the current search and, within the same day:
    - the first / last search for the same item
    - the first / last search in the same shop
    - the first / last search for the same brand
    - the first / last search in the same category
    '''
    data_select = pd.DataFrame()
    data = load_pickle(path=raw_data_path + file_name + '.pkl')
    cols = ['item_id', 'shop_id', 'item_brand_id', 'second_cate']
    for col in cols:
        data_filter = data[['user_id', col, 'day', 'context_timestamp']].groupby(
            ['user_id', col, 'day'])
        max_time = data_filter.agg(max)
        min_time = data_filter.agg(min)
        x = data.loc[:, ('user_id', col, 'day')].values
        m = max_time.loc[[tuple(i) for i in x]]
        n = min_time.loc[[tuple(i) for i in x]]
        data_select['sub_maxtime_' + col] = data['context_timestamp'].values - np.squeeze(m.values)
        data_select['sub_mintime_' + col] = data['context_timestamp'].values - np.squeeze(n.values)
        data_select['sub_maxtime_' + col] = data_select['sub_maxtime_' + col].apply(lambda x: x.total_seconds())
        data_select['sub_mintime_' + col] = data_select['sub_mintime_' + col].apply(lambda x: x.total_seconds())
    dump_pickle(data_select, feature_data_path + file_name + '_user_search_time')
def test(model_props=None, model_name=None, weights_file='best_weights', dataset_name='test', save_output=True, save_scores=False): if model_props is None: model_props = model_properties.MentionRankingProps( name=model_name, load_weights_from=model_name, weights_file=weights_file) print "Loading data" vectors = np.load(directories.RELEVANT_VECTORS + 'word_vectors.npy') dataset = datasets.DocumentBatchedDataset(dataset_name, model_props, with_ids=True) docs = utils.load_pickle(directories.DOCUMENTS + dataset_name + '_docs.pkl') stats = {} print "Building model" model, _ = pairwise_models.get_model(dataset, vectors, model_props) print "Evaluating model on", dataset_name evaluate_model(dataset, docs, model, model_props, stats, save_output=save_output, save_scores=save_scores) timer.clear() utils.write_pickle(stats, model_props.path + dataset_name + "_results.pkl")
def add_user_start_installed_cateA(data): for cate in tqdm([0, 1, 2, 3, 4, 5]): feature_path = feature_data_path + 'user_start_installed_cate_' + str( cate) + '.pkl' user_start_installed_cateA = load_pickle(feature_path) data = pd.merge(data, user_start_installed_cateA, 'left', 'userID') return data
def gen_user_feature_click_hour():
    """Generate each user's per-hour click count on every categorical feature.
    """
    data = load_pickle(raw_data_path + 'all_data_4567.pkl')
    feature_list = [
        'category2_label',
        'category3_label',
        'shop_id',
        'item_id',
        'item_brand_id',
        'context_page_id',
        'item_price_level_bin',
        'item_sales_level_bin',
        'item_property_topic_k_15',
    ]
    for feature in tqdm(feature_list):
        feature_path = feature_data_path + '_2_1_' + 'user_' + feature + '_click_hour.pkl'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
        else:
            print('generating ' + feature_path)
            user_feature_click_hour = data.groupby(
                ['user_id', 'day', 'hour', feature]).size().reset_index().rename(
                    columns={0: 'user_' + feature + '_click_hour'})
            dump_pickle(user_feature_click_hour, feature_path)
def gen_feature_click_day_hour(update=True):
    '''Compute click counts per day and hour for each feature in
    ['user_id', 'item_id', 'item_brand_id', 'category2_label',
     'category3_label', 'context_page_id', 'shop_id',
     'item_property_topic_k_15'].
    file name: [feature]_click_day_hour.pkl
    '''
    all_data = load_pickle(raw_data_path + 'all_data_4567.pkl')
    for feature in tqdm([
            'user_id', 'item_id', 'item_brand_id', 'category2_label',
            'category3_label', 'context_page_id', 'shop_id',
            'item_property_topic_k_15'
    ]):
        feature_path = feature_data_path + '_2_7_' + feature + '_click_day_hour.pkl'  # where the feature pickle is stored
        if os.path.exists(feature_path) and not update:
            print('found ' + feature_path)
        else:
            print('generating ' + feature_path)
            feature_click_day_hour = all_data.groupby(
                [feature, 'day', 'hour']).size().reset_index().rename(
                    columns={0: feature + '_click_hour'})
            dump_pickle(feature_click_day_hour, feature_path)  # persist
def deal_with_text(data_list, mode='full'):
    if len(data_list) == 1 and mode == 'train':
        cache_text = get_config_values('cache', 'text_train')
    elif len(data_list) == 1 and mode == 'dev':
        cache_text = get_config_values('cache', 'text_dev')
    elif len(data_list) == 2 and mode == 'mix':
        cache_text = get_config_values('cache', 'text_mix')
    elif len(data_list) == 3 and mode == 'full':
        cache_text = get_config_values('cache', 'text_full')
    else:
        logger.warning('Found data format wrong when dealing with text...')
        raise ValueError('unsupported (data_list, mode) combination')
    if not os.path.exists(cache_text):
        logger.info("dealing with text...")
        text = []
        for dataset in tqdm(data_list):
            text.extend([
                Converter('zh-hans').convert(line['text']) for line in dataset
            ])
        save_pickle(cache_text, text)
    else:
        logger.info("loading text...")
        text = load_pickle(cache_text)
    logger.info("text total num: {0}".format(len(text)))
    return text
def process_babi_dataset(save, print_dict=False):
    with open('dialog-bAbI-tasks/dialog-babi-task5-full-dialogs-trn.txt',
              'r') as f:
        text = f.readlines()
    system_acts = load_pickle('system_acts.pickle')

    uttr_dict = {'<BEGIN>': [set()]}
    for act in system_acts:
        uttr_dict[act] = [set()]

    prev_uttr = '<BEGIN>'
    for uttr in text:
        if uttr == '\n':
            prev_uttr = '<BEGIN>'
        for act in system_acts:
            if prev_uttr == '':
                prev_uttr = act
                continue
            if act in uttr:
                user_uttr = re.sub(r'\d+', '', uttr.split(act)[0]).strip()
                uttr_dict[prev_uttr][0].add(user_uttr)
                prev_uttr = act

    if save:
        save_pickle(uttr_dict, 'simulator_uttrs.pickle')
    if print_dict:
        for k, v in uttr_dict.items():
            print(k, v, '\n')
def gen_user_basic_info(file_name='train', test_day=24):
    data_select = pd.DataFrame()
    data = load_pickle(path=raw_data_path + file_name + '.pkl')
    data_select['user_id'] = data['user_id']
    data_select['user_gender_id'] = data['user_gender_id']
    data_select['user_age_level'] = data['user_age_level']
    data_select['user_occupation_id'] = data['user_occupation_id']
    data_select['user_star_level'] = data['user_star_level']

    # Bucket the search time of day: morning / afternoon / evening / before dawn
    data_select['is_morning'] = (data['hour'].values >= 8) & (data['hour'].values <= 12)
    data_select['is_afternoon'] = (data['hour'].values > 12) & (data['hour'].values <= 17)
    data_select['is_evening'] = (data['hour'].values > 17) & (data['hour'].values <= 23)
    data_select['is_before_dawn'] = (data['hour'].values < 8)

    if file_name == 'train':
        # Keep is_trade for the later sampling step; remember to drop it
        # before training.
        data_select['is_trade'] = data['is_trade']
    dump_pickle(data_select, feature_data_path + file_name + '_user_basic_info')
def add_feature_click_stats(data, ):
    """Add the daily-click-count statistics of each categorical feature.
    join_key: ['feature_id',]
    """
    feature_list = [
        'user_id',
        'category2_label',
        'category3_label',
        'shop_id',
        'item_id',
        'item_brand_id',
        'context_page_id',
    ]
    for feature in tqdm(feature_list):
        feature_path = feature_data_path + '_2_2_' + feature + '_click_day_mean.pkl'
        if not os.path.exists(feature_path):
            gen_feature_click_stats()
        feature_click_day_stats = load_pickle(feature_path)
        data = pd.merge(data, feature_click_day_stats, 'left', [
            feature,
        ])
    return data
def add_user_feature_click_day(data):
    """Add the user's same-day click count on every categorical feature.
    join_key: ['user_id', 'feature_id', 'day']
    """
    feature_list = [
        'category2_label',
        'category3_label',
        'shop_id',
        'item_id',
        'item_brand_id',
        'context_page_id',
        'item_price_level_bin',
        'item_sales_level_bin',
        'item_property_topic_k_15',
    ]
    for feature in tqdm(feature_list):
        feature_path = feature_data_path + '_2_1_' + 'user_' + feature + '_click_day.pkl'
        if not os.path.exists(feature_path):
            gen_user_feature_click_day()
        user_feature_click_day = load_pickle(feature_path)
        data = pd.merge(data, user_feature_click_day, 'left',
                        [feature, 'day', 'user_id'])
    return data
def __init__(self): os.makedirs(SPACY_DIR, exist_ok=True) self.text_tokens_path = os.path.join(SPACY_DIR, 'text.tokens.json') self.token_vector_path = os.path.join(SPACY_DIR, 'token.vector.pkl') self.text_tokens = load_json(self.text_tokens_path, {}) self.token_vector = load_pickle(self.token_vector_path, {}) self.text_tokens_len = len(self.text_tokens) self.token_vector_len = len(self.token_vector) self.nlp = spacy.load('en_core_web_lg') self.n_calls = 0
def build_dataset(vectors, name, tune_fraction=0.0, reduced=False, columns=None): doc_vectors = utils.load_pickle(directories.MISC + name.replace("_reduced", "") + "_document_vectors.pkl") main_pairs = PairDataBuilder(columns) tune_pairs = PairDataBuilder(columns) main_mentions = MentionDataBuilder(columns) tune_mentions = MentionDataBuilder(columns) main_docs = DocumentDataBuilder(columns) tune_docs = DocumentDataBuilder(columns) print "Building dataset", name + ("/tune" if tune_fraction > 0 else "") p = utils.Progbar(target=(2 if reduced else utils.lines_in_file(directories.RAW + name))) for i, d in enumerate(utils.load_json_lines(directories.RAW + name)): if reduced and i > 2: break p.update(i + 1) if reduced and tune_fraction != 0: pairs, mentions, docs = (main_pairs, main_mentions, main_docs) \ if i == 0 else (tune_pairs, tune_mentions, tune_docs) else: pairs, mentions, docs = (main_pairs, main_mentions, main_docs) \ if random.random() > tune_fraction else (tune_pairs, tune_mentions, tune_docs) ms, ps = mentions.size(), pairs.size() mention_positions = {} for mention_num in sorted(d["mentions"].keys(), key=int): mention_positions[mention_num] = mentions.size() mentions.add_mention(d["mentions"][mention_num], vectors, doc_vectors[d["mentions"][mention_num]["doc_id"]]) for key in sorted(d["labels"].keys(), key=lambda k: (int(k.split()[1]), int(k.split()[0]))): k1, k2 = key.split() pairs.add_pair(d["labels"][key], mention_positions[k1], mention_positions[k2], int(d["mentions"][k1]["doc_id"]), int(d["mentions"][k1]["mention_id"]), int(d["mentions"][k2]["mention_id"]), d["pair_features"][key]) me, pe = mentions.size(), pairs.size() docs.add_doc(ms, me, ps, pe, d["document_features"]) suffix = ("_reduced" if reduced else "") if tune_mentions.size() > 0: tune_mentions.write(name + "_tune" + suffix) tune_pairs.write(name + "_tune" + suffix) tune_docs.write(name + "_tune" + suffix) main_mentions.write(name + "_train" + suffix) main_pairs.write(name + "_train" + suffix) main_docs.write(name + "_train" + suffix) else: main_mentions.write(name + suffix) main_pairs.write(name + suffix) main_docs.write(name + suffix)
def merge_switch_ssm_fitness_sims(input_fnames, output_fname): """ Combine all the switch SSM fitness simulations into a single pickle file. """ ### combine all the simulations into one pickle file. all_sim_data = OrderedDict() for fname in input_fnames: sim_name = os.path.basename(fname).split(".data")[0] curr_sim_data = utils.load_pickle(fname) all_sim_data[sim_name] = curr_sim_data extra = {} utils.save_as_pickle(output_fname, all_sim_data, extra)
def process(statsfile, k, optfile=None): stats = utils.load_pickle(statsfile) track_ar = average_rank_per_track(stats) clique_ar = average_rank_per_clique(stats) ma_p = mean_average_precision(stats) #k_p = average_precision(stats, k, ver=True) k_p = average_precision_at_k(stats, k) # Set up logger logger = utils.configure_logger() # print results logger.info("Number of queries: %d" % len(stats)) logger.info("Average Rank per Track: %.3f" % track_ar) logger.info("Average Rank per Clique: %.3f" % clique_ar) logger.info("Mean Average Precision: %.2f %%" % (ma_p * 100)) logger.info("Precision at %d: %.2f %%" % (k, k_p * 100)) if optfile is not None: stats2 = utils.load_pickle(optfile) #plot_rank_histograms(stats, stats2, test=False) plot_precision_at_k_histograms(stats, stats2, K=[1,3,5,10], test=False) else: plot_rank_histogram(stats)
def compute_codes(args):
    """Computes maximum 10,000 x 10 tracks.
    N is the index in the MSD:
        e.g. if N = 1: tracks computed: from 100,000 to 199,999
             if N = 5: tracks computed: from 500,000 to 599,999
    """
    track_ids = args["track_ids"]
    maindir = args["maindir"]
    d = args["d"]
    N = args["N"]
    clique_ids = args["clique_ids"]
    outdir = args["outdir"]
    origcodesdir = args["origcodesdir"]
    pca_n = args["pca_n"]
    norm = args["norm"]

    MAX = 1e5
    ITER = 1e4

    for it in xrange(10):
        logger.info("Computing %d of 10 iteration" % it)
        start_idx = int(N * MAX + it * ITER)
        end_idx = int(start_idx + ITER)
        codes = []
        strN = str(N)
        if N < 10:
            strN = "0" + str(N)
        out_file = os.path.join(outdir, strN) + str(it) + "-msd-codes.pk"
        if origcodesdir is None:
            origcodes = None
        else:
            origcodes_file = os.path.join(origcodesdir, strN) + str(it) + \
                "-msd-codes.pk"
            origcodes = utils.load_pickle(origcodes_file)[0][0]
            #origcodes = utils.load_pickle(origcodes_file)[0]
        if d == "":
            codes = compute_codes_orig_it(track_ids, maindir, clique_ids,
                                          start_idx, end_idx)
        else:
            codes = compute_codes_it(track_ids, maindir, d, clique_ids,
                                     start_idx, end_idx, origcodes=origcodes,
                                     norm=norm)
        utils.save_pickle(codes, out_file)
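# The chunking above gives each worker N a contiguous block of 100,000 MSD
# tracks, processed in ten 10,000-track iterations. A minimal sketch of the
# index arithmetic (hypothetical worker N = 5, matching the docstring):
def _demo_chunk_ranges():
    N, MAX, ITER = 5, int(1e5), int(1e4)
    for it in range(10):
        start_idx = N * MAX + it * ITER
        end_idx = start_idx + ITER
        print("%d..%d" % (start_idx, end_idx - 1))
    # prints 500000..509999, 510000..519999, ..., 590000..599999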
def __init__(self): self.spams_dict = load_pickle("spams.pickle") self.hams_dict = load_pickle("hams.pickle")
def main():
    # Args parser
    parser = argparse.ArgumentParser(description=
                "Cover song ID on the training Second Hand Song dataset",
                formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("msd_dir", action="store",
                        help="Million Song Dataset main directory")
    parser.add_argument("-dictfile", action="store", default="",
                        help="Pickle to the learned dictionary")
    parser.add_argument("-lda", action="store", nargs=2, default=[None,0],
                        help="LDA file and version", metavar=('lda.pkl', 'n'))
    parser.add_argument("-codes", action="store", default=None,
                        dest="codesfile", help="Pickle to the features file")
    parser.add_argument("-f", action="store", default="", dest="featfile",
                        help="Pickle to the final features")
    parser.add_argument("-pca", nargs=2, metavar=('f.pkl', 'n'),
                        default=("", 0),
                        help="pca model saved in a pickle file, " \
                        "use n dimensions")

    args = parser.parse_args()
    start_time = time.time()
    maindir = args.msd_dir
    shsf = "SHS/shs_dataset_train.txt"
    dictfile = args.dictfile

    # sanity checks
    utils.assert_file(dictfile)
    utils.assert_file(maindir)
    utils.assert_file(shsf)

    # read clique ids and track ids
    cliques, all_tracks = utils.read_shs_file(shsf)
    track_ids = all_tracks.keys()
    clique_ids = np.asarray(utils.compute_clique_idxs(track_ids, cliques))
    logger.info("Track ids and clique ids read")
    utils.save_pickle(clique_ids, "SHS/clique_ids_train.pk")
    utils.save_pickle(track_ids, "SHS/track_ids_train.pk")

    # read LDA file
    lda_file = args.lda[0]
    if lda_file is not None:
        lda_file = utils.load_pickle(lda_file)
        logger.info("LDA file read")

    # read codes file
    codesfile = args.codesfile
    if codesfile is not None:
        codesfile = utils.load_pickle(codesfile)
        logger.info("Codes file read")

    # Compute features if needed
    if args.featfile == "":
        feats = compute_feats(track_ids, maindir, dictfile,
            lda_file=lda_file, lda_n=int(args.lda[1]), codes=codesfile,
            pca=args.pca[0], pca_n=int(args.pca[1]))
    else:
        feats = utils.load_pickle(args.featfile)

    # Apply PCA (note: this branch is currently disabled by the
    # `and False` guard)
    pcafile = args.pca[0]
    pcadim = int(args.pca[1])
    if pcafile != "" and False:
        trainedpca = utils.load_pickle(pcafile)
        assert pcadim > 0
        logger.info('trained pca loaded')
        pcafeats = np.zeros((feats.shape[0], pcadim))
        for i, feat in enumerate(feats):
            pcafeats[i] = trainedpca.apply_newdata(feat, ndims=pcadim)
        feats = pcafeats

    # Scores
    feats, clique_ids, track_ids = utils.clean_feats(feats, clique_ids, track_ids)
    stats = score(feats, clique_ids)

    # Save data
    if dictfile == "":
        dictfile = "thierry"  # For saving purposes
    utils.save_pickle(stats, "results/stats-" + os.path.basename(dictfile) + ".pk")

    # done
    logger.info('Average rank per track: %.2f, clique: %.2f, MAP: %.2f%%' \
                % (anst.average_rank_per_track(stats),
                   anst.average_rank_per_clique(stats),
                   anst.mean_average_precision(stats) * 100))
    logger.info("Done! Took %.2f seconds" % (time.time() - start_time))
def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1, maxlen=None, sort_by_len=True): '''Loads the dataset :type path: String :param path: The path to the dataset (here IMDB) :type n_words: int :param n_words: The number of word to keep in the vocabulary. All extra words are set to unknow (1). :type valid_portion: float :param valid_portion: The proportion of the full train set used for the validation set. :type maxlen: None or positive int :param maxlen: the max sequence length we use in the train/valid set. :type sort_by_len: bool :name sort_by_len: Sort by the sequence lenght for the train, valid and test set. This allow faster execution as it cause less padding per minibatch. Another mechanism must be used to shuffle the train set at each epoch. ''' ############# # LOAD DATA # ############# # Load the dataset # path = get_dataset_file( # path, "imdb.pkl", # "http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl") # if path.endswith(".gz"): # f = gzip.open(path, 'rb') # else: # f = open(path, 'rb') train_set = utils.load_pickle("encode_train_reviews.pickle") test_set = utils.load_pickle("encode_test_reviews.pickle") if maxlen: new_train_set_x = [] new_train_set_y = [] for x, y in zip(train_set[0], train_set[1]): if len(x) < maxlen: new_train_set_x.append(x) new_train_set_y.append(y) train_set = (new_train_set_x, new_train_set_y) del new_train_set_x, new_train_set_y # split training set into validation set train_set_x, train_set_y = train_set n_samples = len(train_set_x) sidx = numpy.random.permutation(n_samples) n_train = int(numpy.round(n_samples * (1. - valid_portion))) valid_set_x = [train_set_x[s] for s in sidx[n_train:]] valid_set_y = [train_set_y[s] for s in sidx[n_train:]] train_set_x = [train_set_x[s] for s in sidx[:n_train]] train_set_y = [train_set_y[s] for s in sidx[:n_train]] train_set = (train_set_x, train_set_y) valid_set = (valid_set_x, valid_set_y) def remove_unk(x): return [[1 if w >= n_words else w for w in sen] for sen in x] test_set_x, test_set_y = test_set valid_set_x, valid_set_y = valid_set train_set_x, train_set_y = train_set train_set_x = remove_unk(train_set_x) valid_set_x = remove_unk(valid_set_x) test_set_x = remove_unk(test_set_x) def len_argsort(seq): return sorted(range(len(seq)), key=lambda x: len(seq[x])) if sort_by_len: sorted_index = len_argsort(test_set_x) test_set_x = [test_set_x[i] for i in sorted_index] test_set_y = [test_set_y[i] for i in sorted_index] sorted_index = len_argsort(valid_set_x) valid_set_x = [valid_set_x[i] for i in sorted_index] valid_set_y = [valid_set_y[i] for i in sorted_index] sorted_index = len_argsort(train_set_x) train_set_x = [train_set_x[i] for i in sorted_index] train_set_y = [train_set_y[i] for i in sorted_index] train = (train_set_x, train_set_y) valid = (valid_set_x, valid_set_y) test = (test_set_x, test_set_y) return train, valid, test
def compute_feats(track_ids, maindir, d, lda_file=None, lda_n=0, codes=None,
                  ver=True, pca="", pca_n=0):
    """Computes the features using the dictionary d. If it doesn't exist,
    computes them using Thierry's method.

    The improved pipeline is composed of 11 steps:

        1.- Beat Synchronous Chroma
        2.- L2-Norm
        3.- Shingle (PATCH_LEN: 75 x 12)
        4.- 2D-FFT
        5.- L2-Norm
        6.- Log-Scale
        7.- Sparse Coding
        8.- Shrinkage
        9.- Median Aggregation
        10.- Dimensionality Reduction
        11.- L2-Norm

    Original method by Thierry doesn't include steps 5, 6, 7, 8, and 11.
    """
    if d != "":
        fx = load_transform(d)
        K = int(d.split("_")[1].split("E")[1])
    else:
        K = PATCH_LEN

    if codes is None:
        compute_codes = True
        codes = np.ones((len(track_ids), K)) * np.nan
    else:
        compute_codes = False
        K = codes[0].shape[0]
    if lda_file is not None:
        if lda_n == 0:
            n_comp = 50
        elif lda_n == 1:
            n_comp = 100
        elif lda_n == 2:
            n_comp = 200
    else:
        n_comp = K

    if pca != "":
        pca = utils.load_pickle(pca)
        pca = pca[pca_n]

    final_feats = np.ones((codes.shape[0], n_comp)) * np.nan
    orig_feats = []
    for cnt, tid in enumerate(track_ids):
        if compute_codes:
            path = utils.path_from_tid(maindir, tid)

            # 1.- Beat Synchronous Chroma
            # 2.- L2-Norm
            # 3.- Shingle (PATCH_LEN: 75 x 12)
            # 4.- 2D-FFT
            feats = utils.extract_feats(path)
            #orig_feats.append(feats)    # Store orig feats
            if feats is None:
                continue

            if d != "":
                # 5.- L2-Norm
                # 6.- Log-Scale
                # 7.- Sparse Coding
                # 8.- Shrinkage
                H = fx(feats)
            else:
                H = feats
            # 9.- Median Aggregation
            H = np.median(H, axis=0)
        else:
            H = codes[cnt]

        if compute_codes:
            codes[cnt] = H.copy()

        if pca != "":
            H = pca.transform(H)

        # Apply LDA if needed
        if lda_file is not None:
            #H = dan_tools.chromnorm(H.reshape(H.shape[0], 1)).squeeze()
            # 10.- Dimensionality Reduction
            H = lda_file[lda_n].transform(H)

        # 11.- L2-Norm
        final_feats[cnt] = dan_tools.chromnorm(H.reshape(H.shape[0], 1)).squeeze()

        if ver:
            if cnt % 50 == 1:
                logger.info("----Computing features %.1f%%" % \
                            (cnt / float(len(track_ids)) * 100))

    if d == "":
        d = "orig"  # For saving purposes

    # Save codes
    utils.create_dir("results")
    if compute_codes:
        utils.save_pickle(codes, "results/codes-" + os.path.basename(d) + ".pk")

    # Save features
    #utils.save_pickle(orig_feats, "results/feats-" + os.path.basename(d) + ".pk")

    logger.info("Features Computed")
    return final_feats
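# `dan_tools.chromnorm` (steps 2 and 11 above) is called on (dim, 1)-shaped
# arrays to L2-normalize a feature vector. A minimal equivalent sketch — an
# assumption inferred from how it is used here, not dan_tools' actual
# implementation:
def chromnorm_sketch(F, eps=1e-12):
    """L2-normalize each column of F (np is numpy)."""
    norms = np.sqrt((F ** 2).sum(axis=0))
    return F / np.maximum(norms, eps)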
def write_links(model_path, dataset_name): links = utils.load_pickle(model_path + dataset_name + '_links.pkl') with open(model_path + dataset_name + "_links", "w") as f: for did in links: f.write(str(did) + "\t" + " ".join( map(lambda (m1, m2): str(m1) + "," + str(m2), links[did])) + "\n")
def __init__(self, columns=None): self.columns = columns self.mention_inds = DatasetColumn('dmi', columns) self.pair_inds = DatasetColumn('dpi', columns) self.features = DatasetColumn('df', columns) self.genres = utils.load_pickle(directories.MISC + 'genres.pkl')
SAVE = True
LOWE = False
# descriptor = "SIFT"
descriptor = "spSIFT"

if TEST:
    prefix = "%s_%s_" % (descriptor, "test")
else:
    prefix = "%s_%s_" % (descriptor, "full")

train_images, train_labels, test_images, test_labels = get_train_test(TEST)

if descriptor == "SIFT":
    if os.path.isfile(prefix + "kmeans.pkl"):
        kmeans = load_pickle(prefix + "kmeans.pkl")
    else:
        pool = Pool(getNumberOfCPUs() - 2)
        if LOWE:
            print " [!] Lowe's SIFT"
            train_sift_with_null = pool.map(get_sift_lowe, train_images)
            test_sift_with_null = pool.map(get_sift_lowe, test_images)
        else:
            print " [!] OpenCV2's SIFT"
            train_sift_with_null = pool.map(get_sift, train_images)
            test_sift_with_null = pool.map(get_sift, test_images)
        pool.close()
        pool.join()
        pool.terminate()
def load_docs(dataset_name, word_vectors): return (datasets.Dataset(dataset_name, model_properties.MentionRankingProps(), word_vectors), zip(utils.load_pickle(directories.DOCUMENTS + dataset_name + '_docs.pkl'), utils.load_pickle(directories.ACTION_SPACE + dataset_name + '_action_space.pkl')))
def main():
    # Args parser
    parser = argparse.ArgumentParser(
        description="Evaluates the 500 binary queries from the SHS data set",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("msd_dir", action="store", help="Million Song Dataset main directory")
    parser.add_argument("-dictfile", action="store", default="", help="Pickle to the learned dictionary")
    parser.add_argument(
        "-lda", action="store", nargs=2, default=[None, 0], help="LDA file and version", metavar=("lda.pkl", "n")
    )
    parser.add_argument(
        "-pca",
        nargs=2,
        metavar=("f.pkl", "n"),
        default=("", 0),
        help="pca model saved in a pickle file, " "use n dimensions",
    )

    # Parse
    args = parser.parse_args()

    # Track time
    start_time = time.time()

    maindir = args.msd_dir
    queriesf = "SHS/list_500queries.txt"
    shsf = "SHS/shs_dataset_train.txt"
    lda = args.lda[0]
    lda_n = int(args.lda[1])
    pcafile = args.pca[0]
    pcadim = int(args.pca[1])

    # sanity checks
    utils.assert_file(maindir)
    utils.assert_file(queriesf)
    utils.assert_file(shsf)
    utils.assert_file(pcafile)

    # read queries
    queries = read_query_file(queriesf)

    # load pca
    trainedpca = None
    if pcafile != "":
        f = open(pcafile, "rb")  # pickles must be read in binary mode
        trainedpca = cPickle.load(f)
        f.close()
        assert pcadim > 0
        logger.info("trained pca loaded")

    # load lda
    if lda is not None:
        lda = utils.load_pickle(lda)

    # to keep stats
    results = []

    # iterate over queries
    logger.info("Starting the binary task...")

    # Get the dictionary transform
    td = load_transform(args.dictfile)

    for triplet in queries:
        # get features
        filenames = map(lambda tid: utils.path_from_tid(maindir, tid), triplet)
        triplet_feats = map(lambda f: extract_feats(f, td=td, lda_file=lda, lda_n=lda_n), filenames)
        if None in triplet_feats:
            continue

        # Apply pca if needed
        if trainedpca:
            triplet_feats = map(lambda feat: trainedpca.apply_newdata(feat, ndims=pcadim), triplet_feats)
            assert triplet_feats[np.random.randint(3)].shape[0] == pcadim

        # Compute result
        res1 = triplet_feats[0] - triplet_feats[1]
        res1 = np.sum(res1 * res1)
        res2 = triplet_feats[0] - triplet_feats[2]
        res2 = np.sum(res2 * res2)
        if res1 < res2:
            results.append(1)
        else:
            results.append(0)

        # verbose
        if len(results) % 5 == 0:
            logger.info(" --- after %d queries, accuracy: %.1f %%" % (len(results), 100.0 * np.mean(results)))

    # done
    logger.info("After %d queries, accuracy: %.1f %%" % (len(results), 100.0 * np.mean(results)))
    logger.info("Done! Took %.2f seconds" % (time.time() - start_time))
def main():
    # Args parser
    parser = argparse.ArgumentParser(description=
                "Evaluates the average rank and mean AP for the test SHS " \
                "over the entire MSD",
                formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("msd_dir", action="store",
                        help="Million Song Dataset main directory")
    parser.add_argument("-dictfile", action="store", default="",
                        help="Pickle to the learned dictionary")
    parser.add_argument("-outdir", action="store", default="msd_codes",
                        help="Output directory for the features")
    parser.add_argument("-N", action="store", default=10, type=int,
                        help="Number of processors to use when computing " \
                        "the codes for 1M tracks,")
    parser.add_argument("-lda", action="store", default=None,
                        help="LDA file")
    parser.add_argument("-pca", nargs=2, metavar=('f.pkl', 'n'),
                        default=(None, 0),
                        help="pca model saved in a pickle file, " \
                        "use n dimensions")
    parser.add_argument("-codes", action="store", nargs=2, default=[None,0],
                        dest="codesdir", metavar=("msd_codes/", "n"),
                        help="Path to the folder with all the codes and "
                        "version to evaluate")
    parser.add_argument("-orig_codes", action="store", default=None,
                        dest="origcodesdir",
                        help="Path to the folder with all the codes without "
                        "dimensionality reduction")
    parser.add_argument("-norm", action="store_true", dest="norm",
                        default=False,
                        help="Normalize before LDA/PCA or not")

    args = parser.parse_args()
    start_time = time.time()
    maindir = args.msd_dir
    shsf = "SHS/shs_dataset_test.txt"

    global lda
    global pca

    # sanity checks
    utils.assert_file(maindir)
    utils.assert_file(shsf)
    utils.create_dir(args.outdir)

    # read cliques and all tracks
    cliques, all_tracks = utils.read_shs_file(shsf)
    track_ids = utils.load_pickle("SHS/track_ids_test.pk")
    clique_ids = utils.load_pickle("SHS/clique_ids_test.pk")

    # read codes file
    codesdir = args.codesdir[0]
    if codesdir is not None:
        if os.path.isfile(codesdir):
            c = utils.load_pickle(codesdir)
            feats = c[0]
            track_ids = c[1]
            clique_ids = c[2]
        else:
            feats, track_ids, clique_ids = load_codes(
                codesdir, lda_idx=int(args.codesdir[1]))
        logger.info("Codes files read")
        print feats.shape
    else:
        # Read PCA file
        if args.pca[0] is not None:
            pca = utils.load_pickle(args.pca[0])[int(args.pca[1])]

        # read LDA file
        lda_file = args.lda
        if lda_file is not None:
            lda = utils.load_pickle(lda_file)

        utils.assert_file(args.dictfile)

        # Prepare Multiprocessing computation
        input = []
        pool = Pool(processes=args.N)
        for n in xrange(args.N):
            arg = {}
            arg["track_ids"] = track_ids
            arg["maindir"] = maindir
            arg["d"] = args.dictfile
            arg["N"] = n
            arg["clique_ids"] = clique_ids
            arg["outdir"] = args.outdir
            arg["origcodesdir"] = args.origcodesdir
            arg["pca_n"] = int(args.pca[1])
            arg["norm"] = args.norm
            input.append(arg)

        # Start computing the codes
        pool.map(compute_codes, input)

        # Done!
        logger.info("Codes computation done!")
        logger.info("Took %.2f seconds" % (time.time() - start_time))
        sys.exit()

    # Scores (only reached when a codes file/folder was given)
    feats, clique_ids, track_ids = utils.clean_feats(feats, clique_ids, track_ids)
    stats = score(feats, clique_ids, N=len(all_tracks))

    # TODO: change file name
    utils.save_pickle(stats, "stats.pk")

    # done
    logger.info('Average rank per track: %.2f, clique: %.2f, MAP: %.2f%%' \
                % (anst.average_rank_per_track(stats),
                   anst.average_rank_per_clique(stats),
                   anst.mean_average_precision(stats) * 100))
    logger.info("Done! Took %.2f seconds" % (time.time() - start_time))
def main(model_path, dataset_name): docs = utils.load_pickle(model_path + dataset_name + '_processed_docs.pkl') for doc_data in utils.load_json_lines(directories.RAW + dataset_name): sentences = doc_data["sentences"] mid_to_mention = {int(m["mention_id"]): m for m in doc_data["mentions"].values()} mid_to_position = {mid: int(m["mention_num"]) for mid, m in mid_to_mention.iteritems()} doc = docs[doc_data["document_features"]["doc_id"]] clusters = [c for c in doc.clusters if len(c) > 1] cluster_to_endpoints = {} for c in clusters: positions = [mid_to_position[mid] for mid in c] cluster_to_endpoints[c] = (min(positions), max(positions)) sorted_clusters = sorted(clusters, key=lambda c: cluster_to_endpoints[c]) color_last_usage = {i: -1 for i in range(len(COLORS))} active_clusters = [] cluster_to_color = {} for c in sorted_clusters: start, end = cluster_to_endpoints[c] for a in list(active_clusters): if cluster_to_endpoints[a][1] < start: active_clusters.remove(a) used_colors = [cluster_to_color[a] for a in active_clusters] sorted_colors = sorted((u, i) for i, u in color_last_usage.iteritems()) next_color = None for u, i in sorted_colors: if i not in used_colors: next_color = i break if next_color is None: next_color = sorted_colors[0][1] color_last_usage[next_color] = start cluster_to_color[c] = next_color active_clusters.append(c) annotations = defaultdict(lambda: defaultdict(list)) for i, c in enumerate(sorted_clusters): color = COLORS[cluster_to_color[c]] for m in c: mention = mid_to_mention[m] start, end = mention["start_index"], mention["end_index"] - 1 annotations[mention["sent_num"]][start].append( (color + "[" + ENDC, 1 + end)) annotations[mention["sent_num"]][end].append( (color + "]" + subscript(i) + ENDC, -1 - start)) for i, s in enumerate(sentences): for j, sentence_annotations in annotations[i].iteritems(): sentence_annotations = sorted(sentence_annotations, key=itemgetter(1)) for (annotation, priority) in sentence_annotations: if priority > 0: s[j] = annotation + s[j] else: s[j] = s[j] + annotation print " ".join(s) print print 80 * "=" print
def train(model_props, n_epochs=10000, reduced=False, dev_set_name='dev'): print "Training", model_props.path pprint(model_props.__dict__) model_props.write(model_props.path + 'model_props.pkl') utils.rmkdir(model_props.path + 'src') for fname in os.listdir('.'): if fname.endswith('.py'): shutil.copyfile(fname, model_props.path + 'src/' + fname) if model_props.ranking or \ model_props.top_pairs: write_start = 0 write_every = 10 else: write_start = 80 write_every = 20 print "Loading data" vectors = np.load(directories.RELEVANT_VECTORS + 'word_vectors.npy') train = datasets.DocumentBatchedDataset("train_reduced" if reduced else "train", model_props, with_ids=True) dev = datasets.DocumentBatchedDataset(dev_set_name + "_reduced" if reduced else dev_set_name, model_props, with_ids=True) print "Building model" model, _ = pairwise_models.get_model(dev, vectors, model_props) json_string = model.to_json() open(model_props.path + 'architecture.json', 'w').write(json_string) best_val_score = 1000 best_val_score_in_window = 1000 history = [] for epoch in range(n_epochs): timer.start("train") print "EPOCH {:}, model = {:}".format((epoch + 1), model_props.path) epoch_stats = {} model_weights = model.get_weights() train_docs = utils.load_pickle(directories.DOCUMENTS + 'train_docs.pkl') dev_docs = utils.load_pickle(directories.DOCUMENTS + dev_set_name + '_docs.pkl') if reduced: dev_docs = dev_docs[:3] if model_props.ranking: print "Running over training set" run_model_over_docs(train, train_docs, model) epoch_stats.update(compute_metrics(train_docs, "train")) if model_props.use_rewards: print "Setting costs" set_costs(train, train_docs) print "Training" prog = utils.Progbar(train.n_batches) train.shuffle() loss_sum, n_examples = 0, 0 for i, X in enumerate(train): if X['y'].size == 0: continue batch_loss = model.train_on_batch(X) loss_sum += batch_loss * train.scale_factor n_examples += X['y'].size prog.update(i + 1, exact=[("train loss", loss_sum / n_examples)]) epoch_stats["train time"] = time.time() - prog.start for k in prog.unique_values: epoch_stats[k] = prog.sum_values[k][0] / max(1, prog.sum_values[k][1]) epoch_stats["weight diffs"] = [ (np.sum(np.abs(new_weight - old_weight)), new_weight.size) for new_weight, old_weight in zip(model.get_weights(), model_weights)] summed = np.sum(map(np.array, epoch_stats["weight diffs"][1:]), axis=0) epoch_stats["total weight diff"] = tuple(summed) print "Testing on dev set" evaluate_model(dev, dev_docs, model, model_props, epoch_stats) history.append(epoch_stats) utils.write_pickle(history, model_props.path + 'history.pkl') score = -epoch_stats["dev conll"] if model_props.ranking else \ (epoch_stats["dev loss"] if not model_props.anaphoricity_only else epoch_stats["dev anaphoricity loss"]) if score < best_val_score: best_val_score = score print "New best {:}, saving model".format( "CoNLL F1" if model_props.ranking else "validation loss") model.save_weights(model_props.path + "best_weights.hdf5", overwrite=True) if score < best_val_score_in_window and epoch > write_start: print "Best in last {:}, saved to weights_{:}".format( write_every, write_every * (epoch / write_every)) best_val_score_in_window = score model.save_weights(model_props.path + "weights_{:}.hdf5".format( write_every * (epoch / write_every)), overwrite=True) if epoch + write_every >= n_epochs: model.save_weights(model_props.path + "final_weights.hdf5", overwrite=True) if epoch % write_every == 0: best_val_score_in_window = 1000 timer.stop("train") timer.print_totals() print timer.clear()
print('Tokenization:')
t0 = time.perf_counter()
train_tokens = [tokenize(s, token_vector) for s in train[COMMENT]]
print('train_tokens: %.1f sec %.2f sec / token' % (time.perf_counter() - t0,
      (time.perf_counter() - t0) / len(train_tokens)))
t0 = time.perf_counter()
test_tokens = [tokenize(s, token_vector) for s in test[COMMENT]]
print('test_tokens: %.1f sec %.2f sec / token' % (time.perf_counter() - t0,
      (time.perf_counter() - t0) / len(test_tokens)))
save_pickle('token.vector.pkl', token_vector)
save_json('train.tokens.json', train_tokens)
save_json('test.tokens.json', test_tokens)

token_vector = load_pickle('token.vector.pkl')
train_tokens = load_json('train.tokens.json')
test_tokens = load_json('test.tokens.json')


def compute_ngram_vector(token_list, n):
    """Compute an embedding vector for all n-grams in token_list
    """
    vec = np.zeros((n, SPACY_VECTOR_SIZE), dtype=np.float64)
    n_vecs = len(token_list) - n + 1
    for i in range(n_vecs):
        for j in range(n):
            vec[j] += token_vector[token_list[i + j]]
    vec /= n_vecs
    return np.reshape(vec, n * SPACY_VECTOR_SIZE)
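# A minimal sketch of calling compute_ngram_vector; assumes every token in
# the list already has an entry in token_vector (tokenize above appears to
# populate it):
_example_tokens = train_tokens[0]
if len(_example_tokens) >= 3:
    _vec = compute_ngram_vector(_example_tokens, n=3)
    print(_vec.shape)  # (3 * SPACY_VECTOR_SIZE,)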