def main():
  p = get_cli_args(args)
  x_train, y_train, qid_train = load_svmlight_file(p.train.xgboost_train_path, query_id=True)  # pylint: disable=unbalanced-tuple-unpacking
  x_test, y_test, qid_test = load_svmlight_file(p.train.xgboost_test_path, query_id=True)  # pylint: disable=unbalanced-tuple-unpacking
  x_train = x_train.todense()
  x_train = np.concatenate([x_train,
                            x_train[:, -2] / x_train[:, 2],
                            x_train[:, -1] / x_train[:, 4]], 1)
  x_test = x_test.todense()
  x_test = np.concatenate([x_test,
                           x_test[:, -2] / x_test[:, 2],
                           x_test[:, -1] / x_test[:, 4]], 1)
  train_dmatrix = DMatrix(x_train, y_train)
  test_dmatrix = DMatrix(x_test, y_test)
  train_dmatrix.set_group([len(list(g)) for __, g in groupby(qid_train)])
  test_dmatrix.set_group([len(list(g)) for __, g in groupby(qid_test)])
  params = {'objective': 'rank:pairwise',
            'eval_metric': ['error', 'map@1'],
            'tree_method': 'exact',
            'eta': 0.1,
            'gamma': 1.0,
            'min_child_weight': 0.1,
            'max_depth': 6}
  xgb_model = xgb.train(params,
                        train_dmatrix,
                        num_boost_round=100,
                        evals=[(test_dmatrix, 'validation')])
  xgb_train_str = items_to_str(_.omit(params, 'objective', 'eval_metric').items(),
                               sort_by=itemgetter(0))
  xgb_model.save_model(xgb_train_str + '_model.xgb')
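# A minimal sketch (not part of the original script) of reloading the ranking
# model saved above and scoring a grouped test DMatrix; the model_path argument
# is hypothetical and should match whatever save_model wrote.
def score_saved_ranker(model_path, test_dmatrix):
  booster = xgb.Booster()
  booster.load_model(model_path)
  # One relevance score per row; rows belong to queries according to set_group.
  return booster.predict(test_dmatrix)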
def main():
  p = get_cli_args(args)
  conll_path = 'custom.tsv' if p.run.use_custom else './AIDA-YAGO2-dataset.tsv'
  num_correct = 0
  missed_idxs = []
  guessed_when_missed = []
  db_connection = get_connection(p.run.env_path)
  model = load_model(p.model, p.train)
  with open('./tokens.pkl', 'rb') as fh:
    token_idx_lookup = pickle.load(fh)
  with open('./glove_token_idx_lookup.pkl', 'rb') as fh:
    full_token_idx_lookup = pickle.load(fh)
  with open('./val_test_indices.json', 'r') as fh:
    val_indices, test_indices = json.load(fh)
  model.eval()
  with torch.no_grad():
    with db_connection.cursor() as cursor:
      dataset = SimpleCoNLLDataset(cursor,
                                   token_idx_lookup,
                                   full_token_idx_lookup,
                                   conll_path,
                                   p.run.lookups_path,
                                   p.run.idf_path,
                                   p.train.train_size,
                                   p.run.txt_dataset_path)
      conll_test_set = DataLoader(dataset,
                                  batch_sampler=BatchSampler(SubsetSequentialSampler(test_indices),
                                                             p.run.batch_size,
                                                             False),
                                  collate_fn=collate_simple_mention_ranker)
      ctr = count()
      for batch in progressbar(conll_test_set):
        (candidate_ids, features), target_rankings = batch
        target = [ranking[0] for ranking in target_rankings]
        candidate_scores = model(features)
        # Pick the highest-scoring candidate for each mention in the batch.
        top_1 = []
        offset = 0
        for ids in candidate_ids:
          ranking_size = len(ids)
          top_1.append(ids[torch.argmax(candidate_scores[offset:offset + ranking_size]).item()])
          offset += ranking_size
        for guess, label, ids, idx in zip(top_1, target, candidate_ids, ctr):
          if guess == label:
            num_correct += 1
          else:
            missed_idxs.append(idx)
            guessed_when_missed.append(guess)
      # next(ctr) equals the number of mentions consumed so far, i.e. the accuracy denominator.
      print(num_correct / next(ctr))
      # Drop into a debugger for interactive inspection of the misses.
      import ipdb
      ipdb.set_trace()
      with open('./missed_idxs', 'w') as fh:
        fh.write('\n'.join([str((idx, dataset[idx])) for idx in missed_idxs]))
      with open('./guessed_when_missed', 'w') as fh:
        fh.write('\n'.join([str(idx) for idx in guessed_when_missed]))
def main():
  p = get_cli_args(args)
  try:
    # Reuse the cached binary datasets if they already exist.
    open('train.bin').close()
    open('eval.bin').close()
    lgb_train = lgb.Dataset('train.bin')
    lgb_eval = lgb.Dataset('eval.bin', reference=lgb_train)
  except FileNotFoundError:
    x_train, y_train, qid_train = load_svmlight_file(p.train.xgboost_train_path, query_id=True)  # pylint: disable=unbalanced-tuple-unpacking
    x_test, y_test, qid_test = load_svmlight_file(p.train.xgboost_test_path, query_id=True)  # pylint: disable=unbalanced-tuple-unpacking
    x_train = x_train.todense()
    x_test = x_test.todense()
    lgb_train = lgb.Dataset(np.array(x_train),
                            np.array(y_train.squeeze()),
                            group=[len(list(g)) for __, g in groupby(qid_train)])
    lgb_eval = lgb.Dataset(np.array(x_test),
                           np.array(y_test.squeeze()),
                           reference=lgb_train,
                           group=[len(list(g)) for __, g in groupby(qid_test)])
    lgb_train.save_binary("train.bin")
    lgb_eval.save_binary("eval.bin")
  params = {
      'boosting_type': 'gbdt',
      'objective': 'lambdarank',
      'metric': {'ndcg'},
      'ndcg_eval_at': [1],
      'metric_freq': 1,
      'max_bin': 255,
      'num_trees': 100,
      'num_leaves': 100,
      'learning_rate': 0.1,
      'num_iterations': 100,
      'num_threads': 8,
      'feature_fraction': 1.0,
      'bagging_fraction': 0.9,
      'bagging_freq': 1,
      'verbose': 0,
  }
  gbm = lgb.train(params,
                  lgb_train,
                  num_boost_round=100,
                  valid_sets=lgb_eval)
  xgb_train_str = items_to_str(_.omit(params, 'objective', 'eval_metric').items(),
                               sort_by=itemgetter(0))
  # Booster.predict expects raw features, not an lgb.Dataset; this assumes the
  # svmlight files were loaded above (i.e. the except branch ran).
  preds = gbm.predict(np.array(x_test))
  print((y_test != preds).nonzero())
  gbm.save_model('model' + xgb_train_str + '.light')
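# A minimal sketch (assumed, not part of the original script) of reloading the
# saved LightGBM ranker and taking the best-scoring row within each query group.
def top_rows_per_query(model_path, x, qids):
  booster = lgb.Booster(model_file=model_path)
  scores = booster.predict(np.array(x))
  top_rows = []
  offset = 0
  for __, group in groupby(qids):
    size = len(list(group))
    # Global row index of the highest-scoring candidate for this query.
    top_rows.append(offset + int(np.argmax(scores[offset:offset + size])))
    offset += size
  return top_rows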
def main():
  p = get_cli_args(args)
  with open('./tokens.pkl', 'rb') as fh:
    token_idx_lookup = pickle.load(fh)
  with open('./glove_token_idx_lookup.pkl', 'rb') as fh:
    full_token_idx_lookup = pickle.load(fh)
  load_dotenv(dotenv_path=p.run.env_path)
  EL_DATABASE_NAME = os.getenv("DBNAME")
  DATABASE_USER = os.getenv("DBUSER")
  DATABASE_PASSWORD = os.getenv("DBPASS")
  DATABASE_HOST = os.getenv("DBHOST")
  with open(p.train.page_id_order_path, 'rb') as fh:
    page_id_order = pickle.load(fh)
  page_ids = page_id_order[:p.train.num_pages_to_use]
  connection = pymysql.connect(host=DATABASE_HOST,
                               user=DATABASE_USER,
                               password=DATABASE_PASSWORD,
                               db=EL_DATABASE_NAME,
                               charset='utf8mb4',
                               use_unicode=True,
                               cursorclass=pymysql.cursors.DictCursor)
  with connection.cursor() as cursor:
    cursor.execute("SET NAMES utf8mb4;")
    cursor.execute("SET CHARACTER SET utf8mb4;")
    cursor.execute("SET character_set_connection=utf8mb4;")
    if p.train.train_on_conll:
      conll_path = 'custom.tsv' if p.run.use_custom else './AIDA-YAGO2-dataset.tsv'
      dataset = SimpleCoNLLDataset(cursor,
                                   token_idx_lookup,
                                   full_token_idx_lookup,
                                   conll_path,
                                   p.run.lookups_path,
                                   p.run.idf_path,
                                   p.train.train_size)
    else:
      dataset = SimpleMentionDataset(cursor,
                                     token_idx_lookup,
                                     full_token_idx_lookup,
                                     page_ids,
                                     p.run.lookups_path,
                                     p.run.idf_path,
                                     p.train.train_size)
    train_str = '_'.join(['conll' if p.train.train_on_conll else 'wiki',
                          'custom' if p.run.use_custom else '',
                          str(p.train.num_pages_to_use)])
    with open('./4data_{}'.format(train_str), 'w') as fh:
      for item_num, item in progressbar(enumerate(dataset)):
        fh.write('{}\n'.format(str(item)))
        if item_num % 1000 == 0:
          fh.flush()
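# A minimal sketch (assumed) of reading the text dump written above back into
# Python objects; it relies on each item rendering as a literal via str(item),
# so ast.literal_eval can parse it.
def iter_txt_dataset(path):
  import ast
  with open(path) as fh:
    for line in fh:
      yield ast.literal_eval(line)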
def main():
  p = get_cli_args(args)
  x_train, y_train, qid_train = load_svmlight_file(p.train.xgboost_train_path, query_id=True)  # pylint: disable=unbalanced-tuple-unpacking
  x_test, y_test, qid_test = load_svmlight_file(p.train.xgboost_test_path, query_id=True)  # pylint: disable=unbalanced-tuple-unpacking
  metric = pyltr.metrics.AP(k=1)
  model = pyltr.models.LambdaMART(
      metric=metric,
      n_estimators=1000,
      learning_rate=0.02,
      max_features=0.5,
      query_subsample=0.5,
      max_leaf_nodes=10,
      min_samples_leaf=64,
      verbose=1,
  )
  model.fit(x_train.todense(), y_train, qid_train)
  preds = model.predict(x_test.todense())
  print('Random ranking:', metric.calc_mean_random(qid_test, y_test))
  print('Our model:', metric.calc_mean(qid_test, y_test, preds))
  import ipdb
  ipdb.set_trace()
def main():
  p = get_cli_args(args)
  with open('./tokens.pkl', 'rb') as fh:
    token_idx_lookup = pickle.load(fh)
  load_dotenv(dotenv_path=p.run.env_path)
  EL_DATABASE_NAME = os.getenv("DBNAME")
  DATABASE_USER = os.getenv("DBUSER")
  DATABASE_PASSWORD = os.getenv("DBPASS")
  DATABASE_HOST = os.getenv("DBHOST")
  with open(p.train.page_id_order_path, 'rb') as fh:
    page_id_order = pickle.load(fh)
  page_ids = page_id_order[:p.train.num_pages_to_use]
  connection = pymysql.connect(host=DATABASE_HOST,
                               user=DATABASE_USER,
                               password=DATABASE_PASSWORD,
                               db=EL_DATABASE_NAME,
                               charset='utf8mb4',
                               use_unicode=True,
                               cursorclass=pymysql.cursors.DictCursor)
  with connection.cursor() as cursor:
    cursor.execute("SET NAMES utf8mb4;")
    cursor.execute("SET CHARACTER SET utf8mb4;")
    cursor.execute("SET character_set_connection=utf8mb4;")
    datasets = [MentionCoNLLDataset(cursor,
                                    './AIDA-YAGO2-dataset.tsv',
                                    p.run.lookups_path,
                                    p.train.train_size),
                MentionWikiDataset(cursor,
                                   page_ids,
                                   p.run.lookups_path,
                                   p.train.train_size)]
    with open('./entity_to_row_id.pkl', 'rb') as fh:
      entity_id_to_row = pickle.load(fh)
    idf = get_idf(token_idx_lookup, p.run.idf_path)
    desc_fs_sparse = csr_matrix(load_npz('./desc_fs.npz'))
    desc_vs = csr_matrix(sparse_to_tfidf_vs(idf, desc_fs_sparse))
    norm = (desc_vs.multiply(desc_vs)).sum(1)
    all_e_id_pairs = set()
    data = []
    i = []
    j = []
    row_to_entity_id = _.invert(entity_id_to_row)
    for dataset in datasets:
      for cands in progressbar(iter(dataset)):
        if cands is None:
          continue
        cand_rows = [entity_id_to_row[e_id]
                     for e_id in cands
                     if e_id in entity_id_to_row]
        cand_mat = desc_vs[cand_rows]
        # Pairwise TF-IDF similarity between the candidates' description vectors.
        scores = cand_mat.dot(cand_mat.T) / norm[cand_rows]
        new_i = cand_rows * len(cand_rows)
        new_j = [row_num
                 for row_num in cand_rows
                 for __ in range(len(cand_rows))]
        list_scores = np.array(scores).ravel().tolist()
        for res_i in range(len(list_scores)):
          pair = (row_to_entity_id[min(new_i[res_i], new_j[res_i])],
                  row_to_entity_id[max(new_i[res_i], new_j[res_i])])
          # NOTE: all_e_id_pairs is never added to, so this membership check is
          # currently a no-op and duplicate pairs get summed by coo_matrix below.
          if pair not in all_e_id_pairs:
            data.append(list_scores[res_i])
            i.append(new_i[res_i])
            j.append(new_j[res_i])
    mat = csr_matrix(coo_matrix((data, (i, j))))
    train_str = 'wiki+conll_' + '_'.join([str(p.train.num_pages_to_use)])
    save_npz('compats_{}.npz'.format(train_str), mat)
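# A minimal sketch (assumed) of looking a score back up in the saved
# compatibility matrix, reusing the entity_id_to_row mapping from above.
def lookup_compatibility(compats_path, entity_id_to_row, e_id_a, e_id_b):
  compats = load_npz(compats_path)
  row, col = entity_id_to_row[e_id_a], entity_id_to_row[e_id_b]
  # The matrix is not guaranteed to be symmetric, so check both orderings.
  return max(compats[row, col], compats[col, row])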
def main():
  p = get_cli_args(args)
  with open('./tokens.pkl', 'rb') as fh:
    token_idx_lookup = pickle.load(fh)
  with open('./glove_token_idx_lookup.pkl', 'rb') as fh:
    full_token_idx_lookup = pickle.load(fh)
  load_dotenv(dotenv_path=p.run.env_path)
  EL_DATABASE_NAME = os.getenv("DBNAME")
  DATABASE_USER = os.getenv("DBUSER")
  DATABASE_PASSWORD = os.getenv("DBPASS")
  DATABASE_HOST = os.getenv("DBHOST")
  with open(p.train.page_id_order_path, 'rb') as fh:
    page_id_order = pickle.load(fh)
  page_ids = page_id_order[:p.train.num_pages_to_use]
  connection = pymysql.connect(host=DATABASE_HOST,
                               user=DATABASE_USER,
                               password=DATABASE_PASSWORD,
                               db=EL_DATABASE_NAME,
                               charset='utf8mb4',
                               use_unicode=True,
                               cursorclass=pymysql.cursors.DictCursor)
  with connection.cursor() as cursor:
    cursor.execute("SET NAMES utf8mb4;")
    cursor.execute("SET CHARACTER SET utf8mb4;")
    cursor.execute("SET character_set_connection=utf8mb4;")
    if p.train.train_on_conll:
      conll_path = 'custom.tsv' if p.run.use_custom else './AIDA-YAGO2-dataset.tsv'
      dataset = SimpleCoNLLDataset(cursor,
                                   token_idx_lookup,
                                   full_token_idx_lookup,
                                   conll_path,
                                   p.run.lookups_path,
                                   p.run.idf_path,
                                   p.train.train_size,
                                   txt_dataset_path=p.run.txt_dataset_path)
    else:
      dataset = SimpleMentionDataset(cursor,
                                     token_idx_lookup,
                                     full_token_idx_lookup,
                                     page_ids,
                                     p.run.lookups_path,
                                     p.run.idf_path,
                                     p.train.train_size,
                                     txt_dataset_path=p.run.txt_dataset_path)
    train_str = '_'.join(['conll' if p.train.train_on_conll else 'wiki',
                          'custom' if p.run.use_custom else '',
                          str(p.train.num_pages_to_use)])
    part_num = 0
    data = []
    fh = open('./4prep_{}_part_{}.pkl'.format(train_str, part_num), 'wb')
    for item_num, item in progressbar(enumerate(dataset)):
      # Roll over to a new pickle part every 100k mentions to keep file sizes manageable.
      if (item_num != 0) and item_num % 100000 == 0:
        pickle.dump(data, fh)
        data = []
        fh.close()
        part_num += 1
        fh = open('./4prep_{}_part_{}.pkl'.format(train_str, part_num), 'wb')
      all_target_features = []
      all_candidate_features = []
      pair_ids = []
      mention_features, mention_candidate_ids, label = item
      target_idx = mention_candidate_ids.index(label)
      target_features = mention_features[target_idx]
      # Pair the gold candidate's features with every other candidate's features.
      for candidate_features, candidate_id in zip(mention_features, mention_candidate_ids):
        if candidate_id != label:
          all_target_features.append(target_features)
          all_candidate_features.append(candidate_features)
          pair_ids.append((label, candidate_id))
      features = (all_target_features, all_candidate_features)
      data.append(features)
    pickle.dump(data, fh)
    fh.close()
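# A minimal sketch (assumed) of streaming the pickled feature parts back in the
# order they were written above; each part file holds one pickled list.
def iter_pickled_parts(train_str):
  part_num = 0
  while True:
    path = './4prep_{}_part_{}.pkl'.format(train_str, part_num)
    try:
      with open(path, 'rb') as fh:
        yield from pickle.load(fh)
    except FileNotFoundError:
      return
    part_num += 1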
def main():
  global best_options, best_performances
  p = get_cli_args(args)
  # Hyperparameter grid; entries with an 'if' predicate are only varied when it holds.
  arg_options = [
      {'path': ['train', 'dropout_keep_prob'],
       'options': [1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3]},
      {'path': ['train', 'margin'],
       'if': lambda params: _.get(thaw(params), ['train', 'use_hinge']),
       'options': [10, 100, 1000, 10000]},
      # {'path': ['train', 'stop_by'],
      #  'options': ['acc', 'bce_loss']},
      {'path': ['train', 'use_hinge'],
       'options': [False, True]},
      # {'path': ['train', 'stop_after_n_bad_epochs'],
      #  'options': [1, 2]},
      {'path': ['model', 'hidden_sizes'],
       'options': [[100]]},
      {'path': ['train', 'learning_rate'],
       'options': [0.1, 1e-2, 1e-3, 1e-4, 1e-5]},
  ]
  with open('./tokens.pkl', 'rb') as fh:
    token_idx_lookup = pickle.load(fh)
  with open('./glove_token_idx_lookup.pkl', 'rb') as fh:
    full_token_idx_lookup = pickle.load(fh)
  load_dotenv(dotenv_path=p.run.env_path)
  EL_DATABASE_NAME = os.getenv("DBNAME")
  DATABASE_USER = os.getenv("DBUSER")
  DATABASE_PASSWORD = os.getenv("DBPASS")
  DATABASE_HOST = os.getenv("DBHOST")
  with open(p.train.page_id_order_path, 'rb') as fh:
    page_id_order = pickle.load(fh)
  page_ids = page_id_order[:p.train.num_pages_to_use]
  connection = pymysql.connect(host=DATABASE_HOST,
                               user=DATABASE_USER,
                               password=DATABASE_PASSWORD,
                               db=EL_DATABASE_NAME,
                               charset='utf8mb4',
                               use_unicode=True,
                               cursorclass=pymysql.cursors.DictCursor)
  with connection.cursor() as cursor:
    cursor.execute("SET NAMES utf8mb4;")
    cursor.execute("SET CHARACTER SET utf8mb4;")
    cursor.execute("SET character_set_connection=utf8mb4;")
    conll_path = 'custom.tsv' if p.run.use_custom else './AIDA-YAGO2-dataset.tsv'
    test_dataset = SimpleCoNLLDataset(cursor,
                                      token_idx_lookup,
                                      full_token_idx_lookup,
                                      conll_path,
                                      p.run.lookups_path,
                                      p.run.idf_path,
                                      p.train.train_size,
                                      p.run.val_txt_dataset_path)
    # Reuse the saved val/test split if it exists; otherwise create a fresh 50/50 split.
    try:
      with open('./val_test_indices.json', 'r') as fh:
        val_indices, test_indices = json.load(fh)
    except FileNotFoundError:
      with open('./val_test_indices.json', 'w') as fh:
        permutation = list(range(len(test_dataset)))
        shuffle(permutation)
        split_idx = int(len(test_dataset) * 0.5)
        val_indices, test_indices = permutation[:split_idx], permutation[split_idx:]
        json.dump((val_indices, test_indices), fh)
    val_dataloader = DataLoader(test_dataset,
                                batch_sampler=BatchSampler(SubsetSequentialSampler(val_indices),
                                                           p.run.batch_size,
                                                           False),
                                collate_fn=collate_simple_mention_ranker)
    if p.run.pkl_dataset_prefix:
      def _col(batch):
        features, labels = zip(*batch)
        target, cand = zip(*features)
        target = torch.tensor(target)
        cand = torch.tensor(cand)
        labels = torch.tensor(labels)
        return (target, cand), labels
      collate_fn = _col
    else:
      collate_fn = collate_simple_mention_pairwise if p.train.use_pairwise else collate_simple_mention_pointwise
    if p.train.train_on_conll:
      conll_path = 'custom.tsv' if p.run.use_custom else './AIDA-YAGO2-dataset.tsv'
      dataset = SimpleCoNLLDataset(cursor,
                                   token_idx_lookup,
                                   full_token_idx_lookup,
                                   conll_path,
                                   p.run.lookups_path,
                                   p.run.idf_path,
                                   p.train.train_size,
                                   txt_dataset_path=p.run.txt_dataset_path)
    else:
      dataset = SimpleMentionDataset(cursor,
                                     token_idx_lookup,
                                     full_token_idx_lookup,
                                     page_ids,
                                     p.run.lookups_path,
                                     p.run.idf_path,
                                     p.train.train_size,
                                     txt_dataset_path=p.run.txt_dataset_path,
                                     pkl_dataset_prefix=p.run.pkl_dataset_prefix)
    with open(p.run.perf_path, 'w') as fh:
      for cand_p, new_options in progressbar(hparam_search(p, arg_options, rand_p=False)):
        fh.write(str(thaw(new_options)) + '\n')
        fh.flush()
        if p.train.use_sequential_sampler:
          bs = BatchSampler(FixLenSequentialSampler(2140542), p.train.batch_size, False)
        else:
          if p.run.pkl_dataset_prefix is not None:
            sampler = ChunkedRandomSampler(2140542, 100000)
          else:
            sampler = RandomSampler(dataset)
          bs = BatchSampler(sampler, p.train.batch_size, False)
        dataloader = DataLoader(dataset, batch_sampler=bs, collate_fn=collate_fn)
        model = get_model(cand_p.model, cand_p.train)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = model.to(device)
        optimizer = optim.Adam(model.parameters(), cand_p.train.learning_rate)
        if cand_p.train.use_hinge:
          calc_loss = nn.MarginRankingLoss(cand_p.train.margin)
        else:
          calc_loss = nn.BCEWithLogitsLoss()
        models_by_epoch = []
        model_performances = []
        did_early_stop = False
        for epoch_num in range(cand_p.train.max_num_epochs):
          epoch_loss = 0
          get_stop_by_val = itemgetter(cand_p.train.stop_by)
          neg_is_bad = cand_p.train.stop_by in ['acc']
          # Evaluate before training the epoch; stop early after n consecutive bad epochs.
          performance = eval_model(val_dataloader, model, device)
          model_performances.append(performance)
          fh.write(str(performance) + '\n')
          fh.flush()
          models_by_epoch.append(model)
          if len(model_performances) >= cand_p.train.stop_after_n_bad_epochs + 1:
            stop_by_perfs = [get_stop_by_val(perf) for perf in model_performances]
            bad_epochs = [diff < 0 if neg_is_bad else diff > 0
                          for diff in np.diff(stop_by_perfs)]
            if all(bad_epochs[-cand_p.train.stop_after_n_bad_epochs:]):
              idx = np.searchsorted(best_performances, performance['acc'])
              best_options.insert(idx, thaw(cand_p))
              best_performances.insert(idx, performance['acc'])
              choose_model(cand_p, models_by_epoch[-cand_p.train.stop_after_n_bad_epochs - 1])
              did_early_stop = True
              break
          for batch_num, batch in enumerate(dataloader):
            model.train()
            optimizer.zero_grad()
            if cand_p.train.use_pairwise:
              features, labels = batch
              features = [elem.to(device) for elem in features]
              labels = labels.to(device)
              target_features, candidate_features = features
              target_scores = model(target_features)
              candidate_scores = model(candidate_features)
              scores = candidate_scores - target_scores
            else:
              batch = [elem.to(device) for elem in batch]
              features, labels = batch
              scores = model(features)
            if cand_p.train.use_hinge:
              loss = calc_loss(target_scores, candidate_scores, torch.ones_like(labels))
            else:
              loss = calc_loss(scores, labels)
            epoch_loss += loss.item()
            loss.backward()
            optimizer.step()
          print('epoch loss', epoch_loss / len(dataloader))
        if not did_early_stop:
          idx = np.searchsorted(best_performances, performance['acc'])
          best_options.insert(idx, thaw(cand_p))
          best_performances.insert(idx, performance['acc'])
          choose_model(cand_p, model)
  print('best', list(zip(best_options[-10:], best_performances[-10:])))
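# For reference, a tiny sketch (with made-up scores) of the hinge objective used
# above: nn.MarginRankingLoss(margin)(x1, x2, y) with y = 1 penalizes pairs where
# x1 does not beat x2 by at least `margin`, which is why the gold candidate's
# scores are passed first and the competing candidates' scores second.
def _hinge_loss_example():
  target_scores = torch.tensor([2.0, 0.5])     # scores of the gold candidates
  candidate_scores = torch.tensor([1.0, 1.5])  # scores of competing candidates
  loss = nn.MarginRankingLoss(margin=1.0)(target_scores,
                                          candidate_scores,
                                          torch.ones_like(target_scores))
  return loss  # mean over pairs of max(0, margin - (target - candidate))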
def main():
  p = get_cli_args(args)
  conll_path = 'custom.tsv' if p.run.use_custom else './AIDA-YAGO2-dataset.tsv'
  db_connection = get_connection(p.run.env_path)
  model = load_model(p.model, p.train)
  with open('./tokens.pkl', 'rb') as fh:
    token_idx_lookup = pickle.load(fh)
  with open('./glove_token_idx_lookup.pkl', 'rb') as fh:
    full_token_idx_lookup = pickle.load(fh)
  with open('./val_test_indices.json', 'r') as fh:
    val_indices, test_indices = json.load(fh)
  model.eval()
  with torch.no_grad():
    with db_connection.cursor() as cursor:
      doc_id_dataset = MentionCoNLLDataset(cursor,
                                           './AIDA-YAGO2-dataset.tsv',
                                           p.run.lookups_path,
                                           p.train.train_size)
      dataset = SimpleCoNLLDataset(cursor,
                                   token_idx_lookup,
                                   full_token_idx_lookup,
                                   conll_path,
                                   p.run.lookups_path,
                                   p.run.idf_path,
                                   p.train.train_size,
                                   p.run.txt_dataset_path)
      # compats = load_npz('compats_wiki+conll_100000.npz')
      with open('./entity_to_row_id.pkl', 'rb') as fh:
        entity_id_to_row = pickle.load(fh)
      idf = get_idf(token_idx_lookup, p.run.idf_path)
      desc_fs_sparse = csr_matrix(load_npz('./desc_fs.npz'))
      desc_vs = csr_matrix(sparse_to_tfidf_vs(idf, desc_fs_sparse))
      norm = np.sqrt((desc_vs.multiply(desc_vs)).sum(1))
      ctr = count()
      num_correct = 0
      num_in_val = 0
      num_correct_small = 0
      num_in_val_small = 0
      # Group mentions by the document they appear in so each batch is one document.
      grouped = groupby(((dataset[idx], doc_id_dataset.mention_doc_id[idx])
                         for idx in range(len(val_indices) + len(test_indices))),
                        key=itemgetter(1))
      batches = [collate_simple_mention_ranker([data for data, doc_id in g])
                 for doc_id, g in grouped]
      val_indices = set(val_indices)
      for document_batch in progressbar(batches):
        (candidate_ids, features), target_rankings = document_batch
        target = [ranking[0] for ranking in target_rankings]
        candidate_scores = model(features)
        emissions = emissions_from_flat_scores([len(ids) for ids in candidate_ids],
                                               candidate_scores)
        # Keep only the top-n candidates per mention to limit the size of the
        # pairwise compatibility computation.
        keep_top_n = 5
        top_emissions = []
        top_cands = []
        idxs_to_check = []
        for i, (emission, cand_ids, top_1, idx) in enumerate(zip(emissions,
                                                                 candidate_ids,
                                                                 target,
                                                                 ctr)):
          if len(cand_ids) > 1:
            if cand_ids[np.argmax(emission)] != top_1:
              if idx in val_indices:
                idxs_to_check.append(i)
          em, cand = zip(*nlargest(keep_top_n,
                                   zip(emission, cand_ids),
                                   key=itemgetter(0)))
          top_emissions.append(np.array(em))
          top_cands.append(cand)
        compatibilities = compatibilities_from_ids(entity_id_to_row,
                                                   desc_vs,
                                                   norm,
                                                   top_cands)
        top_1_idx = mp_shallow_tree_doc(top_emissions, compatibilities)
        # top_1_idx = [np.argmax(em) for em in top_emissions]
        top_1 = [cand_ids[idx]
                 for cand_ids, idx in zip(top_cands, top_1_idx)]
        for guess, label in zip(top_1, target):
          num_in_val += 1
          if guess == label:
            num_correct += 1
        # Also track accuracy on the harder subset: val mentions the emission-only
        # argmax got wrong.
        for idx in idxs_to_check:
          guess = top_1[idx]
          label = target[idx]
          num_in_val_small += 1
          if guess == label:
            num_correct_small += 1
      print(num_correct / num_in_val)
      print(num_correct_small / num_in_val_small)