def main():
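    # Train an XGBoost pairwise ranking model (rank:pairwise) on svmlight-format data with query ids.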
    p = get_cli_args(args)
    x_train, y_train, qid_train = load_svmlight_file(
        p.train.xgboost_train_path, query_id=True)  # pylint: disable=unbalanced-tuple-unpacking
    x_test, y_test, qid_test = load_svmlight_file(p.train.xgboost_test_path,
                                                  query_id=True)  # pylint: disable=unbalanced-tuple-unpacking
    x_train = x_train.todense()
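    # Append two ratio features: the last two columns divided by columns 2 and 4 respectively.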
    x_train = np.concatenate([
        x_train, x_train[:, -2] / x_train[:, 2], x_train[:, -1] / x_train[:, 4]
    ], 1)
    x_test = x_test.todense()
    x_test = np.concatenate(
        [x_test, x_test[:, -2] / x_test[:, 2], x_test[:, -1] / x_test[:, 4]],
        1)
    train_dmatrix = DMatrix(x_train, y_train)
    test_dmatrix = DMatrix(x_test, y_test)
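    # Tell XGBoost where each query's group of candidates starts by passing the run lengths of the query ids.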
    train_dmatrix.set_group([len(list(g)) for __, g in groupby(qid_train)])
    test_dmatrix.set_group([len(list(g)) for __, g in groupby(qid_test)])
    params = {
        'objective': 'rank:pairwise',
        'eval_metric': ['error', 'map@1'],
        'tree_method': 'exact',
        'eta': 0.1,
        'gamma': 1.0,
        'min_child_weight': 0.1,
        'max_depth': 6
    }
    xgb_model = xgb.train(params,
                          train_dmatrix,
                          num_boost_round=100,
                          evals=[(test_dmatrix, 'validation')])
    xgb_train_str = items_to_str(_.omit(params, 'objective',
                                        'eval_metric').items(),
                                 sort_by=itemgetter(0))
    xgb_model.save_model(xgb_train_str + '_model.xgb')
Example #2
def main():
    p = get_cli_args(args)
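    # Evaluate a trained mention-ranking model on the CoNLL/AIDA test split and record its misses.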
    conll_path = 'custom.tsv' if p.run.use_custom else './AIDA-YAGO2-dataset.tsv'
    num_correct = 0
    missed_idxs = []
    guessed_when_missed = []
    db_connection = get_connection(p.run.env_path)
    model = load_model(p.model, p.train)
    with open('./tokens.pkl', 'rb') as fh:
        token_idx_lookup = pickle.load(fh)
    with open('./glove_token_idx_lookup.pkl', 'rb') as fh:
        full_token_idx_lookup = pickle.load(fh)
    with open('./val_test_indices.json', 'r') as fh:
        val_indices, test_indices = json.load(fh)
    model.eval()
    with torch.no_grad():
        with db_connection.cursor() as cursor:
            dataset = SimpleCoNLLDataset(cursor, token_idx_lookup,
                                         full_token_idx_lookup, conll_path,
                                         p.run.lookups_path, p.run.idf_path,
                                         p.train.train_size,
                                         p.run.txt_dataset_path)
            conll_test_set = DataLoader(
                dataset,
                batch_sampler=BatchSampler(
                    SubsetSequentialSampler(test_indices), p.run.batch_size,
                    False),
                collate_fn=collate_simple_mention_ranker)
            ctr = count()
            for batch in progressbar(conll_test_set):
                (candidate_ids, features), target_rankings = batch
                target = [ranking[0] for ranking in target_rankings]
                candidate_scores = model(features)
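                # The scores come back flattened across the batch; slice them per mention using each candidate list's length.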
                top_1 = []
                offset = 0
                for ids in candidate_ids:
                    ranking_size = len(ids)
                    top_1.append(ids[torch.argmax(
                        candidate_scores[offset:offset +
                                         ranking_size]).item()])
                    offset += ranking_size
                for guess, label, ids, idx in zip(top_1, target, candidate_ids,
                                                  ctr):
                    if guess == label:
                        num_correct += 1
                    else:
                        missed_idxs.append(idx)
                        guessed_when_missed.append(guess)
            print(num_correct / next(ctr))
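            # Drop into a debugger to inspect the misses before they are written out.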
            import ipdb
            ipdb.set_trace()
            with open('./missed_idxs', 'w') as fh:
                fh.write('\n'.join(
                    [str((idx, dataset[idx])) for idx in missed_idxs]))
            with open('./guessed_when_missed', 'w') as fh:
                fh.write('\n'.join([str(idx) for idx in guessed_when_missed]))
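Example #3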
def main():
    p = get_cli_args(args)
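    # Train a LightGBM LambdaRank model, caching the constructed Datasets as binary files.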
    # The raw test features and labels are needed later for prediction, so load them up front.
    x_test, y_test, qid_test = load_svmlight_file(
        p.train.xgboost_test_path, query_id=True)  # pylint: disable=unbalanced-tuple-unpacking
    x_test = x_test.todense()
    try:
        # Reuse the cached binary datasets if they already exist.
        open('train.bin').close()
        open('eval.bin').close()
        lgb_train = lgb.Dataset('train.bin')
        lgb_eval = lgb.Dataset('eval.bin', reference=lgb_train)
    except FileNotFoundError:
        x_train, y_train, qid_train = load_svmlight_file(
            p.train.xgboost_train_path, query_id=True)  # pylint: disable=unbalanced-tuple-unpacking
        x_train = x_train.todense()
        lgb_train = lgb.Dataset(
            np.array(x_train),
            np.array(y_train.squeeze()),
            group=[len(list(g)) for __, g in groupby(qid_train)])
        lgb_eval = lgb.Dataset(
            np.array(x_test),
            np.array(y_test.squeeze()),
            reference=lgb_train,
            group=[len(list(g)) for __, g in groupby(qid_test)])
        lgb_train.save_binary("train.bin")
        lgb_eval.save_binary("eval.bin")

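    # LambdaRank training parameters; the model is evaluated with NDCG@1 on the validation set.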
    params = {
        'boosting_type': 'gbdt',
        'objective': 'lambdarank',
        'metric': {'ndcg'},
        'ndcg_eval_at': [1],
        'metric_freq': 1,
        'max_bin': 255,
        'num_trees': 100,
        'num_leaves': 100,
        'learning_rate': 0.1,
        'num_iterations': 100,
        'num_threads': 8,
        'feature_fraction': 1.0,
        'bagging_fraction': 0.9,
        'bagging_freq': 1,
        'verbose': 0,
    }
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=100,
                    valid_sets=lgb_eval)
    train_str = items_to_str(_.omit(params, 'objective', 'metric').items(),
                             sort_by=itemgetter(0))
    # Booster.predict expects raw feature rows rather than a Dataset object.
    preds = gbm.predict(np.array(x_test))
    print((y_test != preds).nonzero())
    gbm.save_model('model' + train_str + '.light')
Example #4
def main():
  p = get_cli_args(args)
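  # Write each (features, candidate ids, label) item of the chosen dataset to a text file for later use.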
  with open('./tokens.pkl', 'rb') as fh: token_idx_lookup = pickle.load(fh)
  with open('./glove_token_idx_lookup.pkl', 'rb') as fh: full_token_idx_lookup = pickle.load(fh)
  load_dotenv(dotenv_path=p.run.env_path)
  EL_DATABASE_NAME = os.getenv("DBNAME")
  DATABASE_USER = os.getenv("DBUSER")
  DATABASE_PASSWORD = os.getenv("DBPASS")
  DATABASE_HOST = os.getenv("DBHOST")
  with open(p.train.page_id_order_path, 'rb') as fh:
    page_id_order = pickle.load(fh)
  page_ids = page_id_order[:p.train.num_pages_to_use]
  connection = pymysql.connect(host=DATABASE_HOST,
                               user=DATABASE_USER,
                               password=DATABASE_PASSWORD,
                               db=EL_DATABASE_NAME,
                               charset='utf8mb4',
                               use_unicode=True,
                               cursorclass=pymysql.cursors.DictCursor)
  with connection.cursor() as cursor:
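    # Make sure both the client and the connection use utf8mb4.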
    cursor.execute("SET NAMES utf8mb4;")
    cursor.execute("SET CHARACTER SET utf8mb4;")
    cursor.execute("SET character_set_connection=utf8mb4;")

    if p.train.train_on_conll:
      conll_path = 'custom.tsv' if p.run.use_custom else './AIDA-YAGO2-dataset.tsv'
      dataset = SimpleCoNLLDataset(cursor,
                                   token_idx_lookup,
                                   full_token_idx_lookup,
                                   conll_path,
                                   p.run.lookups_path,
                                   p.run.idf_path,
                                   p.train.train_size)
    else:
      dataset = SimpleMentionDataset(cursor,
                                     token_idx_lookup,
                                     full_token_idx_lookup,
                                     page_ids,
                                     p.run.lookups_path,
                                     p.run.idf_path,
                                     p.train.train_size)
    train_str = '_'.join(['conll' if p.train.train_on_conll else 'wiki',
                          'custom' if p.run.use_custom else '',
                          str(p.train.num_pages_to_use)])
    with open('./4data_{}'.format(train_str), 'w') as fh:
      for item_num, item in progressbar(enumerate(dataset)):
        fh.write('{}\n'.format(str(item)))
        if item_num % 1000 == 0: fh.flush()
Example #5
def main():
    p = get_cli_args(args)
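    # Train a pyltr LambdaMART ranker and compare its MAP@1 against a random-ranking baseline.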
    x_train, y_train, qid_train = load_svmlight_file(
        p.train.xgboost_train_path, query_id=True)  # pylint: disable=unbalanced-tuple-unpacking
    x_test, y_test, qid_test = load_svmlight_file(p.train.xgboost_test_path,
                                                  query_id=True)  # pylint: disable=unbalanced-tuple-unpacking
    metric = pyltr.metrics.AP(k=1)
    model = pyltr.models.LambdaMART(
        metric=metric,
        n_estimators=1000,
        learning_rate=0.02,
        max_features=0.5,
        query_subsample=0.5,
        max_leaf_nodes=10,
        min_samples_leaf=64,
        verbose=1,
    )

    model.fit(x_train.todense(), y_train, qid_train)
    preds = model.predict(x_test.todense())
    print('Random ranking:', metric.calc_mean_random(qid_test, y_test))
    print('Our model:', metric.calc_mean(qid_test, y_test, preds))
    import ipdb
    ipdb.set_trace()
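Example #6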
def main():
    p = get_cli_args(args)
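    # Build a sparse entity-entity compatibility matrix from TF-IDF vectors of entity descriptions.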
    with open('./tokens.pkl', 'rb') as fh:
        token_idx_lookup = pickle.load(fh)
    load_dotenv(dotenv_path=p.run.env_path)
    EL_DATABASE_NAME = os.getenv("DBNAME")
    DATABASE_USER = os.getenv("DBUSER")
    DATABASE_PASSWORD = os.getenv("DBPASS")
    DATABASE_HOST = os.getenv("DBHOST")
    with open(p.train.page_id_order_path, 'rb') as fh:
        page_id_order = pickle.load(fh)
    page_ids = page_id_order[:p.train.num_pages_to_use]
    connection = pymysql.connect(host=DATABASE_HOST,
                                 user=DATABASE_USER,
                                 password=DATABASE_PASSWORD,
                                 db=EL_DATABASE_NAME,
                                 charset='utf8mb4',
                                 use_unicode=True,
                                 cursorclass=pymysql.cursors.DictCursor)
    with connection.cursor() as cursor:
        cursor.execute("SET NAMES utf8mb4;")
        cursor.execute("SET CHARACTER SET utf8mb4;")
        cursor.execute("SET character_set_connection=utf8mb4;")

        datasets = [
            MentionCoNLLDataset(cursor, './AIDA-YAGO2-dataset.tsv',
                                p.run.lookups_path, p.train.train_size),
            MentionWikiDataset(cursor, page_ids, p.run.lookups_path,
                               p.train.train_size)
        ]
        with open('./entity_to_row_id.pkl', 'rb') as fh:
            entity_id_to_row = pickle.load(fh)
        idf = get_idf(token_idx_lookup, p.run.idf_path)
        desc_fs_sparse = csr_matrix(load_npz('./desc_fs.npz'))
        desc_vs = csr_matrix(sparse_to_tfidf_vs(idf, desc_fs_sparse))
        norm = (desc_vs.multiply(desc_vs)).sum(1)
        all_e_id_pairs = set()
        data = []
        i = []
        j = []
        row_to_entity_id = _.invert(entity_id_to_row)
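        # For every mention's candidate set, score each candidate pair by the normalized dot product of their description vectors.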
        for dataset in datasets:
            for cands in progressbar(iter(dataset)):
                if cands is None: continue
                cand_rows = [
                    entity_id_to_row[e_id] for e_id in cands
                    if (e_id in entity_id_to_row)
                ]
                cand_mat = desc_vs[cand_rows]
                scores = cand_mat.dot(cand_mat.T) / norm[cand_rows]
                new_i = cand_rows * len(cand_rows)
                new_j = [
                    row_num for row_num in cand_rows
                    for __ in range(len(cand_rows))
                ]
                list_scores = np.array(scores).ravel().tolist()
                for res_i in range(len(list_scores)):
                    pair = (row_to_entity_id[min(new_i[res_i], new_j[res_i])],
                            row_to_entity_id[max(new_i[res_i], new_j[res_i])])
                    if pair not in all_e_id_pairs:
                        all_e_id_pairs.add(pair)
                        data.append(list_scores[res_i])
                        i.append(new_i[res_i])
                        j.append(new_j[res_i])
        mat = csr_matrix(coo_matrix((data, (i, j))))
        train_str = 'wiki+conll_' + '_'.join([str(p.train.num_pages_to_use)])
        save_npz('compats_{}.npz'.format(train_str), mat)
Example #7
def main():
    p = get_cli_args(args)
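    # Build (target, candidate) feature pairs for pairwise training and pickle them in parts of 100k items.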
    with open('./tokens.pkl', 'rb') as fh:
        token_idx_lookup = pickle.load(fh)
    with open('./glove_token_idx_lookup.pkl', 'rb') as fh:
        full_token_idx_lookup = pickle.load(fh)
    load_dotenv(dotenv_path=p.run.env_path)
    EL_DATABASE_NAME = os.getenv("DBNAME")
    DATABASE_USER = os.getenv("DBUSER")
    DATABASE_PASSWORD = os.getenv("DBPASS")
    DATABASE_HOST = os.getenv("DBHOST")
    with open(p.train.page_id_order_path, 'rb') as fh:
        page_id_order = pickle.load(fh)
    page_ids = page_id_order[:p.train.num_pages_to_use]
    connection = pymysql.connect(host=DATABASE_HOST,
                                 user=DATABASE_USER,
                                 password=DATABASE_PASSWORD,
                                 db=EL_DATABASE_NAME,
                                 charset='utf8mb4',
                                 use_unicode=True,
                                 cursorclass=pymysql.cursors.DictCursor)
    with connection.cursor() as cursor:
        cursor.execute("SET NAMES utf8mb4;")
        cursor.execute("SET CHARACTER SET utf8mb4;")
        cursor.execute("SET character_set_connection=utf8mb4;")

        if p.train.train_on_conll:
            conll_path = 'custom.tsv' if p.run.use_custom else './AIDA-YAGO2-dataset.tsv'
            dataset = SimpleCoNLLDataset(
                cursor,
                token_idx_lookup,
                full_token_idx_lookup,
                conll_path,
                p.run.lookups_path,
                p.run.idf_path,
                p.train.train_size,
                txt_dataset_path=p.run.txt_dataset_path)
        else:
            dataset = SimpleMentionDataset(
                cursor,
                token_idx_lookup,
                full_token_idx_lookup,
                page_ids,
                p.run.lookups_path,
                p.run.idf_path,
                p.train.train_size,
                txt_dataset_path=p.run.txt_dataset_path)
        train_str = '_'.join([
            'conll' if p.train.train_on_conll else 'wiki',
            'custom' if p.run.use_custom else '',
            str(p.train.num_pages_to_use)
        ])
        part_num = 0
        data = []
        fh = open('./4prep_{}_part_{}.pkl'.format(train_str, part_num), 'wb')
        for item_num, item in progressbar(enumerate(dataset)):
            if (item_num != 0) and item_num % 100000 == 0:
                pickle.dump(data, fh)
                data = []
                fh.close()
                part_num += 1
                fh = open('./4prep_{}_part_{}.pkl'.format(train_str, part_num),
                          'wb')
            all_target_features = []
            all_candidate_features = []
            pair_ids = []
            mention_features, mention_candidate_ids, label = item
            target_idx = mention_candidate_ids.index(label)
            target_features = mention_features[target_idx]
            for candidate_features, candidate_id in zip(
                    mention_features, mention_candidate_ids):
                if candidate_id != label:
                    all_target_features.append(target_features)
                    all_candidate_features.append(candidate_features)
                    pair_ids.append((label, candidate_id))
            features = (all_target_features, all_candidate_features)
            data.append(features)
        pickle.dump(data, fh)
        fh.close()
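Example #8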
def main():
  global best_options, best_performances
  p = get_cli_args(args)
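  # Search over dropout, margin, hinge vs. BCE loss, hidden sizes, and learning rate, keeping the best models by validation accuracy.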
  arg_options = [
    {'path': ['train', 'dropout_keep_prob'],
     'options': [1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3]},
    {'path': ['train', 'margin'],
     'if': lambda params: _.get(thaw(params), ['train', 'use_hinge']),
     'options': [10, 100, 1000, 10000]},
    # {'path': ['train', 'stop_by'],
    #  'options': ['acc', 'bce_loss']},
    {'path': ['train', 'use_hinge'],
     'options': [False, True]},
    # {'path': ['train', 'stop_after_n_bad_epochs'],
    #  'options': [1, 2]},
    {'path': ['model', 'hidden_sizes'],
     'options': [[100]]},
    {'path': ['train', 'learning_rate'],
     'options': [0.1, 1e-2, 1e-3, 1e-4, 1e-5]},
  ]
  with open('./tokens.pkl', 'rb') as fh: token_idx_lookup = pickle.load(fh)
  with open('./glove_token_idx_lookup.pkl', 'rb') as fh: full_token_idx_lookup = pickle.load(fh)
  load_dotenv(dotenv_path=p.run.env_path)
  EL_DATABASE_NAME = os.getenv("DBNAME")
  DATABASE_USER = os.getenv("DBUSER")
  DATABASE_PASSWORD = os.getenv("DBPASS")
  DATABASE_HOST = os.getenv("DBHOST")
  with open(p.train.page_id_order_path, 'rb') as fh:
    page_id_order = pickle.load(fh)
  page_ids = page_id_order[:p.train.num_pages_to_use]
  connection = pymysql.connect(host=DATABASE_HOST,
                               user=DATABASE_USER,
                               password=DATABASE_PASSWORD,
                               db=EL_DATABASE_NAME,
                               charset='utf8mb4',
                               use_unicode=True,
                               cursorclass=pymysql.cursors.DictCursor)
  with connection.cursor() as cursor:
    cursor.execute("SET NAMES utf8mb4;")
    cursor.execute("SET CHARACTER SET utf8mb4;")
    cursor.execute("SET character_set_connection=utf8mb4;")
    conll_path = 'custom.tsv' if p.run.use_custom else './AIDA-YAGO2-dataset.tsv'
    test_dataset = SimpleCoNLLDataset(cursor,
                                      token_idx_lookup,
                                      full_token_idx_lookup,
                                      conll_path,
                                      p.run.lookups_path,
                                      p.run.idf_path,
                                      p.train.train_size,
                                      p.run.val_txt_dataset_path)
    try:
      with open('./val_test_indices.json', 'r') as fh:
        val_indices, test_indices = json.load(fh)
    except FileNotFoundError:
      with open('./val_test_indices.json', 'w') as fh:
        permutation = list(range(len(test_dataset))); shuffle(permutation)
        split_idx = int(len(test_dataset) * 0.5)
        val_indices, test_indices = permutation[:split_idx], permutation[split_idx:]
        json.dump((val_indices, test_indices), fh)
    val_dataloader = DataLoader(test_dataset,
                                batch_sampler=BatchSampler(SubsetSequentialSampler(val_indices),
                                                           p.run.batch_size,
                                                           False),
                                collate_fn=collate_simple_mention_ranker)
    if p.run.pkl_dataset_prefix:
      def _col(batch):
        features, labels = zip(*batch)
        target, cand = zip(*features)
        target = torch.tensor(target)
        cand = torch.tensor(cand)
        labels = torch.tensor(labels)
        return (target, cand), labels
      collate_fn = _col
    else:
      collate_fn = collate_simple_mention_pairwise if p.train.use_pairwise else collate_simple_mention_pointwise
    if p.train.train_on_conll:
      conll_path = 'custom.tsv' if p.run.use_custom else './AIDA-YAGO2-dataset.tsv'
      dataset = SimpleCoNLLDataset(cursor,
                                   token_idx_lookup,
                                   full_token_idx_lookup,
                                   conll_path,
                                   p.run.lookups_path,
                                   p.run.idf_path,
                                   p.train.train_size,
                                   txt_dataset_path=p.run.txt_dataset_path)
    else:
      dataset = SimpleMentionDataset(cursor,
                                     token_idx_lookup,
                                     full_token_idx_lookup,
                                     page_ids,
                                     p.run.lookups_path,
                                     p.run.idf_path,
                                     p.train.train_size,
                                     txt_dataset_path=p.run.txt_dataset_path,
                                     pkl_dataset_prefix=p.run.pkl_dataset_prefix)
    with open(p.run.perf_path, 'w') as fh:
      for cand_p, new_options in progressbar(hparam_search(p, arg_options, rand_p=False)):
        fh.write(str(thaw(new_options)) + '\n')
        fh.flush()
        if p.train.use_sequential_sampler:
          bs = BatchSampler(FixLenSequentialSampler(2140542), p.train.batch_size, False)
        else:
          if p.run.pkl_dataset_prefix is not None:
            sampler = ChunkedRandomSampler(2140542, 100000)
          else:
            sampler = RandomSampler(dataset)
          bs = BatchSampler(sampler, p.train.batch_size, False)
        dataloader = DataLoader(dataset,
                                batch_sampler=bs,
                                collate_fn=collate_fn)
        model = get_model(cand_p.model, cand_p.train)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = model.to(device)
        optimizer = optim.Adam(model.parameters(), cand_p.train.learning_rate)
        if cand_p.train.use_hinge:
          calc_loss = nn.MarginRankingLoss(cand_p.train.margin)
        else:
          calc_loss = nn.BCEWithLogitsLoss()
        models_by_epoch = []
        model_performances = []
        did_early_stop = False
        for epoch_num in range(cand_p.train.max_num_epochs):
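          # Each epoch: evaluate on the validation split first, check for early stopping, then train on the batches.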
          epoch_loss = 0
          get_stop_by_val = itemgetter(cand_p.train.stop_by)
          neg_is_bad = cand_p.train.stop_by in ['acc']
          performance = eval_model(val_dataloader, model, device)
          model_performances.append(performance)
          fh.write(str(performance) + '\n')
          fh.flush()
          # Snapshot the current weights (assumes `import copy`); appending the
          # live model would leave every entry aliasing the same trained object.
          models_by_epoch.append(copy.deepcopy(model))
          if len(model_performances) >= cand_p.train.stop_after_n_bad_epochs + 1:
            stop_by_perfs = [get_stop_by_val(perf) for perf in model_performances]
            bad_epochs = [diff < 0 if neg_is_bad else diff > 0
                          for diff in np.diff(stop_by_perfs)]
            if all(bad_epochs[-cand_p.train.stop_after_n_bad_epochs:]):
              idx = np.searchsorted(best_performances, performance['acc'])
              best_options.insert(idx, thaw(cand_p))
              best_performances.insert(idx, performance['acc'])
              choose_model(cand_p,
                           models_by_epoch[-cand_p.train.stop_after_n_bad_epochs - 1])
              did_early_stop = True
              break
          for batch_num, batch in enumerate(dataloader):
            model.train()
            optimizer.zero_grad()
            if cand_p.train.use_pairwise:
              features, labels = batch
              features = [elem.to(device) for elem in features]
              labels = labels.to(device)
              target_features, candidate_features = features
              target_scores = model(target_features)
              candidate_scores = model(candidate_features)
              scores = candidate_scores - target_scores
            else:
              batch = [elem.to(device) for elem in batch]
              features, labels = batch
              scores = model(features)
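            # Note: the hinge loss below assumes the pairwise branch ran, since it uses the target/candidate scores.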
            if cand_p.train.use_hinge:
              loss = calc_loss(target_scores, candidate_scores, torch.ones_like(labels))
            else:
              loss = calc_loss(scores, labels)
            epoch_loss += loss.item()
            loss.backward()
            optimizer.step()
          print('epoch loss', epoch_loss / len(dataloader))
        if not did_early_stop:
          idx = np.searchsorted(best_performances, performance['acc'])
          best_options.insert(idx, thaw(cand_p))
          best_performances.insert(idx, performance['acc'])
          choose_model(cand_p, model)
        print('best', list(zip(best_options[-10:],
                               best_performances[-10:])))
Example #9
def main():
    p = get_cli_args(args)
    conll_path = 'custom.tsv' if p.run.use_custom else './AIDA-YAGO2-dataset.tsv'
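    # Evaluate the mention ranker with document-level reranking (mp_shallow_tree_doc) over entity compatibility scores.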
    db_connection = get_connection(p.run.env_path)
    model = load_model(p.model, p.train)
    with open('./tokens.pkl', 'rb') as fh:
        token_idx_lookup = pickle.load(fh)
    with open('./glove_token_idx_lookup.pkl', 'rb') as fh:
        full_token_idx_lookup = pickle.load(fh)
    with open('./val_test_indices.json', 'r') as fh:
        val_indices, test_indices = json.load(fh)
    model.eval()
    with torch.no_grad():
        with db_connection.cursor() as cursor:
            doc_id_dataset = MentionCoNLLDataset(cursor,
                                                 './AIDA-YAGO2-dataset.tsv',
                                                 p.run.lookups_path,
                                                 p.train.train_size)
            dataset = SimpleCoNLLDataset(cursor, token_idx_lookup,
                                         full_token_idx_lookup, conll_path,
                                         p.run.lookups_path, p.run.idf_path,
                                         p.train.train_size,
                                         p.run.txt_dataset_path)
            # compats = load_npz('compats_wiki+conll_100000.npz')
            with open('./entity_to_row_id.pkl', 'rb') as fh:
                entity_id_to_row = pickle.load(fh)
            idf = get_idf(token_idx_lookup, p.run.idf_path)
            desc_fs_sparse = csr_matrix(load_npz('./desc_fs.npz'))
            desc_vs = csr_matrix(sparse_to_tfidf_vs(idf, desc_fs_sparse))
            norm = np.sqrt((desc_vs.multiply(desc_vs)).sum(1))
            ctr = count()
            num_correct = 0
            num_in_val = 0
            num_correct_small = 0
            num_in_val_small = 0
            grouped = groupby(
                ((dataset[idx], doc_id_dataset.mention_doc_id[idx])
                 for idx in range(len(val_indices) + len(test_indices))),
                key=itemgetter(1))
            batches = [
                collate_simple_mention_ranker([data for data, doc_id in g])
                for doc_id, g in grouped
            ]
            val_indices = set(val_indices)
            for document_batch in progressbar(batches):
                (candidate_ids, features), target_rankings = document_batch
                target = [ranking[0] for ranking in target_rankings]
                candidate_scores = model(features)
                emissions = emissions_from_flat_scores(
                    [len(ids) for ids in candidate_ids], candidate_scores)
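                # Keep only the top few candidates per mention before computing pairwise compatibilities.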
                keep_top_n = 5
                top_emissions = []
                top_cands = []
                idxs_to_check = []
                for i, (emission, cand_ids, label, idx) in enumerate(
                        zip(emissions, candidate_ids, target, ctr)):
                    if len(cand_ids) > 1:
                        if cand_ids[np.argmax(emission)] != label:
                            if idx in val_indices:
                                idxs_to_check.append(i)

                    em, cand = zip(*nlargest(keep_top_n,
                                             zip(emission, cand_ids),
                                             key=itemgetter(0)))
                    top_emissions.append(np.array(em))
                    top_cands.append(cand)
                compatibilities = compatibilities_from_ids(
                    entity_id_to_row, desc_vs, norm, top_cands)
                top_1_idx = mp_shallow_tree_doc(top_emissions, compatibilities)
                # top_1_idx = [np.argmax(em) for em in top_emissions]
                top_1 = [
                    cand_ids[idx]
                    for cand_ids, idx in zip(top_cands, top_1_idx)
                ]
                for guess, label in zip(top_1, target):
                    num_in_val += 1
                    if guess == label: num_correct += 1
                for idx in idxs_to_check:
                    guess = top_1[idx]
                    label = target[idx]
                    num_in_val_small += 1
                    if guess == label: num_correct_small += 1
            print(num_correct / num_in_val)
            print(num_correct_small / num_in_val_small)