Example #1
def build(use_less_memory: bool):
    """Build the corpus of TREC documents files asynchronously from the HotpotQA raw wiki data.

    Expects the uncompressed HotpotQA raw wiki data available in the ``./data/raw`` folder. Folders are processed in
    order. Resulting documents are collected and sorted according to their ids. Each persisted file carries
    ``CHUNK_SIZE`` documents and is named as ``{first_doc_id_in_file}@{last_doc_id_in_file}``.

    If asked to use less memory, it will defer persistence to the child processes.

    :param use_less_memory: Whether to use less memory by skipping the global sort and persisting each folder's
        documents under a file named after that folder.
    :return: None.
    """
    global USE_LESS_MEMORY
    USE_LESS_MEMORY = use_less_memory

    assert os.path.exists(
        RAW_DATA_DIR
    ), f'Cannot find raw data in {os.path.abspath(RAW_DATA_DIR)}'
    os.makedirs(os.path.abspath(TREC_CORPUS_DIR), exist_ok=True)

    folder_paths = sorted(glob(os.path.join(RAW_DATA_DIR, '*')))
    doc_triples = []

    # create document database
    helpers.log('Creating documents database.')
    db = sqlite3.connect(DOCUMENT_DB)
    cursor: sqlite3.Cursor = db.cursor()
    cursor.execute(
        "CREATE TABLE IF NOT EXISTS documents (id INTEGER PRIMARY KEY, text TEXT)"
    )
    db.commit()
    cursor.close()
    db.close()

    helpers.log('Extracting TREC documents.')
    if USE_LESS_MEMORY:
        for _ in parallel.execute(_process_raw_data_folder, folder_paths):
            pass
        logging.info(f'[{datetime.now()}]\t[{os.getpid()}]\tExtraction done.')
    else:
        for doc_triples_by_folder in parallel.execute(_process_raw_data_folder,
                                                      folder_paths):
            doc_triples.extend(doc_triples_by_folder)
        # sort once by document id after all folders have been collected
        doc_triples.sort(key=lambda triple: triple[0])
        logging.info(f'[{datetime.now()}]\t[{os.getpid()}]\tExtraction done.')

        logging.info(
            f'[{datetime.now()}]\t[{os.getpid()}]\tPersisting TREC documents.')
        for _ in parallel.execute(_process_doc_triples,
                                  parallel.chunk(100000, doc_triples)):
            pass
        logging.info(f'[{datetime.now()}]\t[{os.getpid()}]\tPersistence done.')
    logging.info(
        f'[{datetime.now()}]\t[{os.getpid()}]\tFinished building TREC corpus.')

    return
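Every example on this page relies on the project's small ``parallel`` helper module, which is not reproduced here. The following is only a hypothetical sketch of what ``parallel.chunk`` and ``parallel.execute`` might look like, with names and signatures inferred from the call sites in these examples; the actual implementation may differ.

import itertools
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from typing import Any, Callable, Iterable, Iterator, List


def chunk(size: int, iterable: Iterable[Any]) -> Iterator[List[Any]]:
    """Yield successive lists of at most ``size`` items from ``iterable``."""
    iterator = iter(iterable)
    while True:
        batch = list(itertools.islice(iterator, size))
        if not batch:
            return
        yield batch


def execute(fn: Callable[[Any], Any], iterable: Iterable[Any], _as: str = 'process') -> Iterator[Any]:
    """Map ``fn`` over ``iterable`` with a pool of workers and yield the results in order."""
    pool_cls = ProcessPoolExecutor if _as == 'process' else ThreadPoolExecutor
    with pool_cls() as pool:
        yield from pool.map(fn, iterable)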
Example #2
def build():
    global INDEX
    INDEX = Index(env='default')
    batches = parallel.chunk(CHUNK_SIZE, INDEX.document_int_ids())
    helpers.log(f'Building maps for {INDEX.count()} documents.')
    int2wid = {}
    wid2int = {}
    for batch_maps in parallel.execute(_process_batch, batches):
        batch_int2wid, batch_wid2int = batch_maps
        int2wid.update(batch_int2wid)
        wid2int.update(batch_wid2int)

    with open(TOKEN2ID, 'wb') as file:
        pickle.dump(INDEX.token2id, file)
    with open(ID2TOKEN, 'wb') as file:
        pickle.dump(INDEX.id2token, file)
    with open(ID2DF, 'wb') as file:
        pickle.dump(INDEX.id2df, file)
    with open(ID2TF, 'wb') as file:
        pickle.dump(INDEX.id2tf, file)
    with open(INT2WID, 'wb') as file:
        pickle.dump(int2wid, file)
    with open(WID2INT, 'wb') as file:
        pickle.dump(wid2int, file)

    helpers.log(f'Finished building maps. Mapped {len(int2wid)}/{INDEX.index.document_count()}')
Example #3
def build() -> None:
    helpers.log('Building Trec Eval references.')
    with open(ct.TRAIN_HOTPOT_SET, 'r') as file:
        question_set = json.load(file)
        dev_question_set = question_set[ct.TRAIN_DEV_SPLIT:]
        train_question_set = question_set[:ct.TRAIN_DEV_SPLIT]
    with open(ct.DEV_HOTPOT_SET, 'r') as file:
        test_question_set = json.load(file)
    iterator = [('train', train_question_set), ('dev', dev_question_set),
                ('test', test_question_set)]
    for _set, reference in parallel.execute(_build_reference, iterator):
        helpers.log(f'Created reference {reference} for {_set} set.')
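``_build_reference`` is not part of the snippet above. For orientation, the references it creates are standard TREC qrels files, the format consumed by pytrec_eval: one judgement per line as ``<query_id> <iteration> <doc_id> <relevance>``. A hypothetical writer could look like this:

from typing import Iterable, Tuple


def write_qrels(path: str, judgements: Iterable[Tuple[str, str, int]]) -> None:
    """Write (query_id, doc_id, relevance) triples in the standard TREC qrels format."""
    with open(path, 'w') as file:
        for query_id, doc_id, relevance in judgements:
            file.write(f'{query_id} 0 {doc_id} {relevance}\n')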
Example #4
def _evaluate_epoch(model: nn.Module, ref: str, data_loader: DataLoader,
                    trec_eval: str, trec_eval_agg: str, save: bool) -> METRICS:
    model.eval()
    epoch_run = Run()
    epoch_eval = Evaluator(ref, measures=pytrec_eval.supported_measures)
    acc = 0

    final_scores = torch.empty((len(data_loader.dataset), 1),
                               dtype=torch.float)
    question_ids = []
    document_ids = []
    with torch.no_grad():
        for idx, batch in enumerate(data_loader):
            (questions, documents, features, targets, batch_question_ids,
             batch_document_ids) = batch
            questions = questions.to(device=ct.DEVICE, non_blocking=True)
            documents = documents.to(device=ct.DEVICE, non_blocking=True)
            features = features.to(device=ct.DEVICE, non_blocking=True)
            targets = targets.to(device=ct.DEVICE, non_blocking=True)

            batch_size = questions.shape[0]
            scores, encodings = model(questions, documents, features)
            acc += torch.sum(
                (torch.round(scores) == targets).to(dtype=torch.float))

            question_ids.extend(batch_question_ids)
            document_ids.extend(batch_document_ids)
            if batch_size == ct.BATCH_SIZE:
                final_scores[idx * ct.BATCH_SIZE:(idx + 1) *
                             ct.BATCH_SIZE] = scores
            else:
                final_scores[idx * ct.BATCH_SIZE:] = scores

    for batch_run in parallel.execute(
            _build_run,
            parallel.chunk(
                10000, zip(question_ids, document_ids, final_scores.numpy()))):
        epoch_run.update_rankings(batch_run)

    acc = acc / len(data_loader.dataset)
    _, trec_eval_agg = epoch_eval.evaluate(epoch_run, trec_eval, trec_eval_agg,
                                           save)

    return (epoch_run, acc.item(),
            trec_eval_agg['map_cut_10'], trec_eval_agg['ndcg_cut_10'], trec_eval_agg['recall_10'],
            trec_eval_agg['map_cut_100'], trec_eval_agg['ndcg_cut_100'], trec_eval_agg['recall_100'],
            trec_eval_agg['map_cut_1000'], trec_eval_agg['ndcg_cut_1000'], trec_eval_agg['recall_1000'],
            trec_eval_agg['P_5'])
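The ``Run`` and ``Evaluator`` classes used above are project-specific wrappers that are not shown on this page. They are assumed to sit on top of pytrec_eval, whose core API looks roughly like this minimal sketch (the qrels and run dictionaries are toy values):

import pytrec_eval

qrels = {'q1': {'d1': 1, 'd2': 0}}    # relevance judgements per query
run = {'q1': {'d1': 0.9, 'd2': 0.4}}  # model scores per query
evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'map_cut', 'ndcg_cut', 'recall'})
per_query = evaluator.evaluate(run)
print(per_query['q1']['ndcg_cut_10'], per_query['q1']['recall_10'])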
Example #5
def process():
    """Filter the collection of 5 million documents down to at most the top 5000 per question, according to
    bigram/unigram filtering. Processed in parallel."""
    global_start = datetime.now()
    global INDEX
    INDEX = Index(env='tfidf')
    os.makedirs(DIR_NAME)
    batches, no_batches, no_queries = retrieve.load_dataset_batches()
    total_retrieved = 0
    retrieve.create_retrieval_db(DB_NAME)

    helpers.log(f'Retrieving documents. Workers: {os.cpu_count()}')
    start = datetime.now()
    for batch_retrieval in parallel.execute(_process_question_batch, batches):
        total_retrieved += batch_retrieval
    end = datetime.now()
    helpers.log(f'Finished retrieval in {end - start}. Filtered {total_retrieved}/{no_queries}')

    global_end = datetime.now()
    helpers.log(f'Finished process in {global_end - global_start}.')
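``_process_question_batch`` is not included above. The bigram/unigram filtering the docstring refers to can be pictured roughly as the hypothetical helper below, which ranks candidate documents by n-gram overlap with the question and keeps at most the top ``k``:

import heapq
from typing import Dict, List, Tuple


def _ngrams(tokens: List[str], n: int) -> set:
    """Return the set of n-grams over ``tokens``."""
    return {tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)}


def filter_top_k(question_tokens: List[str], docs: Dict[int, List[str]], k: int = 5000) -> List[Tuple[int, int]]:
    """Keep the ``k`` documents with the largest unigram plus bigram overlap with the question."""
    q_uni, q_bi = set(question_tokens), _ngrams(question_tokens, 2)
    scored = []
    for doc_id, doc_tokens in docs.items():
        overlap = len(q_uni & set(doc_tokens)) + len(q_bi & _ngrams(doc_tokens, 2))
        scored.append((overlap, doc_id))
    return [(doc_id, score) for score, doc_id in heapq.nlargest(k, scored)]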
Example #6
def build(skip_relevant: bool = True):
    global INDEX, COLUMNS, QUESTION_COUNTS
    global SKIP_RELEVANT
    SKIP_RELEVANT = skip_relevant
    INDEX = Index('tfidf')
    helpers.log('Loaded index.')

    os.makedirs(ct.CANDIDATES_DIR, exist_ok=True)
    with open(ct.TRAIN_HOTPOT_SET, 'r') as file:
        question_set = json.load(file)
        train_question_set = question_set[:ct.TRAIN_DEV_SPLIT]
        dev_question_set = question_set[ct.TRAIN_DEV_SPLIT:]
    with open(ct.DEV_HOTPOT_SET, 'r') as file:
        test_question_set = json.load(file)

    iterator: List[Tuple[list, str, str, int]] = [
        # (train_question_set, 'train', ct.TRAIN_CANDIDATES_DB, ct.CANDIDATES_CHUNK),
        # (dev_question_set, 'dev', ct.DEV_CANDIDATES_DB, ct.CANDIDATES_CHUNK),
        (test_question_set, 'test', ct.TEST_CANDIDATES_DB, ct.CANDIDATES_CHUNK)
    ]

    for (_set, split, candidate_db_path, chunk) in iterator:
        start = datetime.now()

        db = sqlite3.connect(candidate_db_path)
        cursor = db.cursor()
        cursor.execute(sql.create_candidate_table)
        db.commit()
        helpers.log('Created candidates table.')

        QUESTION_COUNTS = cursor.execute(sql.count_question_rows).fetchall()
        QUESTION_COUNTS = {json.loads(_id): _count for (_id, _count) in QUESTION_COUNTS}
        helpers.log(f'Retrieved question counts for {len(QUESTION_COUNTS)} questions.')
        cursor.close()
        db.close()

        helpers.log(f'Creating {split} candidate set with {len(_set)} questions.')
        total_count = 0
        _set_generator = parallel.chunk(chunk, zip([split] * len(_set), _set))
        for batch_count in parallel.execute(_build_candidates, _set_generator):
            total_count += batch_count
        helpers.log(f'Created {split} candidate set with {total_count} questions in {datetime.now() - start}.')
Example #7
    def __init__(self, normalized: bool):
        super().__init__(None)

        self.normalized = normalized

        if os.path.isfile(constants.IBM_MODEL):
            with open(constants.IBM_MODEL, 'rb') as file:
                self.ibm1 = pickle.load(file)
        else:
            dataset = Dataset.from_file(constants.TRAIN_HOTPOT_SET)

            bitext = []
            batches = parallel.chunk(constants.CHUNK_SIZE, dataset.questions)
            # for partial_bitext in map(_build_bitext, batches):
            for partial_bitext in parallel.execute(_build_bitext, batches):
                bitext.extend(partial_bitext)

            self.ibm1 = nltk.IBMModel1(bitext, 5)

            os.makedirs(constants.TRANSLATION_MODEL_DIR, exist_ok=True)
            with open(constants.IBM_MODEL, 'wb') as file:
                pickle.dump(self.ibm1, file)
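``_build_bitext`` is not shown; it is assumed to yield NLTK ``AlignedSent`` pairs, since that is the input ``IBMModel1`` expects. A toy illustration (the actual pairing of questions to passages in the project may differ):

from nltk.translate import AlignedSent, IBMModel1

# Each AlignedSent pairs the tokens of one sentence with the tokens of its counterpart.
bitext = [
    AlignedSent(['which', 'team', 'won', 'the', 'cup'],
                ['the', 'home', 'team', 'won', 'the', 'cup']),
]
ibm1 = IBMModel1(bitext, 5)  # 5 EM iterations, as in the snippet above
print(ibm1.translation_table['cup']['cup'])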
Example #8
def build():
    assert os.path.exists(
        RAW_DATA_DIR
    ), f'Cannot find raw data in {os.path.abspath(RAW_DATA_DIR)}'
    os.makedirs(os.path.abspath(INDEX_DIR), exist_ok=True)

    folder_paths = sorted(glob(os.path.join(RAW_DATA_DIR, '*')))
    title2wid = {}
    wid2title = {}
    logging.info(f'[{datetime.now()}]\t[{os.getpid()}]\tBuilding title maps.')
    for group_title2wid in parallel.execute(_build_group_title_map,
                                            folder_paths):
        for (title, wid) in group_title2wid.items():
            if title2wid.get(title, None) is None:
                title2wid[title] = wid
                wid2title[wid] = title
            else:
                # Hack to handle a duplicate title that points both to a proper article and to a disambiguation
                # article. Assumes the only article of interest is the non-disambiguation one.
                helpers.log(
                    f'Title {title} has the WID {title2wid.get(title)}. Current WID: {wid}.'
                )
                if wid == 2209045:
                    helpers.log(
                        f'Replacing WID {title2wid.get(title)} with WID {wid}.'
                    )
                    title2wid[title] = wid
                    wid2title[wid] = title

    with open(WID2TITLE, 'wb') as file:
        pickle.dump(wid2title, file)
    with open(TITLE2WID, 'wb') as file:
        pickle.dump(title2wid, file)
    logging.info(f'[{datetime.now()}]\t[{os.getpid()}]\tBuilt title maps.')

    return
Example #9
def evaluate_test_set(model_name: str, output_dir: str):
    os.makedirs('./evaluation', exist_ok=True)

    config = models[model_name]
    query_encoder = config.query_encoder(config.embedding_dim)
    document_encoder = config.document_encoder(config.embedding_dim)
    scorer = config.scorer(**config.scorer_kwargs)
    model = config.ranker(query_encoder, document_encoder,
                          scorer).to(device=ct.DEVICE)
    optimizer = config.optimizer(model.parameters(), **config.optimizer_kwargs)

    _load_checkpoint(model, optimizer, config)

    dataset = QueryDocumentsDataset(ct.TEST_FEATURES_DB)
    data_loader = DataLoader(dataset,
                             ct.BATCH_SIZE,
                             False,
                             collate_fn=QueryDocumentsDataset.collate,
                             num_workers=os.cpu_count(),
                             pin_memory=True)

    model.eval()
    epoch_run = Run()
    epoch_eval = Evaluator(ct.TEST_TREC_REFERENCE,
                           measures=pytrec_eval.supported_measures)

    final_scores = torch.empty((len(data_loader.dataset), 1),
                               dtype=torch.float)
    question_ids = []
    document_ids = []
    with torch.no_grad():
        for idx, batch in tqdm(enumerate(data_loader)):
            (questions, documents, features, targets, batch_question_ids,
             batch_document_ids) = batch
            questions = questions.to(device=ct.DEVICE, non_blocking=True)
            documents = documents.to(device=ct.DEVICE, non_blocking=True)
            features = features.to(device=ct.DEVICE, non_blocking=True)

            batch_size = questions.shape[0]
            scores, encodings = model(questions, documents, features)

            question_ids.extend(batch_question_ids)
            document_ids.extend(batch_document_ids)
            if batch_size == ct.BATCH_SIZE:
                final_scores[idx * ct.BATCH_SIZE:(idx + 1) *
                             ct.BATCH_SIZE] = scores
            else:
                final_scores[idx * ct.BATCH_SIZE:] = scores

    for batch_run in parallel.execute(
            _build_run,
            parallel.chunk(
                10000, zip(question_ids, document_ids, final_scores.numpy()))):
        epoch_run.update_rankings(batch_run)

    trec_eval, trec_eval_agg = epoch_eval.evaluate(epoch_run, save=False)

    os.makedirs(output_dir, exist_ok=True)
    with open(os.path.join(output_dir, model_name + '_hotpot.json'),
              'w') as file:
        # Use DEV_HOTPOT_SET because it corresponds to our test set; the actual HotpotQA test set is unlabeled.
        json.dump(epoch_run.to_json(ct.TEST_FEATURES_DB, ct.DEV_HOTPOT_SET),
                  file,
                  indent=True)

    print(json.dumps(trec_eval_agg, indent=True))
Example #10
def build():
    assert constants.TRAIN_FEATURES_CHUNK > 1
    assert constants.DEV_FEATURES_CHUNK > 1

    global INDEX
    INDEX = Index('tfidf')
    helpers.log('Loaded index.')

    global EXTRACTORS
    EXTRACTORS = []
    if 'entity' in constants.FEATURE_EXTRACTORS:
        EXTRACTORS.append(EntityExtractor(INDEX))
    if 'ibm1' in constants.FEATURE_EXTRACTORS:
        EXTRACTORS.append(IBM1FeatureExtractor(normalized=False))
    if 'nibm1' in constants.FEATURE_EXTRACTORS:
        EXTRACTORS.append(IBM1FeatureExtractor(normalized=True))
    if 'bigram' in constants.FEATURE_EXTRACTORS:
        EXTRACTORS.append(BigramOverlapFeatureExtractor(normalized=False))
    if 'nbigram' in constants.FEATURE_EXTRACTORS:
        EXTRACTORS.append(BigramOverlapFeatureExtractor(normalized=True))
    if 'qword' in constants.FEATURE_EXTRACTORS:
        EXTRACTORS.append(QuestionWordFeatureExtractor())
    if 'doclen' in constants.FEATURE_EXTRACTORS:
        EXTRACTORS.append(DocumentLengthFeatureExtractor())
    helpers.log('Loaded extractors.')

    global COLUMNS
    COLUMNS = copy.copy(constants.FEATURE_BASE_COLUMN_NAMES)
    COLUMNS.extend(feature for extractor in EXTRACTORS for feature in extractor.feature_name)
    COLUMNS.append(constants.FEATURE_TARGET_COLUMN_NAME)
    helpers.log('Loaded column names.')

    os.makedirs(constants.FEATURES_DIR, exist_ok=True)
    iterator: List[Tuple[str, str, int]] = [
        # (constants.TRAIN_CANDIDATES_DB, constants.TRAIN_FEATURES_DB, constants.TRAIN_FEATURES_CHUNK),
        # (constants.DEV_CANDIDATES_DB, constants.DEV_FEATURES_DB, constants.DEV_FEATURES_CHUNK),
        (constants.TEST_CANDIDATES_DB, constants.TEST_FEATURES_DB, constants.TEST_FEATURES_CHUNK)
    ]

    for (candidate_db_path, feature_db_path, chunk) in iterator:
        start_time = datetime.now()
        _set = candidate_db_path.split("/")[-1].split(".")[1]

        done = False
        feature_db = sqlite3.connect(feature_db_path)
        cursor = feature_db.cursor()
        while not done:
            try:
                cursor.execute(sql.create_features_table(COLUMNS))
                feature_db.commit()
                done = True
            except Exception as e:
                helpers.log(e)
        helpers.log(f'Created {_set} features table.')

        (start,) = cursor.execute('SELECT MAX(id) FROM features').fetchone()
        start = start if start is not None else 0  # resume after the last persisted feature id (0 if the table is empty)
        cursor.close()
        feature_db.close()
        helpers.log(f'Starting feature build at {start}.')

        candidate_db = sqlite3.connect(candidate_db_path)
        cursor = candidate_db.cursor()
        (stop,) = cursor.execute('SELECT COUNT(*) FROM candidates').fetchone()  # number of candidate rows, i.e. the last candidate id
        cursor.close()
        candidate_db.close()
        id_range = range(start + 1, stop + 1)
        helpers.log(f'Retrieved {len(id_range)} candidate indices for {_set} set.')

        total_count = 0
        _set_generator = parallel.chunk(chunk, zip([_set] * len(id_range), id_range))
        _batched_set_generator = parallel.chunk(constants.GRAND_CHUNK, _set_generator)
        for grand_batch_idx, _batch_set in enumerate(_batched_set_generator):
            grand_batch_count = 0
            for batch_count in parallel.execute(_build_candidates, _batch_set, _as='process'):
                grand_batch_count += batch_count
                total_count += batch_count
            helpers.log(f'Processed {_set} batch of features no {grand_batch_idx} with {grand_batch_count} pairs.')
        helpers.log(f'Created {_set} features set with {total_count} pairs in {datetime.now() - start_time}.')
Example #11
def run_eval(_set: str, config: Config):
    start = datetime.now()
    if _set == 'train':
        feature_db = ct.TRAIN_FEATURES_DB
        ref = ct.TRAIN_TREC_REFERENCE
    elif _set == 'dev':
        feature_db = ct.DEV_FEATURES_DB
        ref = ct.DEV_TREC_REFERENCE
    elif _set == 'test':
        feature_db = ct.TEST_FEATURES_DB
        ref = ct.TEST_TREC_REFERENCE
    else:
        raise ValueError(f'Unknown set {_set}.')
    with open(ct.INT2WID, 'rb') as file:
        global INT2WID
        INT2WID = pickle.load(file)
    with open(ct.WID2TITLE, 'rb') as file:
        global WID2TITLE
        WID2TITLE = pickle.load(file)

    trec_eval = ct.L2R_EVAL.format(config.name, _set)
    trec_eval_agg = ct.L2R_EVAL_AGG.format(config.name, _set)

    query_encoder = config.query_encoder(config.embedding_dim)
    document_encoder = config.document_encoder(config.embedding_dim)
    scorer = config.scorer(**config.scorer_kwargs)
    model = config.ranker(query_encoder, document_encoder,
                          scorer).to(device=ct.DEVICE)
    # noinspection PyCallingNonCallable
    optimizer = config.optimizer(model.parameters(), **config.optimizer_kwargs)
    _ = _load_checkpoint(model, optimizer, config)
    helpers.log(
        f'Loaded maps, model, and optimizer in {datetime.now() - start}.')

    test_data_set = QueryDocumentsDataset(feature_db)
    test_data_loader = data.DataLoader(
        test_data_set,
        ct.BATCH_SIZE,
        False,
        num_workers=os.cpu_count(),
        collate_fn=QueryDocumentsDataset.collate)

    model.eval()
    epoch_run = Run()
    epoch_eval = Evaluator(ref, measures=pytrec_eval.supported_measures)

    final_scores = torch.empty((len(test_data_loader.dataset), 1),
                               dtype=torch.float)
    question_ids = []
    document_ids = []
    with torch.no_grad():
        for idx, batch in tqdm(enumerate(test_data_loader)):
            (questions, documents, features, targets, batch_question_ids,
             batch_document_ids) = batch
            questions = questions.to(device=ct.DEVICE, non_blocking=True)
            documents = documents.to(device=ct.DEVICE, non_blocking=True)
            features = features.to(device=ct.DEVICE, non_blocking=True)

            batch_size = questions.shape[0]
            scores, encodings = model(questions, documents, features)

            question_ids.extend(batch_question_ids)
            document_ids.extend(batch_document_ids)
            if batch_size == ct.BATCH_SIZE:
                final_scores[idx * ct.BATCH_SIZE:(idx + 1) *
                             ct.BATCH_SIZE] = scores
            else:
                final_scores[idx * ct.BATCH_SIZE:] = scores

    for batch_run in tqdm(
            parallel.execute(
                _build_run,
                parallel.chunk(
                    10000, zip(question_ids, document_ids,
                               final_scores.numpy())))):
        epoch_run.update_rankings(batch_run)

    trec_eval, trec_eval_agg = epoch_eval.evaluate(epoch_run, trec_eval,
                                                   trec_eval_agg, False)
    er_10 = 0
    for stats in trec_eval.values():
        er_10 += stats['recall_10'] == 1.0
    er_10 /= len(trec_eval)

    print(f'ndcg@10:\t\t{trec_eval_agg["ndcg_cut_10"]:.4f}')
    print(f'map@10:\t\t{trec_eval_agg["map_cut_10"]:.4f}')
    print(f'er@10:\t\t{er_10:.4f}')
    print(f'recall@10:\t\t{trec_eval_agg["recall_10"]:.4f}')
    print(f'recall@100:\t\t{trec_eval_agg["recall_100"]:.4f}')
    print(f'recall@1000:\t\t{trec_eval_agg["recall_1000"]:.4f}')