Code example #1
def _build_group_title_map(folder_path: str) -> Dict[str, int]:
    title2wid: Dict[str, int] = {}

    file_paths = sorted(glob(os.path.join(folder_path, '*.bz2')))
    for file_path in file_paths:
        with bz2.BZ2File(file_path) as file:
            for line in file:
                doc = json.loads(line)
                doc_wid, doc_title = int(doc['id']), doc['title']

                if title2wid.get(doc_title, None) is None:
                    title2wid[doc_title] = doc_wid
                else:
                    # Hack to handle a duplicate title that points both to a proper article and to a
                    # disambiguation article. Assumes the only article of interest is the non-disambiguation one.
                    helpers.log(
                        f'Title {doc_title} has the WID {title2wid.get(doc_title)}. Current WID: {doc_wid}.'
                    )
                    if doc_wid == 2209045:
                        helpers.log(
                            f'Replacing WID {title2wid.get(doc_title)} with WID {doc_wid}.'
                        )
                        title2wid[doc_title] = doc_wid

    logging.info(
        f'[{datetime.now()}]\t[{os.getpid()}]\tBuilt title maps for folder {folder_path.split("/")[-1]}.'
    )
    return title2wid
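A minimal usage sketch of the map built above; the folder path and title are illustrative only, assuming the uncompressed HotpotQA wiki shards live under ./data/raw:

# Hypothetical usage: build the title map for one folder and look up a title.
title2wid = _build_group_title_map('./data/raw/AA')

# Each title resolves to exactly one Wikipedia id; duplicates were resolved above.
wid = title2wid.get('Anarchism')
if wid is not None:
    print(f'Anarchism -> WID {wid}')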
Code example #2
def build():
    global INDEX
    INDEX = Index(env='default')
    batches = parallel.chunk(CHUNK_SIZE, INDEX.document_int_ids())
    helpers.log(f'Building maps for {INDEX.count()} documents.')
    int2wid = {}
    wid2int = {}
    for batch_maps in parallel.execute(_process_batch, batches):
        batch_int2wid, batch_wid2int = batch_maps
        int2wid.update(batch_int2wid)
        wid2int.update(batch_wid2int)

    with open(TOKEN2ID, 'wb') as file:
        pickle.dump(INDEX.token2id, file)
    with open(ID2TOKEN, 'wb') as file:
        pickle.dump(INDEX.id2token, file)
    with open(ID2DF, 'wb') as file:
        pickle.dump(INDEX.id2df, file)
    with open(ID2TF, 'wb') as file:
        pickle.dump(INDEX.id2tf, file)
    with open(INT2WID, 'wb') as file:
        pickle.dump(int2wid, file)
    with open(WID2INT, 'wb') as file:
        pickle.dump(wid2int, file)

    helpers.log(f'Finished building maps. Mapped {len(int2wid)}/{INDEX.index.document_count()}')
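The read path is plain pickle; a minimal sketch, assuming the same INT2WID and WID2INT constants point at the files written above:

import pickle

# Load the id maps persisted by build().
with open(INT2WID, 'rb') as file:
    int2wid = pickle.load(file)
with open(WID2INT, 'rb') as file:
    wid2int = pickle.load(file)

# The two maps are inverses of each other.
assert all(wid2int[wid] == int_id for int_id, wid in int2wid.items())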
Code example #3
File: trec.py Project: janaleible/hotpotQA-ir-task
def build(use_less_memory: bool):
    """Build the corpus of TREC documents files asynchronously from the HotpotQA raw wiki data.

    Expects the uncompressed HotpotQA raw wiki data available in the ``./data/raw`` folder. Folders are processed in
    order. Resulting documents are collected and sorted according to their ids. Each persisted file carries
    ``CHUNK_SIZE`` documents and is named as ``{first_doc_id_in_file}@{last_doc_id_in_file}``.

    If asked to use less memory, it will defer persistence to the child processes.

    :param use_less_memory: Whether to use less memory by not sorting documents and instead persisting them under a file
    with the same name as the folder from which the raw data originate.
    :return: None.
    """
    global USE_LESS_MEMORY
    USE_LESS_MEMORY = use_less_memory

    assert os.path.exists(
        RAW_DATA_DIR
    ), f'Cannot find raw data in {os.path.abspath(RAW_DATA_DIR)}'
    os.makedirs(os.path.abspath(TREC_CORPUS_DIR), exist_ok=True)

    folder_paths = sorted(glob(os.path.join(RAW_DATA_DIR, '*')))
    doc_triples = []

    # create document database
    helpers.log('Creating documents database.')
    db = sqlite3.connect(DOCUMENT_DB)
    cursor: sqlite3.Cursor = db.cursor()
    cursor.execute(
        "CREATE TABLE IF NOT EXISTS documents (id INTEGER PRIMARY KEY, text TEXT)"
    )
    db.commit()
    cursor.close()
    db.close()

    helpers.log('Extracting TREC documents.')
    if USE_LESS_MEMORY:
        for _ in parallel.execute(_process_raw_data_folder, folder_paths):
            pass
        logging.info(f'[{datetime.now()}]\t[{os.getpid()}]\tExtraction done.')
    else:
        for doc_triples_by_folder in parallel.execute(_process_raw_data_folder,
                                                      folder_paths):
            doc_triples.extend(doc_triples_by_folder)
        doc_triples.sort(key=lambda triple: triple[0])
        logging.info(f'[{datetime.now()}]\t[{os.getpid()}]\tExtraction done.')

        logging.info(
            f'[{datetime.now()}]\t[{os.getpid()}]\tPersisting TREC documents.')
        for _ in parallel.execute(_process_doc_triples,
                                  parallel.chunk(100000, doc_triples)):
            pass
        logging.info(f'[{datetime.now()}]\t[{os.getpid()}]\tPersistence done.')
    logging.info(
        f'[{datetime.now()}]\t[{os.getpid()}]\tFinished building TREC corpus.')

    return
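The {first_doc_id_in_file}@{last_doc_id_in_file} naming from the docstring can be derived directly from a sorted chunk of triples; a sketch, assuming each triple is (doc_id, doc_title, trec_document_string) and the .trectext extension used elsewhere in the project:

import os

# Hypothetical helper: the file name for one sorted chunk of document triples.
def _chunk_file_name(chunk):
    first_id = chunk[0][0]
    last_id = chunk[-1][0]
    return os.path.join(TREC_CORPUS_DIR, f'{first_id}@{last_id}.trectext')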
Code example #4
def create_retrieval_db(db_name: str) -> None:
    """Create the sqlite database where retrieval results will be persisted."""
    helpers.log(f'Creating retrieval SQL table. Database: {db_name}')
    with sqlite3.connect(db_name) as db:
        cursor = db.cursor()
        cursor.execute(sql.create_table())
        db.commit()

    return
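A quick check of the result queries sqlite_master; a sketch with an illustrative database name, noting that the actual table name is defined by sql.create_table():

import sqlite3

create_retrieval_db('retrievals.sqlite')  # illustrative file name

with sqlite3.connect('retrievals.sqlite') as db:
    tables = db.execute(
        "SELECT name FROM sqlite_master WHERE type = 'table'"
    ).fetchall()
print(tables)  # lists the table created by sql.create_table()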
Code example #5
def _process_batch(batch: Tuple[int, List[int]]) -> Tuple[Dict[int, int], Dict[int, int]]:
    no, batch = batch
    wid2int = {}
    int2wid = {}
    for int_doc_id in batch:
        wid = INDEX.get_wid(int_doc_id)
        wid2int[wid] = int_doc_id
        int2wid[int_doc_id] = wid

    helpers.log(f'Finished batch. Mapped {len(wid2int)}.')
    return int2wid, wid2int
Code example #6
def _build_candidates(numbered_batch: Tuple[int, Tuple[str, Dict[str, Any]]]) -> int:
    try:
        start_time = datetime.now()

        batch_index, batch = numbered_batch
        _set, start = batch[0]
        _, stop = batch[-1]

        if _set == 'train':
            candidate_db_path = constants.TRAIN_CANDIDATES_DB
            feature_db_path = constants.TRAIN_FEATURES_DB
        elif _set == 'dev':
            candidate_db_path = constants.DEV_CANDIDATES_DB
            feature_db_path = constants.DEV_FEATURES_DB
        elif _set == 'test':
            candidate_db_path = constants.TEST_CANDIDATES_DB
            feature_db_path = constants.TEST_FEATURES_DB
        else:
            raise ValueError(f'Unknown dataset {_set}.')

        candidate_db = sqlite3.connect(candidate_db_path)
        candidate_cursor = candidate_db.cursor()
        candidate_rows = candidate_cursor.execute(sql.fetch_candidate_batch, (start, stop)).fetchall()
        candidate_cursor.close()
        candidate_db.close()

        batch_count = 0
        rows = []
        feature_db = sqlite3.connect(feature_db_path)
        feature_cursor = feature_db.cursor()
        for candidate_row in candidate_rows:
            (_id, question_id, _type, level, doc_iid, doc_wid, doc_title,
             question_text, doc_text, question_tokens, doc_tokens, tfidf, relevance) = candidate_row
            exists = feature_cursor.execute('SELECT id FROM features WHERE id = ?', (_id,)).fetchone()
            if exists is not None:
                continue

            row: List[str] = [_id, question_id, _type, level, doc_iid, doc_wid, doc_title,
                              question_text, doc_text, question_tokens, doc_tokens, tfidf]
            _extract_features(row, EXTRACTORS, json.loads(question_text), json.loads(doc_text))
            row.append(relevance)
            rows.append(row)
            batch_count += 1
        rows_to_db(_set, rows)
        helpers.log(f'Processed batch {batch_index} of {batch_count} pairs in {datetime.now() - start_time}')

        feature_cursor.close()
        feature_db.close()

        return batch_count
    except Exception as e:
        helpers.log(e)
        return 0
Code example #7
def build() -> None:
    helpers.log('Building Trec Eval references.')
    with open(ct.TRAIN_HOTPOT_SET, 'r') as file:
        question_set = json.load(file)
        dev_question_set = question_set[ct.TRAIN_DEV_SPLIT:]
        train_question_set = question_set[:ct.TRAIN_DEV_SPLIT]
    with open(ct.DEV_HOTPOT_SET, 'r') as file:
        test_question_set = json.load(file)
    iterator = [('train', train_question_set), ('dev', dev_question_set),
                ('test', test_question_set)]
    for _set, reference in parallel.execute(_build_reference, iterator):
        helpers.log(f'Created reference {reference} for {_set} set.')
Code example #8
    def __init__(self, database: str):
        start = datetime.now()

        connection = sqlite3.connect(database)
        cursor = connection.cursor()

        self.data = cursor.execute(
            f'SELECT {", ".join(self._features)} FROM features').fetchall()
        cursor.close()
        connection.close()

        helpers.log(
            f'Initialized {database.split(".")[-3]} dataset in {datetime.now() - start}'
        )
Code example #9
File: index.py Project: janaleible/hotpotQA-ir-task
    def __init__(self) -> None:

        tree = ElementTree.parse(INDRI_PARAMETERS)
        if INDRI_PARAMETERS.split('/')[-1] == 'indri_stop_stem.xml':
            helpers.log('Loading stopwords.')
            stopwords = set()
            for elem in tree.find('stopper').iter('word'):
                stopwords.add(elem.text)
            self.stopwords = frozenset(stopwords)
        elif INDRI_PARAMETERS.split('/')[-1] == 'index.xml':
            self.stopwords = None
        else:
            raise NotImplementedError(f'Unknown index setting: {INDRI_PARAMETERS.split("/")[-1]}')
        self._punctuation = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
Code example #10
def _load_checkpoint(model, optimizer, config: Config):
    best_statistic = 0
    start = datetime.now()
    if os.path.isfile(ct.L2R_TRAIN_PROGRESS.format(config.name)):
        with open(ct.L2R_BEST_MODEL.format(config.name), 'rb') as file:
            checkpoint = torch.load(file, map_location=ct.DEVICE)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        model.epochs_trained = checkpoint['epoch']

        best_statistic = checkpoint['best_statistic']
        helpers.log(
            f'Loaded checkpoint from {ct.L2R_BEST_MODEL.format(config.name)} in {datetime.now() - start}.'
        )
    return best_statistic
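The checkpoint layout can be read off the keys accessed above; a minimal counterpart save sketch, assuming the same ct.L2R_BEST_MODEL path template (the project's actual _save_checkpoint, used in code example #23, also records runs and an is_best flag):

import torch

def _save_checkpoint_sketch(model, optimizer, best_statistic, config: Config):
    # Mirror the keys that _load_checkpoint reads back.
    checkpoint = {
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'epoch': model.epochs_trained,
        'best_statistic': best_statistic,
    }
    with open(ct.L2R_BEST_MODEL.format(config.name), 'wb') as file:
        torch.save(checkpoint, file)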
Code example #11
File: run.py Project: janaleible/hotpotQA-ir-task
    def to_json(self, db, dataset_path) -> List[dict]:
        helpers.log('Creating hotpot data.')
        dataset = Dataset.from_file(dataset_path)
        questions = []

        connection = sqlite3.connect(db)
        cursor = connection.cursor()
        doc_results = cursor.execute(
            f"SELECT DISTINCT doc_title, document_text FROM features"
        ).fetchall()
        title2text = {
            json.loads(doc_title): json.loads(doc_text)
            for (doc_title, doc_text) in doc_results
        }
        cursor.close()
        connection.close()
        helpers.log('Loaded title2text.')

        for question_id, ranking in tqdm(self.items()):
            context = []
            sorted_by_score = sorted(ranking.items(),
                                     key=lambda value: value[1],
                                     reverse=True)
            for rank in range(min(10, len(ranking))):
                (title, score) = sorted_by_score[rank]
                doc_text = title2text[title]
                article = [
                    paragraph.split(constants.EOS.strip())
                    for paragraph in doc_text.split(constants.EOP.strip())
                ]
                article.insert(0, title)
                article.insert(1, score)
                context.append(article)

            full_question = dataset.find_by_id(question_id)
            question = {
                '_id': full_question.id,
                'level': full_question.level,
                'type': full_question.type,
                'question': full_question.question,
                'context': context,
                'answer': full_question.answer,
                'supporting_facts': full_question.supporting_facts
            }
            questions.append(question)

        return questions
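Each context entry built in the loop above is the document title, then the retrieval score, then one list of sentences per paragraph; schematically, with illustrative values:

# Shape of one entry appended to context above.
context_entry = [
    'Some Article Title',                        # inserted at index 0
    12.34,                                       # ranking score, inserted at index 1
    ['First sentence.', 'Second sentence.'],     # paragraph 1, split on EOS
    ['A one-sentence paragraph.'],               # paragraph 2
]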
Code example #12
File: trec.py Project: janaleible/hotpotQA-ir-task
def _process_raw_data_folder(folder_path: str):
    """Load documents from the JSON collections line by line.

    Extract document id, title, and first paragraph from each document in each JSON. Create TREC documents and collect
    in a list.

    If ``USE_LESS_MEMORY`` is set to ``True``, the TREC documents will be persisted to disk under a file named after the
    folder from which the files originate. If set to ``False`` the documents will be returned for further processing in
    the main thread.

    Store the document string in a database for later reference.

    :param folder_path: The path to the folder where the compressed JSON collection of raw wiki data lies.
    :return: A sorted collection of (document_id, document_title, trec_document_string)
    """
    doc_count = 0
    doc_pairs: List[Tuple[int, str]] = []
    doc_triples = []
    file_paths = sorted(glob(os.path.join(folder_path, '*.bz2')))
    for file_path in file_paths:
        with bz2.BZ2File(file_path) as file:
            for line in file:
                doc = json.loads(line.decode('utf-8'))
                doc_id, doc_title, doc_str = _extract_doc(doc)
                doc_pairs.append((doc_id, doc_str))
                doc_count += 1
                # doc_triples.append((doc_id, doc_title, _build_trec(doc_id, doc_title, doc_str)))

    folder = folder_path.split("/")[-1]
    helpers.log(f'Extracted documents from folder {folder}.')

    db = sqlite3.connect(DOCUMENT_DB)
    cursor = db.cursor()
    cursor.executemany("INSERT INTO documents (id, text) VALUES (?, ?)",
                       doc_pairs)
    db.commit()
    cursor.close()
    db.close()

    helpers.log(f'Persisted {doc_count} documents to database.')

    if USE_LESS_MEMORY:
        file_name = os.path.join(TREC_CORPUS_DIR, f'{folder}.trectext')
        # _process_doc_triples(doc_triples, file_name)
    else:
        return doc_triples
Code example #13
def _save_epoch_stats(name: str, epoch: int, train_loss: float,
                      train_stats: Tuple[float, ...],
                      dev_stats: Tuple[float, ...]):
    with open(ct.L2R_TRAIN_PROGRESS.format(name), 'a') as f:
        writer = csv.writer(f)
        writer.writerow([epoch, train_loss, *train_stats, *dev_stats])

    helpers.log(
        f'[Epoch {epoch:03d}][Train Acc: {train_stats[0]:0.4f}]'
        f'[Train MAP@10: {train_stats[1]:0.4f}][Train NDCG@10: {train_stats[2]:0.4f}]'
        f'[Train Precision@5: {train_stats[10]:0.4f}]'
        f'[Train Loss: {train_loss:0.4f}]'
        f'[Dev Acc: {dev_stats[0]:0.4f}]'
        f'[Dev MAP@10: {dev_stats[1]:0.4f}][Dev NDCG@10: {dev_stats[2]:0.4f}]'
        f'[Dev Recall@10: {dev_stats[3]:0.4f}]'
        f'[Dev MAP@100: {dev_stats[4]:0.4f}][Dev NDCG@100: {dev_stats[5]:0.4f}]'
        f'[Dev Recall@100: {dev_stats[6]:0.4f}]'
        f'[Dev Recall@1000: {dev_stats[9]:0.4f}]')
Code example #14
def load_dataset_batches() -> Tuple[List[List[Question]], int, int]:
    """Load the dataset in batches of ``CHUNK_SIZE`` and calculate lengths."""
    helpers.log(
        f'Loading dataset in chunks. Data file: {TRAIN_HOTPOT_SET}. Chunk size: {CHUNK_SIZE}.'
    )
    start = datetime.now()

    training_set = Dataset.from_file(TRAIN_HOTPOT_SET)
    batches = parallel.chunk(CHUNK_SIZE, training_set.questions)
    no_batches = len(batches)
    no_queries = len(training_set)

    end = datetime.now()
    helpers.log(
        f'Finished loading in {end - start}. Batches: {no_batches}. Queries: {no_queries}.'
    )

    return batches, no_batches, no_queries
Code example #15
def _process_question_batch(question_numbered_batch: Tuple[int, Tuple[Question, ...]]) -> int:
    """If the batch was not previously processed, filter a batch and persist to SQLite database."""
    (no, questions), retrieved = question_numbered_batch, 0

    already_processed = retrieve.check_already_processed(DB_NAME, question_numbered_batch)
    if len(already_processed) == len(questions):
        helpers.log(f'Batch {no} already processed. Skipping.')
        return 0

    for question in questions:
        if already_processed.get(question.id, False):
            continue

        retrieval = INDEX.unigram_query(question.question, request=5000)
        retrieve.persist_retrieval(DB_NAME, question, retrieval)
        retrieved += 1
    helpers.log(f'Retrieved questions: {retrieved}/{len(questions)}.')

    return retrieved
Code example #16
def build(skip_relevant: bool = True):
    global INDEX, COLUMNS, QUESTION_COUNTS
    global SKIP_RELEVANT
    SKIP_RELEVANT = skip_relevant
    INDEX = Index('tfidf')
    helpers.log('Loaded index.')

    os.makedirs(ct.CANDIDATES_DIR, exist_ok=True)
    with open(ct.TRAIN_HOTPOT_SET, 'r') as file:
        question_set = json.load(file)
        train_question_set = question_set[:ct.TRAIN_DEV_SPLIT]
        dev_question_set = question_set[ct.TRAIN_DEV_SPLIT:]
    with open(ct.DEV_HOTPOT_SET, 'r') as file:
        test_question_set = json.load(file)

    iterator: List[Tuple[List[dict], str, str, int]] = [
        # (train_question_set, 'train', ct.TRAIN_CANDIDATES_DB, ct.CANDIDATES_CHUNK),
        # (dev_question_set, 'dev', ct.DEV_CANDIDATES_DB, ct.CANDIDATES_CHUNK),
        (test_question_set, 'test', ct.TEST_CANDIDATES_DB, ct.CANDIDATES_CHUNK)
    ]

    for (_set, split, candidate_db_path, chunk) in iterator:
        start = datetime.now()

        db = sqlite3.connect(candidate_db_path)
        cursor = db.cursor()
        cursor.execute(sql.create_candidate_table)
        db.commit()
        helpers.log('Created candidates table.')

        QUESTION_COUNTS = cursor.execute(sql.count_question_rows).fetchall()
        QUESTION_COUNTS = {json.loads(_id): _count for (_id, _count) in QUESTION_COUNTS}
        helpers.log(f'Retrieved question counts for {len(QUESTION_COUNTS)} questions.')
        cursor.close()
        db.close()

        helpers.log(f'Creating {split} candidate set with {len(_set)} questions.')
        total_count = 0
        _set_generator = parallel.chunk(chunk, zip([split] * len(_set), _set))
        for batch_count in parallel.execute(_build_candidates, _set_generator):
            total_count += batch_count
        helpers.log(f'Created {split} candidate set with {total_count} questions in {datetime.now() - start}')
Code example #17
def rows_to_db(_set: str, rows: List[Any]):
    if _set == 'train':
        db_path = constants.TRAIN_FEATURES_DB
    elif _set == 'dev':
        db_path = constants.DEV_FEATURES_DB
    elif _set == 'test':
        db_path = constants.TEST_FEATURES_DB
    else:
        raise ValueError(f'Unknown set {_set}.')
    done = False
    while not done:
        try:
            connection = sqlite3.connect(db_path)
            cursor = connection.cursor()
            cursor.executemany(sql.insert_features(COLUMNS), [tuple(row) for row in rows])
            connection.commit()
            cursor.close()
            connection.close()
            done = True
        except Exception as e:
            helpers.log(e)
Code example #18
def process(command: str) -> Tuple[Dict[str, Dict[str, float]], Dict[str, float]]:
    helpers.log('Loading int2wid and wid2title mappings.')
    global _WID2TITLE, _INT2WID
    with open(WID2TITLE, 'rb') as file:
        _WID2TITLE = pickle.load(file)
    with open(INT2WID, 'rb') as file:
        _INT2WID = pickle.load(file)

    model_type, model_name = command.split('@')
    dataset_id = helpers.training_set_id()
    if model_type == 'term':
        dir_path = os.path.join(TERM_RETRIEVALS_DIR,
                                f'{model_name}.{dataset_id}')
    else:
        raise ValueError(f'Unknown model type: {model_type}')

    reference_path = os.path.join(dir_path, 'reference.json')
    reference_exists = os.path.isfile(reference_path)
    if not reference_exists:
        _create_trec_eval_reference(dir_path)

    run_path = os.path.join(dir_path, 'retrievals.json')
    run_exists = os.path.isfile(run_path)
    if not run_exists:
        run = _create_trec_run(dir_path)
    else:
        with open(run_path, 'r') as file:
            run = json.load(file)

    trec_eval_path = os.path.join(dir_path, 'trec_eval.json')
    trec_eval_agg_path = os.path.join(dir_path, 'trec_eval_agg.json')
    evaluator = Evaluator(reference_path,
                          measures=pytrec_eval.supported_measures)
    trec_eval, trec_eval_agg = evaluator.evaluate(run, trec_eval_path,
                                                  trec_eval_agg_path)

    return trec_eval, trec_eval_agg
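command is expected in {model_type}@{model_name} form, with term the only model type handled above; a usage sketch with an illustrative model name:

# 'tfidf' is an illustrative model name; retrievals must already exist on disk.
trec_eval, trec_eval_agg = process('term@tfidf')

# Per-query measures and their aggregates over the run.
print(trec_eval_agg['map'], trec_eval_agg['recall_1000'])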
Code example #19
def build():
    assert os.path.exists(
        RAW_DATA_DIR
    ), f'Cannot find raw data in {os.path.abspath(RAW_DATA_DIR)}'
    os.makedirs(os.path.abspath(INDEX_DIR), exist_ok=True)

    folder_paths = sorted(glob(os.path.join(RAW_DATA_DIR, '*')))
    title2wid = {}
    wid2title = {}
    logging.info(f'[{datetime.now()}]\t[{os.getpid()}]\tBuilding title maps.')
    for group_title2wid in parallel.execute(_build_group_title_map,
                                            folder_paths):
        for (title, wid) in group_title2wid.items():
            if title2wid.get(title, None) is None:
                title2wid[title] = wid
                wid2title[wid] = title
            else:
                # Hack to handle a duplicate title that points both to a proper article and to a
                # disambiguation article. Assumes the only article of interest is the non-disambiguation one.
                helpers.log(
                    f'Title {title} has the WID {title2wid.get(title)}. Current WID: {wid}.'
                )
                if wid == 2209045:
                    helpers.log(
                        f'Replacing WID {title2wid.get(title)} with WID {wid}.'
                    )
                    title2wid[title] = wid
                    wid2title[wid] = title

    with open(WID2TITLE, 'wb') as file:
        pickle.dump(wid2title, file)
    with open(TITLE2WID, 'wb') as file:
        pickle.dump(title2wid, file)
    logging.info(f'[{datetime.now()}]\t[{os.getpid()}]\tBuilt title maps.')

    return
Code example #20
File: index.py Project: janaleible/hotpotQA-ir-task
    def __init__(self, env: str = 'default', verbose: bool = False, avg_len=False):
        if verbose:
            helpers.log(f'Loading index {INDRI_INDEX_DIR} with {env} query environment.')
        start = datetime.now()

        self.index = pyndri.Index(f'{INDRI_INDEX_DIR}')
        self.token2id, self.id2token, self.id2df = self.index.get_dictionary()
        self.id2tf = self.index.get_term_frequencies()

        if avg_len:
            # Monte Carlo Estimation for document length:
            doc_lengths = np.empty(self.index.document_count(), dtype=np.float64)  # np.float was removed in NumPy 1.24
            for (idx, doc_iid) in enumerate(range(self.index.document_base(), self.index.maximum_document())):
                doc_lengths[idx] = self.index.document_length(doc_iid)
            self.avg_doc_len = float(doc_lengths.mean())

        self.tokenizer = Tokenizer()

        if os.path.isfile(TITLE2WID):
            with open(TITLE2WID, 'rb') as file:
                self.title2wid = pickle.load(file)

        if os.path.isfile(WID2TITLE):
            with open(WID2TITLE, 'rb') as file:
                self.wid2title = pickle.load(file)
        try:
            if os.path.isfile(WID2INT):
                with open(WID2INT, 'rb') as file:
                    self.wid2int = pickle.load(file)

            if os.path.isfile(INT2WID):
                with open(INT2WID, 'rb') as file:
                    self.int2wid = pickle.load(file)
        except FileNotFoundError:
            helpers.log('ID mappings do not exist yet. Not loaded.')

        if env == 'default':
            self.env = pyndri.QueryEnvironment(self.index)
        elif env == 'tfidf':
            self.env = pyndri.TFIDFQueryEnvironment(self.index, k1=1.2, b=0.75)
        elif env == 'prf':
            env = pyndri.QueryEnvironment(self.index)
            self.env = pyndri.PRFQueryEnvironment(env, fb_docs=10, fb_terms=10)
        else:
            raise ValueError(f'Unknown environment configuration {env}')

        stop = datetime.now()
        if verbose:
            helpers.log(f'Loaded index in {stop - start}.')
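The three env settings wrap the same pyndri index in different query environments; a construction sketch, assuming the Indri index has already been built under INDRI_INDEX_DIR:

# Plain query likelihood, BM25-style tf-idf, and pseudo-relevance feedback.
default_index = Index(env='default')
tfidf_index = Index(env='tfidf', verbose=True)
prf_index = Index(env='prf')

# All three expose the same query interface downstream, e.g. unigram_query.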
Code example #21
def process():
    """Filter the collection of 5 million document to just the top 5000 at most according to bigram/unigram
    filtering per question. Processed in parallel."""
    global_start = datetime.now()
    global INDEX
    INDEX = Index(env='tfidf')
    os.makedirs(DIR_NAME)
    (batches, no_batches, no_queries), total_retrieved = retrieve.load_dataset_batches(), 0
    retrieve.create_retrieval_db(DB_NAME)

    helpers.log(f'Retrieving documents. Workers: {os.cpu_count()}')
    start = datetime.now()
    for batch_retrieval in parallel.execute(_process_question_batch, batches):
        total_retrieved += batch_retrieval
    end = datetime.now()
    helpers.log(f'Finished retrieval in {end - start}. Filtered {total_retrieved}/{no_queries}')

    global_end = datetime.now()
    helpers.log(f'Finished process in {global_end - global_start}.')
Code example #22
def _build_candidates(numbered_batch: Tuple[int, List[Dict[str, Any]]]) -> int:
    start = datetime.now()
    (no, batch), db, cursor = numbered_batch, None, None
    processed_count = 0
    skipped_count = 0
    relevant_count = 0

    for split, question in batch:
        if split == 'train':
            no_candidates = ct.TRAIN_NO_CANDIDATES
            candidate_db_path = ct.TRAIN_CANDIDATES_DB
        elif split == 'dev':
            no_candidates = ct.DEV_NO_CANDIDATES
            candidate_db_path = ct.DEV_CANDIDATES_DB
        elif split == 'test':
            no_candidates = ct.TEST_NO_CANDIDATES
            candidate_db_path = ct.TEST_CANDIDATES_DB
        else:
            raise ValueError(f'Unknown set {split}.')

        _id = question['_id']
        _type = question['type']
        _level = question['level']
        _str = question['question']
        relevant_titles = list(map(lambda item: item[0], question['supporting_facts']))

        if QUESTION_COUNTS.get(_id, 0) == no_candidates:
            skipped_count += 1
            continue

        # store one row per relevant document
        rows: List[List[str]] = []
        relevant_doc_iids = set(INDEX.wid2int[INDEX.title2wid[title]] for title in relevant_titles)
        if split != 'test':
            for (candidate_idx, doc_iid) in enumerate(relevant_doc_iids):
                row: List[str] = [json.dumps(_id), json.dumps(_type), json.dumps(_level)]
                doc_wid, doc_title = _extract_doc_identifiers(row, INDEX, doc_iid)
                doc_text = _extract_text(row, _str, doc_wid)
                doc_tokens, question_tokens = _extract_tokens(row, INDEX, _str, doc_iid)
                tfidf_score = _extract_tfidf_score(row, INDEX, doc_tokens, question_tokens)
                relevance = _extract_relevance(row, doc_iid, relevant_doc_iids)

                rows.append(row)

        # store irrelevant document rows in tf-idf score order until the candidate set length is reached
        result_idx = 0
        candidate_idx = ct.RELEVANT_DOCUMENTS
        results = INDEX.unigram_query(_str, no_candidates)
        while candidate_idx < no_candidates:
            (doc_iid, tfidf_score) = results[result_idx]

            row: List[str] = [json.dumps(_id), json.dumps(_type), json.dumps(_level)]
            relevance = _extract_relevance(row, doc_iid, relevant_doc_iids, False)
            if relevance == 1:
                relevant_count += 1
                if not SKIP_RELEVANT:
                    result_idx += 1
                    continue

            doc_wid, doc_title = _extract_doc_identifiers(row, INDEX, doc_iid)
            doc_text = _extract_text(row, _str, doc_wid)
            doc_tokens, question_tokens = _extract_tokens(row, INDEX, _str, doc_iid)

            row.append(json.dumps(tfidf_score))
            row.append(json.dumps(relevance))

            rows.append(row)
            candidate_idx += 1
            result_idx += 1

        if db is None:
            db = sqlite3.connect(candidate_db_path)
            cursor = db.cursor()
        cursor.executemany(sql.insert_candidate, rows)
        db.commit()
        processed_count += 1

    if db is not None:
        cursor.close()
        db.close()

    end = datetime.now()
    helpers.log(f'Processed batch {no} in {end - start}. Processed {processed_count}. Skipped {skipped_count}. '
                f'Relevant documents found {relevant_count}')

    return len(batch)
Code example #23
def run(config: Config) -> None:
    start = datetime.now()
    os.makedirs(ct.L2R_MODEL_DIR.format(config.name), exist_ok=True)
    with open(ct.INT2WID, 'rb') as file:
        global INT2WID
        INT2WID = pickle.load(file)
    with open(ct.WID2TITLE, 'rb') as file:
        global WID2TITLE
        WID2TITLE = pickle.load(file)

    trec_eval_train = ct.L2R_EVAL.format(config.name, 'train')
    trec_eval_agg_train = ct.L2R_EVAL_AGG.format(config.name, 'train')
    trec_eval_dev = ct.L2R_EVAL.format(config.name, 'dev')
    trec_eval_agg_dev = ct.L2R_EVAL_AGG.format(config.name, 'dev')

    query_encoder = config.query_encoder(config.embedding_dim)
    document_encoder = config.document_encoder(config.embedding_dim)
    scorer = config.scorer(**config.scorer_kwargs)
    model = config.ranker(query_encoder, document_encoder,
                          scorer).to(device=ct.DEVICE)
    # noinspection PyCallingNonCallable
    optimizer = config.optimizer(model.parameters(), **config.optimizer_kwargs)
    helpers.log(
        f'Loaded maps, model, and optimizer in {datetime.now() - start}.')

    train_loader, dev_loader = _load_datasets()

    best_recall_100 = _load_checkpoint(model, optimizer, config)
    remaining_epochs = config.epochs - model.epochs_trained

    train_stats = _evaluate_epoch(model, ct.TRAIN_TREC_REFERENCE, train_loader,
                                  trec_eval_train, trec_eval_agg_train, False)
    dev_stats = _evaluate_epoch(model, ct.DEV_TREC_REFERENCE, dev_loader,
                                trec_eval_dev, trec_eval_agg_dev, False)

    # Hack to avoid having to adapt all index accesses below after adding run
    train_stats = train_stats[1:]
    dev_stats = dev_stats[1:]

    _save_epoch_stats(config.name, model.epochs_trained, -1, train_stats,
                      dev_stats)

    for epoch in range(remaining_epochs):
        is_best = False
        last_epoch = (model.epochs_trained + 1) == config.epochs

        # train
        train_loss = _train_epoch(model, optimizer, train_loader, config)

        model.epochs_trained += 1
        if model.epochs_trained % 10 == 0:
            # evaluate and save statistics only once every 10 epochs for speed

            train_stats = _evaluate_epoch(model, ct.TRAIN_TREC_REFERENCE,
                                          train_loader, trec_eval_train,
                                          trec_eval_agg_train, last_epoch)
            dev_stats = _evaluate_epoch(model, ct.DEV_TREC_REFERENCE,
                                        dev_loader, trec_eval_dev,
                                        trec_eval_agg_dev, last_epoch)

            # Hack to avoid having to adapt all index accesses below after adding run
            train_run = train_stats[0]
            train_stats = train_stats[1:]
            dev_run = dev_stats[0]
            dev_stats = dev_stats[1:]

            _save_epoch_stats(config.name, model.epochs_trained, train_loss,
                              train_stats, dev_stats)

            # save model
            if dev_stats[6] >= best_recall_100:
                best_recall_100 = dev_stats[6]
                is_best = True
            _save_checkpoint(config.name, model, optimizer, best_recall_100,
                             is_best, train_run, dev_run)

    return
Code example #24
def build():
    assert constants.TRAIN_FEATURES_CHUNK > 1
    assert constants.DEV_FEATURES_CHUNK > 1

    global INDEX
    INDEX = Index('tfidf')
    helpers.log('Loaded index.')

    global EXTRACTORS
    EXTRACTORS = []
    if 'entity' in constants.FEATURE_EXTRACTORS:
        EXTRACTORS.append(EntityExtractor(INDEX))
    if 'ibm1' in constants.FEATURE_EXTRACTORS:
        EXTRACTORS.append(IBM1FeatureExtractor(normalized=False))
    if 'nibm1' in constants.FEATURE_EXTRACTORS:
        EXTRACTORS.append(IBM1FeatureExtractor(normalized=True))
    if 'bigram' in constants.FEATURE_EXTRACTORS:
        EXTRACTORS.append(BigramOverlapFeatureExtractor(normalized=False))
    if 'nbigram' in constants.FEATURE_EXTRACTORS:
        EXTRACTORS.append(BigramOverlapFeatureExtractor(normalized=True))
    if 'qword' in constants.FEATURE_EXTRACTORS:
        EXTRACTORS.append(QuestionWordFeatureExtractor())
    if 'doclen' in constants.FEATURE_EXTRACTORS:
        EXTRACTORS.append(DocumentLengthFeatureExtractor())
    helpers.log('Loaded extractors.')

    global COLUMNS
    COLUMNS = copy.copy(constants.FEATURE_BASE_COLUMN_NAMES)
    COLUMNS.extend(feature for extractor in EXTRACTORS for feature in extractor.feature_name)
    COLUMNS.append(constants.FEATURE_TARGET_COLUMN_NAME)
    helpers.log('Loaded column names.')

    os.makedirs(constants.FEATURES_DIR, exist_ok=True)
    iterator: List[Tuple[str, str, int]] = [
        # (constants.TRAIN_CANDIDATES_DB, constants.TRAIN_FEATURES_DB, constants.TRAIN_FEATURES_CHUNK),
        # (constants.DEV_CANDIDATES_DB, constants.DEV_FEATURES_DB, constants.DEV_FEATURES_CHUNK),
        (constants.TEST_CANDIDATES_DB, constants.TEST_FEATURES_DB, constants.TEST_FEATURES_CHUNK)
    ]

    for (candidate_db_path, feature_db_path, chunk) in iterator:
        start_time = datetime.now()
        _set = candidate_db_path.split("/")[-1].split(".")[1]

        done = False
        feature_db = sqlite3.connect(feature_db_path)
        cursor = feature_db.cursor()
        while not done:
            try:
                cursor.execute(sql.create_features_table(COLUMNS))
                feature_db.commit()
                done = True
            except Exception as e:
                helpers.log(e)
        helpers.log(f'Created {_set} features table.')

        (start,) = cursor.execute('SELECT MAX(id) FROM features').fetchone()
        start = start if start is not None else 0  # resume after the last id already in the features database
        cursor.close()
        feature_db.close()
        helpers.log(f'Starting feature build at {start}.')

        candidate_db = sqlite3.connect(candidate_db_path)
        cursor = candidate_db.cursor()
        (stop,) = cursor.execute('SELECT COUNT(*) FROM candidates').fetchone()  # last id in the database
        cursor.close()
        candidate_db.close()
        id_range = range(start + 1, stop + 1)
        helpers.log(f'Retrieved {len(id_range)} candidate indices for {_set} set.')

        total_count = 0
        _set_generator = parallel.chunk(chunk, zip([_set] * len(id_range), id_range))
        _batched_set_generator = parallel.chunk(constants.GRAND_CHUNK, _set_generator)
        for grand_batch_idx, _batch_set in _batched_set_generator:
            grand_batch_count = 0
            for batch_count in parallel.execute(_build_candidates, _batch_set, _as='process'):
                grand_batch_count += batch_count
                total_count += batch_count
            helpers.log(f'Processed {_set} batch of features no {grand_batch_idx} with {grand_batch_count} pairs.')
        helpers.log(f'Created {_set} features set with {total_count} pairs in {datetime.now() - start_time}')
Code example #25
def run_eval(_set: str, config: Config):
    start = datetime.now()
    if _set == 'train':
        feature_db = ct.TRAIN_FEATURES_DB
        ref = ct.TRAIN_TREC_REFERENCE
    elif _set == 'dev':
        feature_db = ct.DEV_FEATURES_DB
        ref = ct.DEV_TREC_REFERENCE
    elif _set == 'test':
        feature_db = ct.TEST_FEATURES_DB
        ref = ct.TEST_TREC_REFERENCE
    else:
        raise ValueError(f'Unknown set {_set}.')
    with open(ct.INT2WID, 'rb') as file:
        global INT2WID
        INT2WID = pickle.load(file)
    with open(ct.WID2TITLE, 'rb') as file:
        global WID2TITLE
        WID2TITLE = pickle.load(file)

    trec_eval = ct.L2R_EVAL.format(config.name, _set)
    trec_eval_agg = ct.L2R_EVAL_AGG.format(config.name, _set)

    query_encoder = config.query_encoder(config.embedding_dim)
    document_encoder = config.document_encoder(config.embedding_dim)
    scorer = config.scorer(**config.scorer_kwargs)
    model = config.ranker(query_encoder, document_encoder,
                          scorer).to(device=ct.DEVICE)
    # noinspection PyCallingNonCallable
    optimizer = config.optimizer(model.parameters(), **config.optimizer_kwargs)
    _ = _load_checkpoint(model, optimizer, config)
    helpers.log(
        f'Loaded maps, model, and optimizer in {datetime.now() - start}.')

    test_data_set = QueryDocumentsDataset(feature_db)
    test_data_loader = data.DataLoader(
        test_data_set,
        ct.BATCH_SIZE,
        False,
        num_workers=os.cpu_count(),
        collate_fn=QueryDocumentsDataset.collate)

    model.eval()
    epoch_run = Run()
    epoch_eval = Evaluator(ref, measures=pytrec_eval.supported_measures)

    final_scores = torch.empty((len(test_data_loader.dataset), 1),
                               dtype=torch.float)
    question_ids = []
    document_ids = []
    with torch.no_grad():
        for idx, batch in tqdm(enumerate(test_data_loader)):
            (questions, documents, features, targets, batch_question_ids,
             batch_document_ids) = batch
            questions = questions.to(device=ct.DEVICE, non_blocking=True)
            documents = documents.to(device=ct.DEVICE, non_blocking=True)
            features = features.to(device=ct.DEVICE, non_blocking=True)

            batch_size = questions.shape[0]
            scores, encodings = model(questions, documents, features)

            question_ids.extend(batch_question_ids)
            document_ids.extend(batch_document_ids)
            if batch_size == ct.BATCH_SIZE:
                final_scores[idx * ct.BATCH_SIZE:(idx + 1) *
                             ct.BATCH_SIZE] = scores
            else:
                final_scores[idx * ct.BATCH_SIZE:] = scores

    for batch_run in tqdm(
            parallel.execute(
                _build_run,
                parallel.chunk(
                    10000, zip(question_ids, document_ids,
                               final_scores.numpy())))):
        epoch_run.update_rankings(batch_run)

    trec_eval, trec_eval_agg = epoch_eval.evaluate(epoch_run, trec_eval,
                                                   trec_eval_agg, False)
    er_10 = 0
    for stats in trec_eval.values():
        er_10 += stats['recall_10'] == 1.0
    er_10 /= len(trec_eval)

    print(f'ndcg@10:\t\t{trec_eval_agg["ndcg_cut_10"]:.4f}')
    print(f'map@10:\t\t{trec_eval_agg["map_cut_10"]:.4f}')
    print(f'er@10:\t\t{er_10:.4f}')
    print(f'recall@10:\t\t{trec_eval_agg["recall_10"]:.4f}')
    print(f'recall@100:\t\t{trec_eval_agg["recall_100"]:.4f}')
    print(f'recall@1000:\t\t{trec_eval_agg["recall_1000"]:.4f}')