def get_data():
    """Fetch the preprocessed AP document collection.

    Ensures the raw dataset is present on disk, then returns the
    preprocessed documents keyed by document id.
    """
    print("Loading data ...")

    # Download step is a no-op when the dataset is already on disk.
    download_ap.download_dataset()
    return read_ap.get_processed_docs()
Esempio n. 2
0
    def __init__(self, window_size, vocab_size):
        """Prepare the corpus and vocabulary bookkeeping.

        Downloads the AP dataset if it is missing, preprocesses the
        documents, and builds the vocabulary via ``read_words``.
        """
        # Make sure the raw dataset exists, then preprocess it.
        download_ap.download_dataset()
        self.docs_by_id = read_ap.get_processed_docs()

        # Token <-> id mappings, filled in by read_words().
        self.word2id = {}
        self.id2word = {}

        self.window_size = window_size
        self.vocab_size = vocab_size

        self.read_words(vocab_size)
Esempio n. 3
0
def preprocess(path=PROCESSED_DOCS_PATH):
    """Return ``(tok2idx, id2corpus)``, using a pickle cache at *path*.

    On a cache hit the stored tuple is loaded and returned directly. On a
    miss, the AP dataset is (down)loaded, infrequent words are filtered,
    tokens are converted to integer indices, and the result is pickled so
    future calls can skip the expensive work.
    """
    # Fast path: a previous run already stored the preprocessing result.
    if os.path.exists(path):
        print("Loading the preprocessed files...")
        with open(path, "rb") as fh:
            return pickle.load(fh)

    # Slow path: build everything from the raw AP files.
    download_ap.download_dataset()
    corpus = read_ap.get_processed_docs()
    print("Filtering infrequent words...")
    corpus = filter_infrequent(corpus)
    print("Converting words to indices...")
    result = all_words_to_indices(corpus)

    # Cache the (tok2idx, id2corpus) tuple for faster future retrieval.
    print("Storing the preprocessed files...")
    with open(path, "wb") as fh:
        pickle.dump(result, fh)
    return result
Esempio n. 4
0
        results = defaultdict(float)
        for query_term in query_repr:
            if query_term not in self.ii:
                continue
            for (doc_id, tf) in self.ii[query_term]:
                results[doc_id] += np.log(1 + tf) / self.df[query_term]

        results = list(results.items())
        results.sort(key=lambda _: -_[1])
        return results


if __name__ == "__main__":

    # ensure dataset is downloaded
    download_ap.download_dataset()
    # pre-process the text
    docs_by_id = read_ap.get_processed_docs()

    # Create instance for retrieval
    tfidf_search = TfIdfRetrieval(docs_by_id)
    # read in the qrels
    qrels, queries = read_ap.read_qrels()

    overall_ser = {}

    print("Running TFIDF Benchmark")
    # collect results
    for qid in tqdm(qrels): 
        query_text = queries[qid]