def get_data():
    """Fetch (if needed) and return the pre-processed AP docs keyed by doc id."""
    print("Loading data ...")
    # Make sure the raw dataset is on disk before trying to read it.
    download_ap.download_dataset()
    # Return the pre-processed documents directly.
    return read_ap.get_processed_docs()
def __init__(self, window_size, vocab_size):
    """Fetch and pre-process the corpus, then build the vocabulary tables.

    window_size -- context window size kept on the instance
    vocab_size  -- maximum vocabulary size, forwarded to read_words
    """
    # Ensure the dataset is downloaded before pre-processing it.
    download_ap.download_dataset()
    self.docs_by_id = read_ap.get_processed_docs()
    # Token <-> index maps; filled in by read_words below.
    self.word2id = {}
    self.id2word = {}
    self.window_size = window_size
    self.vocab_size = vocab_size
    self.read_words(vocab_size)
def preprocess(path=PROCESSED_DOCS_PATH):
    """Return ``(tok2idx, id2corpus)`` for the AP corpus, cached as a pickle at *path*.

    On a cache hit the pickled pair is loaded and returned directly.
    Otherwise the dataset is (down)loaded, infrequent words are filtered,
    tokens are converted to indices, and the result is pickled at *path*
    for faster future retrieval.
    """
    # EAFP: open the cache directly instead of an os.path.exists() check,
    # which is racy between the test and the open.
    try:
        with open(path, "rb") as reader:
            print("Loading the preprocessed files...")
            return pickle.load(reader)
    except FileNotFoundError:
        pass

    # (Down)load the dataset from the ap files and get it in the right form.
    download_ap.download_dataset()
    docs_by_id = read_ap.get_processed_docs()

    print("Filtering infrequent words...")
    docs_by_id = filter_infrequent(docs_by_id)

    print("Converting words to indices...")
    tok2idx, id2corpus = all_words_to_indices(docs_by_id)

    # Store the preprocessing results for faster future retrieval.
    print("Storing the preprocessed files...")
    with open(path, "wb") as writer:
        pickle.dump((tok2idx, id2corpus), writer)
    return tok2idx, id2corpus
# NOTE(review): fragment — the `def` line of the enclosing retrieval method
# lies outside this view; the statements below are its tail. It scores
# documents for a query against the inverted index self.ii.
results = defaultdict(float)
for query_term in query_repr:
    # Terms absent from the inverted index contribute nothing.
    if query_term not in self.ii:
        continue
    for (doc_id, tf) in self.ii[query_term]:
        # Accumulate a log-tf score scaled down by document frequency.
        # NOTE(review): presumably a tf-idf variant — confirm against the
        # class that defines self.ii / self.df.
        results[doc_id] += np.log(1 + tf) / self.df[query_term]
# Highest-scoring documents first.
results = list(results.items())
results.sort(key=lambda _: -_[1])
return results


if __name__ == "__main__":
    # ensure dataset is downloaded
    download_ap.download_dataset()
    # pre-process the text
    docs_by_id = read_ap.get_processed_docs()
    # Create instance for retrieval
    tfidf_search = TfIdfRetrieval(docs_by_id)
    # read in the qrels
    qrels, queries = read_ap.read_qrels()
    overall_ser = {}
    print("Running TFIDF Benchmark")
    # collect results
    # NOTE(review): fragment — this benchmark loop is cut off at the end of
    # the visible chunk; the body continues beyond this view.
    for qid in tqdm(qrels):
        query_text = queries[qid]