def read_collection(coll_main_folder, output_model_path, stemming, stoplist=None):
    """Read and tokenize every document in a collection folder, with caching.

    If a cached model already exists at ``output_model_path`` it is loaded and
    returned instead of re-reading the collection.

    :param coll_main_folder: folder containing one text file per document.
    :param output_model_path: path where the tokenized collection is cached.
    :param stemming: whether to apply stemming during tokenization.
    :param stoplist: stopword list; defaults to the Indri stopword list.
    :return: dict mapping document name (filename without extension) to its
        token list.
    """
    if os.path.isfile(output_model_path):
        print('loading model: %s' % output_model_path)
        return util.load_model(output_model_path)

    if stoplist is None:
        stoplist = util.load_indri_stopwords()

    print('reading files in folder')
    fnames_list = os.listdir(coll_main_folder)
    doc_texts = []
    for filename in fnames_list:
        # read each document with a context manager so the handle is closed
        # promptly (the original leaked one open file per document)
        with open(os.path.join(coll_main_folder, filename), 'r') as f:
            doc_texts.append(' '.join(f.readlines()))

    print('processing collection')
    # the context manager terminates the worker pool when tokenization is done
    # (the original never closed/joined the pool)
    with multiprocessing.Pool(8) as pool:
        tokenized_docs = pool.starmap(
            util.tokenize, [(text, stemming, stoplist) for text in doc_texts])

    text_by_name = {
        filename.split(r'.')[0]: tokens
        for filename, tokens in zip(fnames_list, tokenized_docs)
    }

    print('saving model')
    util.save_model(text_by_name, output_model_path)
    return text_by_name
def compute_inverted_index(coll_folder, stemming, output_file_path_ii):
    """Compute (or load a cached) inverted index for a document collection.

    :param coll_folder: folder containing one text file per document.
    :param stemming: whether to apply stemming during tokenization.
    :param output_file_path_ii: cache path for the inverted index model.
    :return: dict mapping term -> list of (doc_id, term_frequency) tuples.
    """
    if os.path.isfile(output_file_path_ii):
        return util.load_model(output_file_path_ii)

    from collections import Counter  # local import keeps module deps unchanged

    print('computing inverted index')
    inverted_idx = {}
    sw = util.load_indri_stopwords()
    for filename in tqdm(os.listdir(coll_folder)):
        fp = os.path.join(coll_folder, filename)
        if not os.path.isfile(fp):
            continue
        doc_id = filename.split(r'.')[0]
        # context manager closes the handle (the original leaked one per file)
        with open(fp, 'r') as f:
            d = util.tokenize(' '.join(f.readlines()), stemming, stoplist=sw)
        # Counter yields every term frequency in one O(len(d)) pass; the
        # original called d.count(w) per distinct term, which is quadratic
        # in document length
        for w, tf in Counter(d).items():
            inverted_idx.setdefault(w, []).append((doc_id, tf))

    util.save_model(inverted_idx, output_file_path_ii)
    return inverted_idx
def encode_queries(queries_main_folder, wi, stemming):
    """Tokenize each query file and encode its tokens as word indices.

    Out-of-vocabulary tokens (not present in ``wi``) are silently dropped.

    :param queries_main_folder: folder containing one text file per query.
    :param wi: word -> integer index mapping.
    :param stemming: whether to apply stemming during tokenization.
    :return: dict mapping query name (filename without extension) to its list
        of word indices.
    """
    sw = util.load_indri_stopwords()
    encoded_qbn = {}
    for filename in tqdm(os.listdir(queries_main_folder)):
        fp = os.path.join(queries_main_folder, filename)
        if not os.path.isfile(fp):
            continue
        # context manager closes the handle (the original leaked one per file)
        with open(fp, 'r') as f:
            tokenized_query = util.tokenize(' '.join(f.readlines()),
                                            stemming=stemming, stoplist=sw)
        qn = filename.split(r'.')[0]
        # membership test directly on the dict; the .keys() wrapper is redundant
        encoded_qbn[qn] = [wi[w] for w in tokenized_query if w in wi]
    return encoded_qbn
def compute_data():
    """Compute or load every cached artifact needed for the Robust04 pipeline.

    Paths are hard-coded for the Robust04 collection. Each artifact (inverted
    index, encoded documents, word-index map, embedding matrix, encoded
    queries, idf scores) is loaded from disk when its cache file exists,
    otherwise computed and saved.

    :return: tuple (encoded_dbn, encoded_qbn, we_matrix, wi, ii, idf_scores).
    """
    ftext_model_path = '../data/fasttext_models/wiki.en.bin'
    output_path_wi_model = '../data/fasttext_models/wi_robust'
    output_path_ii_model = '../data/fasttext_models/ii_robust'
    output_path_idf_model = '../data/fasttext_models/idf_robust'
    output_path_encoded_d_model = '../data/fasttext_models/encoded_dbn'
    output_path_encoded_q_model = '../data/fasttext_models/encoded_qbn'
    output_path_we_matrix_model = '../data/fasttext_models/word_embeddings_matrix_robust'
    coll_path = '/Users/albertopurpura/ExperimentalCollections/Robust04/processed/corpus'
    queries_main_folder = '/Users/albertopurpura/ExperimentalCollections/Robust04/processed/topics'
    output_model_path = 'data/robust/stemmed_coll_model'
    encoded_out_folder_docs = 'data/robust/stemmed_encoded_docs_ft'
    stemming = True

    if not os.path.isfile(output_path_ii_model):
        print('computing inverted index')
        ii = compute_inverted_index(coll_path, stemming, output_path_ii_model)
        util.save_model(ii, output_path_ii_model)
    else:
        print('loading inverted index')
        ii = util.load_model(output_path_ii_model)

    if not os.path.isfile(output_path_encoded_d_model):
        text_dbn = read_collection(coll_path, output_model_path,
                                   stemming=stemming,
                                   stoplist=util.load_indri_stopwords())
        encoded_dbn, wi, we_matrix = compute_input_data(
            text_dbn, ftext_model_path, encoded_out_folder_docs)
        util.save_model(encoded_dbn, output_path_encoded_d_model)
        util.save_model(wi, output_path_wi_model)
        util.save_model(we_matrix, output_path_we_matrix_model)
    else:
        encoded_dbn = util.load_model(output_path_encoded_d_model)
        wi = util.load_model(output_path_wi_model)
        we_matrix = util.load_model(output_path_we_matrix_model)

    if not os.path.isfile(output_path_encoded_q_model):
        encoded_qbn = encode_queries(queries_main_folder, wi, stemming)
        util.save_model(encoded_qbn, output_path_encoded_q_model)
    else:
        encoded_qbn = util.load_model(output_path_encoded_q_model)

    idf_scores = du.compute_idf(coll_path, stemming, output_path_ii_model,
                                output_path_idf_model)

    # BUG FIX: the original ended with a bare `return` and the result tuple on
    # the following line, so the function returned None and the tuple was
    # unreachable dead code; the tuple is now actually returned.
    return encoded_dbn, encoded_qbn, we_matrix, wi, ii, idf_scores