class System:
    def __init__(self, corpus_path, freq_index_path, tf_idf_index_path, create_index=False):
        self.preprocessor = Preprocessor(corpus_path)
        self.corpus = {}
        # if create_index, then run the create_indexes, otherwise, load them from their already existing file locations
        if create_index:
            print("Creating frequency and tf_idf_indexes.")
            self.corpus = self.preprocessor.parse_corpus()
            tokens = self.preprocessor.create_tokens(self.corpus)
            c_counter = self.preprocessor.create_corpus_counter(self.corpus)
            self.indexer = Indexer(c_counter, tokens)
            # Uncomment and run with create_index set to True if you would like to see the results using the
            # different, but faster optimized algorithm for indexing
            # self.frequency_index = self.indexer.create_frequency_index_optimized()
            self.frequency_index = self.indexer.create_frequency_index()
            self.indexer.index_to_file(self.frequency_index, freq_index_path)
            self.tf_idf_index = self.indexer.create_tf_idf_index(
                self.frequency_index, len(self.frequency_index))
            self.indexer.index_to_file(self.tf_idf_index, tf_idf_index_path)
        else:
            print("Loading frequency and tf_idf indexes.")
            self.corpus = self.preprocessor.parse_corpus(True)
            self.indexer = Indexer()
            self.frequency_index = self.indexer.load_index(freq_index_path)
            self.tf_idf_index = self.indexer.load_index(tf_idf_index_path)
        self.query = Query(self.frequency_index, self.tf_idf_index, self.corpus)

    def test_system(self, query):
        results = []
        query_results = self.query.execute_query(query)
        for i in range(len(query_results)):
            if i >= 100:
                break
            string_write = str(self.corpus[int(query_results[i].get("id"))])
            string_write = string_write.replace('\n', "").strip()
            results.append(string_write)
        return results
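# Hedged usage sketch (not part of the original code): one way the System class above
# could be driven end-to-end. The file paths and the query string are hypothetical
# placeholders, not values taken from the original project.
if __name__ == '__main__':
    system = System('data/corpus.txt',            # hypothetical corpus path
                    'data/frequency_index.json',  # hypothetical frequency-index path
                    'data/tf_idf_index.json',     # hypothetical tf-idf-index path
                    create_index=True)            # build and persist both indexes on the first run
    for line in system.test_system('information retrieval'):
        print(line)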
def main():
    # setting up paths and directories
    start_time = time.time()
    args = get_cmd_args()
    config = get_config()
    path_out = get_path_out(args, config)
    path_in = get_path_in(args, config)
    path_lang_model = config['paths'][args.location]['path_lang_model']
    emb_type = config['embeddings']
    if not args.skip_prep:
        prep_output_dir(path_out)

    print('Start preprocessing...')

    # corpus preprocessing
    if not args.skip_lingpp:
        print(('Start tokenization, tagging, lemmatization and marking '
               'stop-words...'))
        if args.corpus == 'dblp':
            lpp = DBLPLingPreprocessor(
                path_in, path_out, path_lang_model, max_docs=10000)
        elif args.corpus == 'sp':
            lpp = SPLingPreprocessor(
                path_in, path_out, path_lang_model)  # , max_docs=10000)
        lpp.preprocess_corpus()
        print('Done.')

    # term extraction and hearst pattern extraction
    if not args.skip_pattern_extr:
        print('Start term extraction and hearst pattern extraction...')
        te = PatternExtractor(path_out)
        te.extract()
        print('Done.')
        print('Run consistency tests on corpus files...')
        test_corpus_files(path_out)
        print('Tests passed.')
        print('Run consistency tests on term pattern files...')
        test_term_pattern_files(path_out)
        print('Tests passed.')

    # indexing of corpus
    if not args.skip_idxer:
        print('Start indexing...')
        idxer = Indexer(path_out)
        print('index tokens...')
        idxer.index_tokens()
        print('index lemmas...')
        idxer.index_lemmas()
        print('convert lemma relations to index...')
        idxer.hierarch_rels_to_lemma_idx()
        print('convert token relations to index...')
        idxer.hierarch_rels_to_token_idx()
        print('Done.')
        print('Run consistency tests on indexing files...')
        test_indexing_files(path_out)
        print('Tests passed.')

    # analyze lemma frequencies
    if not args.skip_freq_an:
        print('Start frequency analysis for tf, df and dl...')
        fa = FreqAnalyzer(path_out)
        print('Calculate token term frequencies...')
        fa.calc_tf('t')
        print('Calculate lemma term frequencies...')
        fa.calc_tf('l')
        print('Prune terms...')
        pruner = Pruner(path_out, min_count=82)
        pruner.prune_tf()
        print('Calculate token document frequencies...')
        fa.calc_df('t')
        print('Calculate lemma document frequencies...')
        fa.calc_df('l')
        print('Calculate tfidf for tokens...')
        fa.calc_tfidf('t')
        print('Calculate tfidf for lemmas...')
        fa.calc_tfidf('l')
        print('Calculate document lengths...')
        fa.calc_dl()
        print('Done.')

    if not args.skip_embeddings:
        emb_types = ['Word2Vec', 'GloVe']
        for etype in emb_types:
            Embedding = get_emb(etype)
            print('Train {} token embeddings...'.format(etype))
            path_input = os.path.join(path_out,
                                      'processed_corpus/token_idx_corpus.txt')
            embs_fname = Embedding.train(path_input,
                                         'embs_token_global_' + etype, path_out)
            print('{} embeddings written to: {}'.format(etype, embs_fname))
            print('Train {} lemma embeddings...'.format(etype))
            path_input = os.path.join(path_out,
                                      'processed_corpus/lemma_idx_corpus.txt')
            embs_fname = Embedding.train(path_input,
                                         'embs_lemma_global_' + etype, path_out)
            print('{} embeddings written to: {}'.format(etype, embs_fname))
            # embs.calc_combined_term_vecs()
        print('Test if all terms have embeddings...')
        test_embedding_files(path_out)
        print('Tests passed.')

    if not args.skip_doc_embs:
        print('Calculating document embeddings')
        doc_embedder = DocEmbedder(path_out, emb_type)
        doc_embedder.embed_docs()
        print('Done')

    end_time = time.time()
    time_used = end_time - start_time
    print('Time used: {}'.format(time_used))
    print('Done.')
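# Hedged sketch (assumption, not the original get_cmd_args): a minimal argparse-based
# implementation inferred only from the attributes the pipeline above reads off `args`.
import argparse

def get_cmd_args():
    parser = argparse.ArgumentParser(description='Run the corpus-processing pipeline.')
    parser.add_argument('--location', required=True,
                        help='key into config["paths"] for the current machine')
    parser.add_argument('--corpus', choices=['dblp', 'sp'], required=True,
                        help='which corpus to preprocess')
    # One boolean switch per pipeline stage; each defaults to False so every stage runs.
    for flag in ('skip_prep', 'skip_lingpp', 'skip_pattern_extr', 'skip_idxer',
                 'skip_freq_an', 'skip_embeddings', 'skip_doc_embs'):
        parser.add_argument('--' + flag, action='store_true')
    return parser.parse_args()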
def main():
    # Read dataset
    dataset_dict = index_dataset()

    # Preprocess dataset if needed
    if not os.path.exists('./objects/indexer.pickle') or not os.path.exists(
            './objects/knn.pickle'):
        dataset, corpus = preprocess_dataset(dataset_dict,
                                             lemmatize=True,
                                             remove_stopwords=True,
                                             measure_time=True)

    # Load or create indexer
    if os.path.exists('./objects/indexer.pickle'):
        indexer = load_object('./objects/indexer.pickle')
    else:
        indexer = Indexer(dataset, measure_time=True)
        save_object(indexer, './objects/indexer.pickle')

    # Load or create KNN
    if os.path.exists('./objects/knn.pickle'):
        knn = load_object('./objects/knn.pickle')
    else:
        # Initialize KNN with given dataset
        knn = KNN(dataset, corpus, measure_time=True)
        save_object(knn, './objects/knn.pickle')

    # Main loop for user input
    print("Type a question:")
    q = input()
    while q != 'quit':
        processed_input = preprocess_input(q, lemmatize=True, remove_stopwords=True)
        terms_to_search_for = list(processed_input.keys())
        print('Terms to search for:')
        print(terms_to_search_for)
        print()
        containing_docs = indexer.retrieve_documents(terms_to_search_for,
                                                     measure_time=True)
        res = knn.find_nearest_neigbours(processed_input, containing_docs,
                                         k=10, measure_time=True)
        print("\nResults:\n")
        i = 1
        for r in res:
            print(f'#{i}')
            print(r)
            print()
            i += 1
        print("Type a question:")
        q = input()
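# Hedged sketch (assumption): pickle-based helpers matching the save_object/load_object
# calls used above; the original implementations are not shown in this snippet.
import pickle

def save_object(obj, path):
    # Serialize obj to path so expensive objects (Indexer, KNN) can be reused between runs.
    with open(path, 'wb') as f:
        pickle.dump(obj, f)

def load_object(path):
    # Inverse of save_object: read a previously pickled object back from disk.
    with open(path, 'rb') as f:
        return pickle.load(f)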
class System:
    def __init__(self, corpus_path, freq_index_path, tf_idf_index_path, create_index=False):
        '''
        Sets up the system for testing by parsing the corpus with the Preprocessor and creating
        the two indexes with the Indexer. If the indexes have already been created and written to
        a file, it is possible to read them in instead of recreating them to save time.

        :param corpus_path: the path to the file containing the corpus of words
        :param freq_index_path: the path to the file to save/load the frequency index
        :param tf_idf_index_path: the path to the file to save/load the tf-idf index
        :param create_index: if True, creates the frequency index and tf-idf index and writes them
            to the specified paths. Otherwise, loads them from files at the specified paths.
        '''
        self.preprocessor = Preprocessor(corpus_path)
        self.corpus = {}
        # if create_index, then run the create_indexes, otherwise, load them from their already existing file locations
        if create_index:
            print("Creating frequency and tf_idf_indexes.")
            self.corpus = self.preprocessor.parse_corpus()
            tokens = self.preprocessor.create_tokens(self.corpus)
            c_counter = self.preprocessor.create_corpus_counter(self.corpus)
            self.indexer = Indexer(c_counter, tokens)
            # Uncomment and run with create_index set to True if you would like to see the results using the
            # different, but faster optimized algorithm for indexing
            # self.frequency_index = self.indexer.create_frequency_index_optimized()
            self.frequency_index = self.indexer.create_frequency_index()
            self.indexer.index_to_file(self.frequency_index, freq_index_path)
            self.tf_idf_index = self.indexer.create_tf_idf_index(
                self.frequency_index, len(self.frequency_index))
            self.indexer.index_to_file(self.tf_idf_index, tf_idf_index_path)
        else:
            print("Loading frequency and tf_idf indexes.")
            self.corpus = self.preprocessor.parse_corpus(True)
            self.indexer = Indexer()
            self.frequency_index = self.indexer.load_index(freq_index_path)
            self.tf_idf_index = self.indexer.load_index(tf_idf_index_path)
        self.query = Query(self.frequency_index, self.tf_idf_index, self.corpus)

    def test_system(self, run_name, query_path, results_path):
        '''
        Tests the system by running all the queries and saving the results to a file. Formats the
        file output to match the format expected by trec_eval.

        :param run_name: a name for the current run, used to distinguish results
        :param query_path: the path to the file containing the queries
        :param results_path: the desired path to save the results file in
        '''
        query_tree = ElementTree.parse(query_path)
        print("Testing system on queries.")
        with open(results_path, 'w') as f:
            for child in query_tree.getroot():
                qid_re = re.compile(r"\d{3}")
                qid = int(qid_re.search(child[0].text).group(0))
                query_results = self.query.execute_query(child[1].text)
                for i in range(len(query_results)):
                    if i >= 1000:
                        break
                    print_out = str(qid) + " " + "Q0" + " " + str(query_results[i].get("id")) + " " + \
                        str(i + 1) + " " + str(query_results[i].get("score")) + " " + run_name + "\n"
                    f.write(print_out)
        print("Query results saved to " + results_path)
def main():
    args = get_cmd_args()
    location = args.location
    corpus = args.corpus
    config = get_config()
    path_out = config['paths'][location][corpus]['path_out']
    emb_type = config['embeddings']

    if not args.skip_prep:
        prep_output_dir(path_out)
        # Copy TG files into dir-system.
        papers_to_pp_token_corpus(config, location, corpus)
        copy_keywords_to_terms(config, location, corpus)

    # Index corpus.
    if not args.skip_idxer:
        # print('Start indexing...')
        idxer = Indexer(path_out)
        # idxer.index_tokens()
        # print('Finished indexing.')
        print('Start building subtoken index...')
        idxer.build_token_contains()
        print('Finished building subtoken index.')

    # Frequency analysis.
    if not args.skip_freq_an:
        print('Start frequency analysis for tf, df and dl...')
        fa = FreqAnalyzer(path_out)
        print('Calculate token term frequencies...')
        fa.calc_tf('t')
        print('Calculate token document frequencies...')
        fa.calc_df('t')
        print('Calculate tfidf for tokens...')
        fa.calc_tfidf('t')
        print('Calculate document lengths...')
        fa.calc_dl()
        print('Finished frequency analysis.')

    if not args.skip_embeddings:
        emb_types = ['Word2Vec', 'GloVe', 'ELMo']
        for etype in emb_types:
            Embedding = get_emb(etype)
            print('Train {} token embeddings...'.format(etype))
            path_input = os.path.join(path_out,
                                      'processed_corpus/token_idx_corpus.txt')
            embs_fname = Embedding.train(
                path_input, 'embs_token_global_'+etype, path_out)
            print('{} embeddings written to: {}'.format(etype, embs_fname))

    if not args.skip_doc_embs:
        print('Calculating document embeddings...')
        doc_embedder = DocEmbedder(path_out, emb_type)
        doc_embedder.embed_token_docs()
        print('Finished document embeddings.')

    if not args.skip_word_distr:
        print('Create term distributions pickle file...')
        path_tf = os.path.join(path_out, 'frequencies/tf_tokens.json')
        path_tfidf = os.path.join(path_out, 'frequencies/tfidf_tokens.json')
        path_dl = os.path.join(path_out, 'frequencies/dl.json')
        path_term_distr = os.path.join(
            path_out, 'frequencies/term_distr_tokens.json')

        # Load frequencies.
        with open(path_tf, 'r', encoding='utf8') as f_tf:
            tf_base = json.load(f_tf)
        with open(path_tfidf, 'r', encoding='utf8') as f_tfidf:
            tfidf_base = json.load(f_tfidf)
        with open(path_dl, 'r', encoding='utf8') as f_dl:
            dl_base = json.load(f_dl)

        # Create term_distr: doc-id -> {word-id -> (tf, tfidf), -1 -> document length}.
        term_distr_base = {}
        for doc_id in tfidf_base:
            term_distr_base[int(doc_id)] = {}
            for word_id in tf_base[doc_id]:
                tf = tf_base[doc_id][word_id]
                tfidf = tfidf_base[doc_id][word_id]
                term_distr_base[int(doc_id)][int(word_id)] = (tf, tfidf)
            term_distr_base[int(doc_id)][-1] = dl_base[doc_id]

        # Dump term_distr.
        with open(path_term_distr, 'wb') as f:
            pickle.dump(term_distr_base, f)
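# Hedged sketch (assumption): how a downstream step might read the term-distribution
# pickle written above; the path and value layout follow the code in main().
import os
import pickle

def load_term_distr(path_out):
    path_term_distr = os.path.join(path_out, 'frequencies/term_distr_tokens.json')
    with open(path_term_distr, 'rb') as f:
        term_distr = pickle.load(f)
    # term_distr[doc_id][word_id] -> (tf, tfidf); term_distr[doc_id][-1] -> document length
    return term_distr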
if __name__ == '__main__':
    print('Initializing Search Engine...')

    DATA_DIR = 'D:/9th semester/Information Retrieval Lab/package/scrapper/data'

    # Load and concatenate the five tokenized-corpus pickles.
    tokenized_corpus = p.get_tokenized_corpus(DATA_DIR + '/tokenized_corpus1.pkl')
    for i in range(2, 6):
        tokenized_corpus = tokenized_corpus + p.get_tokenized_corpus(
            DATA_DIR + '/tokenized_corpus{}.pkl'.format(i))

    model = Indexer(tokenized_corpus)

    # Load and concatenate the five raw-corpus pickles.
    corpus = p.get_corpus(DATA_DIR + '/corpus1.pkl')
    for i in range(2, 6):
        corpus = corpus + p.get_corpus(DATA_DIR + '/corpus{}.pkl'.format(i))

    cat_data = pd.DataFrame(
        columns=['id', 'tokens', 'category', 'category_code', 'pred_category'])
    cat_data = t.get_categorised_data(
        DATA_DIR + '/data_categorisation1_final.pkl',