Code Example #1
 def __init__(self,
              corpus_path,
              freq_index_path,
              tf_idf_index_path,
              create_index=False):
     self.preprocessor = Preprocessor(corpus_path)
     self.corpus = {}
     # if create_index, then run the create_indexes, otherwise, load them from their already existing file locations
     if create_index:
         print("Creating frequency and tf_idf_indexes.")
         self.corpus = self.preprocessor.parse_corpus()
         tokens = self.preprocessor.create_tokens(self.corpus)
         c_counter = self.preprocessor.create_corpus_counter(self.corpus)
         self.indexer = Indexer(c_counter, tokens)
         # Uncomment and run with create_index set to True if you would like to see the results using the
         # different, but faster optimized algorithm for indexing
         # self.frequency_index = self.indexer.create_frequency_index_optimized()
         self.frequency_index = self.indexer.create_frequency_index()
         self.indexer.index_to_file(self.frequency_index, freq_index_path)
         self.tf_idf_index = self.indexer.create_tf_idf_index(
             self.frequency_index, len(self.frequency_index))
         self.indexer.index_to_file(self.tf_idf_index, tf_idf_index_path)
     else:
         print("Loading frequency and tf_idf indexes.")
         self.corpus = self.preprocessor.parse_corpus(True)
         self.indexer = Indexer()
         self.frequency_index = self.indexer.load_index(freq_index_path)
         self.tf_idf_index = self.indexer.load_index(tf_idf_index_path)
     self.query = Query(self.frequency_index, self.tf_idf_index,
                        self.corpus)
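
A hedged usage sketch of this constructor (the class name System comes from Code Example #3 below; the file paths are placeholders, not taken from the original project):

# Hypothetical paths; create_index=True builds and saves both indexes,
# create_index=False loads them from the given files instead.
system = System('path/to/corpus',
                'path/to/frequency_index',
                'path/to/tf_idf_index',
                create_index=True)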
Code Example #2
 def __init__(self,
              corpus_path,
              freq_index_path,
              tf_idf_index_path,
              create_index=False):
     '''
     Sets up the system for testing by parsing the corpus with the Preprocessor and creating the two indexes with the Indexer.
     If the indexes have already been created and written to a file, it is possible to read them in instead of recreating them to save time.
     :param corpus_path: the path to the file containing the corpus of words.
     :param freq_index_path: the path to the file to save/load the frequency index
     :param tf_idf_index_path: the path to the file to save/load the tf-idf index
     :param create_index: if True, creates the frequency index and tf-idf index and writes them to the specified path.  Otherwise, loads them from a file at the specified path.
     '''
     self.preprocessor = Preprocessor(corpus_path)
     self.corpus = {}
     # if create_index, then run the create_indexes, otherwise, load them from their already existing file locations
     if create_index:
         print "Creating frequency and tf_idf_indexes."
         self.corpus = self.preprocessor.parse_corpus()
         tokens = self.preprocessor.create_tokens(self.corpus)
         c_counter = self.preprocessor.create_corpus_counter(self.corpus)
         self.indexer = Indexer(c_counter, tokens)
         # Uncomment and run with create_index set to True if you would like to see the results using the
         # different, but faster optimized algorithm for indexing
         # self.frequency_index = self.indexer.create_frequency_index_optimized()
         self.frequency_index = self.indexer.create_frequency_index()
         self.indexer.index_to_file(self.frequency_index, freq_index_path)
         self.tf_idf_index = self.indexer.create_tf_idf_index(
             self.frequency_index, len(self.frequency_index))
         self.indexer.index_to_file(self.tf_idf_index, tf_idf_index_path)
     else:
         print "Loading frequency and tf_idf indexes."
         self.corpus = self.preprocessor.parse_corpus(True)
         self.indexer = Indexer()
         self.frequency_index = self.indexer.load_index(freq_index_path)
         self.tf_idf_index = self.indexer.load_index(tf_idf_index_path)
     self.query = Query(self.frequency_index, self.tf_idf_index,
                        self.corpus)
Code Example #3
class System:
    def __init__(self,
                 corpus_path,
                 freq_index_path,
                 tf_idf_index_path,
                 create_index=False):
        self.preprocessor = Preprocessor(corpus_path)
        self.corpus = {}
        # if create_index, then run the create_indexes, otherwise, load them from their already existing file locations
        if create_index:
            print("Creating frequency and tf_idf_indexes.")
            self.corpus = self.preprocessor.parse_corpus()
            tokens = self.preprocessor.create_tokens(self.corpus)
            c_counter = self.preprocessor.create_corpus_counter(self.corpus)
            self.indexer = Indexer(c_counter, tokens)
            # Uncomment and run with create_index set to True if you would like to see the results using the
            # different, but faster optimized algorithm for indexing
            # self.frequency_index = self.indexer.create_frequency_index_optimized()
            self.frequency_index = self.indexer.create_frequency_index()
            self.indexer.index_to_file(self.frequency_index, freq_index_path)
            self.tf_idf_index = self.indexer.create_tf_idf_index(
                self.frequency_index, len(self.frequency_index))
            self.indexer.index_to_file(self.tf_idf_index, tf_idf_index_path)
        else:
            print("Loading frequency and tf_idf indexes.")
            self.corpus = self.preprocessor.parse_corpus(True)
            self.indexer = Indexer()
            self.frequency_index = self.indexer.load_index(freq_index_path)
            self.tf_idf_index = self.indexer.load_index(tf_idf_index_path)
        self.query = Query(self.frequency_index, self.tf_idf_index,
                           self.corpus)

    def test_system(self, query):
        results = []
        query_results = self.query.execute_query(query)
        for i in range(len(query_results)):
            if i >= 100:
                break
            string_write = str(self.corpus[int(query_results[i].get("id"))])
            string_write = string_write.replace('\n', "").strip()
            results.append(string_write)
        return results
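
A short, hypothetical call to test_system as defined here (query text and paths are illustrative only, not from the original project):

# Illustrative only: run one query and print the top-ranked documents.
system = System('path/to/corpus', 'path/to/freq_index', 'path/to/tf_idf_index')
results = system.test_system("example query terms")
for doc in results[:10]:
    print(doc)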
Code Example #4
File: preprocessing_main.py  Project: jagol/BA_Thesis
def main():
    # setting up paths and directories
    start_time = time.time()
    args = get_cmd_args()
    config = get_config()
    path_out = get_path_out(args, config)
    path_in = get_path_in(args, config)
    path_lang_model = config['paths'][args.location]['path_lang_model']
    emb_type = config['embeddings']

    if not args.skip_prep:
        prep_output_dir(path_out)

    print('Start preprocessing...')

    # corpus preprocessing
    if not args.skip_lingpp:
        print(('Start tokenization, tagging, lemmatization and marking '
               'stop-words...'))
        if args.corpus == 'dblp':
            lpp = DBLPLingPreprocessor(path_in,
                                       path_out,
                                       path_lang_model,
                                       max_docs=10000)
        elif args.corpus == 'sp':
            lpp = SPLingPreprocessor(path_in, path_out,
                                     path_lang_model)  # , max_docs=10000)
        lpp.preprocess_corpus()
        print('Done.')

    # term extraction and hearst pattern extraction
    if not args.skip_pattern_extr:
        print('Start term extraction and hearst pattern extraction...')
        te = PatternExtractor(path_out)
        te.extract()
        print('Done.')

        print('Run consistency tests on corpus files...')
        test_corpus_files(path_out)
        print('Tests passed.')
        print('Run consistency tests on term pattern files...')
        test_term_pattern_files(path_out)
        print('Tests passed.')

    # indexing of corpus
    if not args.skip_idxer:
        print('Start indexing...')
        idxer = Indexer(path_out)
        print('index tokens...')
        idxer.index_tokens()
        print('index lemmas...')
        idxer.index_lemmas()
        print('convert lemma relations to index...')
        idxer.hierarch_rels_to_lemma_idx()
        print('convert token relations to index...')
        idxer.hierarch_rels_to_token_idx()
        print('Done.')

        print('Run consistency tests on indexing files...')
        test_indexing_files(path_out)
        print('Tests passed.')

    # analyze lemma frequencies
    if not args.skip_freq_an:
        print('Start frequency analysis for tf, df and dl...')
        fa = FreqAnalyzer(path_out)
        print('Calculate token term frequencies...')
        fa.calc_tf('t')
        print('Calculate lemma term frequencies...')
        fa.calc_tf('l')
        print('Prune terms...')
        pruner = Pruner(path_out, min_count=82)
        pruner.prune_tf()
        print('Calculate token document frequencies...')
        fa.calc_df('t')
        print('Calculate lemma document frequencies...')
        fa.calc_df('l')
        print('Calculate tfidf for tokens...')
        fa.calc_tfidf('t')
        print('Calculate tfidf for lemmas...')
        fa.calc_tfidf('l')
        print('Calculate document lengths...')
        fa.calc_dl()
        print('Done.')

    if not args.skip_embeddings:
        emb_types = ['Word2Vec', 'GloVe']
        for etype in emb_types:
            Embedding = get_emb(etype)
            print('Train {} token embeddings...'.format(etype))
            path_input = os.path.join(path_out,
                                      'processed_corpus/token_idx_corpus.txt')
            embs_fname = Embedding.train(path_input,
                                         'embs_token_global_' + etype,
                                         path_out)
            print('{} embeddings written to: {}'.format(etype, embs_fname))
            print('Train {} lemma embeddings...'.format(etype))
            path_input = os.path.join(path_out,
                                      'processed_corpus/lemma_idx_corpus.txt')
            embs_fname = Embedding.train(path_input,
                                         'embs_lemma_global_' + etype,
                                         path_out)
            print('{} embeddings written to: {}'.format(etype, embs_fname))
            # embs.calc_combined_term_vecs()

        print('Test if all terms have embeddings...')
        test_embedding_files(path_out)
        print('Tests passed.')

    if not args.skip_doc_embs:
        print('Calculating document embeddings')
        doc_embedder = DocEmbedder(path_out, emb_type)
        doc_embedder.embed_docs()
        print('Done')

    end_time = time.time()
    time_used = end_time - start_time
    print('Time used: {}'.format(time_used))
    print('Done.')
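
get_cmd_args and get_config are project helpers that are not shown in this excerpt. As a rough sketch, the command-line flags this main() relies on could be declared with argparse roughly as follows; the flag names are inferred from the args.* accesses above and the defaults are assumptions, so the original implementation may differ:

import argparse

def get_cmd_args():
    # Sketch only: flag names inferred from how args is used in main().
    parser = argparse.ArgumentParser()
    parser.add_argument('--location', default='local')
    parser.add_argument('--corpus', choices=['dblp', 'sp'], default='dblp')
    for flag in ['skip_prep', 'skip_lingpp', 'skip_pattern_extr', 'skip_idxer',
                 'skip_freq_an', 'skip_embeddings', 'skip_doc_embs']:
        parser.add_argument('--' + flag, action='store_true')
    return parser.parse_args()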
Code Example #5
def main():

    # Read dataset
    dict = index_dataset()

    # Preprocess dataset if needed
    if not os.path.exists('./objects/indexer.pickle') or not os.path.exists(
            './objects/knn.pickle'):
        dataset, corpus = preprocess_dataset(dict,
                                             lemmatize=True,
                                             remove_stopwords=True,
                                             measure_time=True)

    # Load or create indexer
    if os.path.exists('./objects/indexer.pickle'):
        indexer = load_object('./objects/indexer.pickle')
    else:
        indexer = Indexer(dataset, measure_time=True)
        save_object(indexer, './objects/indexer.pickle')

    #Load or create KNN
    if os.path.exists('./objects/knn.pickle'):
        knn = load_object('./objects/knn.pickle')
    else:
        # Initialize KNN with given dataset
        knn = KNN(dataset, corpus, measure_time=True)
        save_object(knn, './objects/knn.pickle')

    # Main loop for user input
    print("Type a question:")
    q = input()
    while q != 'quit':

        processed_input = preprocess_input(q,
                                           lemmatize=True,
                                           remove_stopwords=True)

        terms_to_search_for = list(processed_input.keys())

        print('Terms to search for:')
        print(terms_to_search_for)
        print()

        containing_docs = indexer.retrieve_documents(terms_to_search_for,
                                                     measure_time=True)

        res = knn.find_nearest_neigbours(processed_input,
                                         containing_docs,
                                         k=10,
                                         measure_time=True)

        print("\nResults:\n")
        i = 1
        for r in res:
            print(f'#{i}')
            print(r)
            print()
            i += 1

        print("Type a question:")
        q = input()
コード例 #6
0
class System:
    def __init__(self,
                 corpus_path,
                 freq_index_path,
                 tf_idf_index_path,
                 create_index=False):
        '''
        Sets up the system for testing by parsing the corpus with the Preprocessor and creating the two indexes with the Indexer.
        If the indexes have already been created and written to a file, it is possible to read them in instead of recreating them to save time.
        :param corpus_path: the path to the file containing the corpus of words.
        :param freq_index_path: the path to the file to save/load the frequency index
        :param tf_idf_index_path: the path to the file to save/load the tf-idf index
        :param create_index: if True, creates the frequency index and tf-idf index and writes them to the specified path.  Otherwise, loads them from a file at the specified path.
        '''
        self.preprocessor = Preprocessor(corpus_path)
        self.corpus = {}
        # if create_index, then run the create_indexes, otherwise, load them from their already existing file locations
        if create_index:
            print "Creating frequency and tf_idf_indexes."
            self.corpus = self.preprocessor.parse_corpus()
            tokens = self.preprocessor.create_tokens(self.corpus)
            c_counter = self.preprocessor.create_corpus_counter(self.corpus)
            self.indexer = Indexer(c_counter, tokens)
            # Uncomment and run with create_index set to True if you would like to see the results using the
            # different, but faster optimized algorithm for indexing
            # self.frequency_index = self.indexer.create_frequency_index_optimized()
            self.frequency_index = self.indexer.create_frequency_index()
            self.indexer.index_to_file(self.frequency_index, freq_index_path)
            self.tf_idf_index = self.indexer.create_tf_idf_index(
                self.frequency_index, len(self.frequency_index))
            self.indexer.index_to_file(self.tf_idf_index, tf_idf_index_path)
        else:
            print "Loading frequency and tf_idf indexes."
            self.corpus = self.preprocessor.parse_corpus(True)
            self.indexer = Indexer()
            self.frequency_index = self.indexer.load_index(freq_index_path)
            self.tf_idf_index = self.indexer.load_index(tf_idf_index_path)
        self.query = Query(self.frequency_index, self.tf_idf_index,
                           self.corpus)

    def test_system(self, run_name, query_path, results_path):
        '''
        Tests the system by running all the queries, saving the results to a file.  Formats the file output to match expected format for use with trec_eval
        :param run_name: a name for the current run, used to distinguish results
        :param query_path: the path to the file containing the queries
        :param results_path: the desired path to save the results file in
        '''
        query_tree = ElementTree.parse(query_path)
        print "Testing system on queries."
        with open(results_path, 'wb') as f:
            for child in query_tree.getroot():
                qid_re = re.compile("\d{3}")
                qid = int(qid_re.search(child[0].text).group(0))
                query_results = self.query.execute_query(child[1].text)
                for i in range(len(query_results)):
                    if i >= 1000:
                        break
                    print_out = str(qid) + " " + "Q0" + " " + str(query_results[i].get("id")) + " " + str(i + 1) + " " + \
                                str(query_results[i].get("score")) + " " + run_name + "\n"
                    f.write(print_out)
        print "Query results saved to " + results_path
Code Example #7
def main():
    args = get_cmd_args()
    location = args.location
    corpus = args.corpus
    config = get_config()
    path_out = config['paths'][location][corpus]['path_out']
    emb_type = config['embeddings']

    if not args.skip_prep:
        prep_output_dir(path_out)

    # Copy TG files into dir-system.
    papers_to_pp_token_corpus(config, location, corpus)
    copy_keywords_to_terms(config, location, corpus)

    # Index corpus.
    if not args.skip_idxer:
        # print('Start indexing...')
        idxer = Indexer(path_out)
        # idxer.index_tokens()
        # print('Finished indexing.')
        print('Start building subtoken index...')
        idxer.build_token_contains()
        print('Finished building subtoken index.')

    # Frequency analysis.
    if not args.skip_freq_an:
        print('Start frequency analysis for tf, df and dl...')
        fa = FreqAnalyzer(path_out)
        print('Calculate token term frequencies...')
        fa.calc_tf('t')
        print('Calculate token document frequencies...')
        fa.calc_df('t')
        print('Calculate tfidf for tokens...')
        fa.calc_tfidf('t')
        print('Calculate document lengths...')
        fa.calc_dl()
        print('Finished frequency analysis.')

    if not args.skip_embeddings:
        emb_types = ['Word2Vec', 'GloVe', 'ELMo']
        for etype in emb_types:
            Embedding = get_emb(etype)
            print('Train {} token embeddings...'.format(etype))
            path_input = os.path.join(path_out,
                                      'processed_corpus/token_idx_corpus.txt')
            embs_fname = Embedding.train(
                path_input, 'embs_token_global_'+etype, path_out)
            print('{} embeddings written to: {}'.format(etype, embs_fname))

    if not args.skip_doc_embs:
        print('Calculating document embeddings...')
        doc_embedder = DocEmbedder(path_out, emb_type)
        doc_embedder.embed_token_docs()
        print('Finished document embeddings.')

    if not args.skip_word_distr:
        print('Create term distributions pickle file...')

        path_tf = os.path.join(path_out, 'frequencies/tf_tokens.json')
        path_tfidf = os.path.join(path_out, 'frequencies/tfidf_tokens.json')
        path_dl = os.path.join(path_out, 'frequencies/dl.json')
        path_term_distr = os.path.join(
            path_out, 'frequencies/term_distr_tokens.json')

        # Load frequencies.
        with open(path_tf, 'r', encoding='utf8') as f_tf:
            tf_base = json.load(f_tf)
            with open(path_tfidf, 'r', encoding='utf8') as f_tfidf:
                tfidf_base = json.load(f_tfidf)
                with open(path_dl, 'r', encoding='utf8') as f_dl:
                    dl_base = json.load(f_dl)

        # Create term_distr. Note: term_distr_base is not defined in this
        # excerpt; in the surrounding module it is presumably a nested
        # dict-of-dicts (e.g. collections.defaultdict(dict)).
        for doc_id in tfidf_base:
            for word_id in tf_base[doc_id]:
                tf = tf_base[doc_id][word_id]
                tfidf = tfidf_base[doc_id][word_id]
                term_distr_base[int(doc_id)][int(word_id)] = (tf, tfidf)
            term_distr_base[int(doc_id)][-1] = dl_base[doc_id]

        # Dump term_distr.
        with open(path_term_distr, 'wb') as f:
            pickle.dump(term_distr_base, f)
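
To make the pickled structure explicit, here is a hedged sketch of reading it back (reusing path_term_distr from the snippet above; everything else is illustrative):

import pickle

with open(path_term_distr, 'rb') as f:
    term_distr = pickle.load(f)

doc_id = next(iter(term_distr))      # pick an arbitrary document id (int)
print('document length:', term_distr[doc_id][-1])  # key -1 stores the document length
for word_id, value in term_distr[doc_id].items():
    if word_id == -1:
        continue                     # skip the document-length entry
    tf, tfidf = value                # each word id maps to a (tf, tfidf) pair
    print(word_id, tf, tfidf)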
Code Example #8
if __name__ == '__main__':

    print('Initializing Search Engine...')
    tokenized_corpus = p.get_tokenized_corpus(
        'D:/9th semester/Information Retrieval Lab/package/scrapper/data/tokenized_corpus1.pkl'
    ) + p.get_tokenized_corpus(
        'D:/9th semester/Information Retrieval Lab/package/scrapper/data/tokenized_corpus2.pkl'
    ) + p.get_tokenized_corpus(
        'D:/9th semester/Information Retrieval Lab/package/scrapper/data/tokenized_corpus3.pkl'
    ) + p.get_tokenized_corpus(
        'D:/9th semester/Information Retrieval Lab/package/scrapper/data/tokenized_corpus4.pkl'
    ) + p.get_tokenized_corpus(
        'D:/9th semester/Information Retrieval Lab/package/scrapper/data/tokenized_corpus5.pkl'
    )
    model = Indexer(tokenized_corpus)
    corpus = p.get_corpus(
        'D:/9th semester/Information Retrieval Lab/package/scrapper/data/corpus1.pkl'
    ) + p.get_corpus(
        'D:/9th semester/Information Retrieval Lab/package/scrapper/data/corpus2.pkl'
    ) + p.get_corpus(
        'D:/9th semester/Information Retrieval Lab/package/scrapper/data/corpus3.pkl'
    ) + p.get_corpus(
        'D:/9th semester/Information Retrieval Lab/package/scrapper/data/corpus4.pkl'
    ) + p.get_corpus(
        'D:/9th semester/Information Retrieval Lab/package/scrapper/data/corpus5.pkl'
    )
    cat_data = pd.DataFrame(
        columns=['id', 'tokens', 'category', 'category_code', 'pred_category'])
    cat_data = t.get_categorised_data(
        'D:/9th semester/Information Retrieval Lab/package/scrapper/data/data_categorisation1_final.pkl',