Code example #1
def run_engine(corpus_path="testData", output_path="posting", stemming=True, glove_dict=None):
    """
    This function build the inverted index over the corpus.
    send each tweet to parsing and indexing.
    if the stemming is True the parsing will use the stemmer on the tokens.
    :param glove_dict: Glove file including all word vectors
    :param corpus_path: root folder containing the raw tweet files
    :param output_path for the inverted index, posting files and tweets dictionary
    :param stemming if True use stemmer on terms
    """

    config = ConfigClass(corpus_path, number_of_term_buckets=26, number_of_entities_buckets=2, output_path=output_path)
    r = ReadFile(corpus_path=config.get_corpusPath())
    p = Parse(stemming)
    indexer = Indexer(config)
    all_files_paths = glob.glob(config.get_corpusPath() + "\\*\\*.snappy.parquet")
    all_files_names = [file_name[file_name.find("\\") + 1:] for file_name in all_files_paths]
    start_time = time.time()
    file_counter = 0
    for file_name in all_files_names:
        file_start_time = time.time()
        # print("start file :", file_counter)
        documents_list = [document for document in r.read_file(file_name=file_name)]
        # Iterate over every document in the file
        for idx, document in enumerate(documents_list):
            parsed_document = p.parse_doc(document)
            indexer.add_new_doc(parsed_document, glove_dict)
        # print("end file number ", file_counter, " in: ", time.time() - file_start_time)
        file_counter += 1
    total_time = time.time() - start_time
    indexer.finish_indexing()
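
A minimal driver sketch for this variant, assuming the GloVe vectors were loaded into a dictionary beforehand; the loader helper and file path below are illustrative assumptions, not part of the original project:

def load_glove(path):
    # Build a {word: vector} dictionary from a GloVe text file (format: word v1 v2 ...).
    glove = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            parts = line.split()
            glove[parts[0]] = [float(x) for x in parts[1:]]
    return glove

if __name__ == "__main__":
    glove_dict = load_glove("glove.twitter.27B.25d.txt")  # hypothetical path
    run_engine(corpus_path="testData", output_path="posting",
               stemming=True, glove_dict=glove_dict)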
Code example #2
def run_engine(corpus_path, output_path, stemming, queries, num_docs_to_retrieve):
    """
    :return:
    """
    number_of_documents = 0

    config = ConfigClass(corpus_path, output_path, stemming)
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(stemming)
    indexer = Indexer(config, p.terms_dic_to_document)
    start_time = time.time()
    # Iterate over every document in every file
    for i in r.filesPath:
        documents_list = r.read_file(i)
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = p.parse_doc(document)
            # update the number of doc in system
            number_of_documents += 1
            # index the document data
            indexer.add_new_doc(parsed_document)
        # print(time.time() - start_time)
    print('--------------------------')
    print('Start writing to disk left overs')
    indexer.save_all_left_overs()
    print('Finish without waiting ' + str(time.time() - start_time))
    print('Start waiting')
    indexer.wait_untill_all_finish()
    print('End Waiting')
    print('Finished writing to disk left overs')
    print('--------------------------')
    print('Finished parsing and indexing. Starting to export files')
    print('Finish all Time ' + str(time.time() - start_time))
    utils.save_obj(indexer.inverted_idx, "inverted_idx")
Code example #3
def run_engine(config):
    """
    :return:
    """
    number_of_documents = 0
    output_path = config.savedFileMainFolder
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(config.toStem)
    m_Indexer = Indexer(output_path)
    parquetPaths = []
    for (dirPath, dirNames, fileNames) in os.walk(config.get__corpusPath()):
        for fileName in fileNames:
            parquetPaths.append((dirPath + '\\' + fileName))
    for i in range(len(parquetPaths)):
        parquetPaths[i] = parquetPaths[i][parquetPaths[i].find('\\') + 1:]
        if ".DS_Store" in parquetPaths[i]:
            continue
        parquet = r.read_file(file_name=parquetPaths[i])
        for document in parquet:
            number_of_documents += 1
            parsed_document = p.parse_doc(document)
            # index the document data
            m_Indexer.add_new_doc(parsed_document)
    # If there are more postings to flush, do it.
    if len(m_Indexer.postingDictionary) > 0:
        utils.save_obj(m_Indexer.postingDictionary,
                       m_Indexer.postingsPath + '/' + str(m_Indexer.pkl_key))
    # Clear single terms and entities, updated inverted index to disk.
    clearSingleEntities(m_Indexer.inverted_idx, p, output_path,
                        m_Indexer.num_of_docs_in_corpus)
    utils.save_obj(m_Indexer.inverted_idx, output_path + '/inverted_idx')
    m_Indexer.inverted_idx.clear()
    utils.save_obj(number_of_documents,
                   output_path + '/PostingFiles/num_of_docs_in_corpus')
Code example #4
def run_engine(config):
    """

    :param config:
    :return:
    """
    number_of_documents = 0

    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(config.toStem)
    indexer = Indexer(config)
    parquet_list = r.read_all_parquet()
    for doc_list in parquet_list:
        # for i in tqdm(range(0, len(doc_list))):  # for every doc
        for i in range(0, len(doc_list)):  # for every doc
            # parse the document
            parsed_document = p.parse_doc(doc_list[i])
            if parsed_document is None:
                continue
            number_of_documents += 1

            # index the document data
            indexer.add_new_doc(parsed_document)

    #print('Finished parsing and indexing. Starting to export files')

    indexer.save_postings()  # saves the remaining posting file .
    PostingsMerge(indexer).chunks_merging()
    utils.save_dict_as_pickle(indexer.inverted_idx, "inverted_idx",
                              config.get_out_path())
Code example #5
    def task(self, queue, document_list):
        parser = Parse()
        indexer = Indexer(self._config)
        for idx, document in enumerate(document_list):
            # parse the document
            parsed_document = parser.parse_doc(document)
            # index the document data
            indexer.add_new_doc(parsed_document)

        queue.put(indexer.get_inverted_index())
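
A sketch of how a worker like this might be driven with the standard multiprocessing module; the engine object exposing task() and the pre-split document chunks are assumptions, not shown above:

from multiprocessing import Process, Queue

def build_in_parallel(engine, document_chunks):
    queue = Queue()
    workers = [Process(target=engine.task, args=(queue, chunk))
               for chunk in document_chunks]
    for w in workers:
        w.start()
    # Each worker puts its partial inverted index on the queue; drain before joining.
    partial_indexes = [queue.get() for _ in workers]
    for w in workers:
        w.join()
    return partial_indexes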
Code example #6
def run_engine(corpus_path, output_path, stemming, queries,
               num_docs_to_retrieve, word2vec):
    """

    :return:
    """
    # print("start: ", time.asctime(time.localtime(time.time())))
    number_of_documents = 0
    num_of_writes = 1
    config = ConfigClass(corpus_path)
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(stemming)
    indexer = Indexer(config, word2vec)
    # documents_list = r.read_file(file_name='covid19_07-30.snappy.parquet')  # TODO - handel all files ~50 (can do with from multiprocessing.pool import ThreadPool)

    # Iterate over every document in the file
    counter = 0
    names = r.get_files_names_in_dir()
    for name in names:
        documents_list = r.read_file_by_name(file_name=str(name))
        for idx, document in enumerate(documents_list):
            parsed_document = p.parse_doc(document)  # parse the document
            if parsed_document == {}:  # RT
                continue
            number_of_documents += 1

            indexer.add_new_doc(parsed_document,
                                num_of_writes)  # index the document data
            counter += 1
            if counter >= 500000:
                write_and_clean_buffer(indexer, num_of_writes, stemming,
                                       config, output_path)
                counter = 0
                # print("finish parser & index number: ", num_of_writes, " At: ", time.asctime(time.localtime(time.time())))
                num_of_writes += 1
        # print('Finished parsing and indexing. Starting to export files')
    write_and_clean_buffer(indexer, num_of_writes, stemming, config,
                           output_path)
    # print("finish parser & index: ", time.asctime(time.localtime(time.time())))
    indexer.inverted_idx = {
        key: val
        for key, val in indexer.inverted_idx.items() if val != 1
    }
    utils.save_obj(indexer.inverted_idx, "inverted_idx")
    # print("finish save index: ", time.asctime(time.localtime(time.time())))

    return num_of_writes
Code example #7
def test(folder_list, counter, indexer, number_of_documents):
    print(counter)
    cr = datetime.now()
    print(cr)
    p = Parse(config)  # note: relies on a module-level config object
    # Iterate over every tweet in the folder
    for idx, tweet in enumerate(folder_list):
        # parse the tweet
        parsed_document = p.parse_doc(tweet)
        number_of_documents += 1
        # index the tweet data
        indexer.add_new_doc(parsed_document)

    print("number of tweets", number_of_documents)
    cn = datetime.now()
    print(cn)
    counter += 1
Code example #8
def run_engine(config, indexer):
    """
    :return:
    """
    number_of_documents = 0

    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(config)

    doc = r.read_file('benchmark_data_train.snappy.parquet')
    for document in doc:
        parsed_document = p.parse_doc(document)
        indexer.add_new_doc(parsed_document)
        number_of_documents += 1
    capital_letters = p.caps_dict
    indexer.change_inverted_by_caps(capital_letters)
    indexer.save_index('idx_bench')
Code example #9
def run_engine(corpus_path_, output_path_, stemming_):
    """

    :return:
    """

    number_of_documents = 0
    config = ConfigClass(corpuspath=corpus_path_, outputpath=output_path_, stemming=stemming_)
    config.corpusPath = corpus_path_
    config.savedFileMainFolder = output_path_
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse()
    indexer = Indexer(config)

    pathes = r.get_all_path_of_parquet()
    length_of_array = len(pathes)
    iteration = 0
    is_stemmer = config.toStem
    parsed_doc_list = list()
    for i in range(0, length_of_array):
        documents_list = r.get_documents(pathes[i][0], pathes[i][0])
        for doc, j in zip(documents_list, range(len(documents_list))):
            parsed_document = p.parse_doc(doc, stemmer=is_stemmer)
            if parsed_document is None:
                continue
            parsed_doc_list.append(parsed_document)
            number_of_documents += 1
            if number_of_documents % 200000 == 0:
                for doc in parsed_doc_list:
                    indexer.add_new_doc(doc)
                indexer.write_posting_to_txt_file_lower_upper(iteration)
                iteration += 1
                parsed_doc_list.clear()
                parsed_doc_list = list()
            elif j == len(documents_list) - 1 and i == length_of_array - 1:
                for doc in parsed_doc_list:
                    indexer.add_new_doc(doc)
                indexer.write_posting_to_txt_file_lower_upper(iteration)
                parsed_doc_list.clear()
                parsed_doc_list = list()
                indexer.merge_posting_file()
                indexer.merge_two_last_posting_file()
                indexer.split_posting_file_and_create_inverted_index()
                indexer.write_inverted_index_to_txt_file()
                number_of_documents = 0
Code example #10
File: search_engine.py Project: guy94/SearchEngine
def run_engine(corpus_path, output_path, stemming, queries,
               num_docs_to_retrieve):
    """
    :return:
    """
    config = ConfigClass(corpus_path, output_path, stemming)
    number_of_documents = 0
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse()
    indexer = Indexer(config)
    Parse.stemmer = stemming

    corpus_list = r.read_corpus()

    for idx in range(len(corpus_list)):
        documents_list = r.read_file(file_name=corpus_list[idx],
                                     read_corpus=True)
        for i in tqdm(range(len(documents_list))):
            parsed_document = p.parse_doc(documents_list[i])
            if i == len(documents_list) - 1 and idx == len(corpus_list) - 1:
                indexer.is_last_doc = True
            indexer.add_new_doc(parsed_document)
            number_of_documents += 1
        indexer.is_last_doc = False
    documents_list = []

    with open('spell_dict.json', 'w') as f:
        json.dump(indexer.spell_dict, f)

    pickle_out = open("docs_dict_and_extras", "wb")
    pickle.dump(indexer.docs_dict, pickle_out)
    pickle_out.close()

    start = time.time()
    indexer.merge_files()
    end = time.time()
    print("merge time was: {}".format(end - start))

    utils.save_obj(indexer.inverted_idx, "inverted_index")
    pickle_out = open("docs_dict_and_extras", "ab")
    pickle.dump(number_of_documents, pickle_out)
    pickle.dump(Parse.AMOUNT_OF_NUMBERS_IN_CORPUS, pickle_out)
    pickle.dump(indexer.dump_path, pickle_out)
    pickle_out.close()
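
Since several objects are pickled one after another into "docs_dict_and_extras", a reader has to unpickle them in the same order they were written; a small read-back sketch:

import pickle

with open("docs_dict_and_extras", "rb") as f:
    docs_dict = pickle.load(f)
    number_of_documents = pickle.load(f)
    amount_of_numbers_in_corpus = pickle.load(f)
    dump_path = pickle.load(f)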
Code example #11
def create_table(stemming, corpus):
    myclient = pymongo.MongoClient("mongodb://localhost:27017/")
    mydb = myclient["mydatabase"]
    mycol = mydb["global"]
    mycol.drop()
    r = ReadFile(corpus)
    p = Parse(stemming)
    counter = 0  # document counter incremented below; must be initialized before the loops
    for documents_list in r:
        step = 1 / len(documents_list)
        for document in documents_list:
            parsed_list = [t.text.lower() for t in p.parse_doc(document) if '$' not in t.text]

            for word_1 in parsed_list:
                query = {'term': word_1}
                row = mycol.find_one(query)
                if not row:
                    mycol.insert_one({**query, 'terms': {}})
                    row = mycol.find_one(query)
                for word_2 in parsed_list:
                    if word_2 not in row['terms'].keys():
                        row['terms'][word_2] = 0
                    row['terms'][word_2] += 1
                try:
                    mycol.update_one(query, {"$set": {'terms': row['terms']}})
                except:
                    print(row['terms'])

            r.progressbar.update(step)
            counter += 1
    global_table = {}
    for word_1 in mycol.find():
        top = []
        for word_2 in word_1['terms'].keys():
            s = word_1['terms'][word_2] / (
                    word_1['terms'][word_1['term']] + mycol.find_one({'term': word_2})['terms'][word_1['term']] -
                    word_1['terms'][word_2])
            if len(top) < 10:
                top.append((word_2, s))
                top.sort(key=lambda score: score[1])
            elif s > top[0][1]:
                top[0] = (word_2, s)
                top.sort(key=lambda score: score[1])
        global_table[word_1['term']] = top
    utils.save_obj(global_table, f'global_table_{stemming}')
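
The score above appears to target a Jaccard-style association measure over co-occurrence counts, s(w1, w2) = c12 / (c11 + c22 - c12); a minimal in-memory sketch of that measure (sample counts are made up, no MongoDB involved):

def association_score(cooc, w1, w2):
    # Jaccard-style association: c12 / (c11 + c22 - c12).
    c12 = cooc[w1].get(w2, 0)
    c11 = cooc[w1].get(w1, 0)
    c22 = cooc[w2].get(w2, 0)
    denominator = c11 + c22 - c12
    return c12 / denominator if denominator else 0.0

cooc = {"covid": {"covid": 3, "virus": 2}, "virus": {"virus": 4, "covid": 2}}
print(association_score(cooc, "covid", "virus"))  # 0.4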
Code example #12
def run_engine(config):
    """

    :return:
    """

    number_of_documents = 0
    sum_of_doc_lengths = 0

    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(config.toStem)
    indexer = Indexer(config, glove_dict)  # glove_dict (pre-loaded word vectors) must be defined in the enclosing scope
    # documents_list = r.read_file(file_name=config.get__corpusPath())
    parquet_documents_list = r.read_folder(config.get__corpusPath())
    for parquet_file in parquet_documents_list:
        documents_list = r.read_file(file_name=parquet_file)
        # Iterate over every document in the file
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = p.parse_doc(document)
            if parsed_document is None:
                continue
            number_of_documents += 1
            sum_of_doc_lengths += parsed_document.doc_length
            # index the document data
            indexer.add_new_doc(parsed_document)

    # saves last posting file after indexer has done adding documents.
    indexer.save_postings()
    if len(indexer.doc_posting_dict) > 0:
        indexer.save_doc_posting()
    utils.save_dict(indexer.document_dict, "documents_dict", config.get_out_path())
    if len(indexer.document_posting_covid) > 0:
        indexer.save_doc_covid()

    indexer.delete_dict_after_saving()

    # merges posting files.
    indexer.merge_chunks()
    utils.save_dict(indexer.inverted_idx, "inverted_idx", config.get_out_path())

    details = {'number_of_documents': number_of_documents, "avg_length_per_doc": sum_of_doc_lengths / number_of_documents}

    utils.save_dict(details, 'details', config.get_out_path())
Code example #13
def run_engine(corpus_path=None,
               output_path=None,
               stemming=False,
               lemma=False,
               queries=None,
               num_docs_to_retrieve=None):
    """
    :return:
    """
    global config, number_of_documents

    number_of_documents = 0

    config = ConfigClass()
    config.corpusPath = corpus_path
    config.set_output_path(output_path)
    config.toStem = stemming
    config.toLemm = lemma
    if os.path.exists(config.get_output_path()):
        shutil.rmtree(config.get_output_path())

    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(config.toStem, config.toLemm)
    indexer = Indexer(config)

    documents_list = []
    for root, dirs, files in os.walk(corpus_path):
        r.set_corpus_path(root)
        for file in files:
            if file.endswith(".parquet"):
                documents_list += r.read_file(file)
    # Iterate over every document in the file
    for idx, document in enumerate(documents_list):
        # parse the document
        parsed_document = p.parse_doc(document)
        number_of_documents += 1
        # index the document data
        indexer.add_new_doc(parsed_document)
    # Finished parsing and indexing all files - need to clean all the used memory
    documents_list.clear()
    indexer.cleanup(number_of_documents)
Code example #14
def run_engine(config):
    """
    :return:
    """
    number_of_documents = 0

    if config.toStem:
        if not os.path.exists(config.savedFileMainFolder + "\\WithStem"):
            os.makedirs(config.savedFileMainFolder + "\\WithStem")
        out = config.savedFileMainFolder + "\\WithStem"
    else:
        if not os.path.exists(config.savedFileMainFolder + "\\WithoutStem"):
            os.makedirs(config.savedFileMainFolder + "\\WithoutStem")
        out = config.savedFileMainFolder + "\\WithoutStem"
    out += '\\'

    r = ReadFile(config.corpusPath)
    p = Parse(config.toStem)
    indexer = Indexer(config, out)

    end_of_corpus = False

    for documents_list in r:
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = p.parse_doc(document)
            number_of_documents += 1
            if r.queue.empty() and number_of_documents == len(documents_list) - 1:
                end_of_corpus = True
            # index the document data
            indexer.add_new_doc(parsed_document, end_of_corpus)
            if end_of_corpus:
                end_of_corpus = False
    for letter in indexer.ABC_dict:
        for idx in range(1, (indexer.counter_dict_files[letter]) + 1):
            indexer.merge_files(indexer.out, letter, letter + str(idx))
            os.remove(out + letter + str(idx) + ".pkl")
    p.remove_uppercase_and_entities(indexer)
    indexer.sort_tweet_ids()
    utils.save_obj(indexer.inverted_idx, "inverted_idx")
Code example #15
def run_engine(corpus_path, stemming, output_path):
    """
    :return:
    """
    r = ReadFile(corpus_path)
    p = Parse(stemming)
    m = BinaryMemoryPosting(os.path.join(output_path, PostingFile))
    indexer = Indexer()
    max_posting_size = 100000

    if os.path.exists(os.path.join(output_path, PostingFile)):
        os.remove(os.path.join(output_path, PostingFile))
    if os.path.exists(InvertedIndexFile + '.pkl'):
        os.remove(InvertedIndexFile + '.pkl')
    if not os.path.exists(output_path):
        os.mkdir(output_path)

    # Iterate over every document in the file
    idx = 0
    for documents_list in r:
        step = 1 / len(documents_list)
        for document in documents_list:
            parsed_list = p.parse_doc(document)

            # index the document data
            indexer.add_new_doc(parsed_list, idx, document[0])
            idx += 1

            if idx % max_posting_size == 0:
                m.Save(p.word_dict)
            r.progressbar.update(step)

    r.progressbar.close()
    m.Save(p.word_dict)

    global_table = utils.load_obj(f'global_table_{stemming}')

    inv_index = indexer.CreatInvertedIndex(p.word_dict, idx, global_table)
    m.Merge(inv_index)
    utils.save_obj(inv_index, InvertedIndexFile)
Code example #16
def run_engine(config):
    """
    :return:
    """
    parser = Parse(config)
    r = ReadFile(corpus_path=config.get__corpusPath())
    indexer = Indexer(config)
    number_of_files = 0

    for i, file in enumerate(r.read_corpus()):
        # Iterate over every document in the file
        number_of_files += 1
        for idx, document in enumerate(file):
            # parse the document
            parsed_document = parser.parse_doc(document)
            indexer.add_new_doc(parsed_document)
    indexer.check_last()
    indexer.merge_sort_parallel(3)
    indexer.calculate_idf(parser.number_of_documents)
    avg_doc_len = parser.total_len_docs / parser.number_of_documents
    utils.save_obj(avg_doc_len, config.get_savedFileMainFolder() + "\\data")

    utils.save_obj(indexer.inverted_idx, config.get_savedFileMainFolder() + "\\inverted_idx")
    utils.save_obj(indexer.docs_inverted, config.get_savedFileMainFolder() + "\\docs_inverted")
Code example #17
def run_engine():
    """

    :return:
    """
    number_of_documents = 0

    config = ConfigClass()
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse()
    indexer = Indexer(config)

    documents_list = r.read_file(file_name='sample3.parquet')
    # Iterate over every document in the file
    for idx, document in enumerate(documents_list):
        # parse the document
        parsed_document = p.parse_doc(document)
        number_of_documents += 1
        # index the document data
        indexer.add_new_doc(parsed_document)
    print('Finished parsing and indexing. Starting to export files')

    utils.save_obj(indexer.inverted_idx, "inverted_idx")
    utils.save_obj(indexer.postingDict, "posting")
Code example #18
File: search_engine_2.py Project: GalAgas/SEPartC
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        # self._parser = Parse()
        self._parser = Parse(self._config)
        self._indexer = Indexer(self._config)
        self._model = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)

        self.clean()
        self._indexer.calculate_idf(self._parser.number_of_documents)
        self._indexer.save_index("idx_bench.pkl")
        print('Finished parsing and indexing.')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        searcher.set_method_type('2')
        return searcher.search(query)

    def clean(self):
        p = 0.0008
        num_of_terms = round(p * len(self._indexer.inverted_idx_term))
        sorted_index = sorted(self._indexer.inverted_idx_term.items(),
                              key=lambda item: item[1][0],
                              reverse=True)

        for i in range(num_of_terms):
            del self._indexer.inverted_idx_term[sorted_index[i][0]]

        for term in list(self._indexer.inverted_idx_term.keys()):
            # TODO - make statistics
            if self._indexer.inverted_idx_term[term][0] <= 1:
                del self._indexer.inverted_idx_term[term]
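
A minimal usage sketch for this class; the configuration object, parquet file name and query string are illustrative assumptions:

engine = SearchEngine(config=ConfigClass())
engine.build_index_from_parquet("benchmark_data_train.snappy.parquet")
engine.load_index("idx_bench.pkl")
n_relevant, tweet_ids = engine.search("covid vaccine")
print(n_relevant, tweet_ids[:10])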
Code example #19
class SearchEngine:
    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None
        self._method = thesaurus_method()

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0
        doc_len = len(documents_list)
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document, doc_len)
        # print('Finished parsing and indexing.')

        # print('Finished merge, start rebuild posting dict')
        # self._indexer.rebuild_postingDict()
        self._indexer.rebuild_inverted_index()
        # print('finished rebuild inverted index')

        to_save = (self._indexer.inverted_idx, self._indexer.tweet_dict,
                   self._indexer.reversed_inverted_index)
        utils.save_obj(to_save, 'idx_bench')
        # TODO: inverted_idx, tweet_dict,reversed_inverted_index, to_save change to None
        self._indexer.inverted_idx = None
        self._indexer.tweet_dict = None
        self._indexer.reversed_inverted_index = None
        to_save = None
        # print('Finished rebuild inverted index and build reversed_inverted_index')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """

        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and 
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """ 
        Executes a query over an existing index and returns the number of 
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and 
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser,
                            self._indexer,
                            model=self._model,
                            method=self._method)
        return searcher.search(query)
Code example #20
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        r = ReadFile()
        df = r.read_file(fn)
        documents_list = df
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        print('Finished parsing and indexing.')

        # self._indexer.save_index('idx_bench.pkl')
        # self._indexer.save_index('inverted_idx.pkl')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        query_as_list = self._parser.parse_sentence(query)
        add_to_query = {}
        for q in query_as_list:
            for syn in wordnet.synsets(q):
                for lemma in syn.lemmas():
                    if lemma.name() == q.lower():
                        continue
                    score = wordnet.synsets(q)[0].wup_similarity(syn)
                    if score is not None and score > 0.8:
                        add_to_query[lemma.name()] = score

        if len(add_to_query) > 3:
            add_to_query = sorted(add_to_query.items(), key=lambda item: item[1], reverse=True)
            query_as_list.extend([add_to_query[0][0], add_to_query[1][0], add_to_query[2][0]])
        else:
            query_as_list.extend(add_to_query)

        new_query = ' '.join(query_as_list)
        relevant_docs = searcher.search(new_query)

        return relevant_docs

    @property
    def indexer(self):
        return self._indexer
Code example #21
class SearchEngine:
    GLOVE_PATH_SERVER = '../../../../glove.twitter.27B.25d.txt'
    GLOVE_PATH_LOCAL = '.\model/model.txt'

    def __init__(self, config=None):
        self._config = config
        self._parser = Parse(False)
        self.reader = ReadFile(corpus_path=config.get__corpusPath())
        self._indexer = Indexer(config)
        self.model = self.initialize_glove_dict()
        self._indexer.set_glove_dict(self.model)

    def initialize_glove_dict(self):
        glove_dict = {}
        with open(self.GLOVE_PATH_LOCAL, 'r', encoding='utf-8') as f:
            for line in tqdm(f):
                values = line.split()
                word = values[0]
                vector = np.asarray(values[1:], "float32")
                glove_dict[word] = vector
        return glove_dict

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in tqdm(enumerate(documents_list)):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            if parsed_document is None:
                continue
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)

        tuple_to_save = self._indexer.fix_inverted_index()
        utils.save_pickle_tuple(tuple_to_save, 'idx_engine1',
                                self._config.get_out_path())

        print('Finished parsing and indexing.')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_path):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    def load_index(self, fn):
        return self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        self._indexer.inverted_idx, self._indexer.document_dict = self.load_index(
            'idx_engine1.pkl')
        searcher = Searcher(self._parser, self._indexer, model=self.model)
        # TODO check about K
        query_as_list = self._parser.parse_sentence(query)
        l_res = searcher.search(query_as_list[0])
        t_ids = [tup[1] for tup in l_res]
        return len(l_res), t_ids
Code example #22
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0

        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        print('Finished parsing and indexing.')

        # self._indexer.save_index("idx_bench.pkl")
        #
        # indexer_dic = utils.load_obj("idx_bench")

        self._indexer.save_index("idx.pkl")  # TODO - we need submit this

        indexer_dic = utils.load_obj("idx")  # TODO - we need submit this

        localMethod = False
        globalMethod = False
        wordNet = True
        spellChecker = False

        if localMethod:
            indexer_dic["local"] = True

        if wordNet:
            indexer_dic["wordnet"] = True

        if spellChecker:
            indexer_dic["spellChecker"] = True

        if globalMethod:
            docs_dic, Sij_dic = compute_Wi(indexer_dic, globalMethod)
            indexer_dic["docs"] = docs_dic
            indexer_dic["global"] = Sij_dic
        else:
            docs_dic = compute_Wi(indexer_dic)
            indexer_dic["docs"] = docs_dic

        # utils.save_obj(indexer_dic, "idx_bench")
        utils.save_obj(indexer_dic, "idx")  # TODO - we need submit this

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """

        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query)
Code example #23
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None
        self.map_list = []
        self.prec5_list = []
        self.prec10_list = []
        self.prec50_list = []
        self.prec_total_list = []
        self.recall_list = []

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        print("\nNow Starting search engine 3")

        # total_time = datetime.now()
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        # print("len of inverted: ", len(self._indexer.inverted_idx))
        # print("len of posting: ", len(self._indexer.postingDict))
        # print("len of dataSet: ", len(self._indexer.benchDataSet))
        # end_time = datetime.now()
        # print('\n ------ Time To Retrieve: {}'.format(end_time - total_time), " ------\n")
        #
        # print('Finished parsing and indexing.')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query)

    def run_engine_two(self, fn):

        self.build_index_from_parquet(fn)
        queries_path = "data\\queries_train.tsv"

        all_queries = SearchEngine.query_reader(
            queries_path)["information_need"]

        for i, q in enumerate(all_queries):
            print(q)
            k, docs = self.search(q)
            # print(docs[:10])
            self.check_engine_quality(i + 1, docs[:300])
            print()

        print("Avg map is :", (sum(self.map_list) / len(self.map_list)))

    @staticmethod
    def query_reader(queries_path):

        data = pd.read_csv(queries_path, sep="\t")
        return data

    def get_parser(self):
        return self._parser

    def check_engine_quality(self, query_num, list_of_docs):
        """
        :param query_num:
        :param list_of_docs:
        :return: no return. prints metrics of the query. precision, recall, map.
        """

        benchmark_path = "data\\benchmark_lbls_train.csv"
        df = pd.read_csv(benchmark_path)

        df_prec = df[df['query'] == query_num]
        df_prec = df_prec[df_prec['tweet'].isin(list_of_docs)]
        dict_for_data = df_prec.set_index('tweet')['y_true'].to_dict()

        rmv_lst = []

        ranking = []
        # Add to list for rank
        for doc in list_of_docs:
            try:
                ranking.append(dict_for_data[int(doc)])
            except:
                rmv_lst.append(doc)
        for d in rmv_lst:
            list_of_docs.remove(d)

        data_df = pd.DataFrame({
            'query': query_num,
            'tweet': list_of_docs,
            'y_true': ranking
        })

        df_rec = df[df['query'] == query_num]
        recall_total = len(df_rec[df_rec['y_true'] == 1.0])

        # print("total Relevant doc found with tag 1 :" , len (data_df[data_df['y_true'] == 1.0]))
        # print("total NON relevant doc found with tag 0 :" , len (data_df[data_df['y_true'] == 0]))
        # print("found total of", len(df_prec), "tagged docs")
        # Calculate and print
        prec5 = metrics.precision_at_n(data_df, query_num, 5)
        prec10 = metrics.precision_at_n(data_df, query_num, 10)
        prec50 = metrics.precision_at_n(data_df, query_num, 50)
        prec_total = metrics.precision(data_df, True, query_number=query_num)
        map_of_query = metrics.map(data_df)
        recall_val = metrics.recall_single(data_df, recall_total, query_num)
        self.map_list.append(map_of_query)
        self.prec5_list.append(prec5)
        self.prec10_list.append(prec10)
        self.prec50_list.append(prec50)
        self.prec_total_list.append(prec_total)
        self.recall_list.append(recall_val)

        print()
        print("precision at 5 of query", query_num, "is :", prec5)
        print("precision at 10 of query", query_num, "is :", prec10)
        print("precision at 50 of query", query_num, "is :", prec50)
        print("precision of query", query_num, "is :", prec_total)
        print("recall of query", query_num, "is :", recall_val)
        print("map of query", query_num, "is :", map_of_query)
Code example #24
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config

        if self._config:
            if not hasattr(self._config, 'toStem'):
                self._config.toStem = False
            if not hasattr(self._config, 'toLemm'):
                self._config.toLemm = False

        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None
        self.corpus_size = 0

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        self._indexer.save_index(
            self._config.get_output_path())  # Save the inverted_index to disk
        self.corpus_size = self._indexer.get_docs_count()
        self.calculate_doc_weight()

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query)

    def calculate_doc_weight(self):
        """
       The method calculates the TF-IDF for each document
       :return:
       """
        inverted_index = self._indexer.inverted_idx
        docs_index = self._indexer.get_docs_index()

        for word in inverted_index:

            for doc_id in self._indexer.get_term_posting_list(word):
                normalized_term_tf = inverted_index[word]["posting_list"][
                    doc_id][0]
                term_df = inverted_index[word]['df']
                term_idf = math.log2(self.corpus_size / term_df)
                # calculate doc's total weight
                term_weight = normalized_term_tf * term_idf
                inverted_index[word]["posting_list"][doc_id].append(
                    term_weight)
                term_weight_squared = math.pow(term_weight, 2)
                docs_index[doc_id][0] += term_weight_squared
                docs_index[doc_id][0] = round(docs_index[doc_id][0], 3)
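
For reference, the weight computed above is the standard tf-idf form, term_weight = normalized_tf * log2(N / df), and the squared weights are accumulated per document, presumably for later cosine length normalization; a tiny standalone check of the arithmetic with made-up numbers:

import math

corpus_size = 1000        # N: number of documents in the corpus
normalized_term_tf = 0.2  # term frequency normalized by document length
term_df = 50              # number of documents containing the term

term_idf = math.log2(corpus_size / term_df)        # log2(20) ~= 4.322
term_weight = normalized_term_tf * term_idf        # ~= 0.864
squared_contribution = round(term_weight ** 2, 3)  # ~= 0.747, added to the document's weight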
Code example #25
File: indexer.py Project: GuyArieli17/FINISH_PART-1
                    (document.tweet_id, document_dictionary[term]))
                number_arr[0] += 1
            except:
                print('INVERTED: problem with the following key {}'.format(
                    term[0]))
        max_freq = max(document_dictionary.values())
        self.tmp_pos_doc[document.tweet_id] = document_dictionary
        self.num_in_pos_doc[0] += 1
        if self.num_in_pos_doc[0] >= self.avg_length:
            if 'doc' not in self.set_is_writting.keys():
                self.map_reduce_doc.write_dict(self.tmp_pos_doc)
                self.set_is_writting['doc'] = 1
            else:
                self.map_reduce_doc.wait_untill_finish()
                del self.set_is_writting['doc']
                self.num_in_pos_doc[0] = 0


if __name__ == '__main__':
    p = Parse(True)
    parsed_document = p.parse_doc([
        '1280914835979501568', 'Wed Jul 08 17:21:09 +0000 2020',
        '70% @loganxtalor: Y’all Towson took away my housing cause of COVID and I literally didn’t know where I was gonna go. I was in such a bind. I…',
        '{}', '[]',
        'Y’all Towson took away my housing cause of COVID and I literally didn’t know where I was gonna go. I was in such a… https://t.co/i8IdrIKp2B',
        '{"https://t.co/i8IdrIKp2B":"https://twitter.com/i/web/status/1280659984628490246"}',
        '[[116,139]]', None, None, None, None, None, None
    ])
    i = Indexer()
    i.add_new_doc(parsed_document)
Code example #26
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = GlobalMethod()

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """

        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            if parsed_document is None:
                continue
            self._indexer.add_new_doc(parsed_document)
        if len(self._indexer.inverted_idx) > 100000:
            self._indexer.sort_100K_inverted_index()
        self._indexer.add_idf_to_dictionary()
        print('Finished parsing and indexing.')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        if ".pkl" in fn:
            fn = fn[:-4]

        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        # self._model = KeyedVectors.load_word2vec_format('glove.twitter.27B.25d.txt.word2vec', binary=False)
        pass
    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query)


# def main():
#     config = ConfigClass()
#     se = SearchEngine(config=config)
#     r = ReadFile(corpus_path=config.get__corpusPath())
#     # parquet_file_path =r.get_all_path_of_parquet()[0][0]+r.get_all_path_of_parquet()[0][1]
#     # se.build_index_from_parquet(parquet_file_path)
#     se.load_index('idx_bench')
#     query = "trump want to change the world"
#     num,list = se.search(query)
#     # for key in dictionary.keys():
#     #     print('tweet id: {}, score (unique common words with query): {}'.format(key[0], dictionary[key]))
Code example #27
class SearchEngine:
    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    __slots__ = ['_config', '_indexer', '_parser', '_model', 'searcher', '_run_config']

    def __init__(self, config=None, run_config=None):
        if not config:
            config = ConfigClass()
        if not run_config:
            run_config = RunConfigClass()
        self._run_config = run_config
        self._config = config
        self._parser = Parse(run_config)
        self._indexer = Indexer(run_config)
        self._model = None
        self.searcher = Searcher(self._parser, self._indexer, run_config, model=self._model)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        # Iterate over every document in the file
        for document in df.values:
            # parse the document
            parsed_list = self._parser.parse_doc(document)
            self._indexer.add_new_doc(parsed_list)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        # str.strip removes characters, not a suffix, so trim the extension explicitly
        self._indexer.load_index(fn[:-4] if fn.endswith('.pkl') else fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        return self.searcher.search(query, None, {1})
コード例 #28
0
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            if parsed_document == {}:  # the parser returns an empty dict for retweets; skip them
                continue
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)
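        # Prune the index before use: drop inverted-index entries whose stored value
        # equals 1 and posting lists that hold a single entry (presumably terms that
        # occur in only one document and contribute little ranking signal).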
        self._indexer.inverted_idx = {
            key: val
            for key, val in self._indexer.inverted_idx.items() if val != 1
        }
        self._indexer.postingDict = {
            key: val
            for key, val in self._indexer.postingDict.items() if len(val) != 1
        }
        print('Finished parsing and indexing.')
        # self._indexer.save_index('idx_bench')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and 
        assign to self._model, which is passed on to the searcher at query time.
        """
        filename = self._config.glove_twitter_27B_25d_path
        word2vec_output_file = 'glove.twitter.27B.25d.txt.word2vec'
        glove2word2vec(filename, word2vec_output_file)
        filename = word2vec_output_file
        self._model = gensim.models.KeyedVectors.load_word2vec_format(
            filename, binary=False)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """ 
        Executes a query over an existing index and returns the number of 
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and 
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query)
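
# A minimal usage sketch for the class above, kept commented out like the example main
# earlier in this document. It assumes a no-argument ConfigClass exposing
# glove_twitter_27B_25d_path and a parquet file at the hypothetical path below.
# if __name__ == '__main__':
#     config = ConfigClass()
#     engine = SearchEngine(config=config)
#     engine.build_index_from_parquet('data/benchmark_data_train.snappy.parquet')
#     engine.load_precomputed_model()
#     n_relevant, tweet_ids = engine.search('trump want to change the world')
#     print(n_relevant, tweet_ids[:10])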
コード例 #29
0
def run_engine():
    """

    :return:
    """
    number_of_documents = 0
    corpus_path = config.get__corpusPath()
    r = ReadFile(corpus_path)
    indexer = Indexer(config)
    p = Parse(config)

    #reading per folder
    r.create_files_name_list()
    files_list = []  # every index contains all tweets per folder
    for file_name in r.dates_list:
        tweets_per_date = r.read_file(file_name)
        files_list.append(tweets_per_date)
    #print("files_list", len(files_list))

    num_of_tweets = 0
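    # Total number of tweets across all folders; passed to add_new_doc during indexing.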
    for folder_list in files_list:
        num_of_tweets += len(folder_list)
    #print("num_of_tweets", num_of_tweets)
    """#reading per folder
    r.create_files_name_list()
    threads = []
    for file_name in r.dates_list:
        t = threading.Thread(target=r.read_file, args=(file_name,))
        threads.append(t)
        t.start()
    print("files_list", r.files_list)"""
    """counter = 1
    procs = []
    # Iterate over every folder in the DATA
    for folder_list in files_list:
        proc = Process(target=test, args=(folder_list, counter, indexer, number_of_documents,))
        procs.append(proc)
        proc.start()
    # complete the processes
    for proc in procs:
        proc.join()
    print('Finished parsing and indexing. Starting to export files')"""

    counter = 1
    # Iterate over every folder in the DATA
    for folder_list in files_list:
        #print(counter)
        #print(datetime.now())
        # Iterate over every tweet in the folder
        for idx, tweet in enumerate(folder_list):
            # parse the tweet
            parsed_document = p.parse_doc(tweet)
            number_of_documents += 1
            # index the tweet data
            indexer.add_new_doc(parsed_document, num_of_tweets)

        #print("number of tweets", number_of_documents)
        #print(datetime.now())
        counter += 1
    #print('Finished parsing and indexing. Starting to export files')
    """#read only one folder
    documents_list = r.read_file(file_name='')
    num_indexed = len(documents_list)

    # Iterate over every document in the file
    for idx, document in enumerate(documents_list):
        # parse the document
        parsed_document = p.parse_doc(document)
        number_of_documents += 1
        # index the document data
        indexer.add_new_doc(parsed_document, num_indexed)
    #print('Finished parsing and indexing. Starting to export files')"""

    utils.save_obj(indexer.inverted_idx, "inverted_idx")
    utils.save_obj(indexer.tf_idf_dict, "tf_idf_dict")
    return indexer.get__lda__()
コード例 #30
0
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse(False)
        self._indexer = Indexer(config)
        self._model = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        # Flags selecting which retrieval-enhancement methods are used downstream;
        # only WordNet expansion is enabled in this example.
        dict_of_methods = {
            'wordnet': True,
            'spell_correction': False,
            'thesaurus': False,
            'word2vec': False,
            'parser': False,
        }
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        print('Finished parsing and indexing.')
        self._indexer.slice_uncommon_terms()
        self._indexer.calculate_wij_idf()
        self._indexer.set_dict_methods(dict_of_methods)
        # self._indexer.save_index(self._config.get_output_path() + 'inverted_idx')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass
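        # Left unimplemented in this example. A model assigned to self._model here
        # (e.g. gensim KeyedVectors, as loaded in code example #28) would be picked up
        # automatically, because search() below builds a fresh Searcher with
        # self._model on every call.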

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query)