Code Example #1
def main():
    ''' The main loop for the program '''
    config = ConfigClass()
    se = search_engine_best.SearchEngine(config=config)
    r = ReadFile(corpus_path=config.get__corpusPath())
    # parquet_file_path =r.get_all_path_of_parquet()[0][0]+r.get_all_path_of_parquet()[0][1]
    # se.build_index_from_parquet(parquet_file_path)
    se.load_index('idx_bench')
    g = GUI()

    # s.load_existing_index()  # load if exists, otherwise return empty list

    while True:
        event, values = g.window.read()

        if event is None:
            break

        if event == '_SEARCH_':
            g.clear()
            query = values['TERM']
            start = datetime.now()
            relevant, tweets_id = se.search(query)
            end = datetime.now()
            total_time = (end - start).total_seconds()
            # print the results to output element
            for index, tweet_id in enumerate(tweets_id[:25], start=1):
                print("%s. tweet id: %s" % (index, tweet_id))

            print()
            print("About %s tweets (%s seconds)" % (relevant, total_time))
Code Example #2
def run_engine(corpus_path, output_path, stemming, queries, num_docs_to_retrieve):
    """
    :return:
    """
    number_of_documents = 0

    config = ConfigClass(corpus_path, output_path, stemming)
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(stemming)
    indexer = Indexer(config, p.terms_dic_to_document)
    overall_start = time.time()
    # Iterate over every document in each file
    for i in r.filesPath:
        documents_list = r.read_file(i)
        start_time = time.time()
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = p.parse_doc(document)
            # update the number of doc in system
            number_of_documents += 1
            # index the document data
            indexer.add_new_doc(parsed_document)
        # print(time.time() - start_time)
    print('--------------------------')
    print('Start writing leftovers to disk')
    indexer.save_all_left_overs()
    print('Finish without waiting ' + str(time.time() - overall_start))
    print('Start waiting')
    indexer.wait_untill_all_finish()
    print('End Waiting')
    print('Finished writing leftovers to disk')
    print('--------------------------')
    print('Finished parsing and indexing. Starting to export files')
    print('Finish all Time ' + str(time.time() - overall_start))
    utils.save_obj(indexer.inverted_idx, "inverted_idx")
Code Example #3
    def test_add_new_doc(self):
        config = ConfigClass()
        r = ReadFile(corpus_path=config.get__corpusPath())
        p = Parse()
        indexer = Indexer(config)
        documents_list = r.read_file(file_name='sample3.parquet')
        # text1 = '@ampalombo I was going to my grandsons baseball games and the dumb F****s made a mask mandatory, are you kidding me'
        # add_new_doc expects a parsed document, so parse one from the sample first
        parsed_document = p.parse_doc(documents_list[0])
        assert indexer.add_new_doc(parsed_document)

        text = 'i wad born in 2019'
Code Example #4
File: part2.py Project: adiashk/Search_Engine
def write_content_for_tweet_id():
    corpus_path = "C:\\Users\\ASUS\\Desktop\\Data"
    config = ConfigClass(corpus_path)
    r = ReadFile(corpus_path=config.get__corpusPath())
    names = r.get_files_names_in_dir()

    with open("text.csv", "w", newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        for name in names:
            documents_list = r.read_file_by_name(file_name=str(name))
            for doc in documents_list:
                # tweet_ids is assumed to be defined at module level
                # (the set of tweet ids whose text should be exported)
                if doc[0] in tweet_ids:
                    writer.writerow([doc[0], doc[2]])
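The snippet filters on a module-level tweet_ids collection it never builds. A hypothetical way it might be populated (the input file name "tweet_ids.csv" is illustrative only):

import csv

# Hypothetical: build the module-level tweet_ids set that
# write_content_for_tweet_id() filters on.
tweet_ids = set()
with open("tweet_ids.csv", newline='', encoding='utf-8') as f:
    for row in csv.reader(f):
        tweet_ids.add(row[0])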
Code Example #5
def run_engine(corpus_path='', output_path='.', stemming=False):
    """
    Entry point for corpus parsing and indexing
    :param corpus_path:
    :param output_path:
    :param stemming: boolean indicating whether stemming should be applied
    :return: total number of tweets parsed
    """

    config = ConfigClass(corpus_path, stemming, output_path)
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(stemming)

    tweets_parsed = parse_wrapper(r, p, config)
    return tweets_parsed
Code Example #6
def run_engine(corpus_path, output_path, stemming, queries,
               num_docs_to_retrieve, word2vec):
    """

    :return:
    """
    # print("start: ", time.asctime(time.localtime(time.time())))
    number_of_documents = 0
    num_of_writes = 1
    config = ConfigClass(corpus_path)
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(stemming)
    indexer = Indexer(config, word2vec)
    # documents_list = r.read_file(file_name='covid19_07-30.snappy.parquet')  # TODO - handle all files ~50 (can do with from multiprocessing.pool import ThreadPool)

    # Iterate over every document in the file
    counter = 0
    names = r.get_files_names_in_dir()
    for name in names:
        documents_list = r.read_file_by_name(file_name=str(name))
        for idx, document in enumerate(documents_list):
            parsed_document = p.parse_doc(document)  # parse the document
            if parsed_document == {}:  # parser returns {} for retweets; skip them
                continue
            number_of_documents += 1

            indexer.add_new_doc(parsed_document,
                                num_of_writes)  # index the document data
            counter += 1
            if counter >= 500000:
                write_and_clean_buffer(indexer, num_of_writes, stemming,
                                       config, output_path)
                counter = 0
                # print("finish parser & index number: ", num_of_writes, " At: ", time.asctime(time.localtime(time.time())))
                num_of_writes += 1
        # print('Finished parsing and indexing. Starting to export files')
    write_and_clean_buffer(indexer, num_of_writes, stemming, config,
                           output_path)
    # print("finish parser & index: ", time.asctime(time.localtime(time.time())))
    indexer.inverted_idx = {
        key: val
        for key, val in indexer.inverted_idx.items() if val != 1
    }
    utils.save_obj(indexer.inverted_idx, "inverted_idx")
    # print("finish save index: ", time.asctime(time.localtime(time.time())))

    return num_of_writes
Code Example #7
def run_engine(corpus_path_, output_path_, stemming_):
    """

    :return:
    """

    number_of_documents = 0
    config = ConfigClass(corpuspath=corpus_path_, outputpath=output_path_, stemming=stemming_)
    config.corpusPath = corpus_path_
    config.savedFileMainFolder = output_path_
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse()
    indexer = Indexer(config)

    pathes = r.get_all_path_of_parquet()
    length_of_array = len(pathes)
    iteration = 0
    is_stemmer = config.toStem
    parsed_doc_list = list()
    for i in range(0, length_of_array):
        # each entry in pathes is a (directory, file name) pair
        documents_list = r.get_documents(pathes[i][0], pathes[i][1])
        for j, doc in enumerate(documents_list):
            parsed_document = p.parse_doc(doc, stemmer=is_stemmer)
            if parsed_document is None:
                continue
            parsed_doc_list.append(parsed_document)
            number_of_documents += 1
            if number_of_documents % 200000 == 0:
                for doc in parsed_doc_list:
                    indexer.add_new_doc(doc)
                indexer.write_posting_to_txt_file_lower_upper(iteration)
                iteration += 1
                parsed_doc_list.clear()
            elif j == len(documents_list) - 1 and i == length_of_array - 1:
                for doc in parsed_doc_list:
                    indexer.add_new_doc(doc)
                indexer.write_posting_to_txt_file_lower_upper(iteration)
                parsed_doc_list.clear()
                indexer.merge_posting_file()
                indexer.merge_two_last_posting_file()
                indexer.split_posting_file_and_create_inverted_index()
                indexer.write_inverted_index_to_txt_file()
                number_of_documents = 0
Code Example #8
File: search_engine.py Project: guy94/SearchEngine
def run_engine(corpus_path, output_path, stemming, queries,
               num_docs_to_retrieve):
    """
    :return:
    """
    config = ConfigClass(corpus_path, output_path, stemming)
    number_of_documents = 0
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse()
    indexer = Indexer(config)
    Parse.stemmer = stemming

    corpus_list = r.read_corpus()

    for idx in range(len(corpus_list)):
        documents_list = r.read_file(file_name=corpus_list[idx],
                                     read_corpus=True)
        for i in tqdm(range(len(documents_list))):
            parsed_document = p.parse_doc(documents_list[i])
            if i == len(documents_list) - 1 and idx == len(corpus_list) - 1:
                indexer.is_last_doc = True
            indexer.add_new_doc(parsed_document)
            number_of_documents += 1
        indexer.is_last_doc = False
    documents_list = []

    with open('spell_dict.json', 'w') as f:
        json.dump(indexer.spell_dict, f)

    pickle_out = open("docs_dict_and_extras", "wb")
    pickle.dump(indexer.docs_dict, pickle_out)
    pickle_out.close()

    start = time.time()
    indexer.merge_files()
    end = time.time()
    print("merge time was: {}".format(end - start))

    utils.save_obj(indexer.inverted_idx, "inverted_index")
    pickle_out = open("docs_dict_and_extras", "ab")
    pickle.dump(number_of_documents, pickle_out)
    pickle.dump(Parse.AMOUNT_OF_NUMBERS_IN_CORPUS, pickle_out)
    pickle.dump(indexer.dump_path, pickle_out)
    pickle_out.close()
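Because this example first writes "docs_dict_and_extras" in "wb" mode and later appends to it in "ab" mode, reading it back takes one pickle.load call per dumped object, in the same order they were written:

import pickle

# Read back everything Example #8 pickled into "docs_dict_and_extras",
# in the order it was dumped.
with open("docs_dict_and_extras", "rb") as f:
    docs_dict = pickle.load(f)            # written first ("wb")
    number_of_documents = pickle.load(f)  # appended later ("ab")
    amount_of_numbers = pickle.load(f)    # Parse.AMOUNT_OF_NUMBERS_IN_CORPUS
    dump_path = pickle.load(f)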
Code Example #9
def run_engine(corpus_path=None,
               output_path=None,
               stemming=False,
               lemma=False,
               queries=None,
               num_docs_to_retrieve=None):
    """
    :return:
    """
    global config, number_of_documents

    number_of_documents = 0

    config = ConfigClass()
    config.corpusPath = corpus_path
    config.set_output_path(output_path)
    config.toStem = stemming
    config.toLemm = lemma
    if os.path.exists(config.get_output_path()):
        shutil.rmtree(config.get_output_path())

    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(config.toStem, config.toLemm)
    indexer = Indexer(config)

    documents_list = []
    for root, dirs, files in os.walk(corpus_path):
        r.set_corpus_path(root)
        for file in files:
            if file.endswith(".parquet"):
                documents_list += r.read_file(file)
    # Iterate over every document in the file
    for idx, document in enumerate(documents_list):
        # parse the document
        parsed_document = p.parse_doc(document)
        number_of_documents += 1
        # index the document data
        indexer.add_new_doc(parsed_document)
    # Finished parsing and indexing all files - free the used memory
    documents_list.clear()
    indexer.cleanup(number_of_documents)
Code Example #10
def run_engine():
    """

    :return:
    """
    number_of_documents = 0
    timer = True
    config = ConfigClass()
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse()  #p = Parse(with_stemmer=True)
    indexer = Indexer(config)

    data_dir = 'Data' + os.sep + 'Data'
    npy_dirs = [root for root, dirs, files in os.walk(data_dir)]
    for dir_path in npy_dirs:
        files = [
            os.path.join(dir_path, fname) for fname in os.listdir(dir_path)
            if fname.endswith('.parquet')
        ]
        for file in files:
            tweets = r.read_file(file_name=file)
            start_time = time.perf_counter()
            with multiprocessing.Pool(12) as pool:
                documents_list = pool.map(p.parse_doc, tweets)
            end_time = time.perf_counter()
            avg_time_per_tweet = (end_time - start_time) / len(tweets)
            print(
                f'Parsed {len(tweets)} tweets, Elapsed time: {end_time - start_time:0.4f} seconds, average per tweet: {avg_time_per_tweet:0.8f} seconds'
            )

            start_time = time.perf_counter()
            for parsed_document in documents_list:
                indexer.add_new_doc(parsed_document)
            end_time = time.perf_counter()
            print(
                f'Indexing {len(documents_list)} tweets, Elapsed time: {end_time - start_time:0.4f} seconds'
            )
    print('Finished parsing and indexing. Starting to export files')
    utils.save_obj(indexer.inverted_idx, "inverted_idx")
    utils.save_obj(indexer.postingDict, "posting")
Code Example #11
def run_engine():
    """

    :return:
    """
    number_of_documents = 0

    config = ConfigClass()
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse()
    indexer = Indexer(config)

    documents_list = r.read_file(file_name='sample3.parquet')
    # Iterate over every document in the file
    for idx, document in enumerate(documents_list):
        # parse the document
        parsed_document = p.parse_doc(document)
        number_of_documents += 1
        # index the document data
        indexer.add_new_doc(parsed_document)
    print('Finished parsing and indexing. Starting to export files')

    utils.save_obj(indexer.inverted_idx, "inverted_idx")
    utils.save_obj(indexer.postingDict, "posting")
Code Example #12
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        if not config:
            self._config = ConfigClass()
        else:
            self._config = config
        self._parser = Parse()
        self._indexer = Indexer(self._config)
        self._model = None
        self._reader = ReadFile(self._config.get__corpusPath())

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file

        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)

            # index the document data
            self._indexer.add_new_doc(parsed_document)

        self._indexer.check_pending_list()
        self._indexer.calculate_and_add_idf()
        self._indexer.calculate_sigma_Wij()
        self._indexer.calculate_avg_doc_len()

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    def search(self, query, k=None):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        query_as_tuple = self._parser.parse_sentence(query)
        query_as_list = query_as_tuple[0] + query_as_tuple[1]
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query_as_list, k)
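A minimal driver for this class, in the spirit of Example #1's GUI loop; the parquet file name and query string below are placeholders:

# Hypothetical driver for Example #12's SearchEngine. The parquet file and
# query are placeholders; ConfigClass comes from the same project.
config = ConfigClass()
engine = SearchEngine(config=config)
engine.build_index_from_parquet('sample3.parquet')  # or engine.load_index('idx_bench')

n_relevant, tweet_ids = engine.search('some query terms')
print("About %s tweets" % n_relevant)
for rank, tweet_id in enumerate(tweet_ids[:10], start=1):
    print("%s. tweet id: %s" % (rank, tweet_id))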
Code Example #13
def run_engine(stemming='n'):
    """
    :return:
    """
    number_of_documents = 0

    config = ConfigClass()
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse()
    indexer = Indexer(config)
    indexer.stemming = stemming
    entries = os.listdir('Data')
    start = time.time()
    print(start)
    playsound('Ding.mp3')
    number_of_documents = 0
    i = 1
    # for entire in entries:
    #     documents_list = r.read_file('Data/'+entire)
    #     documents_list_length = len(documents_list)
    # # Iterate over every document in the file
    #     for idx, document in enumerate(documents_list):
    #         parsed_document = p.parse_doc(document)
    #         indexer.add_new_doc(parsed_document, documents_list_length)
    #     # break

    # documents_list = r.read_file(file_name='sample3.parquet')
    # doc_len = len(documents_list)
    # # Iterate over every document in the file
    # for idx, document in enumerate(documents_list):
    #     # parse the document
    #     parsed_document = p.parse_doc(document)
    #     number_of_documents += 1
    #     # index the document data
    #     indexer.add_new_doc(parsed_document, doc_len)

    # utils.save_obj(indexer.postingDict, "posting")
    # indexer.postingDict = None
    # utils.save_obj(indexer.tweet_dict, "tweet_dict")
    # indexer.tweet_dict = None

    # documents = os.listdir('posting_files')
    # for doc in documents:
    #     indexer.read_and_add_to_temp_dict('posting_files/' + doc)
    # # #
    # playsound('Ding.mp3')
    # middle = time.time()
    # print('middle: ',(middle-start)/60," minutes")
    # print('Finished parsing and indexing. Starting to export files')
    #
    # utils.save_obj(indexer.inverted_idx, "inverted_idx")
    # utils.save_obj(indexer.reversed_inverted_index, "reversed_inverted_index")
    # indexer.inverted_idx = None
    # indexer.reversed_inverted_index = None

    r = Ranker()  # note: rebinds r, which previously held the ReadFile instance
    # for doc in documents:
    playsound('Ding.mp3')
    r.create_global_method()

    end = time.time()
    print(end)
    print('end: ', (end - start) / 60, " minutes")