Example #1
def main():
    ''' The main loop for the program '''
    config = ConfigClass()
    se = search_engine_best.SearchEngine(config=config)
    r = ReadFile(corpus_path=config.get__corpusPath())
    # parquet_file_path =r.get_all_path_of_parquet()[0][0]+r.get_all_path_of_parquet()[0][1]
    # se.build_index_from_parquet(parquet_file_path)
    se.load_index('idx_bench')
    g = GUI()

    # s.load_existing_index()  # load if exists, otherwise return empty list

    while True:
        event, values = g.window.read()

        if event is None:
            break

        if event == '_SEARCH_':
            g.clear()
            query = values['TERM']
            start = datetime.now()
            relevant, tweets_id = se.search(query)
            end = datetime.now()
            total_time = (end - start).total_seconds()
            # print the results to output element
            # print only the first 25 results
            for index, tweet_id in enumerate(tweets_id[:25], start=1):
                print("%s. tweet id: %s" % (index, tweet_id))

            print()
            print("About %s tweets (%s seconds)" % (relevant, total_time))
Example #2
def run_engine(corpus_path="testData", output_path="posting", stemming=True, glove_dict=None):
    """
    This function build the inverted index over the corpus.
    send each tweet to parsing and indexing.
    if the stemming is True the parsing will use the stemmer on the tokens.
    :param glove_dict: Glove file including all word vectors
    :param corpus_path: root folder containing the raw tweet files
    :param output_path for the inverted index, posting files and tweets dictionary
    :param stemming if True use stemmer on terms
    """

    config = ConfigClass(corpus_path, number_of_term_buckets=26, number_of_entities_buckets=2, output_path=output_path)
    r = ReadFile(corpus_path=config.get_corpusPath())
    p = Parse(stemming)
    indexer = Indexer(config)
    all_files_paths = glob.glob(config.get_corpusPath() + "\\*\\*.snappy.parquet")
    all_files_names = [file_name[file_name.find("\\") + 1:] for file_name in all_files_paths]
    start_time = time.time()
    file_counter = 0
    for file_name in all_files_names:
        file_start_time = time.time()
        # print("start file :", file_counter)
        documents_list = [document for document in r.read_file(file_name=file_name)]
        # Iterate over every document in the file
        for idx, document in enumerate(documents_list):
            parsed_document = p.parse_doc(document)
            indexer.add_new_doc(parsed_document, glove_dict)
        # print("end file number ", file_counter, " in: ", time.time() - file_start_time)
        file_counter += 1
    total_time = time.time() - start_time
    indexer.finish_indexing()
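Example #2 expects glove_dict to be loaded already, but none of the snippets shows how it is built. A plausible loader for a plain-text GloVe file (such as the glove.twitter.27B.25d.txt referenced in Example #18); the helper name and file format are assumptions:

import numpy as np

def load_glove_dict(glove_path='glove.twitter.27B.25d.txt'):
    # each line is "word v1 v2 ... vN"; map the word to its vector
    glove_dict = {}
    with open(glove_path, 'r', encoding='utf-8') as glove_file:
        for line in glove_file:
            parts = line.rstrip().split(' ')
            glove_dict[parts[0]] = np.asarray(parts[1:], dtype='float32')
    return glove_dict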
Example #3
def run_engine(corpus_path, output_path, stemming, queries, num_docs_to_retrieve):
    """
    :return:
    """
    number_of_documents = 0

    config = ConfigClass(corpus_path, output_path, stemming)
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(stemming)
    indexer = Indexer(config, p.terms_dic_to_document)
    # Iterate over every document in the file
    # start the clock once so the timing prints below report the total run time
    start_time = time.time()
    for i in r.filesPath:
        documents_list = r.read_file(i)
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = p.parse_doc(document)
            # update the number of doc in system
            number_of_documents += 1
            # index the document data
            indexer.add_new_doc(parsed_document)
        # print(time.time() - start_time)
    print('--------------------------')
    print('Start writing to disk left overs')
    indexer.save_all_left_overs()
    print('Finish without waiting ' + str(time.time() - start_time))
    print('Start waiting')
    indexer.wait_untill_all_finish()
    print('End Waiting')
    print('Finished writing to disk left overs')
    print('--------------------------')
    print('Finished parsing and indexing. Starting to export files')
    print('Finish all Time ' + str(time.time() - start_time))
    utils.save_obj(indexer.inverted_idx, "inverted_idx")
Example #4
 def __init__(self, config=None):
     if not config:
         self._config = ConfigClass()
     else:
         self._config = config
     self._parser = Parse()
     self._indexer = Indexer(self._config)
     self._model = None
     self._reader = ReadFile(self._config.get__corpusPath())
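For context, the surrounding examples drive a SearchEngine built this way roughly as sketched below. The method names come from Examples #1, #8 and #11; the parquet path and query are placeholders, and the exact return shape of search() differs between the projects:

config = ConfigClass()
engine = SearchEngine(config=config)
engine.build_index_from_parquet('benchmark_data_train.snappy.parquet')  # or engine.load_index('idx_bench')
n_relevant, ranked_tweet_ids = engine.search('covid vaccine side effects')
for tweet_id in ranked_tweet_ids[:10]:
    print(tweet_id)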
Example #5
    def test_add_new_doc(self):
        config = ConfigClass()
        r = ReadFile(corpus_path=config.get__corpusPath())
        p = Parse()
        indexer = Indexer(config)
        documents_list = r.read_file(file_name='sample3.parquet')
        # text1 = '@ampalombo I was going to my grandsons baseball games and the dumb F****s made a mask mandatory, are you kidding me'
        # add_new_doc() needs a parsed document; index the first one and check that the
        # inverted index was populated (inverted_idx is the attribute name used in the other examples)
        parsed_document = p.parse_doc(documents_list[0])
        indexer.add_new_doc(parsed_document)
        assert len(indexer.inverted_idx) > 0

        text = 'i wad born in 2019'
Example #6
def write_content_for_tweet_id():
    corpus_path = "C:\\Users\\ASUS\\Desktop\\Data"
    config = ConfigClass(corpus_path)
    r = ReadFile(corpus_path=config.get__corpusPath())
    names = r.get_files_names_in_dir()

    with open("text.csv", "w", newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        for name in names:
            documents_list = r.read_file_by_name(file_name=str(name))
            for doc in documents_list:
                # tweet_ids is not defined in this snippet; it is assumed to be a
                # collection of target tweet ids defined elsewhere in the module
                if doc[0] in tweet_ids:
                    writer.writerow([doc[0], doc[2]])  # write (tweet id, tweet text)
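For completeness, a hypothetical definition of the missing tweet_ids collection (the file name and one-id-per-line format are assumptions, not shown in the snippet):

with open("tweet_ids.txt", encoding="utf-8") as ids_file:
    tweet_ids = {line.strip() for line in ids_file if line.strip()}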
Example #7
def run_engine(corpus_path='', output_path='.', stemming=False):
    """
    Entry point for corpus parsing and indexing
    :param corpus_path:
    :param output_path:
    :param stemming: boolean that says if stemming should be apllied
    :return: total number of tweets parsed
    """

    config = ConfigClass(corpus_path, stemming, output_path)
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(stemming)

    tweets_parsed = parse_wrapper(r, p, config)
    return tweets_parsed
Example #8
def main():
    config = ConfigClass()

    se = SearchEngine(config)
    se.build_index_from_parquet(
        r'C:\Users\Owner\Desktop\SearchEngine\Part C\data\benchmark_data_train.snappy.parquet'
    )
    n_res, res, docs = se.search('vaccines move freely')
    df = pd.read_parquet(
        r'C:\Users\Owner\Desktop\SearchEngine\Part C\data\benchmark_data_train.snappy.parquet',
        engine="pyarrow")

    # DataFrame.append was removed in pandas 2.0; collect the rows and build the frame once
    rows = []
    for tweet_id in res:
        rows.append({"query": 5, "tweet_id": tweet_id})
        print(tweet_id)
        print(df[df.tweet_id == tweet_id].full_text.tolist())
    to_return = pd.DataFrame(rows, columns=["query", "tweet_id"])

    to_return.to_csv("results6.csv", index=False)
    print(n_res)
Example #9
def run_engine(corpus_path, output_path, stemming=False):
    """

    :param corpus_path: path for parquet files
    :param output_path: path to write pickle files
    :param stemming: boolean to use stemming or not
    :return:
    """

    ConfigClass(corpus_path, output_path, stemming)
    r = ReadFile(corpus_path)
    p = Parse(stemming)
    indexer = Indexer(output_path, stemming)

    if corpus_path.endswith('parquet'):
        documents_list = r.read_file(corpus_path)
        parseAndIndexDocuments(documents_list, p, indexer)
    else:
        documents_list = r.read_dir()

        while documents_list:
            parseAndIndexDocuments(documents_list, p, indexer)
            documents_list = r.read_dir()

    documents_list.clear()
    indexer.merge_posting_files()

    lda = LDA(output_path, indexer.dictdoc, stemming)
    lda.build_ldaModel()
Example #10
def main(corpus_path, output_path, stemming, queries, num_docs_to_retrieve):
    config = ConfigClass(corpus_path, output_path, stemming)
    indexer = Indexer(config)
    run_engine(config, indexer)
    if isinstance(queries, list):
        queries_list = queries
    else:
        # queries_list = []
        # queries_file = open(queries, encoding='utf8')
        # lines = [l for l in queries_file.readlines() if l is not '\n']
        # for line in lines:
        #     queries_list.append(line[line.index('.') + 1: -1])
        queries_df = pd.read_table(queries)
        queries_list = list(queries_df['information_need'].values)

    lst_to_csv = [['Query_num', 'Tweet_id', 'Score']]
    for num, query in enumerate(queries_list):
        stime = time.time()
        print(f'query {num+1}')
        n_relevant, ranked_doc_ids = search_and_rank_query(
            query=query,
            parser=Parse(config),
            indexer=indexer,
            k=num_docs_to_retrieve)
        for tweet_id, score in ranked_doc_ids:
            print(f'Tweet id: {tweet_id}, Score: {score}')
            lst_to_csv.append([num + 1, tweet_id, score])
        print(f'time for query no. {num+1} is {time.time() - stime}')

    with open('data\\analysis_data.csv', 'w', newline='') as file:
        file.truncate()
        writer = csv.writer(file)
        writer.writerows(lst_to_csv)
Example #11
def main():
    # print("start: ", time.asctime(time.localtime(time.time())))
    config = ConfigClass()
    Engine = SearchEngine(config)
    # print("start: ", time.asctime(time.localtime(time.time())))
    corpus_path = "C:\\Users\\ASUS\\Desktop\\data_part_c\\data\\benchmark_data_train.snappy.parquet"
    # corpus_path = "C:\\Users\\ASUS\\Desktop\\Data\\Data\\date=07-19-2020\\covid19_07-19.snappy.parquet"
    # Engine.build_index_from_parquet( corpus_path)
    # Engine._indexer.save_index("inverted_idx")
    # print("finish: ", time.asctime(time.localtime(time.time())))

    Engine.load_index("inverted_idx")
    Engine.load_precomputed_model()
    queries = read_queries("full_queries2.txt")
    df = pd.read_parquet(corpus_path, engine="pyarrow")
    documents_list = df.values.tolist()
    i = 0
    for query in queries:
        n_relevant, ranked_doc_ids = Engine.search(query)
        for doc_tuple in ranked_doc_ids:
            for doc in documents_list:
                if doc[0] == doc_tuple[0]:
                    i += 1
                    print('tweet id: {}, similarity: {}'.format(
                        doc_tuple[0], doc_tuple[1]))
                    print(doc[0], ":", doc[2])
Example #12
def run_engine(corpus_path='', output_path='', stemming=False):
    """

    :return:
    """
    # Create PostingFile directory if it doesn't exist
    number_of_documents = 0
    config = ConfigClass()
    r = ReadFile(corpus_path=corpus_path)
    p = Parse(stemming)
    indexer = Indexer(config, output_path)
    # Get all parquet files from corpus path
    parquets = []
    for root, dirs, files in os.walk(corpus_path):
        for name in files:
            if name.endswith((".parquet", ".htm")):
                parquets.append((root, name))

    for root, name in parquets:
        r.corpus_path = root
        documents_list = r.read_file(file_name=name)
        # Parse the documents in a pool of CPUCOUNT worker processes
        # (CPUCOUNT is not defined in this snippet; presumably multiprocessing.cpu_count())
        with Pool(CPUCOUNT) as _p:
            for parsed_doc in _p.imap_unordered(p.parse_doc, documents_list):
                number_of_documents += 1
                indexer.add_new_doc(parsed_doc)
            _p.close()
            _p.join()

    p.entities.clear()
    indexer.finish_index()
    save_obj(indexer.term_dict, output_path + '/' + "inverted_idx")
    save_obj(indexer.document_dict, output_path + '/' + "doc_dictionary")
    indexer.document_dict.clear()
    indexer.term_dict.clear()
Example #13
def main(corpus_path, output_path, stemming, queries, num_docs_to_retrieve):

    config = ConfigClass(corpus_path)
    word2vec = Word2vec()
    num_of_writes = run_engine(corpus_path, output_path, stemming, queries,
                               num_docs_to_retrieve, word2vec)
    union_posting_files(num_of_writes, stemming, config, output_path)
    # print("finish union posting files: ", time.asctime(time.localtime(time.time())))
    if not isinstance(queries, list):
        queries = read_queries(queries)

    inverted_index = utils.load_inverted_index()
    # temp1 = dict(sorted(inverted_index.items(), key=lambda item: item[1].isdigit(), reverse=False))
    # temp2 = dict(sorted(inverted_index.items(), reverse=True))

    rank_query = search_and_rank_query(corpus_path, queries, inverted_index,
                                       num_docs_to_retrieve, stemming,
                                       word2vec, output_path)
    path = os.path.join(output_path, 'results.csv')
    # open the results file once instead of reopening it for every row
    with open(path, 'a', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["Query_num", "Tweet_id", "Rank"])
        for i in rank_query:
            for doc_tuple in rank_query[i]:
                print('tweet id: {}, similarity: {}'.format(
                    doc_tuple[0], doc_tuple[1]))
                writer.writerow([i + 1, doc_tuple[0], doc_tuple[1]])
Example #14
def run_engine(corpus_path, output_path, stemming, queries,
               num_docs_to_retrieve, word2vec):
    """

    :return:
    """
    # print("start: ", time.asctime(time.localtime(time.time())))
    number_of_documents = 0
    num_of_writes = 1
    config = ConfigClass(corpus_path)
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(stemming)
    indexer = Indexer(config, word2vec)
    # documents_list = r.read_file(file_name='covid19_07-30.snappy.parquet')  # TODO - handel all files ~50 (can do with from multiprocessing.pool import ThreadPool)

    # Iterate over every document in the file
    counter = 0
    names = r.get_files_names_in_dir()
    for name in names:
        documents_list = r.read_file_by_name(file_name=str(name))
        for idx, document in enumerate(documents_list):
            parsed_document = p.parse_doc(document)  # parse the document
            if parsed_document == {}:  # empty dict means the parser skipped the tweet (e.g. a retweet)
                continue
            number_of_documents += 1

            indexer.add_new_doc(parsed_document,
                                num_of_writes)  # index the document data
            counter += 1
            if counter >= 500000:
                write_and_clean_buffer(indexer, num_of_writes, stemming,
                                       config, output_path)
                counter = 0
                # print("finish parser & index number: ", num_of_writes, " At: ", time.asctime(time.localtime(time.time())))
                num_of_writes += 1
        # print('Finished parsing and indexing. Starting to export files')
    write_and_clean_buffer(indexer, num_of_writes, stemming, config,
                           output_path)
    # print("finish parser & index: ", time.asctime(time.localtime(time.time())))
    # prune entries whose value is 1 (presumably terms that appear only once in the corpus)
    indexer.inverted_idx = {
        key: val
        for key, val in indexer.inverted_idx.items() if val != 1
    }
    utils.save_obj(indexer.inverted_idx, "inverted_idx")
    # print("finish save index: ", time.asctime(time.localtime(time.time())))

    return num_of_writes
Example #15
    def __init__(self):

        #self.model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True,encoding='utf-8')
        self.model = gensim.models.KeyedVectors.load_word2vec_format(
            ConfigClass().google_news_vectors_negative300_path,
            binary=True,
            encoding='utf-8')
        self.terms_dict = {}
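None of the snippets shows how the loaded word vectors are applied to a query. A common approach, given here only as an illustration and not as this project's actual ranking code, is to average the vectors of the query terms:

import numpy as np

def embed_query(model, query_terms):
    # average the vectors of the terms the model knows; zero vector if none are known
    vectors = [model[term] for term in query_terms if term in model]
    if not vectors:
        return np.zeros(model.vector_size, dtype='float32')
    return np.mean(vectors, axis=0)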
Example #16
 def __init__(self, config=None):
     if not config:
         self._config = ConfigClass()
     else:
         self._config = config
     self._parser = Parse()
     self._indexer = Indexer(self._config)
     self._model = None
Example #17
def run_engine(corpus_path_, output_path_, stemming_):
    """

    :return:
    """

    number_of_documents = 0
    config = ConfigClass(corpuspath=corpus_path_, outputpath=output_path_, stemming=stemming_)
    config.corpusPath = corpus_path_
    config.savedFileMainFolder = output_path_
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse()
    indexer = Indexer(config)

    pathes = r.get_all_path_of_parquet()
    length_of_array = len(pathes)
    iteration = 0
    is_stemmer = config.toStem
    parsed_doc_list = list()
    for i in range(0, length_of_array):
        # pathes[i] holds (directory, file name); pass both rather than the directory twice
        documents_list = r.get_documents(pathes[i][0], pathes[i][1])
        for j, doc in enumerate(documents_list):
            parsed_document = p.parse_doc(doc, stemmer=is_stemmer)
            if parsed_document is None:
                continue
            parsed_doc_list.append(parsed_document)
            number_of_documents += 1
            # flush a batch to a posting file every 200,000 documents
            if number_of_documents % 200000 == 0:
                for parsed in parsed_doc_list:
                    indexer.add_new_doc(parsed)
                indexer.write_posting_to_txt_file_lower_upper(iteration)
                iteration += 1
                parsed_doc_list = list()
            # last document of the last file: flush the remainder and build the inverted index
            elif j == len(documents_list) - 1 and i == length_of_array - 1:
                for parsed in parsed_doc_list:
                    indexer.add_new_doc(parsed)
                indexer.write_posting_to_txt_file_lower_upper(iteration)
                parsed_doc_list = list()
                indexer.merge_posting_file()
                indexer.merge_two_last_posting_file()
                indexer.split_posting_file_and_create_inverted_index()
                indexer.write_inverted_index_to_txt_file()
                number_of_documents = 0
Example #18
 def __init__(self):
     self.config = ConfigClass()
     self.local_address = r"C:\Users\benro\OneDrive\Desktop\glove.twitter.27B.25d.txt"
     self.server_address = self.config.glove_twitter_27B_25d_path
     self.input_file = self.local_address
     self.output_file = 'glove.twitter.27B.25d.txt.word2vec'
     glove2word2vec(self.input_file, self.output_file)
     self.model = KeyedVectors.load_word2vec_format(self.output_file,
                                                    binary=False)
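A side note, not part of the original snippet: gensim 4.x can load the raw GloVe text file directly, which makes the glove2word2vec conversion step unnecessary:

# alternative for gensim >= 4.0: load the GloVe file as-is (it has no word2vec header line)
self.model = KeyedVectors.load_word2vec_format(self.input_file, binary=False, no_header=True)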
Example #19
 def __init__(self, parser, indexer, model=None):
     # self._model = model
     self.parser = parser
     self.ranker = Ranker(indexer.tweet_info)
     self.inverted_index = indexer.inverted_idx
     self.firstUnion = True
     self.posting_dir = ConfigClass.get_output()
     self.DocsToRetrieve = ConfigClass.numOfDocsToRetrieve
     self.scoreLowerBoundFactor = 0.5
Example #20
def main(corpus_path, output_path, stemming, queries, num_doc_to_retrieve):
    config = ConfigClass()
    config.corpusPath = corpus_path
    config.savedFileMainFolder = output_path
    config.toStem = stemming

    run_engine(config)
    inverted_index = load_index()
    tuple_answers = []
    query_num = 1
    # use a context manager so the queries file is always closed, and rstrip('\n')
    # so the last query is not truncated when the file lacks a trailing newline
    with open(queries, encoding="utf8") as queries_file:
        for query in queries_file:
            for doc_tuple in search_and_rank_query(query.rstrip('\n'), inverted_index, num_doc_to_retrieve, config):
                print('tweet id: {} Score: {}'.format(doc_tuple[0], doc_tuple[1]))
                tuple_answers.append(doc_tuple + (query_num,))
            query_num += 1
Example #21
def main(corpus_path='',
         output_path='',
         stemming=False,
         queries=None,
         num_docs_to_retrieve=10):
    ConfigClass.set_path(output_path)
    run_engine(corpus_path, output_path, stemming)
    docs = load_index(output_path)
    if not isinstance(queries, list):
        with open(queries, 'r', encoding="utf8") as queries_file:
            queries = queries_file.readlines()
    for idx, query in enumerate(queries):
        query = query.replace('\n', '')
        for doc_tuple in search_and_rank_query(query, docs,
                                               num_docs_to_retrieve, stemming,
                                               output_path):
            print('Tweet id: ' + str(doc_tuple[0]) + ' Score: ' +
                  str(doc_tuple[1]))
Example #22
def run_engine(corpus_path, output_path, stemming, queries,
               num_docs_to_retrieve):
    """
    :return:
    """
    config = ConfigClass(corpus_path, output_path, stemming)
    number_of_documents = 0
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse()
    indexer = Indexer(config)
    Parse.stemmer = stemming

    corpus_list = r.read_corpus()

    for idx in range(len(corpus_list)):
        documents_list = r.read_file(file_name=corpus_list[idx],
                                     read_corpus=True)
        for i in tqdm(range(len(documents_list))):
            parsed_document = p.parse_doc(documents_list[i])
            if i == len(documents_list) - 1 and idx == len(corpus_list) - 1:
                indexer.is_last_doc = True
            indexer.add_new_doc(parsed_document)
            number_of_documents += 1
        indexer.is_last_doc = False
    documents_list = []

    with open('spell_dict.json', 'w') as f:
        json.dump(indexer.spell_dict, f)

    with open("docs_dict_and_extras", "wb") as pickle_out:
        pickle.dump(indexer.docs_dict, pickle_out)

    start = time.time()
    indexer.merge_files()
    end = time.time()
    print("merge time was: {}".format(end - start))

    utils.save_obj(indexer.inverted_idx, "inverted_index")
    with open("docs_dict_and_extras", "ab") as pickle_out:
        pickle.dump(number_of_documents, pickle_out)
        pickle.dump(Parse.AMOUNT_OF_NUMBERS_IN_CORPUS, pickle_out)
        pickle.dump(indexer.dump_path, pickle_out)
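Since "docs_dict_and_extras" ends up holding several objects pickled back to back (docs_dict first with "wb", then three more appended with "ab"), it has to be read back with the same number of sequential load calls. A sketch of such a reader (the function name is ours, not from the snippet):

import pickle

def load_docs_dict_and_extras(path="docs_dict_and_extras"):
    # objects come back in the order they were dumped
    with open(path, "rb") as pickle_in:
        docs_dict = pickle.load(pickle_in)
        number_of_documents = pickle.load(pickle_in)
        amount_of_numbers_in_corpus = pickle.load(pickle_in)
        dump_path = pickle.load(pickle_in)
    return docs_dict, number_of_documents, amount_of_numbers_in_corpus, dump_path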
Example #23
def main(corpus_path='',
         output_path='.',
         stemming=False,
         queries='',
         num_docs_to_retrive=10):
    """
    This is the main function for the search engine.
    It manages parsing the data, indexing id and running queries on the data.
    :param corpus_path: string that points to corpus path, where the input files lay
    :param output_path: String that points to where the results of the query should be writen
    :param stemming: boolean that decides if the negine will apply stemming or not
    :param queries: list of queris, or a string that points to a file with queries
    :param num_docs_to_retrive: Maximum number of tweets to retrive per query
    :return: -
    """

    start = dt.datetime.now()
    # Entry point to the parsing and indexing phase
    run_engine(corpus_path, output_path, stemming)
    end = dt.datetime.now()
    total_parse_and_ind_time = (end - start).total_seconds() / 60.0
    #print("Total parsing and building index and posting time was: {}".format(total_parse_and_ind_time))
    start = dt.datetime.now()
    k = num_docs_to_retrive
    config = ConfigClass(corpus_path, stemming, output_path)
    inverted_index = load_index(config)

    # Handle both cases of queries input, list and file name
    if isinstance(queries, list):
        queries_list = queries
    else:
        queries_list = parse_queries_from_file(queries)

    output_set = []
    for i, query in enumerate(queries_list):

        #print(query)

        # querying phase
        doc_tuples = search_and_rank_query(query, inverted_index, k, config)
        for doc_tuple in doc_tuples:
            output_set.append((i + 1, doc_tuple[0], doc_tuple[1]))
            print(
                'query number:{} tweet id: {}, score (TF-idf cosine similarity): {}'
                .format(i + 1, doc_tuple[0], doc_tuple[1]))
    results_set = pd.DataFrame(output_set,
                               columns=['query_num', 'tweet_id', 'tf_score'])
    # Write results to output

    outfile = output_path + '/results.csv'

    results_set.to_csv(outfile)
    end = dt.datetime.now()
    total_query_time = (end - start).total_seconds()
Example #24
 def __init__(self, config=None):
     if config is None:
         config = ConfigClass()
     self._config = config
     if config.toStem:
         self._parser = Parse_stem()
     else:
         self._parser = Parse()
     self._indexer = Indexer(config)
     self._model = None
Example #25
 def __init__(self, config=None, run_config=None):
     if not config:
         config = ConfigClass()
     if not run_config:
         run_config = RunConfigClass()
     self._run_config = run_config
     self._config = config
     self._parser = Parse(run_config)
     self._indexer = Indexer(run_config)
     self._model = None
     self.searcher = Searcher(self._parser, self._indexer, run_config, model=self._model)
Example #26
def run_engine():
    """

    :return:
    """
    number_of_documents = 0
    timer = True
    config = ConfigClass()
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse()  #p = Parse(with_stemmer=True)
    indexer = Indexer(config)

    data_dir = 'Data' + os.sep + 'Data'
    npy_dirs = [root for root, dirs, files in os.walk(data_dir)]
    for dir_path in npy_dirs:
        files = [
            os.path.join(dir_path, fname) for fname in os.listdir(dir_path)
            if fname.endswith('.parquet')
        ]
        for file in files:
            tweets = r.read_file(file_name=file)
            start_time = time.perf_counter()
            # use a context manager so the worker pool is cleaned up after each file
            with multiprocessing.Pool(12) as pool:
                documents_list = pool.map(p.parse_doc, tweets)
            end_time = time.perf_counter()
            avg_time_per_tweet = (end_time - start_time) / len(tweets)
            print(
                f'Parsed {len(tweets)} tweets, Elapsed time: {end_time - start_time:0.4f} seconds, average per tweet: {avg_time_per_tweet:0.8f} seconds'
            )

            start_time = time.perf_counter()
            for parsed_document in documents_list:
                indexer.add_new_doc(parsed_document)
            end_time = time.perf_counter()
            print(
                f'Indexing {len(documents_list)} tweets, Elapsed time: {end_time - start_time:0.4f} seconds'
            )
    print('Finished parsing and indexing. Starting to export files')
    utils.save_obj(indexer.inverted_idx, "inverted_idx")
    utils.save_obj(indexer.postingDict, "posting")
Example #27
def main(corpus_path, output_path, stemming, queries, num_docs_to_retrieve):
    if queries is not None:
        config = ConfigClass(corpus_path, output_path, stemming)
        run_engine(config)
        query_list = utils.load_queries_list(queries)
        inverted_index = load_index(output_path)
        for idx in range(1, len(query_list) + 1):
            print("query {}:".format(idx))
            for doc_tuple in search_and_rank_query(query_list[idx - 1],
                                                   inverted_index,
                                                   k=num_docs_to_retrieve,
                                                   config=config):
                print('\ttweet id: {} | score : {} '.format(
                    doc_tuple[0], doc_tuple[1]))
Example #28
def main(corpus_path=None, output_path='', stemming=False, queries=None, num_docs_to_retrieve=1):
    if queries is not None:
        config = ConfigClass(corpus_path, output_path, stemming)
        run_engine(config)

        query_list = handle_queries(queries)
        inverted_index, document_dict, num_of_docs, avg_length_per_doc = load_index(output_path)
        # tweet_url = 'http://twitter.com/anyuser/status/'
        # num_of_docs = 10000000
        # avg_length_per_doc = 21.5
        for idx, query in enumerate(query_list):
            docs_list = search_and_rank_query(query, inverted_index, document_dict, num_docs_to_retrieve, num_of_docs, avg_length_per_doc, config)
            for doc_tuple in docs_list:
                print('tweet id: {}, score: {}'.format(str(doc_tuple[1]), doc_tuple[0]))
Example #29
def run_engine():
    """

    :return:
    """
    number_of_documents = 0

    config = ConfigClass()
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse()
    indexer = Indexer(config)

    documents_list = r.read_file(file_name='sample3.parquet')
    # Iterate over every document in the file
    for idx, document in enumerate(documents_list):
        # parse the document
        parsed_document = p.parse_doc(document)
        number_of_documents += 1
        # index the document data
        indexer.add_new_doc(parsed_document)
    print('Finished parsing and indexing. Starting to export files')

    utils.save_obj(indexer.inverted_idx, "inverted_idx")
    utils.save_obj(indexer.postingDict, "posting")
Example #30
def main():
    config = ConfigClass()
    corpus_path = config.get__corpusPath()
    Search_Engine = SearchEngine(config)
    Search_Engine.build_index_from_parquet(corpus_path)
    #Search_Engine.load_index('idx_bench.pkl')
    print(datetime.now())
    final_tweets = Search_Engine.search('Herd immunity has been reached.')
    print(datetime.now())
    print("num of relevant:", final_tweets[0])
    # print only the first 5 results
    for num, tweet_id in enumerate(final_tweets[1], start=1):
        if num > 5:
            break
        print("Tweet id: " + "{" + tweet_id + "}" + " Score: " + "{" + str(num) + "}")