Example #1
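# NOTE: this snippet depends on names defined elsewhere in the project:
# ReadFile, Parse, Indexer, utils, and a module-level `config` object.
# The import paths below are assumptions for illustration, not confirmed:
# from reader import ReadFile
# from parser_module import Parse
# from indexer import Indexer
# import utils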
def run_engine():
    """

    :return:
    """
    number_of_documents = 0
    corpus_path = config.get__corpusPath()
    r = ReadFile(corpus_path)
    indexer = Indexer(config)
    p = Parse(config)

    # Read the corpus folder by folder
    r.create_files_name_list()
    files_list = []  # each entry holds all tweets from one folder
    for file_name in r.dates_list:
        tweets_per_date = r.read_file(file_name)
        files_list.append(tweets_per_date)
    #print("files_list", len(files_list))

    # Total number of tweets across all folders (passed to the indexer below)
    num_of_tweets = sum(len(folder_list) for folder_list in files_list)
    """#reading per folder
    r.create_files_name_list()
    threads = []
    for file_name in r.dates_list:
        t = threading.Thread(target=r.read_file(file_name))
        threads.append(t)
        t.start()
    print("files_list", r.files_list)"""
    """counter = 1
    procs = []
    # Iterate over every folder in the DATA
    for folder_list in files_list:
        proc = Process(target=test, args=(folder_list, counter, indexer, number_of_documents,))
        procs.append(proc)
        proc.start()
    # complete the processes
    for proc in procs:
        proc.join()
    print('Finished parsing and indexing. Starting to export files')"""

    # Iterate over every folder in the corpus
    for folder_list in files_list:
        # Iterate over every tweet in the folder
        for tweet in folder_list:
            # Parse the raw tweet into a document object
            parsed_document = p.parse_doc(tweet)
            number_of_documents += 1
            # Index the parsed document; num_of_tweets is passed in,
            # presumably so the indexer can compute corpus-wide
            # statistics (e.g. tf-idf)
            indexer.add_new_doc(parsed_document, num_of_tweets)
    """#read only one folder
    documents_list = r.read_file(file_name='')
    num_indexed = len(documents_list)

    # Iterate over every document in the file
    for idx, document in enumerate(documents_list):
        # parse the document
        parsed_document = p.parse_doc(document)
        number_of_documents += 1
        # index the document data
        indexer.add_new_doc(parsed_document, num_indexed)
    #print('Finished parsing and indexing. Starting to export files')"""

    # Persist the index structures to disk for later retrieval
    utils.save_obj(indexer.inverted_idx, "inverted_idx")
    utils.save_obj(indexer.tf_idf_dict, "tf_idf_dict")
    # Return the indexer's LDA object
    return indexer.get__lda__()
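
# A minimal usage sketch, assuming `config` and the project modules noted
# at the top are available; `lda_model` is a hypothetical variable name:
if __name__ == '__main__':
    lda_model = run_engine()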