Ejemplo n.º 1
0
def read_text_and_save(user,
                       folder_path,
                       parsed_path,
                       parsed_filename,
                       fast=True):
    """Extract the text of a user's curriculum file, normalise it and save it.

    The file name is derived from ``user.curriculum_url`` with ``path_leaf``.
    If that name equals ``parsed_filename`` the file is not read, so an empty
    string is normalised and written instead.

    Args:
        user: Object exposing a ``curriculum_url`` attribute.
        folder_path: Directory handed to ``h.rename_filename`` and
            ``get_text`` to locate the file.
        parsed_path: Output path; opened in ``'w'`` mode (truncating any
            existing content) with UTF-8 encoding.
        parsed_filename: File name that must be skipped when it matches the
            curriculum file name.
        fast: Forwarded unchanged to ``get_text``.

    Returns:
        The normalised, lower-cased text that was written to ``parsed_path``.
    """
    extracted = ''
    cv_name = path_leaf(user.curriculum_url)

    if cv_name != parsed_filename:
        # File names containing spaces are renamed to space-free ones,
        # because that makes executing shell commands on them easier.
        cv_name = h.rename_filename(folder_path, cv_name)

        _, suffix = os.path.splitext(cv_name)
        extracted += get_text(folder_path, cv_name, suffix.lower(), fast=fast)

    normalized = h.remove_accents_and_non_ascii(extracted).lower()

    # TODO: Activate stemming: adding only the roots of words
    #normalized = multilingual_stemmer(normalized)

    with open(parsed_path, 'w', encoding='UTF-8') as out_file:
        out_file.write(normalized)
        # Logged while the file is still open, right after the write.
        h.log('new document: {}'.format(parsed_path))

    return normalized
Ejemplo n.º 2
0
def run():
    """Run the context search and log how long it took.

    ``sys.stdout`` is replaced by ``h.Unbuffered`` wrapping
    ``context_search.log`` opened in append mode; the replacement is not
    undone when the function returns.
    """
    log_stream = open('context_search.log', 'a')
    sys.stdout = h.Unbuffered(log_stream)

    h.log("STARTED CONTEXT SEARCH")
    started_at = time.time()

    # TODO: Get relevances with word2vec and gensim.
    compute_related_words()

    elapsed = time.time() - started_at
    h.log('FINISHED CONTEXT SEARCH, time: {}'.format(elapsed))
Ejemplo n.º 3
0
def run_watchdog(f, limit_hours=2.0):
    """Run ``f`` in a child process and terminate it if it runs too long.

    The child is polled every ``CHECKING_INTERVAL`` seconds. If it is still
    alive once more than ``limit_hours`` hours have elapsed, it is terminated
    and a timeout error is printed. In every case the total elapsed time is
    logged through ``h.log`` before returning.

    Args:
        f: Callable taking no arguments; used as the child process target.
        limit_hours: Maximum allowed run time, in hours.
    """
    print("{}: STARTED {}".format(datetime.today(), retrieve_name(f)))

    # A monotonic clock keeps the elapsed time immune to wall-clock changes.
    start = time.monotonic()
    limit_seconds = limit_hours * 3600
    p = Process(target=f, args=())
    p.start()

    while True:  # control cycle
        # Wait at most one checking interval, but return as soon as the child
        # exits, so a finished child is noticed immediately rather than after
        # a full fixed sleep.
        p.join(CHECKING_INTERVAL)

        # Only a child that is still running can be "killed"; this avoids a
        # false timeout report for a child that already exited on its own.
        if p.is_alive() and time.monotonic() - start > limit_seconds:
            p.terminate()
            p.join()
            print(
                "ERROR Timeout: Process manager killed {p} after {seconds} second\n"
                .format(p=retrieve_name(f),
                        seconds=str(time.monotonic() - start)))

        if not p.is_alive():
            h.log('{0}, took: {1}'.format(retrieve_name(f),
                                          time.monotonic() - start))
            break
Ejemplo n.º 4
0
def get_text(folder_path, doc, extension, fast=True):
    """Return the text of a document using the reader that fits its extension.

    Args:
        folder_path: Directory containing the document.
        doc: File name of the document inside ``folder_path``.
        extension: File extension including the leading dot (e.g. ``'.pdf'``);
            compared as-is, so callers are expected to pass it lower-cased.
        fast: Forwarded to ``h.get_pdf_text``; only used for PDF files.

    Returns:
        The extracted text. An empty string is returned when ``extension`` is
        not in ``VALID_EXTENSIONS`` (a message is logged in that case) or when
        it is valid but none of the reader branches below handles it.
    """
    full_path = os.path.join(folder_path, doc)

    if extension not in VALID_EXTENSIONS:
        h.log('Found invalid or unimplemented extension {}, will not read.'.
              format(extension))
        return ''

    if extension in {'.jpg', '.png', '.jpeg'}:  # image.
        return h.get_image_text(full_path)

    if extension in {'.doc', '.docx'}:  # word doc.
        return h.get_word_text(full_path)

    if extension == '.pdf':
        return h.get_pdf_text(folder_path, full_path, fast=fast)

    if extension == '.txt':
        return h.get_text_from_txt_file(full_path)

    # Valid extension without a dedicated reader: nothing to extract.
    return ''
Ejemplo n.º 5
0
def run():
    """Build both relevance dictionaries, logging the duration of each step.

    ``save_relevance_dictionary`` runs first, then
    ``save_user_relevance_dictionary``; each is bracketed by a start message
    and a message carrying its elapsed time in seconds.
    """
    # Redirection of stdout to a log file is currently disabled:
    #f = open('search_engine.log', 'a')
    #sys.stdout = h.Unbuffered(f)

    def _timed_step(start_message, step, done_template):
        # Log the start, run the step, then log how long it took.
        h.log(start_message)
        began = time.time()
        step()
        finished = time.time()
        h.log(done_template.format(finished - began))

    _timed_step("STARTED RELEVANCE DICT",
                save_relevance_dictionary,
                'RELEVANCE DICTIONARY, time: {}')

    _timed_step("STARTED USER RELEVANCE DICT",
                save_user_relevance_dictionary,
                'USER RELEVANCE DICTIONARY, time: {}')
Ejemplo n.º 6
0
    print("{}: STARTED {}".format(datetime.today(), retrieve_name(f)))

    start = time.time()
    p = Process(target=f, args=())
    p.start()

    while True:  # control cycle
        time.sleep(CHECKING_INTERVAL)
        if time.time() - start > limit_hours * 3600:
            p.terminate()
            p.join()
            print(
                "ERROR Timeout: Process manager killed {p} after {seconds} second\n"
                .format(p=retrieve_name(f), seconds=str(time.time() - start)))

        if not p.is_alive():
            h.log('{0}, took: {1}'.format(retrieve_name(f),
                                          time.time() - start))
            break


if __name__ == '__main__':
    # Script entry point: send everything printed from here on to main.log
    # (append mode), wrapped in h.Unbuffered.
    sys.stdout = h.Unbuffered(open('main.log', mode='a'))

    h.log('PROCESS MANAGER STARTED')

    # Every watchdog-supervised job is currently disabled:
    #run_watchdog(document_reader_run, limit_hours=2)
    #run_watchdog(search_engine_run, limit_hours=3)
    #run_watchdog(context_search_run, limit_hours=3)
    #run_watchdog(model_run, limit_hours=1)

    h.log('PROCESS MANAGER FINISHED')