def read_text_and_save(user, folder_path, parsed_path, parsed_filename, fast=True):
    """Extract all text from a user's document, write it to parsed_path and return it."""
    text = ''
    filename = path_leaf(user.curriculum_url)
    if filename != parsed_filename:
        # Rename any file that has spaces to one without spaces,
        # because it is easier to execute shell commands on it.
        filename = h.rename_filename(folder_path, filename)
    extension = os.path.splitext(filename)[1].lower()
    text += get_text(folder_path, filename, extension, fast=fast)
    text = h.remove_accents_and_non_ascii(text).lower()
    # TODO: Activate stemming: keep only the roots of words.
    # text = multilingual_stemmer(text)
    with open(parsed_path, 'w', encoding='UTF-8') as f:
        f.write(text)
    h.log('new document: {}'.format(parsed_path))
    return text
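# Illustrative call sketch (not from the original source): the surrounding
# pipeline is assumed to pass a `user` object exposing `curriculum_url`, the
# folder holding the raw upload, and the path where the parsed text is cached.
# The paths and filename below are hypothetical.
#
#     text = read_text_and_save(user,
#                               folder_path='/media/docs/42',
#                               parsed_path='/media/parsed/42.txt',
#                               parsed_filename='resume.pdf')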
def run():
    sys.stdout = h.Unbuffered(open('context_search.log', 'a'))
    h.log("STARTED CONTEXT SEARCH")
    t0 = time.time()
    # TODO: Get relevances with word2vec and gensim.
    compute_related_words()
    t1 = time.time()
    h.log('FINISHED CONTEXT SEARCH, time: {}'.format(t1 - t0))
def run_watchdog(f, limit_hours=2.0):
    print("{}: STARTED {}".format(datetime.today(), retrieve_name(f)))
    start = time.time()
    p = Process(target=f, args=())
    p.start()
    while True:  # control cycle
        time.sleep(CHECKING_INTERVAL)
        if time.time() - start > limit_hours * 3600:
            p.terminate()
            p.join()
            print(
                "ERROR Timeout: Process manager killed {p} after {seconds} seconds\n"
                .format(p=retrieve_name(f), seconds=str(time.time() - start)))
        if not p.is_alive():
            h.log('{0}, took: {1}'.format(retrieve_name(f), time.time() - start))
            break
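# Usage sketch (an assumption about how the module entry points are wired; the
# commented-out calls in the __main__ block of this manager show the intended
# call sites):
#
#     run_watchdog(document_reader_run, limit_hours=2)
#
# The worker runs in a separate Process; if it exceeds limit_hours it is
# terminated, and the loop exits as soon as the process is no longer alive.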
def get_text(folder_path, doc, extension, fast=True):
    """Gets the text regardless of the extension."""
    filename = os.path.join(folder_path, doc)
    text = ''
    if extension in VALID_EXTENSIONS:
        if extension in {'.jpg', '.png', '.jpeg'}:  # image.
            text = h.get_image_text(filename)
        elif extension in {'.doc', '.docx'}:  # Word document.
            text = h.get_word_text(filename)
        elif extension == '.pdf':
            text = h.get_pdf_text(folder_path, filename, fast=fast)
        elif extension == '.txt':
            text = h.get_text_from_txt_file(filename)
    else:
        h.log('Found invalid or unimplemented extension {}, will not read.'
              .format(extension))
    return text
def run():
    # f = open('search_engine.log', 'a')
    # sys.stdout = h.Unbuffered(f)
    h.log("STARTED RELEVANCE DICT")
    t0 = time.time()
    save_relevance_dictionary()
    t1 = time.time()
    h.log('RELEVANCE DICTIONARY, time: {}'.format(t1 - t0))

    h.log("STARTED USER RELEVANCE DICT")
    t0 = time.time()
    save_user_relevance_dictionary()
    t1 = time.time()
    h.log('USER RELEVANCE DICTIONARY, time: {}'.format(t1 - t0))
print("{}: STARTED {}".format(datetime.today(), retrieve_name(f))) start = time.time() p = Process(target=f, args=()) p.start() while True: # control cycle time.sleep(CHECKING_INTERVAL) if time.time() - start > limit_hours * 3600: p.terminate() p.join() print( "ERROR Timeout: Process manager killed {p} after {seconds} second\n" .format(p=retrieve_name(f), seconds=str(time.time() - start))) if not p.is_alive(): h.log('{0}, took: {1}'.format(retrieve_name(f), time.time() - start)) break if __name__ == '__main__': sys.stdout = h.Unbuffered(open('main.log', 'a')) h.log('PROCESS MANAGER STARTED') #run_watchdog(document_reader_run, limit_hours=2) #run_watchdog(search_engine_run, limit_hours=3) #run_watchdog(context_search_run, limit_hours=3) #run_watchdog(model_run, limit_hours=1) h.log('PROCESS MANAGER FINISHED')