# Standard-library imports used across this module. Project-local modules
# (merm_model, env, log, pipe_process, post_process, es_extract, pipeline,
# utils, dfutils, etc.) are assumed to be importable from the surrounding
# package; log is assumed to expose a logging-style getLogger().
import time
from datetime import datetime
from typing import List, Tuple


def run_pipeline(package: merm_model.PipelinePackage):
    log.getLogger().warning("------- STARTING PIPELINE -------")
    # Create the factory that maps step names to processing functions.
    factory = package.dependencies_dict["pipe_process"].PipelineFactory()
    # Specify the steps, then order them by their numeric rank.
    pipeline_steps = pick_pipeline()
    log.getLogger().info(str(pipeline_steps))
    pipeline_steps.sort(key=lambda tup: tup[0])
    # ...and we're off to the races :)
    for step_tuple in pipeline_steps:
        if env.continue_run():
            package = factory.next_step(step_tuple[1], package)
        else:
            log.getLogger().warning("Continue run is FALSE")
    log.getLogger().info("------- PIPELINE COMPLETED -------")
    # Post pipeline: the data is no longer changing; it is ready for
    # functional application.
    log.getLogger().warning("------- POST PROCESS APPLICATION -------")
    if env.continue_run():
        post_process.triage(package)
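
# pick_pipeline() is defined elsewhere in this module; the sketch below is an
# assumption about its shape, not the project's actual implementation. All
# run_pipeline() requires is a list of (rank, step_name) tuples, where each
# step_name is a key the PipelineFactory can dispatch on.
def pick_pipeline_sketch():
    return [(2, "tfidf"), (1, "tokenize"), (3, "summarize")]  # hypothetical names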

def _iterate_corpus(package: merm_model.PipelinePackage):
    count = 0
    try:
        for linked_doc in package.linked_document_list:
            if env.continue_run():
                # For each doc in the corpus, split the raw text into
                # sentences: one LinkedDocument per sentence.
                log.getLogger().debug(str(env.continue_run()))
                doc_by_sentence_list = _split_linked_doc_by_sentence(linked_doc)
                # Tokenize and lemmatize the sentences.
                doc_uid = linked_doc.uid
                doc_url = linked_doc.ui
                lemmatized_sentences = _lemmatize_sentences(doc_by_sentence_list)
                # Cap very long documents at 2000 sentences.
                if len(lemmatized_sentences) > 2000:
                    lemmatized_sentences = lemmatized_sentences[:2000]
                salient_corpus_map = _generate_partof_docs(package, lemmatized_sentences, doc_uid, doc_url)
                endmsg = "\n\n" + str(count) + ": Dispatching " + str(len(salient_corpus_map)) + " parts from " + doc_url + ".\n\n"
                log.getLogger().debug(endmsg)
                _generate_json_and_dispatch(salient_corpus_map, 0)
                count = count + 1
                if count % 300 == 0:
                    log.getLogger().info("running " + str(count))
    except Exception as e:
        msg = "\n\nERROR: " + str(e)
        log.getLogger().error(msg)
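
# _split_linked_doc_by_sentence() and _lemmatize_sentences() are project
# helpers not shown here. A rough sketch of the splitting step, assuming the
# document exposes a raw-text string and NLTK is available (both assumptions;
# may require a one-time nltk.download("punkt")):
from nltk.tokenize import sent_tokenize

def _split_text_by_sentence_sketch(raw_text):
    # One entry per sentence; the real helper wraps each sentence in a
    # merm_model.LinkedDocument rather than returning bare strings.
    return [s.strip() for s in sent_tokenize(raw_text) if s.strip()]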

def initiate_run():
    try:
        log.getLogger().info(env.printEnvironment())
        env.init()
        log.getLogger().info(env.printConf())
        continue_run = True
        # Dependency-injection dict handed to every pipeline step.
        dependencies_dict = {}
        dependencies_dict["env"] = env
        dependencies_dict["factory"] = factory
        dependencies_dict["es_extract"] = es_extract
        dependencies_dict["pipe_process"] = pipe_process
        dependencies_dict["utils"] = utils
        dependencies_dict["dfutils"] = dfutils
        while continue_run:
            es_extract.initiate_extraction(pipeline.run_pipeline, dependencies_dict)
            continue_run = env.continue_run()
            if not env.run_forever():
                break
        log.getLogger().info("#################### Run Completed :) ####################")
    except Exception as e:
        msg = str(e)
        log.getLogger().error(env.print_traceback())
        log.getLogger().error(msg)
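
# A sketch of how a downstream step consumes the injected dependencies
# (hypothetical step function; the real steps live in pipe_process):
def example_step(package):
    utils_mod = package.dependencies_dict["utils"]  # registered in initiate_run()
    env_mod = package.dependencies_dict["env"]
    # ...use utils_mod / env_mod, then return the (possibly mutated) package.
    return package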

def run_post_process(package: merm_model.PipelinePackage):
    if env.continue_run():
        # Top terms per document, as (term, tfidf_weight) pairs.
        tfidf_top_terms: List[List[Tuple[str, float]]] = package.any_analysis()
        _validate_corpus(tfidf_top_terms, package.linked_document_list)
        _create_spaces()
        log.getLogger().info("Corpus size: " + str(len(package.linked_document_list)))
        _iterate_corpus(package)
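
# _validate_corpus() and _create_spaces() are project helpers not shown here.
# A minimal sketch of the sanity check the type hint suggests (an assumption
# about _validate_corpus's intent, not its actual body):
def _validate_corpus_sketch(tfidf_top_terms, linked_document_list):
    # Expect one list of (term, weight) tuples per document in the corpus.
    if len(tfidf_top_terms) != len(linked_document_list):
        raise ValueError("tfidf rows: " + str(len(tfidf_top_terms)) +
                         " != documents: " + str(len(linked_document_list)))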

def step_through(package: merm_model.PipelinePackage, pipeline_steps, log_string):
    factory = package.dependencies_dict["pipe_process"].PipelineFactory()
    for step_tuple in pipeline_steps:
        start_time = time.time()
        if env.continue_run():
            package = factory.next_step(step_tuple[1], package)
            end_time = time.time() - start_time
            log.getLogger().info("Time to complete: " + str(end_time))
            # Accumulate a per-step report: step name, stage log, elapsed time.
            log_string = log_string + "\n\n------------\n\n" + step_tuple[1] + \
                "\n\n" + package.stage_log() + "\nTime: " + str(end_time)
        else:
            log.getLogger().warning("Continue run is FALSE")
    package.log_stage(log_string)
    return package
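
# For reference, each step appends a block to log_string that renders roughly
# like this (illustrative step name and timing):
#
# ------------
#
# tokenize
#
# <output of package.stage_log()>
# Time: 0.42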

# Extended variant of run_pipeline (see above): writes a per-run report file
# and supports looped execution via any_inputs_dict.
def run_pipeline(package: merm_model.PipelinePackage):
    log.getLogger().warning("------- STARTING PIPELINE -------")
    env = package.dependencies_dict["env"]
    report_dir = env.config["job_instructions"]["output_folder"]
    provider = env.config["extract_instructions"]["provider"]
    pipeline_name = env.config["pipeline_instructions"]["pipeline_name"]
    queryvalue = env.config["extract_instructions"]["query_value"]
    # Build a near-unique report file name from the run parameters plus the
    # last four digits of the current microsecond count.
    dt = datetime.now()
    suffix = str(dt.microsecond)[-4:]
    file_name = package.dependencies_dict["utils"].clean_string_for_tokenizing(
        provider + "_" + pipeline_name + "_" + queryvalue + "_" + suffix).replace(" ", "_") + ".txt"
    # Specify the steps, then order them by their numeric rank.
    pipeline_steps = pick_pipeline()
    log.getLogger().info(str(pipeline_steps))
    pipeline_steps.sort(key=lambda tup: tup[0])
    log_string = ""
    # ...and we're off to the races :)
    package = step_through(package, pipeline_steps, log_string)
    # Optional looping: a pipeline step is expected to advance "current_loop"
    # in any_inputs_dict; otherwise this while loop would never terminate.
    if "current_loop" in package.any_inputs_dict:
        current_loop = package.any_inputs_dict["current_loop"]
        while current_loop < package.any_inputs_dict["loop_count"]:
            current_loop = package.any_inputs_dict["current_loop"]
            package = step_through(package, pipeline_steps, package.stage_log())
    env.overwrite_file(report_dir + "/" + file_name, package.stage_log())
    log.getLogger().info("------- PIPELINE COMPLETED -------")
    # Post pipeline: the data is no longer changing; it is ready for
    # functional application.
    log.getLogger().warning("------- POST PROCESS APPLICATION -------")
    if env.continue_run():
        post_process.triage(package)
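
# The looping contract above made explicit: some step in pipe_process is
# expected to advance "current_loop", roughly like this hypothetical example:
def advance_loop(package):
    package.any_inputs_dict["current_loop"] = package.any_inputs_dict.get("current_loop", 0) + 1
    return package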

# Extended variant of initiate_run (see above): registers more dependencies
# and constructs the PipelinePackage directly instead of delegating package
# creation to es_extract.
def initiate_run():
    try:
        log.getLogger().info(env.printEnvironment())
        env.init()
        log.getLogger().info(env.printConf())
        continue_run = True
        # Dependency-injection dict handed to every pipeline step.
        dependencies_dict = {}
        dependencies_dict["env"] = env
        dependencies_dict["factory"] = factory
        dependencies_dict["es_extract"] = es_extract
        dependencies_dict["pipe_process"] = pipe_process
        dependencies_dict["utils"] = utils
        dependencies_dict["dfutils"] = dfutils
        dependencies_dict["colutils"] = colutils
        dependencies_dict["log"] = log
        dependencies_dict["es_conn"] = es_conn
        dependencies_dict["ingestor"] = ingestor
        dependencies_dict["syntax"] = syntax
        log.getLogger().info("Dependencies: ")
        for k, v in dependencies_dict.items():
            log.getLogger().info(str(k) + " : " + str(v))
        while continue_run:
            # Fresh, empty package per run; only the dependencies carry over.
            package = merm_model.PipelinePackage(None, None, None, None, {}, {}, dependencies_dict)
            package.any_analysis_dict["stage_log"] = ""
            pipeline.run_pipeline(package)
            continue_run = env.continue_run()
            if not env.run_forever():
                break
        log.getLogger().info("#################### Run Completed :) ####################")
    except Exception as e:
        msg = str(e)
        log.getLogger().error(env.print_traceback())
        log.getLogger().error(msg)
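
# Typical module entry point (an assumption; the actual launcher for this
# project is not shown in this section):
if __name__ == "__main__":
    initiate_run()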