def initiate_run():
    try:
        log.getLogger().info(env.printEnvironment())
        env.init()
        log.getLogger().info(env.printConf())
        continue_run = True

        # Dependencies shared by every pipeline step.
        dependencies_dict = {}
        dependencies_dict["env"] = env
        dependencies_dict["factory"] = factory
        dependencies_dict["es_extract"] = es_extract
        dependencies_dict["pipe_process"] = pipe_process
        dependencies_dict["utils"] = utils
        dependencies_dict["dfutils"] = dfutils

        # Extract and run the pipeline, repeating only when configured to run forever.
        while continue_run:
            es_extract.initiate_extraction(pipeline.run_pipeline, dependencies_dict)
            continue_run = env.continue_run()
            if not env.run_forever():
                break
        log.getLogger().info("#################### Run Completed :) #################### ")
    except Exception as e:
        msg = str(e)
        log.getLogger().error(env.print_traceback())
        log.getLogger().error(msg)
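
# Hypothetical entry point, not taken from the source: a sketch of how initiate_run
# would likely be invoked if this module is executed directly.
if __name__ == "__main__":
    initiate_run()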
def perform(self, package: merm_model.PipelinePackage): if ("job" not in package.any_analysis_dict["provider"]): raise Exception("This classs will not work on " + str(package.any_analysis_dict["provider"])) df = package.corpus log.getLogger().info("Shape of DF: " + str(df.shape)) jobs_dict = {} for index, row in df.iterrows(): majorFinal = row["majorFinal"] if majorFinal is None: jobs_string = row["jobFinal"] if jobs_string in jobs_dict.keys(): jobs_dict[jobs_string] = jobs_dict[jobs_string] + 1 else: jobs_dict[jobs_string] = 1 package.any_analysis_dict["no_major_jobs_count"] = jobs_dict for index, row in df.iterrows(): jobs_string = row["jobFinal"] if jobs_string in jobs_dict.keys(): jobs_dict[jobs_string] = jobs_dict[jobs_string] + 1 else: jobs_dict[jobs_string] = 1 package.any_analysis_dict["jobs_count"] = jobs_dict return package
def _run_lda(self, topic_count, report_word_count, permitted_overlap, package: merm_model.PipelinePackage):
    topic_dict = {}
    topic_dict_friendly = {}
    lda_model = gensim.models.ldamodel.LdaModel(corpus=package.corpus,
                                                id2word=package.dict,
                                                num_topics=topic_count,
                                                update_every=1,
                                                alpha='auto',
                                                per_word_topics=False,
                                                iterations=100)
    topics = lda_model.show_topics(formatted=False, num_words=report_word_count)
    for index, topic in topics:
        # print('Topic: {} \nWords: {}'.format(index, [w[0] for w in topic]))
        words_for_topic = []
        words_for_topic_friendly = []
        for w in topic:
            words_for_topic.append((w[0], w[1]))
            words_for_topic_friendly.append(str(w[0]) + "," + str(w[1]))
        topic_dict[index] = words_for_topic
        topic_dict_friendly[index] = words_for_topic_friendly

    # If too many terms recur across topics, treat them as stop words and rerun.
    topic_overlap = self._topic_overlap(topic_dict)
    log.getLogger().info(str(topic_overlap))
    stop_words = self._dynamic_stop_words(topic_overlap, permitted_overlap)
    if len(stop_words) > permitted_overlap:
        log.getLogger().info("\n**********\nRerunning LDA after removing " + str(len(stop_words)) + " words")
        package = self._remove_stop_words(stop_words, package)
        package = self._rebuild_corpus(package)
        return self._run_lda(topic_count, report_word_count, permitted_overlap, package)

    package.any_analysis_dict[lda_analysis_key(package) + "_topic_overlap"] = topic_overlap
    package.any_analysis_dict[lda_analysis_key(package)] = topic_dict
    package.any_analysis_dict[lda_analysis_key(package) + "_friendly"] = topic_dict_friendly
    return package
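
# _topic_overlap and _dynamic_stop_words are referenced above but not shown here.
# A minimal sketch of the overlap computation, assuming it returns the terms that
# appear in more than one topic; this is an illustration, not the project's code.
def _topic_overlap_sketch(topic_dict):
    term_to_topics = {}
    for topic_id, word_weight_pairs in topic_dict.items():
        for word, _weight in word_weight_pairs:
            term_to_topics.setdefault(word, []).append(topic_id)
    # Keep only terms shared by two or more topics.
    return {term: topics for term, topics in term_to_topics.items() if len(topics) > 1}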
def perform(self, package: data_models.PipelinePackage):
    doc_list = []
    term_dict = {}
    for linked_doc in package.linked_document_list:
        doc_list.append(linked_doc.tokens)
        term_dict[linked_doc.any_inputs["terms"]] = 1

    # Note: size/iter and wv.index2entity are the gensim 3.x API
    # (gensim 4.x renamed them to vector_size/epochs and wv.index_to_key).
    model = gensim.models.Word2Vec(doc_list, size=100, window=10, min_count=2, workers=5, iter=10)

    for terms in list(term_dict.keys()):
        term_list = terms.split(" ")
        for term in term_list:
            if term in list(model.wv.index2entity):
                result = model.wv.most_similar(positive=term)
                output = "\n_____ " + term + " _____\n"
                for rel in result:
                    output = output + rel[0] + "\t" + str(rel[1]) + "\n"
                output = output + "\n - - -\n"
                log.getLogger().info(output)
    return package
def perform(self, package: merm_model.PipelinePackage): if ("job" not in package.any_analysis_dict["provider"]): raise Exception("This classs will not work on " + str(package.any_analysis_dict["provider"])) df = package.corpus log.getLogger().info("Shape of DF: " + str(df.shape)) areas_of_study_dict_undefined = {} for index, row in df.iterrows(): majorFinal = row["majorFinal"] if majorFinal is None: areas_of_study = row["areasOfStudy"] if len(areas_of_study) > 0: areasOfStudyList = areas_of_study.split(",") for s in areasOfStudyList: if s in areas_of_study_dict_undefined.keys(): areas_of_study_dict_undefined[ s] = areas_of_study_dict_undefined[s] + 1 else: areas_of_study_dict_undefined[s] = 1 package.any_analysis_dict[ "undefined_areas_of_study_count"] = areas_of_study_dict_undefined areas_of_study_dict = {} for index, row in df.iterrows(): majorFinal = row["majorFinal"] if majorFinal in areas_of_study_dict.keys(): areas_of_study_dict[ majorFinal] = areas_of_study_dict[majorFinal] + 1 else: areas_of_study_dict[majorFinal] = 1 package.any_analysis_dict["areas_of_study_count"] = areas_of_study_dict return package
def retrieve_index_registry():
    es = connectToES()
    results = es.indices.get('*')
    indices = results.keys()
    log.getLogger().info(str(len(indices)) + " indices found")
    for key in indices:
        log.getLogger().info("Found index: " + str(key))
    return indices
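
# connectToES is not shown in this section. A minimal sketch assuming the standard
# elasticsearch-py 7.x client; the host/port defaults are illustrative and would
# normally come from env.config.
from elasticsearch import Elasticsearch

def connect_to_es_sketch(host="localhost", port=9200):
    return Elasticsearch([{"host": host, "port": port}])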
def run_post_process(package: merm_model.PipelinePackage): log.getLogger().info("save text rank results to file") path = env.config["job_instructions"]["output_folder"] text_rank_results = package.any_analysis_dict["text_rank_all_groups"] text_rank_overall = package.any_analysis_dict["text_rank_0"] # count = 0 # for key in text_rank_results: # # analysis = text_rank_results[key] # if "ict" in type(analysis).__name__: # file_name = path +"/" + "TextRank_" + str(key) + ".csv" # log.getLogger().info("Saving "+ file_name) # with open(file_name, 'w') as f: # for k in analysis.keys(): # for sentence in analysis[k]: # count = count + 1 # f.write("%s,%s,%s\n" % (k, sentence[0], sentence[1])) toes = env.config.getboolean("job_instructions", "output_to_elasticsearch") if True == toes: _reset_index(package) _dispatch_to_elastic_search_all_groups( text_rank_results, package.any_analysis_dict["provider"]) _dispatch_to_elastic_search(text_rank_overall, package.any_analysis_dict["provider"])
def _generate_json_and_dispatch(linked_doc: merm_model.LinkedDocument):
    es = es_conn.connectToES()
    index_name = linked_doc.index_name
    log.getLogger().debug("Dispatching: " + str(linked_doc.uid) + " | " + index_name)
    # doc_type='_doc' is only needed for Elasticsearch 6.x clients; it is deprecated in 7.x.
    result = es.update(index=index_name, doc_type='_doc', id=linked_doc.uid, body=_generate_json())
    log.getLogger().debug("Dispatched with result " + str(result))
def perform(self, package: merm_model.PipelinePackage):
    package.corpus.to_csv(env.config['job_instructions']['es_file_location'], index=False)
    log.getLogger().info("Saved ElasticSearch Data as CSV at: " + env.config['job_instructions']['es_file_location'])
    return package
def run_post_process(package: merm_model.PipelinePackage): log.getLogger().info("rake post process") keywords_dict = package.any_analysis_dict["rake"] sorted_keywords_dict = _sortKeywords(keywords_dict) _saveToFile(sorted_keywords_dict)
def perform(self, package: merm_model.PipelinePackage):
    df = package.corpus
    log.getLogger().info("Shape of DF: " + str(df.shape))
    groupby_dict = {}
    column = package.dependencies_dict["env"].config["ml_instructions"]["df_groupby_column"]
    count = 0
    for index, row in df.iterrows():
        count = count + 1
        if count % 1000 == 0:
            sys.stdout.write(".")  # lightweight progress indicator
        jobs_string = row[column]
        if jobs_string in groupby_dict.keys():
            groupby_dict[jobs_string] = groupby_dict[jobs_string] + 1
        else:
            groupby_dict[jobs_string] = 1
    package.log_stage("Broke a pandas data frame into a dict of data grouped by " + str(column))
    package.any_analysis_dict["group_by_" + column] = groupby_dict
    return package
def run_post_process(package: merm_model.PipelinePackage): log.getLogger().info("run_post_process") csv_list_of_lists = [] csv_list_of_lists.append(["index_name", "topic_id", "term", "weight"]) report_sentences = env.config.getboolean( 'ml_instructions', 'gensim_lda_report_sentence_level') for idxname, topicdict in package.any_analysis().items(): report_for_index = "\n\n\n+++++++++++++++++++\n\nReport for " + idxname + "\n\n" docs_list = package.linked_document_list[idxname] if report_sentences == True: corpus_as_sentences = break_corpus_as_sentences(docs_list) report_for_index += "Corpus Size: " + str(len(docs_list)) + "\n" if len(docs_list) > 100: for topicid, topiclist in topicdict["default_analysis_key"].items( ): report_for_index += "\n\nTOPIC:" + str(topicid) + "\n" for entry in topiclist: report_for_index += str(entry[0]) report_for_index += "\t\t\t" report_for_index += str(entry[1]) report_for_index += "\n" csv_list_of_lists.append( [idxname, topicid, entry[0], entry[1]]) if report_sentences == True: salient_sentences = find_salient_sentences( topiclist, corpus_as_sentences) report_for_index += "\n\nSALIENT_SENTENCES\n" for sentence in salient_sentences: report_for_index += sentence + "\n" log.getReportLogger().info(report_for_index) _save_topic_model(package) _save_csv(csv_list_of_lists, "lda_analysis_by_subset")
def perform(self, package: merm_model.PipelinePackage): utils = package.dependencies_dict["utils"] colutils = package.dependencies_dict["colutils"] env = package.dependencies_dict["env"] embeddings_file = env.config["ml_instructions"][ "text_rank_embeddings_file"] dimensions = env.config.getint("ml_instructions", "glove_dimensions") word_embeddings_list = self._word_embeddings(embeddings_file) #sentences = package.dependencies_dict["utils"].corpus_as_sentence_list(package) tokenized_sentences_by_doc = utils.corpus_as_tokenized_sentence_linked_doc_list_grouped_by_doc( package, True) log.getLogger().info("we have " + str(len(tokenized_sentences_by_doc)) + " docs") rank_by_dict = self._prep_rank_by_doc_dict(package) count = 0 for docid, sentences in tokenized_sentences_by_doc.items(): sentence_by_rank_dict = self.rank_by_document( sentences, word_embeddings_list, package, dimensions) for key, value in sentence_by_rank_dict.items(): sentence_list_for_that_rank = rank_by_dict[key] sentence_list_for_that_rank.append([dimensions, docid, value]) if count % 100 == 0: print(count) count = count + 1 analysis_key = colutils.incrementing_key("text_rank", package.any_analysis_dict) package.any_analysis_dict[analysis_key] = rank_by_dict package.log_stage("Conducting text rank. Total document count is " + str(len(package.linked_document_list)) + \ ". For each document the top " + str(len(list(rank_by_dict.keys()))) + " ranked sentences were captured." + \ "\nGlove dimension count: " + str(dimensions)) return package
def perform(self, package: merm_model.PipelinePackage): analysis_id = self._analysis_id(package) log.getLogger().info("K means prediciting. Tea time") X = package.any_inputs_dict["SKX"] env = package.dependencies_dict["env"] test_range = env.config["ml_instructions"] ["silhouette_range"].split(",") reporting_count = env.config.getint("ml_instructions","sklearn_kmeans_term_per_cluster_reporting_count") Xarray = X.toarray() silhouette_results = _silhouette(Xarray,test_range) cluster_count_tuple = max(silhouette_results, key=lambda x:x[1]) skdict = package.any_inputs_dict["SKdict"] kmeans = KMeans(n_clusters=cluster_count_tuple[0], random_state=10) kmeans.fit_predict(Xarray) centers = kmeans.cluster_centers_.argsort()[:, ::-1] centroid_list = [] centroid_list.append(["cluster","1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16"]) for i in range(cluster_count_tuple[0]): row_list = [i] for ind in centers[i, :reporting_count]: row_list.append(skdict[ind]) centroid_list.append(row_list) cluster_list = [] cluster_list.append(["cluster","sentence"]) package.any_analysis_dict[analysis_id + "_top_terms"] = centroid_list package.any_inputs_dict["kmeans_top_terms_key"] = analysis_id + "_top_terms" package.log_stage("Kmeans Clustering, no repeats\nSilhouette : " + str(silhouette_results) + "\nCluster count : " + str(cluster_count_tuple)) return package
def pick_pipeline():
    pipeline_name = env.config["pipeline_instructions"]["pipeline_name"]
    log.getLogger().info(pipeline_name)
    if pipeline_name == "gensim_lda":
        return _gensim_lda_steps
    elif pipeline_name == "gensim_lda_by_subset":
        return _gensim_lda_by_subset_steps
    elif pipeline_name == "sklearn_lda":
        return _sklearn_lda_steps
    elif pipeline_name == "lda_topic_comparator":
        return _lda_topic_comparator_steps
    elif pipeline_name == 'save_as_csv':
        return _save_as_csv
    elif pipeline_name == '_job_integrity_analysis':
        return _job_integrity_analysis
    elif pipeline_name == '_group_by_column':
        return _group_by_column
    elif pipeline_name == '_rake':
        return _rake
    else:
        log.getLogger().warning(str(pipeline_name) + " is invalid. Please configure tools.ini and create a relevant list of steps within this script")
        return []
def run_pipeline(package: merm_model.PipelinePackage): log.getLogger().warning("------- STARTING PIPELINE -------") #create factory factory = package.dependencies_dict["pipe_process"].PipelineFactory() # specify steps pipeline_steps = pick_pipeline() log.getLogger().info(str(pipeline_steps)) pipeline_steps.sort(key=lambda tup: tup[0]) # ...and we're off to the races :) for step_tuple in pipeline_steps: if env.continue_run() == True: package = factory.next_step(step_tuple[1], package) else: log.getLogger().warning("Continue run is FALSE") log.getLogger().info("------- PIPELINE COMPLETED -------") # Post pipeline; This is where the data is no longer changing. Rather, the data is ready # for functional application. log.getLogger().warning("------- POST PROCESS APPLICATION -------") if env.continue_run() == True: post_process.triage(package)
def lemmatize_tokens(corpora_list: List[merm_model.LinkedDocument], stop_words: List[str]):
    nlp = spacy.load('en_core_web_sm')  # note: loaded but not used below; lemmatization is done with WordNet
    stoplist = stop_words
    lemmatized_corpus = []
    iter_count = 0
    lemmatizer = WordNetLemmatizer()
    # log.getLogger().info("Lemmatizing corpus. This can be slow.")
    for doc in corpora_list:
        lemmatized_text = []
        for word in doc.tokens:
            # print("word: " + word)
            lemmatized_word = lemmatizer.lemmatize(word)
            if lemmatized_word is not None:
                cleanword = text_utils.clean_string_for_tokenizing(lemmatized_word)
                if cleanword not in stoplist and len(cleanword) > 1 and not text_utils.hasNumbers(cleanword):
                    # print(cleanword)
                    lemmatized_text.append(cleanword)
        doc.tokens = lemmatized_text
        lemmatized_corpus.append(doc)
        iter_count += 1
        if env.test_env() == True and iter_count > env.test_env_doc_processing_count():
            log.getLogger().info("DEV MODE: Breaking loop here")
            break
    return lemmatized_corpus
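
# text_utils.hasNumbers is not shown; it most likely just checks for digit characters.
# A sketch of that helper (illustrative only).
def has_numbers_sketch(token: str) -> bool:
    return any(ch.isdigit() for ch in token)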
def col_names(df, df_name=""): colNames = df.columns.values cnstr = "" for cn in colNames: cnstr = str(cnstr) + "\n" + str(cn) log.getLogger().info(df_name + " Column Names: " + cnstr + "\n")
def create_and_register_index(index_name: str, body_json):
    try:
        es = connectToES()
        es.indices.create(index=index_name, body=body_json)
    except Exception as e:
        s = str(e)
        log.getLogger().error("Could not create index. " + s)
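
# Hypothetical call with a minimal index body; the field names and settings are
# illustrative, not the project's actual mapping.
example_body = {
    "settings": {"number_of_shards": 1},
    "mappings": {"properties": {"text": {"type": "text"}}},
}
create_and_register_index("example_index", example_body)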
def run_post_process(package: merm_model.PipelinePackage): log.getLogger().info("run_post_process: Gensim LDA Report") report_string = "" report_sentences = env.config.getboolean( 'ml_instructions', 'gensim_lda_report_sentence_level') csv_list_of_lists = [] csv_list_of_lists.append(["index_name", "topic_id", "term", "weight"]) for topicid, topiclist in package.any_analysis().items(): report_string += "\n\nTOPIC:" + str(topicid) + "\n" if report_sentences == True: corpus_as_sentences = break_corpus_as_sentences( package.linked_document_list) for entry in topiclist: report_string += str(entry[0]) report_string += "\t\t\t" report_string += str(entry[1]) report_string += "\n" csv_list_of_lists.append([ package.any_analysis_dict["provider"], topicid, entry[0], entry[1] ]) if report_sentences == True: salient_sentences = find_salient_sentences( topiclist, corpus_as_sentences) report_string += "\n\nSALIENT_SENTENCES\n" for sentence in salient_sentences: report_string += sentence + "\n" log.getReportLogger().info(report_string) _save_topic_model(package) _save_csv(csv_list_of_lists, "lda_topics_toplevel")
def perform(self, package: merm_model.PipelinePackage): analysis_id = self._analysis_id(package) log.getLogger().info("K means prediciting. Tea time") X = package.any_inputs_dict["SKX"] env = package.dependencies_dict["env"] test_range = env.config["ml_instructions"] ["silhouette_range"].split(",") Xarray = X.toarray() silhouette_results = _silhouette(Xarray,test_range) cluster_count_tuple = max(silhouette_results, key=lambda x:x[1]) y = package.any_inputs_dict["SKY"] skdict = package.any_inputs_dict["SKdict"] cluster = AgglomerativeClustering(n_clusters=cluster_count_tuple[0], affinity='euclidean', linkage='ward') result = cluster.fit_predict(X.toarray()) labels = cluster.labels_ cluster_list = [] for j in range(labels.shape[0]): row_list = [] sentence = package.linked_document_list[j].raw cluster = labels[j] row_list.append(cluster) row_list.append(sentence) cluster_list.append(row_list) cluster_list package.any_analysis_dict[analysis_id+"_result"] = cluster_list package.log_stage("Agglomerative Clustering\nSilhouette : " + str(silhouette_results) + "\nCluster count : " + str(cluster_count_tuple)) return package
def delete_index(index_name):
    try:
        es = connectToES()
        es.indices.delete(index=index_name, ignore=[400, 404])
    except Exception as e:
        msg = "WARN: " + str(e)
        log.getLogger().error(msg)
def _process_major_final(package):
    aggregated_majors_path = env.config["local_data"]["aggregated_majors_filepath"]
    with open(aggregated_majors_path) as json1_file:
        aggregated_majors_dict = json.loads(json1_file.read())

    for index, row in package.corpus.iterrows():
        majorFinal = row["majorFinal"]
        if majorFinal is None:
            areas_of_study = row["areasOfStudy"]
            if len(areas_of_study) > 0:
                areasOfStudyList = areas_of_study.split(",")
                for s in areasOfStudyList:
                    supper = s.upper()
                    if supper in aggregated_majors_dict and majorFinal is None:
                        # Backfill the major from the aggregated-majors lookup and push the update to Elasticsearch.
                        major_final_from_file = aggregated_majors_dict[supper]
                        package.corpus.loc[index, "majorFinal"] = major_final_from_file
                        log.getLogger().info(major_final_from_file)
                        log.getLogger().info("added to df: " + str(package.corpus.loc[index, "majorFinal"]))
                        majorFinal = major_final_from_file
                        doc_id = row["id"]
                        _generate_json_and_dispatch(doc_id, row["indexname"], major_final_from_file)
def triage(package: merm_model.PipelinePackage):
    instructions = env.config["pipeline_instructions"]["post_process"]
    instruction_list = instructions.split(",")
    for instruction in instruction_list:
        if instruction == "tfidf_partof_sentence_breakout":
            tfidf_breakout.run_post_process(package)
        elif instruction == "page_views_confluence":
            page_view_update.run_post_process(package)
        elif instruction == "gensim_lda_report_by_subset":
            gensim_lda_report_by_subset.run_post_process(package)
        elif instruction == "gensim_lda_report":
            gensim_lda_report.run_post_process(package)
        elif instruction == "tfidf_log_text_detector":
            log_detector.run_post_process(package)
        elif instruction == "gensim_lda_report_topic_similarity":
            gensim_similarity_report.run_post_process(package)
        elif instruction == "save_dictionaries_to_file":
            save_dictionaries_to_file.run_post_process(package)
        elif instruction == "major_analysis":
            major_analysis.run_post_process(package)
        elif instruction == "rake":
            rake.run_post_process(package)
        elif instruction == "none":
            log.getLogger().info("Nothing to do. No post-process assigned.")
def _do_glove(self, package, cooccurrence_dict, dimensions, alpha, x_max, vocab):
    glove_start = time.time()
    model = glove.Glove(cooccurrence_dict, d=dimensions, alpha=alpha, x_max=x_max)
    glove_time = (time.time() - glove_start)
    log.getLogger().info("glove_time " + str(glove_time))

    glove_train_start = time.time()
    model.train(batch_size=200, workers=9)
    glove_train_time = (time.time() - glove_train_start)
    log.getLogger().info("glove_train_time " + str(glove_train_time))

    glove_list = self.output_format(model.W, vocab)
    glove_output_key = str(dimensions) + "d_" + str(x_max) + "_" + str(alpha) + "_glove_output"

    # Keep a comma-separated registry of every GloVe output key produced so far.
    if "glove_output_key" in package.any_inputs_dict.keys():
        package.any_inputs_dict["glove_output_key"] = package.any_inputs_dict["glove_output_key"] + "," + glove_output_key
    else:
        package.any_inputs_dict["glove_output_key"] = glove_output_key
    package.any_analysis_dict[glove_output_key] = glove_list
    package.any_analysis_dict["glove_vocab"] = vocab
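
# output_format is not shown. A plausible sketch that pairs each vocabulary word with
# its row of the trained embedding matrix W, assuming vocab maps word -> row index
# (illustrative only).
def output_format_sketch(W, vocab):
    return [(word, W[idx].tolist()) for word, idx in vocab.items()]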
def _generate_json_and_dispatch(salient_corpus_map: Dict, retry_count=0):
    try:
        es = es_conn.connectToES()
        total_sentences = 0
        for key, value in salient_corpus_map.items():
            sentence_list = value[1]
            docid = key
            total_sentences = total_sentences + len(sentence_list)
            if len(sentence_list) > 0:
                linked_doc = _extract_linked_doc_from_list(value[1])
                index_name = linked_doc.index_name
                log.getLogger().debug("Dispatching: " + str(docid) + " | " + index_name)
                es.index(index=index_name + index_suffix,
                         doc_type='_doc',
                         id=key,
                         body=_generate_json(linked_doc, _convert_linkeddoclist_to_string(sentence_list)))
    except Exception as e:
        # Retry on timeouts, up to ten attempts.
        retry_count = retry_count + 1
        msg = "WARN: " + str(e)
        log.getLogger().error(msg)
        if "time" in msg.lower() and retry_count < 10:
            _generate_json_and_dispatch(salient_corpus_map, retry_count)
def delete_index(index_name):
    try:
        es_conn.delete_index(index_name)
        time.sleep(5)
    except Exception as e:
        msg = "WARN: " + str(e)
        log.getLogger().error(msg)
def _extract_from_all_providers(es, pipe, dependencies_dict):
    providers = dependencies_dict["env"].config["extract_instructions"]["all_providers"]
    providers_list = providers.split(",")
    log.getLogger().debug("Extracting from all providers: " + str(providers_list))
    for provider in providers_list:
        _extract_from_one_provider(es, provider, pipe, dependencies_dict)
def run_post_process(package: merm_model.PipelinePackage):
    if env.continue_run():
        tfidf_top_terms: List[List[Tuple[str, float]]] = package.any_analysis()
        _validate_corpus(tfidf_top_terms, package.linked_document_list)
        _create_spaces()
        log.getLogger().info("Corpus size: " + str(len(package.linked_document_list)))
        _iterate_corpus(package)
def _validate_corpus(tfidf_top_terms: List[List[Tuple[str, float]]], linked_doc_list: List[merm_model.LinkedDocument]):
    # Every top term must appear in the tokens of the document it was derived from.
    docidx = 0
    for terms in tfidf_top_terms:
        linked_doc = linked_doc_list[docidx]
        for word, freq in terms:
            if word not in linked_doc.tokens:
                log.getLogger().error("NOT FOUND " + word)
                raise Exception("NOT FOUND " + word + ". NLP corpus out of sync with source corpus")
        docidx = docidx + 1