def perform(self, package: merm_model.PipelinePackage):
    """Build a combined stop-word list (global + dynamic high/low frequency) and store it.

    Requires "corpus_word_frequency" in the analysis dict; computes it first if absent.
    """
    colutils = package.dependencies_dict["colutils"]
    # Make sure corpus-wide word frequencies exist before thresholding.
    if "corpus_word_frequency" not in package.any_analysis_dict:
        package = LinkedDocCorpusWordCount().perform(package)
    corpus_word_frequency = package.any_analysis_dict["corpus_word_frequency"]
    frequencies = list(corpus_word_frequency.values())
    total_word_count = str(sum(frequencies))
    unique_word_count = str(len(corpus_word_frequency))
    stop_words_global = package.dependencies_dict["utils"]._stop_word_list_generator(package)
    # Each threshold helper returns (word_list, boundary_frequency).
    top_tuple = self._top_threshold(package)
    stop_words_top = top_tuple[0]
    lowest_freq_at_top = top_tuple[1]
    bottom_tuple = self._bottom_threshold(package)
    stop_words_bottom = bottom_tuple[0]
    max_freq_at_bottom = bottom_tuple[1]
    stop_words = stop_words_bottom + stop_words_top + stop_words_global
    analysis_key = colutils.incrementing_key("stop_words", package.any_analysis_dict)
    package.any_analysis_dict[analysis_key] = stop_words
    self.save_to_file(stop_words, package)
    package.log_stage("Generated stop words. \nGlobal stop word count: "
                      + str(len(stop_words_global))
                      + "\nHigh frequency dynamically generated stop words: "
                      + str(len(stop_words_top))
                      + "\nLow frequency dynamically generated stop words: "
                      + str(len(stop_words_bottom))
                      + "\nLowest frequency: " + str(max_freq_at_bottom)
                      + "\nHighest frequency: " + str(lowest_freq_at_top)
                      + "\nOriginal word unique count is " + str(unique_word_count)
                      + "\nTotal word count is " + str(total_word_count))
    return package
def perform(self, package: merm_model.PipelinePackage):
    """Compute corpus-wide word frequencies and log summary statistics."""
    docs = package.linked_document_list
    # Input may be a plain list of docs or a dict of grouped docs.
    if type(docs) is list:
        corpus_word_frequency = self.count_as_doc_list(docs)
    else:
        corpus_word_frequency = self.count_as_doc_dict(docs)
    package.any_analysis_dict["corpus_word_frequency"] = corpus_word_frequency
    frequencies = list(corpus_word_frequency.values())
    count = str(len(corpus_word_frequency))
    total_word_count = str(sum(frequencies))
    mx = str(max(frequencies))
    median = str(stats.median(frequencies))
    stdev = str(stats.stdev(frequencies))
    doc_count = str(len(docs))
    package.log_stage("\nDocument count: " + doc_count +
                      "\nTotal_word count: " + total_word_count +
                      "\nUnique_word count: " + count +
                      "\nMax Frequency: " + mx +
                      "\nMedian Frequency: " + median +
                      "\nstdev: " + stdev)
    return package
def perform(self, package: merm_model.PipelinePackage):
    """Count rows of the corpus dataframe grouped by a configured column."""
    df = package.corpus
    log.getLogger().info("Shape of DF: " + str(df.shape))
    column = package.dependencies_dict["env"].config["ml_instructions"]["df_groupby_column"]
    groupby_dict = {}
    processed = 0
    for _, row in df.iterrows():
        processed += 1
        # Progress dot every 1000 rows.
        if processed % 1000 == 0:
            sys.stdout.write(".")
        value = row[column]
        groupby_dict[value] = groupby_dict.get(value, 0) + 1
    package.log_stage("Broke a pandas data frame into a dict of data grouped by " + str(column))
    package.any_analysis_dict["group_by_" + column] = groupby_dict
    return package
def perform(self, package: merm_model.PipelinePackage):
    """Run KMeans with a silhouette-selected cluster count; store top terms per cluster.

    Fix: removed a dead local `cluster_list` that was created with a header row
    but never populated or written to the analysis dict.
    """
    analysis_id = self._analysis_id(package)
    log.getLogger().info("K means prediciting. Tea time")
    X = package.any_inputs_dict["SKX"]
    env = package.dependencies_dict["env"]
    test_range = env.config["ml_instructions"]["silhouette_range"].split(",")
    reporting_count = env.config.getint("ml_instructions", "sklearn_kmeans_term_per_cluster_reporting_count")
    Xarray = X.toarray()
    silhouette_results = _silhouette(Xarray, test_range)
    # Choose the cluster count whose silhouette score is highest.
    cluster_count_tuple = max(silhouette_results, key=lambda x: x[1])
    skdict = package.any_inputs_dict["SKdict"]
    kmeans = KMeans(n_clusters=cluster_count_tuple[0], random_state=10)
    kmeans.fit_predict(Xarray)
    # Feature indices per centroid, sorted by descending weight.
    centers = kmeans.cluster_centers_.argsort()[:, ::-1]
    centroid_list = []
    # NOTE(review): header hard-codes 16 term columns — assumes
    # reporting_count <= 16; confirm against configuration.
    centroid_list.append(["cluster", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16"])
    for i in range(cluster_count_tuple[0]):
        row_list = [i]
        for ind in centers[i, :reporting_count]:
            row_list.append(skdict[ind])
        centroid_list.append(row_list)
    package.any_analysis_dict[analysis_id + "_top_terms"] = centroid_list
    package.any_inputs_dict["kmeans_top_terms_key"] = analysis_id + "_top_terms"
    package.log_stage("Kmeans Clustering, no repeats\nSilhouette : " + str(silhouette_results) + "\nCluster count : " + str(cluster_count_tuple))
    return package
def perform(self, package: merm_model.PipelinePackage):
    """Count job titles: once for rows missing a final major, once for all rows.

    Fix: the original reused a single dict for both passes, so
    "no_major_jobs_count" and "jobs_count" referenced the SAME object and the
    second pass accumulated on top of the first pass's counts. Each count now
    gets its own dict (matching the two-dict pattern used by the
    areas-of-study counter in this file).
    """
    # Guard: this step only applies to job-provider corpora. ("classs" sic —
    # message text preserved from original.)
    if ("job" not in package.any_analysis_dict["provider"]):
        raise Exception("This classs will not work on " + str(package.any_analysis_dict["provider"]))
    df = package.corpus
    log.getLogger().info("Shape of DF: " + str(df.shape))
    no_major_jobs = {}
    for index, row in df.iterrows():
        if row["majorFinal"] is None:
            job = row["jobFinal"]
            no_major_jobs[job] = no_major_jobs.get(job, 0) + 1
    package.any_analysis_dict["no_major_jobs_count"] = no_major_jobs
    all_jobs = {}
    for index, row in df.iterrows():
        job = row["jobFinal"]
        all_jobs[job] = all_jobs.get(job, 0) + 1
    package.any_analysis_dict["jobs_count"] = all_jobs
    return package
def _run_lda(self, topic_count, report_word_count, permitted_overlap, package:merm_model.PipelinePackage):
    """Train an LDA model; if topics overlap too much, strip the overlapping
    words as stop words and recurse until overlap is within the permitted limit.

    Results are stored in the analysis dict under keys derived from
    lda_analysis_key(package): raw topics, "_friendly" string form, and
    "_topic_overlap".
    """
    topic_dict = {}
    topic_dict_friendly = {}
    lda_model = gensim.models.ldamodel.LdaModel(corpus=package.corpus, id2word=package.dict, num_topics=topic_count, update_every=1, alpha='auto', per_word_topics=False, iterations=100)
    topics = lda_model.show_topics(formatted=False, num_words=report_word_count)
    for index, topic in topics:
        # print('Topic: {} \nWords: {}'.format(index, [w[0] for w in topic]))
        words_for_topic = []
        words_for_topic_friendly = []
        # w is a (word, weight) pair from gensim.
        for w in topic:
            words_for_topic.append((w[0], w[1]))
            words_for_topic_friendly.append(str(w[0]) + "," + str(w[1]))
        topic_dict[index] = words_for_topic
        topic_dict_friendly[index] = words_for_topic_friendly
    topic_overlap = self._topic_overlap(topic_dict)
    log.getLogger().info(str(topic_overlap))
    stop_words = self._dynamic_stop_words(topic_overlap, permitted_overlap)
    # Too many shared words between topics: remove them, rebuild the corpus,
    # and rerun LDA recursively until the overlap is acceptable.
    if len(stop_words) > permitted_overlap:
        log.getLogger().info("\n**********\nRerunning LDA after removing " + str(len(stop_words)) + " words")
        package = self._remove_stop_words(stop_words,package)
        package = self._rebuild_corpus(package)
        return self._run_lda(topic_count,report_word_count,permitted_overlap,package)
    package.any_analysis_dict[lda_analysis_key(package) + "_topic_overlap"] = topic_overlap
    package.any_analysis_dict[lda_analysis_key(package)] = topic_dict
    package.any_analysis_dict[lda_analysis_key(package) + "_friendly"] = topic_dict_friendly
    return package
def _set_model(self, package: merm_model.PipelinePackage, lda_models_by_group):
    """Attach per-group LDA models to the package's model slot.

    If the slot already holds a dict, merge the new models in; otherwise
    replace the slot wholesale.
    """
    if type(package.model) is dict:
        package.model.update(lda_models_by_group)
    else:
        package.model = lda_models_by_group
def perform(self, package: merm_model.PipelinePackage):
    """Count areas of study for rows lacking a final major, then count all majors."""
    # Guard: only valid for job-provider corpora.
    if ("job" not in package.any_analysis_dict["provider"]):
        raise Exception("This classs will not work on " + str(package.any_analysis_dict["provider"]))
    df = package.corpus
    log.getLogger().info("Shape of DF: " + str(df.shape))
    undefined_counts = {}
    for _, row in df.iterrows():
        if row["majorFinal"] is None:
            areas_of_study = row["areasOfStudy"]
            # areasOfStudy is a comma-separated string; skip empties.
            if len(areas_of_study) > 0:
                for area in areas_of_study.split(","):
                    undefined_counts[area] = undefined_counts.get(area, 0) + 1
    package.any_analysis_dict["undefined_areas_of_study_count"] = undefined_counts
    major_counts = {}
    for _, row in df.iterrows():
        major = row["majorFinal"]
        major_counts[major] = major_counts.get(major, 0) + 1
    package.any_analysis_dict["areas_of_study_count"] = major_counts
    return package
def perform(self, package: merm_model.PipelinePackage):
    """Encode LDA topic terms to integer ids and reformat subset/top-level tables.

    Fix: the rebuilt PipelinePackage omitted ``any_inputs_dict`` — sibling
    steps in this file construct PipelinePackage as (model, corpus, dict,
    linked_docs, any_analysis_dict, any_inputs_dict, dependencies_dict), so
    the inputs dict was silently dropped (or args were shifted). It is now
    carried through.
    """
    lda_topics_by_subset_raw = self.load_topics_by_subset(package, "dict")
    lda_topics_toplevel_raw = self.load_top_level_topics(package, "dict")
    # Shared word -> id mapping across both topic tables.
    word_to_id = self.build_dict(lda_topics_by_subset_raw, lda_topics_toplevel_raw)
    lda_topics_by_subset_raw_byrow = self.load_topics_by_subset(package, "records")
    lda_topics_toplevel_raw_byrow = self.load_top_level_topics(package, "records")
    lda_topics_by_subset_raw_byrow_coded = self.code_terms(lda_topics_by_subset_raw_byrow, word_to_id)
    lda_topics_toplevel_raw_byrow_coded = self.code_terms(lda_topics_toplevel_raw_byrow, word_to_id)
    lda_topics_by_subset_formatted = self.reformat_data(lda_topics_by_subset_raw_byrow_coded)
    lda_topics_toplevel_formatted = self.reformat_data(lda_topics_toplevel_raw_byrow_coded)
    package.any_analysis_dict["lda_topics_by_subset_formatted"] = lda_topics_by_subset_formatted
    package.any_analysis_dict["lda_topics_toplevel_formatted"] = lda_topics_toplevel_formatted
    # word_to_id becomes the package dictionary for downstream steps.
    return merm_model.PipelinePackage(package.model, package.corpus, word_to_id,
                                      package.linked_document_list,
                                      package.any_analysis_dict,
                                      package.any_inputs_dict,
                                      package.dependencies_dict)
def perform(self, package: merm_model.PipelinePackage):
    """Agglomerative (Ward) clustering with a silhouette-selected cluster count.

    Fixes: removed unused locals (`y`, `result`) and a bare no-op
    `cluster_list` expression statement; renamed the clusterer so it is no
    longer shadowed by the per-row label variable inside the loop.
    """
    analysis_id = self._analysis_id(package)
    log.getLogger().info("K means prediciting. Tea time")
    X = package.any_inputs_dict["SKX"]
    env = package.dependencies_dict["env"]
    test_range = env.config["ml_instructions"]["silhouette_range"].split(",")
    Xarray = X.toarray()
    silhouette_results = _silhouette(Xarray, test_range)
    # Best (cluster_count, score) pair by silhouette score.
    cluster_count_tuple = max(silhouette_results, key=lambda x: x[1])
    clusterer = AgglomerativeClustering(n_clusters=cluster_count_tuple[0], affinity='euclidean', linkage='ward')
    clusterer.fit_predict(X.toarray())
    labels = clusterer.labels_
    # Pair each sentence with its assigned cluster label.
    cluster_list = []
    for j in range(labels.shape[0]):
        sentence = package.linked_document_list[j].raw
        cluster_list.append([labels[j], sentence])
    package.any_analysis_dict[analysis_id + "_result"] = cluster_list
    package.log_stage("Agglomerative Clustering\nSilhouette : " + str(silhouette_results) + "\nCluster count : " + str(cluster_count_tuple))
    return package
def perform(self, package: data_models.PipelinePackage):
    """Build a Gensim dictionary and bag-of-words corpus from document tokens."""
    linked_doc_list = package.linked_document_list
    log.getLogger().info(
        "Converting corpora as bag of words. Input format is List[List[str]]. Output is Gensim Dictionary"
    )
    log.getLogger().info("Corpus size: " + str(len(package.linked_document_list)))
    token_lists = [doc.tokens for doc in linked_doc_list]
    dictionary = corpora.Dictionary(token_lists)
    log.getLogger().info("Incoming doc count: " + str(len(linked_doc_list)))
    # One BoW vector per document.
    corpus = [dictionary.doc2bow(tokens) for tokens in token_lists]
    log.getLogger().info("Feature count: " + str(len(dictionary.id2token)))
    package.log_stage(
        "Converted the corpus into a Gensim dictionary (i.e., bag of words)"
    )
    # Model slot is reset to None; dictionary rides in the dict slot.
    return data_models.PipelinePackage(None, corpus, dictionary, linked_doc_list,
                                       package.any_analysis_dict,
                                       package.any_inputs_dict,
                                       package.dependencies_dict)
def perform(self, package: merm_model.PipelinePackage):
    """Run text rank per document using GloVe embeddings; store ranked sentences."""
    utils = package.dependencies_dict["utils"]
    colutils = package.dependencies_dict["colutils"]
    env = package.dependencies_dict["env"]
    embeddings_file = env.config["ml_instructions"]["text_rank_embeddings_file"]
    dimensions = env.config.getint("ml_instructions", "glove_dimensions")
    word_embeddings_list = self._word_embeddings(embeddings_file)
    # Sentences grouped by their parent document id.
    tokenized_sentences_by_doc = utils.corpus_as_tokenized_sentence_linked_doc_list_grouped_by_doc(package, True)
    log.getLogger().info("we have " + str(len(tokenized_sentences_by_doc)) + " docs")
    rank_by_dict = self._prep_rank_by_doc_dict(package)
    progress = 0
    for docid, sentences in tokenized_sentences_by_doc.items():
        sentence_by_rank_dict = self.rank_by_document(sentences, word_embeddings_list, package, dimensions)
        # Append this doc's sentence for each rank slot.
        for rank, ranked_sentence in sentence_by_rank_dict.items():
            rank_by_dict[rank].append([dimensions, docid, ranked_sentence])
        if progress % 100 == 0:
            print(progress)
        progress += 1
    analysis_key = colutils.incrementing_key("text_rank", package.any_analysis_dict)
    package.any_analysis_dict[analysis_key] = rank_by_dict
    package.log_stage("Conducting text rank. Total document count is "
                      + str(len(package.linked_document_list))
                      + ". For each document the top " + str(len(list(rank_by_dict.keys())))
                      + " ranked sentences were captured."
                      + "\nGlove dimension count: " + str(dimensions))
    return package
def perform(self, package: merm_model.PipelinePackage):
    """Merge linked documents that share a configured grouping field.

    Raw texts are joined with ". " and token lists concatenated; doubled
    periods are cleaned afterwards.
    """
    env = package.dependencies_dict["env"]
    utils = package.dependencies_dict["utils"]
    original_count = len(package.linked_document_list)
    merge_by = env.config["ml_instructions"]["merge_docs_field"]
    merged_docs_dict = {}
    for sub_doc in package.linked_document_list:
        # Grouping key comes from the configured field; default is `space`.
        if merge_by == "groupedBy":
            key = sub_doc.groupedBy
        elif merge_by == "uid":
            key = sub_doc.uid
        else:
            key = sub_doc.space
        if key in merged_docs_dict:
            existing = merged_docs_dict[key]
            existing.raw = existing.raw + ". " + sub_doc.raw
            existing.tokens = existing.tokens + sub_doc.tokens
        else:
            merged_docs_dict[key] = sub_doc
    new_linked_doc_list = list(merged_docs_dict.values())
    for full_doc in new_linked_doc_list:
        full_doc.raw = utils.cleanstring_doubled_period(full_doc.raw)
    package.linked_document_list = new_linked_doc_list
    package.log_stage("Merged documents by " + str(merge_by)
                      + " tokens. \n Original doc count: " + str(original_count)
                      + "\nNew doc count: " + str(len(package.linked_document_list)))
    return package
def perform(self, package: merm_model.PipelinePackage):
    """Lemmatize all linked-document tokens via the syntax helper."""
    deps = package.dependencies_dict
    text_utils = deps["utils"]
    # Lemmatization mutates the docs in place; stop words are excluded.
    deps["syntax"].lemmatize_docs(package.linked_document_list,
                                  text_utils.standard_stop_words())
    package.log_stage("lemmatized tokens")
    return package
def perform(self, package: merm_model.PipelinePackage):
    """Rebuild each document's raw text by concatenating its tokens."""
    for linked_doc in package.linked_document_list:
        # Every token is followed by one space (trailing space kept, matching
        # the original concatenation behavior).
        linked_doc.raw = "".join(token + " " for token in linked_doc.tokens)
    package.log_stage("Converted tokens to concatenated strings")
    return package
def perform(self, package: merm_model.PipelinePackage):
    """Restore previously cached linked documents and log the count change."""
    before = len(package.linked_document_list)
    package.uncache_linked_docs()
    after = len(package.linked_document_list)
    package.log_stage("Original linked doc count: " + str(before)
                      + "Current linked doc count: " + str(after))
    return package
def perform(self, package: merm_model.PipelinePackage):
    # Persist the ElasticSearch corpus dataframe to the configured CSV path.
    # NOTE(review): `env` is not defined in this method nor passed in —
    # presumably a module-level global in this file; confirm it is assigned
    # at import time, otherwise this raises NameError.
    package.corpus.to_csv(
        env.config['job_instructions']['es_file_location'], index=False)
    log.getLogger().info(
        "Saved ElasticSearch Data as CSV at: " + env.config['job_instructions']['es_file_location'])
    package.log_stage("Saved ElasticSearch Data as CSV at: " + env.config['job_instructions']['es_file_location'])
    return package
def perform(self, package: merm_model.PipelinePackage):
    """Train a random-forest classifier on prepared matrices and stash all
    evaluation artifacts in the analysis dict for downstream reporting."""
    classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
    test_proportion = package.dependencies_dict["env"].config.getfloat(
        "ml_instructions", "rf_test_proportion")
    random_state = 0
    rf_categories = package.any_inputs_dict["SKcategories"]
    X = package.any_inputs_dict["SKX"]
    y = package.any_inputs_dict["SKY"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_proportion, random_state=random_state)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    report = pd.DataFrame(confusion_matrix(y_test, y_pred)).values.tolist()
    report_string = self._report_string(report)
    analysis_id = self._analysis_id(package)
    package.any_inputs_dict["sk_last_id"] = analysis_id
    # Everything below is consumed by later sentence-report stages.
    stash = package.any_analysis_dict
    stash[analysis_id + "_rfclassifier"] = classifier
    stash[analysis_id + "_confusion"] = report
    stash[analysis_id + "_ypred"] = y_pred
    stash[analysis_id + "_ytest"] = y_test
    stash[analysis_id + "_Xtest"] = X_test
    stash[analysis_id + "_Ycategories"] = rf_categories
    package.log_stage("\nTraining doc count: " + str(X_train.shape[0])
                      + "\nTraining feature count: " + str(X_train.shape[1])
                      + "\nTestTrain split:" + str(test_proportion)
                      + "\nRF confusion matrix:\n" + report_string
                      + "\nclassification_report:\n" + str(classification_report(y_test, y_pred))
                      + "Accuracy:\n" + str(accuracy_score(y_test, y_pred)))
    return package
def perform(self, package: merm_model.PipelinePackage):
    """Advance and log the pipeline's loop counter."""
    inputs = package.any_inputs_dict
    current_loop = inputs.get("current_loop", 0)
    env = package.dependencies_dict["env"]
    inputs["current_loop"] = current_loop + 1
    loop_count = env.config.getint("pipeline_instructions", "loop_count")
    inputs["loop_count"] = loop_count
    package.log_stage("Current loop: " + str(current_loop)
                      + "\nTotal loops: " + str(loop_count))
    return package
def perform(self, package: merm_model.PipelinePackage):
    """Porter-stem every token of every linked document, in place."""
    from nltk.stem import PorterStemmer
    stemmer = PorterStemmer()
    for linked_doc in package.linked_document_list:
        linked_doc.tokens = [stemmer.stem(token) for token in linked_doc.tokens]
    package.log_stage("Stemmed tokens")
    return package
def perform(self, package: merm_model.PipelinePackage):
    """Replace document-level linked docs with sentence-level linked docs."""
    before = len(package.linked_document_list)
    text_utils = package.dependencies_dict["utils"]
    package.linked_document_list = text_utils.corpus_as_tokenized_sentence_linked_doc_list(package)
    package.log_stage("LinkedDocToLinkedSentences: Original linked doc count: "
                      + str(before) + "Current linked doc count: "
                      + str(len(package.linked_document_list)))
    return package
def perform(self, package: data_models.PipelinePackage):
    """Process GloVe loadings for every configured output key and store variance."""
    env = package.dependencies_dict["env"]
    report_count = env.config.getint("ml_instructions", "glove_loadings_count_to_report")
    # "glove_output_key" holds a comma-separated list of keys.
    for output_key in package.any_inputs_dict["glove_output_key"].split(","):
        self._process_loadings(package, output_key, report_count)
    package.log_stage("GloveLoadings: ")
    package.any_analysis_dict["glove_variance"] = self.variance_dict
    return package
def _analysis_id(self, package: merm_model.PipelinePackage):
    """Build a unique kmeans analysis id and bump the iteration counter.

    Id shape: km1_<iteration>_<category count>_<label count>_<microsecond suffix>.
    """
    suffix = str(datetime.now().microsecond)[-4:]
    inputs = package.any_inputs_dict
    # First call starts at 0; subsequent calls increment.
    if "kmeans_iteration_count" in inputs:
        iteration = inputs["kmeans_iteration_count"] + 1
    else:
        iteration = 0
    inputs["kmeans_iteration_count"] = iteration
    categories = inputs["SKcategories"]
    category_count = len(list(categories.keys()))
    return ("km1_" + str(iteration) + "_" + str(category_count) + "_"
            + str(len(inputs["SKY"])) + "_" + suffix)
def perform(self, package: merm_model.PipelinePackage): mfst = package.dependencies_dict["factory"].PipelineManifest.manifest #breaks corpus into subsets grouped_doc_package = mfst["SubsetData"].perform(package) if ("ackage" in type(grouped_doc_package).__name__): log.getLogger().info("STRUCTURE after SubsetData:" + grouped_doc_package.structure()) else: log.getLogger().warning( "The return type is not of type PipelinePackage. THIS IS BAD PRACTICE :(" ) grouped_linked_docs = grouped_doc_package.linked_document_list analysis_by_group_rake = {} analysis_by_group_text_rank = {} analysis_by_group_noun_phrase = {} minimum_doc_count = package.dependencies_dict["env"].config.getint( 'ml_instructions', 'minimum_doc_count') log_string = "\n======================\nSubset Analysis for text rank, rake and noun phrase.\n" for sub_corpus_name_untyped, doc_list in grouped_linked_docs.items(): sub_corpus_name = str(sub_corpus_name_untyped) if len(doc_list) > minimum_doc_count: package_one_group = merm_model.PipelinePackage( package.model, package.corpus, package.dict, doc_list, {}, package.any_inputs_dict, package.dependencies_dict) package_one_group.any_inputs_dict[ "corpus_name"] = sub_corpus_name package_one_group = self._analyze_subset( package_one_group, sub_corpus_name, mfst, doc_list) analysis_by_group_text_rank[ sub_corpus_name] = package_one_group.any_analysis_dict[ "text_rank_0"] log_string = log_string + package_one_group.stage_log() package.any_analysis_dict[ "text_rank_all_groups"] = analysis_by_group_text_rank package.any_analysis_dict["rake_all_groups"] = analysis_by_group_rake package.any_analysis_dict[ "noun_phrase_all_groups"] = analysis_by_group_noun_phrase new_package = merm_model.PipelinePackage(package.model, package.corpus, package.dict, grouped_linked_docs, package.any_analysis_dict, package.any_inputs_dict, package.dependencies_dict) new_package.log_stage(log_string) return new_package
def generate_linked_docs_unranked(package: merm_model.PipelinePackage, analysis_key):
    """Convert grouped sentences from an analysis entry into linked documents.

    Caches the current linked docs first, then replaces the package's
    linked_document_list with a dict of {group key: [linked docs]}.
    """
    package.cache_linked_docs()
    to_linked_doc = package.dependencies_dict["utils"].sentence_to_linked_doc
    linked_doc_dict = {}
    for key, sentence_list in package.any_analysis_dict[analysis_key].items():
        linked_doc_dict[key] = [to_linked_doc(sentence) for sentence in sentence_list]
    package.linked_document_list = linked_doc_dict
    return package
def next_step(self, task:str, package:merm_model.PipelinePackage):
    """Execute one named pipeline task, record it in history, and sanity-check
    the returned type."""
    self.step_count = self.step_count + 1
    log.getLogger().info("\n\nEntering " + task + " " + str(self.step_count) + "\n\n")
    new_task = factory.PipelineManifest.manifest[task]
    package = new_task.perform(package)
    package.any_inputs_dict["previous_task"] = task
    package.any_inputs_dict["history"].append(task)
    # Duck-check by class-name suffix; warn either way (original behavior).
    if "Package" in type(package).__name__:
        log.getLogger().warning("STRUCTURE after " + task + ": " + package.structure())
    else:
        log.getLogger().warning("The return type is not of type PipelinePackage. THIS IS BAD PRACTICE :(")
    return package
def _extract_from_providers_merge(es, providers, package: merm_model.PipelinePackage):
    """Pull documents from every ES index matching the given providers, merge
    them into one dataframe, and install it as the package corpus.

    NOTE(review): reconstructed from collapsed source — the statements after
    pd.concat are placed inside the non-empty branch because they reference
    complete_corpus_df; confirm against the original layout.
    """
    msg = "\n\n-------------------------\nPROVIDERS: " + str(
        providers) + "\n---------------------\n\n"
    log.getLogger().warning(msg)
    ignore_indices = package.dependencies_dict["env"].config[
        "extract_instructions"]["ignore_indices"]
    ignore_indices_list = ignore_indices.split(",")
    indices = es_conn.retrieve_index_registry()
    # Row cap for dev runs.
    limit = _dev_limit(package.dependencies_dict)
    count = 0
    df_per_space_list: List[DataFrame] = []
    for provider in providers:
        # Row counter resets per provider.
        count = 0
        for index_name in indices:
            if "@" in index_name:
                continue
            if index_name in ignore_indices_list:
                continue
            if count > limit:
                break
            if provider.strip() in index_name:
                df = _retrieve_index_content(es, index_name, provider, limit,
                                             package.dependencies_dict)
                if not df.empty:
                    log.getLogger().debug("Retrieved " + index_name +
                                          ": row count " + str(df.shape))
                    count = count + df.shape[0]
                    df_per_space_list.append(df)
    if len(df_per_space_list) > 0:
        complete_corpus_df = pd.concat(df_per_space_list, ignore_index=True)
        if True == _dev_bool(package.dependencies_dict):
            complete_corpus_df = complete_corpus_df.head(limit)
        #log.getLogger().info("\n\nExtraction Complete. Document count = " + str(complete_corpus_df[:5]))
        log.getLogger().info("complete_corpus_df shape: " + str(complete_corpus_df.shape))
        # NOTE(review): passes the last per-index df, not the merged frame —
        # presumably intentional for column-name logging; verify.
        dfu.col_names(df, "complete_corpus_df")
        msg = "\n\n>>>>>>>>>>>>>> Entering Pipeline For " + str(
            providers) + ">>>>>>>>>>\n\n"
        log.getLogger().info(msg)
        analysis_dict = {}
        analysis_dict["provider"] = str(providers)
        # Replaces any prior analysis dict wholesale.
        package.any_analysis_dict = analysis_dict
        package.corpus = complete_corpus_df
    return package
def perform(self, package: merm_model.PipelinePackage):
    """Collect test sentences that were predicted correctly or count as near
    misses, decoded back to words via the feature dictionary."""
    last_id = package.any_inputs_dict["sk_last_id"]
    y_test = package.any_analysis_dict[last_id + "_ytest"]
    y_pred = package.any_analysis_dict[last_id + "_ypred"]
    X_test = package.any_analysis_dict[last_id + "_Xtest"]
    rf_dict = package.any_inputs_dict["SKdict"]
    rf_categories = package.any_inputs_dict["SKcategories"]
    # Invert label -> id so ids map back to human-readable labels.
    inv_rf_categories = {v: k for k, v in rf_categories.items()}
    near_missies_dict = package.any_analysis_dict[last_id + "_near_misses_dict"]
    sentence_match_list = [["Actual", "Predicted", "Sentence", "Correct"]]
    for idx, major in enumerate(y_test):
        pred_major = y_pred[idx]
        match = bool(pred_major == major)
        # Mismatches are kept only when they qualify as near misses.
        if not match and not self._add_to_sentence_list(
                major, pred_major, inv_rf_categories, near_missies_dict):
            continue
        row_vector = X_test[[idx], :]
        # Rebuild the sentence from the sparse row's active feature indices.
        sentence_string = ""
        for word_idx in row_vector.indices:
            sentence_string = sentence_string + rf_dict[word_idx] + " "
        sentence_match_list.append([
            inv_rf_categories[major], inv_rf_categories[pred_major],
            sentence_string, match
        ])
    analysis_id = self._analysis_id(package)
    package.any_analysis_dict[analysis_id + "_sentences"] = sentence_match_list
    package.log_stage(
        "Found sentences that accurately predict each major or were near misses"
    )
    return package
def run_post_process(package: merm_model.PipelinePackage):
    """Write per-index LDA topic reports to the report logger, save the topic
    model, and export a CSV of (index, topic, term, weight) rows.

    NOTE(review): `env` is not defined locally — presumably a module-level
    global; confirm. Reconstructed nesting from collapsed source: the salient
    sentence block runs once per topic; verify against original layout.
    """
    log.getLogger().info("run_post_process")
    csv_list_of_lists = []
    csv_list_of_lists.append(["index_name", "topic_id", "term", "weight"])
    report_sentences = env.config.getboolean(
        'ml_instructions', 'gensim_lda_report_sentence_level')
    for idxname, topicdict in package.any_analysis().items():
        report_for_index = "\n\n\n+++++++++++++++++++\n\nReport for " + idxname + "\n\n"
        docs_list = package.linked_document_list[idxname]
        # Sentence-level reporting is optional (config-driven).
        if report_sentences == True:
            corpus_as_sentences = break_corpus_as_sentences(docs_list)
        report_for_index += "Corpus Size: " + str(len(docs_list)) + "\n"
        # Only report indexes with a non-trivial corpus.
        if len(docs_list) > 100:
            for topicid, topiclist in topicdict["default_analysis_key"].items():
                report_for_index += "\n\nTOPIC:" + str(topicid) + "\n"
                # entry is a (term, weight) pair.
                for entry in topiclist:
                    report_for_index += str(entry[0])
                    report_for_index += "\t\t\t"
                    report_for_index += str(entry[1])
                    report_for_index += "\n"
                    csv_list_of_lists.append(
                        [idxname, topicid, entry[0], entry[1]])
                if report_sentences == True:
                    salient_sentences = find_salient_sentences(
                        topiclist, corpus_as_sentences)
                    report_for_index += "\n\nSALIENT_SENTENCES\n"
                    for sentence in salient_sentences:
                        report_for_index += sentence + "\n"
        log.getReportLogger().info(report_for_index)
    _save_topic_model(package)
    _save_csv(csv_list_of_lists, "lda_analysis_by_subset")
def perform(self, package: merm_model.PipelinePackage):
    """Remove duplicate documents, keyed by raw text (last occurrence wins)."""
    before = len(package.linked_document_list)
    unique_by_raw = {doc.raw: doc for doc in package.linked_document_list}
    package.linked_document_list = list(unique_by_raw.values())
    package.log_stage("Before removing duplicates: " + str(before)
                      + "\nAfter removing duplicates: "
                      + str(len(package.linked_document_list)))
    return package