def perform(self, package: merm_model.PipelinePackage):
    mfst = package.dependencies_dict["factory"].PipelineManifest.manifest
    # Breaks corpus into subsets
    grouped_doc_package = mfst["SubsetData"].perform(package)
    # Loose duck-type check: matches any class whose name ends in "ackage"
    if "ackage" in type(grouped_doc_package).__name__:
        log.getLogger().info("STRUCTURE after SubsetData:" + grouped_doc_package.structure())
    else:
        log.getLogger().warning("The return type is not of type PipelinePackage. THIS IS BAD PRACTICE :(")
    log_string = "\n\n_______________________\nPerforming LDA on subsets\n"
    grouped_linked_docs = grouped_doc_package.linked_document_list
    lda_models_by_group = {}
    lda_corpus_by_group = {}
    lda_dict_by_group = {}
    lda_analysis_by_group = {}
    minimum_doc_count = package.dependencies_dict["env"].config.getint('ml_instructions', 'minimum_doc_count')
    dict_for_group_processing = {}
    dict_for_group_processing["grouped_linked_docs"] = grouped_linked_docs
    dict_for_group_processing["lda_models_by_group"] = lda_models_by_group
    dict_for_group_processing["lda_corpus_by_group"] = lda_corpus_by_group
    dict_for_group_processing["lda_dict_by_group"] = lda_dict_by_group
    dict_for_group_processing["lda_analysis_by_group"] = lda_analysis_by_group
    stop_words = package.dependencies_dict["utils"]._stop_word_list_generator(package)
    for sub_corpus_name, doc_list in grouped_linked_docs.items():
        package_one_group = merm_model.PipelinePackage(lda_models_by_group, lda_corpus_by_group,
                                                       lda_dict_by_group, grouped_linked_docs[sub_corpus_name],
                                                       {}, package.any_inputs_dict, package.dependencies_dict)
        package_one_group.any_analysis_dict["stop_words"] = stop_words
        if len(doc_list) >= minimum_doc_count:
            msg = "\n Subset: " + str(sub_corpus_name) + "\n\n"
            log.getLogger().info(msg)
            self._analyze_subset(package_one_group, dict_for_group_processing, str(sub_corpus_name), mfst)
            log_string = log_string + package_one_group.stage_log()
    self._set_analysis(package, lda_analysis_by_group)
    self._set_model(package, lda_models_by_group)
    new_package = merm_model.PipelinePackage(package.model, lda_corpus_by_group, lda_dict_by_group,
                                             package.linked_document_list, package.any_analysis_dict,
                                             package.any_inputs_dict, package.dependencies_dict)
    new_package.log_stage(log_string)
    return new_package

def perform(self, package: merm_model.PipelinePackage):
    mfst = package.dependencies_dict["factory"].PipelineManifest.manifest
    # Breaks corpus into subsets
    grouped_doc_package = mfst["SubsetData"].perform(package)
    if "ackage" in type(grouped_doc_package).__name__:
        log.getLogger().info("STRUCTURE after SubsetData:" + grouped_doc_package.structure())
    else:
        log.getLogger().warning("The return type is not of type PipelinePackage. THIS IS BAD PRACTICE :(")
    grouped_linked_docs = grouped_doc_package.linked_document_list
    analysis_by_group_rake = {}
    analysis_by_group_text_rank = {}
    analysis_by_group_noun_phrase = {}
    minimum_doc_count = package.dependencies_dict["env"].config.getint('ml_instructions', 'minimum_doc_count')
    log_string = "\n======================\nSubset Analysis for text rank, rake and noun phrase.\n"
    for sub_corpus_name_untyped, doc_list in grouped_linked_docs.items():
        sub_corpus_name = str(sub_corpus_name_untyped)
        if len(doc_list) > minimum_doc_count:
            package_one_group = merm_model.PipelinePackage(package.model, package.corpus, package.dict,
                                                           doc_list, {}, package.any_inputs_dict,
                                                           package.dependencies_dict)
            package_one_group.any_inputs_dict["corpus_name"] = sub_corpus_name
            package_one_group = self._analyze_subset(package_one_group, sub_corpus_name, mfst, doc_list)
            # Note: only the text rank result is harvested per group here; the rake and
            # noun-phrase dicts are published below as initialized
            analysis_by_group_text_rank[sub_corpus_name] = package_one_group.any_analysis_dict["text_rank_0"]
            log_string = log_string + package_one_group.stage_log()
    package.any_analysis_dict["text_rank_all_groups"] = analysis_by_group_text_rank
    package.any_analysis_dict["rake_all_groups"] = analysis_by_group_rake
    package.any_analysis_dict["noun_phrase_all_groups"] = analysis_by_group_noun_phrase
    new_package = merm_model.PipelinePackage(package.model, package.corpus, package.dict,
                                             grouped_linked_docs, package.any_analysis_dict,
                                             package.any_inputs_dict, package.dependencies_dict)
    new_package.log_stage(log_string)
    return new_package

def perform(self, package: merm_model.PipelinePackage):
    linked_doc_by_index = {}
    slack_provider = "slack"
    slack_channels = self._retrieve_slack_channel_names()
    for linked_doc in package.linked_document_list:
        if slack_provider in linked_doc.provider:
            self._process_slack_doc(linked_doc, linked_doc_by_index, slack_channels)
        elif linked_doc.index_name in linked_doc_by_index:
            linked_doc_by_index[linked_doc.index_name].append(linked_doc)
        else:
            linked_doc_by_index[linked_doc.index_name] = [linked_doc]
    # any_inputs_dict added to match the seven-argument PipelinePackage constructor used elsewhere
    new_package = merm_model.PipelinePackage(package.model, package.corpus, package.dict,
                                             linked_doc_by_index, package.any_analysis,
                                             package.any_inputs_dict, package.dependencies_dict)
    return new_package

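# Hypothetical sketch (not from the codebase): the append-or-create grouping in
# perform() above, expressed with dict.setdefault on stand-in records. The doc
# objects are assumed to expose an index_name attribute, as above.
def _example_group_by_index(linked_docs):
    grouped = {}
    for doc in linked_docs:
        # setdefault collapses the "append if present, else create" branch
        grouped.setdefault(doc.index_name, []).append(doc)
    return grouped
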
def perform(self, package: merm_model.PipelinePackage):
    lda_topics_by_subset_formatted = package.any_analysis_dict["lda_topics_by_subset_formatted"]
    lda_topics_toplevel_formatted = package.any_analysis_dict["lda_topics_toplevel_formatted"]
    similarity_dict = {}
    for source, topic_dict in lda_topics_toplevel_formatted.items():
        termidx_list = topic_dict["term_indices"]
        weight_list = topic_dict["weights"]
        tuples_list = list(zip(termidx_list, weight_list))
        result = self._similarity_score(lda_topics_by_subset_formatted, tuples_list)
        term_list = topic_dict["terms"]
        result_dict = {}
        result_dict["terms"] = term_list
        result_dict["spaces"] = result
        similarity_dict[source] = result_dict
    package.any_analysis_dict["similarity_dict"] = similarity_dict
    return merm_model.PipelinePackage(package.model, package.corpus, package.dict,
                                      package.linked_document_list, package.any_analysis_dict,
                                      package.any_inputs_dict, package.dependencies_dict)

def perform(self, package: merm_model.PipelinePackage): log.getLogger().info("Analyzing Gensim TF-IDF model") log.getLogger().info("Corpus size: " + str(len(package.linked_document_list))) self._validate(package) idx = 0 top_tf_idf_corpus = [] for model_result in package.model[package.corpus]: top_tfidf_doc = [] sorteddoc = sorted(model_result, key=itemgetter(1), reverse=True) linked_doc_source = package.linked_document_list[idx] for id, freq in sorteddoc[:10]: top_tfidf_doc.append( (package.dict[id], np.around(freq, decimals=3))) str1 = "\n\n\n" log.getLogger().debug(str1) top_tf_idf_corpus.append(top_tfidf_doc) linked_doc_source.any_analysis = top_tfidf_doc idx = idx + 1 package.any_analysis_dict[ package.default_analysis_key()] = top_tf_idf_corpus return merm_model.PipelinePackage(package.model, package.corpus, package.dict, package.linked_document_list, package.any_analysis_dict, package.any_inputs_dict, package.dependencies_dict)
def perform(self, package: data_models.PipelinePackage):
    linked_doc_list = package.linked_document_list
    log.getLogger().info("Converting corpora to bag of words. Input format is List[List[str]]. "
                         "Output is a Gensim Dictionary")
    log.getLogger().info("Corpus size: " + str(len(package.linked_document_list)))
    bowlist = [doc.tokens for doc in linked_doc_list]
    dictionary = corpora.Dictionary(bowlist)
    log.getLogger().info("Incoming doc count: " + str(len(linked_doc_list)))
    corpus = [dictionary.doc2bow(line) for line in bowlist]
    # Dictionary.id2token is populated lazily, so len(dictionary) is the
    # reliable feature count
    log.getLogger().info("Feature count: " + str(len(dictionary)))
    package.log_stage("Converted the corpus into a Gensim dictionary (i.e., bag of words)")
    return data_models.PipelinePackage(None, corpus, dictionary, linked_doc_list,
                                       package.any_analysis_dict, package.any_inputs_dict,
                                       package.dependencies_dict)

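# Hypothetical sketch (assumes gensim is installed; token lists are invented):
# the Dictionary/doc2bow conversion performed by the stage above.
def _example_doc2bow():
    from gensim import corpora
    docs = [["topic", "model", "topic"], ["model", "corpus"]]
    dictionary = corpora.Dictionary(docs)
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    # Each document becomes a list of (token_id, count) pairs; e.g. the first
    # document yields two entries, one with count 2 for the repeated token
    return dictionary, corpus
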
def perform(self, package: merm_model.PipelinePackage):
    if type(package.linked_document_list) is dict:
        return package
    include_list = package.dependencies_dict["env"].config["ml_instructions"]["filter_group_include"].split(",")
    exclude_list = package.dependencies_dict["env"].config["ml_instructions"]["filter_group_exclude"].split(",")
    included = self.include_docs(include_list, package.linked_document_list)
    new_linked_doc_list = self.exclude_list(exclude_list, included)
    new_package = merm_model.PipelinePackage(package.model, package.corpus, package.dict,
                                             new_linked_doc_list, package.any_analysis,
                                             package.any_inputs_dict, package.dependencies_dict)
    new_package.log_stage("\nInclude filter was: " + str(include_list) +
                          "\nExclude filter was: " + str(exclude_list) +
                          "\nRemaining documents count: " + str(len(new_linked_doc_list)))
    return new_package

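# Hypothetical sketch (include_docs/exclude_list are not shown above, so the
# matching rule on groupedBy is an invented stand-in): an include pass followed
# by an exclude pass over the document list.
def _example_filter_groups(docs, include_list, exclude_list):
    included = [doc for doc in docs if doc.groupedBy in include_list]
    return [doc for doc in included if doc.groupedBy not in exclude_list]
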
def perform(self, package: merm_model.PipelinePackage):
    lda_topics_by_subset_raw = self.load_topics_by_subset(package, "dict")
    lda_topics_toplevel_raw = self.load_top_level_topics(package, "dict")
    word_to_id = self.build_dict(lda_topics_by_subset_raw, lda_topics_toplevel_raw)
    lda_topics_by_subset_raw_byrow = self.load_topics_by_subset(package, "records")
    lda_topics_toplevel_raw_byrow = self.load_top_level_topics(package, "records")
    lda_topics_by_subset_raw_byrow_coded = self.code_terms(lda_topics_by_subset_raw_byrow, word_to_id)
    lda_topics_toplevel_raw_byrow_coded = self.code_terms(lda_topics_toplevel_raw_byrow, word_to_id)
    lda_topics_by_subset_formatted = self.reformat_data(lda_topics_by_subset_raw_byrow_coded)
    lda_topics_toplevel_formatted = self.reformat_data(lda_topics_toplevel_raw_byrow_coded)
    package.any_analysis_dict["lda_topics_by_subset_formatted"] = lda_topics_by_subset_formatted
    package.any_analysis_dict["lda_topics_toplevel_formatted"] = lda_topics_toplevel_formatted
    # any_inputs_dict added to match the seven-argument PipelinePackage constructor used elsewhere
    return merm_model.PipelinePackage(package.model, package.corpus, word_to_id,
                                      package.linked_document_list, package.any_analysis_dict,
                                      package.any_inputs_dict, package.dependencies_dict)

def perform(self, package: merm_model.PipelinePackage):
    classes = {}
    groupby_count = {}
    numeric_class = []
    corpus = []
    category_count = 0
    env = package.dependencies_dict["env"]
    category_field = self._get_category(env)
    for linked_doc in package.linked_document_list:
        corpus.append(linked_doc.raw)
        if category_field == "group_by":
            category = linked_doc.groupedBy
        else:
            category = linked_doc.space
        if category in classes:
            numeric_class.append(classes[category])
            groupby_count[category] = groupby_count[category] + 1
        else:
            classes[category] = category_count
            numeric_class.append(category_count)
            category_count = category_count + 1
            groupby_count[category] = 1
    package.any_analysis_dict["scikit_category_catalog"] = classes
    vectorizer_type = env.config["ml_instructions"]["vectorizer_type"]
    max_features = env.config.getint("ml_instructions", "rf_max_features")
    if "tfidf" in vectorizer_type.lower():
        vectorizer = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}',
                                     stop_words='english', max_features=max_features)
    else:
        vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 1), min_df=0,
                                     stop_words='english', max_features=max_features)
    matrix = vectorizer.fit_transform(corpus)
    feature_names = vectorizer.get_feature_names()
    package.any_inputs_dict["SKX"] = matrix
    package.any_inputs_dict["SKY"] = numeric_class
    package.any_inputs_dict["SKdict"] = feature_names
    package.any_inputs_dict["SKcategories"] = classes
    package.log_stage("\nPrepared corpus. \nVectorizer type: " + vectorizer_type +
                      "\nCategory map " + self.class_log(classes) +
                      "\nGroupby map" + "\n\n" + self.groupby_log(groupby_count))
    new_package = merm_model.PipelinePackage(None, (numeric_class, matrix), feature_names,
                                             package.linked_document_list, package.any_analysis_dict,
                                             package.any_inputs_dict, package.dependencies_dict)
    return new_package

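# Hypothetical sketch (assumes scikit-learn is installed; documents and labels
# are invented): the TF-IDF branch above, producing the matrix/label pair the
# stage exposes as SKX/SKY. Note that newer scikit-learn versions rename
# get_feature_names() to get_feature_names_out().
def _example_vectorize_for_classification():
    from sklearn.feature_extraction.text import TfidfVectorizer
    corpus = ["rockets need fuel", "fuel costs money", "gardens need water"]
    numeric_class = [0, 0, 1]  # class ids assigned in arrival order, as above
    vectorizer = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}',
                                 stop_words='english', max_features=1000)
    matrix = vectorizer.fit_transform(corpus)  # sparse document-term matrix
    return matrix, numeric_class
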
def perform(self, package: merm_model.PipelinePackage):
    new_model = self._doLDA(package.corpus)
    # any_inputs_dict added to match the seven-argument PipelinePackage constructor used elsewhere
    new_package = merm_model.PipelinePackage(new_model, package.corpus, package.dict,
                                             package.linked_document_list, package.any_analysis_dict,
                                             package.any_inputs_dict, package.dependencies_dict)
    log.getLogger().info(new_package.structure())
    return new_package

def perform(self, package: merm_model.PipelinePackage): log.getLogger().info("Generating Gensim TF-IDF model") model = TfidfModel(package.corpus) # fit model return merm_model.PipelinePackage(model, package.corpus, package.dict, package.linked_document_list, package.any_analysis_dict, package.dependencies_dict)
def perform(self, package: merm_model.PipelinePackage):
    log.getLogger().info("STAGE: Seeking to identify similar topics across multiple corpora")
    prepared_data = self._prepare_data(package)
    matching_topics = self._iterate_similar_topics(prepared_data)
    package.any_analysis_dict[package.default_analysis_key()] = matching_topics
    return merm_model.PipelinePackage(package.model, package.corpus, package.dict,
                                      package.linked_document_list, package.any_analysis_dict,
                                      package.any_inputs_dict, package.dependencies_dict)

def perform(self, package: merm_model.PipelinePackage):
    log.getLogger().info("STAGE: Running a standard LDA in Gensim")
    topic_count = env.config.getint('ml_instructions', 'gensim_lda_topics')
    log.getLogger().info("Seeking " + str(topic_count) + " topics")
    report_word_count = env.config.getint('ml_instructions', 'gensim_lda_term_per_topic_reporting_count')
    if len(package.dict.token2id) > 50:
        topic_dict = {}
        topic_dict_friendly = {}
        lda_model = gensim.models.ldamodel.LdaModel(corpus=package.corpus,
                                                    id2word=package.dict,
                                                    num_topics=topic_count,
                                                    update_every=1,
                                                    alpha='auto',
                                                    per_word_topics=False,
                                                    iterations=100)
        for index, topic in lda_model.show_topics(formatted=False, num_words=report_word_count):
            words_for_topic = []
            words_for_topic_friendly = []
            for w in topic:
                log.getLogger().info(str(index) + ":" + str(w))
                words_for_topic.append((w[0], w[1]))
                words_for_topic_friendly.append(str(w[0]) + "," + str(w[1]))
            topic_dict[index] = words_for_topic
            topic_dict_friendly[index] = words_for_topic_friendly
        package.any_analysis_dict[lda_analysis_key(package)] = topic_dict
        package.any_analysis_dict[lda_analysis_key(package) + "_friendly"] = topic_dict_friendly
        new_package = merm_model.PipelinePackage(lda_model, package.corpus, package.dict,
                                                 package.linked_document_list, package.any_analysis_dict,
                                                 package.any_inputs_dict, package.dependencies_dict)
        new_package.log_stage("Performed Gensim LDA.\nTopic Count: " + str(topic_count) +
                              "\nIterations: 100\nalpha = auto\nUpdate Every: 1\n"
                              "per_word_topics: False\nReporting on top " +
                              str(report_word_count) + " words in each topic\n")
        return new_package
    else:
        new_package = merm_model.PipelinePackage(None, package.corpus, package.dict,
                                                 package.linked_document_list, [],
                                                 package.any_inputs_dict, package.dependencies_dict)
        new_package.log_stage("Gensim LDA aborted. There were too few tokens")
        return new_package

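# Hypothetical sketch (assumes gensim is installed; the corpus is invented): a
# minimal version of the LdaModel call above, showing how show_topics() output
# unpacks into {topic_index: [(term, weight), ...]}.
def _example_lda_topics():
    import gensim
    from gensim import corpora
    docs = [["cat", "dog", "pet"], ["dog", "leash", "walk"], ["stock", "market", "trade"]]
    dictionary = corpora.Dictionary(docs)
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary,
                                                num_topics=2, alpha='auto', iterations=10)
    topic_dict = {}
    # formatted=False returns (topic_id, [(word, probability), ...]) tuples
    for index, topic in lda_model.show_topics(formatted=False, num_words=3):
        topic_dict[index] = [(term, weight) for term, weight in topic]
    return topic_dict
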
def perform(self, package: merm_model.PipelinePackage):
    log.getLogger().info("STAGE: Running a standard LDA in Gensim")
    topic_count = env.config.getint('ml_instructions', 'gensim_lda_topics')
    log.getLogger().info("Seeking " + str(topic_count) + " topics")
    report_word_count = env.config.getint('ml_instructions', 'gensim_lda_term_per_topic_reporting_count')
    if len(package.dict.token2id) > 50:
        topic_dict = {}
        lda_model = gensim.models.ldamodel.LdaModel(corpus=package.corpus,
                                                    id2word=package.dict,
                                                    num_topics=topic_count,
                                                    update_every=1,
                                                    alpha='auto',
                                                    per_word_topics=False,
                                                    iterations=100)
        for index, topic in lda_model.show_topics(formatted=False, num_words=report_word_count):
            words_for_topic = []
            for w in topic:
                log.getLogger().info(str(index) + ":" + str(w))
                words_for_topic.append((w[0], w[1]))
            topic_dict[index] = words_for_topic
        package.any_analysis_dict[package.default_analysis_key()] = topic_dict
        # any_inputs_dict added to match the seven-argument PipelinePackage constructor used elsewhere
        return merm_model.PipelinePackage(lda_model, package.corpus, package.dict,
                                          package.linked_document_list, package.any_analysis_dict,
                                          package.any_inputs_dict, package.dependencies_dict)
    else:
        return merm_model.PipelinePackage(None, package.corpus, package.dict,
                                          package.linked_document_list, [],
                                          package.any_inputs_dict, package.dependencies_dict)

def perform(self, package: merm_model.PipelinePackage):
    mfst = package.dependencies_dict["factory"].PipelineManifest.manifest
    # Breaks corpus into subsets
    grouped_doc_package = mfst["SubsetData"].perform(package)
    stop_word_applied_linked_docs = []
    grouped_linked_docs = grouped_doc_package.linked_document_list
    log_string = "\n======================\nSubset Stopword removal.\n"
    for sub_corpus_name_untyped, doc_list in grouped_linked_docs.items():
        sub_corpus_name = str(sub_corpus_name_untyped)
        package_one_group: merm_model.PipelinePackage = merm_model.PipelinePackage(
            package.model, package.corpus, package.dict, doc_list, {},
            package.any_inputs_dict, package.dependencies_dict)
        package_one_group.any_inputs_dict["corpus_name"] = sub_corpus_name
        package_one_group = self._analyze_subset(package_one_group, sub_corpus_name, mfst, doc_list)
        stop_word_applied_linked_docs = stop_word_applied_linked_docs + package_one_group.linked_document_list
        log_string = log_string + package_one_group.stage_log()
    new_package = merm_model.PipelinePackage(package.model, package.corpus, package.dict,
                                             stop_word_applied_linked_docs, package.any_analysis_dict,
                                             package.any_inputs_dict, package.dependencies_dict)
    new_package.log_stage(log_string)
    if "ackage" in type(new_package).__name__:
        log.getLogger().info("STRUCTURE after SubsetData:" + new_package.structure())
    else:
        log.getLogger().warning("The return type is not of type PipelinePackage. THIS IS BAD PRACTICE :(")
    return new_package

def _extract(es, pipe, dependencies_dict: Dict):
    provider = dependencies_dict["env"].config["extract_instructions"]["provider"]
    msg = "\n\n\n================\nExtracting from " + str(provider)
    log.getLogger().warning(msg)
    if provider == "all":
        _extract_from_all_providers(es, pipe, dependencies_dict)
    elif provider == "none":
        # Empty any_inputs_dict added to match the seven-argument constructor used elsewhere
        _enter_pipeline(merm_model.PipelinePackage(None, None, None, None, None, {},
                                                   dependencies_dict), pipe)
    else:
        _extract_from_one_provider(es, provider, pipe, dependencies_dict)

def perform(self, package: data_models.PipelinePackage):
    df = package.corpus
    log.getLogger().info("Stage: Converting dataframe of documents (previously mapped through "
                         "DataFrameConvertForPipeline) to a tokenized and lemmatized List[List[str]]. "
                         "The outer list is the corpus; each inner list is a document as a bag of words")
    log.getLogger().info("Corpus size: " + str(df.shape))
    corpora_list = self._dfToList(package)
    token_list = package.dependencies_dict["utils"].tokenize(corpora_list)
    merm_tools_linkeddocument_list = package.dependencies_dict["utils"].lemmatize_tokens(
        token_list, package.dependencies_dict["utils"].standard_stop_words())
    package = data_models.PipelinePackage(None, None, None, merm_tools_linkeddocument_list,
                                          package.any_analysis_dict, package.any_inputs_dict,
                                          package.dependencies_dict)
    return package

def perform(self, package: merm_model.PipelinePackage):
    corpus = [linked_doc.raw for linked_doc in package.linked_document_list]
    vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 1), min_df=0, stop_words='english')
    matrix = vectorizer.fit_transform(corpus)
    feature_names = vectorizer.get_feature_names()
    # any_inputs_dict added to match the seven-argument PipelinePackage constructor used elsewhere
    new_package = merm_model.PipelinePackage(None, matrix, feature_names,
                                             package.linked_document_list, package.any_analysis_dict,
                                             package.any_inputs_dict, package.dependencies_dict)
    return new_package

def _extract_from_one_provider(es, provider, pipe, dependencies_dict: Dict):
    msg = "\n\n-------------------------\nPROVIDER: " + str(provider) + "\n---------------------\n\n"
    log.getLogger().warning(msg)
    ignore_indices = dependencies_dict["env"].config["extract_instructions"]["ignore_indices"]
    ignore_indices_list = ignore_indices.split(",")
    indices = es_conn.retrieve_index_registry()
    limit = _dev_limit(dependencies_dict)
    count = 0
    df_per_space_list: List[DataFrame] = []
    for index_name in indices:
        if "@" in index_name:
            continue
        if index_name in ignore_indices_list:
            continue
        if count > limit:
            break
        if provider in index_name:
            df = _retrieve_index_content(es, index_name, provider)
            if not df.empty:
                count = count + df.shape[0]
                df_per_space_list.append(df)
    if len(df_per_space_list) > 0:
        complete_corpus_df = pd.concat(df_per_space_list, ignore_index=True)
        if _dev_bool(dependencies_dict):
            complete_corpus_df = complete_corpus_df.head(limit)
        log.getLogger().info("complete_corpus_df shape: " + str(complete_corpus_df.shape))
        # Log the columns of the combined frame (not the last per-index df)
        dfu.col_names(complete_corpus_df, "complete_corpus_df")
        msg = "\n\n>>>>>>>>>>>>>> Entering Pipeline For " + str(provider) + ">>>>>>>>>>\n\n"
        log.getLogger().info(msg)
        analysis_dict = {}
        analysis_dict["provider"] = provider
        _enter_pipeline(merm_model.PipelinePackage(None, complete_corpus_df, None, None,
                                                   analysis_dict, {}, dependencies_dict), pipe)

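# Hypothetical sketch (assumes pandas is installed; the frames are invented):
# the accumulate-then-concat pattern used above, with small frames standing in
# for per-index Elasticsearch results.
def _example_concat_index_frames():
    import pandas as pd
    df_per_space_list = [pd.DataFrame({"text": ["doc a"]}),
                         pd.DataFrame({"text": ["doc b", "doc c"]})]
    if len(df_per_space_list) > 0:
        # ignore_index=True renumbers rows across the combined corpus frame
        return pd.concat(df_per_space_list, ignore_index=True)
    return pd.DataFrame()
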
def perform(self, package: merm_model.PipelinePackage):
    mfst = package.dependencies_dict["factory"].PipelineManifest.manifest
    # Breaks corpus into subsets
    grouped_doc_package = mfst["GroupByESIndex"].perform(package)
    if "ackage" in type(grouped_doc_package).__name__:
        log.getLogger().info("STRUCTURE after GroupByESIndex:" + grouped_doc_package.structure())
    else:
        log.getLogger().warning("The return type is not of type PipelinePackage. THIS IS BAD PRACTICE :(")
    grouped_linked_docs = grouped_doc_package.linked_document_list
    lda_models_by_group = {}
    lda_corpus_by_group = {}
    lda_dict_by_group = {}
    lda_analysis_by_group = {}
    dict_for_group_processing = {}
    dict_for_group_processing["grouped_linked_docs"] = grouped_linked_docs
    dict_for_group_processing["lda_models_by_group"] = lda_models_by_group
    dict_for_group_processing["lda_corpus_by_group"] = lda_corpus_by_group
    dict_for_group_processing["lda_dict_by_group"] = lda_dict_by_group
    dict_for_group_processing["lda_analysis_by_group"] = lda_analysis_by_group
    for sub_corpus_name, doc_list in grouped_linked_docs.items():
        if len(doc_list) > 100:
            self._analyze_subset(grouped_doc_package, dict_for_group_processing,
                                 grouped_doc_package.any_analysis_dict, sub_corpus_name, mfst, doc_list)
    package.any_analysis_dict[package.default_analysis_key()] = lda_analysis_by_group
    # any_inputs_dict added to match the seven-argument PipelinePackage constructor used elsewhere
    new_package = merm_model.PipelinePackage(lda_models_by_group, lda_corpus_by_group,
                                             lda_dict_by_group, grouped_linked_docs,
                                             package.any_analysis_dict, package.any_inputs_dict,
                                             package.dependencies_dict)
    return new_package

def perform(self, package: merm_model.PipelinePackage):
    log.getLogger().info("STAGE: Seeking to reduce topics to those specified in input flatfile")
    csv = package.dependencies_dict["env"].config["local_data"]["confluence_lda_bysubset"]
    df = pd.read_csv(csv)
    df.dropna(inplace=True)
    reduced_topics = df.to_dict(orient="records")
    prepared_reduced_topics = self._prepare_reduced_topics(reduced_topics)
    prepared_data = self._prepare_data(package)
    matching_topics = self._iterate_similar_topics(prepared_data, prepared_reduced_topics)
    package.any_analysis_dict[package.default_analysis_key()] = matching_topics
    return merm_model.PipelinePackage(package.model, package.corpus, package.dict,
                                      package.linked_document_list, package.any_analysis_dict,
                                      package.any_inputs_dict, package.dependencies_dict)

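# Hypothetical sketch (assumes pandas is installed; csv_path is a stand-in for
# the configured confluence_lda_bysubset file): the flatfile load above,
# yielding one dict per CSV row via to_dict(orient="records").
def _example_load_reduced_topics(csv_path):
    import pandas as pd
    df = pd.read_csv(csv_path)
    df.dropna(inplace=True)  # drop incomplete topic rows before matching
    return df.to_dict(orient="records")
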
def perform(self, package: data_models.PipelinePackage):
    df = package.corpus
    log.getLogger().info("Stage: Converting dataframe of documents (previously mapped through "
                         "DataFrameConvertForPipeline) to a tokenized List[List[str]]. "
                         "The outer list is the corpus; each inner list is a document as a bag of words")
    log.getLogger().info("Corpus size: " + str(df.shape))
    corpora_list = self._dfToList(package)
    tokenized_linked_docs = package.dependencies_dict["utils"].tokenize(corpora_list)
    # Lemmatization is intentionally disabled in this stage:
    # merm_tools_linkeddocument_list = package.dependencies_dict["utils"].lemmatize_tokens(
    #     token_list, package.dependencies_dict["utils"].standard_stop_words())
    package = data_models.PipelinePackage(None, None, None, tokenized_linked_docs,
                                          package.any_analysis_dict, package.any_inputs_dict,
                                          package.dependencies_dict)
    category_group_tuple = data_models.category_group_tuple(package.any_analysis_dict["provider"])
    package.log_stage("Converted a pandas dataframe into our own document list format."
                      "\nDocument count is " + str(len(tokenized_linked_docs)) +
                      ".\nCategory is " + category_group_tuple[0] +
                      "\nGroupBy " + category_group_tuple[1])
    return package

def perform(self, package: merm_model.PipelinePackage):
    log.getLogger().info("STAGE: Running a standard LDA in Gensim")
    topic_count = env.config.getint('ml_instructions', 'gensim_lda_topics')
    permitted_overlap = env.config.getint('ml_instructions',
                                          'gensim_lda_permitted_term_overlap_across_topics')
    log.getLogger().info("Seeking " + str(topic_count) + " topics")
    report_word_count = env.config.getint('ml_instructions',
                                          'gensim_lda_term_per_topic_reporting_count')
    if len(package.dict.token2id) > 50:
        new_package = self._run_lda(topic_count, report_word_count, permitted_overlap, package)
        new_package.log_stage("Performed Gensim LDA.\nTopic Count: " + str(topic_count) +
                              "\nIterations: 100\nalpha = auto\nUpdate Every: 1\n"
                              "per_word_topics: False\nReporting on top " +
                              str(report_word_count) + " words in each topic\n")
        return new_package
    else:
        new_package = merm_model.PipelinePackage(None, package.corpus, package.dict,
                                                 package.linked_document_list, [],
                                                 package.any_inputs_dict, package.dependencies_dict)
        new_package.log_stage("Gensim LDA aborted. There were too few tokens")
        return new_package

def perform(self, package: merm_model.PipelinePackage):
    if type(package.linked_document_list) is dict:
        return package
    env = package.dependencies_dict["env"]
    by_space = env.config.getboolean("ml_instructions", "subset_by_space")
    if by_space:
        linked_doc_by_index = self._by_space(package)
    else:
        linked_doc_by_index = self._by_group(package)
    new_package = merm_model.PipelinePackage(package.model, package.corpus, package.dict,
                                             linked_doc_by_index, package.any_analysis,
                                             package.any_inputs_dict, package.dependencies_dict)
    new_package.log_stage("Divided the entire corpus into groups. The groups created are " +
                          str(linked_doc_by_index.keys()))
    return new_package

def initiate_run():
    try:
        log.getLogger().info(env.printEnvironment())
        env.init()
        log.getLogger().info(env.printConf())
        continue_run = True
        dependencies_dict = {}
        dependencies_dict["env"] = env
        dependencies_dict["factory"] = factory
        dependencies_dict["es_extract"] = es_extract
        dependencies_dict["pipe_process"] = pipe_process
        dependencies_dict["utils"] = utils
        dependencies_dict["dfutils"] = dfutils
        dependencies_dict["colutils"] = colutils
        dependencies_dict["log"] = log
        dependencies_dict["es_conn"] = es_conn
        dependencies_dict["ingestor"] = ingestor
        dependencies_dict["syntax"] = syntax
        log.getLogger().info("Dependencies: ")
        for k, v in dependencies_dict.items():
            log.getLogger().info(str(k) + " : " + str(v))
        while continue_run:
            package = merm_model.PipelinePackage(None, None, None, None, {}, {}, dependencies_dict)
            package.any_analysis_dict["stage_log"] = ""
            pipeline.run_pipeline(package)
            continue_run = env.continue_run()
            if not env.run_forever():
                break
        log.getLogger().info("#################### Run Completed :) ####################")
    except Exception as e:
        log.getLogger().error(env.print_traceback())
        log.getLogger().error(str(e))

def _analyze_subset(self, grouped_doc_package, dict_for_group_processing, any_analysis_dict,
                    sub_corpus_name, manifest, doc_list):
    # any_inputs_dict added to match the seven-argument PipelinePackage constructor used elsewhere
    package_one_group = merm_model.PipelinePackage(grouped_doc_package.model, grouped_doc_package.corpus,
                                                   grouped_doc_package.dict, doc_list, any_analysis_dict,
                                                   grouped_doc_package.any_inputs_dict,
                                                   grouped_doc_package.dependencies_dict)
    package_one_group = manifest["StopWordRemoval"].perform(package_one_group)
    package_one_group = manifest["ListOfListsToGensimCorpora"].perform(package_one_group)
    package_one_group = manifest["GensimLDA"].perform(package_one_group)
    dict_for_group_processing["lda_models_by_group"][sub_corpus_name] = package_one_group.model
    dict_for_group_processing["lda_corpus_by_group"][sub_corpus_name] = package_one_group.corpus
    dict_for_group_processing["lda_dict_by_group"][sub_corpus_name] = package_one_group.dict
    dict_for_group_processing["lda_analysis_by_group"][sub_corpus_name] = package_one_group.any_analysis_dict
    overlap_dict = self._topic_overlap(dict_for_group_processing["lda_analysis_by_group"][sub_corpus_name])
    stop_list = self._dynamic_stop_words(overlap_dict, grouped_doc_package.dependencies_dict)
    if len(stop_list) > 4:
        msg = "\n\n=============\nWill try again while removing " + str(stop_list) + " from " + sub_corpus_name
        log.getLogger().info(msg)
        any_analysis_dict["stop_words"] = stop_list
        # Recurse: re-run the subset with the enlarged stop-word list
        package_one_group = self._analyze_subset(grouped_doc_package, dict_for_group_processing,
                                                 any_analysis_dict, sub_corpus_name, manifest, doc_list)
    return package_one_group

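# Hypothetical sketch (run_once and the overlap rule are invented stand-ins;
# _topic_overlap/_dynamic_stop_words are not shown above): the retry idea in
# _analyze_subset, rewritten as a bounded loop instead of recursion. Terms that
# recur across topics are promoted to stop words and the model is re-run.
def _example_dynamic_stop_words(run_once, doc_list, max_rounds=5):
    stop_words = set()
    topics = {}
    for _ in range(max_rounds):
        topics = run_once(doc_list, stop_words)  # {topic_id: [(term, weight), ...]}
        seen, overlapping = set(), set()
        for terms in topics.values():
            for term, _weight in terms:
                (overlapping if term in seen else seen).add(term)
        if len(overlapping) <= 4:  # same threshold as len(stop_list) > 4 above
            break
        stop_words |= overlapping  # retry with overlapping terms removed
    return topics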