Example #1
0
    def perform(self, package: merm_model.PipelinePackage):
        colutils = package.dependencies_dict["colutils"]

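        # Compute the corpus word frequency on the fly if an upstream stage has not already provided it.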
        if "corpus_word_frequency" not in package.any_analysis_dict.keys():
            word_counter = LinkedDocCorpusWordCount()
            package = word_counter.perform(package)
        corpus_word_frequency = package.any_analysis_dict[
            "corpus_word_frequency"]
        total_word_count = str(sum(list(corpus_word_frequency.values())))
        unique_word_count = str(len(list(corpus_word_frequency.keys())))
        stop_words_global = package.dependencies_dict[
            "utils"]._stop_word_list_generator(package)
        stop_words_top_tuple = self._top_threshold(package)
        stop_words_top = stop_words_top_tuple[0]
        lowest_freq_at_top = stop_words_top_tuple[1]
        stop_words_bottom_tuple = self._bottom_threshold(package)
        stop_words_bottom = stop_words_bottom_tuple[0]
        max_freq_at_bottom = stop_words_bottom_tuple[1]
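        # Combine low-frequency, high-frequency, and globally configured stop words into a single list.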
        stop_words = stop_words_bottom + stop_words_top + stop_words_global
        analysis_key = colutils.incrementing_key("stop_words",
                                                 package.any_analysis_dict)
        package.any_analysis_dict[analysis_key] = stop_words
        self.save_to_file(stop_words, package)
        package.log_stage("Generated stop words. \nGlobal stop word count: " + str(len(stop_words_global)) + "\nHigh frequency dynamically generated stop words: " + \
                          str(len(stop_words_top)) + "\nLow frequency dynamically generated stop words: " + str(len(stop_words_bottom)) +"\nLowest frequency: " + str(max_freq_at_bottom) + \
                          "\nHighest frequency: " + str(lowest_freq_at_top) + "\nOriginal word unique count is " + str(unique_word_count) + "\nTotal word count is " + str(total_word_count))
        return package
Example #2
0
    def perform(self, package: merm_model.PipelinePackage):
        if type(package.linked_document_list) is list:
            corpus_word_frequency = self.count_as_doc_list(
                package.linked_document_list)
        else:
            corpus_word_frequency = self.count_as_doc_dict(
                package.linked_document_list)

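        # Store the corpus-wide word frequency so later stages can reuse it.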
        package.any_analysis_dict[
            "corpus_word_frequency"] = corpus_word_frequency
        count = str(len(corpus_word_frequency))
        total_word_count = str(sum(list(corpus_word_frequency.values())))
        mx = str(max(list(corpus_word_frequency.values())))
        median = str(stats.median(list(corpus_word_frequency.values())))
        stdev = str(stats.stdev(list(corpus_word_frequency.values())))
        doc_count = str(len(package.linked_document_list))

        log_string = "\nDocument count: " + doc_count + \
                     "\nTotal_word count: " + total_word_count + \
                     "\nUnique_word count: " + count + \
            "\nMax Frequency: " + mx + \
            "\nMedian Frequency: " + median + \
            "\nstdev: " + stdev

        package.log_stage(log_string)
        return package
Example #3
0
    def perform(self, package: merm_model.PipelinePackage):

        df = package.corpus
        log.getLogger().info("Shape of DF: " + str(df.shape))
        groupby_dict = {}
        column = package.dependencies_dict["env"].config["ml_instructions"][
            "df_groupby_column"]

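        # Count rows per value of the configured group-by column, printing a progress dot every 1000 rows.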
        count = 0
        for index, row in df.iterrows():
            count = count + 1
            if count % 1000 == 0:
                sys.stdout.write(".")
            jobs_string = row[column]

            if jobs_string in groupby_dict.keys():
                groupby_dict[jobs_string] = groupby_dict[jobs_string] + 1
            else:
                groupby_dict[jobs_string] = 1
        package.log_stage(
            "Broke a pandas data frame into a dict of data grouped by " +
            str(column))
        package.any_analysis_dict["group_by_" + column] = groupby_dict

        return package
Example #4
0
    def perform(self, package: merm_model.PipelinePackage):
        analysis_id = self._analysis_id(package)
        log.getLogger().info("K means prediciting. Tea time")
        X = package.any_inputs_dict["SKX"]
        env = package.dependencies_dict["env"]
        test_range = env.config["ml_instructions"]["silhouette_range"].split(",")
        reporting_count = env.config.getint("ml_instructions", "sklearn_kmeans_term_per_cluster_reporting_count")

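        # Choose the cluster count with the best silhouette score across the configured test range.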
        Xarray = X.toarray()
        silhouette_results = _silhouette(Xarray,test_range)
        cluster_count_tuple = max(silhouette_results, key=lambda x:x[1])

        skdict = package.any_inputs_dict["SKdict"]
        kmeans = KMeans(n_clusters=cluster_count_tuple[0], random_state=10)
        kmeans.fit_predict(Xarray)

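        # For each centroid, sort term indices by descending weight so the top terms come first.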
        centers = kmeans.cluster_centers_.argsort()[:, ::-1]

        centroid_list = []
        # Header width follows the configured per-cluster reporting count.
        header_row = ["cluster"] + [str(i + 1) for i in range(reporting_count)]
        centroid_list.append(header_row)
        for i in range(cluster_count_tuple[0]):
            row_list = [i]
            for ind in centers[i, :reporting_count]:
                row_list.append(skdict[ind])

            centroid_list.append(row_list)


        cluster_list = []
        cluster_list.append(["cluster","sentence"])

        package.any_analysis_dict[analysis_id + "_top_terms"] = centroid_list
        package.any_inputs_dict["kmeans_top_terms_key"] = analysis_id + "_top_terms"
        package.log_stage("Kmeans Clustering, no repeats\nSilhouette : " + str(silhouette_results) + "\nCluster count : " + str(cluster_count_tuple))
        return package
Example #5
0
    def perform(self, package: merm_model.PipelinePackage):
        if ("job" not in package.any_analysis_dict["provider"]):
            raise Exception("This classs will not work on " +
                            str(package.any_analysis_dict["provider"]))
        df = package.corpus
        log.getLogger().info("Shape of DF: " + str(df.shape))
        jobs_dict = {}

        for index, row in df.iterrows():
            majorFinal = row["majorFinal"]
            if majorFinal is None:
                jobs_string = row["jobFinal"]

                if jobs_string in jobs_dict.keys():
                    jobs_dict[jobs_string] = jobs_dict[jobs_string] + 1
                else:
                    jobs_dict[jobs_string] = 1
        package.any_analysis_dict["no_major_jobs_count"] = jobs_dict

        # Use a fresh dict here; reusing jobs_dict would also mutate the
        # "no_major_jobs_count" result stored above.
        all_jobs_dict = {}
        for index, row in df.iterrows():
            jobs_string = row["jobFinal"]

            if jobs_string in all_jobs_dict.keys():
                all_jobs_dict[jobs_string] = all_jobs_dict[jobs_string] + 1
            else:
                all_jobs_dict[jobs_string] = 1
        package.any_analysis_dict["jobs_count"] = all_jobs_dict

        return package
Example #6
0
    def _run_lda(self, topic_count, report_word_count, permitted_overlap, package:merm_model.PipelinePackage):
        topic_dict = {}
        topic_dict_friendly = {}
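        # Train a Gensim LDA model over the pre-built bag-of-words corpus and dictionary.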
        lda_model = gensim.models.ldamodel.LdaModel(corpus=package.corpus,
                                                    id2word=package.dict,
                                                    num_topics=topic_count,
                                                    update_every=1,
                                                    alpha='auto',
                                                    per_word_topics=False,
                                                    iterations=100)

        topics = lda_model.show_topics(formatted=False, num_words=report_word_count)
        for index, topic in topics:
            # print('Topic: {} \nWords: {}'.format(index, [w[0] for w in topic]))
            words_for_topic = []
            words_for_topic_friendly = []
            for w in topic:
                words_for_topic.append((w[0], w[1]))
                words_for_topic_friendly.append(str(w[0]) + "," + str(w[1]))
            topic_dict[index] = words_for_topic
            topic_dict_friendly[index] = words_for_topic_friendly

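        # If too many terms recur across topics, treat them as stop words, rebuild the corpus, and rerun LDA.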
        topic_overlap = self._topic_overlap(topic_dict)
        log.getLogger().info(str(topic_overlap))
        stop_words = self._dynamic_stop_words(topic_overlap, permitted_overlap)
        if len(stop_words) > permitted_overlap:
            log.getLogger().info("\n**********\nRerunning LDA after removing " + str(len(stop_words)) + " words")
            package = self._remove_stop_words(stop_words,package)
            package = self._rebuild_corpus(package)
            return self._run_lda(topic_count,report_word_count,permitted_overlap,package)
        package.any_analysis_dict[lda_analysis_key(package) + "_topic_overlap"] = topic_overlap
        package.any_analysis_dict[lda_analysis_key(package)] = topic_dict
        package.any_analysis_dict[lda_analysis_key(package) + "_friendly"] = topic_dict_friendly
        return package
Example #7
0
    def _set_model(self, package: merm_model.PipelinePackage,
                   lda_models_by_group):
        if type(package.model) is dict:
            for key, value in lda_models_by_group.items():
                package.model[key] = value
        else:
            package.model = lda_models_by_group
Example #8
0
    def perform(self, package: merm_model.PipelinePackage):
        if ("job" not in package.any_analysis_dict["provider"]):
            raise Exception("This classs will not work on " +
                            str(package.any_analysis_dict["provider"]))
        df = package.corpus
        log.getLogger().info("Shape of DF: " + str(df.shape))
        areas_of_study_dict_undefined = {}

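        # Count areas of study only for rows that have no final major assigned.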
        for index, row in df.iterrows():
            majorFinal = row["majorFinal"]
            if majorFinal is None:
                areas_of_study = row["areasOfStudy"]
                if len(areas_of_study) > 0:
                    areasOfStudyList = areas_of_study.split(",")
                    for s in areasOfStudyList:
                        if s in areas_of_study_dict_undefined.keys():
                            areas_of_study_dict_undefined[
                                s] = areas_of_study_dict_undefined[s] + 1
                        else:
                            areas_of_study_dict_undefined[s] = 1
        package.any_analysis_dict[
            "undefined_areas_of_study_count"] = areas_of_study_dict_undefined

        areas_of_study_dict = {}
        for index, row in df.iterrows():
            majorFinal = row["majorFinal"]

            if majorFinal in areas_of_study_dict.keys():
                areas_of_study_dict[
                    majorFinal] = areas_of_study_dict[majorFinal] + 1
            else:
                areas_of_study_dict[majorFinal] = 1
        package.any_analysis_dict["areas_of_study_count"] = areas_of_study_dict

        return package
Example #9
0
    def perform(self, package: merm_model.PipelinePackage):

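        # Load the raw LDA topics, map every term to an integer id, then recode and reformat both topic sets.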
        lda_topics_by_subset_raw = self.load_topics_by_subset(package, "dict")
        lda_topics_toplevel_raw = self.load_top_level_topics(package, "dict")
        word_to_id = self.build_dict(lda_topics_by_subset_raw,
                                     lda_topics_toplevel_raw)

        lda_topics_by_subset_raw_byrow = self.load_topics_by_subset(
            package, "records")
        lda_topics_toplevel_raw_byrow = self.load_top_level_topics(
            package, "records")

        lda_topics_by_subset_raw_byrow_coded = self.code_terms(
            lda_topics_by_subset_raw_byrow, word_to_id)
        lda_topics_toplevel_raw_byrow_coded = self.code_terms(
            lda_topics_toplevel_raw_byrow, word_to_id)

        lda_topics_by_subset_formatted = self.reformat_data(
            lda_topics_by_subset_raw_byrow_coded)
        lda_topics_toplevel_formatted = self.reformat_data(
            lda_topics_toplevel_raw_byrow_coded)

        package.any_analysis_dict[
            "lda_topics_by_subset_formatted"] = lda_topics_by_subset_formatted
        package.any_analysis_dict[
            "lda_topics_toplevel_formatted"] = lda_topics_toplevel_formatted

        # The other merm_model.PipelinePackage constructors in this pipeline also
        # take any_inputs_dict; pass it through so the positional arguments line up.
        return merm_model.PipelinePackage(package.model, package.corpus,
                                          word_to_id,
                                          package.linked_document_list,
                                          package.any_analysis_dict,
                                          package.any_inputs_dict,
                                          package.dependencies_dict)
Example #10
0
    def perform(self, package: merm_model.PipelinePackage):
        analysis_id = self._analysis_id(package)
        log.getLogger().info("K means prediciting. Tea time")
        X = package.any_inputs_dict["SKX"]
        env = package.dependencies_dict["env"]
        test_range = env.config["ml_instructions"]["silhouette_range"].split(",")

        Xarray = X.toarray()
        silhouette_results = _silhouette(Xarray,test_range)
        cluster_count_tuple = max(silhouette_results, key=lambda x:x[1])
        y = package.any_inputs_dict["SKY"]
        skdict = package.any_inputs_dict["SKdict"]
        cluster = AgglomerativeClustering(n_clusters=cluster_count_tuple[0], affinity='euclidean', linkage='ward')

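        # Fit the clustering model, then pair each sentence with its assigned cluster label.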
        labels = cluster.fit_predict(X.toarray())
        cluster_list = []
        for j in range(labels.shape[0]):
            row_list = []
            sentence = package.linked_document_list[j].raw
            cluster_label = labels[j]
            row_list.append(cluster_label)
            row_list.append(sentence)
            cluster_list.append(row_list)

        package.any_analysis_dict[analysis_id+"_result"] = cluster_list
        package.log_stage("Agglomerative Clustering\nSilhouette : " + str(silhouette_results) + "\nCluster count : " + str(cluster_count_tuple))
        return package
Example #11
0
    def perform(self, package: data_models.PipelinePackage):
        linked_doc_list = package.linked_document_list
        log.getLogger().info(
            "Converting corpora as bag of words. Input format is List[List[str]]. Output is Gensim Dictionary"
        )
        log.getLogger().info("Corpus size: " +
                             str(len(package.linked_document_list)))
        bowlist = []
        for doc in linked_doc_list:
            bowlist.append(doc.tokens)

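        # Build the Gensim dictionary from the token lists, then convert each document to a bag-of-words vector.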
        dictionary = corpora.Dictionary(bowlist)

        #log.getLogger().info(dictionary)
        log.getLogger().info("Incoming doc count: " +
                             str(len(linked_doc_list)))
        corpus = [dictionary.doc2bow(line) for line in bowlist]

        log.getLogger().info("Feature count: " + str(len(dictionary.id2token)))
        package.log_stage(
            "Converted the corpus into a Gensim dictionary (i.e., bag of words)"
        )
        return data_models.PipelinePackage(None, corpus, dictionary,
                                           linked_doc_list,
                                           package.any_analysis_dict,
                                           package.any_inputs_dict,
                                           package.dependencies_dict)
Example #12
0
    def perform(self, package: merm_model.PipelinePackage):
        utils = package.dependencies_dict["utils"]
        colutils = package.dependencies_dict["colutils"]
        env = package.dependencies_dict["env"]
        embeddings_file = env.config["ml_instructions"][
            "text_rank_embeddings_file"]
        dimensions = env.config.getint("ml_instructions", "glove_dimensions")

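        # Load the GloVe-style word embeddings once, then rank sentences within each document.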
        word_embeddings_list = self._word_embeddings(embeddings_file)

        #sentences = package.dependencies_dict["utils"].corpus_as_sentence_list(package)
        tokenized_sentences_by_doc = utils.corpus_as_tokenized_sentence_linked_doc_list_grouped_by_doc(
            package, True)
        log.getLogger().info("we have " +
                             str(len(tokenized_sentences_by_doc)) + " docs")
        rank_by_dict = self._prep_rank_by_doc_dict(package)
        count = 0
        for docid, sentences in tokenized_sentences_by_doc.items():
            sentence_by_rank_dict = self.rank_by_document(
                sentences, word_embeddings_list, package, dimensions)
            for key, value in sentence_by_rank_dict.items():
                sentence_list_for_that_rank = rank_by_dict[key]
                sentence_list_for_that_rank.append([dimensions, docid, value])
            if count % 100 == 0:
                print(count)
            count = count + 1
        analysis_key = colutils.incrementing_key("text_rank",
                                                 package.any_analysis_dict)
        package.any_analysis_dict[analysis_key] = rank_by_dict
        package.log_stage("Conducting text rank. Total document count is " + str(len(package.linked_document_list)) + \
                          ". For each document the top " + str(len(list(rank_by_dict.keys()))) + " ranked sentences were captured." + \
                          "\nGlove dimension count: " + str(dimensions))
        return package
Example #13
0
    def perform(self, package: merm_model.PipelinePackage):
        env = package.dependencies_dict["env"]
        utils = package.dependencies_dict["utils"]
        original_count = len(package.linked_document_list)
        merge_by = env.config["ml_instructions"]["merge_docs_field"]
        merged_docs_dict = {}

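        # Merge sub-documents that share the configured key, concatenating their raw text and token lists.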
        for sub_doc in package.linked_document_list:
            if merge_by == "groupedBy":
                key = sub_doc.groupedBy
            elif merge_by == "uid":
                key = sub_doc.uid
            else:
                key = sub_doc.space

            if key in merged_docs_dict.keys():
                merged_docs_dict[
                    key].raw = merged_docs_dict[key].raw + ". " + sub_doc.raw
                merged_docs_dict[
                    key].tokens = merged_docs_dict[key].tokens + sub_doc.tokens
            else:
                merged_docs_dict[key] = sub_doc

        new_linked_doc_list = list(merged_docs_dict.values())
        for full_doc in new_linked_doc_list:
            full_doc.raw = utils.cleanstring_doubled_period(full_doc.raw)
        package.linked_document_list = new_linked_doc_list

        package.log_stage("Merged documents by " + str(merge_by) +
                          " tokens. \n Original doc count: " +
                          str(original_count) + "\nNew doc count: " +
                          str(len(package.linked_document_list)))
        return package
Example #14
0
    def perform(self, package: merm_model.PipelinePackage):
        deps = package.dependencies_dict
        text_utils = deps["utils"]
        syntax = deps["syntax"]
        syntax.lemmatize_docs(package.linked_document_list,
                              text_utils.standard_stop_words())
        package.log_stage("Lemmatized tokens")
        return package
Example #15
0
    def perform(self, package: merm_model.PipelinePackage):

        for linked_doc in package.linked_document_list:
            new_sentence = ""
            for token in linked_doc.tokens:
                new_sentence = new_sentence + token + " "
            linked_doc.raw = new_sentence
        package.log_stage("Converted tokens to concatenated strings")
        return package
Example #16
0
    def perform(self, package: merm_model.PipelinePackage):
        original_linked_doc_size = len(package.linked_document_list)
        package.uncache_linked_docs()

        package.log_stage("Original linked doc count: " +
                          str(original_linked_doc_size) +
                          "Current linked doc count: " +
                          str(len(package.linked_document_list)))
        return package
Example #17
0
    def perform(self, package: merm_model.PipelinePackage):
        # env is not defined in the original snippet; pull it from the
        # dependencies dict as the other stages in this pipeline do.
        env = package.dependencies_dict["env"]
        package.corpus.to_csv(
            env.config['job_instructions']['es_file_location'], index=False)
        log.getLogger().info(
            "Saved ElasticSearch Data as CSV at: " +
            env.config['job_instructions']['es_file_location'])
        package.log_stage("Saved ElasticSearch Data as CSV at: " +
                          env.config['job_instructions']['es_file_location'])
        return package
Example #18
0
    def perform(self, package: merm_model.PipelinePackage):
        classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
        test_proportion = package.dependencies_dict["env"].config.getfloat(
            "ml_instructions", "rf_test_proportion")
        random_state = 0

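        # Feature matrix, labels, and category map are read from any_inputs_dict, populated by an earlier stage.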
        rf_categories = package.any_inputs_dict["SKcategories"]
        X = package.any_inputs_dict["SKX"]
        y = package.any_inputs_dict["SKY"]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_proportion, random_state=random_state)
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)

        report = pd.DataFrame(confusion_matrix(y_test, y_pred)).values.tolist()
        report_string = self._report_string(report)
        analysis_id = self._analysis_id(package)
        package.any_inputs_dict["sk_last_id"] = analysis_id

        package.any_analysis_dict[analysis_id + "_rfclassifier"] = classifier
        package.any_analysis_dict[analysis_id + "_confusion"] = report
        package.any_analysis_dict[analysis_id + "_ypred"] = y_pred
        package.any_analysis_dict[analysis_id + "_ytest"] = y_test
        package.any_analysis_dict[analysis_id + "_Xtest"] = X_test
        package.any_analysis_dict[analysis_id + "_Ycategories"] = rf_categories

        package.log_stage("\nTraining doc count: " + str(X_train.shape[0]) +
                          "\nTraining feature count: " +
                          str(X_train.shape[1]) + "\nTestTrain split:" +
                          str(test_proportion) + "\nRF confusion matrix:\n" +
                          report_string + "\nclassification_report:\n" +
                          str(classification_report(y_test, y_pred)) +
                          "Accuracy:\n" + str(accuracy_score(y_test, y_pred)))
        return package
Example #19
0
    def perform(self, package: merm_model.PipelinePackage):
        if "current_loop" in package.any_inputs_dict.keys():
            current_loop = package.any_inputs_dict["current_loop"]
        else:
            current_loop = 0
        env = package.dependencies_dict["env"]
        package.any_inputs_dict["current_loop"] = current_loop + 1
        loop_count = env.config.getint("pipeline_instructions", "loop_count")
        package.any_inputs_dict["loop_count"] = loop_count
        package.log_stage("Current loop: " + str(current_loop) +
                          "\nTotal loops: " + str(loop_count))
        return package
Example #20
0
    def perform(self, package: merm_model.PipelinePackage):
        from nltk.stem import PorterStemmer
        pstemmer = PorterStemmer()
        for linked_doc in package.linked_document_list:

            stemmed_tokens = []
            for token in linked_doc.tokens:
                stemmed = pstemmer.stem(token)
                stemmed_tokens.append(stemmed)
            linked_doc.tokens = stemmed_tokens
        package.log_stage("Stemmed tokens")
        return package
Example #21
0
    def perform(self, package: merm_model.PipelinePackage):
        original_linked_doc_size = len(package.linked_document_list)
        text_utils = package.dependencies_dict["utils"]
        linked_docs_by_sentence = text_utils.corpus_as_tokenized_sentence_linked_doc_list(
            package)
        package.linked_document_list = linked_docs_by_sentence

        package.log_stage(
            "LinkedDocToLinkedSentences: Original linked doc count: " +
            str(original_linked_doc_size) + "\nCurrent linked doc count: " +
            str(len(package.linked_document_list)))
        return package
Example #22
0
    def perform(self, package: data_models.PipelinePackage):

        env = package.dependencies_dict["env"]
        report_count = env.config.getint("ml_instructions",
                                         "glove_loadings_count_to_report")

        glove_output_key_list = package.any_inputs_dict[
            "glove_output_key"].split(",")
        for glove_output_key in glove_output_key_list:
            self._process_loadings(package, glove_output_key, report_count)

        package.log_stage("GloveLoadings: ")
        package.any_analysis_dict["glove_variance"] = self.variance_dict
        return package
Example #23
0
    def _analysis_id(self, package: merm_model.PipelinePackage):
        dt = datetime.now()
        suffix = str(dt.microsecond)[-4:]
        if "kmeans_iteration_count" in package.any_inputs_dict.keys():
            rf_count = package.any_inputs_dict["kmeans_iteration_count"] + 1
            package.any_inputs_dict["kmeans_iteration_count"] = rf_count
        else:
            rf_count = 0
            package.any_inputs_dict["kmeans_iteration_count"] = rf_count

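        # The id encodes the k-means iteration, category count, label count, and a microsecond suffix.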
        categories = package.any_inputs_dict["SKcategories"]
        category_count = len(list(categories.keys()))
        id =  "km1_" + str(rf_count) + "_" + str(category_count)  + "_" + str(len(package.any_inputs_dict["SKY"])) + "_" + suffix
        return id
Example #24
0
    def perform(self, package: merm_model.PipelinePackage):
        mfst = package.dependencies_dict["factory"].PipelineManifest.manifest

        #breaks corpus into subsets
        grouped_doc_package = mfst["SubsetData"].perform(package)
        if ("ackage" in type(grouped_doc_package).__name__):
            log.getLogger().info("STRUCTURE after SubsetData:" +
                                 grouped_doc_package.structure())
        else:
            log.getLogger().warning(
                "The return type is not of type PipelinePackage. THIS IS BAD PRACTICE :("
            )

        grouped_linked_docs = grouped_doc_package.linked_document_list
        analysis_by_group_rake = {}
        analysis_by_group_text_rank = {}
        analysis_by_group_noun_phrase = {}
        minimum_doc_count = package.dependencies_dict["env"].config.getint(
            'ml_instructions', 'minimum_doc_count')
        log_string = "\n======================\nSubset Analysis for text rank, rake and noun phrase.\n"
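        # Analyze each subset that meets the minimum document count, collecting its text rank results and stage log.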
        for sub_corpus_name_untyped, doc_list in grouped_linked_docs.items():
            sub_corpus_name = str(sub_corpus_name_untyped)
            if len(doc_list) > minimum_doc_count:
                package_one_group = merm_model.PipelinePackage(
                    package.model, package.corpus, package.dict, doc_list, {},
                    package.any_inputs_dict, package.dependencies_dict)
                package_one_group.any_inputs_dict[
                    "corpus_name"] = sub_corpus_name
                package_one_group = self._analyze_subset(
                    package_one_group, sub_corpus_name, mfst, doc_list)
                analysis_by_group_text_rank[
                    sub_corpus_name] = package_one_group.any_analysis_dict[
                        "text_rank_0"]

                log_string = log_string + package_one_group.stage_log()

        package.any_analysis_dict[
            "text_rank_all_groups"] = analysis_by_group_text_rank
        package.any_analysis_dict["rake_all_groups"] = analysis_by_group_rake
        package.any_analysis_dict[
            "noun_phrase_all_groups"] = analysis_by_group_noun_phrase
        new_package = merm_model.PipelinePackage(package.model, package.corpus,
                                                 package.dict,
                                                 grouped_linked_docs,
                                                 package.any_analysis_dict,
                                                 package.any_inputs_dict,
                                                 package.dependencies_dict)

        new_package.log_stage(log_string)
        return new_package
Example #25
0
def generate_linked_docs_unranked(package: merm_model.PipelinePackage,
                                  analysis_key):
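    # Cache the current linked docs, then replace linked_document_list with a per-group dict of sentence-level linked docs.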
    linked_doc_dict = {}
    package.cache_linked_docs()
    all_groups = package.any_analysis_dict[analysis_key]
    for key, sentence_list in all_groups.items():
        linked_doc_list = []

        for sentence in sentence_list:
            linked_doc = package.dependencies_dict[
                "utils"].sentence_to_linked_doc(sentence)
            linked_doc_list.append(linked_doc)
        linked_doc_dict[key] = linked_doc_list
    package.linked_document_list = linked_doc_dict
    return package
Example #26
0
    def next_step(self, task:str, package:merm_model.PipelinePackage):

        self.step_count = self.step_count + 1
        msg = "\n\nEntering " + task + " " + str(self.step_count) + "\n\n"
        log.getLogger().info(msg)
        manifest = factory.PipelineManifest
        new_task = manifest.manifest[task]
        package = new_task.perform(package)
        package.any_inputs_dict["previous_task"] = task
        package.any_inputs_dict["history"].append(task)
        if("Package" in type(package).__name__):
            log.getLogger().warning("STRUCTURE after " + task + ": " + package.structure())
        else:
            log.getLogger().warning("The return type is not of type PipelinePackage. THIS IS BAD PRACTICE :(")
        return package
Example #27
0
def _extract_from_providers_merge(es, providers,
                                  package: merm_model.PipelinePackage):
    msg = "\n\n-------------------------\nPROVIDERS: " + str(
        providers) + "\n---------------------\n\n"
    log.getLogger().warning(msg)
    ignore_indices = package.dependencies_dict["env"].config[
        "extract_instructions"]["ignore_indices"]
    ignore_indices_list = ignore_indices.split(",")
    indices = es_conn.retrieve_index_registry()

    limit = _dev_limit(package.dependencies_dict)
    count = 0

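    # Collect one DataFrame per matching index, skipping ignored indices and any index name containing "@", up to the dev limit.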
    df_per_space_list: List[DataFrame] = []
    for provider in providers:
        count = 0
        for index_name in indices:
            if "@" in index_name:
                continue
            if index_name in ignore_indices_list:
                continue
            if count > limit:
                break
            if provider.strip() in index_name:
                df = _retrieve_index_content(es, index_name, provider, limit,
                                             package.dependencies_dict)
                if not df.empty:
                    log.getLogger().debug("Retrieved " + index_name +
                                          ": row count " + str(df.shape))
                    count = count + df.shape[0]
                    df_per_space_list.append(df)

    if len(df_per_space_list) > 0:
        complete_corpus_df = pd.concat(df_per_space_list, ignore_index=True)
        if _dev_bool(package.dependencies_dict):
            complete_corpus_df = complete_corpus_df.head(limit)
            #log.getLogger().info("\n\nExtraction Complete. Document count = " + str(complete_corpus_df[:5]))
        log.getLogger().info("complete_corpus_df shape: " +
                             str(complete_corpus_df.shape))
        dfu.col_names(complete_corpus_df, "complete_corpus_df")
        msg = "\n\n>>>>>>>>>>>>>>   Entering Pipeline For  " + str(
            providers) + ">>>>>>>>>>\n\n"
        log.getLogger().info(msg)
        analysis_dict = {}
        analysis_dict["provider"] = str(providers)
        package.any_analysis_dict = analysis_dict
        package.corpus = complete_corpus_df
        return package
Example #28
0
    def perform(self, package: merm_model.PipelinePackage):
        last_id = package.any_inputs_dict["sk_last_id"]
        y_test = package.any_analysis_dict[last_id + "_ytest"]
        y_pred = package.any_analysis_dict[last_id + "_ypred"]
        X_test = package.any_analysis_dict[last_id + "_Xtest"]

        rf_dict = package.any_inputs_dict["SKdict"]

        rf_categories = package.any_inputs_dict["SKcategories"]
        inv_rf_categories = {v: k for k, v in rf_categories.items()}
        sentence_match_list = []
        sentence_match_list.append(
            ["Actual", "Predicted", "Sentence", "Correct"])
        near_misses_dict = package.any_analysis_dict[last_id +
                                                     "_near_misses_dict"]

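        # Rebuild each test sentence from its sparse feature indices and record whether the prediction was correct.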
        for idx, major in enumerate(y_test):

            pred_major = y_pred[idx]

            if pred_major == major:
                match = True
            else:
                match = False

            if match or self._add_to_sentence_list(
                    major, pred_major, inv_rf_categories, near_misses_dict):
                sentence = X_test[[idx], :]
                #print(X_test[[idx], :])
                sentence_string = ""

                for word_idx in sentence.indices:
                    sentence_string = sentence_string + rf_dict[word_idx] + " "
                alist = [
                    inv_rf_categories[major], inv_rf_categories[pred_major],
                    sentence_string, match
                ]
                sentence_match_list.append(alist)

        analysis_id = self._analysis_id(package)

        package.any_analysis_dict[analysis_id +
                                  "_sentences"] = sentence_match_list

        package.log_stage(
            "Found sentences that accurately predict each major or were near misses"
        )
        return package
Example #29
0
def run_post_process(package: merm_model.PipelinePackage):
    log.getLogger().info("run_post_process")
    csv_list_of_lists = []
    csv_list_of_lists.append(["index_name", "topic_id", "term", "weight"])
    report_sentences = env.config.getboolean(
        'ml_instructions', 'gensim_lda_report_sentence_level')
    for idxname, topicdict in package.any_analysis().items():
        report_for_index = "\n\n\n+++++++++++++++++++\n\nReport for " + idxname + "\n\n"
        docs_list = package.linked_document_list[idxname]
        if report_sentences == True:
            corpus_as_sentences = break_corpus_as_sentences(docs_list)
        report_for_index += "Corpus Size: " + str(len(docs_list)) + "\n"
        if len(docs_list) > 100:
            for topicid, topiclist in topicdict["default_analysis_key"].items():
                report_for_index += "\n\nTOPIC:" + str(topicid) + "\n"

                for entry in topiclist:
                    report_for_index += str(entry[0])
                    report_for_index += "\t\t\t"
                    report_for_index += str(entry[1])
                    report_for_index += "\n"
                    csv_list_of_lists.append(
                        [idxname, topicid, entry[0], entry[1]])
                if report_sentences == True:
                    salient_sentences = find_salient_sentences(
                        topiclist, corpus_as_sentences)
                    report_for_index += "\n\nSALIENT_SENTENCES\n"
                    for sentence in salient_sentences:
                        report_for_index += sentence + "\n"

            log.getReportLogger().info(report_for_index)
    _save_topic_model(package)
    _save_csv(csv_list_of_lists, "lda_analysis_by_subset")
Example #30
0
    def perform(self, package: merm_model.PipelinePackage):

        original_count = len(package.linked_document_list)
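        # Deduplicate by raw text: documents with identical raw content collapse to a single entry.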
        filter_dict = {}
        for linked_doc in package.linked_document_list:
            filter_dict[linked_doc.raw] = linked_doc

        doc_list = list(filter_dict.values())
        package.linked_document_list = doc_list

        package.log_stage("Before removing duplicates:  " +
                          str(original_count) +
                          "\nAfter removing duplicates: " +
                          str(len(package.linked_document_list)))

        return package