def get_topic_extraction(self, passed_args):
        passed_args, data_args, dictionary_args, model_args, version_args, request_args = ExeParams.get_parameters(
            passed_args)
        Advisor.set_data_folder_path(data_args[DataParams.data_folder_path])
        processed_data = self._get_processed_data(data_args, version_args)

        all_models_args_values = ModelParams.get_possible_model_params_values()

        for requested_model_type in request_args[
                RequestParams.requested_models]:
            for lang in processed_data:
                if lang in request_args[RequestParams.requested_langs]:
                    # ModelParams.set_model_params_value(model_args)
                    logging.info("- Get Topic extraction for '%s' language" %
                                 lang)
                    for test_param in all_models_args_values:
                        metrics = self._get_requested_models_metrics(
                            requested_model_type, lang, processed_data[lang],
                            dictionary_args,
                            all_models_args_values[test_param], version_args,
                            request_args, test_param)
                        metrics = self._make_param_metrics_ready_to_plot(
                            metrics)
                        self.comparative_view.plot_metrics(
                            metrics, test_param, lang, requested_model_type,
                            version_args[VersionifyParams.data_version],
                            version_args[VersionifyParams.dictionary_version],
                            version_args[VersionifyParams.model_version])
        return
Example #2
    def get_train_data_ready_to_work(
        self,
        data_file_name: str,
        data_file_extension: str,
        data_file_type: str,
    ) -> dict:
        """
        :param data_file_name: data file name -> {data_file_name}-train.{data_file_extension}
        :param data_file_extension: data file extension
        :param data_file_type: only three types are supported: ["CommonCrawl", "Json", "SemiJson"]
                        (path-to-data-folder/(data-file-name)-(train/test).data-extension)
        :return: dict with languages as keys; each value is the list of that language's documents,
        where each document is one string holding all of a web page's text
        """
        data_file_path = Advisor.get_data_folders_file_path(
            data_file_name, data_file_extension)
        ready_to_train_data = dict()
        logging.info("--- Get data file content")

        text_data = self._get_raw_data_from_path_file(data_file_type,
                                                      data_file_path)
        logging.info("--- Getting pages's code and language")
        for index, page in enumerate(text_data):
            page = BeautifulSoup(page, features="html.parser")
            text, lang = self._get_text_data_from_page(page)
            if lang not in ready_to_train_data:
                ready_to_train_data[lang] = list()
            ready_to_train_data[lang].append(text)
        logging.info("--- Got pages's text and language")
        logging.info("-- Data is ready for further processes")
        return ready_to_train_data
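A hypothetical call site for the loader above; the owning class name (TextPreprocessor, borrowed from Example #15 below) and the concrete file arguments are assumptions, only the {data_file_name}-train.{data_file_extension} convention comes from the docstring:

# Hypothetical usage; class name and argument values are assumptions.
preprocessor = TextPreprocessor()
raw_by_lang = preprocessor.get_train_data_ready_to_work(
    data_file_name="crawl-sample",    # resolves to crawl-sample-train.warc
    data_file_extension="warc",
    data_file_type="CommonCrawl",     # one of "CommonCrawl", "Json", "SemiJson"
)
for lang, documents in raw_by_lang.items():
    print(lang, len(documents))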
Example #3
 def __read_data_meta_file():
     meta_file_path = Advisor.get_data_file_meta_path()
     if path.exists(meta_file_path):
         with open(meta_file_path, "r") as json_file:
             meta = json.load(json_file)
             json_file.close()
         return meta
     return None
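For reference, an illustrative shape of the meta file read here, based on the "languages" and "lang_length" keys written by __write_meta_data in Example #7; the values are placeholders:

# Placeholder content; only the key names come from __write_meta_data below.
meta = {
    "languages": ["en", "de"],
    "lang_length": {"en": 1200, "de": 340},
}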
Example #4
    def write_model_evaluation_metrics(self, lang: str, data_version,
                                       dictionary_version, model_version,
                                       param_name: str, param_version: int,
                                       metrics: dict, model_parameters: dict):
        model_evaluation_file_path = Advisor.get_model_type_folders_file_path(
            lang, data_version, dictionary_version, model_version, param_name,
            param_version, self.model_type, "evaluation")
        model_version_file_path = Advisor.get_model_type_folders_file_path(
            lang, data_version, dictionary_version, model_version, param_name,
            param_version, self.model_type, "meta.json")
        with open(model_evaluation_file_path, "w") as json_file:
            json.dump(metrics, json_file, indent=4)
            json_file.close()

        with open(model_version_file_path, "w") as json_file:
            json.dump(model_parameters, json_file, indent=4)
            json_file.close()
        return
Example #5
 def get_topics_words_and_their_tfidf_of_them_over_docs(
         self, model, processed_data, model_type):
     topics = model.show_topics(formatted=False,
                                num_topics=-1,
                                num_words=30)
     words_tfidf = self._get_words_tfidf(topics, processed_data)
     max_doc = len(processed_data)
     document_id_range = range(0, max_doc + 1, int(
         max_doc / 10)) if max_doc > 10 else range(0, max_doc + 1)
     for topic_id, topic in topics:
         file_path = Advisor.get_topic_folders_file_path_from_topic_version(
             self.topic_version_path, model_type, topic_id, "Words TFiDF",
             "png")
         words_no = len(topic)
         # fig, axes = pyplot.subplots(words_no, 1, figsize=(2 * 2.2, min([(len(processed_data) + 1) * 5.0, 2 ** 16])))
         fig, axes = pyplot.subplots(words_no, 1)
         fig.suptitle("TFiDF of Topic '%d' over Docs" % topic_id)
         if words_no > 1:
             axes = axes.flatten()
         else:
             axes = [axes]
         ax_no = 0
         for word, _ in topic:
             ax = axes[ax_no]
             ax_no += 1
             data = list()
             word_data = words_tfidf[word]
             for doc_id in word_data:
                 data.append([word, doc_id, word_data[doc_id]])
             data = DataFrame(data,
                              columns=["Word", "Document_No", "TFiDF"])
             ax.plot('Document_No',
                     'TFiDF',
                     data=data,
                     marker='o',
                     color=self._get_random_color(),
                     label='Word : %s' % word,
                     dashes=[6, 2])
             ax.legend()
             ax.set_ylabel("TFiDF of Words")
             ax.set_xlabel("Documents")
             ax.set_title("Word %s" % word)
             ax.set_xticks(document_id_range)
             ax.set_xticklabels([
                 'Document %d' % document for document in document_id_range
             ],
                                rotation=30,
                                horizontalalignment='right',
                                fontsize=8)
             ax.grid(True)
         fig.savefig(file_path)
         pyplot.close(fig)
     return
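The loops above (and in Examples #17 and #18) consume gensim's show_topics(formatted=False, ...) output, which is a list of (topic_id, [(word, weight), ...]) pairs. A tiny self-contained illustration on a toy corpus:

from gensim.corpora import Dictionary
from gensim.models import LdaModel

# Toy corpus purely to show the shape of show_topics(formatted=False) output.
toy_docs = [["topic", "model", "word"], ["word", "weight", "topic"]]
toy_dictionary = Dictionary(toy_docs)
toy_corpus = [toy_dictionary.doc2bow(doc) for doc in toy_docs]
lda = LdaModel(corpus=toy_corpus, id2word=toy_dictionary, num_topics=2, random_state=1)
for topic_id, word_weights in lda.show_topics(formatted=False, num_topics=-1, num_words=3):
    print(topic_id, word_weights)  # e.g. 0 [('word', 0.21), ('topic', 0.19), ...]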
Example #6
    def __write_processed_data(data_file_name: str, processed_data: dict, version: int):
        """
        :param data_file_name: processed data's file name
        :param processed_data: the processed data to write down, keyed by language
        :param version: the version of the data
        """
        for lang in processed_data:
            processed_data_file_path = Advisor.get_data_version_folders_file_path(lang, version, data_file_name, "json")

            with open(processed_data_file_path, 'w') as json_file:
                json.dump(processed_data[lang], json_file)
                json_file.close()
        return
Example #7
    def __write_meta_data(language_list: list, tags: dict, data_version: int, lang_length: dict):
        data_file_meta_path = Advisor.get_data_file_meta_path()
        data_file_meta_content = {"languages": language_list,
                                  "lang_length": lang_length}

        data_process_version_meta_content = {
            "tags": tags,
            "token_validation": {
                "must": ["alpha"],
                "must Not": ["stop_word", "space", "bracket", "currency",
                             "url", "email", "number", "verb"]
            }
        }
        with open(data_file_meta_path, "w") as json_file:
            json.dump(data_file_meta_content, json_file, indent=4)
            json_file.close()
        for lang in language_list:
            data_version_meta_path = Advisor.get_data_version_folders_file_path(
                lang, data_version, "data-process-meta", "json")
            with open(data_version_meta_path, "w") as json_file:
                json.dump(data_process_version_meta_content, json_file, indent=4)
                json_file.close()
        return
Example #8
    def _contribution_of_dominate_topics_in_docs(
            self, contribution_of_dominate_topic_in_docs: dict,
            model_type: str, nth_topic: int, doc_no: int):
        file_path = Advisor.get_visualization_file_path_from_topic_version(
            self.topic_version_path, model_type, "Contribution_of_"
            "%d_dominate_topics_in_docs" % nth_topic, "png")
        plot_no = len(contribution_of_dominate_topic_in_docs)
        document_id_range = range(0, doc_no + 1, int(
            doc_no / 10)) if doc_no > 10 else range(0, doc_no + 1, 1)
        fig, axes = pyplot.subplots(plot_no,
                                    1,
                                    figsize=(2 * 2.0, (plot_no + 1) * 5.0))
        if plot_no > 1:
            axes = axes.flatten()
        else:
            axes = [axes]
        ax_no = 0
        fig.suptitle("Contribution Of '%dth' Dominate Topics In Docs" %
                     nth_topic)
        docs = set()
        for topic_id in contribution_of_dominate_topic_in_docs:
            ax = axes[ax_no]
            ax_no += 1
            label = "%d" % topic_id
            data = DataFrame(
                contribution_of_dominate_topic_in_docs[topic_id],
                columns=["Document_No", "Percentage_of_Contribution"])
            ax.plot('Document_No',
                    'Percentage_of_Contribution',
                    data=data,
                    marker='o',
                    color=self._get_random_color(),
                    label=label,
                    dashes=[6, 2])
            docs.update(data["Document_No"].tolist())

            ax.legend()
            ax.set_ylabel("Contribution of Topic")
            ax.set_title("Topic %d" % topic_id)
            ax.set_xticks(document_id_range)
            ax.set_xticklabels(
                ['Document %d' % document for document in document_id_range],
                rotation=30,
                horizontalalignment='right',
                fontsize=8)
            ax.grid(True)

        fig.savefig(file_path)
        pyplot.close(fig)
        return
Example #9
    def _contribution_of_each_dominate_topic_in_docs(
            self, contribution_of_dominate_topic_in_docs: dict, doc_no,
            model_type: str, nth_topic: int):
        document_id_range = range(0, doc_no + 1, int(
            doc_no / 10)) if doc_no > 10 else range(0, doc_no + 1, 1)
        for topic_id in contribution_of_dominate_topic_in_docs:
            file_path = Advisor.get_topic_folders_file_path_from_topic_version(
                self.topic_version_path, model_type, int(topic_id),
                "Contribution_of_"
                "'%d'th_dominate_topic_in_docs" % nth_topic, "png")
            fig, ax = pyplot.subplots()
            ax.set_title("Contribution Of Topic '%d' ")
            data = DataFrame(
                contribution_of_dominate_topic_in_docs[topic_id],
                columns=["Document_No", "Percentage_of_Contribution"])
            ax.plot('Document_No',
                    'Percentage_of_Contribution',
                    data=data,
                    marker='o',
                    color=self._get_random_color(),
                    dashes=[6, 2])
            ax.set_ylabel("Contribution")
            ax.set_xlabel("Documents")
            ax.set_title("Contribution of Topic %d Over Documents" % topic_id)
            ax.set_xticks(document_id_range)
            ax.set_xticklabels(
                ['Document %d' % document for document in document_id_range],
                rotation=45,
                horizontalalignment='right',
                fontsize=8)
            text = list(data.Percentage_of_Contribution)
            d_mean = round(mean(text))
            d_median = round(median(text))
            d_std = round(std(text))
            d_one_percent = round(quantile(text, q=0.01))
            d_ninety_nine_percent = round(quantile(text, q=0.99))
            d_text = "Mean : {}\nMedian : {}\nStdev: {}\n1%ile : {}\n99%ile : {}".format(
                d_mean, d_median, d_std, d_one_percent, d_ninety_nine_percent)
            ax.text(0.9,
                    0.98,
                    d_text,
                    transform=ax.transAxes,
                    bbox=dict(fc="none"),
                    color='purple')
            ax.grid(True)

            fig.savefig(file_path)
            pyplot.close(fig)
        return
Example #10
    def _plot_number_of_docs_in_each_dominate_topic(
            self, number_of_docs_in_each_dominate_topic: list, model_type,
            nth_topic):
        file_path = Advisor.get_visualization_file_path_from_topic_version(
            self.topic_version_path, model_type, "Number_of_docs_in_each_"
            "'%d'th_dominate_topic" % nth_topic, "png")
        divide_no = 10
        number_of_docs_in_each_dominate_topic = DataFrame(
            number_of_docs_in_each_dominate_topic,
            columns=["Topic_id", "No_of_Docs"])
        number_of_docs_in_each_dominate_topic = number_of_docs_in_each_dominate_topic.sort_values(
            "No_of_Docs")

        topic_no = len(number_of_docs_in_each_dominate_topic)
        plot_no = int(topic_no /
                      divide_no) if topic_no % divide_no == 0 else int(
                          topic_no / divide_no) + 1
        fig, axes = pyplot.subplots(plot_no, 1)
        fig.set_size_inches(divide_no * 2.0, plot_no * 8.0)
        if plot_no > 1:
            axes = axes.flatten()
        else:
            axes = [axes]
        start = 0
        for i in range(plot_no):
            ax = axes[i]
            plot_data = number_of_docs_in_each_dominate_topic[start:start +
                                                              divide_no]
            ax.bar(x='Topic_id',
                   height="No_of_Docs",
                   data=plot_data,
                   width=0.5,
                   alpha=0.3)
            dominate_topic_range = plot_data.Topic_id.to_list()
            ax.set_ylabel('Document Count Percentage')
            ax.set_xlabel('Topics')
            ax.set_xticks(dominate_topic_range)
            ax.set_xticklabels(
                ['Topic %d' % topic for topic in dominate_topic_range],
                rotation=30,
                horizontalalignment='right',
                fontsize=8)
            self._set_bar_plot_text(ax)
            start += divide_no
        fig.suptitle('Number of Documents in Percentage for each Topic',
                     fontsize=14)
        fig.savefig(file_path)
        pyplot.close(fig)
        return
Example #11
    def __init__(self,
                 lang: str,
                 data_version: int,
                 dictionary_version: float,
                 model_version: str,
                 param_name: str,
                 param_version: int,
                 number_of_decimal_digits: int = 5,
                 max_colwidth: int = 100):

        self.topic_version_path = Advisor.get_param_version_folder_path(
            lang, data_version, dictionary_version, model_version, param_name,
            param_version)
        self.number_of_decimal_digits = number_of_decimal_digits
        options.display.max_colwidth = max_colwidth
        return
Example #12
 def get_dictionary(self, lang, data_version, dictionary_version, no_above,
                    no_below, n_most_frequent,
                    language_processed_data: list):
     logging.info("--- Getting dictionary")
     if self.dictionary is None:
         dictionary_file_path = Advisor.get_dictionary_version_folder_file_path(
             lang, data_version, dictionary_version, self.file_types[0][0],
             self.file_types[0][1])
         if path.exists(dictionary_file_path):
             logging.info("---- Dictionary was created before")
             self.dictionary = Dictionary.load(dictionary_file_path)
         else:
             self.set_dictionary(language_processed_data, no_below,
                                 no_above, n_most_frequent,
                                 dictionary_file_path)
     logging.info("--- Dictionary captured")
     return
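set_dictionary is not shown on this page; a minimal sketch of what it plausibly does, assuming language_processed_data is a list of token lists and using gensim's standard Dictionary filters (the body below is an assumption, not the project's actual code):

from gensim.corpora import Dictionary

def set_dictionary(self, language_processed_data, no_below, no_above,
                   n_most_frequent, dictionary_file_path):
    # Assumed implementation: build the dictionary, prune rare/common tokens,
    # drop the n most frequent terms, then persist it for later runs.
    dictionary = Dictionary(language_processed_data)
    dictionary.filter_extremes(no_below=no_below, no_above=no_above)
    dictionary.filter_n_most_frequent(n_most_frequent)
    dictionary.save(dictionary_file_path)
    self.dictionary = dictionary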
Example #13
 def get_corpus(self,
                lang,
                data_version,
                dictionary_version,
                language_processed_data: list = None):
     logging.info("--- Getting corpus")
     if self.corpus is None:
         corpus_file_path = Advisor.get_dictionary_version_folder_file_path(
             lang, data_version, dictionary_version, self.file_types[1][0],
             self.file_types[1][1])
         if path.exists(corpus_file_path):
             logging.info("---- Corpus was created before")
             self.corpus = list(MmCorpus(corpus_file_path))
         else:
             self.set_corpus(language_processed_data, corpus_file_path)
     logging.info("--- Corpus captured")
     return
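set_corpus is likewise not included; a minimal sketch under the same assumptions, converting the token lists to bag-of-words vectors and serializing them in the Matrix Market format that MmCorpus loads above:

from gensim.corpora import MmCorpus

def set_corpus(self, language_processed_data, corpus_file_path):
    # Assumed implementation: bag-of-words vectors built with the dictionary
    # prepared by get_dictionary, persisted so later runs can reload them.
    corpus = [self.dictionary.doc2bow(doc) for doc in language_processed_data]
    MmCorpus.serialize(corpus_file_path, corpus)
    self.corpus = corpus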
Example #14
 def get_model(self, lang, data_version: int, dictionary_version: float, model_version: str, param_name: str, param_version: int,
               language_processed_data: list, model_view: bool):
     logging.info("--- Getting LDA model")
     if self.model is None:
         model_file_path = Advisor.get_model_type_folders_file_path(lang, data_version,
                                                                    dictionary_version, model_version, param_name, param_version,
                                                                    self.model_type, "LDA-model")
         if path.exists(model_file_path):
             logging.info("---- LDA model was crated before")
             self.model = LdaModel.load(model_file_path)
         else:
             self.set_model(lang, data_version, dictionary_version, model_version, param_name, param_version,
                            model_file_path, language_processed_data)
     logging.info("--- LDA model captured")
     if model_view:
         self.visualization.get_model_visualizations(self.model_type, self.model, self.essentials.corpus,
                                                     language_processed_data)
     return self.model
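set_model is also elided; a hedged sketch of the training step with gensim's LdaModel. The self.essentials.dictionary attribute and the num_topics value are assumptions (only self.essentials.corpus appears above):

from gensim.models import LdaModel

def set_model(self, lang, data_version, dictionary_version, model_version,
              param_name, param_version, model_file_path,
              language_processed_data):
    # Assumed implementation: dictionary and corpus are prepared elsewhere
    # (see get_dictionary/get_corpus); num_topics is a stand-in value.
    model = LdaModel(corpus=self.essentials.corpus,
                     id2word=self.essentials.dictionary,
                     num_topics=10,
                     random_state=1)
    model.save(model_file_path)
    self.model = model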
Example #15
 def __read_processed_data(data_file_name: str, version: int):
     """
     :param data_file_name: processed data's file name
     :param version: the version of the data we want to read
     :return: (dict) the processed data that was written down before
     """
     meta = TextPreprocessor.__read_data_meta_file()
     if meta is None:
         return None
     processed_data = dict()
     for lang in meta["languages"]:
         processed_data_file_name = Advisor.get_data_version_folders_file_path(lang, version, data_file_name, "json")
         if not path.exists(processed_data_file_name):
             return None
         with open(processed_data_file_name) as json_file:
             processed_data[lang] = json.load(json_file)
             json_file.close()
     return processed_data
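For reference, the returned dict has one key per language listed in the meta file; what each language maps to depends on what Example #6 serialized. Assuming per-document token lists (which is what the dictionary and corpus code above expects), it looks roughly like:

# Placeholder illustration only; the real content depends on the preprocessing step.
processed_data = {
    "en": [["first", "document", "tokens"], ["second", "document"]],
    "de": [["beispiel", "dokument"]],
}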
Example #16
 def plot_metrics(cls, metrics: dict, explore_param: str, lang, model_name, data_version, dict_version, model_version):
     plot_no = len(metrics)
     fig, axes = pyplot.subplots(plot_no, 1, figsize=(2 * 8.0, (plot_no + 1) * 5.0))
     if plot_no > 1:
         axes = axes.flatten()
     else:
         axes = [axes]
     for i, metric in enumerate(metrics):
         data = DataFrame(metrics[metric], columns=[explore_param, metric])
         ax = axes[i]
         ax.plot(data[explore_param], data[metric], marker='o', )
         ax.set_title("Coherence of %s" % metric)
         ax.set_xlabel(explore_param)
         ax.set_xticks(data[explore_param])
     file_name = "%s-%s-%s" % (model_name, explore_param, model_version)
     fig_file_name = Advisor.get_model_version_folders_file_path(lang, data_version, dict_version,
                                                                 model_version,
                                                                 file_name, "png")
     fig.savefig(fig_file_name)
     pyplot.close(fig=fig)
     return
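plot_metrics expects each metric name to map to rows of [explored_parameter_value, metric_value], since every entry is turned into a two-column DataFrame. An illustrative input (metric names and numbers are made up):

metrics = {
    "c_v": [[5, 0.41], [10, 0.47], [15, 0.44]],
    "u_mass": [[5, -2.9], [10, -2.4], [15, -2.6]],
}
# e.g. view.plot_metrics(metrics, "number_of_topics", "en", "LDA", 1, 1.0, "1")
# (the "view" instance and the argument values here are placeholders)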
Example #17
    def word_cloud_of_top_n_words_in_each_topic(self, top_n: int, lda_model,
                                                model_type):
        cloud = WordCloud(background_color='white',
                          max_words=top_n,
                          max_font_size=5 * top_n,
                          prefer_horizontal=1.0,
                          font_step=5)

        topics = lda_model.show_topics(formatted=False,
                                       num_topics=-1,
                                       num_words=30)
        for topic in topics:
            topic_words = dict(topic[1])
            try:
                data_file_path = Advisor.get_topic_folders_file_path_from_topic_version(
                    self.topic_version_path, model_type, topic[0], "WordCloud",
                    'png')
                cloud.generate_from_frequencies(topic_words,
                                                max_font_size=10 * top_n)
                cloud.to_file(data_file_path.format(topic[0]))
            except OSError as err:
                logging.error(err)
        return
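A minimal standalone illustration of the WordCloud calls used above; the frequency mapping stands in for one topic's (word, weight) pairs and the output path is arbitrary:

from wordcloud import WordCloud

frequencies = {"topic": 0.12, "model": 0.09, "word": 0.05}
cloud = WordCloud(background_color="white", max_words=10, prefer_horizontal=1.0)
cloud.generate_from_frequencies(frequencies)
cloud.to_file("wordcloud-demo.png")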
Example #18
    def topic_words_and_its_joint(self, model, model_type, processed_data):
        topics = model.show_topics(formatted=False,
                                   num_topics=-1,
                                   num_words=30)
        all_topics_words = self._get_topic_words_weight_and_words_counter(
            model, processed_data)
        for topic in topics:
            topic_and_its_words_in_whole_data = dict()
            for word, weight in topic[1]:
                group = all_topics_words.loc[all_topics_words.word ==
                                             word].groupby("topic_id")
                for topic_id, out_line in group:
                    topic_id = int(topic_id)
                    if topic_and_its_words_in_whole_data.get(topic_id,
                                                             None) is None:
                        topic_and_its_words_in_whole_data[topic_id] = list()
                    topic_and_its_words_in_whole_data[topic_id].append([
                        word, out_line.importance.values[0],
                        out_line.word_count.values[0]
                    ])
            sub_plot_number = len(topic_and_its_words_in_whole_data)

            fig, axes = pyplot.subplots(sub_plot_number + 1,
                                        1,
                                        figsize=(2 * 8.0,
                                                 (sub_plot_number + 1) * 5.0))
            if len(topic_and_its_words_in_whole_data) > 1:
                axes = axes.flatten()
            word_count_ax = axes[0]
            topic_word_count = DataFrame(
                topic_and_its_words_in_whole_data[int(topic[0])],
                columns=['word', 'word_importance', 'word_count'])
            word_count_ax.bar(x='word',
                              height="word_count",
                              data=topic_word_count,
                              width=0.5,
                              alpha=0.3)
            self._set_bar_plot_text(word_count_ax)
            word_count_ax.set_ylabel('Word Count')
            word_count_ax.set_title('Topic: %d Word Count' % topic[0],
                                    fontsize=12)
            word_count_ax.set_xticklabels(topic_word_count.word,
                                          rotation=30,
                                          horizontalalignment='right',
                                          fontsize=8)

            ax_n = 1
            for i in topic_and_its_words_in_whole_data:
                tdf = DataFrame(
                    topic_and_its_words_in_whole_data[i],
                    columns=['word', 'word_importance', 'word_count'])
                topic_word_ax = axes[ax_n]
                topic_word_ax.bar(x='word',
                                  height="word_importance",
                                  data=tdf,
                                  width=0.2)
                topic_word_ax.set_title('Topic: %d Word Weight' % i,
                                        fontsize=12)
                topic_word_ax.set_xticklabels(tdf.word,
                                              rotation=30,
                                              horizontalalignment='right',
                                              fontsize=8)
                self._set_bar_plot_text(topic_word_ax)

                ax_n += 1
            # fig.tight_layout(w_pad=2)
            fig.suptitle('Word Count and Importance of Topic Keywords',
                         fontsize=18,
                         y=1.05)

            file_name = "WordCountsOfTopicKeywords"
            fig_file_name = Advisor.get_topic_folders_file_path_from_topic_version(
                self.topic_version_path, model_type, topic[0], file_name,
                "png")
            fig.savefig(fig_file_name)
            pyplot.close(fig=fig)
        return