Beispiel #1
0
    def get_categories_1_weight_matrix(self):
        """Build the category and sample feature matrices for level-1 categories.

        For every entry of ``self.get_categories().categories_1`` the positive
        and unlabeled sample lists are fetched from ``self.tsm`` and a positive
        degree is computed for each term.  Then:

        * the ``pd_word`` value of every term is stored as a feature of the
          category in a ``CategoryFeatureMatrix``;
        * every positive sample is registered in a ``SampleFeatureMatrix``
          with its level-1 category id and one feature per term of its term
          map that has a positive degree.

        A one-line summary per category is printed to stdout.

        Returns:
            A ``(cfm, sfm)`` tuple: the filled ``CategoryFeatureMatrix`` and
            ``SampleFeatureMatrix``.
        """
        tsm = self.tsm
        cfm = CategoryFeatureMatrix()
        sfm = SampleFeatureMatrix()

        categories = self.get_categories()
        for category_name in categories.categories_1:
            category_id = categories.categories_1[category_name]
            positive_samples_list, unlabeled_samples_list = tsm.get_samples_list_by_category_1(category_id)

            # Parenthesised single-argument form: prints the same text with the
            # Python 2 print statement and the Python 3 print function.
            print("\n%s(%d) Positive Samples: %d Unlabeled Samples: %d" % (
                category_name,
                category_id,
                len(positive_samples_list),
                len(unlabeled_samples_list),
            ))

            terms_positive_degree = get_terms_positive_degree_by_category(tsm, positive_samples_list, unlabeled_samples_list)

            # Each value is a (pd_word, speciality, popularity) triple; only
            # pd_word is used as the feature weight.
            features = {}
            for term_id in terms_positive_degree:
                pd_word, _speciality, _popularity = terms_positive_degree[term_id]
                features[term_id] = pd_word
            cfm.set_features(category_id, features)

            for sample_id in positive_samples_list:
                sample_category, _sample_terms, term_map = tsm.get_sample_row(sample_id)
                category_1_id = Categories.get_category_1_id(sample_category)
                sfm.set_sample_category(sample_id, category_1_id)
                for term_id in term_map:
                    # Terms without a positive degree contribute no feature.
                    if term_id in terms_positive_degree:
                        pd_word, _speciality, _popularity = terms_positive_degree[term_id]
                        sfm.add_sample_feature(sample_id, term_id, pd_word)

        return cfm, sfm
Beispiel #2
0
    def show_category_keywords(self, result_dir):
        """Save the term positive degrees of every level-2 category to a file.

        ``result_dir`` is created with ``os.mkdir`` when it is not already a
        directory; if that fails with ``OSError`` an error is logged and the
        method returns without processing any category.

        For each entry of ``self.get_categories().categories_2`` a summary
        line is printed and the computed term positive degrees are handed to
        ``pd.save_terms_positive_degree`` together with
        ``self.corpus.vocabulary`` and the target path
        ``<result_dir>/keywords_<category_id>_<category_name>.txt``.
        """
        # Make sure the output directory exists before any file is written.
        if not os.path.isdir(result_dir):
            try:
                os.mkdir(result_dir)
            except OSError:
                message = "mkdir %s failed." % (result_dir)
                logging.error(Logger.error(message))
                return

        tsm = self.tsm
        categories = self.get_categories()

        for category_name in categories.categories_2:
            category_id = categories.categories_2[category_name]
            samples = tsm.get_samples_list_by_category_2(category_id)
            positive_samples_list, unlabeled_samples_list = samples

            summary = "%s(%d) Positive Samples: %d Unlabeled Samples: %d" % (
                category_name,
                category_id,
                len(positive_samples_list),
                len(unlabeled_samples_list),
            )
            print(summary)

            terms_positive_degree = get_terms_positive_degree_by_category(
                tsm, positive_samples_list, unlabeled_samples_list
            )

            vocabulary = self.corpus.vocabulary
            keywords_path = "%s/keywords_%d_%s.txt" % (result_dir, category_id, category_name)
            pd.save_terms_positive_degree(terms_positive_degree, vocabulary, keywords_path)

            # Not read anywhere in the visible part of this method; kept
            # exactly as in the original.
            samples_positive = None
            samples_unlabeled = None