Exemple #1
0
def PULearning_test(samples_positive, samples_unlabeled):
    """Run the positive-degree pipeline on a positive / unlabeled sample pair.

    Steps performed, in order:

    1. Select terms with ``select_features_by_positive_degree`` from the
       ``tsm`` attribute of both sample sets.
    2. Save the selected terms to ``./result/keywords.txt`` using the
       vocabulary of the positive samples' corpus.
    3. Compute the positive degree of the unlabeled samples (at most 20
       terms, via ``max_terms``) and save it.

    Nothing is returned; results are handed to the ``pd.save_*`` helpers.
    """
    # Thresholds in the order the selector receives them:
    # (pd_word, speciality, popularity).
    selection_thresholds = (0.4, 0.8, 0.01)

    positive_matrix = samples_positive.tsm
    unlabeled_matrix = samples_unlabeled.tsm

    term_degrees = select_features_by_positive_degree(
        positive_matrix, unlabeled_matrix, selection_thresholds)

    vocab = samples_positive.corpus.vocabulary
    pd.save_terms_positive_degree(term_degrees, vocab, "./result/keywords.txt")

    # Scoring of the positive set itself is currently disabled:
    #samples_positive_degree_P = pd.calculate_samples_positive_degree(tsm_positive, terms_positive_degree, max_terms = 20)
    #pd.save_samples_positive_degree(samples_positive, samples_positive_degree_P)

    unlabeled_degrees = pd.calculate_samples_positive_degree(
        unlabeled_matrix, term_degrees, max_terms=20)
    pd.save_samples_positive_degree(samples_unlabeled, unlabeled_degrees)
Exemple #2
0
    def show_category_keywords(self, result_dir):
        """Write a per-category keyword file into ``result_dir``.

        Creates ``result_dir`` with ``os.mkdir`` when it is not already a
        directory; if that raises ``OSError`` an error is logged and the
        method returns without doing anything else.

        For every entry of ``self.get_categories().categories_2`` the method
        asks ``self.tsm`` for the positive and unlabeled sample lists of
        that category, prints their sizes, computes the terms' positive
        degree and saves it to
        ``<result_dir>/keywords_<category_id>_<category_name>.txt``.

        :param result_dir: path of the directory that receives the keyword
            files.
        """
        if not os.path.isdir(result_dir):
            try:
                # os.mkdir creates a single directory level only.
                os.mkdir(result_dir)
            except OSError:
                # NOTE(review): the return value of Logger.error(...) is what
                # gets passed to logging.error -- confirm that Logger.error
                # returns the formatted message rather than None.
                logging.error(Logger.error("mkdir %s failed." % (result_dir)))
                return

        tsm = self.tsm

        categories = self.get_categories()
        # categories_2 is iterated as a mapping: category name -> category id.
        for category_name in categories.categories_2:
            category_id = categories.categories_2[category_name]
            positive_samples_list, unlabeled_samples_list = tsm.get_samples_list_by_category_2(category_id)

            # Python 2 print statement: reports both sample counts per category.
            print "%s(%d) Positive Samples: %d Unlabeled Samples: %d" % (category_name, category_id, len(positive_samples_list), len(unlabeled_samples_list))

            terms_positive_degree = get_terms_positive_degree_by_category(tsm, positive_samples_list, unlabeled_samples_list)

            # One keyword file per category, named after its id and name.
            pd.save_terms_positive_degree(terms_positive_degree, self.corpus.vocabulary, "%s/keywords_%d_%s.txt" % (result_dir, category_id, category_name))

            # NOTE(review): these two names are assigned but never read in
            # the code visible here -- confirm whether they are still needed.
            samples_positive = None
            samples_unlabeled = None