def get_categories_1_weight_matrix(self):
    """Build the category and sample feature matrices for level-1 categories.

    For every entry of ``categories.categories_1`` (indexed by category name
    to obtain the category id) this method:

    1. asks ``self.tsm`` for the positive and unlabeled sample lists of the
       category and prints their sizes;
    2. computes the per-term positive-degree tuples with
       ``get_terms_positive_degree_by_category``;
    3. stores ``term_id -> pd_word`` (the first element of each tuple) as the
       category's features in the ``CategoryFeatureMatrix``;
    4. for every positive sample, records its level-1 category id and adds
       one sample feature per term of the sample that also has a
       positive-degree entry.

    Returns:
        A ``(cfm, sfm)`` tuple: the populated ``CategoryFeatureMatrix`` and
        ``SampleFeatureMatrix``.
    """
    tsm = self.tsm
    cfm = CategoryFeatureMatrix()
    sfm = SampleFeatureMatrix()

    categories = self.get_categories()
    for category_name in categories.categories_1:
        category_id = categories.categories_1[category_name]
        positive_samples_list, unlabeled_samples_list = \
            tsm.get_samples_list_by_category_1(category_id)

        # Single-argument print(...) emits the same text under Python 2 and
        # is also valid Python 3 syntax.
        print("\n%s(%d) Positive Samples: %d Unlabeled Samples: %d" % (category_name, category_id, len(positive_samples_list), len(unlabeled_samples_list)))

        terms_positive_degree = get_terms_positive_degree_by_category(
            tsm, positive_samples_list, unlabeled_samples_list)

        # Category features: only the first tuple member (pd_word) is kept.
        # The 3-way unpacking is retained so a malformed entry still raises.
        features = {}
        for term_id in terms_positive_degree:
            pd_word, _speciality, _popularity = terms_positive_degree[term_id]
            features[term_id] = pd_word
        cfm.set_features(category_id, features)

        # Sample features: restricted to terms that have a positive-degree
        # entry for the current category.
        for sample_id in positive_samples_list:
            sample_category, _sample_terms, term_map = tsm.get_sample_row(sample_id)
            category_1_id = Categories.get_category_1_id(sample_category)
            sfm.set_sample_category(sample_id, category_1_id)
            for term_id in term_map:
                if term_id not in terms_positive_degree:
                    continue
                pd_word, _speciality, _popularity = terms_positive_degree[term_id]
                sfm.add_sample_feature(sample_id, term_id, pd_word)

    return cfm, sfm
def show_category_keywords(self, result_dir):
    """Write one keyword file per level-2 category into ``result_dir``.

    ``result_dir`` is created when it does not exist yet, including any
    missing parent directories. If creation fails, the error is logged and
    the method returns without writing anything.

    For every entry of ``categories.categories_2`` (indexed by category name
    to obtain the category id) the method fetches the positive and unlabeled
    sample lists from ``self.tsm``, prints their sizes, computes the terms'
    positive degree with ``get_terms_positive_degree_by_category`` and hands
    the result, together with ``self.corpus.vocabulary``, to
    ``pd.save_terms_positive_degree``. The target path is
    ``<result_dir>/keywords_<category_id>_<category_name>.txt``.

    Args:
        result_dir: Directory that receives the keyword files.

    Returns:
        None.
    """
    if not os.path.isdir(result_dir):
        try:
            # makedirs rather than mkdir so that a nested output path with
            # missing parents does not abort the export.
            os.makedirs(result_dir)
        except OSError:
            logging.error(Logger.error("mkdir %s failed." % (result_dir)))
            return

    tsm = self.tsm
    categories = self.get_categories()
    for category_name in categories.categories_2:
        category_id = categories.categories_2[category_name]
        positive_samples_list, unlabeled_samples_list = \
            tsm.get_samples_list_by_category_2(category_id)

        # Single-argument print(...) emits the same text under Python 2 and
        # is also valid Python 3 syntax.
        print("%s(%d) Positive Samples: %d Unlabeled Samples: %d" % (category_name, category_id, len(positive_samples_list), len(unlabeled_samples_list)))

        terms_positive_degree = get_terms_positive_degree_by_category(
            tsm, positive_samples_list, unlabeled_samples_list)
        pd.save_terms_positive_degree(
            terms_positive_degree,
            self.corpus.vocabulary,
            "%s/keywords_%d_%s.txt" % (result_dir, category_id, category_name))