def get_categories_1_weight_matrix(self):
    """Build category and sample feature matrices for the level-1 categories.

    For every entry of ``categories.categories_1`` the positive and
    unlabeled sample lists are fetched from ``self.tsm`` and handed to
    ``get_terms_positive_degree_by_category``.  The first element of each
    returned per-term tuple (``pd_word``) is used as the feature weight:

    * the category feature matrix receives ``{term_id: pd_word}`` for the
      category;
    * the sample feature matrix receives, for each positive sample, its
      level-1 category id and one ``pd_word`` feature per term of the
      sample that also appears in the category's term table.

    A one-line summary per category is printed to stdout.

    Returns:
        A ``(cfm, sfm)`` tuple holding the ``CategoryFeatureMatrix`` and
        the ``SampleFeatureMatrix`` that were filled in.
    """
    tsm = self.tsm
    cfm = CategoryFeatureMatrix()
    sfm = SampleFeatureMatrix()
    categories = self.get_categories()

    for category_name in categories.categories_1:
        category_id = categories.categories_1[category_name]
        positive_samples_list, unlabeled_samples_list = \
            tsm.get_samples_list_by_category_1(category_id)

        # Single-argument parenthesised form: prints the same text whether
        # it is parsed as a Python 2 print statement or a print() call.
        print("\n%s(%d) Positive Samples: %d Unlabeled Samples: %d" % (
            category_name, category_id,
            len(positive_samples_list), len(unlabeled_samples_list)))

        terms_positive_degree = get_terms_positive_degree_by_category(
            tsm, positive_samples_list, unlabeled_samples_list)

        # Category-level weights: only pd_word is kept; the other two
        # tuple members (speciality, popularity) are not used here.
        features = {}
        for term_id in terms_positive_degree:
            (pd_word, _speciality, _popularity) = terms_positive_degree[term_id]
            features[term_id] = pd_word
        cfm.set_features(category_id, features)

        # Sample-level weights for the positive samples of this category.
        for sample_id in positive_samples_list:
            (sample_category, _sample_terms, term_map) = tsm.get_sample_row(sample_id)
            category_1_id = Categories.get_category_1_id(sample_category)
            sfm.set_sample_category(sample_id, category_1_id)
            for term_id in term_map:
                # Terms absent from the category's table carry no weight.
                if term_id in terms_positive_degree:
                    (pd_word, _speciality, _popularity) = terms_positive_degree[term_id]
                    sfm.add_sample_feature(sample_id, term_id, pd_word)

    return cfm, sfm
def transform(tsm, sfm, fw_type, feature_weights=None):
    """Apply the selected feature-weighting scheme and return the matrix.

    Args:
        tsm: Term/sample matrix; must provide ``get_total_samples()``,
            ``get_total_terms()`` and ``get_categories()``.
        sfm: Sample feature matrix to fill, or ``None`` to have a fresh
            ``SampleFeatureMatrix`` created here.
        fw_type: One of ``FeatureWeight.TFIDF``, ``FeatureWeight.TFRF`` or
            ``FeatureWeight.TFIPNDF``.  Any other value leaves ``sfm``
            without a weighting pass.
        feature_weights: Passed through unchanged to the weighting routine.

    Returns:
        The sample feature matrix returned by the weighting routine (or
        the unweighted ``sfm`` when ``fw_type`` matches no known scheme).
    """
    input_summary = "FeatureWeight.transform() tsm: %d samples %d terms." % (
        tsm.get_total_samples(), tsm.get_total_terms())
    logging.debug(Logger.debug(input_summary))

    if sfm is None:
        sfm = SampleFeatureMatrix()
        # NOTE(review): the flattened source does not show whether this
        # call belonged inside the ``if`` or ran for every matrix; this
        # assumes it only initialises a freshly created one -- confirm.
        sfm.init_cagegories(tsm.get_categories())

    # Pick the weighting routine first, then invoke it in one place.
    weighting = None
    if fw_type == FeatureWeight.TFIDF:
        weighting = FeatureWeight.transform_tfidf
    elif fw_type == FeatureWeight.TFRF:
        weighting = FeatureWeight.transform_tfrf
    elif fw_type == FeatureWeight.TFIPNDF:
        weighting = FeatureWeight.transform_tfipndf
    if weighting is not None:
        sfm = weighting(tsm, sfm, feature_weights)

    sample_count = sfm.get_num_samples()
    feature_count = sfm.get_num_features()
    category_count = sfm.get_num_categories()
    output_summary = (
        "FeatureWeight.transform(). sfm: %d samples %d terms %d categories."
        % (sample_count, feature_count, category_count))
    logging.debug(Logger.debug(output_summary))

    return sfm
def multicategories_predict(samples_test, model_name, result_dir):
    """Train a classifier from a saved model and run it on test samples.

    The train sample feature matrix (``<model_name>.sfm``) and category
    feature matrix (``<model_name>.cfm``) are loaded from ``result_dir``
    (or from the current working directory when ``result_dir`` is
    ``None``).  A test sample feature matrix is then built over the
    training category/feature id maps, a ``Classifier`` is trained on the
    train data, and ``predict`` is called on the test data together with
    the level-1 category display names ordered by category index.

    Args:
        samples_test: Test sample set; must expose ``tsm`` and
            ``get_categories()``.
        model_name: Base name of the model files.  ``None`` or an empty
            value logs a warning and aborts.
        result_dir: Directory holding the model files, or ``None``.  A
            missing directory is created; if that fails an error is logged
            and the function aborts.

    Returns:
        None.  Results are whatever ``Classifier.predict`` reports.
    """
    if not model_name:
        logging.warning(Logger.warn("model_name must not be NULL."))
        return

    if result_dir is None:
        cfm_file = "%s.cfm" % (model_name)
        sfm_file = "%s.sfm" % (model_name)
    else:
        if not os.path.isdir(result_dir):
            try:
                os.mkdir(result_dir)
            except OSError:
                logging.error(Logger.error("mkdir %s failed." % (result_dir)))
                return
        cfm_file = "%s/%s.cfm" % (result_dir, model_name)
        sfm_file = "%s/%s.sfm" % (result_dir, model_name)

    # This is a debug-level progress message, so it is wrapped with
    # Logger.debug like every other debug message in this function.
    logging.debug(Logger.debug("Loading train sample feature matrix ..."))
    sfm_train = SampleFeatureMatrix()
    sfm_train.load(sfm_file)

    logging.debug(Logger.debug("Loading train category feature matrix ..."))
    cfm_train = CategoryFeatureMatrix()
    cfm_train.load(cfm_file)

    logging.debug(Logger.debug("Making sample feature matrix for test data ..."))
    # NOTE(review): hard-coded category id whose trained feature weights
    # are applied to every test sample -- confirm which category this is
    # and whether it should be configurable.
    feature_source_category_id = 2000000
    # Reuse the training id maps when building the test matrix.
    sfm_test = SampleFeatureMatrix(sfm_train.get_category_id_map(),
                                   sfm_train.get_feature_id_map())
    features = cfm_train.get_features(feature_source_category_id)

    test_tsm = samples_test.tsm
    for sample_id in test_tsm.sample_matrix():
        (sample_category, _sample_terms, term_map) = test_tsm.get_sample_row(sample_id)
        category_1_id = Categories.get_category_1_id(sample_category)
        sfm_test.set_sample_category(sample_id, category_1_id)
        # Only trained features that occur in this sample are added.
        for feature_id in features:
            if feature_id in term_map:
                sfm_test.add_sample_feature(sample_id, feature_id, features[feature_id])

    logging.debug(Logger.debug(
        "train sample feature matrix - features:%d categories:%d"
        % (sfm_train.get_num_features(), sfm_train.get_num_categories())))
    X_train, y_train = sfm_train.to_sklearn_data()

    logging.debug(Logger.debug(
        "test sample feature matrix - features:%d categories:%d"
        % (sfm_test.get_num_features(), sfm_test.get_num_categories())))
    X_test, y_test = sfm_test.to_sklearn_data()

    clf = Classifier()

    logging.debug(Logger.debug("Classifier training ..."))
    clf.train(X_train, y_train)

    logging.debug(Logger.debug("Classifier predicting ..."))

    # Map each level-1 category's index in the test matrix to its id and
    # name, then list the display names in the order sorted_dict yields.
    categories = samples_test.get_categories()
    categories_1_idx_map = {}
    for category_id in categories.get_categories_1_idlist():
        category_idx = sfm_test.get_category_idx(category_id)
        category_name = categories.get_category_name(category_id)
        categories_1_idx_map[category_idx] = (category_id, category_name)

    categories_1_names = [
        "%s(%d)" % (category_name, category_id)
        for (_category_idx, (category_id, category_name))
        in sorted_dict(categories_1_idx_map)
    ]

    clf.predict(X_test, y_test, categories_1_names)