Beispiel #1
0
    def get_samples_list_by_category_2(self, category_positive):
        """Partition sample ids into positive and unlabeled lists.

        Only samples whose category-1 id (as returned by
        ``Categories.get_category_1_id``) matches that of
        ``category_positive`` are considered; all others are skipped.
        Among those, a sample is positive when its category id equals
        ``category_positive`` and unlabeled otherwise.

        Returns a ``(positive_samples_list, unlabeled_samples_list)`` tuple
        of sample ids, in the iteration order of ``self.sm_matrix``.
        """
        wanted_category_1 = Categories.get_category_1_id(category_positive)

        positives = []
        unlabeled = []
        for sid in self.sm_matrix:
            category_id, _terms, _term_map = self.sm_matrix[sid]

            # Ignore samples that belong to a different category-1 group.
            if Categories.get_category_1_id(category_id) != wanted_category_1:
                continue

            bucket = positives if category_id == category_positive else unlabeled
            bucket.append(sid)

        return positives, unlabeled
Beispiel #2
0
    def get_categories_1_weight_matrix(self):
        """Build the category and sample feature matrices for ``categories_1``.

        For every entry of ``self.get_categories().categories_1`` this:

        * asks ``self.tsm`` for the positive / unlabeled sample lists of the
          category,
        * computes per-term values with
          ``get_terms_positive_degree_by_category`` and stores the first
          element of each term's tuple (``pd_word``) as that term's feature
          weight for the category,
        * records, for each positive sample, its category-1 id and the
          weight of every term of the sample that has a computed value.

        A one-line summary per category is printed to stdout.

        Returns a ``(cfm, sfm)`` tuple: the filled ``CategoryFeatureMatrix``
        and ``SampleFeatureMatrix``.
        """
        tsm = self.tsm
        cfm = CategoryFeatureMatrix()
        sfm = SampleFeatureMatrix()

        categories = self.get_categories()
        for category_name in categories.categories_1:
            category_id = categories.categories_1[category_name]
            positive_samples_list, unlabeled_samples_list = tsm.get_samples_list_by_category_1(category_id)

            # A single parenthesised argument prints the same text whether
            # ``print`` is the Python 2 statement or the Python 3 function.
            print("\n%s(%d) Positive Samples: %d Unlabeled Samples: %d" % (category_name, category_id, len(positive_samples_list), len(unlabeled_samples_list)))

            terms_positive_degree = get_terms_positive_degree_by_category(tsm, positive_samples_list, unlabeled_samples_list)

            # Category-level features: term id -> pd_word.
            features = {}
            for term_id in terms_positive_degree:
                (pd_word, _speciality, _popularity) = terms_positive_degree[term_id]
                features[term_id] = pd_word
            cfm.set_features(category_id, features)

            # Sample-level features, restricted to the positive samples.
            for sample_id in positive_samples_list:
                (sample_category, _sample_terms, term_map) = tsm.get_sample_row(sample_id)
                category_1_id = Categories.get_category_1_id(sample_category)
                sfm.set_sample_category(sample_id, category_1_id)
                for term_id in term_map:
                    if term_id in terms_positive_degree:
                        (pd_word, _speciality, _popularity) = terms_positive_degree[term_id]
                        sfm.add_sample_feature(sample_id, term_id, pd_word)

        return cfm, sfm
Beispiel #3
0
    def to_sklearn_data(self):
        """Convert ``self.sm_matrix`` into a CSR matrix plus label data.

        Each sample becomes one row (in the iteration order of
        ``self.sm_matrix``); each distinct term id becomes one column,
        numbered in order of first appearance. The cell value is the value
        stored for that term in the sample's ``term_map``.

        The matrix dimensions are printed to stdout.

        Returns a ``(X, y, terms, category_map)`` tuple:
            X: ``csr_matrix`` of shape ``(rows, cols)``.
            y: list with one label index per row.
            terms: dict mapping term id -> column index.
            category_map: dict mapping ``category_1_id // 1000000`` ->
                label index.
        """
        indptr = [0]
        indices = []
        data = []
        categories = []
        terms = {}
        category_map = {}
        for sample_id in self.sm_matrix:
            (category_id, _sample_terms, term_map) = self.sm_matrix[sample_id]

            category_1_id = Categories.get_category_1_id(category_id)
            # Floor division keeps the map key an integer regardless of
            # whether true division is in effect (Python 3 or
            # ``from __future__ import division``).
            category_id_1 = category_1_id // 1000000
            category_idx = category_map.setdefault(category_id_1, len(category_map))
            categories.append(category_idx)

            for term_id in term_map:
                # setdefault assigns the next free column to unseen terms.
                term_idx = terms.setdefault(term_id, len(terms))
                indices.append(term_idx)
                data.append(term_map[term_id])
            # Row boundary for the CSR index pointer array.
            indptr.append(len(indices))

        rows = len(self.sm_matrix)
        cols = len(terms)
        # Same "rows cols" output as the Python 2 ``print rows, cols``.
        print("%d %d" % (rows, cols))
        X = csr_matrix((np.array(data), np.array(indices), np.array(indptr)), shape=(rows, cols))
        y = categories

        return X, y, terms, category_map
Beispiel #4
0
def multicategories_predict(samples_test, model_name, result_dir):
    """Train a classifier from a saved model and run it on ``samples_test``.

    Loads the train sample feature matrix from ``<model_name>.sfm`` and the
    train category feature matrix from ``<model_name>.cfm`` (both inside
    ``result_dir`` when it is given), builds a test sample feature matrix
    from ``samples_test``, trains a ``Classifier`` on the train matrix and
    calls ``clf.predict`` on the test data together with the list of
    category display names.

    Args:
        samples_test: object exposing ``tsm`` (with ``sample_matrix()`` and
            ``get_sample_row()``) and ``get_categories()``.
        model_name: base name of the model files; must be non-empty.
        result_dir: directory holding the model files, or ``None`` to use
            paths relative to the current directory. It is created with
            ``os.mkdir`` when it does not exist yet.

    Returns:
        None. The function also returns early, after logging, when
        ``model_name`` is empty or ``result_dir`` cannot be created.
    """
    if not model_name:
        logging.warning(Logger.warn("model_name must not be NULL."))
        return

    if result_dir is None:
        model_prefix = model_name
    else:
        if not os.path.isdir(result_dir):
            try:
                os.mkdir(result_dir)
            except OSError:
                logging.error(Logger.error("mkdir %s failed." % (result_dir)))
                return
        model_prefix = os.path.join(result_dir, model_name)
    cfm_file = "%s.cfm" % (model_prefix)
    sfm_file = "%s.sfm" % (model_prefix)

    # This is a debug-level message, so it is wrapped with Logger.debug like
    # every other progress message in this function.
    logging.debug(Logger.debug("Loading train sample feature matrix ..."))
    sfm_train = SampleFeatureMatrix()
    sfm_train.load(sfm_file)
    logging.debug(Logger.debug("Loading train category feature matrix ..."))
    cfm_train = CategoryFeatureMatrix()
    cfm_train.load(cfm_file)

    logging.debug(Logger.debug("Making sample feature matrix for test data ..."))
    # The test matrix reuses the train matrix's category/feature id maps.
    sfm_test = SampleFeatureMatrix(sfm_train.get_category_id_map(), sfm_train.get_feature_id_map())

    # NOTE(review): the features of this one hard-coded category are applied
    # to every test sample - confirm that this is intended.
    feature_category_id = 2000000
    features = cfm_train.get_features(feature_category_id)

    for sample_id in samples_test.tsm.sample_matrix():
        (sample_category, _sample_terms, term_map) = samples_test.tsm.get_sample_row(sample_id)

        category_1_id = Categories.get_category_1_id(sample_category)

        sfm_test.set_sample_category(sample_id, category_1_id)
        # Keep only the model features that occur in this sample, using the
        # weight stored in the model.
        for feature_id in features:
            if feature_id in term_map:
                feature_weight = features[feature_id]
                sfm_test.add_sample_feature(sample_id, feature_id, feature_weight)

    logging.debug(Logger.debug("train sample feature matrix - features:%d categories:%d" % (sfm_train.get_num_features(), sfm_train.get_num_categories())))
    X_train, y_train = sfm_train.to_sklearn_data()

    logging.debug(Logger.debug("test sample feature matrix - features:%d categories:%d" % (sfm_test.get_num_features(), sfm_test.get_num_categories())))
    X_test, y_test = sfm_test.to_sklearn_data()

    clf = Classifier()

    logging.debug(Logger.debug("Classifier training ..."))
    clf.train(X_train, y_train)

    logging.debug(Logger.debug("Classifier predicting ..."))

    categories = samples_test.get_categories()

    # Map each category index of the test matrix to (id, name), then build
    # the "name(id)" display list in the order produced by sorted_dict.
    categories_1_idx_map = {}
    for category_id in categories.get_categories_1_idlist():
        category_idx = sfm_test.get_category_idx(category_id)
        category_name = categories.get_category_name(category_id)
        categories_1_idx_map[category_idx] = (category_id, category_name)

    categories_1_names = []
    for (category_idx, (category_id, category_name)) in sorted_dict(categories_1_idx_map):
        categories_1_names.append("%s(%d)" % (category_name, category_id))

    clf.predict(X_test, y_test, categories_1_names)