def train(self, features_file_path, save_dir_path):
        """Train a random-forest genre classifier and pickle it to disk.

        Loads un-normalised numerical features, binarises the multi-label
        genre targets over a fixed label space of 16 genre ids, fits a
        ``RandomForestClassifier``, prints the test-set score together with a
        few sample predictions, and writes the fitted model to
        ``<save_dir_path>/model.pickle``.

        Args:
            features_file_path: Path handed to
                ``data_loading.load_numerical_data``.
            save_dir_path: Directory in which ``model.pickle`` is written.
        """
        train_features, train_labels, test_features, test_labels = data_loading.load_numerical_data(
            features_file_path, normalise=False)

        # Fix the label space to the 16 genre ids up front.  The previous
        # ``MultiLabelBinarizer.set_params(range(0, 16))`` call was a no-op
        # (it was invoked on the class with the range standing in for
        # ``self``), so the column layout depended on which ids happened to
        # occur in the data.
        mlb = MultiLabelBinarizer(classes=list(range(16)))
        # Used to create a baseline for random chance
        # np.random.shuffle(train_labels)
        train_labels = mlb.fit_transform(np.array(train_labels))
        # Re-use the fitted binarizer: refitting on the test labels could
        # produce a different column layout than the one the model is
        # trained on.
        test_labels = mlb.transform(np.array(test_labels))

        # Reserve the first 4 tracks in test set for displaying predictions to dev
        predict_features = test_features[:4]
        predict_labels = test_labels[:4]

        test_features = test_features[4:]
        test_labels = test_labels[4:]

        print('Training RF...')
        model = sklearn.ensemble.RandomForestClassifier(verbose=1)
        model.fit(train_features, train_labels)

        print('Evaluating RF...')
        print('Accuracy: ' + str(model.score(test_features, test_labels)))

        print()
        print('Model Predictions:')
        # Cast the indicator matrix to ints so it prints like the labels below.
        print(np.array([[int(s) for s in x] for x in model.predict(predict_features)]))

        print()
        print('Correct Labels:')
        print(predict_labels)

        # Context manager guarantees the file is flushed and closed.
        with open(os.path.join(save_dir_path, "model.pickle"), 'wb') as model_file:
            pickle.dump(model, model_file)
# Example #2
# 0
class DataProcess(object):  # Feature preprocessing
    """Select one preprocessing object by name and forward calls to it.

    ``process_type`` chooses the wrapped object stored in ``processmodule``:

    * ``"Binary"``          -> ``Binarizer`` (threshold 0.0)
    * ``"MinMax"``          -> ``MinMaxScaler`` (range 0..1)
    * ``"Stand"``           -> ``StandardScaler``
    * ``"Normal"``          -> ``Normalizer`` (l2 norm)
    * ``"MultiLabelBinar"`` -> ``MultiLabelBinarizer``

    Raises:
        ValueError: If ``process_type`` is not one of the names above.
    """

    def __init__(self, process_type):
        self.process_type = process_type

        if self.process_type == "Binary":  # Binarisation
            self.processmodule = Binarizer(copy=True, threshold=0.0)
            # Values above threshold map to 1, values at or below it map to 0

        elif self.process_type == "MinMax":  # Min-max scaling
            self.processmodule = MinMaxScaler(feature_range=(0, 1), copy=True)

        elif self.process_type == "Stand":  # Standardisation
            self.processmodule = StandardScaler(copy=True, with_mean=True, with_std=True)

        elif self.process_type == "Normal":  # Per-sample normalisation
            self.processmodule = Normalizer(copy=True, norm="l2")  # One of l1, max, l2

        elif self.process_type == "MultiLabelBinar":   # Multi-label binarisation
            self.processmodule = MultiLabelBinarizer(sparse_output=False)  # Pass True for sparse (CSR) output
        else:
            raise ValueError("please select a correct process_type")

    def fit_transform(self, data):
        """Fit the wrapped object on ``data`` and return the transformed data."""
        return self.processmodule.fit_transform(data)

    def fit(self, data):
        """Fit the wrapped object on ``data``."""
        self.processmodule.fit(data)

    def transform(self, data):
        """Return ``data`` transformed by the (already fitted) wrapped object."""
        # The result used to be discarded, so callers always received None.
        return self.processmodule.transform(data)

    def set_params(self, params):
        """Forward the ``params`` dict as keyword arguments to ``set_params``."""
        self.processmodule.set_params(**params)

    def get_params(self):
        """Return the wrapped object's parameters (``deep=True``)."""
        return self.processmodule.get_params(deep=True)

    def get_classes(self):
        """Return the distinct class values learned by the label binarizer."""
        assert self.process_type in {"MultiLabelBinar"}
        return self.processmodule.classes_

    def inverse_transform(self, data):
        """Map transformed ``data`` back to the original representation."""
        assert self.process_type in {"MultiLabelBinar", "MinMax", "Stand"}
        return self.processmodule.inverse_transform(data)

    def invser_transform(self, data):
        """Misspelled alias of :meth:`inverse_transform`, kept for existing callers."""
        return self.inverse_transform(data)

    def get_max(self):
        """Return the per-feature maximum seen by the min-max scaler."""
        # Only MinMaxScaler exposes data_max_; StandardScaler has no such
        # attribute, so "Stand" is rejected here instead of failing later.
        assert self.process_type in {"MinMax"}
        return self.processmodule.data_max_

    def get_min(self):
        """Return the per-feature minimum seen by the min-max scaler."""
        # Only MinMaxScaler exposes data_min_ (see get_max).
        assert self.process_type in {"MinMax"}
        return self.processmodule.data_min_

    def partial_fit(self, data):
        """Incrementally update the scaler's statistics with ``data``.

        The wrapped ``partial_fit`` needs the new batch; the earlier
        zero-argument call could never succeed.
        """
        assert self.process_type in {"MinMax", "Stand"}
        return self.processmodule.partial_fit(data)
    def classify(self, features):
        model = pickle.load(open("classification\\numerical\\random_forest\\model\\model.pickle", 'rb'))
        result = model.predict(features)

        # todo put this functionality into the common classifier template
        MultiLabelBinarizer.set_params(range(0, 16))
        mlb = MultiLabelBinarizer()
        mlb.fit([range(0, 16)])
        genre_predictions_categorized = mlb.inverse_transform(result)

        if len(genre_predictions_categorized) == 0 or not all(genre_predictions_categorized):
            return ["Unclassifiable"]

        genre_predictions_categorized = [x[0] for x in mlb.inverse_transform(result)]  # this needs checkinf for which value o fthe tuple is the actual value

        genre_predictions = []
        lm = LabelManipulator()
        for label in genre_predictions_categorized:
            genre_predictions.append(lm.uncategorise_genre(label))

        # convert the ids to names

        return genre_predictions
    return np.array(new_features), np.array(new_labels)


# Test accuracy with only best populated genres -> Experimental (38k), Electronic (34k), Rock (33k)
def cut_all_but_3_genres(train_features, train_labels, test_features, test_labels):
    """Run ``cut_genres_from_list`` over the train split and the test split.

    Each (features, labels) pair is filtered independently by the helper;
    the four filtered collections are returned in the same order as the
    arguments.
    """
    kept_train_x, kept_train_y = cut_genres_from_list(train_features, train_labels)
    kept_test_x, kept_test_y = cut_genres_from_list(test_features, test_labels)
    return kept_train_x, kept_train_y, kept_test_x, kept_test_y

# Load the un-normalised numerical features and their genre labels.
train_features, train_labels, test_features, test_labels = data_loading.load_numerical_data(normalise=False)

# train_features, train_labels, test_features, test_labels = cut_all_but_3_genres(train_features, train_labels, test_features, test_labels)

# Fix the label space to the 16 genre ids.  The previous
# ``MultiLabelBinarizer.set_params(range(0, 16))`` call was a no-op (invoked
# on the class with the range standing in for ``self``).
mlb = MultiLabelBinarizer(classes=list(range(16)))
train_labels = np.array(train_labels)
# NOTE(review): shuffling the labels decouples them from their features, so
# the model below is a random-chance baseline.  Remove this line to train on
# the real feature/label pairing.
np.random.shuffle(train_labels)
train_labels = mlb.fit_transform(train_labels)
# Re-use the fitted binarizer so test columns line up with the training columns.
test_labels = mlb.transform(np.array(test_labels))

# Hold back the first 4 test samples; the rest stay as the test set.
predict_features = test_features[:4]
predict_labels = test_labels[:4]

test_features = test_features[4:]
test_labels = test_labels[4:]

print('Training RF...')
model = RandomForestClassifier(verbose=1)
model.fit(train_features, train_labels)