Esempio n. 1
0
def main():
    """Demonstrate the data-shift detector on a single wine-quality feature.

    A model is fitted on the ``volatile_acidity`` column only. The held-out
    data is cut in two halves: the first half is fed to the detector as is,
    the second half is fed after scaling every value in the open interval
    (0.4, 1.0) by 3. Both halves are plotted and the detector's verdict is
    printed.
    """
    data = wine.wine_quality_red_csv()
    print(data.columns)

    # Single feature, reshaped to a column vector; target is 'class'.
    features = data['volatile_acidity'].values.reshape(-1, 1)
    labels = data['class']
    X_train, X_test, y_train, y_test = split(
        features, labels, test_size=0.2, random_state=0)

    # Cut the held-out set in half; the label halves are not used below.
    X_first_half, X_second_half, _y_first_half, _y_second_half = split(
        X_test, y_test, test_size=.5, random_state=0)

    model = (WineQualityPipeline()
             .with_estimator(RandomForest())
             .fit(X_train, y_train))

    shift_detector = SklearnDataShiftDetector(model, n_bins=30)
    shift_detector.iteration(X_first_half)

    # Work on a copy so the untouched second half stays available.
    new_second_half = X_second_half.copy()
    in_band = (X_second_half > .4) & (X_second_half < 1.)
    new_second_half[in_band] *= 3.

    # Green dots: untouched first half; red triangles: perturbed second half.
    plt.plot(range(X_first_half.shape[0]), X_first_half, 'go')
    plt.plot(range(new_second_half.shape[0]), new_second_half, 'r^')
    plt.show()

    shift_detector.iteration(new_second_half)
    print(shift_detector.data_is_shifted())
Esempio n. 2
0
def main():
    """Check for a data shift between two Amazon review categories.

    Review texts and ratings are read from the line-delimited JSON files
    ``Electronics_5.json`` (training) and ``Books_5.json`` (test) under
    ``<resource path>/data/amazon``. A hashing pipeline with a random forest
    is fitted on the first ``size`` training reviews; the shift detector is
    then shown the first ``size`` training and test reviews and its verdict
    is printed.
    """
    resource_path = get_resource_path()
    folder = os.path.join(resource_path, 'data/amazon')
    datafile_train = 'Electronics_5.json'
    datafile_test = 'Books_5.json'
    size = 100

    def load_reviews(filename):
        """Return (review texts, float ratings) from one JSON-lines file."""
        texts, ratings = [], []
        # The context manager closes the handle even if a line fails to parse.
        with open(os.path.join(folder, filename), encoding='utf-8') as handle:
            for line in handle:
                content = json.loads(line)
                texts.append(content["reviewText"])
                ratings.append(float(content["overall"]))
        return texts, ratings

    X_train, y_train = load_reviews(datafile_train)
    X_test, _y_test = load_reviews(datafile_test)

    pipeline = HashingPipeline(RandomForest())
    model = pipeline.fit(X_train[:size], y_train[:size])

    shift_detector = SklearnDataShiftDetector(model, n_bins=1000)
    # Use the same sample size for fitting and for both detector batches.
    shift_detector.iteration(X_train[:size])
    shift_detector.iteration(X_test[:size])
    print(shift_detector.data_is_shifted())
Esempio n. 3
0
    def test_typos_in_data_with_random_forest(self):
        """Typos in three random columns must trigger range tests/warnings.

        A random forest is fitted through ``self.pipeline``, typos are
        injected into three randomly chosen feature columns of the test set,
        and the automated suite is run on the corrupted data. For every
        corrupted column whose profile has nominal scale, the suite must
        contain a critical in-range test and a critical NOT_IN_RANGE warning.
        """
        classifier = RandomForest()
        model = self.pipeline.with_estimator(classifier).fit(self.X_train,
                                                             self.y_train)
        error_generator = Typos()
        num_cols = 3
        # Sampling without replacement fails with fewer than 3 features.
        columns = np.random.choice(self.features, num_cols, replace=False)
        corrupted_X_test = error_generator.run(self.X_test, columns=columns)

        pipeline_profile = SklearnPipelineProfiler().on(model)
        tests, warnings = (self.automated_suite
                           .with_profiles(self.data_profile, pipeline_profile)
                           .on(corrupted_X_test))
        for column in columns:
            # Look the profile up by name: zipping the random selection with
            # the profile list would check the first profiles instead of the
            # columns that were actually corrupted.
            profile = self.data_profile.for_column(column)
            if profile.scale != DataScale.NOMINAL:
                continue
            self.assertIn(Test(Severity.CRITICAL).is_in_range(profile), tests)
            self.assertIn(Warning(ErrorType.NOT_IN_RANGE,
                                  Severity.CRITICAL, Message().not_in_range %
                                  (profile.column_name, str(profile.range))),
                          warnings)
Esempio n. 4
0
def main():
    """Run the manual and automated test suites on the wine-quality data.

    Steps:
      1. Load the data, split it 80/20 and fit a 40-tree random forest
         through ``WineQualityPipeline``; print the test accuracy.
      2. Build a manual suite (completeness of ``volatile_acidity``, range of
         ``alcohol``) and run it on the clean test set.
      3. Inject explicit missing values into ``volatile_acidity`` and run the
         manual suite again.
      4. Run the automated suite on the corrupted test set.

    After each run the reported warnings, if any, are printed.
    """
    def report(banner, found):
        """Print the run banner followed by the warnings, if there are any."""
        print(banner)
        if found:
            print("======= WARNINGS =======")
            for warn in found:
                print(warn)

    data = wine.wine_quality_red_csv()
    print(data.shape)
    print(data.columns)

    target = "class"
    X, y = data[[col for col in data.columns if col != target]], data[target]
    X_train, X_test, y_train, y_test = split(X,
                                             y,
                                             test_size=0.2,
                                             random_state=0)

    pipeline = WineQualityPipeline()
    classifier = RandomForest(size=40)
    model = pipeline.with_estimator(classifier).fit(X_train, y_train)

    prediction = model.predict(X_test)
    print(accuracy_score(y_test, prediction))

    suite = TestSuite()
    automated_suite = AutomatedTestSuite()
    # Profiles are derived from the training data and the fitted pipeline.
    data_profile = DataFrameProfiler().on(X_train)
    pipeline_profile = SklearnPipelineProfiler().on(model)

    suite.add(Test().is_complete(
        data_profile.for_column('volatile_acidity')).is_in_range(
            data_profile.for_column('alcohol')))

    report("*** TEST_SUITE, X_TEST", suite.on(X_test))

    error_generator = ExplicitMissingValues()
    corrupted_X_test = error_generator.run(X_test, ['volatile_acidity'])

    report("*** TEST_SUITE, CORRUPTED_X_TEST", suite.on(corrupted_X_test))

    # The automated suite also returns the tests it generated; only the
    # warnings are reported here.
    _tests, warnings = (automated_suite.with_profiles(
        data_profile, pipeline_profile).run(corrupted_X_test))

    report("*** AUTOMATED_TEST_SUITE, CORRUPTED_X_TEST", warnings)
Esempio n. 5
0
def comparing_models(X_train, X_test, y_train, y_test):
    """Run every model routine, in a fixed order, on the same data split.

    Each routine receives the four splits positionally as
    ``(X_train, X_test, y_train, y_test)``; return values are discarded.
    """
    splits = (X_train, X_test, y_train, y_test)
    AdaBoost(*splits)
    Logistic_Regression(*splits)
    NaiveBayes(*splits)
    XGBoost(*splits)
    RandomForest(*splits)
    SVM(*splits)
    NeuralNetwork(*splits)
Esempio n. 6
0
def main():
    """Profile the credit-g CSV and the credit-g pipeline with a random forest.

    Both profiling results are discarded; the function only exercises the
    profilers.
    """
    from pipelines import CreditGPipeline
    from models import RandomForest

    credit_frame = pd.read_csv('resources/data/dataset_31_credit-g.csv')
    DataFrameProfiler().on(credit_frame)

    credit_pipeline = CreditGPipeline()
    pipeline_profiler = SklearnPipelineProfiler()
    pipeline_profiler.on(credit_pipeline.with_estimator(RandomForest(40)))
Esempio n. 7
0
def trainWithHotEncoding(hot_encoded_train_features, hot_encoded_train_labels,
                         hot_encoded_test_features, hot_encoded_test_labels,
                         results, algorithms, isTesting):
    """Train, save and evaluate a random forest on the hot-encoded data.

    Args:
        hot_encoded_train_features: Training features.
        hot_encoded_train_labels: Training labels; must provide ``flatten()``.
        hot_encoded_test_features: Test features.
        hot_encoded_test_labels: Test labels.
        results: Mapping updated in place; the test-set ``drawCurves`` result
            is stored under "RANDOM FOREST HOT ENCODING".
        algorithms: Mapping updated in place; the trained wrapper is stored
            under "RANDOM FOREST HOT ENCODING".
        isTesting: When truthy, an empty parameter grid is used.

    Returns:
        Tuple ``(rf_hot_encoding, results, algorithms)``.
    """
    # TRAIN RANDOM FOREST
    flat_train_labels = hot_encoded_train_labels.flatten()
    # classes/y are keyword-only in current scikit-learn releases; passing
    # them by name also works on older releases.
    class_weights = class_weight.compute_class_weight(
        'balanced',
        classes=np.unique(flat_train_labels),
        y=flat_train_labels)
    param_grid = {
        'max_features': ["log2"],
        'n_estimators': [2000],
    }
    if isTesting:
        print("TESTING MODE RF: ONLY TRAINING 1 MODEL")
        param_grid = {}

    rf_hot_encoding = RandomForest(class_weights=class_weights,
                                   param_grid=param_grid)
    rf_hot_encoding.title = "RANDOM FOREST HOT ENCODING TRAIN"
    rf_hot_encoding.train(hot_encoded_train_features, hot_encoded_train_labels)

    model_name = 'RF-HOT.model'
    print("SAVING MODEL: ", model_name)
    try:
        joblib.dump(rf_hot_encoding.model, model_name)
    except Exception as e:
        # Best effort: a failed save is reported but must not abort the
        # evaluation that follows.
        print("Cannot save {} because: \n\n".format(model_name), str(e))

    algorithms["RANDOM FOREST HOT ENCODING"] = rf_hot_encoding
    # Curves on the training set first (result discarded), then on the test
    # set under a different title (result recorded).
    rf_hot_encoding.drawCurves(X=hot_encoded_train_features,
                               y=hot_encoded_train_labels)
    rf_hot_encoding.title = "RANDOM FOREST HOT ENCODING TEST"
    results["RANDOM FOREST HOT ENCODING"] = rf_hot_encoding.drawCurves(
        X=hot_encoded_test_features, y=hot_encoded_test_labels)
    return rf_hot_encoding, results, algorithms
Esempio n. 8
0
def trainWithFrecuencies(tetra_freq_train_features, tetra_freq_train_labels,
                         tetra_freq_test_features, tetra_freq_test_labels,
                         results, algorithms, isTesting):
    """Train, save and evaluate a random forest on tetra-nucleotide data.

    Args:
        tetra_freq_train_features: Training features.
        tetra_freq_train_labels: Training labels; must provide ``flatten()``.
        tetra_freq_test_features: Test features.
        tetra_freq_test_labels: Test labels.
        results: Mapping updated in place; the test-set ``drawCurves`` result
            is stored under "RANDOM FOREST".
        algorithms: Mapping updated in place; the trained wrapper is stored
            under "RANDOM FOREST".
        isTesting: When truthy, an empty parameter grid is used.

    Returns:
        Tuple ``(rf, results, algorithms)``.
    """
    flat_train_labels = tetra_freq_train_labels.flatten()
    # classes/y are keyword-only in current scikit-learn releases; passing
    # them by name also works on older releases.
    class_weights = class_weight.compute_class_weight(
        'balanced',
        classes=np.unique(flat_train_labels),
        y=flat_train_labels)
    param_grid = {
        'max_features': ["log2"],
        'n_estimators': [2000],
    }
    if isTesting:
        print("TESTING MODE RF: ONLY TRAINING 1 MODEL")
        param_grid = {}

    rf = RandomForest(class_weights=class_weights, param_grid=param_grid)
    rf.title = "RANDOM FOREST TETRA NUCLEOTIDE FREQUENCY TRAIN"
    rf.train(tetra_freq_train_features, tetra_freq_train_labels)

    model_name = 'RF-TETRA.model'
    print("SAVING MODEL USING JOBLIB: ", model_name)
    try:
        joblib.dump(rf.model, model_name)
    except Exception as e:
        # Best effort: a failed save is reported but must not abort the
        # evaluation that follows.
        print("Cannot save {} because: \n\n".format(model_name), str(e))

    algorithms["RANDOM FOREST"] = rf
    # Curves on the training set first (result discarded), then on the test
    # set under a different title (result recorded).
    rf.drawCurves(X=tetra_freq_train_features, y=tetra_freq_train_labels)
    rf.title = "RANDOM FOREST TETRA NUCLEOTIDE FREQUENCY TEST"
    results["RANDOM FOREST"] = rf.drawCurves(X=tetra_freq_test_features,
                                             y=tetra_freq_test_labels)
    return rf, results, algorithms
def main(grid):
    """Run every model routine on the cleaned data, tolerating failures.

    Linear regression is run on the raw target; the target is then binarized
    and the remaining routines are run on the binary target. An exception
    raised by any single routine is printed and the next routine still runs.

    Args:
        grid: Passed unchanged as the third argument to every routine.
    """
    def attempt(run_model):
        # Each routine is isolated so one failure cannot stop the others.
        try:
            run_model()
        except Exception as e:
            print(e)

    # Get clean data
    X, Y = read_clean_data()

    # Linear regression uses the original (non-binarized) target.
    attempt(lambda: LinearRegression(X, Y, grid))

    # Binarize Y for all remaining classifiers.
    Y_binary = BinaryY(Y)

    # The lambdas defer name lookup and the call itself into attempt(),
    # so any error is caught there.
    classifier_runs = (
        lambda: LogisticRegression(X, Y_binary, grid),
        lambda: DecisionTree(X, Y_binary, grid),
        lambda: SVM(X, Y_binary, grid),
        lambda: RandomForest(X, Y_binary, grid),
        lambda: Bagging(X, Y_binary, grid),
        lambda: NeuralNet(X, Y_binary, grid),
    )
    for run_model in classifier_runs:
        attempt(run_model)
Esempio n. 10
0
    def test_missing_values_in_data_with_random_forest(self):
        """Missing values in three random columns must be detected.

        A random forest is fitted through ``self.pipeline``, explicit missing
        values are injected into three randomly chosen feature columns of the
        test set, and the automated suite is run on the corrupted data. For
        every corrupted column the suite must contain a critical completeness
        test and a critical MISSING_VALUE warning.
        """
        classifier = RandomForest()
        model = self.pipeline.with_estimator(classifier).fit(self.X_train,
                                                             self.y_train)
        error_generator = ExplicitMissingValues()
        num_cols = 3
        # Sampling without replacement fails with fewer than 3 features.
        columns = np.random.choice(self.features, num_cols, replace=False)
        corrupted_X_test = error_generator.run(self.X_test, columns=columns)

        pipeline_profile = SklearnPipelineProfiler().on(model)
        tests, warnings = (self.automated_suite
                           .with_profiles(self.data_profile, pipeline_profile)
                           .on(corrupted_X_test))
        for column in columns:
            # Look the profile up by name so the expected test and the
            # expected warning refer to the same, actually corrupted column.
            profile = self.data_profile.for_column(column)
            self.assertIn(Test(Severity.CRITICAL).is_complete(profile), tests)
            self.assertIn(Warning(ErrorType.MISSING_VALUE,
                                  Severity.CRITICAL,
                                  Message().not_complete % column),
                          warnings)
Esempio n. 11
0
    def __init__(self):
        """Set up the datasets, classifiers, error generators and result table.

        Attributes set here:
            resource_folder: Value returned by ``get_resource_path()``.
            pipelines: dataset key -> (CSV path, target column, pipeline).
            classifiers: classifier key -> estimator wrapper instance.
            error_gens: error name -> (generator, predicate on one argument
                that exposes ``dtype``).
            params: List of seven fractions from 0.01 to 0.8.
            tests: test name -> predicate on one argument that exposes
                ``scale`` and ``dtype``.
            results: ``Table`` with rows per dataset, columns per classifier,
                sub-rows per test and sub-columns per error generator.
        """
        self.resource_folder = get_resource_path()

        wine_csv = 'wine-quality/wine-quality-red.csv'
        adult_csv = 'adult/adult.csv'
        self.pipelines = {
            'credit-g': ('credit-g/dataset_31_credit-g.csv', 'class',
                         CreditGPipeline()),
            'wine-quality': (wine_csv, 'class', WineQualityPipeline()),
            'wq-missing': (wine_csv, 'class', WineQualityMissingPipeline()),
            'abalone': ('abalone/abalone.csv', 'Rings', AbalonePipeline()),
            'adult': (adult_csv, 'class', AdultPipeline()),
            'adult-missing': (adult_csv, 'class', AdultMissingPipeline()),
            'heart': ('heart/heart.csv', 'class', HeartPipeline()),
        }

        self.classifiers = {
            'dtc': DecisionTree(),
            'rfc40': RandomForest(size=40),
            'ertc40': ExtremelyRandomizedTrees(size=40),
            'xgb': XGB(),
            'svm': SVM(),
            'lsvm': LinearSVM(),
            'knn': KNN(n_neighbors=7),
            'logreg': LogRegression(),
            'gaus': GausNB(),
            'brfc40': BaggingRandomForest(size=40),
            'mlpc': MLPC(input_size=[16, 32, 16, 8]),
        }

        def is_numeric(profile):
            return profile.dtype in [DataType.INTEGER, DataType.FLOAT]

        def is_string(profile):
            return profile.dtype == DataType.STRING

        def is_nominal_numeric(profile):
            return profile.scale == DataScale.NOMINAL and is_numeric(profile)

        def applies_to_any(profile):
            return True

        self.error_gens = {
            'numeric anomalies': (Anomalies(), is_numeric),
            'typos': (Typos(), is_string),
            'explicit misvals': (ExplicitMissingValues(), applies_to_any),
            'implicit misvals': (ImplicitMissingValues(), applies_to_any),
            'swap fields': (SwapFields(), applies_to_any),
        }

        self.params = [0.01, 0.05, 0.1, 0.2, 0.3, 0.5, 0.8]

        # NOTE(review): 'num disc' and 'num cont' use the same predicate
        # (nominal scale + numeric dtype), exactly as before. The continuous
        # case presumably should test a different scale - confirm.
        self.tests = {
            'num disc': is_nominal_numeric,
            'num cont': is_nominal_numeric,
            'string': is_string,
        }

        self.results = Table(rows=sorted(self.pipelines.keys()),
                             columns=sorted(self.classifiers.keys()),
                             subrows=self.tests.keys(),
                             subcolumns=self.error_gens.keys())
Esempio n. 12
0
# Forecast over the full span of the Prophet model's test index
# (first to last index entry).
forecast_pr = model_pr.predict(start=model_pr.test.index[0],
                               end=model_pr.test.index[-1])

# Compare the test data with the forecast, print the RMSE and plot the result.
eval_pr = EvaluateModel(model_pr.test, forecast_pr)
print('RMSE for Prophet model: ', eval_pr.rmse())
eval_pr.plot(label='Prophet')

# ======================================================================================================================
# Random Forest
# ======================================================================================================================

# Fresh feature-engineering object on the shared DL object (defined outside
# this excerpt); adds lags 1 and 12 of the 'Customers' feature.
FE = FeatureEngineering(DL)
FE.generate_lags(features_to_lag=['Customers'], lags=[1, 12])
# Return value is discarded here; presumably the call prepares the training
# split inside FE for the model below - TODO confirm.
FE.split_features_target(type='train')

model_rf = RandomForest(FE)
model_rf.fit(trend='additive', seasonality=12)
X_test, y_test = FE.split_features_target(type='test')
# Keep only the features the fitted model reports as selected.
X_test = X_test[model_rf.selected_features]
forecast_rf = model_rf.predict(X_test)
# Re-index the forecast in place with the model's test index before
# evaluating it against the test target.
forecast_rf.set_index(keys=model_rf.test.index, inplace=True)

# Compare the test target column with the forecast, print the RMSE and plot.
eval_rf = EvaluateModel(model_rf.test[model_rf.target], forecast_rf)
print('RMSE for Random Forest model: ', eval_rf.rmse())
eval_rf.plot(label='Random Forest')

# ======================================================================================================================
# Extra-tree regressor
# ======================================================================================================================

# Start again from a fresh feature-engineering object for the next model.
FE = FeatureEngineering(DL)
Esempio n. 13
0
        '[+] Loading last saved BaseLine class. Delete it if you want to train a new model!'
    )
    curr_dir = os.path.dirname(os.path.abspath(__file__))
    BaseLine_path = os.path.abspath(curr_dir + f"/utils/model/BaseLine.bcls")
    BaseLine_file = open(BaseLine_path, 'rb')
    model = pickle.load(BaseLine_file)
else:
    logging.info('[+] Training on selected model...')
    if model == "sgd_classifier":
        model = _SGDClassifier(training_dataloader=training_dataloader)
    elif model == "naive_bayesian":
        model = NaiveBayesian(training_dataloader=training_dataloader)
    elif model == "support_vector_machine":
        model = SupportVectorMachine(training_dataloader=training_dataloader)
    elif model == "random_forest":
        model = RandomForest(training_dataloader=training_dataloader)
    elif model == "logistic_regression":
        model = _LogisticRegression(training_dataloader=training_dataloader)
    else:
        print("[?] Invalid Model!")
        sys.exit(1)
    model.train()

# --------------------------------------------------------------------------
# Evaluate the (loaded or freshly trained) model and print each metric.
# --------------------------------------------------------------------------
statistics = model.stat()
for label, key in (
        ("\t- Accuracy : ", "accuracy"),
        ("\t- Precision : ", "precision"),
        ("\t- Recall : ", "recall"),
        ("\t- f1-score : ", "f1_score"),
        ("\t- ROC AUC score : ", "roc_auc_score"),
):
    print(label, statistics[key])
Esempio n. 14
0
    test = {
        'outlook': 'sunny',
        'temp': 'hot',
        'humidity': 'normal',
        'windy': False
    }
    return df, target, test


if __name__ == '__main__':
    # Self-test: fit each model on the bundled data and check its output.
    from math import isclose

    X_tr, Y_tr, X_ts, Y_ts = load_data()

    # decision tree
    model = DecisionTree(n_attrs=4)
    model.fit(X_tr, Y_tr)
    y_pred = model.predict(X_ts)
    tree_accuracy = accuracy(y_pred, Y_ts)
    # Compare with a tolerance: exact float equality breaks on harmless
    # rounding differences in how the ratio is computed.
    assert isclose(tree_accuracy, 0.7837837837837838), tree_accuracy

    # random forest
    model = RandomForest()
    model.fit(X_tr, Y_tr)
    y_pred = model.predict(X_ts)
    forest_accuracy = accuracy(y_pred, Y_ts)
    assert isclose(forest_accuracy, 1.0), forest_accuracy

    # decision tree categorical
    df, attr_targe, record_test = load_data_categorical()
    tree = DecicionTreeCategorical()
    tree.fit(df, attr_targe)
    prediction = tree.predict_one(record_test)
    assert prediction == {'yes': 1.0}, prediction
Esempio n. 15
0
        y = model.predict(X)
        preds_to_lab(y, param['hop_size'], param['fs'], category, save_path,
                     song_name)


if __name__ == '__main__':
    # Parse the command line and log the effective arguments.
    parser = get_train_rf_parser()
    args = parser.parse_args(sys.argv[1:])
    log.info('Arguments:\n' + pformat(vars(args)))

    # Prepare the training dataset.
    params, y_size, y_ind = get_params_by_category(args.category)

    # Converted data lives in a backend-specific sub-directory.
    backend_dir = '/librosa/' if args.use_librosa else '/mauch/'
    conv_root = args.conv_root + backend_dir

    # Use the supplied list of converted files; generate the training data
    # only when none was given.
    conv_list = args.conv_list or gen_train_data(
        args.songs_list,
        args.audio_root,
        args.gt_root,
        params,
        conv_root,
        args.subsong_len,
        args.song_len,
        use_librosa=args.use_librosa)

    # Build the forest from the command-line hyper-parameters and train it.
    model = RandomForest(criterion=args.criterion,
                         max_features=args.max_features,
                         n_estimators=args.n_estimators)
    model = train_rf(model, conv_list)