Example #1
    def find_best_params(self, validation_data_x, validation_data_y,
                         n_jobs=1, params=None):
        # None (not a mutable []) is the sentinel for "use the default grid".
        if params is None:
            params = self.get_default_param_grid()

        merged_x = Data.merge_arrays(self.training_data_x, validation_data_x)
        merged_y = Data.merge_arrays(self.training_data_y, validation_data_y)
        # PredefinedSplit semantics: rows marked -1 are never used as a test
        # fold, so the model is fit on the training rows and scored only on
        # the validation rows (fold 0).
        test_fold = ([-1] * len(self.training_data_y) +
                     [0] * len(validation_data_y))

        cv = PredefinedSplit(test_fold)

        gs = GridSearchCV(
            estimator=GaussianNB(),
            scoring='f1_micro',
            param_grid=params,
            n_jobs=n_jobs,
            cv=cv
        )

        gs.fit(merged_x, merged_y)

        best_params = gs.best_params_
        results = gs.cv_results_
        return best_params, results
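The test_fold array is what turns this into a fixed train/validation split rather than ordinary cross-validation. A minimal standalone sketch of PredefinedSplit's documented semantics (the array values here are illustrative only):

import numpy as np
from sklearn.model_selection import PredefinedSplit

# Rows marked -1 never appear in a test fold; rows sharing a non-negative
# value form one test fold. Here: 3 training rows, 2 validation rows.
test_fold = np.array([-1, -1, -1, 0, 0])
cv = PredefinedSplit(test_fold)
for train_idx, test_idx in cv.split():
    print(train_idx, test_idx)  # -> [0 1 2] [3 4]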
Example #2
    def find_best_params(self, validation_data_x, validation_data_y,
                         alpha_vals, n_jobs=1):

        merged_x = Data.merge_arrays(self.training_data_x, validation_data_x)
        merged_y = Data.merge_arrays(self.training_data_y, validation_data_y)
        # PredefinedSplit semantics: rows marked -1 are never used as a test
        # fold, so the model is fit on the training rows and scored only on
        # the validation rows (fold 0).
        test_fold = ([-1] * len(self.training_data_y) +
                     [0] * len(validation_data_y))

        cv = PredefinedSplit(test_fold)

        param = {"alpha": alpha_vals}
        gs = GridSearchCV(
            estimator=BernoulliNB(),
            scoring='f1_micro',
            param_grid=param,
            n_jobs=n_jobs,
            cv=cv
        )

        gs.fit(merged_x, merged_y)

        best_params = gs.best_params_
        results = gs.cv_results_
        return best_params, results
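Data.merge_arrays is a project helper that is not shown in these examples. A plausible minimal stand-in, assuming it simply stacks training and validation rows along the first axis (this body is an assumption, not the project's actual code):

import numpy as np

class Data:
    @staticmethod
    def merge_arrays(a, b):
        # Hypothetical helper: stack two row-aligned arrays vertically so the
        # merged data lines up with the test_fold layout built above.
        return np.concatenate([np.asarray(a), np.asarray(b)], axis=0)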
Example #3
def run_naive_bayes_gaussian():
    OUTPUT_PATH = "Outputs/yelp/naive-bayes-fbow-out.txt"
    f = open(OUTPUT_PATH, "w")

    TRAINING_DATA_PATH = "Data/FrequencyBOW/yelp-train"
    VALIDATION_DATA_PATH = "Data/FrequencyBOW/yelp-valid"
    TESTING_DATA_PATH = "Data/FrequencyBOW/yelp-test"

    f.write(
        "Loading Frequency Bag-Of-Words Representation for Training Data\n")
    training_data_x = Data.read_x_array(TRAINING_DATA_PATH + "-X.csv")
    training_data_y = Data.read_y_array(TRAINING_DATA_PATH + "-Y.csv")

    f.write(
        "Initializing Gaussian Naive Bayes Classifier with training data\n")
    gnb = GaussianNaiveBayesClassifier(training_data_x=training_data_x,
                                       training_data_y=training_data_y)

    f.write(
        "Finding best variance smoothing value for Gaussian Naive Bayes Model\n"
    )
    f.write("Loading validation data\n")
    validation_data_x = Data.read_x_array(VALIDATION_DATA_PATH + "-X.csv")
    validation_data_y = Data.read_y_array(VALIDATION_DATA_PATH + "-Y.csv")

    best_params, results = gnb.find_best_params(validation_data_x,
                                                validation_data_y,
                                                n_jobs=1)
    f.write("The best variance smoothing value found was {}\n".format(
        best_params["var_smoothing"]))
    f.write("\nPerformance metrics for all var_smoothing values tested:\n\n")

    for index in range(min(100, len(results['params']))):
        f.write("var_smoothing: {} --> {}\n".format(
            results['params'][index]['var_smoothing'],
            results['mean_test_score'][index]))

    f.write(
        "\n\nInitializing and training a Gaussian Naive Bayes Model with best hyper-parameters\n"
    )
    gnb = GaussianNaiveBayesClassifier(training_data_x, training_data_y)
    gnb.initialize_classifier(var_smoothing=best_params['var_smoothing'])
    gnb.train()

    testing_data_x = Data.read_x_array(TESTING_DATA_PATH + "-X.csv")
    testing_data_y = Data.read_y_array(TESTING_DATA_PATH + "-Y.csv")

    f.write("Finding F1-Measure for different datasets\n")
    f1_train = gnb.get_f1_measure(training_data_x, training_data_y)
    f1_valid = gnb.get_f1_measure(validation_data_x, validation_data_y)
    f1_test = gnb.get_f1_measure(testing_data_x, testing_data_y)

    f.write("The F1-Measure on training data is {}\n".format(f1_train))
    f.write("The F1-Measure on validation data is {}\n".format(f1_valid))
    f.write("The F1-Measure on testing data is {}\n".format(f1_test))

    f.close()
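This run relies on get_default_param_grid from Example #1, which is not shown in these snippets. A hypothetical grid consistent with the var_smoothing sweep reported above (the range and count are assumptions chosen for illustration):

import numpy as np

def get_default_param_grid():
    # Hypothetical GaussianNB grid: sweep var_smoothing over several orders
    # of magnitude around scikit-learn's default of 1e-9.
    return {"var_smoothing": np.logspace(-12, -3, num=100)}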
Example #4
    result.columns = np.concatenate(
        [ask_column, bid_column, ask_vol_column, bid_vol_column, mid_column])

    # The first 20 columns are price levels; express each as a signed offset
    # from the mid price.
    result.iloc[:, :20] = result.iloc[:, :20].values / result[
        'mid'].values[:, np.newaxis] - 1
    # Accumulate per-level volumes into cumulative depth: columns 20-29 hold
    # ask volumes and columns 30-39 hold bid volumes.
    for i in range(9):
        result.iloc[:, i + 21] += result.iloc[:, i + 20]
        result.iloc[:, i + 31] += result.iloc[:, i + 30]

    result = result.iloc[1:]  # drop the first row

    # Select the label column for horizon k (one of 1, 2, 3, 5, or 10);
    # the labels occupy the last five columns of `data`.
    labels = [1, 2, 3, 5, 10]
    label = data.iloc[:, labels.index(k) - 5]

    return result, label


if __name__ == '__main__':
    from data_processor import DataForModel as Data
    data = DataGenerator(test=True)
    data, label = Preprocess(data)
    data = pd.concat([data, label], axis=1)

    print(data.shape)

    data = Data(data, test_ratio=1)
    print(data.len_train)

    x, y = data.get_test_batch(300, False)
    print(x.shape)
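A tiny worked example of the mid-price normalization used above, assuming the leading columns hold raw price levels: dividing by the mid price and subtracting 1 converts each price into a signed relative offset.

import numpy as np
import pandas as pd

df = pd.DataFrame({"ask1": [101.0, 102.0],
                   "bid1": [99.0, 98.0],
                   "mid": [100.0, 100.0]})
# Same pattern as result.iloc[:, :20] above, applied to two price columns.
df.iloc[:, :2] = df.iloc[:, :2].values / df["mid"].values[:, np.newaxis] - 1
print(df)  # ask1 -> 0.01, 0.02; bid1 -> -0.01, -0.02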
Example #5
def run_naive_bayes_bernoulli():
    OUTPUT_PATH = "Outputs/yelp/naive-bayes-bbow-out.txt"
    f = open(OUTPUT_PATH, "w+")

    TRAINING_DATA_PATH = "Data/BinaryBOW/yelp-train"
    VALIDATION_DATA_PATH = "Data/BinaryBOW/yelp-valid"
    TESTING_DATA_PATH = "Data/BinaryBOW/yelp-test"

    f.write("Loading Binary Bag-Of-Words Representation for Training Data\n")
    training_data_x = Data.read_x_array(TRAINING_DATA_PATH + "-X.csv")
    training_data_y = Data.read_y_array(TRAINING_DATA_PATH + "-Y.csv")

    f.write(
        "Initializing Bernoulli Naive Bayes Classifier with training data\n")
    bnb = BernoulliNaiveBayesClassifier(training_data_x=training_data_x,
                                        training_data_y=training_data_y)

    f.write("Finding best alpha value for Naive Bayes Bernoulli Model\n")
    f.write("Loading validation data\n")
    validation_data_x = Data.read_x_array(VALIDATION_DATA_PATH + "-X.csv")
    validation_data_y = Data.read_y_array(VALIDATION_DATA_PATH + "-Y.csv")

    alpha_start = 0
    alpha_stop = 1
    num_intervals = 200
    alpha_vals = np.linspace(start=alpha_start,
                             stop=alpha_stop,
                             num=num_intervals)
    f.write("Testing {} alpha values between {} and {}:\n".format(
        num_intervals, alpha_start, alpha_stop))

    best_params, results = bnb.find_best_params(validation_data_x,
                                                validation_data_y,
                                                alpha_vals,
                                                n_jobs=10)
    f.write("The best alpha value found was {}\n".format(best_params["alpha"]))
    f.write("\nPerformance metrics for all alpha values tested:\n\n")

    output = ""
    for i in range(0, len(alpha_vals)):
        output += "Alpha value: " + str(
            alpha_vals[i]) + " --> F1-Score: " + str(
                results['mean_test_score'][i]) + "\n"
    f.write(output)

    f.write(
        "\n\nInitializing and training a Bernoulli Naive Bayes Model with alpha={}\n"
        .format(best_params['alpha']))
    alpha = float(best_params['alpha'])
    bnb = BernoulliNaiveBayesClassifier(training_data_x, training_data_y,
                                        alpha)
    bnb.train()

    testing_data_x = Data.read_x_array(TESTING_DATA_PATH + "-X.csv")
    testing_data_y = Data.read_y_array(TESTING_DATA_PATH + "-Y.csv")

    f.write("Finding F1-Measure for different datasets\n")
    f1_train = bnb.get_f1_measure(training_data_x, training_data_y)
    f1_valid = bnb.get_f1_measure(validation_data_x, validation_data_y)
    f1_test = bnb.get_f1_measure(testing_data_x, testing_data_y)

    f.write("The F1-Measure on training data with alpha={} is {}\n".format(
        alpha, f1_train))
    f.write("The F1-Measure on validation data with alpha={} is {}\n".format(
        alpha, f1_valid))
    f.write("The F1-Measure on testing data with alpha={} is {}\n".format(
        alpha, f1_test))

    f.close()
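get_f1_measure is a method of the wrapper classes and is not shown in these snippets. A minimal sketch consistent with the f1_micro scorer used in the grid searches (the self.classifier attribute is an assumption):

from sklearn.metrics import f1_score

def get_f1_measure(self, data_x, data_y):
    # Hypothetical implementation: micro-averaged F1, matching the
    # 'f1_micro' scoring passed to GridSearchCV in Examples #1 and #2.
    predictions = self.classifier.predict(data_x)
    return f1_score(data_y, predictions, average="micro")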
Example #6
def run_decision_tree():
    OUTPUT_PATH = "Outputs/yelp/decision-tree-bbow-out.txt"
    f = open(OUTPUT_PATH, "w")

    TRAINING_DATA_PATH = "Data/BinaryBOW/yelp-train"
    VALIDATION_DATA_PATH = "Data/BinaryBOW/yelp-valid"
    TESTING_DATA_PATH = "Data/BinaryBOW/yelp-test"

    f.write("Loading Binary Bag-Of-Words Representation for Training Data\n")
    training_data_x = Data.read_x_array(TRAINING_DATA_PATH + "-X.csv")
    training_data_y = Data.read_y_array(TRAINING_DATA_PATH + "-Y.csv")

    f.write("Initializing Decision Tree Classifier with training data\n")
    dt = DecisionTree(training_data_x=training_data_x,
                      training_data_y=training_data_y)

    f.write("Loading validation data\n")
    validation_data_x = Data.read_x_array(VALIDATION_DATA_PATH + "-X.csv")
    validation_data_y = Data.read_y_array(VALIDATION_DATA_PATH + "-Y.csv")

    f.write("Finding the best hyper-parameters:\n")
    best_params, best_score, results = dt.find_best_params(validation_data_x,
                                                           validation_data_y,
                                                           n_jobs=1)

    f.write("The best hyper-parameters are as follows: \n")
    f.write(
        "max_depth: {}\t| min_samples_split: {}\t| min_samples_leaf: {}\t| max_features: {} with an average F1-Measure of {}\n\n"
        .format(best_params['max_depth'], best_params['min_samples_split'],
                best_params['min_samples_leaf'], best_params['max_features'],
                best_score))

    f.write(
        "\nPerformance metrics for the first 100 hyper-parameters tested:\n\n")
    for index in range(min(100, len(results['params']))):
        f.write(
            "max_depth: {}\t| min_samples_split: {}\t| min_samples_leaf: {}\t| max_features: {} --> {}\n"
            .format(results['params'][index]['max_depth'],
                    results['params'][index]['min_samples_split'],
                    results['params'][index]['min_samples_leaf'],
                    results['params'][index]['max_features'],
                    results['mean_test_score'][index]))

    f.write(
        "\n\nInitializing and training a Decision Tree Classifier with the best parameters \n"
    )
    dt = DecisionTree(training_data_x, training_data_y)
    dt.initialize_classifier(
        max_depth=best_params['max_depth'],
        min_samples_split=best_params['min_samples_split'],
        min_samples_leaf=best_params['min_samples_leaf'],
        max_features=best_params['max_features'])
    dt.train()

    testing_data_x = Data.read_x_array(TESTING_DATA_PATH + "-X.csv")
    testing_data_y = Data.read_y_array(TESTING_DATA_PATH + "-Y.csv")

    f.write("Finding F1-Measure for different datasets\n")
    f1_train = dt.get_f1_measure(training_data_x, training_data_y)
    f1_valid = dt.get_f1_measure(validation_data_x, validation_data_y)
    f1_test = dt.get_f1_measure(testing_data_x, testing_data_y)

    f.write(
        "The F1-Measure on training data with these parameters is {}\n".format(
            f1_train))
    f.write("The F1-Measure on validation data with these parameters is {}\n".
            format(f1_valid))
    f.write(
        "The F1-Measure on testing data with these parameters is {}\n".format(
            f1_test))

    f.close()
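The decision-tree grid searched by find_best_params is not shown in these snippets. A hypothetical grid covering the four hyper-parameters reported above (all values chosen for illustration only):

# Hypothetical default grid for the DecisionTree wrapper; the project's
# actual grid is not visible in these examples.
decision_tree_param_grid = {
    "max_depth": [None, 10, 50, 100],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 5],
    "max_features": [None, "sqrt", "log2"],
}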
Example #7
def run_linear_svm():
    OUTPUT_PATH = "Outputs/yelp/linear-svm-bbow-out.txt"
    f = open(OUTPUT_PATH, "w")

    TRAINING_DATA_PATH = "Data/BinaryBOW/yelp-train"
    VALIDATION_DATA_PATH = "Data/BinaryBOW/yelp-valid"
    TESTING_DATA_PATH = "Data/BinaryBOW/yelp-test"

    f.write("Loading Binary Bag-Of-Words Representation for Training Data\n")
    training_data_x = Data.read_x_array(TRAINING_DATA_PATH + "-X.csv")
    training_data_y = Data.read_y_array(TRAINING_DATA_PATH + "-Y.csv")

    f.write(
        "Initializing Linear Support Vector Classifier with training data\n")
    lsvc = LinearSupportVectorClassifier(training_data_x=training_data_x,
                                         training_data_y=training_data_y)

    f.write("Loading validation data\n")
    validation_data_x = Data.read_x_array(VALIDATION_DATA_PATH + "-X.csv")
    validation_data_y = Data.read_y_array(VALIDATION_DATA_PATH + "-Y.csv")

    f.write("Finding the best hyper-parameters:\n")
    best_params, best_score, results = lsvc.find_best_params(validation_data_x,
                                                             validation_data_y,
                                                             n_jobs=10)

    f.write("The best hyper-parameters are as follows: \n")
    f.write("C: {}\t| tol: {} with an F1-Measure of {}\n\n".format(
        best_params['C'], best_params['tol'], best_score))

    f.write(
        "\nPerformance metrics for the first 100 hyper-parameters tested:\n\n")
    for index in range(min(100, len(results['params']))):
        f.write("C: {}\t| tol: {} --> {}\n".format(
            results['params'][index]['C'], results['params'][index]['tol'],
            results['mean_test_score'][index]))

    f.write(
        "\n\nInitializing and training a Linear Support Vector Classifier with C={} and tol={} \n"
        .format(best_params['C'], best_params['tol']))
    best_C = float(best_params['C'])
    best_tol = float(best_params['tol'])
    lsvc = LinearSupportVectorClassifier(training_data_x, training_data_y)
    lsvc.initialize_classifier(tol=best_tol, C=best_C)
    lsvc.train()

    testing_data_x = Data.read_x_array(TESTING_DATA_PATH + "-X.csv")
    testing_data_y = Data.read_y_array(TESTING_DATA_PATH + "-Y.csv")

    f.write("Finding F1-Measure for different datasets\n")
    f1_train = lsvc.get_f1_measure(training_data_x, training_data_y)
    f1_valid = lsvc.get_f1_measure(validation_data_x, validation_data_y)
    f1_test = lsvc.get_f1_measure(testing_data_x, testing_data_y)

    f.write(
        "The F1-Measure on training data with C={} and tol={} is {}\n".format(
            best_C, best_tol, f1_train))
    f.write("The F1-Measure on validation data with C={} and tol={} is {}\n".
            format(best_C, best_tol, f1_valid))
    f.write(
        "The F1-Measure on testing data with C={} and tol={} is {}\n".format(
            best_C, best_tol, f1_test))

    f.close()
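initialize_classifier belongs to the LinearSupportVectorClassifier wrapper and is not shown in these snippets. A minimal sketch, assuming it simply constructs scikit-learn's LinearSVC with the chosen hyper-parameters (the defaults below mirror sklearn's own):

from sklearn.svm import LinearSVC

def initialize_classifier(self, tol=1e-4, C=1.0):
    # Hypothetical wrapper method: store a LinearSVC configured with the
    # tolerance and regularization strength found by the grid search.
    self.classifier = LinearSVC(tol=tol, C=C)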