Esempio n. 1
0
def main():
    """Fit the spam model for each hyper-parameter pair and report the results.

    Loads ``spamData.mat``, binarizes the training and test feature matrices
    with ``preprocess.binarize`` and, for every
    ``(regularization_weight, learning_rate)`` pair:

    * fits ``beta`` with ``batch`` on the training data,
    * evaluates ``beta`` on the training and test sets with ``test_error``,
    * appends one tab-separated line (regularization weight, learning rate,
      train result, test result) to ``res.txt``,
    * plots ``yplot`` against ``xplot`` and shows the figure.
    """
    data = scipy.io.loadmat('spamData.mat')
    # Use data['Xtrain'] directly here to train on the non-binarized features.
    xtrain = preprocess.binarize(data['Xtrain'])
    ytrain = data['ytrain']
    xtest = preprocess.binarize(data['Xtest'])
    ytest = data['ytest']
    # Passed straight to batch(); presumably its convergence threshold
    # -- TODO confirm against batch().
    threshold = 0.0001
    reg_learn_pairs = [(0.01, 0.0001)]
    for regularization_weight, learning_rate in reg_learn_pairs:
        # Single-argument parenthesized print: same output on Python 2 and 3.
        print('Regularization_weight %s learning_rate %s' % (regularization_weight, learning_rate))
        # Handed to batch() empty and plotted afterwards; presumably batch()
        # appends the iteration numbers and loss values -- verify in batch().
        xplot = []
        yplot = []
        beta = batch(xtrain, ytrain, threshold, regularization_weight, learning_rate, xplot, yplot)
        train = test_error(xtrain, ytrain, beta)
        test = test_error(xtest, ytest, beta)
        # Append mode keeps the rows of earlier runs; leaving the ``with``
        # block flushes and closes the file, so no explicit flush is needed.
        with open('res.txt', 'a') as f:
            f.write('%s\t%s\t%s\t%s\n' % (regularization_weight, learning_rate, train, test))
        # plot xplot vs yplot
        pyplot.plot(xplot, yplot)
        pyplot.title('Training Loss vs Number of Iterations.\nregularization_weight %s learning_rate %s' % (
            regularization_weight, learning_rate))
        pyplot.xlabel("Number of Iterations")
        pyplot.ylabel("Negative Log Likelihood")
        pyplot.show()
Esempio n. 2
0
def generate_auc_roc(X_test, y_test):
    """Evaluate the saved model on the test set and compute its ROC data.

    Loads the model stored at ``model_file_h5``, produces the classification
    report, prints the predicted classes, the two values returned by
    ``evaluate`` and the predicted probabilities, then passes the binarized
    labels together with the probabilities to ``multiclassROC.calculate_roc``
    under the name "RnnClassifierModel".

    Args:
        X_test: test inputs, forwarded unchanged to the model.
        y_test: test labels, used for evaluation and binarized for the ROC.
    """
    model = load_model(model_file_h5)
    generate_classification_report(model, X_test, y_test)

    class_predictions = model.predict_classes(X_test)
    print("Predicted Classes")
    print(class_predictions)

    # evaluate() is unpacked into exactly two values, printed one per line.
    loss_value, accuracy = model.evaluate(X_test, y_test, batch_size=BATCH_SIZE)
    for metric in (loss_value, accuracy):
        print(metric)

    probabilities = model.predict_proba(X_test)
    print("Predicted Probabilities")
    print(probabilities)

    binarized_labels = preprocess.binarize(y_test)
    multiclassROC.calculate_roc(
        binarized_labels, probabilities, "RnnClassifierModel", CLASSES)
    df_telco = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv',
                           delimiter=',',
                           header=None,
                           na_values='\s+',
                           skiprows=1)
    df_telco.iloc[:, 5] = pd.to_numeric(df_telco.iloc[:, 5])
    df_telco.iloc[:, 18] = pd.to_numeric(df_telco.iloc[:, 18])
    df_telco.iloc[:, 19] = pd.to_numeric(df_telco.iloc[:, 19], errors='coerce')
    df_telco = tp.process_missing_label(df_telco)
    df_telco = tp.process_missing_attribute(
        df_telco, [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17])
    print('Missing values handled')
    df_telco = tp.process_string_to_int(
        df_telco, [0, 1, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 20])
    print('String converted to integer labels')
    df_telco, binarizers_telco, binarizers_telco_columns = tp.binarize(
        df_telco, [5, 18, 19])
    print('Continuous values binarized')
    df_telco = df_telco.reset_index(drop=True)
    df_telco = df_telco.drop([0], axis=1)
    df_telco = df_telco.T.reset_index(drop=True).T
    df_telco.to_csv('Preprocessed_Telco.csv', sep=',')
    print('Preprocessing Finished...')
    end_preprocessing = time.time()

    print("Preprocessing took " +
          str(float(end_preprocessing - start_preprocessing) / 60) + " min")
    print()
    print()
else:
    df_telco = pd.read_csv('Preprocessed_Telco.csv',
                           delimiter=',',
import preprocess as ap
import decision_tree
import adaboost


# Preprocessing switch: the branch below only runs when this is set to 1.
# As written (0) the whole preprocessing step is skipped.
prepro = 0
if prepro == 1:
    # --- Training data ---
    start_preprocessing = time.time()
    print('Preprocessing started...')
    # The file has no header row; the field ' ?' is read as a missing value.
    df_adult_train = pd.read_csv('adult_data.csv', delimiter=',', header=None, na_values=' ?')
    df_adult_train = ap.process_missing_label(df_adult_train)
    # Column indices handed to the helper; presumably the categorical
    # attributes -- verify against the dataset description.
    df_adult_train = ap.process_missing_attribute(df_adult_train, [1, 3, 5, 6, 7, 8, 9, 13])
    print('Missing values handled')
    # Same columns as above plus column 14.
    df_adult_train = ap.process_string_to_int(df_adult_train, [1, 3, 5, 6, 7, 8, 9, 13, 14])
    print('String converted to integer labels')
    # binarize() also returns the binarizers and their columns, which are
    # reused further down for the test set.
    df_adult_train, binarizers_adult, binarizers_adult_columns = ap.binarize(df_adult_train, [0, 2, 4, 10, 11, 12])
    print('Continuous values binarized')
    df_adult_train = df_adult_train.reset_index(drop=True)
    df_adult_train.to_csv('Preprocessed_Adult_Train.csv', sep=',')
    end_preprocessing = time.time()

    # Elapsed wall-clock time, converted from seconds to minutes.
    print("Preprocessing training data took " + str(float(end_preprocessing - start_preprocessing) / 60) + " min")

    # --- Test data: same steps, but binarized with the training binarizers ---
    start_preprocessing = time.time()
    df_adult_test = pd.read_csv('adult_test.csv', delimiter=',', header=None, na_values=' ?')
    df_adult_test = ap.process_missing_label(df_adult_test)
    df_adult_test = ap.process_missing_attribute(df_adult_test, [1, 3, 5, 6, 7, 8, 9, 13])
    df_adult_test = ap.process_string_to_int(df_adult_test, [1, 3, 5, 6, 7, 8, 9, 13, 14])
    df_adult_test = ap.binarize_test(binarizers_adult, binarizers_adult_columns, df_adult_test)
    df_adult_test = df_adult_test.reset_index(drop=True)
    df_adult_test.to_csv('Preprocessed_Adult_Test.csv', sep=',')
Esempio n. 5
0
import sys
import cv2
import preprocess
import segmentbeautify

# Script: read the image named on the command line from public/img/,
# binarize and denoise it, segment it into lines/words and beautify the text.

# File paths
filepath = "public/img/"
if len(sys.argv) < 2:
    # Fail with a usage message instead of an IndexError traceback.
    sys.exit("usage: %s <image filename inside %s>" % (sys.argv[0], filepath))
file = sys.argv[1]

# Single-argument parenthesized prints give the same output on Python 2 and 3.
print("\n This is the filename: " + filepath + file + "\n")

# Read image (flag 0 requests a grayscale load)
img = cv2.imread(filepath + file, 0)
if img is None:
    # cv2.imread signals an unreadable or missing file by returning None
    # rather than raising, so stop here with a clear message.
    sys.exit("could not read image: " + filepath + file)
print(img)

# Preprocess the image
img = preprocess.binarize(img)
img = preprocess.removeSaltnPepperNoise(img)

# Segment words and lines
words_mapping, word_spacing, line_spacing = segmentbeautify.extractLines(
    img, file)
# Formatted explicitly so both values stay space-separated on Python 2 and 3.
print("%s %s" % (word_spacing, line_spacing))

# Beautify the text
beautified = segmentbeautify.beautify(img, words_mapping, word_spacing,
                                      line_spacing, file, filepath)
print("beautified %s" % (beautified,))
Esempio n. 6
0
    if label_split[0] != "a01-117-05-02":
        labels.append([label_split[0], label_split[-1]])
# Single-argument parenthesized prints give the same output on Python 2 and 3.
print(labels)

# Export the word images as rows of 'sequence1.csv': every pixel value of the
# processed image followed by a comma, then the label text, then a newline.
i = 0
filepath = "Datasets/words/"
# The context manager closes the file even if an image fails to process.
with open('sequence1.csv', "w") as target:
    for label in labels:
        # Only entries from index 113624 onwards are exported; earlier ones
        # are skipped (i still counts them).
        if 113624 <= i:
            print(label[1])
            # An id such as "a01-117-05-02" maps to
            # <filepath>a01/a01-117/a01-117-05-02.png
            navi = label[0].split("-")
            word = cv2.imread(
                filepath + navi[0] + "/" + navi[0] + "-" + navi[1] + "/" +
                label[0] + ".png", 0)
            cv2.imshow("original", word)
            word = preprocess.binarize(word)
            cv2.imshow("binarized", word)
            print(word.shape)
            word = preprocess.resizeImage(word)
            print(word.shape)
            word = preprocess.sharpen(word)
            word = cv2.bitwise_not(word)
            # NOTE(review): on Python 3 "/" is true division, so the values
            # are written as floats; on Python 2 without
            # "from __future__ import division" an integer array stays
            # integer. Confirm which format the consumer of the CSV expects.
            word = word / 255
            for row in range(word.shape[0]):
                for col in range(word.shape[1]):
                    target.write(str(word[row, col]) + ",")
            target.write(label[1])
            target.write("\n")
        i += 1
Esempio n. 7
0
    # Path = 'datasets/NN/24_riboswitches.csv'
    Path = 'processed_datasets/final_32train.csv'

    #Call function to Load Dataset
    Data_train, Output_train = preprocess.Load_Data_baseModel(Path, Data_train, Output_train)

    #Converting the train data into Float
    Data_train, Output_train = preprocess.Convert_to_Float(Data_train, Output_train)

    Path = 'processed_datasets/final_32test.csv'

    #Call function to Load Dataset
    Data_test, Output_test = preprocess.Load_Data_baseModel(Path, Data_test, Output_test)

    #Converting the test data into Float
    Data_test, Output_test = preprocess.Convert_to_Float(Data_test, Output_test)

    bin_output = preprocess.binarize(Output_test)



    scaler = StandardScaler()
    scaler.fit(Data_train)
    Data_train = scaler.transform(Data_train)
    Data_test = scaler.transform(Data_test)


    construct_models(Data_train, Data_test, Output_train, Output_test, bin_output)
    total_class=preprocess.get_totalclass('processed_datasets/final_32test.csv')
    generate_roc(Data_train, Data_test, Output_train, Output_test, bin_output,total_class)
import decision_tree
import adaboost

# Preprocessing switch: set to 1 to rebuild 'Preprocessed_Credit.csv' from the
# raw 'creditcard.csv'; with 0 the previously written file is loaded instead.
prepro = 0
if prepro == 1:
    start_preprocessing = time.time()
    print('Preprocessing started...')
    # Raw string: '\s' is not a valid escape sequence in a normal literal and
    # triggers a warning on recent Python versions; the value is unchanged.
    # NOTE(review): na_values entries look like they are matched as literal
    # strings, not as regular expressions -- confirm this pattern is intended.
    df_credit_temp = pd.read_csv('creditcard.csv', delimiter=',', header=None, na_values=r'\s+', skiprows=1)
    # Split on the last column: rows equal to 1 are kept in full, rows equal
    # to 0 are down-sampled to 20000 rows drawn without replacement.
    df_credit_pos = df_credit_temp.loc[df_credit_temp.iloc[:, df_credit_temp.shape[1] - 1] == 1]
    df_credit_neg = df_credit_temp.loc[df_credit_temp.iloc[:, df_credit_temp.shape[1] - 1] == 0]
    df_credit_neg = df_credit_neg.sample(n=20000, replace=False)
    df_credit = pd.concat([df_credit_neg, df_credit_pos], axis=0)
    df_credit = cp.process_missing_label(df_credit)
    # Every column except the last one is passed to the helpers below.
    df_credit = cp.process_missing_attribute(df_credit, range(0, df_credit.shape[1] - 1))
    print('Missing values handled')
    df_credit, binarizers_credit, binarizers_credit_columns = cp.binarize(df_credit, range(0, df_credit.shape[1] - 1))
    print('Continuous values binarized')
    df_credit = df_credit.reset_index(drop=True)
    # NOTE(review): to_csv is called with its defaults here, while the else
    # branch reads the file back with header=None -- confirm that the
    # reloaded frame has the same layout as the one built in this branch.
    df_credit.to_csv('Preprocessed_Credit.csv', sep=',')
    print('Preprocessing Finished...')
    end_preprocessing = time.time()

    # Elapsed wall-clock time, converted from seconds to minutes.
    print("Preprocessing took " + str(float(end_preprocessing - start_preprocessing) / 60) + " min")
    print()
    print()
else:
    df_credit = pd.read_csv('Preprocessed_Credit.csv', delimiter=',', header=None)

# Hold out 20% of the rows for testing.
df_credit_train, df_credit_test = model_selection.train_test_split(df_credit, test_size=0.20)

start_training = time.time()
Esempio n. 9
0
# Script: logistic regression on binarized spam features.
# Single-argument parenthesized print: same output on Python 2 and 3.
print('Logistic Regression with Binarized Features:')

# print('refreshing dataset...')

# parse train and test text files
train_x = get_features('spam_train.txt')
train_y = get_classification('spam_train.txt')

test_x = get_features('spam_test.txt')
test_y = get_classification('spam_test.txt')

# print('binarizing features...')

# binarize features
train_x = binarize(train_x)
test_x = binarize(test_x)

# add 1 y-intercept column
train_x = add_ones(train_x)
test_x = add_ones(test_x)

# print('calculating weights...')

# find W for logistic regression with gradient descent
w = logistic_regression(train_x, train_y)

# print('predicting...')

# make predictions
train_predictions = predict_y(train_x, w)