Example #1
import sys

import numpy as np

# `func` is a project-local helper module (load_data, classify_features,
# get_matrices, standardize, sigmoid); it is not shown in this excerpt.
import func


def main():
    # simple arguments check
    if len(sys.argv) != 6:
        print("wrong number of arguments")
        sys.exit(1)  # nonzero exit on a usage error

    # parse command-line args
    learning_rate = sys.argv[1]
    num_hidden_units = sys.argv[2]
    num_epoches = sys.argv[3]
    training_set_path = sys.argv[4]
    test_set_path = sys.argv[5]

    # load data
    training_set, test_set = func.load_data(training_set_path, test_set_path)
    # find index of num and category features
    num_feature_index_list, cate_feature_index_list = func.classify_features(training_set, test_set)
    # get training and test set matrices
    training_set_num_feature_matrix, training_set_cate_feature_matrix, training_set_label_matrix = func.get_matrices(training_set, num_feature_index_list, cate_feature_index_list)
    test_set_num_feature_matrix, test_set_cate_feature_matrix, test_set_label_matrix = func.get_matrices(test_set, num_feature_index_list, cate_feature_index_list)
    # get the total number of training and test instances
    total_num_training = training_set_num_feature_matrix.shape[0]
    total_num_test = test_set_num_feature_matrix.shape[0]
    # fill up the feature status list
    # [num, cate]
    feature_status = [0,0]
    if training_set_num_feature_matrix.size != 0:
        feature_status[0] = 1
    if training_set_cate_feature_matrix.size != 0:
        feature_status[1] = 1
    # standardize num features
    normed_training_set_num_feature_matrix = np.zeros((total_num_training,0))
    normed_test_set_num_feature_matrix = np.zeros((total_num_test,0))
    if feature_status[0] == 1:
        normed_training_set_num_feature_matrix, normed_test_set_num_feature_matrix = func.standardize(training_set_num_feature_matrix, test_set_num_feature_matrix)

    # combine the feature matrix, reorder
    combined_index_list = num_feature_index_list + cate_feature_index_list
    sorted_index_list = np.argsort(combined_index_list)
    combined_training_set_feature_matrix = np.hstack((normed_training_set_num_feature_matrix,training_set_cate_feature_matrix))
    combined_test_set_feature_matrix = np.hstack((normed_test_set_num_feature_matrix,test_set_cate_feature_matrix))
    ordered_training_set_feature_matrix = combined_training_set_feature_matrix[:,sorted_index_list]
    ordered_test_set_feature_matrix = combined_test_set_feature_matrix[:,sorted_index_list]

    # one-hot encode each categorical column: replace the single column with
    # len(variant_list) indicator columns; num_to_skip tracks how far the
    # original column indices shift right as columns are expanded
    num_to_skip = 0
    for idx,original_idx in enumerate(cate_feature_index_list):
        variant_list = training_set["metadata"]["features"][:-1][original_idx][1]

        cur_training_col = training_set_cate_feature_matrix[:,idx]
        cur_test_col = test_set_cate_feature_matrix[:,idx]

        for jdx, variant in enumerate(variant_list):
            cur_training_col[cur_training_col == variant] = jdx
            cur_test_col[cur_test_col == variant] = jdx

        cur_training_col = cur_training_col.astype(int)
        cur_test_col = cur_test_col.astype(int)

        expanded_training_cols = np.zeros((total_num_training,len(variant_list)))
        expanded_training_cols[np.arange(total_num_training),cur_training_col.flatten()] = 1
        expanded_test_cols = np.zeros((total_num_test,len(variant_list)))
        expanded_test_cols[np.arange(total_num_test),cur_test_col.flatten()] = 1

        ordered_training_set_feature_matrix = np.delete(ordered_training_set_feature_matrix,original_idx + num_to_skip,axis=1)
        ordered_training_set_feature_matrix = np.insert(ordered_training_set_feature_matrix,[original_idx + num_to_skip],expanded_training_cols,axis=1)
        ordered_test_set_feature_matrix = np.delete(ordered_test_set_feature_matrix,original_idx + num_to_skip,axis=1)
        ordered_test_set_feature_matrix = np.insert(ordered_test_set_feature_matrix,[original_idx + num_to_skip],expanded_test_cols,axis=1)
        num_to_skip += (len(variant_list) - 1)

    # append bias entry
    ordered_training_set_feature_matrix = np.insert(ordered_training_set_feature_matrix,0,1,axis=1).astype(float)
    ordered_test_set_feature_matrix = np.insert(ordered_test_set_feature_matrix,0,1,axis=1).astype(float)
    # initialize weight
    w_i_h = np.random.uniform(low=-0.01, high=0.01, size=(int(num_hidden_units), ordered_training_set_feature_matrix.shape[1]))
    w_h_o = np.random.uniform(low=-0.01, high=0.01, size=(1, int(num_hidden_units) + 1))
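    # architecture: w_i_h maps the bias-augmented inputs to the hidden units;
    # w_h_o maps the bias-augmented hidden activations to the single output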

    # nn SGD
    class_list = training_set["metadata"]["features"][-1][1]
    for epoch in range(int(num_epoches)):
        num_corr = 0
        num_incorr = 0
        sum_E = 0
        for idx in range(total_num_training):
            # index indicate hidden unit, 1d
            net_i_h = np.dot(ordered_training_set_feature_matrix[idx,:],np.transpose(w_i_h))
            h = func.sigmoid(net_i_h)
            # adding bias entry
            h_o = np.insert(h,0,1).astype(float)
            net_h_o = np.dot(w_h_o, h_o)
            o = func.sigmoid(net_h_o)
            y = training_set_label_matrix[idx,0]
            if class_list.index(y) == 0:
                y = 0
            else:
                y = 1
            E = -y * np.log(o) - (1 - y) * np.log(1 - o)
            sum_E += E
            d_o = y - o
            d_h = h_o*(1 - h_o)*d_o*w_h_o
            update_h_o = float(learning_rate)*d_o*h_o
            # equivalent outer product: one row of input-to-hidden updates per
            # hidden unit (d_h[:,0] is the bias slot with no incoming weights)
            update_i_h = float(learning_rate)*np.outer(d_h[0,1:],ordered_training_set_feature_matrix[idx,:])
            w_i_h += update_i_h
            w_h_o += update_h_o

            pred = 1 if o > 0.5 else 0

            if pred == y:
                num_corr +=1
            else:
                num_incorr +=1

        print(str(epoch+1)+ " {:.12f}".format(sum_E[0])+ " " + str(num_corr) + " " + str(num_incorr))

    # prediction on test set
    num_corr = 0
    num_incorr = 0
    # true positive
    tp = 0
    # predicted positive
    pp = 0
    for idx in range(total_num_test):
        # index indicate hidden unit, 1d
        net_i_h = np.dot(ordered_test_set_feature_matrix[idx,:],np.transpose(w_i_h))
        h = func.sigmoid(net_i_h)
        # adding bias entry
        h_o = np.insert(h,0,1).astype(float)
        net_h_o = np.dot(w_h_o, h_o)
        o = func.sigmoid(net_h_o)
        y = test_set_label_matrix[idx,0]
        if class_list.index(y) == 0:
            y = 0
        else:
            y = 1

        pred = 0
        if o > 0.5:
            pred = 1
            pp += 1
        else:
            pred = 0

        if pred == y:
            num_corr +=1
            if pred == 1:
                tp += 1
        else:
            num_incorr +=1
        print("{:.12f} ".format(o[0]) + str(pred) + " " + str(y))
    print(str(num_corr) + " " + str(num_incorr))
    actual_pos = np.sum(test_set_label_matrix == class_list[1])
    recall = tp/actual_pos
    precision = tp/pp
    F1 = 2*precision*recall/(precision + recall)
    print("{:.12f}".format(F1))
Example #2
import sys

import matplotlib.pyplot as plt
import numpy as np

# same project-local helper module as in Example #1
import func


def main():
    # simple arguments check
    if len(sys.argv) != 5:
        print("wrong number of arguments")
        sys.exit(1)  # nonzero exit on a usage error

    # parse command-line args
    learning_rate = sys.argv[1]
    max_epoch = sys.argv[2]
    training_set_path = sys.argv[3]
    test_set_path = sys.argv[4]

    # load data
    training_set, test_set = func.load_data(training_set_path, test_set_path)
    # find index of num and category features
    num_feature_index_list, cate_feature_index_list = func.classify_features(
        training_set, test_set)
    # get training and test set matrices
    training_set_num_feature_matrix, training_set_cate_feature_matrix, training_set_label_matrix = func.get_matrices(
        training_set, num_feature_index_list, cate_feature_index_list)
    test_set_num_feature_matrix, test_set_cate_feature_matrix, test_set_label_matrix = func.get_matrices(
        test_set, num_feature_index_list, cate_feature_index_list)
    # get the total number of training and test instances
    total_num_training = training_set_num_feature_matrix.shape[0]
    total_num_test = test_set_num_feature_matrix.shape[0]
    # fill up the feature status list
    # [num, cate]
    feature_status = [0, 0]
    if training_set_num_feature_matrix.size != 0:
        feature_status[0] = 1
    if training_set_cate_feature_matrix.size != 0:
        feature_status[1] = 1
    # standardize num features
    normed_training_set_num_feature_matrix = np.zeros((total_num_training, 0))
    normed_test_set_num_feature_matrix = np.zeros((total_num_test, 0))
    if feature_status[0] == 1:
        normed_training_set_num_feature_matrix, normed_test_set_num_feature_matrix = func.standardize(
            training_set_num_feature_matrix, test_set_num_feature_matrix)

    # combine the feature matrix, reorder
    combined_index_list = num_feature_index_list + cate_feature_index_list
    sorted_index_list = np.argsort(combined_index_list)
    combined_training_set_feature_matrix = np.hstack(
        (normed_training_set_num_feature_matrix,
         training_set_cate_feature_matrix))
    combined_test_set_feature_matrix = np.hstack(
        (normed_test_set_num_feature_matrix, test_set_cate_feature_matrix))
    ordered_training_set_feature_matrix = combined_training_set_feature_matrix[:,
                                                                               sorted_index_list]
    ordered_test_set_feature_matrix = combined_test_set_feature_matrix[:,
                                                                       sorted_index_list]

    # one-hot encode the categorical columns (same scheme as in Example #1)
    num_to_skip = 0
    for idx, original_idx in enumerate(cate_feature_index_list):
        variant_list = training_set["metadata"]["features"][:-1][original_idx][
            1]

        cur_training_col = training_set_cate_feature_matrix[:, idx]
        cur_test_col = test_set_cate_feature_matrix[:, idx]

        for jdx, variant in enumerate(variant_list):
            cur_training_col[cur_training_col == variant] = jdx
            cur_test_col[cur_test_col == variant] = jdx

        cur_training_col = cur_training_col.astype(int)
        cur_test_col = cur_test_col.astype(int)

        expanded_training_cols = np.zeros(
            (total_num_training, len(variant_list)))
        expanded_training_cols[np.arange(total_num_training),
                               cur_training_col.flatten()] = 1
        expanded_test_cols = np.zeros((total_num_test, len(variant_list)))
        expanded_test_cols[np.arange(total_num_test),
                           cur_test_col.flatten()] = 1

        ordered_training_set_feature_matrix = np.delete(
            ordered_training_set_feature_matrix,
            original_idx + num_to_skip,
            axis=1)
        ordered_training_set_feature_matrix = np.insert(
            ordered_training_set_feature_matrix, [original_idx + num_to_skip],
            expanded_training_cols,
            axis=1)
        ordered_test_set_feature_matrix = np.delete(
            ordered_test_set_feature_matrix,
            original_idx + num_to_skip,
            axis=1)
        ordered_test_set_feature_matrix = np.insert(
            ordered_test_set_feature_matrix, [original_idx + num_to_skip],
            expanded_test_cols,
            axis=1)
        num_to_skip += (len(variant_list) - 1)

    # append bias entry
    ordered_training_set_feature_matrix = np.insert(
        ordered_training_set_feature_matrix, 0, 1, axis=1).astype(float)
    ordered_test_set_feature_matrix = np.insert(
        ordered_test_set_feature_matrix, 0, 1, axis=1).astype(float)

    # SGD
    F1_training = []
    F1_test = []
    class_list = training_set["metadata"]["features"][-1][1]
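    # note: the weights are re-initialized for every epoch budget, so each F1
    # point is an independent training run, not a checkpoint of one long run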
    for num_epoches in range(1, int(max_epoch) + 1):
        # initialize weight
        w = np.random.uniform(
            low=-0.01,
            high=0.01,
            size=(1, ordered_training_set_feature_matrix.shape[1]))
        for epoch in range(num_epoches):
            for idx in range(total_num_training):
                net = np.dot(w, ordered_training_set_feature_matrix[idx, :])
                o = func.sigmoid(net)
                y = training_set_label_matrix[idx, 0]
                if class_list.index(y) == 0:
                    y = 0
                else:
                    y = 1
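                # cross-entropy loss (computed here but not used below)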
                E = -y * np.log(o) - (1 - y) * np.log(1 - o)
                grad = (o - y) * ordered_training_set_feature_matrix[idx, :]
                update = -float(learning_rate) * grad
                w += update

        # prediction on test set
        num_corr = 0
        num_incorr = 0
        # true positive
        tp = 0
        # predicted positive
        pp = 0
        for idx in range(total_num_test):
            net = np.dot(w, ordered_test_set_feature_matrix[idx, :])
            o = func.sigmoid(net)
            y = test_set_label_matrix[idx, 0]
            if class_list.index(y) == 0:
                y = 0
            else:
                y = 1

            pred = 0
            if o > 0.5:
                pred = 1
                pp += 1
            else:
                pred = 0

            if pred == y:
                num_corr += 1
                if pred == 1:
                    tp += 1
            else:
                num_incorr += 1
        actual_pos = np.sum(test_set_label_matrix == class_list[1])
        recall = tp / actual_pos
        precision = tp / pp
        F1 = 2 * precision * recall / (precision + recall)
        F1_test.append(F1)

        # prediction on training set
        num_corr = 0
        num_incorr = 0
        # true positive
        tp = 0
        # predicted positive
        pp = 0
        for idx in range(total_num_training):
            net = np.dot(w, ordered_training_set_feature_matrix[idx, :])
            o = func.sigmoid(net)
            y = training_set_label_matrix[idx, 0]
            if class_list.index(y) == 0:
                y = 0
            else:
                y = 1

            pred = 0
            if o > 0.5:
                pred = 1
                pp += 1
            else:
                pred = 0

            if pred == y:
                num_corr += 1
                if pred == 1:
                    tp += 1
            else:
                num_incorr += 1
        actual_pos = np.sum(training_set_label_matrix == class_list[1])
        recall = tp / actual_pos
        precision = tp / pp
        F1 = 2 * precision * recall / (precision + recall)
        F1_training.append(F1)

    plt.plot(range(1,
                   int(max_epoch) + 1),
             F1_training,
             label="on training set")
    plt.plot(range(1, int(max_epoch) + 1), F1_test, label="on test set")
    plt.title("F1 vs. #epochs on heart dataset, learning rate = 0.05")
    plt.ylabel("F1")
    plt.xlabel("#epochs")
    plt.legend()
    plt.show()
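
# Both evaluation loops above duplicate the precision/recall arithmetic and
# divide by zero whenever no positives are predicted (pp == 0). A small
# guarded helper (not part of the original code) that could replace it:
def f1_score(tp, pp, actual_pos):
    # returns 0.0 instead of raising ZeroDivisionError on empty denominators
    precision = tp / pp if pp else 0.0
    recall = tp / actual_pos if actual_pos else 0.0
    denom = precision + recall
    return 2 * precision * recall / denom if denom else 0.0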
Example #3
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from func import load_data

# Outline:
# 1. EDA: exploratory data analysis with visualization
# 2. Feature extraction
# 3. Data modeling
# 4. Model evaluation

# 1. EDA: exploratory data analysis with visualization

# 1.1 Load data
train, test, combine = load_data()
# 1.2 Data structure
print(train.shape)  # (891, 12)
train.describe()  # statistics on numerical variables
train.describe(include=['O'])  # categorical variables
train.info()  # check data types and missing values
train.isnull().sum()
train['Embarked'].value_counts(normalize=True)

# 1.3 Relationship between features and the target variable
# target variable distribution
survived = train['Survived'][train['Survived'] == 1]
not_survived = train['Survived'][train['Survived'] == 0]
print("Survived: %i (%.1f%%)" % (len(survived),
                                 float(len(survived)) / len(train) * 100.0))
# the excerpt is cut off here; the symmetric print is reconstructed
print("Not Survived: %i (%.1f%%)" % (len(not_survived),
                                     float(len(not_survived)) / len(train) * 100.0))
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Embedding
from keras.layers import LSTM
from keras.utils import np_utils
from func import load_data, chord2index
import numpy as np

(x_train, y_train) = load_data('../data/train_note.csv',
                               '../data/train_chord.csv')

# data pre-processing
y_train = np_utils.to_categorical(y_train, num_classes=24)
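
# NOTE: no held-out split follows; the "test" arrays alias the training data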

x_test = x_train
y_test = y_train

# vocabulary size assumed by the Embedding layer
max_features = 1024

model = Sequential()
model.add(Embedding(max_features, output_dim=13))
model.add(LSTM(128))
model.add(Dropout(0.5))
model.add(Dense(24, activation='softmax'))  # softmax output pairs with the categorical_crossentropy loss below

model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

model.fit(x_train, y_train, batch_size=16, epochs=10)
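
# Illustrative follow-up (not in the original): evaluate the fitted model.
# Because x_test/y_test alias the training data above, this reports training
# accuracy rather than generalization.
loss, acc = model.evaluate(x_test, y_test, batch_size=16)
print("loss: %.4f  accuracy: %.4f" % (loss, acc))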
Example #5
# The top of this excerpt was cut off. The lines below reconstruct the missing
# imports and the start of arg_parser() from how the arguments are read in
# __main__ further down; the defaults marked "guess" are illustrative only.
# load_data, build_model, train_model, calculate_acc and save_checkpoint are
# project-local helpers that are not shown.
import argparse

import torch


def arg_parser():
    parser = argparse.ArgumentParser(description="Train an image classifier")
    parser.add_argument("data_dir", help="Directory holding the dataset")
    parser.add_argument("--save_dir", help="Checkpoint directory",
                        default=".")  # guess
    parser.add_argument("--arch", help="Model architecture",
                        default="vgg16")  # guess
    parser.add_argument("--learning_rate", type=float, help="Learning rate",
                        default=0.001)  # guess
    parser.add_argument("--hidden_units", type=int, help="Hidden units",
                        default=256)
    parser.add_argument("--epochs", type=int, help="Epochs", default=7)
    parser.add_argument('--gpu', action="store_true", help='Enable GPU')

    arguments = parser.parse_args()
    return arguments


if __name__ == "__main__":
    arguments = arg_parser()
    data_dir = arguments.data_dir
    checkpoint_dir = arguments.save_dir
    architecture = arguments.arch
    learning_rate = arguments.learning_rate
    hidden_units = arguments.hidden_units
    epochs = arguments.epochs
    gpu = arguments.gpu
    device = torch.device("cuda" if gpu else "cpu")

    dataloaders, validloaders, testloader, image_datasets = load_data(data_dir)
    model, criterion, optimizer = build_model(architecture, learning_rate,
                                              hidden_units, epochs, device,
                                              image_datasets.class_to_idx)

    train_model(epochs, dataloaders, validloaders, model, criterion, optimizer,
                device)

    calculate_acc(model, testloader, device)

    save_checkpoint(architecture, model, image_datasets, optimizer, epochs,
                    checkpoint_dir)
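
# Hypothetical sketch of calculate_acc (the real helper is defined elsewhere
# in this project); it assumes a standard PyTorch evaluation loop.
def calculate_acc(model, testloader, device):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for images, labels in testloader:
            images, labels = images.to(device), labels.to(device)
            preds = model(images).argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    print("Test accuracy: {:.3f}".format(correct / total))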

Example #6
# The beginning of this snippet (the Keras imports and the convolutional
# stack that produces tensor_in / tensor_out) was cut off in this excerpt.
tensor_out = Flatten()(tensor_out)
tensor_out = Dropout(0.5)(tensor_out)

tensor_out = [Dense(10, name='digit1', activation='softmax')(tensor_out),
              Dense(10, name='digit2', activation='softmax')(tensor_out),
              Dense(10, name='digit3', activation='softmax')(tensor_out),
              Dense(10, name='digit4', activation='softmax')(tensor_out)]

model = Model(inputs=tensor_in, outputs=tensor_out)
model.compile(loss='categorical_crossentropy',
              optimizer='Adamax',
              metrics=['accuracy'])
model.summary()

x_train, y_train, x_val, y_val = load_data('label.txt', split_threshold=800)
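# Commented-out (triple-quoted) earlier version of the label-loading logic;
# the excerpt is truncated mid-loop at the end of this example.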
'''
data = pd.read_csv('label.txt', header=None)
di = dict()
for index, row in data.iterrows():
    if (len(str(row[1])) < 4 ):
        row[1] = ('0000' + str(row[1]))[-4:]
    di[row[0]] = str(row[1])
#print(di)
split_th = 800

x_train = []
y_train = []
yListData = [[] for _ in range(4)]
yListVal = [[] for _ in range(4)]
for data_idx, key in enumerate(di.keys()):