Example #1
def run_test():
    dictionary = DP.read_dict(dict_file)
    raw_test, choices = DP.read_test(test_file, choices_file)
    test = DataSet(raw_test, len(dictionary), cut=False)

    # RNN Parameters
    N_input = test.datalen
    N_class = len(dictionary)
    N_iter = N_epoch * N_input

    # Input
    x = tf.placeholder(dtype=tf.int32, shape=[batch_size, None])
    y = tf.placeholder(dtype=tf.int32, shape=[batch_size, None])

    embeddings = tf.Variable(tf.random_uniform([N_class, N_hidden], -1.0, 1.0))
    embed = tf.nn.embedding_lookup(embeddings, x)

    y_reshape = tf.reshape(y, [-1])

    # Weights
    w = tf.Variable(tf.random_normal([N_hidden, N_class]))
    b = tf.Variable(tf.random_normal([N_class]))

    # RNN
    pred = RNN(embed, w, b)

    # accuracy
    correct_pred = tf.equal(tf.argmax(pred, 1), tf.cast(y_reshape, tf.int64))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

    init = tf.global_variables_initializer()

    ans = []

    #    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
    with tf.Session() as sess:
        sess.run(init)
        saver = tf.train.Saver()
        saver.restore(sess, model_file)

        for i in range(N_input):
            batch_x, _ = test.next_batch(batch_size=1)

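            # Locate the SPACE marker (the blank to fill) in the test sentence; the RNN
            # prediction at the previous timestep scores the candidate answers.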
            spaceID = np.argwhere(batch_x[0] == SPACE)[0, 0]

            prob = sess.run(pred, feed_dict={x: batch_x})

            best_choice = np.argmax(prob[spaceID - 1, choices[i]])
            ans.append(best_choice)

    return np.array(ans)
def load_valid(valid_id_file, feature_path):
    # Inputs
    # load valid
    dictionary = DP.read_dict(dict_file)
    inv_dictionary = {value: key for (key, value) in dictionary.items()}

    ID = []
    with open(valid_id_file, 'r') as f:
        reader = csv.reader(f)
        for line in reader:
            ID.append(line[0])

    if feature_path[-1] != '/':
        feature_path += '/'

    feat = []
    for filename in ID:
        x = np.load(feature_path + filename + '.npy')
        feat.append(x)
    feat = np.array(feat)

    with open(valid_truth_file, 'r') as f:
        ref = json.load(f)

    # Parameters
    N_input = len(ID)
    feat_timestep = feat.shape[1]
    feat_dim = feat.shape[-1]
    vocab_size = len(dictionary)
    return feat, ID, inv_dictionary, ref
def train_and_predict():

    X, Y, toPredict = DataPreprocessor.get_preprocessed_data(config.Data_Path)

    data = DataPreprocessor.train_cv_test_split(X, Y, config.Training_Rate, config.Cross_Validate_Rate, config.Test_Rate)
    (train_X, train_Y, cv_X, cv_Y, test_X, test_Y) = data
    model = Model(train_X, train_Y, batch_size = get_batch_size(), drop = get_drop(), regularizer = get_regularizer(), norm = get_norm(), optimizer = get_opt())
    model.print_Info(config.Print_Info, config.Print_At)
    model.cross_validate(cv_X, cv_Y)
    model.add_layer(192, ini = He(), acti = relu())
    model.add_layer(96, ini = He(), acti = relu())
    model.add_layer(48, ini = He(), acti = relu())
    model.add_last_layer(ini= He())
    model.fit(epoch = config.Epoch, learning_rate = config.Learning_Rate)
    model.test(test_X, test_Y)
    model.plot(config.Plot_Loss, config.Plot_Accuracy)

    predict = model.predict(toPredict).T
    predict = np.argmax(predict, axis = 1)
    f = h5py.File(config.Save_To + "/Predicted_labels.h5", 'a')
    f.create_dataset('/predicted_label', data = predict, dtype = np.float32)
    f.close()
Example #4
def main():
    '''
    img1=cv2.imread("/home/michael/Cell_Classification/Code/Data_Preprocessing/Small_Windows_Whitened_23.04/D102_F001_C01.png",0)
    print(img1,'\n'*10)
    img1= matplotlib.image.imread("/home/michael/Cell_Classification/Code/Data_Preprocessing/Small_Windows_Whitened_23.04/D102_F001_C01.png")
    print(img1,'\n'*10)
    img1 = plt.imread("/home/michael/Cell_Classification/Code/Data_Preprocessing/Small_Windows_Whitened_23.04/D102_F001_C01.png")
    print(img1,'\n'*10)


    img2=cv2.imread("/home/michael/Cell_Classification/Files/Small_Windows_150/D102_F001_C01.png",0)
    print(img2,'\n'*10)
    img2= matplotlib.image.imread("/home/michael/Cell_Classification/Files/Small_Windows_150/D102_F001_C01.png")
    print(img2,'\n'*10)
    img2 = plt.imread("/home/michael/Cell_Classification/Files/Small_Windows_150/D102_F001_C01.png")
    print(img2,'\n'*10)
    '''

    Data = DataPreprocessor(**PREPROCESSING_DATA_SETTINGS)
    Data.preprocess_data(save_df_to_file=SAVE_DF,
                         to_rearrange_df=REARRANGE_DF,
                         to_save_imgs=SAVE_IMAGES,
                         url=URL)
Example #5
def main(argv, _LOGFILENAME, timestamp):
    try:
        opts, args = getopt.getopt(argv,"hi:",["ifile="])
    except getopt.GetoptError:
        print 'test_model.py -i <inputfile>'
        EventIssuer.issueExit(_LOGFILENAME, timestamp)
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print 'test_model.py -i <inputfile>'
            sys.exit()
        elif opt in ("-i", "--ifile"):
            EssayFileName = arg
    essay_vector = DataPreprocessor.preprocessEssayText(_LOGFILENAME, EssayFileName)
    # print essay_vector.shape
    model = DeepScore_Core.loadDeepScoreModel(_LOGFILENAME, "1494040329.92")
    predicted_score = np.argmax(np.squeeze(model.predict(essay_vector)))
    # print predicted_score
    EventIssuer.issueSuccess("The essay has been graded. I think the score should be " + str(predicted_score) + " out of 12", _LOGFILENAME, ifBold=True)
    with open("/Users/abhinandandubey/Documents/resui.txt", 'w') as fui:
        fui.write(str(predicted_score))
    EventIssuer.issueExit(_LOGFILENAME, timestamp)
Example #6
def run_train():
    dictionary = DP.read_dict(dict_file)
    train = DataSet(DP.read_train(train_file), len(dictionary), cut=True)

    # RNN Parameters
    N_input = train.datalen
    N_class = len(dictionary)
    N_iter = N_epoch * N_input

    # Input
    x = tf.placeholder(dtype=tf.int32, shape=[batch_size, None])
    y = tf.placeholder(dtype=tf.int32, shape=[batch_size, None])

    embeddings = tf.Variable(tf.random_uniform([N_class, N_hidden], -1.0, 1.0))
    embed = tf.nn.embedding_lookup(embeddings, x)

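    # Flatten the labels so each timestep's label lines up with the per-timestep
    # logits produced by RNN().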
    y_reshape = tf.reshape(y, [-1])

    # Weights
    w = tf.Variable(tf.random_normal([N_hidden, N_class]))
    b = tf.Variable(tf.random_normal([N_class]))

    # RNN
    pred = RNN(embed, w, b)

    # cost function and optimizer
    cost = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(logits=pred,
                                                       labels=y_reshape))
    optimizer = tf.train.AdamOptimizer(
        learning_rate=learning_rate).minimize(cost)

    # accuracy
    correct_pred = tf.equal(tf.argmax(pred, 1), tf.cast(y_reshape, tf.int64))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

    init = tf.global_variables_initializer()

    #    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
    with tf.Session() as sess:
        sess.run(init)
        step = 0

        t = time.time()
        while step < N_iter:
            batch_x, batch_y = train.next_batch(batch_size=batch_size)

            sess.run(optimizer, feed_dict={x: batch_x, y: batch_y})

            if step % display_step == 0:
                used_time = time.time() - t
                t = time.time()
                acc = sess.run(accuracy, feed_dict={x: batch_x, y: batch_y})
                loss = sess.run(cost, feed_dict={x: batch_x, y: batch_y})
                print(
                    str(step) + ' step: accuracy = ' + str(acc) + ' loss = ' +
                    str(loss) + ' time = ' + str(used_time) + ' secs')

            step += 1

        saver = tf.train.Saver()
        saver.save(sess, model_file)

    return
def run_test(testing_id_file, feature_path):
    # Inputs
    dictionary = DP.read_dict(dict_file)
    inv_dictionary = {value: key for (key, value) in dictionary.items()}

    ID = []
    with open(testing_id_file, 'r') as f:
        reader = csv.reader(f)
        for line in reader:
            ID.append(line[0])

    if feature_path[-1] != '/':
        feature_path += '/'

    feat = []
    for filename in ID:
        x = np.load(feature_path + filename + '.npy')
        feat.append(x)
    feat = np.array(feat)

    # Parameters
    N_input = len(ID)
    feat_timestep = feat.shape[1]
    feat_dim = feat.shape[-1]
    vocab_size = len(dictionary)

    print('Total testing steps: %d' % N_input)

    # Model
    model = AttentionModel(image_dim=feat_dim,
                           vocab_size=vocab_size,
                           N_hidden=N_hidden,
                           N_video_step=feat_timestep,
                           N_caption_step=maxseqlen,
                           batch_size=1)

    tf_video, tf_caption, _ = model.build_test_model(dictionary)

    init = tf.global_variables_initializer()

    result = []

    #    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
    with tf.Session() as sess:
        sess.run(init)
        model.restore_model(sess, model_file)
        step = 0

        t = time.time()
        for i, x in enumerate(feat):
            caption = {}
            caption['caption'] = ''
            caption['id'] = ID[i]
            pred = sess.run(tf_caption, feed_dict={tf_video: [x]})

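            # Decode the predicted word ids back into a sentence, stopping at the EOS tag.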
            for j, word in enumerate(pred):
                if inv_dictionary[word] == EOS_tag:
                    break
                else:
                    if j > 0:
                        caption['caption'] += ' '

                    caption['caption'] += inv_dictionary[word]

            result.append(caption)

    return result
def run_train():
    # Inputs
    dictionary = DP.read_dict(dict_file)
    train_label = DP.read_train(train_label_file)
    train = DataSet(train_path, train_label, len(dictionary),
                    dictionary[EOS_tag])

    # Parameters
    N_input = train.datalen
    N_iter = N_input * N_epoch // batch_size
    print('Total training steps: %d' % N_iter)

    # Model
    model = AttentionModel(image_dim=train.feat_dim,
                           vocab_size=train.vocab_size,
                           N_hidden=N_hidden,
                           N_video_step=train.feat_timestep,
                           N_caption_step=train.maxseqlen,
                           batch_size=batch_size)

    # Loss function and optimizer
    tf_loss, tf_video_train, tf_caption_train, tf_caption_mask_train, _ = model.build_train_model(
        dictionary)
    tf_optimizer = tf.train.AdamOptimizer(learning_rate).minimize(tf_loss)

    # Initialize validation network

    valid_feat, ID, inv_dictionary, ref = load_valid(valid_id_file, valid_path)
    tf_video, tf_caption, _ = model.build_valid_model(dictionary)

    init = tf.global_variables_initializer()

    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options,
                                          device_count={'GPU': 1})) as sess:
        sess.run(init)
        #model.restore_model(sess, model_file)
        step = 0

        t = time.time()
        while step < N_iter:
            batch_x, batch_y = train.next_batch(batch_size=batch_size)
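            # Pad every caption to maxseqlen with the EOS token and build a mask
            # marking the real (non-padding) positions.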
            y = np.full((batch_size, train.maxseqlen), dictionary[EOS_tag])
            y_mask = np.zeros(y.shape)

            for i, caption in enumerate(batch_y):
                y[i, :len(caption)] = caption
                y_mask[i, :len(caption)] = 1

            sess.run(tf_optimizer,
                     feed_dict={
                         tf_video_train: batch_x,
                         tf_caption_train: y,
                         tf_caption_mask_train: y_mask
                     })

            #            if True:
            if step % display_step == 0:
                used_time = time.time() - t
                t = time.time()
                loss = sess.run(tf_loss,
                                feed_dict={
                                    tf_video_train: batch_x,
                                    tf_caption_train: y,
                                    tf_caption_mask_train: y_mask
                                })
                print(
                    str(step) + '/' + str(N_iter) + ' step: loss = ' +
                    str(loss) + ' time = ' + str(used_time) + ' secs')
                model.save_model(sess, model_file)

            if step % valid_step == 0 and step > 0:
                result = predict(sess, valid_feat, ID, inv_dictionary,
                                 tf_caption, tf_video)
                print(result)
                bleu = evaluate_list(result, ref)
                print(
                    str(step) + '/' + str(N_iter) + ' step: bleu = ' +
                    str(bleu))
                with open(r'seq2seq_random_b256.csv', 'a') as f:
                    writer = csv.writer(f)
                    writer.writerow([step, bleu])

            step += 1

        model.save_model(sess, model_file)

    return
Example #9
def main(dataset2="diabetic.csv", dataset1="earthquake_processed.csv", run_part_1=True):

    #sys.stdout = open(os.path.join(LOG_PATH, 'log' + time.strftime("%Y%m%d-%H%M%S") + ".txt"), 'w+')

    if dataset1 != "":

        print("Loading dataset " + dataset1, flush=True)

        # Load the data.
        dataset_csv_path = os.path.join(DATA_PATH, dataset1)
        dataset = pd.read_csv(dataset_csv_path)

        X = dataset.drop("class", axis=1)
        y = dataset["class"].copy()

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
        y_train = y_train.tolist()
        y_test = y_test.tolist()

        pipe = DataPreprocessor.preprocess_data(X_train)
        X_train_transformed = pipe.fit_transform(X_train)

        if run_part_1:

            PlottingUtils.generate_pair_plot("Feature Pair Plot - True Labels - DR Dataset", X_train_transformed,
                                             np.array(y_train), columns=dataset.columns,
                                             x_labels=dataset.columns.tolist()[:-1],
                                             y_labels=dataset.columns.tolist()[:-1])

            k_values = np.arange(2, 10, 1)

            PlottingUtils.plot_k_means_scores(k_values, X_train_transformed, "Normalized Scores of Various Metrics vs K - DR Dataset")

            # By inspection, 3 was the best number of clusters.
            kmeans = KMeans(n_clusters=3, max_iter=500)
            kmeans.fit_predict(X_train_transformed)

            homogeneity, completeness, v_measure = homogeneity_completeness_v_measure(y_train, kmeans.labels_)
            print("Scores for DR Dataset")
            print("K Means")
            print("homogeneity:" + str(homogeneity))
            print("completeness:" + str(completeness))
            print("v measure:" + str(v_measure))
            print("Adjusted mutual info score:" + str(adjusted_mutual_info_score(y_train, kmeans.labels_)))
            print()

            PlottingUtils.generate_pair_plot("Feature Pair Plot - KMeans - DR Dataset", X_train_transformed, kmeans.labels_, columns=dataset.columns,x_labels=dataset.columns.tolist()[:-1],
                                             y_labels=dataset.columns.tolist()[:-1])

            k_values = np.arange(2, 15, 1)
            PlottingUtils.plot_gmm_scores(k_values, X_train_transformed, "EM - DR Dataset")

            # By inspection, 4 clusters were best.
            gmm = GaussianMixture(4, max_iter=500, n_init=10)
            gmm.fit(X_train_transformed)

            labels = np.array(gmm.predict(X_train_transformed))

            homogeneity, completeness, v_measure = homogeneity_completeness_v_measure(y_train, labels)
            print("EM")
            print("homogeneity:" + str(homogeneity))
            print("completeness:" + str(completeness))
            print("v measure:" + str(v_measure))
            print("Adjusted mutual info score:" + str(adjusted_mutual_info_score(y_train, labels)))
            print()

            PlottingUtils.generate_pair_plot("Feature Pair Plot - EM - DR Dataset", X_train_transformed, labels, columns=dataset.columns, x_labels=dataset.columns.tolist()[:-1],
                                             y_labels=dataset.columns.tolist()[:-1])


    if dataset2 != "":

        print("Loading dataset " + dataset2)

        # Load the data.
        dataset_csv_path = os.path.join(DATA_PATH, dataset2)
        dataset = pd.read_csv(dataset_csv_path)

        X = dataset.drop("class", axis=1)
        y = dataset["class"].copy()

        numeric_features = list(X.select_dtypes(include=np.number))
        cat_features = list(X.select_dtypes(exclude=np.number))

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

        pipe = DataPreprocessor.preprocess_data(X_train)
        X_train_transformed = pipe.fit_transform(X_train)

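        # Rebuild readable column names: numeric features keep their names, while the
        # one-hot encoded categorical features get theirs from the 'cat' transformer.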
        enc_cat_features = pipe.named_transformers_['cat'].get_feature_names()
        labels = np.concatenate([numeric_features, enc_cat_features])
        transformed_df_columns = pd.DataFrame(pipe.transform(X_train), columns=labels).columns.tolist()
        transformed_df_columns.append("class")

        if run_part_1:

            k_values = np.arange(2, 100, 1)

            #PlottingUtils.plot_k_means_scores(k_values, X_train_transformed, "Normalized Scores of Various Metrics vs K - Earthquake Dataset")

            kmeans = KMeans(n_clusters=21, max_iter=500)
            kmeans.fit_predict(X_train_transformed)

            homogeneity, completeness, v_measure = homogeneity_completeness_v_measure(y_train, kmeans.labels_)
            print("Scores for Earthquake Dataset")
            print("K Means")
            print("homogeneity:" + str(homogeneity))
            print("completeness:" + str(completeness))
            print("v measure:" + str(v_measure))
            print("Adjusted mutual info score:" + str(adjusted_mutual_info_score(y_train, kmeans.labels_)))
            print()

            #PlottingUtils.generate_tsne("TSNE Visualization of K-Means Clusters - Earthquake Dataset", X_train_transformed, kmeans.labels_)

            k_values = np.arange(2, 50, 1)
            #PlottingUtils.plot_gmm_scores(k_values, X_train_transformed, "BIC & AIC Scores EM - Earthquake Dataset")

            gmm = GaussianMixture(17, max_iter=500, n_init=10)
            gmm.fit(X_train_transformed)

            labels = np.array(gmm.predict(X_train_transformed))

            homogeneity, completeness, v_measure = homogeneity_completeness_v_measure(y_train, labels)

            print("EM")
            print("homogeneity:" + str(homogeneity))
            print("completeness:" + str(completeness))
            print("v measure:" + str(v_measure))
            print("Adjusted mutual info score:" + str(adjusted_mutual_info_score(y_train, labels)))

            PlottingUtils.generate_tsne("TSNE Visualization of EM Clusters - Earthquake Dataset", X_train_transformed, labels)

    sys.stdout.flush()
    sys.stdout.close()
Example #10
        denseOutput = graph.get_tensor_by_name(
            "concat_pool_flat_output/dense/Tanh:0")
        testY_Predict = sess.run(denseOutput, feed_dict)
        predict = sess.run(tf.argmax(testY_Predict, 1))
        real = sess.run(tf.argmax(testY, 1))
        TP_List = []
        FN_List = []
        TN_List = []
        FP_List = []
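        # Bucket each test sample into TP/FN/TN/FP by comparing the predicted and
        # true labels; only the file-name prefix before '-' is recorded.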
        for i in range(len(predict)):
            if predict[i] == real[i] and predict[i] == 1:
                TP_List.append(str(testFileNameNoDict[i]).split("-")[0])
            elif predict[i] != real[i] and predict[i] == 0 and real[i] == 1:
                FN_List.append(str(testFileNameNoDict[i]).split("-")[0])
            elif predict[i] == real[i] and predict[i] == 0:
                TN_List.append(str(testFileNameNoDict[i]).split("-")[0])
            elif predict[i] != real[i] and predict[i] == 1 and real[i] == 0:
                FP_List.append(str(testFileNameNoDict[i]).split("-")[0])
        print("TP num:" + str(len(TP_List)))
        print("FN num:" + str(len(FN_List)))
        print("TN num:" + str(len(TN_List)))
        print("FP num:" + str(len(FP_List)))
        return TP_List, FN_List, TN_List, FP_List


if __name__ == '__main__':
    sess = tf.Session()
    dbcn = DBCN_Model()
    dataPreprocessor = DataPreprocessor.DataPreprocessor()
    TP_List, FN_List, TN_List, FP_List = dbcn.test(sess, dataPreprocessor)
    # dbcn.train(sess,dataPreprocessor)
Example #11
def CC_train():
	# Parameters
	exp_batch_size = 14
	input_size = (512,512,3)
	output_size = (128,128,1)
	
	npy_presave = False
	root_dir = 'train/'
	image_dir = root_dir + "train_image_patches/"
	image_npy_dir = root_dir + "images_npy/"
	heatmap_dir = root_dir + "train_dm_patches/"
	heatmap_npy_dir = root_dir + "heatmaps_npy/"
	CSV_conf_path = root_dir + "data_conf_.csv"
	output_dir = root_dir + "output/"
	model_path = output_dir + 'model.json'
	weights_path = output_dir + 'weights_205000.mat'
	if not os.path.exists(output_dir):
		os.makedirs(output_dir)
	
	# Data preprocessing:
	if npy_presave:
		DataPreprocessor.convert_CC_images_to_npy(image_dir,image_npy_dir, False,input_size)
		DataPreprocessor.convert_CC_images_to_npy(heatmap_dir,heatmap_npy_dir, True,output_size)
		DataPreprocessor.generate_CSV(image_npy_dir, heatmap_npy_dir, CSV_conf_path)	
		data_dir = image_npy_dir
		GT_dir = heatmap_npy_dir	
	else:
		DataPreprocessor.generate_CSV(image_dir, heatmap_dir, CSV_conf_path)
		
		data_dir = image_dir
		GT_dir = heatmap_dir
		
	# Build the model
	session = tf.Session(config = config)
	model = DeepModels.load_CC_ResNet()
	model.load_weights('weights.093-0.00284886.hdf5')
	#model.load_weights('weights.062-0.00326376.hdf5')
	model.summary()

	validate_set_dir = "val/"
	image_paths = glob.glob(validate_set_dir + "validation_image_patches_512/" + "/*.png")	
	im_eval,dm_eval = read_eval_input_dm_patches(validate_set_dir,image_paths,input_size)
	
	def testmodel(epoch, logs):
		# Dump a small batch of validation images, ground-truth heatmaps and
		# predicted heatmaps so training progress can be inspected visually.
		predx = im_eval[0:32,:,:,:]  #get_batch(data_dir, GT_dir, CSV_conf_path, input_size, output_size, exp_batch_size,npy_presave)
		predy = dm_eval[0:32,:,:,:]
		predout = model.predict(predx, batch_size=32)
		output_images(predx, output_dir+"/images/")
		output_images(predy, output_dir+"/heatmaps/")
		output_images(predout, output_dir+"/heatmap_predictions/")

	def smoothL1(y_true, y_pred):
		# Huber-style loss: quadratic for small errors, linear beyond HUBER_DELTA.
		HUBER_DELTA = 0.05
		x = K.abs(y_true - y_pred)
		x = tf.where(x < HUBER_DELTA, 0.5 * (x ** 2), HUBER_DELTA * (x - 0.5 * HUBER_DELTA))
		return K.mean(x, axis=-1)

	def learning_rate_changer(epoch_index):
		# Decay the Adam learning rate by 10% every 10 epochs.
		if epoch_index != 0 and epoch_index % 10 == 0:
			print "Changing learning rate"
			new_lr = 0.9 * K.get_value(adam.lr)
			K.set_value(adam.lr, new_lr)
			return float(new_lr)

	adam = Adam(lr=0.000001,beta_1 = 0.9,beta_2 = 0.999, epsilon = 1e-08, decay = 0)
	
	model.compile(loss= 'mae', optimizer= adam, metrics = ['mse'])
	
	tensorboard = TensorBoard(log_dir='./logs_densenet_finetune', histogram_freq=0, batch_size=16, write_graph=True)
	check_point = ModelCheckpoint("weights.{epoch:03d}-{val_loss:.8f}.hdf5", verbose=1)
	model.fit_generator(get_batch(data_dir, GT_dir, CSV_conf_path, input_size, output_size, exp_batch_size, npy_presave), steps_per_epoch=500, epochs=1000, verbose=1, callbacks=[tensorboard, check_point], validation_data=(im_eval, dm_eval), initial_epoch=63)
	return
Example #12
    youTube_data[0] = IOHelper.readCsvToStringList(
        '../Data/YouTubeDataContainer/' + key_word + '.csv')
    print "Completed fetching youtube comments from file !!\n\n"
    is_yt_comment_from_file = True
else:
    # Fetch data from youtube
    print "Fetching videos and comments from YouTube for keyword = ' " + key_word + ' Trailer' + " '. Please wait.....\n"
    youTube_data[0] = getYouTubeComments(key_word + ' Trailer', 10)
    print "Completed fetching YouTube comments !!"

#--------------------------------------- STEP 4: PREPROCESS TWITTER DATA---------------------------------------

if (is_tweet_from_file):
    print "Data is already pre-processed !! Skipping twitter data pre-processing"
else:
    twitter_data[0] = DataPreprocessor.PreprocessStringList(twitter_data[0])
    print "Twitter Data pre-processing complete !!"

#--------------------------------------- STEP 5: PREPROCESS YOUTUBE DATA---------------------------------------

if (is_yt_comment_from_file):
    print "Data is already pre-processed !! Skipping youtube data pre-processing"
else:
    youTube_data[0] = DataPreprocessor.PreprocessStringList(youTube_data[0])
    print "YouTube Data pre-processing complete !!"

#--------------------------------------- STEP 6: STORE TWITTER AND YOUTUBE DATA-----------------------------------

if (not is_tweet_from_file):
    IOHelper.writeStringListToCsv(
        '../Data/TwitterDataContainer/' + key_word + '.csv', twitter_data[0])
Example #13
import numpy as np
import pandas as pd
import DataPreprocessor as dp
from Model import LogisticRegression

columns = ["F1", "F2", "F3", "F4", "T"]
raw_df = pd.read_csv("data.txt", header=None, names=columns, sep=",")
raw_df = raw_df.sample(n=len(raw_df), random_state=14)
X_train, Y_train, xval, yval, x_test, y_test = dp.train_test_split(
    raw_df, normalize=False, standardize=True)

model = LogisticRegression(nfeatures=4)
model.compile(epochs=15000, learning_rate=0.01, penalty=None, metrics="fscore")

model.fit(X_train, Y_train, plot_freq=None)

pred = model.predict(x_test, model.weights)
model.evaluate(pred, y_test, verbose=True)
w = abs(model.get_params())
print(w)
        self.ptr += 1

        return dw, dt, qw, qt, a, m_dw, m_qw, tt, tm, c, m_c, cl, fnames


def unit_test(mini_batch_loader):
    """unit test to validate MiniBatchLoader using max-frequency (exclusive).
    The accuracy should be around 0.37 and should be invariant over different batch sizes."""
    hits, n = 0., 0
    for d, q, a, m_d, m_q, c, m_c in mini_batch_loader:
        for i in xrange(len(d)):
            prediction, max_count = -1, 0
            for cand in c[i]:
                count = (d[i] == cand).sum() + (q[i] == cand).sum()
                if count > max_count and cand not in q[i]:
                    max_count = count
                    prediction = cand
            n += 1
            hits += a[i] == prediction
        acc = hits / n
        print acc


if __name__ == '__main__':

    from DataPreprocessor import *

    cnn = DataPreprocessor().preprocess("cnn/questions", no_training_set=True)
    mini_batch_loader = MiniBatchLoader(cnn.validation, 64)
    unit_test(mini_batch_loader)
Example #15
def main(dataset1="earthquake_processed.csv",
         dataset2="diabetic.csv",
         run_dt=True,
         run_nn=True,
         run_boost=True,
         run_svm=True,
         run_knn=True):

    sys.stdout = open(
        os.path.join(LOG_PATH,
                     'log' + time.strftime("%Y%m%d-%H%M%S") + ".txt"), 'w+')

    dt_param_grid_coarse = {
        # 'min_impurity_decrease':[0.0, 0.03, 0.9],
        'max_depth': [1, 100, 500],
        'min_samples_split': [0.01, 0.05],
        'min_samples_leaf': [0.01, 0.05],
        'max_features': [None, 'auto']
    }

    ann_param_grid_coarse = {
        'hidden_layer_sizes': [(5), (50), (100), (5, 5), (50, 50), (100, 100)],
        'activation': ['identity', 'logistic', 'tanh', 'relu'],
        'solver': ['adam', 'sgd'],
    }

    svm_rbf_kernel_param_grid_coarse = {
        'kernel': ['rbf'],
        'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
        'C': [0.1, 1, 10, 100, 1000]
    }

    if dataset1 != "":

        print("Loading dataset " + dataset1, flush=True)

        # Load the data.
        dataset_csv_path = os.path.join(DATA_PATH, dataset1)
        dataset = pd.read_csv(dataset_csv_path)

        X = dataset.drop("class", axis=1)
        y = dataset["class"].copy()

        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=1)

        pipe = DataPreprocessor.preprocess_data(X_train)
        X_train_transformed = pipe.fit_transform(X_train)

        pre_processed_feature_names = X_train_transformed

        earthquake_svm_poly_score = None
        earthquake_svm_poly_train_times = None
        earthquake_svm_poly_test_times = None

        earthquake_svm_rbf_score = None
        earthquake_svm_rbf_train_times = None
        earthquake_svm_rbf_test_times = None

        earthquake_decision_tree_score = None
        earthquake_decision_tree_train_times = None
        earthquake_decision_tree_test_times = None

        earthquake_nn_score = None
        earthquake_nn_train_times = None
        earthquake_nn_test_times = None

        earthquake_booster_score = None
        earthquake_booster_train_times = None
        earthquake_booster_test_times = None

        earthquake_knn_score = None
        earthquake_knn_train_times = None
        earthquake_knn_test_times = None

        earthquake_train_sizes = None

        if run_svm:

            svm_poly_kernel_param_grid_coarse = {
                'kernel': ['poly'],
                'degree': [3],
                'gamma': [0.1, 0.01, 0.001, 0.0001],
                'C': [0.1, 1, 10, 100]
            }

            svm_poly = SVMModel(X_train_transformed, X_test, y_train, y_test,
                                pipe, pre_processed_feature_names,
                                ["No Damage ", "Destroyed"], "Earthquake")

            svm_rbf_kernel_param_grid_coarse = {
                'kernel': ['rbf'],
                'gamma': [1, 0.1],
                'C': [0.1, 1, 10, 100, 1000]
            }

            svm_rbf = SVMModel(X_train_transformed, X_test, y_train, y_test,
                               pipe, pre_processed_feature_names,
                               ["No Damage ", "Destroyed"], "Earthquake")

            svm_rbf.find_hyper_params_coarse(svm_rbf_kernel_param_grid_coarse,
                                             "SVMRbf")
            # Best params: C:1, gamma:0.1

            svm_poly.find_hyper_params_coarse(
                svm_poly_kernel_param_grid_coarse, "SVMPoly")
            # Best params: C:0.1 degree:3, gamma:0.1

            Utils.plot_svm_learners_on_same_curve(
                svm_poly, "poly", svm_rbf, "rbf",
                "Learning Curves After Coarse Grid Search")

            svm_poly.model_params['kernel'] = 'poly'
            svm_poly.update_and_refit_model()

            param_range = [2, 3, 4]
            svm_poly.tune_hyper_parameter('degree', param_range, "SVMPoly", 1)

            svm_poly.model_params['degree'] = 2
            svm_poly.update_and_refit_model()

            param_range = np.linspace(0.0, 0.2, 50)
            svm_poly.tune_hyper_parameter('C', param_range, "SVMPoly", 1)

            svm_poly.model_params['C'] = 0.1
            svm_poly.update_and_refit_model()

            param_range = np.linspace(0.0, 5, 25)
            svm_poly.tune_hyper_parameter('coef0', param_range, "SVMPoly", 1)

            svm_poly.model_params['coef0'] = 3
            svm_poly.update_and_refit_model()

            param_range = np.linspace(0.0, 0.010, 25)
            svm_poly.tune_hyper_parameter('gamma', param_range, "SVMPoly", 1)

            svm_poly.model_params['gamma'] = .008
            svm_poly.update_and_refit_model()

            ############## RBF Kernel ##########################################

            svm_rbf.model_params['kernel'] = 'rbf'
            svm_rbf.update_and_refit_model()

            param_range = np.linspace(0.0, 0.05, 50)
            svm_rbf.tune_hyper_parameter('gamma', param_range, "SVMRbf", 1)

            svm_rbf.model_params['gamma'] = 0.008
            svm_rbf.update_and_refit_model()

            param_range = np.linspace(0, 5, 50)
            svm_rbf.tune_hyper_parameter('C', param_range, "SVMRbf", 1)

            svm_rbf.model_params['C'] = 0.9
            svm_rbf.update_and_refit_model()

            earthquake_svm_rbf_score = svm_rbf.run_cross_val_on_test_set(
                "SVM RBF Kernel")
            earthquake_svm_poly_score = svm_poly.run_cross_val_on_test_set(
                "SVM Poly Kernel")

            earthquake_svm_poly_train_times, earthquake_svm_poly_test_times, earthquake_svm_rbf_train_times, earthquake_svm_rbf_test_times = \
                Utils.plot_svm_learners_on_same_curve(svm_poly, "poly", svm_rbf, "rbf", "Final Learning Curves")

        if run_dt:

            print("running dt experiment", flush=True)
            dt = DecisionTreeModel(X_train_transformed, X_test, y_train,
                                   y_test, pipe, pre_processed_feature_names,
                                   ["No Damage", "Destroyed"], "Earthquake")

            dt.generate_learning_curves("CV Learning Curve Before Any Pruning",
                                        "DecisionTree")

            print("Depth of decision tree before pruning" +
                  str(dt.model.get_depth()),
                  flush=True)
            print("Number of nodes before pruning" +
                  str(dt.model.tree_.node_count),
                  flush=True)

            print("finding coarse hyper params for pre-pruning.", flush=True)

            dt.find_hyper_params_coarse(dt_param_grid_coarse, "DecisionTree")

            param_range = ['gini', 'entropy']
            dt.tune_hyper_parameter('criterion', param_range, "DecisionTree")

            dt.model_params['criterion'] = 'gini'
            dt.update_and_refit_model()

            print("Depth of decision tree after pre-pruning: " +
                  str(dt.model.get_depth()),
                  flush=True)
            print("Number of nodes after pre-pruning: " +
                  str(dt.model.tree_.node_count),
                  flush=True)

            print("generating learning curves", flush=True)
            dt.generate_learning_curves(
                "CV Learning Curve after Coarse Grid Search for Pre-Pruning",
                "DecisionTree")

            print("Finding optimal depth of tree", flush=True)
            param_range = np.arange(1, 14, 1)
            dt.tune_hyper_parameter('max_depth', param_range, "DecisionTree")

            dt.model_params['max_depth'] = 6
            dt.update_and_refit_model()

            print("generating learning curves")
            dt.generate_learning_curves(
                "CV Learning Curve after Refined Max Depth Search",
                "DecisionTree")

            print("Finding optimal min sample leaves of tree")
            dt.tune_hyper_parameter('min_samples_leaf', param_range,
                                    "DecisionTree")

            dt.model_params['min_samples_leaf'] = 5
            dt.update_and_refit_model()

            print("Depth of decision tree after Refined-pruning: " +
                  str(dt.model.get_depth()))
            print("Number of nodes after Refined-pruning: " +
                  str(dt.model.tree_.node_count))

            print("generating learning curves")
            dt.generate_learning_curves(
                "CV Learning Curve after Refined Max Depth and Min Sample Leaves",
                "DecisionTree")

            # Post Pruning:
            # Reset Pre-pruning variables.
            dt.model_params['min_samples_leaf'] = 1
            dt.model_params['max_depth'] = None
            dt.model_params['min_samples_split'] = 2
            dt.update_and_refit_model()

            # Post Prune
            dt.post_prune()

            param_grid = {
                'max_depth': [4, 5, 6, 7, 8, 9, 10],
                'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8],
                'ccp_alpha': [0.0002, 0.0005, 0.001, 0.0015, 0.0020]
            }

            dt.find_hyper_params_fine(param_grid, "DecisionTree")

            earthquake_decision_tree_score = dt.run_cross_val_on_test_set(
                "Decision Tree")

            print("generating learning curves")
            earthquake_decision_tree_train_times, earthquake_decision_tree_test_times, earthquake_train_sizes = dt.generate_learning_curves(
                "CV Learning Curve Combining Pre and Post Pruning",
                "DecisionTree")

        if run_nn:

            ann = NeuralNetworkModel(X_train_transformed, X_test, y_train,
                                     y_test, pipe, pre_processed_feature_names,
                                     ["No Damage", "Destroyed"], "Earthquake")

            ann.model_params['solver'] = 'sgd'
            ann.update_and_refit_model()

            ann.generate_learning_curves("CV Learning Curve Before Any Tuning",
                                         "Artificial Neural Network")

            ann.model_params['max_iter'] = 1000
            ann.update_and_refit_model()

            param_range = [0.0006, 0.0008, 0.001, 0.0012, 0.0014]
            ann.tune_hyper_parameter('learning_rate_init', param_range,
                                     "Artificial Neural Network")

            ann.model_params['learning_rate_init'] = 0.0010
            ann.update_and_refit_model()

            ann.generate_learning_curves(
                "CV Learning Curve After Tuning Learning Rate",
                "Artificial Neural Network")

            param_range = [10, 25, 40, 50, 75, 100, 125, 150, 200]
            ann.tune_hyper_parameter('hidden_layer_sizes', param_range,
                                     "Artificial Neural Network", 1)

            param_range = [33, 36, 39, 40, 41, 42, 45]
            ann.tune_hyper_parameter('hidden_layer_sizes', param_range,
                                     "Artificial Neural Network", 2)

            ann.model_params['hidden_layer_sizes'] = (40)
            ann.update_and_refit_model()

            param_range = [0.0008, 0.001, 0.0012, 0.0014, .0016, .0018, .002]
            ann.tune_hyper_parameter('learning_rate_init', param_range,
                                     "Artificial Neural Network", 2)

            ann.model_params['learning_rate_init'] = 0.0018
            ann.update_and_refit_model()

            ann.generate_learning_curves(
                "CV Learning Curve After Tuning First Hidden Layer",
                "Artificial Neural Network")

            print("Finding optimal momentum rate")
            param_range = [0.88, 0.89, 0.91, 0.92, 0.93, 0.94, 0.95]
            ann.tune_hyper_parameter('momentum', param_range,
                                     "Artificial Neural Network", 1)

            ann.model_params['momentum'] = 0.89
            ann.update_and_refit_model()

            param_range = [
                40, (40, 2), (40, 3), (40, 4), (40, 5), (40, 10), (40, 15),
                (40, 20), (40, 25), (40, 30), (40, 40)
            ]
            ann.tune_hyper_parameter('hidden_layer_sizes', param_range,
                                     "Artificial Neural Network", 3)

            param_range = np.arange(50, 1000, 100)
            ann.tune_hyper_parameter('max_iter', param_range,
                                     "Artificial Neural Network", 1)

            ann.plot_epochs_vs_iterations()

            ann.model_params['max_iter'] = 250
            ann.update_and_refit_model()

            earthquake_nn_score = ann.run_cross_val_on_test_set(
                "Neural Network")

            earthquake_nn_train_times, earthquake_nn_test_times, _ = ann.generate_learning_curves(
                "Final CV Learning Curve", "Artificial Neural Network")

        if run_boost:

            booster = AdaBoostModel(X_train_transformed, X_test, y_train,
                                    y_test, pipe, pre_processed_feature_names,
                                    ["No Damage ", "Destroyed"], "Earthquake")

            booster.generate_learning_curves("Learning Curve Before Tuning",
                                             "AdaBoost")

            param_range = [
                DecisionTreeClassifier(max_depth=1),
                DecisionTreeClassifier(max_depth=2),
                DecisionTreeClassifier(max_depth=3),
                DecisionTreeClassifier(max_depth=4)
            ]
            booster.tune_hyper_parameter('base_estimator', param_range,
                                         "AdaBoost")

            booster.model_params['base_estimator'] = DecisionTreeClassifier(
                max_depth=1)
            booster.update_and_refit_model()

            param_range = np.arange(1, 100, 1)
            booster.tune_hyper_parameter('n_estimators', param_range,
                                         "AdaBoost", 1)

            booster.model_params['n_estimators'] = 23
            booster.update_and_refit_model()

            booster.generate_learning_curves(
                "Learning Curve After Tuning Number Estimators", "AdaBoost")

            param_range = np.linspace(0, 1, 25)
            booster.tune_hyper_parameter('learning_rate', param_range,
                                         "AdaBoost", 1)

            booster.model_params['learning_rate'] = 0.5
            booster.update_and_refit_model()

            earthquake_booster_score = booster.run_cross_val_on_test_set(
                "AdaBoost")

            earthquake_booster_train_times, earthquake_booster_test_times, _ = booster.generate_learning_curves(
                "Learning Curve After Tuning Learning Rate", "AdaBoost")

        if run_knn:
            knn = KNNModel(X_train_transformed, X_test, y_train, y_test, pipe,
                           pre_processed_feature_names,
                           ["No Damage", "Destroyed"], "Earthquake")
            knn.generate_learning_curves("Learning Curve Before Tuning", "KNN")

            param_range = np.arange(1, 25, 1)
            knn.tune_hyper_parameter('n_neighbors', param_range, "KNN", 1)
            knn.model_params['n_neighbors'] = 24
            knn.update_and_refit_model()

            knn.generate_learning_curves("Learning Curve After Tuning K",
                                         "KNN")

            param_range = [
                'euclidean', 'manhattan', 'chebyshev', 'hamming', 'canberra',
                'braycurtis'
            ]
            knn.tune_hyper_parameter('metric', param_range, "KNN", 1)
            knn.model_params['metric'] = 'manhattan'
            knn.update_and_refit_model()

            earthquake_knn_score = knn.run_cross_val_on_test_set("K-NN")

            earthquake_knn_train_times, earthquake_knn_test_times, _ = knn.generate_learning_curves(
                "Learning Curve After Tuning Distance Metric", "KNN")

        Utils.plot_training_times(
            earthquake_svm_poly_train_times, earthquake_svm_poly_test_times,
            earthquake_svm_rbf_train_times, earthquake_svm_rbf_test_times,
            earthquake_decision_tree_train_times,
            earthquake_decision_tree_test_times, earthquake_nn_train_times,
            earthquake_nn_test_times, earthquake_booster_train_times,
            earthquake_booster_test_times, earthquake_knn_train_times,
            earthquake_knn_test_times, earthquake_train_sizes, len(X_train),
            "Earthquake")

        print("Score of svm with poly kernel:" +
              str(earthquake_svm_poly_score))
        print("Score of svm with rbf kernel:" + str(earthquake_svm_rbf_score))
        print("Score of decision tree:" + str(earthquake_decision_tree_score))
        print("Score of neural networkl:" + str(earthquake_nn_score))
        print("Score of booster:" + str(earthquake_booster_score))
        print("Score of knn:" + str(earthquake_knn_score))

    if dataset2 != "":

        print("Loading dataset " + dataset2)

        # Load the data.
        dataset_csv_path = os.path.join(DATA_PATH, dataset2)
        dataset = pd.read_csv(dataset_csv_path)

        X = dataset.drop("class", axis=1)
        y = dataset["class"].copy()

        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=1)

        pipe = DataPreprocessor.preprocess_data(X_train)
        X_train_transformed = pipe.fit_transform(X_train)

        pre_processed_feature_names = X_train_transformed

        diabetes_svm_poly_score = None
        diabetes_svm_poly_train_times = None
        diabetes_svm_poly_test_times = None

        diabetes_svm_rbf_score = None
        diabetes_svm_rbf_train_times = None
        diabetes_svm_rbf_test_times = None

        diabetes_decision_tree_score = None
        diabetes_decision_tree_train_times = None
        diabetes_decision_tree_test_times = None

        diabetes_nn_score = None
        diabetes_nn_train_times = None
        diabetes_nn_test_times = None

        diabetes_booster_score = None
        diabetes_booster_train_times = None
        diabetes_booster_test_times = None

        diabetes_knn_score = None
        diabetes_knn_train_times = None
        diabetes_knn_test_times = None

        diabetes_train_sizes = None

        if run_svm:

            svm_poly_kernel_param_grid_coarse = {
                'kernel': ['poly'],
                'degree': [3, 7],
                'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                'C': [0.1, 1, 10, 100, 1000]
            }

            svm_poly = SVMModel(X_train_transformed, X_test, y_train, y_test,
                                pipe, pre_processed_feature_names,
                                ["No Signs ", "Signs of DR"],
                                "Diabetic Retinopathy")

            svm_rbf_kernel_param_grid_coarse = {
                'kernel': ['rbf'],
                'gamma': [1, 0.1],
                'C': [0.1, 1, 10, 100, 1000]
            }

            svm_rbf = SVMModel(X_train_transformed, X_test, y_train, y_test,
                               pipe, pre_processed_feature_names,
                               ["No Signs ", "Signs of DR"],
                               "Diabetic Retinopathy")

            svm_rbf.find_hyper_params_coarse(svm_rbf_kernel_param_grid_coarse,
                                             "SVMRbf")

            svm_poly.find_hyper_params_coarse(
                svm_poly_kernel_param_grid_coarse, "SVMPoly")

            Utils.plot_svm_learners_on_same_curve(
                svm_poly, "poly", svm_rbf, "rbf",
                "Learning Curves After Coarse Grid Search")

            svm_poly.model_params['kernel'] = 'poly'
            svm_poly.update_and_refit_model()

            param_range = [2, 3, 4, 5, 6, 7]
            svm_poly.tune_hyper_parameter('degree', param_range, "SVMPoly", 1)

            svm_poly.model_params['degree'] = 3
            svm_poly.update_and_refit_model()

            param_range = np.linspace(0.0, 0.2, 50)
            svm_poly.tune_hyper_parameter('C', param_range, "SVMPoly", 1)

            svm_poly.model_params['C'] = 0.02
            svm_poly.update_and_refit_model()

            param_range = np.linspace(0.0, 50, 50)
            svm_poly.tune_hyper_parameter('coef0', param_range, "SVMPoly", 1)

            svm_poly.model_params['coef0'] = 40
            svm_poly.update_and_refit_model()

            param_range = np.linspace(0.0, 0.25, 50)
            svm_poly.tune_hyper_parameter('gamma', param_range, "SVMPoly", 1)

            svm_poly.model_params['gamma'] = .050
            svm_poly.update_and_refit_model()

            ############## RBF Kernel ##########################################

            svm_rbf.model_params['kernel'] = 'rbf'
            svm_rbf.update_and_refit_model()

            param_range = np.linspace(0.0, 0.02, 50)
            svm_rbf.tune_hyper_parameter('gamma', param_range, "SVMRbf", 1)

            svm_rbf.model_params['gamma'] = 0.010
            svm_rbf.update_and_refit_model()

            param_range = np.linspace(-100, 300, 50)
            svm_rbf.tune_hyper_parameter('C', param_range, "SVMRbf", 1)

            svm_rbf.model_params['C'] = 125
            svm_rbf.update_and_refit_model()

            diabetes_svm_rbf_score = svm_rbf.run_cross_val_on_test_set(
                "SVM RBF Kernel")
            diabetes_svm_poly_score = svm_poly.run_cross_val_on_test_set(
                "SVM Poly Kernel")

            diabetes_svm_poly_train_times, diabetes_svm_poly_test_times, diabetes_svm_rbf_train_times, diabetes_svm_rbf_test_times = Utils.plot_svm_learners_on_same_curve(
                svm_poly, "poly", svm_rbf, "rbf", "Final Learning Curves")

        if run_dt:

            print("running dt experiment")
            dt = DecisionTreeModel(X_train_transformed, X_test, y_train,
                                   y_test, pipe, pre_processed_feature_names,
                                   ["No Signs ", "Signs of DR"],
                                   "Diabetic Retinopathy")

            dt.generate_learning_curves("CV Learning Curve Before Any Pruning",
                                        "DecisionTree")

            print("Depth of decision tree before pruning" +
                  str(dt.model.get_depth()))
            print("Number of nodes before pruning" +
                  str(dt.model.tree_.node_count))

            print("finding coarse hyper params for pre-pruning.")

            dt.find_hyper_params_coarse(dt_param_grid_coarse, "DecisionTree")

            print("Depth of decision tree after pre-pruning: " +
                  str(dt.model.get_depth()))
            print("Number of nodes after pre-pruning: " +
                  str(dt.model.tree_.node_count))

            print("generating learning curves")
            dt.generate_learning_curves(
                "CV Learning Curve after Coarse Grid Search for Pre-Pruning",
                "DecisionTree")

            param_range = ['gini', 'entropy']
            dt.tune_hyper_parameter('criterion', param_range, "DecisionTree")

            # Uncomment to plot range of max_depth parameter.
            print("Finding optimal depth of tree")
            param_range = np.arange(1, 25, 1)
            dt.tune_hyper_parameter('max_depth', param_range, "DecisionTree")

            dt.model_params['max_depth'] = 20
            dt.update_and_refit_model()

            print("generating learning curves")
            dt.generate_learning_curves(
                "CV Learning Curve after Refined Max Depth Search",
                "DecisionTree")

            print("Finding optimal min sample leaves of tree")
            param_range = np.arange(1, 25, 1)
            dt.tune_hyper_parameter('min_samples_leaf', param_range,
                                    "DecisionTree")

            dt.model_params['min_samples_leaf'] = 15
            dt.update_and_refit_model()

            print("Depth of decision tree after Refined-pruning: " +
                  str(dt.model.get_depth()))
            print("Number of nodes after Refined-pruning: " +
                  str(dt.model.tree_.node_count))

            print("generating learning curves")
            dt.generate_learning_curves(
                "CV Learning Curve after Refined Max Depth and Min Sample Leaves",
                "DecisionTree")

            # Post Pruning:
            # Reset Pre-pruning variables.
            dt.model_params['min_samples_leaf'] = 1
            dt.model_params['max_depth'] = None
            dt.model_params['min_samples_split'] = 2
            dt.update_and_refit_model()

            # Post Prune
            dt.post_prune()

            param_grid = {
                'max_depth': [10, 15, 20],
                'min_samples_leaf': [14, 15, 16, 17, 18],
                'ccp_alpha': [0.000, 0.002, 0.003, 0.004, 0.005]
            }

            dt.find_hyper_params_fine(param_grid, "DecisionTree")

            diabetes_decision_tree_score = dt.run_cross_val_on_test_set(
                "DecisionTree")

            print("generating learning curves")
            diabetes_decision_tree_train_times, diabetes_decision_tree_test_times, diabetes_train_sizes = dt.generate_learning_curves(
                "CV Learning Curve Combining Pre and Post Pruning",
                "DecisionTree")

        if run_nn:

            print("running nn experiment")
            ann = NeuralNetworkModel(X_train_transformed, X_test, y_train,
                                     y_test, pipe, pre_processed_feature_names,
                                     ["No Signs ", "Signs of DR"],
                                     "Diabetic Retinopathy")

            ann.model_params['solver'] = 'sgd'
            ann.update_and_refit_model()

            ann.generate_learning_curves("CV Learning Curve Before Any Tuning",
                                         "Artificial Neural Network")

            ann.model_params['max_iter'] = 1000
            ann.update_and_refit_model()

            print("Finding optimal learning rate")
            param_range = [
                .0009, 0.0010, 0.0013, 0.0015, 0.0017, 0.002, 0.003, 0.004,
                .005, .006, .007
            ]
            ann.tune_hyper_parameter('learning_rate_init', param_range,
                                     "Artificial Neural Network")

            ann.model_params['learning_rate_init'] = 0.004
            ann.update_and_refit_model()

            ann.generate_learning_curves(
                "CV Learning Curve After Tuning Initial Learning Rate",
                "Artificial Neural Network")

            print("Finding optimal hidden layers")
            param_range = [(4), (6), (8), (10), (12), (15), (20), (25), (100)]
            ann.tune_hyper_parameter('hidden_layer_sizes', param_range,
                                     "Artificial Neural Network", 1)

            ann.model_params['hidden_layer_sizes'] = (12)
            ann.update_and_refit_model()

            print("Finding optimal learning rate")
            param_range = [
                0.0010, 0.0013, 0.0015, 0.0017, 0.002, 0.003, 0.004, .005,
                .006, .007, .008
            ]
            ann.tune_hyper_parameter('learning_rate_init', param_range,
                                     "Artificial Neural Network", 2)

            ann.model_params['learning_rate_init'] = 0.006
            ann.update_and_refit_model()

            ann.generate_learning_curves(
                "CV Learning Curve After Tuning 1st Hidden Layer",
                "Artificial Neural Network")

            print("Finding optimal momentum rate")
            param_range = [0.88, 0.9, 0.92, 0.94, 0.96, 0.98, 0.99]
            ann.tune_hyper_parameter('momentum', param_range,
                                     "Artificial Neural Network", 1)

            print("Finding optimal hidden layers")
            param_range = [
                12, (12, 2), (12, 4), (12, 6), (12, 8), (12, 10), (12, 12),
                (12, 14), (12, 16), (15, 15), (20, 20), (50, 50)
            ]
            ann.tune_hyper_parameter('hidden_layer_sizes', param_range,
                                     "Artificial Neural Network", 2)

            ann.model_params['hidden_layer_sizes'] = (12, 2)
            ann.update_and_refit_model()

            param_range = np.arange(50, 1200, 100)
            ann.tune_hyper_parameter('max_iter', param_range,
                                     "Artificial Neural Network", 1)

            ann.model_params['tol'] = 0.000001
            ann.update_and_refit_model()

            ann.plot_epochs_vs_iterations()

            print("Finding optimal learning rate")
            param_range = [
                0.0010, 0.0013, 0.0015, 0.0017, 0.002, 0.003, 0.004, .005,
                .006, .007, .008
            ]
            ann.tune_hyper_parameter('learning_rate_init', param_range,
                                     "Artificial Neural Network", 3)

            diabetes_nn_score = ann.run_cross_val_on_test_set("Neural Network")

            diabetes_nn_train_times, diabetes_nn_test_times, _ = ann.generate_learning_curves(
                "Final CV Learning Curve", "Artificial Neural Network")
        if run_boost:
            booster = AdaBoostModel(X_train_transformed, X_test, y_train,
                                    y_test, pipe, pre_processed_feature_names,
                                    ["No Signs ", "Signs of DR"],
                                    "Diabetic Retinopathy")

            booster.generate_learning_curves("Learning Curve Before Tuning",
                                             "AdaBoost")

            # Sweep the depth of the decision-tree weak learner used as the AdaBoost base estimator
            param_range = [
                DecisionTreeClassifier(max_depth=1),
                DecisionTreeClassifier(max_depth=2),
                DecisionTreeClassifier(max_depth=3),
                DecisionTreeClassifier(max_depth=4)
            ]
            booster.tune_hyper_parameter('base_estimator', param_range,
                                         "AdaBoost")

            booster.model_params['base_estimator'] = DecisionTreeClassifier(
                max_depth=2)
            booster.update_and_refit_model()

            param_range = np.arange(1, 100, 1)
            booster.tune_hyper_parameter('n_estimators', param_range,
                                         "AdaBoost", 1)

            booster.model_params['n_estimators'] = 19
            booster.update_and_refit_model()

            booster.generate_learning_curves(
                "Learning Curve After Tuning Number Estimators", "AdaBoost")

            # AdaBoost requires learning_rate > 0, so start the sweep just above zero
            param_range = np.linspace(0.01, 1, 25)
            booster.tune_hyper_parameter('learning_rate', param_range,
                                         "AdaBoost", 1)

            booster.model_params['learning_rate'] = 0.3
            booster.update_and_refit_model()

            diabetes_booster_score = booster.run_cross_val_on_test_set(
                "AdaBoost")

            diabetes_booster_train_times, diabetes_booster_test_times, _ = booster.generate_learning_curves(
                "Learning Curve After Tuning Learning Rate", "AdaBoost")

        if run_knn:
            knn = KNNModel(X_train_transformed, X_test, y_train, y_test, pipe,
                           pre_processed_feature_names,
                           ["No Signs ", "Signs of DR"],
                           "Diabetic Retinopathy")
            knn.generate_learning_curves("Learning Curve Before Tuning", "KNN")

            param_range = np.arange(1, 25, 1)
            knn.tune_hyper_parameter('n_neighbors', param_range, "KNN", 1)

            knn.model_params['n_neighbors'] = 20
            knn.update_and_refit_model()

            knn.generate_learning_curves("Learning Curve After Tuning K",
                                         "KNN")

            # With k fixed, sweep candidate distance metrics
            param_range = [
                'euclidean', 'manhattan', 'chebyshev', 'hamming', 'canberra',
                'braycurtis'
            ]
            knn.tune_hyper_parameter('metric', param_range, "KNN", 1)

            knn.model_params['metric'] = 'euclidean'
            knn.update_and_refit_model()

            diabetes_knn_score = knn.run_cross_val_on_test_set("K-NN")

            diabetes_knn_train_times, diabetes_knn_test_times, _ = knn.generate_learning_curves(
                "Learning Curve After Tuning Distance Metric", "KNN")

        Utils.plot_training_times(
            diabetes_svm_poly_train_times, diabetes_svm_poly_test_times,
            diabetes_svm_rbf_train_times, diabetes_svm_rbf_test_times,
            diabetes_decision_tree_train_times,
            diabetes_decision_tree_test_times, diabetes_nn_train_times,
            diabetes_nn_test_times, diabetes_booster_train_times,
            diabetes_booster_test_times, diabetes_knn_train_times,
            diabetes_knn_test_times, diabetes_train_sizes, len(X_train),
            "Diabetic Retinopathy")

        print("Score of svm with poly kernel:" + str(diabetes_svm_poly_score))
        print("Score of svm with rbf kernel:" + str(diabetes_svm_rbf_score))
        print("Score of decision tree:" + str(diabetes_decision_tree_score))
        print("Score of neural networkl:" + str(diabetes_nn_score))
        print("Score of booster:" + str(diabetes_booster_score))
        print("Score of knn:" + str(diabetes_knn_score))

    sys.stdout.flush()
    sys.stdout.close()
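
# The helper methods used above (tune_hyper_parameter, update_and_refit_model,
# generate_learning_curves) are not defined in this example. The sketch below is an
# assumption about what such a tuner could look like if it wraps
# sklearn.model_selection.validation_curve; the function name, arguments, and plot
# file naming are illustrative, not the example's actual implementation.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import validation_curve


def sketch_tune_hyper_parameter(estimator, X_train, y_train, param_name,
                                param_range, title, plot_index=0, cv=5):
    # Cross-validate the estimator for every candidate value of the parameter
    train_scores, cv_scores = validation_curve(estimator, X_train, y_train,
                                               param_name=param_name,
                                               param_range=param_range,
                                               cv=cv, scoring='accuracy',
                                               n_jobs=-1)

    # A categorical x-axis also handles tuple- or estimator-valued parameters
    x = np.arange(len(param_range))
    plt.figure()
    plt.plot(x, train_scores.mean(axis=1), marker='o', label='train')
    plt.plot(x, cv_scores.mean(axis=1), marker='o', label='cross-validation')
    plt.xticks(x, [str(p) for p in param_range], rotation=45, ha='right')
    plt.xlabel(param_name)
    plt.ylabel('accuracy')
    plt.title(title + ': ' + param_name)
    plt.legend()
    plt.tight_layout()
    plt.savefig(title + '_' + param_name + '_' + str(plot_index) + '.png')
    plt.close()
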
Exemple #16
0
def traintest_model():
    """
    Experimental Function
    :return: None
    """
    _LOGFILENAME, timestamp = start_deepscore_core()
    EventIssuer.issueMessage("Training a new model", _LOGFILENAME)

    # Load Data
    X, Y = loadppData('ppData/X_1492930578.7.ds', 'ppData/Y_1492930578.7.ds')
    # print X.shape, Y.shape
    # split into input (X) and output (Y) variables

    # Partition into train and test
    train_X, train_Y, dev_X, dev_Y, test_X, test_Y = DataPreprocessor.partitionDataset(
        X, Y)
    # print "dev_Y[0] :", dev_Y[0]
    EventIssuer.issueMessage(
        "Training Set Size : " + str(train_X.shape[0]) +
        " | Validation Set Size : " + str(dev_X.shape[0]) +
        " | Test Set Size : " + str(test_X.shape[0]), _LOGFILENAME)

    # Create Model
    model = Sequential()

    model.add(Dense(12, input_dim=300, use_bias=True))
    model.add(Dense(32, activation='tanh', use_bias=True))
    model.add(Dense(32, activation='relu', use_bias=True))
    model.add(Dense(13, activation='softmax', use_bias=True))
    # #
    # model = Sequential()
    # model.add(Dense(12, input_dim=300, activation='tanh', kernel_regularizer=regularizers.l2(0.0001)))
    # model.add(Activation('tanh'))
    # model.add(Dense(13, activation='softmax'))
    #
    # model = Sequential()
    # model.add(Dense(12, input_dim=300, activation='relu'))
    # model.add(Dense(8, activation='tanh'))
    # model.add(Dense(8, activation='relu'))
    # model.add(Dense(13, activation='softmax'))

    adam = optimizers.Adam(lr=0.0001, epsilon=1e-08)

    # Compile Model
    # Note: kernel_regularizer is a layer argument (as in the commented-out variant
    # above), not a compile() argument, so it is not passed here; see the
    # layer-level sketch after this function.
    model.compile(loss='mean_squared_error',
                  optimizer=adam,
                  metrics=['mean_squared_error'])

    # Train
    total_train_time = 0
    total_valid_time = 0

    # Track the best validation QWK (quadratic weighted kappa) and the epoch where it occurred
    best_qwk = -1
    best_lwk = -1
    argmax_best_qwk = 0
    for epoch_num in range(1000):
        # Training
        start_time = time.time()
        running_model = model.fit(train_X,
                                  train_Y,
                                  batch_size=50,
                                  epochs=1,
                                  verbose=0)
        train_time = time.time() - start_time
        total_train_time += train_time

        # Evaluate
        start_time = time.time()

        analyzer_object = Analyzer.AnalyzerObject(model, _LOGFILENAME, dev_X,
                                                  dev_Y, epoch_num,
                                                  dev_X.shape[0])
        analyzer_object.analyze()

        if (analyzer_object.qwk > best_qwk):
            best_qwk = analyzer_object.qwk
            best_lwk = analyzer_object.lwk
            argmax_best_qwk = epoch_num
            EventIssuer.issueSuccess("Best QWK seen so far : " + str(best_qwk),
                                     _LOGFILENAME,
                                     highlight=True)

        valid_time = time.time() - start_time
        total_valid_time += valid_time

        # Issue events
        train_loss = running_model.history['loss'][0]
        train_metric = running_model.history['mean_squared_error'][0]
        epoch_info_1 = "Epoch " + str(epoch_num) + ", train: " + str(
            train_time) + "s, validation: " + str(valid_time) + "s"
        epoch_info_2 = "[Train] loss: " + str(train_loss) + ", metric: " + str(
            train_metric)
        EventIssuer.issueMessage(epoch_info_1, _LOGFILENAME)
        EventIssuer.issueMessage(epoch_info_2, _LOGFILENAME)

    # model.fit(train_X, train_Y, epochs=200, batch_size=10)

    saveModel(model, _LOGFILENAME, timestamp)
    # res = model.predict(test_X)

    scores = model.evaluate(test_X, test_Y)
    print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))
    EventIssuer.issueSuccess("Best : QWK = " + str(best_qwk) + " | LWK = " +
                             str(best_lwk) + " at Epoch " +
                             str(argmax_best_qwk),
                             _LOGFILENAME,
                             highlight=True)

    EventIssuer.issueExit(_LOGFILENAME, timestamp)
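
# Since compile() does not accept kernel_regularizer, the L2 penalty has to be
# attached to the layers themselves, as the commented-out variant inside
# traintest_model hints at. A minimal sketch of that layer-level placement (same
# layer sizes as above; the function name and l2_strength default are illustrative):
from keras import optimizers, regularizers
from keras.layers import Dense
from keras.models import Sequential


def sketch_build_regularized_model(l2_strength=0.0001):
    # Same stack as traintest_model, with L2 applied per Dense layer
    model = Sequential()
    model.add(Dense(12, input_dim=300, use_bias=True,
                    kernel_regularizer=regularizers.l2(l2_strength)))
    model.add(Dense(32, activation='tanh', use_bias=True,
                    kernel_regularizer=regularizers.l2(l2_strength)))
    model.add(Dense(32, activation='relu', use_bias=True,
                    kernel_regularizer=regularizers.l2(l2_strength)))
    model.add(Dense(13, activation='softmax', use_bias=True,
                    kernel_regularizer=regularizers.l2(l2_strength)))
    model.compile(loss='mean_squared_error',
                  optimizer=optimizers.Adam(lr=0.0001, epsilon=1e-08),
                  metrics=['mean_squared_error'])
    return model
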
Exemple #17
0
def run_train():
    # Inputs
    dictionary = DP.read_dict(dict_file)
    train_label = DP.read_train(train_label_file)
    train = DataSet(train_path, train_label, len(dictionary),
                    dictionary[EOS_tag])

    # Parameters
    N_input = train.datalen
    N_iter = N_input * N_epoch // batch_size
    print('Total training steps: %d' % N_iter)

    # Model
    model = s2vtModel(image_dim=train.feat_dim,
                      vocab_size=train.vocab_size,
                      N_hidden=N_hidden,
                      N_video_step=train.feat_timestep,
                      N_caption_step=train.maxseqlen,
                      batch_size=batch_size)

    # Loss function and optimizer
    tf_loss, tf_video, tf_caption, tf_caption_mask, _ = model.build_train_model(
        dictionary)
    tf_optimizer = tf.train.AdamOptimizer(learning_rate).minimize(tf_loss)

    init = tf.global_variables_initializer()

    #    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
    with tf.Session() as sess:
        sess.run(init)
        model.restore_model(sess, model_file)
        step = 0

        t = time.time()
        while step < N_iter:
            batch_x, batch_y = train.next_batch(batch_size=batch_size)
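            # Pad every caption to maxseqlen with the EOS token; y_mask marks the real tokens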
            y = np.full((batch_size, train.maxseqlen), dictionary[EOS_tag])
            y_mask = np.zeros(y.shape)

            for i, caption in enumerate(batch_y):
                y[i, :len(caption)] = caption
                y_mask[i, :len(caption)] = 1

            sess.run(tf_optimizer,
                     feed_dict={
                         tf_video: batch_x,
                         tf_caption: y,
                         tf_caption_mask: y_mask
                     })

            #            if True:
            if step % display_step == 0:
                used_time = time.time() - t
                t = time.time()
                loss = sess.run(tf_loss,
                                feed_dict={
                                    tf_video: batch_x,
                                    tf_caption: y,
                                    tf_caption_mask: y_mask
                                })
                print(
                    str(step) + '/' + str(N_iter) + ' step: loss = ' +
                    str(loss) + ' time = ' + str(used_time) + ' secs')
                model.save_model(sess, model_file)

            step += 1

        model.save_model(sess, model_file)

    return
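
# The caption mask built in run_train is the standard device for ignoring padded
# positions in a sequence loss. The function below is only a sketch of that idea;
# it is an assumption about what s2vtModel.build_train_model might do internally,
# not code taken from this example.
import tensorflow as tf


def sketch_masked_sequence_loss(logits, captions, caption_mask):
    # logits: [batch, time, vocab] float32, captions: [batch, time] int labels,
    # caption_mask: [batch, time] float32, 1.0 for real tokens and 0.0 for padding
    xent = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=captions,
                                                          logits=logits)
    masked = xent * caption_mask
    # Average only over the real (unmasked) caption tokens
    return tf.reduce_sum(masked) / tf.reduce_sum(caption_mask)
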
Exemple #18
0
def main():
    hyper_params = HyperParams.HyperParams()
    hyper_params.parse_args()
    dataPreprocessor = DataPreprocessor.Wine()
    train_X, test_X, train_y, test_y = dataPreprocessor.get_data()
    input_layer_size = dataPreprocessor.get_input_layer_size()
    output_layer_size = dataPreprocessor.get_output_layer_size()

    model_evaluator = ModelEvaluator.ModelEvaluator()

    toolbox_factory = EvolutionaryToolboxFactory.EvolutionaryToolboxFactory()
    toolbox = toolbox_factory.get_toolbox(hyper_params.mutation_probability,
                                          hyper_params.tournament_size,
                                          hyper_params.max_number_of_layers)
    toolbox.register("evaluate", model_evaluator.evalOneMax)

    pop = toolbox.population(n=hyper_params.population_size)

    CXPB, MUTPB = hyper_params.crosover_probability, hyper_params.mutation_probability

    if DEBUG:
        print("Start of evolution")

    # Evaluate the entire population
    fitnesses = []
    for el in pop:
        if DEBUG:
            print(el)
        result = toolbox.evaluate(el, hyper_params.number_of_epochs,
                                  hyper_params.learning_rate, input_layer_size,
                                  output_layer_size, hyper_params.hidden_units,
                                  train_X, test_X, train_y, test_y)
        fitnesses.append(result)
    if DEBUG:
        print("fitness", fitnesses)
    for ind, fit in zip(pop, fitnesses):
        if DEBUG:
            print("ind", "fit", ind, fit)
        ind.fitness.values = fit
    if DEBUG:
        print("  Evaluated %i individuals" % len(pop))

    # Extract all the fitnesses of the individuals in the population
    fits = [ind.fitness.values[0] for ind in pop]

    # Variable keeping track of the number of generations
    g = 0

    # Begin the evolution
    while g < hyper_params.number_of_generations:
        # A new generation
        g = g + 1
        if DEBUG:
            print("-- Generation %i --" % g)

        # Select the next generation individuals
        offspring = toolbox.select(pop, len(pop))

        # varAnd clones the selected individuals and applies crossover and mutation;
        # it must act on the selection result, not on pop, or the selection step is lost
        offspring = algorithms.varAnd(offspring, toolbox, CXPB, MUTPB)
        invalid_ind = [ind for ind in offspring if not ind.fitness.valid]

        fitnesses = []
        for el in invalid_ind:
            result = toolbox.evaluate(el, hyper_params.number_of_epochs,
                                      hyper_params.learning_rate,
                                      input_layer_size, output_layer_size,
                                      hyper_params.hidden_units, train_X,
                                      test_X, train_y, test_y)
            fitnesses.append(result)

        for ind, fit in zip(invalid_ind, fitnesses):
            #print ("ind", "fit", ind, fit)
            ind.fitness.values = fit
        if DEBUG:
            print("  Evaluated %i individuals" % len(invalid_ind))

        # The population is entirely replaced by the offspring
        pop[:] = offspring

        # Gather all the fitnesses in one list and print the stats
        fits = [ind.fitness.values[0] for ind in pop]

        length = len(pop)
        mean = sum(fits) / length
        sum2 = sum(x * x for x in fits)
        std = abs(sum2 / length - mean**2)**0.5
        if DEBUG:
            print("  Min %s" % min(fits))
            print("  Max %s" % max(fits))
            print("  Avg %s" % mean)
            print("  Std %s" % std)
    if DEBUG:
        print("-- End of (successful) evolution --")

    best_ind = tools.selBest(pop, 1)[0]
    if DEBUG:
        print("Best individual is %s, %s" %
              (best_ind, best_ind.fitness.values))
    else:
        with open(hyper_params.file_path, 'a') as file:
            file.write(hyper_params.params_str())
            file.write(
                str(best_ind) + "|" + str(best_ind.fitness.values)[1:][:-2] +
                "\n")
Exemple #19
0
import pandas as pd
import numpy as np
from Layers import dense_layer
from Model import nn_sequential_model
import DataPreprocessor as dp

columns = ["F0", "F1", "F2", "F3", "F4", "F5", "F6", "F7", "F8", "F9", "T"]
raw_df = pd.read_csv("data.txt", names=columns, sep=",").sample(frac=1)
X_train, Y_train, x_test, y_test = dp.train_test_split(raw_df,
                                                       split_ratio=0.7,
                                                       normalize=True)

ann = nn_sequential_model()
ann.add_layer(dense_layer(10))
ann.add_layer(dense_layer(15, activation="softplus"))
ann.add_layer(dense_layer(15, activation="softplus"))
ann.add_layer(dense_layer(1, activation="sigmoid"))
ann.compile(epochs=50000, loss="binary_crossentropy", lr=1e-3)
ann.fit(X_train, Y_train, plot_freq=1, batch_size=16)

print("\ntraining metrics:", end=' ')
ann.evaluate(ann.predict(X_train), Y_train)
print("testing metrics:", end=' ')
ann.evaluate(ann.predict(x_test), y_test)