Example #1
def run():
    print("Preprocessing the training data.")
    X_train, y_train = read_data('data/train_set.fasta')
    print("Preprocessing the test data.")
    X_test, y_test = read_data('data/benchmark_set.fasta')

    print("Running model 1.")
    model_1 = RandomForestClassifier(n_estimators=10, criterion='gini')
    model_1.fit(X_train, y_train)
    accuracy_1 = model_1.score(X_test, y_test)
    print(f"Model 1 accuracy: {accuracy_1}\n")

    print("Running model 2.")
    model_2 = RandomForestClassifier(n_estimators=100, criterion='gini')
    model_2.fit(X_train, y_train)
    accuracy_2 = model_2.score(X_test, y_test)
    print(f"Model 2 accuracy: {accuracy_2}\n")

    print("Running model 3.")
    model_3 = RandomForestClassifier(n_estimators=10, criterion='entropy')
    model_3.fit(X_train, y_train)
    accuracy_3 = model_3.score(X_test, y_test)
    print(f"Model 3 accuracy: {accuracy_3}\n")

    print("Running model 4.")
    model_4 = RandomForestClassifier(n_estimators=100, criterion='entropy')
    model_4.fit(X_train, y_train)
    accuracy_4 = model_4.score(X_test, y_test)
    print(f"Model 4 accuracy: {accuracy_4}\n")
Example #2
def test():
    import numpy as np
    import tensorflow as tf
    import preprocessing as pp
    X, Y, Yd, B = pp.read_data('krk_data_20000_balanced_8.cpkl')
    ind_nd = np.where(Yd == 0)[0]
    Ydtm = Y[ind_nd]
    Xdtm = np.zeros((Ydtm.shape[0], X.shape[1]))
    Bdtm = []
    for i in range(len(ind_nd)):
        Xdtm[i, :] = X[ind_nd[i]]
        Bdtm.append(B[ind_nd[i]])
    N = Xdtm.shape[0]
    print "Ydtm:{}\tXdtm{}".format(Ydtm.shape, Xdtm.shape)
    """
    TODO: delete this for loop
    """
    print "data read without splitting"
    for i in xrange(5):
        print B[i], Y[i]

    N = len(Y)
    Y = np.array(Y).reshape((N, 1))
    Yd = np.array(Yd).reshape((N, 1))

    D = X.shape[1]
    print(D)
    M = [128]
    c0 = 8
    F = [(1, 1), (3, 3), (3, 3)]
    C = [(8, ), (16, ), (32, )]

    import learn
    # first time use:
    graph = build_graph(F, C, c0, M, D)
    with tf.Session(graph=graph) as sess:
        learn.fit(sess, Xdtm, Ydtm, Bdtm, init=True)
def main():
    if len(sys.argv) != 2:
        print("python3 kmeans_ssd.py filename")
        sys.exit(1)

    data = read_data(sys.argv[1])
    kmeans(data)
Example #4
 def train(self):
     input_setup()
     data_dir = os.path.join(os.getcwd(), "checkpoint\\train.h5")
     train_data,train_label = read_data(data_dir)
     glob_step = tf.Variable(0)
     learning_rate_exp = tf.train.exponential_decay(config.learning_rate, glob_step, 1480, 0.98,
                                                     staircase=True)  # multiply the learning rate by 0.98 every epoch
     self.train_op = tf.train.GradientDescentOptimizer(learning_rate_exp).minimize(self.loss,
                                                                                   global_step = glob_step)
     tf.global_variables_initializer().run()
     counter = 0
     start_time = time.time()
     if self.load(self.checkpoint_dir):
         print(" [*] Load SUCCESS")
     else:
         print(" [!] Load failed...")
     print("Training...")
     for ep in range(config.epoch):
         batch_indx = len(train_data)//config.batch_size
         for idx in range(0,batch_indx):
             batch_images = train_data[idx * config.batch_size: (idx + 1) * config.batch_size]
             batch_labels = train_label[idx * config.batch_size: (idx + 1) * config.batch_size]
             counter += 1
             _,err = self.sess.run([self.train_op,self.loss],
                                   feed_dict = {self.images:batch_images,self.labels:batch_labels})
             if counter % 10 == 0:  # print progress every 10 steps
                 print("Epoch: [%2d], step: [%2d], time: [%4.4f], loss: [%.8f]" % ((ep + 1), counter, time.time() - start_time, err))
             if counter % 500 == 0:  # save a checkpoint every 500 steps
                 self.save(config.checkpoint_dir, counter)
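With staircase=True, the schedule above multiplies the learning rate by 0.98 once per 1480 steps (one epoch in this setup). A standalone arithmetic check of that schedule in plain Python, using a made-up base rate since config.learning_rate is not shown here:

base_lr = 1e-4  # hypothetical value; the original reads it from config.learning_rate
for step in (0, 1480, 2960, 14800):
    lr = base_lr * 0.98 ** (step // 1480)  # staircase exponential decay
    print(step, lr)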
Example #5
def run_test():
    # loading and shuffling data and splitting into train/test sets
    instances, labels = read_data('../data/Tweets.csv')

    paired = list(zip(instances, labels))
    shuffle(paired)
    instances, labels = zip(*paired)

    bows = list(map(bag_of_words, map(sanitize, instances)))

    bows, _, _ = bows_to_numpy(bows)
    labels, _, _ = labels_to_numpy(labels)

    train_size = 10000
    test_size = 100
    bows_tr, labels_tr = bows[:train_size], labels[:train_size]
    bows_test, labels_test = bows[train_size:train_size + test_size],\
        labels[train_size:train_size + test_size]

    # learning weights on train set
    predictions = list(
        map(lambda x: predict(x, bows_tr, labels_tr), bows_test))

    # evaluating classification accuracy using learned weights on the test set
    labels_test = np.argmax(labels_test, axis=1)
    print('Accuracy:', accuracy_score(labels_test, predictions))
    print(classification_report(labels_test, predictions))
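The split above shuffles by zipping, shuffling, and slicing. For comparison, a self-contained sketch of the same kind of split with scikit-learn's train_test_split (toy arrays here, not the Tweets.csv data):

import numpy as np
from sklearn.model_selection import train_test_split

X = np.random.rand(200, 50)            # stand-in for the bag-of-words matrix
y = np.random.randint(0, 3, size=200)  # stand-in for the numeric labels
X_tr, X_te, y_tr, y_te = train_test_split(X, y, train_size=150, test_size=50, shuffle=True)
print(X_tr.shape, X_te.shape)
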
def main():
    if len(sys.argv) != 2:
        print("python3 filepath")
        sys.exit(1)
    data = preprocessing.read_data(sys.argv[1])
    agglomerative_clustering(data, sys.argv[1])
    return
Example #7
def get_data():

    data = pre.read_data()

    variables = data['Variable'].unique().tolist()[:5]  # testing with only 5 variables
    maps_data = {}

    for var in variables:
        print(var)
        df_variable = pre.df_variable(data, var)
        # add color column
        df_variable['color'] = [
            RdYlBu[11][val] for val in pd.cut(
                x=df_variable['Concentración'], bins=11, labels=False)
        ]

        geo_features = create_geojson_features(df_variable.reset_index())
        maps_data[var] = TimestampedGeoJson(
            {
                'type': 'FeatureCollection',
                'features': geo_features
            },
            period='P1D',
            add_last_point=True,
            auto_play=False,
            loop=False,
            max_speed=10,
            loop_button=True,
            date_options='YYYY/MM',
            duration='P1D',
            time_slider_drag_update=True)
    return maps_data, variables
def run_test():
    instances, labels = read_data('../data/Tweets.csv')
    bows = list(map(bag_of_words, map(sanitize, instances)))
    weights = estimate_weights(bows, labels, 0.001)
    predictions = predict_all(bows, weights, list(set(labels)))
    prediction_labels = [p[0] for p in predictions]
    print(accuracy_score(labels, prediction_labels))
Example #9
def main():
    if len(sys.argv) != 3:
        print("python clustering_quality.py filename labelsfilename")
        sys.exit(1)

    labels = get_labels(sys.argv[2])
    data = preprocessing.read_data(sys.argv[1])
    print(
        metrics.cluster.silhouette_score(X=data.values,
                                         labels=labels,
                                         metric='euclidean'))
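silhouette_score only needs the feature matrix and one cluster label per row. A self-contained toy check with synthetic blobs (not the project's data):

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

rng = np.random.default_rng(0)
X = np.vstack([rng.normal(0, 1, (50, 2)), rng.normal(5, 1, (50, 2))])  # two well-separated blobs
labels = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(X)
print(silhouette_score(X, labels, metric='euclidean'))  # close to 1 for well-separated clusters
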
def preprocess_test(meta_data: pd.DataFrame) -> Dict:
    logging.info('Reading test data')
    test_building_data, test_weather_data = pp.read_data(TEST_DATA_PATH,
                                                         TEST_WEATHER_PATH,
                                                         meta_data,
                                                         nrows=None)
    test_data = features.prepare_features(test_building_data,
                                          test_weather_data)
    test_sets = splits.split_data_by_meter(test_data)
    logging.info('Test set ready.')
    return test_sets
Example #11
def main():

    train_feature = read_data(os.path.join(data_path, train_feature_file))
    print(train_feature.head())

    train_salaries = read_data(os.path.join(data_path, train_salary_file))
    print(train_salaries.head())

    train_data = pd.merge(train_feature,
                          train_salaries,
                          how="left",
                          on="jobId")
    print(train_data.head())

    salary_info = company_salary(train_data)
    print(salary_info)

    salary_by_types(train_data, "degree")

    salary_by_types(train_data, "major")
    company_jobs(train_data)
def preprocess_train(meta_data: pd.DataFrame, remove_zeros: bool) -> Dict:
    #meta_data = pp.read_building_metadata(META_DATA_PATH)
    logging.info('Reading training data')
    train_building_data, train_weather_data = pp.read_data(
        TRAIN_DATA_PATH,
        TRAIN_WEATHER_PATH,
        meta_data,
        remove_zeros=remove_zeros,
        nrows=None)
    logging.info('Preparing training features and target')
    training_data = features.prepare_features(train_building_data,
                                              train_weather_data)
    target.get_log_target(training_data)
    logging.info('Splitting data by meter type...')
    train_sets = splits.split_data_by_meter(training_data)
    logging.info('Training set ready.')
    return train_sets
Example #13
def test(self,sess):
    nx,ny = input_up(sess)
    print(nx,ny)
    data_dir = os.path.join(os.getcwd(), "checkpoint\\test.h5")
    test_data, test_label = preprocessing.read_data(data_dir)
    if SRCNN.load(self,config.checkpoint_dir):
        print(" [*] Load SUCCESS")
    else:
        print(" [!] Load failed...")
    print("Testing...")
    #312*21
    result = SRCNN.model(self).eval({self.images:test_data,self.labels:test_label})
    result = merge(result,[nx,ny])
    result = result.squeeze()  # drop dimensions of size 1
    # result = exposure.adjust_gamma(result, 1.07)  # darken the image slightly
    image_path = os.path.join(os.getcwd(), "sample")
    image_path = os.path.join(image_path, "MySRCNN.bmp")
    preprocessing.imsave( image_path,result)
def run_test():
    # loading and shuffling data and splitting into train/test sets
    instances, labels = read_data('../data/Tweets.csv')

    paired = list(zip(instances, labels))
    shuffle(paired)
    instances, labels = zip(*paired)

    bows = list(map(bag_of_words, map(sanitize, instances)))
    bows_tr, labels_tr = bows[:10000], labels[:10000]
    bows_test, labels_test = bows[10000:], labels[10000:]

    # learning weights on train set
    weights = estimate_weights(bows_tr, labels_tr, 10)

    # evaluating classification accuracy using learned weights on the test set
    predictions = predict_all(bows_test, weights, list(set(labels)))
    labels_prediction = [p[0] for p in predictions]
    print('Accuracy:', accuracy_score(labels_test, labels_prediction))
    print(classification_report(labels_test, labels_prediction))
def run_test():
    # loading and shuffling data and splitting into train/test sets
    instances, labels = read_data('../data/Tweets.csv')

    paired = list(zip(instances, labels))
    shuffle(paired)
    instances, labels = zip(*paired)

    bows = list(map(bag_of_words, map(sanitize, instances)))

    bows, _, _ = bows_to_numpy(bows)
    labels, _, _ = labels_to_numpy(labels)

    train_size = 1000
    bows_tr, labels_tr = bows[:train_size], labels[:train_size]
    bows_test, labels_test = bows[train_size:], labels[train_size:]

    sizes = [len(bows[0]), 15, 3]
    biases = [np.random.randn(s, 1) for s in sizes[1:]]
    weights = [
        np.random.randn(s_out, s_in)
        for s_in, s_out in zip(sizes[:-1], sizes[1:])
    ]

    # learning weights on train set
    stochastic_gradient_descent(bows_tr,
                                labels_tr,
                                weights,
                                biases,
                                epochs=50,
                                activation_fn=sigmoid,
                                activation_fn_deriv=sigmoid_deriv)

    # evaluating classification accuracy using learned weights on the test set
    predictions = np.argmax(predict_all(bows_test, weights, biases), axis=1)
    labels_test = np.argmax(labels_test, axis=1)
    print('Accuracy:', accuracy_score(labels_test, predictions))
    print(classification_report(labels_test, predictions))
Example #16
from keras.models import Sequential
from keras.layers import LSTM, Embedding, Dense
from keras.optimizers import Adam
from keras.losses import categorical_crossentropy
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from preprocessing import read_data, labels_to_numpy
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

if __name__ == "__main__":
    # Get the untransformed data
    X, y = read_data('../data/Tweets.csv')

    # Label each of the words in the data
    num_words = 8000
    t = Tokenizer(num_words=num_words)
    t.fit_on_texts(X)

    # Convert the data into labeled sequences of fixed length
    X = t.texts_to_sequences(X)
    X = pad_sequences(X)

    y, _, _ = labels_to_numpy(y)

    # Split into training and testing data
    train_percent = 0.5
    train_size = int(len(X) * train_percent)
    X_test = X[train_size:]
Example #17
        else:
            return accuracy, score


if __name__ == '__main__':
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    lr = 1e-2
    wd = 1e-5
    val_num = 10000
    aug = True
    result_name = 'nn_noonehotencoder3feat_3fc_100'
    mode = 'all_feat'

    train_file = 'dota2Train.csv'
    test_file = 'dota2Test.csv'
    train_data, train_label = read_data(train_file, shuffle=True)
    test_data, test_label = read_data(test_file, shuffle=True)

    with tf.Graph().as_default():
        # build graph

        # control input
        batch_size = tf.placeholder(tf.int32, shape=[])

        # data input
        train_data = tf.constant(train_data,
                                 dtype=tf.float32,
                                 shape=train_data.shape)
        train_label = tf.constant(train_label,
                                  dtype=tf.int32,
                                  shape=train_label.shape)
from preprocessing import read_data
from sklearn.neural_network import MLPClassifier
from evaluation import evaluate_bow_classifier

if __name__ == "__main__":
    clf = MLPClassifier(verbose=1)
    instances, labels = read_data('../data/Tweets.csv')
    evaluate_bow_classifier(instances, labels, clf, use_argmax_labels=False)
Example #19
    report["hbias"] = rbm.hbias
    report["vbias"] = rbm.vbias
    np.save("report", report)

#%%============================================================================
# Make a prediction
# ==============================================================================

test_data = np.load("test_data.npy")
test_data = np.concatenate((np.zeros((len(test_data), 20)), test_data), axis=1)
y_pred = np.zeros(len(test_data))
for i in range(len(y_pred)):
    sys.stdout.write("\rPrediction advancement: %d%%" % (100 * float(i) / len(y_pred)))
    sys.stdout.flush()
    y_pred[i] = rbm.predict_one(test_data[i, :])
train_ids, train_cuisines, train_ingredients = read_data("train.json")
test_ids, test_cuisines, test_ingredients = read_data("test.json")
del train_ids, train_ingredients, test_cuisines, test_ingredients
le = LabelEncoder()
le.fit(train_cuisines)
pred = le.inverse_transform(y_pred.astype("int"))
create_submission(test_ids, pred)


#%%============================================================================
# Sampling from the RBM
# ==============================================================================
from preprocessing import (
    read_data,
    make_lowercase,
    remove_numbers,
def main():
    if len(sys.argv) != 2:
        print("python3 filename")
        sys.exit(1)
    data = preprocessing.read_data(sys.argv[1])
    kmeans_fun(data, sys.argv[1])
Example #21
from preprocessing import read_data, onehot_encode, data_aug_np
from sklearn import svm
from evaluate import evaluate
import pickle
import os
import time

if __name__ == '__main__':
    result_name = 'svm_onlyheroes' + '.pickle'
    mode = 'one_hot_all_feat'
    train_file = 'dota2Train.csv'
    test_file = 'dota2Test.csv'

    assert os.path.exists('result')

    train_data, train_label = read_data(train_file)
    test_data, test_label = read_data(test_file)
    train_data = train_data[:, :]
    train_label = train_label[:]
    test_data = test_data[:, :]
    test_label = test_label[:]
    test_data, test_label = data_aug_np(test_data, test_label)

    if mode == 'only_heroes':
        train_data = train_data[:, 3:]
        test_data = test_data[:, 3:]
    elif mode == 'all_feat':
        pass
    elif mode == 'one_hot_all_feat':
        train_data, test_data = onehot_encode(train_data, test_data)
    rbm.hbias = report["hbias"]
    rbm.vbias = report["vbias"]

Y = np.argmax(train_data[:,:20], axis=1)
train_data = train_data[:,20:]
X = sigmoid(np.dot(train_data, rbm.W) + rbm.hbias)
#X = train_data


classifier = lr(0.01, solver = 'lbfgs', multi_class='multinomial')
classifier.fit(X, Y)

test_data = np.load('test_data.npy')
test_X = sigmoid(np.dot(test_data, rbm.W) + rbm.hbias)
#test_X = test_data

pred = classifier.predict(test_X)
train_ids, train_cuisines, train_ingredients = read_data('train.json')
test_ids, test_cuisines, test_ingredients = read_data('test.json')
del train_ids, train_ingredients, test_cuisines, test_ingredients
le = LabelEncoder()
le.fit(train_cuisines)
pred = le.inverse_transform(pred)
create_submission(test_ids, pred)
Example #23
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from preprocessing import clean_sentence, read_data
from sklearn.metrics.pairwise import cosine_similarity
data_train = read_data('train_pairs.csv')
data_test = read_data('test_pairs.csv')
# a[start:stop:step]
corpus = data_train[:, 0:2:1].flatten()
corpus = np.append(corpus, data_test[:, 0:2].flatten())
clean_corpus = [clean_sentence(doc) for doc in corpus]
vectorizer = TfidfVectorizer()
vectorizer.fit(clean_corpus)

def test_step(threshold):
    origins_vec = vectorizer.transform(data_test[:, 0])
    suspects_vec = vectorizer.transform(data_test[:, 1])
    labels = data_test[:, 2]

    score = 0
    accuracy = 0
    for origin_vec, suspect_vec, label in zip(origins_vec, suspects_vec, labels):
        sim = cosine_similarity(origin_vec, suspect_vec)
        if sim > threshold:
            if float(label) == 1:
                score += 1
    accuracy = score / len(data_test)

    print('Accuracy test:', accuracy)


def train_step():
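cosine_similarity on two single-row TF-IDF matrices returns a 1x1 array, which is what the threshold comparison in test_step relies on. A self-contained illustration with made-up sentences (not the train_pairs.csv data):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

docs = ["the cat sat on the mat",
        "a cat was sitting on the mat",
        "stock prices fell sharply today"]
X = TfidfVectorizer().fit_transform(docs)
print(cosine_similarity(X[0], X[1])[0, 0])  # high: overlapping vocabulary
print(cosine_similarity(X[0], X[2])[0, 0])  # zero: no shared terms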
Example #24
def train():
	graph = tf.Graph()
	with graph.as_default():

		global_step = tf.Variable(0, name='global_step', trainable=False)

		# im, la = pre.get_train()
		im, la = pre.get_val()
		images, labels = pre.read_data(im, la, BATCH_SIZE, NUM_SAMPLES, True)

		# First convolutional layer 
		W_conv1 = weight_variable('conv_weights_1', [5, 5, 3, 24], 0.01)
		b_conv1 = bias_variable('conv_biases_1', [24])
		h_conv1 = tf.nn.relu(conv2d(images, W_conv1) + b_conv1)

		# Pooling layer - downsamples by 2X.
		max_pool_1 = max_pool_2x2(h_conv1)

		# Second convolutional layer 
		W_conv2 = weight_variable('conv_weights_2', [5, 5, 24, 36], 24.0)
		b_conv2 = bias_variable('conv_biases_2', [36])
		h_conv2 = tf.nn.relu(conv2d(max_pool_1, W_conv2) + b_conv2)

		# Second Pooling layer
		max_pool_2 = max_pool_2x2(h_conv2)

		# Third convolutional layer 
		W_conv3 = weight_variable('conv_weights_3', [5, 5, 36, 48], 36.0)
		b_conv3 = bias_variable('conv_biases_3', [48])
		h_conv3 = tf.nn.relu(conv2d(max_pool_2, W_conv3) + b_conv3)

		# Third Pooling layer
		max_pool_3 = max_pool_2x2(h_conv3)

		# Fourth convolutional layer 
		W_conv4 = weight_variable('conv_weights_4', [3, 3, 48, 64], 48.0)
		b_conv4 = bias_variable('conv_biases_4', [64])
		h_conv4 = tf.nn.relu(conv2d(max_pool_3, W_conv4) + b_conv4)

		# Fifth convolutional layer 
		W_conv5 = weight_variable('conv_weights_5', [3, 3, 64, 64], 64.0)
		b_conv5 = bias_variable('conv_biases_5', [64])
		h_conv5 = tf.nn.relu(conv2d(h_conv4, W_conv5) + b_conv5)

		#stack result into one dimensional vector by using -1 option
		conv_flat = tf.reshape(h_conv5, [BATCH_SIZE, -1]) 

		# Fully connected layer 1
		W_fc1 = weight_variable('fc_weights_1', [1 * 18 * 64, 1164], 1164.0)
		b_fc1 = bias_variable('fc_biases_1', [1164])
		h_fc1 = tf.nn.relu(tf.matmul(conv_flat, W_fc1) + b_fc1)

		# Fully connected layer 2
		W_fc2 = weight_variable('fc_weights_2', [1164, 100], 100.0)
		b_fc2 = bias_variable('fc_biases_2', [100])
		h_fc2 = tf.nn.relu(tf.matmul(h_fc1, W_fc2) + b_fc2)

		# Fully connected layer 3
		W_fc3 = weight_variable('fc_weights_3', [100, 10], 10.0)
		b_fc3 = bias_variable('fc_biases_3', [10])
		h_fc3 = tf.nn.relu(tf.matmul(h_fc2, W_fc3) + b_fc3)

		# Fully connected layer 4
		W_fc4 = weight_variable('fc_weights_4', [10, 1], 1.0)
		b_fc4 = bias_variable('fc_biases_4', [1])
		h_fc4 = tf.matmul(h_fc3, W_fc4) + b_fc4

		# radians in the range of [-pi/2, pi/2], multiplied by 2 to get a 360° range
		y = tf.multiply(tf.atan(h_fc4), 2)

		loss = loss_func(y, labels)

		# training operator for session call
		# train_op, lr = optimize(loss, global_step)

		# max_to_keep option to store all weights
		saver = tf.train.Saver(tf.global_variables(), max_to_keep=None)

		#tensorflow session 
		session = tf.Session()

		#tensorboard
		merged = tf.summary.merge_all()
		train_writer = tf.summary.FileWriter('train', session.graph)

		#initialization of all variables
		session.run(tf.global_variables_initializer())
		session.run(tf.local_variables_initializer())

		#threads
		coord = tf.train.Coordinator()
		threads = tf.train.start_queue_runners(coord=coord, sess=session)

		#save weights in directory
		#TODO file is empty
		# ckpt = tf.train.get_checkpoint_state('./weights/')
		# saver.restore(session, '/work/raymond/dlcv/dlcv_visnav/src/check_files/model99.ckpt-99')

		logging.basicConfig(filename='../log/training_eval.log',level=logging.INFO)


		for x in range(NUM_ITER):
			average_loss = 0.0
			ckpt = tf.train.get_checkpoint_state('/work/raymond/dlcv/dlcv_visnav/src/check_files/')

			checkpoint_dir = '/work/raymond/dlcv/dlcv_visnav/src/check_files/'
			checkpoint_filename = 'model'+str(x)+'.ckpt-'+str(x)
			saver.restore(session, checkpoint_dir+checkpoint_filename)
			print(checkpoint_filename + " loaded successfully...")

			for y in range(NUM_BATCHES):
				#print("testing...")
				lossVal = session.run(loss)
				print('iteration: ', x)
				print('loss: ', lossVal)
				average_loss = average_loss+lossVal
				
			# 	#print("done")
				print('batch: ', y)
				# print(lossVal)
			# 	# print(image_out.shape)
				
			# 	#break
			
			average_loss = average_loss/NUM_BATCHES
			print("average_loss: ", average_loss)

			content = x, checkpoint_filename, average_loss
			logging.info(content)
			
			# str1 = str(x)
			# str2 = "check_files/model"
			# str3 = ".ckpt"
			# str4 = str2 + str1 + str3

			# save_path = saver.save(session, str4, global_step=x)

			# content = datetime.now(), x, curr_learnRate, average_loss
			# logging.info(content)


		
		train_writer.close()
		
		#tensorflow threads 
		coord.request_stop()
		coord.join(threads)
Example #25
 def _load_data(self, filename):
     """
     Loads hr and acc data into a pandas.DataFrame from a dataset file.
     """
     return extract_hr_acc(read_data(filename, self.base_datetime))
os.chdir("C:/Personal/Kaggle/ASHRAE/python_scripts")
from preprocessing import read_data, parse_timestamp, parallelize_dataframe
from model_training import create_dummies, score, rmsle

## Final Submission file
submission = pd.DataFrame()


def final_model_predict(x, model):
    preds = model.predict(x)
    x['meter_reading_pred'] = preds
    return x


meter_0 = read_data(
    "C:/Personal/Kaggle/ASHRAE/ashrae-energy-prediction/output/test_final_3.csv"
)
meter_0 = parse_timestamp(meter_0, 'timestamp')
meter_0.columns
meter_0['site_id'] = meter_0['site_id'].astype(int).astype(str)

meter_0 = create_dummies(meter_0,
                         ['site_id', 'primary_use', 'square_feet_profile'])

drop_col_list = [
    'wind_direction', 'square_feet', 'year_built', 'floor_count', 'month',
    'day', 'hour', 'year', 'primary_use', 'meter', 'site_id',
    'square_feet_profile', 'building_id', 'timestamp'
]
meter_0 = meter_0.drop(drop_col_list, axis=1)
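create_dummies is project-specific (imported from model_training above). Assuming it one-hot encodes the listed categorical columns, a generic pandas equivalent might look like this:

import pandas as pd

# Hypothetical stand-in for the project's create_dummies helper.
def create_dummies_generic(df, columns):
    return pd.get_dummies(df, columns=columns)

demo = pd.DataFrame({'site_id': ['0', '1', '0'], 'primary_use': ['Office', 'Education', 'Office']})
print(create_dummies_generic(demo, ['site_id', 'primary_use']).columns.tolist())
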
Example #27
def run():
    num_classes = 2
    image_shape=(64,64)
    curdir = os.getcwd()
    pardir = os.path.abspath(os.path.join(curdir, os.pardir))
    datadir = os.path.join(pardir,'pictures')
    runsdir = os.path.join(curdir, 'runs')
    
    epochs = 20
    batch_size = 128
    
    # Get data
    data = preprocessing.read_data(datadir)
    train_data, test_data = preprocessing.test_train_split(data, 0.2)
    tf.get_default_graph()
    input_image = tf.placeholder(tf.float32,(None, image_shape[0], image_shape[1], 3), name='input_image')
    y_label = tf.placeholder(tf.int64, (None), name='label')
    prob = tf.placeholder(tf.float32, name='prob')
    learning_rate_ph = tf.placeholder("float")
    logits = LeNet6(input_image, num_classes, prob)
    train_op, cross_entropy_loss = optimize(logits, y_label, learning_rate_ph, num_classes)
    pred_class = tf.argmax(tf.nn.softmax(logits), axis=1, name='pred')
    correct_prediction = tf.equal(pred_class, y_label)
    float_cast_pred = tf.cast(correct_prediction,tf.float32)
    accuracy = tf.reduce_mean(float_cast_pred, name='accuracy')
    #saver = tf.train.Saver()
    
    with tf.Session() as sess:
        # Get train data generator
        get_batches_fn = preprocessing.gen_batch_function(data, image_shape)
        sess.run(tf.global_variables_initializer())
        print("Training...")
        print()
        for i in range(epochs):
            train_loss = 0
            train_acc = 0
            samples = 0
            time_start = time.time()
            for images, labels in get_batches_fn(batch_size):
                _, loss, acc, pred_y, act_y, float_pred = sess.run([train_op, cross_entropy_loss, accuracy, pred_class, y_label, float_cast_pred], 
                                        feed_dict={input_image: images, y_label: labels, prob: 0.5, learning_rate_ph:1e-3})
                #print('Images shape:',images.shape)
                #print('pred_y', pred_y)
                #print('act_y', act_y)
                #print('float_cast_pred:', float_pred)
                train_loss += loss
                train_acc += acc
                samples += 1
                #print('loss:', loss, 'train_loss:',train_loss)
                #print('acc:', acc, 'train_acc:', train_acc)
                
            total_time = time.time() - time_start
            print("EPOCH {} ...".format(i+1))
            print("Loss = {}".format(train_loss/samples))
            print("Training accuracy = {}".format(train_acc/samples))
            print("Time = {} mins".format(total_time/60))
            print()
        # Test accuracy
        test_images, test_labels = preprocessing.gen_test_data(test_data, image_shape)
        loss, acc = sess.run([cross_entropy_loss, accuracy],
                             feed_dict={input_image:test_images, y_label:test_labels, prob:1})
        print("Test loss = {}".format(loss))
        print("Test accuracy = {}".format(acc))
        saver = tf.train.Saver()
        saver.save(sess, './model/model.ckpt')
        print('model saved!')
Example #28
            one_hot_label_encoder.inverse_transform(result)))
    Y_test = np.array(
        label_encoder.transform(
            one_hot_label_encoder.inverse_transform(Y_test)))
    return result, Y_test


def evaluation(Y_pred, Y_true):
    labels = list(one_hot_label_encoder.categories_[0])
    conf_mat = confusion_matrix(Y_true, Y_pred)
    plot_confusion_matrix(conf_mat, labels, accuracy_score(Y_true, Y_pred))


# get the dataset
dataset_path = "Data_Set.csv"
dataset = prepro.read_data(file_name=dataset_path)  # read data

# split data in training and testing data
trainingdataset, testing_dataset = train_test_split(dataset,
                                                    test_size=0.2,
                                                    random_state=42)

# Plot training and testing data-set
prepro.plot_train_test_per_class(trainingdataset, testing_dataset)

# resampling the training data-set for balancing
oversampled = prepro.divise_data_in_balanced_data(trainingdataset)

# Plot the final balanced training data-set
prepro.plot_data_per_class(oversampled)
Example #29
def lines_to_words(lines):
    words = []

    for line in lines:
        for word in line:
            words.append(word)

    return words


if __name__ == '__main__':
    # Gets list of words from Hamlet and sonnets
    play_lines = process_text_ham(ham)
    toke_play_lines = tokenize_ham(play_lines)
    no_punct_play_lines = elim_punct(toke_play_lines)
    sonnet_lines = pp.read_data(toke_lines)
    no_punct_sonnet_lines = elim_punct(sonnet_lines)
    lines = no_punct_play_lines + no_punct_sonnet_lines
    text = lines_to_words(lines)

    # Prepare data for model
    words = sorted(list(set(text)))
    word_to_int = dict((c, i) for i, c in enumerate(words))
    int_to_word = dict((i, c) for i, c in enumerate(words))

    n_words = len(text)
    n_vocab = len(words)

    seq_length = 5

    dataX = []
Example #30
def test():
    graph = tf.Graph()
    with graph.as_default():

        global_step = tf.Variable(0, name='global_step', trainable=False)

        im, la = pre.get_test()
        images, labels = pre.read_data(im, la, BATCH_SIZE, NUM_SAMPLES, False)

        # First convolutional layer
        W_conv1 = weight_variable('conv_weights_1', [5, 5, 3, 24], 0.01)
        b_conv1 = bias_variable('conv_biases_1', [24])
        h_conv1 = tf.nn.relu(conv2d(images, W_conv1) + b_conv1)

        # Pooling layer - downsamples by 2X.
        max_pool_1 = max_pool_2x2(h_conv1)

        # Second convolutional layer
        W_conv2 = weight_variable('conv_weights_2', [5, 5, 24, 36], 24.0)
        b_conv2 = bias_variable('conv_biases_2', [36])
        h_conv2 = tf.nn.relu(conv2d(max_pool_1, W_conv2) + b_conv2)

        # Second Pooling layer
        max_pool_2 = max_pool_2x2(h_conv2)

        # Third convolutional layer
        W_conv3 = weight_variable('conv_weights_3', [5, 5, 36, 48], 36.0)
        b_conv3 = bias_variable('conv_biases_3', [48])
        h_conv3 = tf.nn.relu(conv2d(max_pool_2, W_conv3) + b_conv3)

        # Third Pooling layer
        max_pool_3 = max_pool_2x2(h_conv3)

        # Fourth convolutional layer
        W_conv4 = weight_variable('conv_weights_4', [3, 3, 48, 64], 48.0)
        b_conv4 = bias_variable('conv_biases_4', [64])
        h_conv4 = tf.nn.relu(conv2d(max_pool_3, W_conv4) + b_conv4)

        # Fifth convolutional layer
        W_conv5 = weight_variable('conv_weights_5', [3, 3, 64, 64], 64.0)
        b_conv5 = bias_variable('conv_biases_5', [64])
        h_conv5 = tf.nn.relu(conv2d(h_conv4, W_conv5) + b_conv5)

        #stack result into one dimensional vector by using -1 option
        conv_flat = tf.reshape(h_conv5, [BATCH_SIZE, -1])

        # Fully connected layer 1
        W_fc1 = weight_variable('fc_weights_1', [1 * 18 * 64, 1164], 1164.0)
        b_fc1 = bias_variable('fc_biases_1', [1164])
        h_fc1 = tf.nn.relu(tf.matmul(conv_flat, W_fc1) + b_fc1)

        # Fully connected layer 2
        W_fc2 = weight_variable('fc_weights_2', [1164, 100], 100.0)
        b_fc2 = bias_variable('fc_biases_2', [100])
        h_fc2 = tf.nn.relu(tf.matmul(h_fc1, W_fc2) + b_fc2)

        # Fully connected layer 3
        W_fc3 = weight_variable('fc_weights_3', [100, 10], 10.0)
        b_fc3 = bias_variable('fc_biases_3', [10])
        h_fc3 = tf.nn.relu(tf.matmul(h_fc2, W_fc3) + b_fc3)

        # Fully connected layer 4
        W_fc4 = weight_variable('fc_weights_4', [10, 1], 1.0)
        b_fc4 = bias_variable('fc_biases_4', [1])
        h_fc4 = tf.matmul(h_fc3, W_fc4) + b_fc4

        # radians in the range of [-pi/2, pi/2], multiplied by 2 to get a 360° range
        y = tf.multiply(tf.atan(h_fc4), 2)

        saver = tf.train.Saver(tf.global_variables(), max_to_keep=None)

        #tensorflow session
        session = tf.Session()

        #initialization of all variables
        session.run(tf.global_variables_initializer())
        session.run(tf.local_variables_initializer())

        #threads
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord, sess=session)

        #save weights in directory
        #TODO file is empty
        # ckpt = tf.train.get_checkpoint_state('./weights/')

        logging.basicConfig(filename='../log/test.log', level=logging.INFO)

        i = 100
        accuracy = 0.0
        saver.restore(session, '../weights/model' + str(i) + '.ckpt-' + str(i))

        for b in range(NUM_BATCHES):

            y_out, image_out, label_out = session.run([y, images, labels])
            #print('epoche ' + str(i) + ': ' + str(y_out) + '-' + str(label_out))
            batch_acc = comp_accuracy(y_out, label_out)
            accuracy += batch_acc
            content = y_out
            logging.info(content)

        accuracy = accuracy / NUM_BATCHES
        print(' accuracy: ', accuracy)

        content = accuracy
        logging.info(content)

        #tensorflow threads
        coord.request_stop()
        coord.join(threads)
Example #31
    print(params)

# restoring model
savepath = params['filepath'].get('ckpt')
ckpt = torch.load(savepath)

vocab = ckpt['vocab']

model = SeNet(num_classes=params['num_classes'], vocab=vocab)
model.load_state_dict(ckpt['model_state_dict'])
model.eval()

# create dataset, dataloader
tagger = Okt()
padder = PadSequence(length=30)
tst_data = read_data(params['filepath'].get('tst'))
tst_data = remove_na(tst_data)
tst_dataset = Corpus(tst_data, vocab, tagger, padder)
tst_dataloader = DataLoader(tst_dataset, batch_size=128)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device(
    'cpu')
model.to(device)

# evaluation
correct_count = 0
for x_mb, y_mb in tqdm(tst_dataloader):
    x_mb = x_mb.to(device)
    y_mb = y_mb.to(device)
    with torch.no_grad():
        y_mb_hat = model(x_mb)
def main():
    first = preprocessing.read_data("018/first_file_preprocessed.csv")
    second = preprocessing.read_data("019/second_file_preprocessed.csv")

    idx = np.intersect1d(first.index, second.index)
    print(len(idx))
Example #33
import preprocessing as pp
import visualize_data

data_dir = "data"
full_df = pp.read_data(data_dir=data_dir)
new_df = pp.create_features(full_df)

#Class distribution
#visualize_data.draw_count_plot(new_df)

#word length distribution
#visualize_data.draw_dist(new_df)

#correlation wrt newly created features
#visualize_data.draw_corr(new_df)

#Preprocessing
processed_df = pp.preprocess(new_df,
                             col_name="CONTENT",
                             r_stopwords=False,
                             lemma=True,
                             spell_corr=False,
                             emotion_corr=True)

#Classification and results
import machine_learning
svm_p, m_p, b_p, label = machine_learning.buildClassifier(processed_df,
                                                          bigram=False)
machine_learning.write_to_file(label, svm_p, m_p, b_p)