def run():
    print("Preprocessing the training data.")
    X_train, y_train = read_data('data/train_set.fasta')
    print("Preprocessing the test data.")
    X_test, y_test = read_data('data/benchmark_set.fasta')

    print("Running model 1.")
    model_1 = RandomForestClassifier(n_estimators=10, criterion='gini')
    model_1.fit(X_train, y_train)
    accuracy_1 = model_1.score(X_test, y_test)
    print(f"Model 1 accuracy: {accuracy_1}\n")

    print("Running model 2.")
    model_2 = RandomForestClassifier(n_estimators=100, criterion='gini')
    model_2.fit(X_train, y_train)
    accuracy_2 = model_2.score(X_test, y_test)
    print(f"Model 2 accuracy: {accuracy_2}\n")

    print("Running model 3.")
    model_3 = RandomForestClassifier(n_estimators=10, criterion='entropy')
    model_3.fit(X_train, y_train)
    accuracy_3 = model_3.score(X_test, y_test)
    print(f"Model 3 accuracy: {accuracy_3}\n")

    print("Running model 4.")
    model_4 = RandomForestClassifier(n_estimators=100, criterion='entropy')
    model_4.fit(X_train, y_train)
    accuracy_4 = model_4.score(X_test, y_test)
    print(f"Model 4 accuracy: {accuracy_4}\n")

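# The four runs above differ only in n_estimators and criterion. A minimal
# refactor sketch that sweeps the same grid in one loop (assuming the same
# read_data helper and scikit-learn's RandomForestClassifier are in scope):
from itertools import product

def run_grid():
    X_train, y_train = read_data('data/train_set.fasta')
    X_test, y_test = read_data('data/benchmark_set.fasta')
    grid = product(['gini', 'entropy'], [10, 100])
    for i, (criterion, n_estimators) in enumerate(grid, start=1):
        model = RandomForestClassifier(n_estimators=n_estimators,
                                       criterion=criterion)
        model.fit(X_train, y_train)
        accuracy = model.score(X_test, y_test)
        print(f"Model {i} ({criterion}, {n_estimators} trees) accuracy: {accuracy}\n")
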
def test():
    import preprocessing as pp

    X, Y, Yd, B = pp.read_data('krk_data_20000_balanced_8.cpkl')
    ind_nd = np.where(Yd == 0)[0]
    Ydtm = Y[ind_nd]
    Xdtm = np.zeros((Ydtm.shape[0], X.shape[1]))
    Bdtm = []
    for i in range(len(ind_nd)):
        Xdtm[i, :] = X[ind_nd[i]]
        Bdtm.append(B[ind_nd[i]])
    N = Xdtm.shape[0]
    print("Ydtm:{}\tXdtm{}".format(Ydtm.shape, Xdtm.shape))

    # TODO: delete this for loop
    print("data read without splitting")
    for i in range(5):
        print(B[i], Y[i])

    N = len(Y)
    Y = np.array(Y).reshape((N, 1))
    Yd = np.array(Yd).reshape((N, 1))
    D = X.shape[1]
    print(D)

    M = [128]
    c0 = 8
    F = [(1, 1), (3, 3), (3, 3)]
    C = [(8,), (16,), (32,)]

    import learn
    # first time use:
    graph = build_graph(F, C, c0, M, D)
    with tf.Session(graph=graph) as sess:
        learn.fit(sess, Xdtm, Ydtm, Bdtm, init=True)

def main(): if len(sys.argv) != 2: print("python3 kmeans_ssd.py filename") sys.exit(1) data = read_data(sys.argv[1]) kmeans(data)
def train(self):
    input_setup()
    data_dir = os.path.join(os.getcwd(), "checkpoint\\train.h5")
    train_data, train_label = read_data(data_dir)

    glob_step = tf.Variable(0)
    # Multiply the learning rate by 0.98 every epoch (1480 steps).
    learning_rate_exp = tf.train.exponential_decay(config.learning_rate, glob_step,
                                                   1480, 0.98, staircase=True)
    self.train_op = tf.train.GradientDescentOptimizer(learning_rate_exp).minimize(
        self.loss, global_step=glob_step)
    tf.global_variables_initializer().run()

    counter = 0
    start_time = time.time()
    if self.load(self.checkpoint_dir):
        print(" [*] Load SUCCESS")
    else:
        print(" [!] Load failed...")
    print("Training...")

    for ep in range(config.epoch):
        batch_indx = len(train_data) // config.batch_size
        for idx in range(0, batch_indx):
            batch_images = train_data[idx * config.batch_size:(idx + 1) * config.batch_size]
            batch_labels = train_label[idx * config.batch_size:(idx + 1) * config.batch_size]
            counter += 1
            _, err = self.sess.run([self.train_op, self.loss],
                                   feed_dict={self.images: batch_images,
                                              self.labels: batch_labels})
            if counter % 10 == 0:  # print progress every 10 steps
                print("Epoch: [%2d], step: [%2d], time: [%4.4f], loss: [%.8f]"
                      % ((ep + 1), counter, time.time() - start_time, err))
            if counter % 500 == 0:  # save a checkpoint every 500 steps
                self.save(config.checkpoint_dir, counter)

def run_test():
    # Load and shuffle the data, then split into train/test sets.
    instances, labels = read_data('../data/Tweets.csv')
    paired = list(zip(instances, labels))
    shuffle(paired)
    instances, labels = zip(*paired)
    bows = list(map(bag_of_words, map(sanitize, instances)))
    bows, _, _ = bows_to_numpy(bows)
    labels, _, _ = labels_to_numpy(labels)

    train_size = 10000
    test_size = 100
    bows_tr, labels_tr = bows[:train_size], labels[:train_size]
    bows_test, labels_test = (bows[train_size:train_size + test_size],
                              labels[train_size:train_size + test_size])

    # Predict each test instance directly against the training set.
    predictions = list(map(lambda x: predict(x, bows_tr, labels_tr), bows_test))

    # Evaluate classification accuracy on the test set.
    labels_test = np.argmax(labels_test, axis=1)
    print('Accuracy:', accuracy_score(labels_test, predictions))
    print(classification_report(labels_test, predictions))

def main(): if len(sys.argv) != 2: print("python3 filepath") sys.exit(1) data = preprocessing.read_data(sys.argv[1]) agglomerative_clustering(data, sys.argv[1]) return
def get_data():
    data = pre.read_data()
    variables = data['Variable'].unique().tolist()[:5]  # testing with only 5 variables
    maps_data = {}
    for var in variables:
        print(var)
        df_variable = pre.df_variable(data, var)
        # Add a color column by binning concentration values into the
        # 11-class RdYlBu palette.
        df_variable['color'] = [
            RdYlBu[11][val] for val in pd.cut(
                x=df_variable['Concentración'], bins=11, labels=False)
        ]
        geo_features = create_geojson_features(df_variable.reset_index())
        maps_data[var] = TimestampedGeoJson(
            {
                'type': 'FeatureCollection',
                'features': geo_features
            },
            period='P1D',
            add_last_point=True,
            auto_play=False,
            loop=False,
            max_speed=10,
            loop_button=True,
            date_options='YYYY/MM',
            duration='P1D',
            time_slider_drag_update=True)
    return maps_data, variables

def run_test():
    instances, labels = read_data('../data/Tweets.csv')
    bows = list(map(bag_of_words, map(sanitize, instances)))
    weights = estimate_weights(bows, labels, 0.001)
    predictions = predict_all(bows, weights, list(set(labels)))
    prediction_labels = [p[0] for p in predictions]
    # Note: accuracy is measured on the same data used to estimate the
    # weights; there is no held-out split here.
    print(accuracy_score(labels, prediction_labels))

def main():
    if len(sys.argv) != 3:
        print("python clustering_quality.py filename labelsfilename")
        sys.exit(1)
    labels = get_labels(sys.argv[2])
    data = preprocessing.read_data(sys.argv[1])
    print(metrics.cluster.silhouette_score(X=data.values, labels=labels,
                                           metric='euclidean'))

def preprocess_test(meta_data: pd.DataFrame) -> Dict:
    logging.info('Reading test data')
    test_building_data, test_weather_data = pp.read_data(TEST_DATA_PATH,
                                                         TEST_WEATHER_PATH,
                                                         meta_data,
                                                         nrows=None)
    test_data = features.prepare_features(test_building_data, test_weather_data)
    test_sets = splits.split_data_by_meter(test_data)
    logging.info('Test set ready.')
    return test_sets

def main():
    train_feature = read_data(os.path.join(data_path, train_feature_file))
    print(train_feature.head())
    train_salaries = read_data(os.path.join(data_path, train_salary_file))
    print(train_salaries.head())
    train_data = pd.merge(train_feature, train_salaries, how="left", on="jobId")
    print(train_data.head())
    salary_info = company_salary(train_data)
    print(salary_info)
    salary_by_types(train_data, "degree")
    salary_by_types(train_data, "major")
    company_jobs(train_data)

def preprocess_train(meta_data: pd.DataFrame, remove_zeros: bool) -> Dict:
    # meta_data = pp.read_building_metadata(META_DATA_PATH)
    logging.info('Reading training data')
    train_building_data, train_weather_data = pp.read_data(
        TRAIN_DATA_PATH,
        TRAIN_WEATHER_PATH,
        meta_data,
        remove_zeros=remove_zeros,
        nrows=None)
    logging.info('Preparing training features and target')
    training_data = features.prepare_features(train_building_data,
                                              train_weather_data)
    target.get_log_target(training_data)
    logging.info('Splitting data by meter type...')
    train_sets = splits.split_data_by_meter(training_data)
    logging.info('Training set ready.')
    return train_sets

def test(self, sess):
    nx, ny = input_up(sess)
    print(nx, ny)
    data_dir = os.path.join(os.getcwd(), "checkpoint\\test.h5")
    test_data, test_label = preprocessing.read_data(data_dir)
    if SRCNN.load(self, config.checkpoint_dir):
        print(" [*] Load SUCCESS")
    else:
        print(" [!] Load failed...")
    print("Testing...")

    # 312*21
    result = SRCNN.model(self).eval({self.images: test_data,
                                     self.labels: test_label})
    result = merge(result, [nx, ny])
    result = result.squeeze()  # drop dimensions of size 1
    # result = exposure.adjust_gamma(result, 1.07)  # darken slightly
    image_path = os.path.join(os.getcwd(), "sample")
    image_path = os.path.join(image_path, "MySRCNN.bmp")
    preprocessing.imsave(image_path, result)

def run_test():
    # Load and shuffle the data, then split into train/test sets.
    instances, labels = read_data('../data/Tweets.csv')
    paired = list(zip(instances, labels))
    shuffle(paired)
    instances, labels = zip(*paired)
    bows = list(map(bag_of_words, map(sanitize, instances)))
    bows_tr, labels_tr = bows[:10000], labels[:10000]
    bows_test, labels_test = bows[10000:], labels[10000:]

    # Learn weights on the training set.
    weights = estimate_weights(bows_tr, labels_tr, 10)

    # Evaluate classification accuracy with the learned weights on the test set.
    predictions = predict_all(bows_test, weights, list(set(labels)))
    labels_prediction = [p[0] for p in predictions]
    print('Accuracy:', accuracy_score(labels_test, labels_prediction))
    print(classification_report(labels_test, labels_prediction))

def run_test():
    # Load and shuffle the data, then split into train/test sets.
    instances, labels = read_data('../data/Tweets.csv')
    paired = list(zip(instances, labels))
    shuffle(paired)
    instances, labels = zip(*paired)
    bows = list(map(bag_of_words, map(sanitize, instances)))
    bows, _, _ = bows_to_numpy(bows)
    labels, _, _ = labels_to_numpy(labels)

    train_size = 1000
    bows_tr, labels_tr = bows[:train_size], labels[:train_size]
    bows_test, labels_test = bows[train_size:], labels[train_size:]

    sizes = [len(bows[0]), 15, 3]
    biases = [np.random.randn(s, 1) for s in sizes[1:]]
    weights = [
        np.random.randn(s_out, s_in)
        for s_in, s_out in zip(sizes[:-1], sizes[1:])
    ]

    # Learn weights on the training set.
    stochastic_gradient_descent(bows_tr, labels_tr, weights, biases,
                                epochs=50,
                                activation_fn=sigmoid,
                                activation_fn_deriv=sigmoid_deriv)

    # Evaluate classification accuracy with the learned weights on the test set.
    predictions = np.argmax(predict_all(bows_test, weights, biases), axis=1)
    labels_test = np.argmax(labels_test, axis=1)
    print('Accuracy:', accuracy_score(labels_test, predictions))
    print(classification_report(labels_test, predictions))

from keras.models import Sequential
from keras.layers import LSTM, Embedding, Dense
from keras.optimizers import Adam
from keras.losses import categorical_crossentropy
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from preprocessing import read_data, labels_to_numpy
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

if __name__ == "__main__":
    # Get the untransformed data
    X, y = read_data('../data/Tweets.csv')

    # Label each of the words in the data
    num_words = 8000
    t = Tokenizer(num_words=num_words)
    t.fit_on_texts(X)

    # Convert the data into labeled sequences of fixed length
    X = t.texts_to_sequences(X)
    X = pad_sequences(X)
    y, _, _ = labels_to_numpy(y)

    # Split into training and testing data
    train_percent = 0.5
    train_size = int(len(X) * train_percent)
    X_test = X[train_size:]
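    # NOTE (assumed continuation): the script above is cut off after the first
    # test-set slice. Given the imports (Sequential, Embedding, LSTM, Dense,
    # Adam, categorical_crossentropy), a plausible sketch of the remaining
    # steps follows; the layer sizes and training settings are illustrative
    # guesses, not the original values.
    X_train, y_train = X[:train_size], y[:train_size]
    y_test = y[train_size:]

    model = Sequential([
        Embedding(num_words, 32, input_length=X.shape[1]),
        LSTM(32),
        Dense(y.shape[1], activation='softmax'),
    ])
    model.compile(optimizer=Adam(), loss=categorical_crossentropy,
                  metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=5, batch_size=64)

    y_pred = np.argmax(model.predict(X_test), axis=1)
    print('Accuracy:', accuracy_score(np.argmax(y_test, axis=1), y_pred))
    print(classification_report(np.argmax(y_test, axis=1), y_pred))
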
    else:
        return accuracy, score


if __name__ == '__main__':
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'

    lr = 1e-2
    wd = 1e-5
    val_num = 10000
    aug = True
    result_name = 'nn_noonehotencoder3feat_3fc_100'
    mode = 'all_feat'
    train_file = 'dota2Train.csv'
    test_file = 'dota2Test.csv'

    train_data, train_label = read_data(train_file, shuffle=True)
    test_data, test_label = read_data(test_file, shuffle=True)

    with tf.Graph().as_default():
        # build graph
        # control input
        batch_size = tf.placeholder(tf.int32, shape=[])
        # data input
        train_data = tf.constant(train_data, dtype=tf.float32,
                                 shape=train_data.shape)
        train_label = tf.constant(train_label, dtype=tf.int32,
                                  shape=train_label.shape)

from preprocessing import read_data
from sklearn.neural_network import MLPClassifier
from evaluation import evaluate_bow_classifier

if __name__ == "__main__":
    clf = MLPClassifier(verbose=1)
    instances, labels = read_data('../data/Tweets.csv')
    evaluate_bow_classifier(instances, labels, clf, use_argmax_labels=False)

report["hbias"] = rbm.hbias report["vbias"] = rbm.vbias np.save("report", report) #%%============================================================================ # Make a prediction # ============================================================================== test_data = np.load("test_data.npy") test_data = np.concatenate((np.zeros((len(test_data), 20)), test_data), axis=1) y_pred = np.zeros(len(test_data)) for i in xrange(len(y_pred)): sys.stdout.write("\rPrediction advancement: %d%%" % (100 * float(i) / len(y_pred))) sys.stdout.flush() y_pred[i] = rbm.predict_one(test_data[i, :]) train_ids, train_cuisines, train_ingredients = read_data("train.json") test_ids, test_cuisines, test_ingredients = read_data("test.json") del train_ids, train_ingredients, test_cuisines, test_ingredients le = LabelEncoder() le.fit(train_cuisines) pred = le.inverse_transform(y_pred.astyp("int")) create_submission(test_ids, pred) #%%============================================================================ # Sampling from the RBM # ============================================================================== from preprocessing import ( read_data, make_lowercase, remove_numbers,
def main(): if len(sys.argv) != 2: print("python3 filename") sys.exit(1) data = preprocessing.read_data(sys.argv[1]) kmeans_fun(data, sys.argv[1])
from preprocessing import read_data, onehot_encode, data_aug_np
from sklearn import svm
from evaluate import evaluate
import pickle
import os
import time

if __name__ == '__main__':
    result_name = 'svm_onlyheroes' + '.pickle'
    mode = 'one_hot_all_feat'
    train_file = 'dota2Train.csv'
    test_file = 'dota2Test.csv'
    assert os.path.exists('result')

    train_data, train_label = read_data(train_file)
    test_data, test_label = read_data(test_file)
    train_data = train_data[:, :]
    train_label = train_label[:]
    test_data = test_data[:, :]
    test_label = test_label[:]
    test_data, test_label = data_aug_np(test_data, test_label)

    if mode == 'only_heroes':
        train_data = train_data[:, 3:]
        test_data = test_data[:, 3:]
    elif mode == 'all_feat':
        pass
    elif mode == 'one_hot_all_feat':
        train_data, test_data = onehot_encode(train_data, test_data)

rbm.hbias = report["hbias"] rbm.vbias = report["vbias"] Y = np.argmax(train_data[:,:20], axis=1) train_data = train_data[:,20:] X = sigmoid(np.dot(train_data, rbm.W) + rbm.hbias) #X = train_data classifier = lr(0.01, solver = 'lbfgs', multi_class='multinomial') classifier.fit(X, Y) test_data = np.load('test_data.npy') test_X = sigmoid(np.dot(test_data, rbm.W) + rbm.hbias) #test_X = test_data pred = classifier.predict(test_X) train_ids, train_cuisines, train_ingredients = read_data('train.json') test_ids, test_cuisines, test_ingredients = read_data('test.json') del train_ids, train_ingredients, test_cuisines, test_ingredients le = LabelEncoder() le.fit(train_cuisines) pred = le.inverse_transform(pred) create_submission(test_ids, pred)
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from preprocessing import clean_sentence, read_data
from sklearn.metrics.pairwise import cosine_similarity

data_train = read_data('train_pairs.csv')
data_test = read_data('test_pairs.csv')

# Fit the TF-IDF vocabulary on both sentences of every pair (a[start:stop:step]).
corpus = data_train[:, 0:2:1].flatten()
corpus = np.append(corpus, data_test[:, 0:2].flatten())
clean_corpus = [clean_sentence(doc) for doc in corpus]
vectorizer = TfidfVectorizer()
vectorizer.fit(clean_corpus)


def test_step(threshold):
    origins_vec = vectorizer.transform(data_test[:, 0])
    suspects_vec = vectorizer.transform(data_test[:, 1])
    labels = data_test[:, 2]
    score = 0
    for origin_vec, suspect_vec, label in zip(origins_vec, suspects_vec, labels):
        sim = cosine_similarity(origin_vec, suspect_vec)
        # Count the pair as correct when the thresholded similarity agrees
        # with the label (both matches and non-matches).
        if (sim > threshold) == (float(label) == 1):
            score += 1
    accuracy = score / len(data_test)
    print('Accuracy test:', accuracy)


def train_step():
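    # NOTE (assumed body): train_step is truncated in the source. A plausible
    # sketch, mirroring test_step but sweeping candidate thresholds over the
    # training pairs, might be:
    origins_vec = vectorizer.transform(data_train[:, 0])
    suspects_vec = vectorizer.transform(data_train[:, 1])
    labels = data_train[:, 2]
    best_threshold, best_accuracy = 0.0, 0.0
    for threshold in np.linspace(0.1, 0.9, 9):
        score = 0
        for origin_vec, suspect_vec, label in zip(origins_vec, suspects_vec, labels):
            sim = cosine_similarity(origin_vec, suspect_vec)
            if (sim > threshold) == (float(label) == 1):
                score += 1
        accuracy = score / len(data_train)
        if accuracy > best_accuracy:
            best_threshold, best_accuracy = threshold, accuracy
    print('Best threshold:', best_threshold, 'train accuracy:', best_accuracy)
    return best_threshold
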
def train():
    graph = tf.Graph()
    with graph.as_default():
        global_step = tf.Variable(0, name='global_step', trainable=False)
        # im, la = pre.get_train()
        im, la = pre.get_val()
        images, labels = pre.read_data(im, la, BATCH_SIZE, NUM_SAMPLES, True)

        # First convolutional layer
        W_conv1 = weight_variable('conv_weights_1', [5, 5, 3, 24], 0.01)
        b_conv1 = bias_variable('conv_biases_1', [24])
        h_conv1 = tf.nn.relu(conv2d(images, W_conv1) + b_conv1)

        # Pooling layer - downsamples by 2X.
        max_pool_1 = max_pool_2x2(h_conv1)

        # Second convolutional layer
        W_conv2 = weight_variable('conv_weights_2', [5, 5, 24, 36], 24.0)
        b_conv2 = bias_variable('conv_biases_2', [36])
        h_conv2 = tf.nn.relu(conv2d(max_pool_1, W_conv2) + b_conv2)

        # Second pooling layer
        max_pool_2 = max_pool_2x2(h_conv2)

        # Third convolutional layer
        W_conv3 = weight_variable('conv_weights_3', [5, 5, 36, 48], 36.0)
        b_conv3 = bias_variable('conv_biases_3', [48])
        h_conv3 = tf.nn.relu(conv2d(max_pool_2, W_conv3) + b_conv3)

        # Third pooling layer
        max_pool_3 = max_pool_2x2(h_conv3)

        # Fourth convolutional layer
        W_conv4 = weight_variable('conv_weights_4', [3, 3, 48, 64], 48.0)
        b_conv4 = bias_variable('conv_biases_4', [64])
        h_conv4 = tf.nn.relu(conv2d(max_pool_3, W_conv4) + b_conv4)

        # Fifth convolutional layer
        W_conv5 = weight_variable('conv_weights_5', [3, 3, 64, 64], 64.0)
        b_conv5 = bias_variable('conv_biases_5', [64])
        h_conv5 = tf.nn.relu(conv2d(h_conv4, W_conv5) + b_conv5)

        # Stack the result into a one-dimensional vector using the -1 option.
        conv_flat = tf.reshape(h_conv5, [BATCH_SIZE, -1])

        # Fully connected layer 1
        W_fc1 = weight_variable('fc_weights_1', [1 * 18 * 64, 1164], 1164.0)
        b_fc1 = bias_variable('fc_biases_1', [1164])
        h_fc1 = tf.nn.relu(tf.matmul(conv_flat, W_fc1) + b_fc1)

        # Fully connected layer 2
        W_fc2 = weight_variable('fc_weights_2', [1164, 100], 100.0)
        b_fc2 = bias_variable('fc_biases_2', [100])
        h_fc2 = tf.nn.relu(tf.matmul(h_fc1, W_fc2) + b_fc2)

        # Fully connected layer 3
        W_fc3 = weight_variable('fc_weights_3', [100, 10], 10.0)
        b_fc3 = bias_variable('fc_biases_3', [10])
        h_fc3 = tf.nn.relu(tf.matmul(h_fc2, W_fc3) + b_fc3)

        # Fully connected layer 4
        W_fc4 = weight_variable('fc_weights_4', [10, 1], 1.0)
        b_fc4 = bias_variable('fc_biases_4', [1])
        h_fc4 = tf.matmul(h_fc3, W_fc4) + b_fc4

        # atan yields radians in (-pi/2, pi/2); doubling covers the full 360° range.
        y = tf.multiply(tf.atan(h_fc4), 2)

        loss = loss_func(y, labels)
        # training operator for session call
        # train_op, lr = optimize(loss, global_step)

        # max_to_keep option to store all weights
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=None)

        # TensorFlow session
        session = tf.Session()

        # TensorBoard
        merged = tf.summary.merge_all()
        train_writer = tf.summary.FileWriter('train', session.graph)

        # Initialization of all variables
        session.run(tf.global_variables_initializer())
        session.run(tf.local_variables_initializer())

        # Threads
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord, sess=session)

        # Restore weights from directory
        # TODO file is empty
        # ckpt = tf.train.get_checkpoint_state('./weights/')
        # saver.restore(session, '/work/raymond/dlcv/dlcv_visnav/src/check_files/model99.ckpt-99')
        logging.basicConfig(filename='../log/training_eval.log', level=logging.INFO)

        for x in range(NUM_ITER):
            average_loss = 0.0
            ckpt = tf.train.get_checkpoint_state('/work/raymond/dlcv/dlcv_visnav/src/check_files/')
            checkpoint_dir = '/work/raymond/dlcv/dlcv_visnav/src/check_files/'
            checkpoint_filename = 'model' + str(x) + '.ckpt-' + str(x)
            saver.restore(session, checkpoint_dir + checkpoint_filename)
print(checkpoint_filename + " loaded successfully...") for y in range(NUM_BATCHES): #print("testing...") lossVal = session.run(loss) print('iteration: ', x) print('loss: ', lossVal) average_loss = average_loss+lossVal # #print("done") print('batch: ', y) # print(lossVal) # # print(image_out.shape) # #break average_loss = average_loss/NUM_BATCHES print("average_loss: ", average_loss) content = x, checkpoint_filename, average_loss logging.info(content) # str1 = str(x) # str2 = "check_files/model" # str3 = ".ckpt" # str4 = str2 + str1 + str3 # save_path = saver.save(session, str4, global_step=x) # content = datetime.now(), x, curr_learnRate, average_loss # logging.info(content) train_writer.close() #tensorflow threads coord.request_stop() coord.join(threads)
def _load_data(self, filename):
    """Loads hr and acc data into a pandas.DataFrame from a dataset file."""
    return extract_hr_acc(read_data(filename, self.base_datetime))

os.chdir("C:/Personal/Kaggle/ASHRAE/python_scripts") from preprocessing import read_data, parse_timestamp, parallelize_dataframe from model_training import create_dummies, score, rmsle ## Final Submission file submission = pd.DataFrame() def final_model_predict(x, model): preds = model.predict(x) x['meter_reading_pred'] = preds return x meter_0 = read_data( "C:/Personal/Kaggle/ASHRAE/ashrae-energy-prediction/output/test_final_3.csv" ) meter_0 = parse_timestamp(meter_0, 'timestamp') meter_0.columns meter_0['site_id'] = meter_0['site_id'].astype(int).astype(str) meter_0 = create_dummies(meter_0, ['site_id', 'primary_use', 'square_feet_profile']) drop_col_list = [ 'wind_direction', 'square_feet', 'year_built', 'floor_count', 'month', 'day', 'hour', 'year', 'primary_use', 'meter', 'site_id', 'square_feet_profile', 'building_id', 'timestamp' ] meter_0 = meter_0.drop(drop_col_list, axis=1)
def run():
    num_classes = 2
    image_shape = (64, 64)
    curdir = os.getcwd()
    pardir = os.path.abspath(os.path.join(curdir, os.pardir))
    datadir = os.path.join(pardir, 'pictures')
    runsdir = os.path.join(curdir, 'runs')
    epochs = 20
    batch_size = 128

    # Get data
    data = preprocessing.read_data(datadir)
    train_data, test_data = preprocessing.test_train_split(data, 0.2)

    tf.get_default_graph()
    input_image = tf.placeholder(tf.float32,
                                 (None, image_shape[0], image_shape[1], 3),
                                 name='input_image')
    y_label = tf.placeholder(tf.int64, (None), name='label')
    prob = tf.placeholder(tf.float32, name='prob')
    learning_rate_ph = tf.placeholder("float")

    logits = LeNet6(input_image, num_classes, prob)
    train_op, cross_entropy_loss = optimize(logits, y_label,
                                            learning_rate_ph, num_classes)

    pred_class = tf.argmax(tf.nn.softmax(logits), axis=1, name='pred')
    correct_prediction = tf.equal(pred_class, y_label)
    float_cast_pred = tf.cast(correct_prediction, tf.float32)
    accuracy = tf.reduce_mean(float_cast_pred, name='accuracy')

    # saver = tf.train.Saver()
    with tf.Session() as sess:
        # Get train data generator
        get_batches_fn = preprocessing.gen_batch_function(data, image_shape)
        sess.run(tf.global_variables_initializer())
        print("Training...")
        print()
        for i in range(epochs):
            train_loss = 0
            train_acc = 0
            samples = 0
            time_start = time.time()
            for images, labels in get_batches_fn(batch_size):
                _, loss, acc, pred_y, act_y, float_pred = sess.run(
                    [train_op, cross_entropy_loss, accuracy, pred_class,
                     y_label, float_cast_pred],
                    feed_dict={input_image: images, y_label: labels,
                               prob: 0.5, learning_rate_ph: 1e-3})
                # print('Images shape:', images.shape)
                # print('pred_y', pred_y)
                # print('act_y', act_y)
                # print('float_cast_pred:', float_pred)
                train_loss += loss
                train_acc += acc
                samples += 1
                # print('loss:', loss, 'train_loss:', train_loss)
                # print('acc:', acc, 'train_acc:', train_acc)
            total_time = time.time() - time_start
            print("EPOCH {} ...".format(i + 1))
            print("Loss = {}".format(train_loss / samples))
            print("Training accuracy = {}".format(train_acc / samples))
            print("Time = {} mins".format(total_time / 60))
            print()

        # Test accuracy
        test_images, test_labels = preprocessing.gen_test_data(test_data, image_shape)
        loss, acc = sess.run([cross_entropy_loss, accuracy],
                             feed_dict={input_image: test_images,
                                        y_label: test_labels, prob: 1})
        print("Test loss = {}".format(loss))
        print("Test accuracy = {}".format(acc))

        saver = tf.train.Saver()
        saver.save(sess, './model/model.ckpt')
        print('model saved!')

            one_hot_label_encoder.inverse_transform(result)))
    Y_test = np.array(
        label_encoder.transform(
            one_hot_label_encoder.inverse_transform(Y_test)))
    return result, Y_test


def evaluation(Y_pred, Y_true):
    labels = list(one_hot_label_encoder.categories_[0])
    conf_mat = confusion_matrix(Y_true, Y_pred)
    plot_confusion_matrix(conf_mat, labels, accuracy_score(Y_true, Y_pred))


# Get the dataset
dataset_path = "Data_Set.csv"
dataset = prepro.read_data(file_name=dataset_path)  # read data

# Split the data into training and testing sets
training_dataset, testing_dataset = train_test_split(dataset, test_size=0.2,
                                                     random_state=42)

# Plot the training and testing datasets
prepro.plot_train_test_per_class(training_dataset, testing_dataset)

# Resample the training dataset for class balance
oversampled = prepro.divise_data_in_balanced_data(training_dataset)

# Plot the final balanced training dataset
prepro.plot_data_per_class(oversampled)

def lines_to_words(lines):
    words = []
    for line in lines:
        for word in line:
            words.append(word)
    return words


if __name__ == '__main__':
    # Get the list of words from Hamlet and the sonnets
    play_lines = process_text_ham(ham)
    toke_play_lines = tokenize_ham(play_lines)
    no_punct_play_lines = elim_punct(toke_play_lines)
    sonnet_lines = pp.read_data(toke_lines)
    no_punct_sonnet_lines = elim_punct(sonnet_lines)
    lines = no_punct_play_lines + no_punct_sonnet_lines
    text = lines_to_words(lines)

    # Prepare data for the model
    words = sorted(set(text))
    word_to_int = dict((c, i) for i, c in enumerate(words))
    int_to_word = dict((i, c) for i, c in enumerate(words))
    n_words = len(text)
    n_vocab = len(words)

    seq_length = 5
    dataX = []
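    # NOTE (assumed continuation): the script is cut off after dataX = [].
    # Given seq_length and the word_to_int mapping above, the usual next step
    # is to build fixed-length input windows with the following word as the
    # prediction target; a sketch:
    dataY = []
    for i in range(0, n_words - seq_length):
        seq_in = text[i:i + seq_length]
        seq_out = text[i + seq_length]
        dataX.append([word_to_int[word] for word in seq_in])
        dataY.append(word_to_int[seq_out])
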
def test():
    graph = tf.Graph()
    with graph.as_default():
        global_step = tf.Variable(0, name='global_step', trainable=False)
        im, la = pre.get_test()
        images, labels = pre.read_data(im, la, BATCH_SIZE, NUM_SAMPLES, False)

        # First convolutional layer
        W_conv1 = weight_variable('conv_weights_1', [5, 5, 3, 24], 0.01)
        b_conv1 = bias_variable('conv_biases_1', [24])
        h_conv1 = tf.nn.relu(conv2d(images, W_conv1) + b_conv1)

        # Pooling layer - downsamples by 2X.
        max_pool_1 = max_pool_2x2(h_conv1)

        # Second convolutional layer
        W_conv2 = weight_variable('conv_weights_2', [5, 5, 24, 36], 24.0)
        b_conv2 = bias_variable('conv_biases_2', [36])
        h_conv2 = tf.nn.relu(conv2d(max_pool_1, W_conv2) + b_conv2)

        # Second pooling layer
        max_pool_2 = max_pool_2x2(h_conv2)

        # Third convolutional layer
        W_conv3 = weight_variable('conv_weights_3', [5, 5, 36, 48], 36.0)
        b_conv3 = bias_variable('conv_biases_3', [48])
        h_conv3 = tf.nn.relu(conv2d(max_pool_2, W_conv3) + b_conv3)

        # Third pooling layer
        max_pool_3 = max_pool_2x2(h_conv3)

        # Fourth convolutional layer
        W_conv4 = weight_variable('conv_weights_4', [3, 3, 48, 64], 48.0)
        b_conv4 = bias_variable('conv_biases_4', [64])
        h_conv4 = tf.nn.relu(conv2d(max_pool_3, W_conv4) + b_conv4)

        # Fifth convolutional layer
        W_conv5 = weight_variable('conv_weights_5', [3, 3, 64, 64], 64.0)
        b_conv5 = bias_variable('conv_biases_5', [64])
        h_conv5 = tf.nn.relu(conv2d(h_conv4, W_conv5) + b_conv5)

        # Stack the result into a one-dimensional vector using the -1 option.
        conv_flat = tf.reshape(h_conv5, [BATCH_SIZE, -1])

        # Fully connected layer 1
        W_fc1 = weight_variable('fc_weights_1', [1 * 18 * 64, 1164], 1164.0)
        b_fc1 = bias_variable('fc_biases_1', [1164])
        h_fc1 = tf.nn.relu(tf.matmul(conv_flat, W_fc1) + b_fc1)

        # Fully connected layer 2
        W_fc2 = weight_variable('fc_weights_2', [1164, 100], 100.0)
        b_fc2 = bias_variable('fc_biases_2', [100])
        h_fc2 = tf.nn.relu(tf.matmul(h_fc1, W_fc2) + b_fc2)

        # Fully connected layer 3
        W_fc3 = weight_variable('fc_weights_3', [100, 10], 10.0)
        b_fc3 = bias_variable('fc_biases_3', [10])
        h_fc3 = tf.nn.relu(tf.matmul(h_fc2, W_fc3) + b_fc3)

        # Fully connected layer 4
        W_fc4 = weight_variable('fc_weights_4', [10, 1], 1.0)
        b_fc4 = bias_variable('fc_biases_4', [1])
        h_fc4 = tf.matmul(h_fc3, W_fc4) + b_fc4

        # atan yields radians in (-pi/2, pi/2); doubling covers the full 360° range.
        y = tf.multiply(tf.atan(h_fc4), 2)

        saver = tf.train.Saver(tf.global_variables(), max_to_keep=None)

        # TensorFlow session
        session = tf.Session()

        # Initialization of all variables
        session.run(tf.global_variables_initializer())
        session.run(tf.local_variables_initializer())

        # Threads
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord, sess=session)

        # Restore weights from directory
        # TODO file is empty
        # ckpt = tf.train.get_checkpoint_state('./weights/')
        logging.basicConfig(filename='../log/test.log', level=logging.INFO)

        i = 100
        accuracy = 0.0
        saver.restore(session, '../weights/model' + str(i) + '.ckpt-' + str(i))
        for b in range(NUM_BATCHES):
            y_out, image_out, label_out = session.run([y, images, labels])
            # print('epoche ' + str(i) + ': ' + str(y_out) + '-' + str(label_out))
            batch_acc = comp_accuracy(y_out, label_out)
            accuracy += batch_acc
            content = y_out
            logging.info(content)

        accuracy = accuracy / NUM_BATCHES
        print(' accuracy: ', accuracy)
        content = accuracy
        logging.info(content)

        # TensorFlow threads
        coord.request_stop()
        coord.join(threads)

print(params)

# Restore the model
savepath = params['filepath'].get('ckpt')
ckpt = torch.load(savepath)
vocab = ckpt['vocab']
model = SeNet(num_classes=params['num_classes'], vocab=vocab)
model.load_state_dict(ckpt['model_state_dict'])
model.eval()

# Create dataset and dataloader
tagger = Okt()
padder = PadSequence(length=30)
tst_data = read_data(params['filepath'].get('tst'))
tst_data = remove_na(tst_data)
tst_dataset = Corpus(tst_data, vocab, tagger, padder)
tst_dataloader = DataLoader(tst_dataset, batch_size=128)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Evaluation
correct_count = 0
for x_mb, y_mb in tqdm(tst_dataloader):
    x_mb = x_mb.to(device)
    y_mb = y_mb.to(device)
    with torch.no_grad():
        y_mb_hat = model(x_mb)
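        # NOTE (assumed continuation): the loop body is truncated here.
        # The correct_count accumulator above suggests it ends roughly as
        # follows, with an accuracy printout after the loop.
        correct_count += (y_mb_hat.argmax(dim=-1) == y_mb).sum().item()
print('test accuracy: {:.2%}'.format(correct_count / len(tst_dataset)))
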
def main(): first = preprocessing.read_data("018/first_file_preprocessed.csv") second = preprocessing.read_data("019/second_file_preprocessed.csv") idx = np.intersect1d(first.index, second.index) print(len(idx))
import preprocessing as pp
import visualize_data

data_dir = "data"
full_df = pp.read_data(data_dir=data_dir)
new_df = pp.create_features(full_df)

# Class distribution
# visualize_data.draw_count_plot(new_df)

# Word length distribution
# visualize_data.draw_dist(new_df)

# Correlation w.r.t. newly created features
# visualize_data.draw_corr(new_df)

# Preprocessing
processed_df = pp.preprocess(new_df,
                             col_name="CONTENT",
                             r_stopwords=False,
                             lemma=True,
                             spell_corr=False,
                             emotion_corr=True)

# Classification and results
import machine_learning

svm_p, m_p, b_p, label = machine_learning.buildClassifier(processed_df,
                                                          bigram=False)
machine_learning.write_to_file(label, svm_p, m_p, b_p)