def main(args):
    # args_string = str(args)
    argp = _argparse().parse_args(args[1:])
    sae = SparseAutoEncoder(28 * 28, argp.hu)
    data_loader = DataLoader()
    datasets = data_loader.load_data()
    trainer = sgd_trainer(argp.batch_size, argp.learning_rate, argp.sl, argp.t, argp.r, argp.sc)
    trainer.trainAutoEncoder(sae)

    # Visualize the learned filters.
    W = sae.W1.get_value(borrow=True)
    out_dir = check_create_observations_dir("AutoEncoder")
    target_file = os.path.join(out_dir, "autoencoderfilter.png")
    display_sparse_encoder(W, target_file)

    # Visualize reconstructions of the first ten test images.
    test_set_x, test_set_y = datasets[2]
    test_inpt = test_set_x[:10, :]
    mnist_vis_file = os.path.join(out_dir, "autoencoderrec.png")
    display_reconstructions(test_inpt, sae.encode(test_inpt), mnist_vis_file)

    # Record the exact command line used for this run.
    cmd_file_path = os.path.join(out_dir, "command.txt")
    with open(cmd_file_path, "w") as f:
        f.write("python ")
        for a in args:
            f.write(str(a))
            f.write(" ")
    print "THE END"
def train(self, x, y):
    data_loader = DataLoader()
    # Train each ensemble member on a freshly under-sampled draw of the data.
    for classifier in self.classifiers:
        sampled_data = data_loader.underSample(x, y)
        x_sampled = sampled_data[0]
        y_sampled = sampled_data[1]
        classifier.train(x_sampled, y_sampled)
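# NOTE: `underSample` is not defined in this section; presumably it balances
# the two classes by randomly down-sampling the majority class before each
# ensemble member is trained. A minimal NumPy sketch of that idea (the name
# and behavior here are assumptions, not the project's actual implementation):
import numpy as np

def under_sample(x, y, seed=None):
    """Hypothetical helper: down-sample the majority class to balance y."""
    rng = np.random.RandomState(seed)
    classes, counts = np.unique(y, return_counts=True)
    n_min = counts.min()
    keep = []
    for c in classes:
        idx = np.where(y == c)[0]
        keep.extend(rng.choice(idx, size=n_min, replace=False))
    keep = np.asarray(keep)
    rng.shuffle(keep)
    return x[keep], y[keep]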
def test_empty_directory(self):
    mock_glob = self.mocker.replace('glob.glob')
    mock_glob("some_directory/*")
    self.mocker.result(['.'])
    self.mocker.replay()
    with self.assertRaises(ValueError) as cm:
        DataLoader.load_check_ins_from_directory("some_directory")
    self.assertEqual(cm.exception.message,
                     'Error: directory some_directory is empty')
def start(p=12, num_clusters=100, max_iter=50, batch_size=500, init='random'):
    data_loader = DataLoader()
    cifar_data = data_loader.load_cifar_data()
    images = cifar_data['data'].reshape((-1, 3, 32, 32)).astype('float32')
    # Convert from CHW/BGR storage to HWC/RGB for display.
    images = np.rollaxis(images, 1, 4)
    images = images[:, :, :, ::-1]
    num_patches = images.shape[0]
    patch_size = [p, p]

    kmeans = MiniBatchKMeans(num_clusters, max_iter, batch_size, init)
    patches_img = kmeans.generate_patches(images, patch_size)
    # Convert to matrix form: rows x cols.
    patches = patches_img.reshape(patches_img.shape[0], -1)

    centers, counts = kmeans.fit(patches)

    # Plot the learned cluster centers ("receptive fields") in a square grid.
    fig = plt.figure()
    disp_row_size = np.ceil(np.sqrt(kmeans.num_clusters))
    for i in xrange(kmeans.num_clusters):
        subplot = fig.add_subplot(disp_row_size, disp_row_size, i + 1)
        subplot.get_xaxis().set_visible(False)
        subplot.get_yaxis().set_visible(False)
        img = display(centers[:, i], patch_size)
        subplot.imshow(img, interpolation='none')

    directory = check_create_observations_dir()
    plt.savefig(directory + '/repFields.png')
    display_bar(counts, directory + '/clusterCount.png')
    print "THE END"
def fit_mnist():
    print "Computing for MNIST"
    data_loader = DataLoader()
    train_set, valid_set, test_set = data_loader.load_data()
    train_set_x, train_set_y = train_set
    plt = get_pairwise_plot(train_set_x, train_set_y)
    obs_dir = check_create_observations_dir("PCA")
    target_path = os.path.join(obs_dir, "scatterplotMNIST.png")
    plt.savefig(target_path)
    print "THE END"
def fit_cifar():
    print "Computing for CIFAR 10"
    data_loader = DataLoader()
    cifar_data = data_loader.load_cifar_data()
    train_set_x = cifar_data['data']
    train_set_y = cifar_data['labels']
    plt = get_pairwise_plot(train_set_x, train_set_y)
    obs_dir = check_create_observations_dir("PCA")
    target_path = os.path.join(obs_dir, "scatterplotCIFAR.png")
    plt.savefig(target_path)
    print "THE END"
def loadData(self, stopCount=0):
    self.X = []
    self.y = []
    dl = DataLoader()
    matchGenerator = dl.getMatch()
    count = 0
    if stopCount != 0:
        print "Loading up to {} matches...".format(stopCount)
    else:
        print "Loading ALL matches..."
    try:
        while True:
            if stopCount > 0 and count == stopCount:
                raise StopIteration
            current_list = []
            raw_data = matchGenerator.next()
            count += 1
            current_data = dl.filterMatchFields(raw_data)

            # data fields
            current_list.append(ModelGlobals.TIERS[current_data["matchTier"]])
            current_list.append(ModelGlobals.PATCHES[current_data["patch"]])
            offset = len(current_list)
            empty_fields = [0] * self.empty_array_width
            current_list.extend(empty_fields)
            for champion in current_data["teamA"]:
                current_list[self._findChampIndex("teamA", champion, offset)] = 1
            for champion in current_data["teamB"]:
                current_list[self._findChampIndex("teamB", champion, offset)] = 1
            self.X.append(current_list)

            # target value
            if current_data["winnerTeamA"]:
                self.y.append(0)
            else:
                self.y.append(1)
    except StopIteration:
        print "Done reading match data, {} matches".format(count)
def main():
    model = Doc2Vec.load('400_pvdm_doc2vec.d2v')
    model_dbow = Doc2Vec.load('400_pvdbow_doc2vec.d2v')  # note: despite the filename, this model is PV-DBOW
    path = 'datasets/'
    files = [f for f in listdir(path) if isfile(join(path, f))]
    files.pop(0)
    data_loader = DataLoader(path)
    domains = data_loader.csv_files
    names = {1: 'title', 4: 'abstract', 5: 'mesh', 'y': 6}
    domain_features = data_loader.get_feature_matrix(names)

    # Total number of documents across all domains.
    n_total_documents = 0
    for domain in domain_features:
        n_total_documents += len(domain[0])

    # Concatenate the PV-DM and PV-DBOW vectors (400 + 400 dimensions).
    all_features = numpy.zeros(shape=(n_total_documents, 800))
    all_labels = numpy.asarray([])
    i = 0
    for domain in domain_features:
        features, labels = domain
        all_labels = numpy.hstack((all_labels, labels))
        for feature_vector in features:
            preprocessed_line = list(preprocess(feature_vector))
            all_features[i, 0:400] = numpy.float_(model.infer_vector(preprocessed_line))
            all_features[i, 400:] = numpy.float_(model_dbow.infer_vector(preprocessed_line))
            i += 1

    all_labels = numpy.asarray(all_labels)
    all_labels[all_labels == -1] = 0
    all_labels = numpy.intc(all_labels)

    train, test = data_loader.create_random_samples(all_features, all_labels)
    train_x, train_y = train
    test_x, test_y = test
    classifier = NeuralNet(n_hidden_units=[200], output_size=2, batch_size=20,
                           n_epochs=200, dropout=True, activation_function='relu',
                           learning_rate=.3, momentum=True, momentum_term=.5)
    classifier.train(train_x, train_y)
    classifier.test(test_x, test_y)
def test_same_check_in_ids_in_same_file(self):
    self.file.write("418|12|2012-07-18 14:43:38|37.6164|-122.386|41059b00f964a520850b1fe3|empty_message\n"
                    "418|12|2012-07-18 12:34:45|45.54|45.6|41059b00f964a520850b1fe3|empty_message")
    self.file.seek(0)
    mock_glob = self.mocker.replace('glob.glob')
    mock_glob("some_directory/*")
    self.mocker.result(['.', 'file1'])
    mock_open = self.mocker.replace('__builtin__.open')
    mock_open("file1", 'rU')
    self.mocker.result(self.file)
    self.mocker.replay()
    with self.assertRaises(ValueError) as cm:
        DataLoader.load_check_ins_from_directory("some_directory")
    self.assertEqual(cm.exception.message,
                     'Error processing file file1: check-in with ID 12 has already been encountered for user 418')
def trainAutoEncoder(self, ae):
    data_loader = DataLoader()
    datasets = data_loader.load_shared_data()
    train_set_x, train_set_y = datasets[0]
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / self.batch_size

    index = T.lscalar()
    x = ae.input

    # Reconstruction cost: cross-entropy by default, squared error on request.
    main_cost = ae.get_cross_enropy_cost()
    if self.reconst_cost == "sqr":
        main_cost = ae.get_squared_error_cost()

    # Sparsity penalty: KL divergence by default, L1 on request.
    sparsity_cost = ae.get_KL_divergence_cost()
    if self.sparsity_cost == "l1":
        sparsity_cost = ae.get_L1_cost()

    cost = main_cost + self.sparsity_lambda * sparsity_cost

    gparams = T.grad(cost, ae.params)
    updates = [
        (param, param - self.learning_rate * gparam)
        for param, gparam in zip(ae.params, gparams)
    ]
    train_model = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * self.batch_size: (index + 1) * self.batch_size]
        }
    )

    for epoch_no in xrange(self.num_epochs):
        c = []
        for batch_index in xrange(n_train_batches):
            c.append(train_model(batch_index))
        print 'Training epoch no: %d, cost ' % epoch_no, np.mean(c)
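# NOTE: the KL sparsity penalty that `get_KL_divergence_cost` presumably
# implements is the standard sparse-autoencoder term sum_j KL(rho || rho_hat_j),
# where rho is the sparsity target and rho_hat_j is the mean activation of
# hidden unit j. A minimal NumPy sketch under that assumption (illustrative
# only, not the class's actual code):
import numpy as np

def kl_sparsity_penalty(hidden_activations, rho=0.05):
    # hidden_activations: (n_samples, n_hidden) activations in (0, 1).
    rho_hat = hidden_activations.mean(axis=0)
    rho_hat = np.clip(rho_hat, 1e-8, 1 - 1e-8)  # numerical safety
    return np.sum(rho * np.log(rho / rho_hat)
                  + (1 - rho) * np.log((1 - rho) / (1 - rho_hat)))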
def connect(self):
    if not self.csv_loaded:
        self.plotWidget.clear()
        line = 1
        rfile = ReadCSV.read("test.csv")
        for row in rfile:
            # Skip the CSV header row.
            if line == 1:
                line = line + 1
                continue
            self.upload_data.append(row)
        Fixer.fixTimeAndAlt(self.upload_data)

        buff = Buffer()
        if os.path.exists('data.txt'):
            os.remove('data.txt')
        for i in self.upload_data:
            buff.sendToBuffer(i)
        buff.sendData()

        self.data = DataLoader.read('data.txt')
        self.csv_loaded = True
        self.infocsv.setText(_fromUtf8("Zaladowano"))  # Polish: "Zaladowano" = "Loaded"
        self.info.setText("")
        self.wys = self.kat = self.dyst = self.pred = self.odchyl = 0
def test_single_directory_happy_path(self):
    self.file.write("418|12|2012-07-18 14:43:38|37.6164|-122.386|41059b00f964a520850b1fe3|empty_message\n"
                    "418|13|2012-07-18 12:34:45|45.54|45.6|41059b00f964a520850b1fe3|empty_message")
    self.file.seek(0)
    self.file2.write("418|14|2012-07-18 14:43:38|37.6164|-122.386|41059b00f964a520850b1fe3|empty_message\n"
                     "418|15|2012-07-18 12:34:45|45.54|45.6|41059b00f964a520850b1fe3|empty_message")
    self.file2.seek(0)
    mock_glob = self.mocker.replace('glob.glob')
    mock_glob("some_directory/*")
    self.mocker.result(['.', 'file1', 'file2'])
    mock_open = self.mocker.replace('__builtin__.open')
    mock_open("file1", 'rU')
    self.mocker.result(self.file)
    mock_open("file2", 'rU')
    self.mocker.result(self.file2)
    self.mocker.replay()
    expected_dict = {
        '418': [{'venue_id': '41059b00f964a520850b1fe3', 'latitude': 37.6164,
                 'check_in_message': 'empty_message', 'check_in_id': '12',
                 'longitude': -122.386, 'date': datetime.datetime(2012, 7, 18, 14, 43, 38)},
                {'venue_id': '41059b00f964a520850b1fe3', 'latitude': 45.54,
                 'check_in_message': 'empty_message', 'check_in_id': '13',
                 'longitude': 45.6, 'date': datetime.datetime(2012, 7, 18, 12, 34, 45)},
                {'venue_id': '41059b00f964a520850b1fe3', 'latitude': 37.6164,
                 'check_in_message': 'empty_message', 'check_in_id': '14',
                 'longitude': -122.386, 'date': datetime.datetime(2012, 7, 18, 14, 43, 38)},
                {'venue_id': '41059b00f964a520850b1fe3', 'latitude': 45.54,
                 'check_in_message': 'empty_message', 'check_in_id': '15',
                 'longitude': 45.6, 'date': datetime.datetime(2012, 7, 18, 12, 34, 45)}]}
    actual_dict = DataLoader.load_check_ins_from_directory("some_directory")
    self.assertDictEqual(expected_dict, actual_dict)
def training_data_scibert(self):
    print("Creating SciBERT training files.\n")
    # Load training and validation data
    d_train = DataLoader()
    df_train = d_train.training_data_with_abstracts_citations().data
    d_val = DataLoader()
    df_validation = d_val.validation_data_with_abstracts_citations().data
    train_val_data = pd.concat((df_train, df_validation), axis=0).reset_index(drop=True)

    scibert_embeddings = self._scibert_embeddings(train_val_data)

    print("Saving SciBERT embeddings to disk...")
    scibert_embeddings_file = os.path.join(self.path_persistent, "scibert_embeddings.pkl")
    with open(scibert_embeddings_file, "wb") as f:
        pickle.dump(scibert_embeddings, f)
    print("Saved.\n")
    print("SciBERT training files created.")
def upload(data_file):
    table = "Straws"
    url = "http://dbweb6.fnal.gov:8080/mu2edev/hdb/loader"
    # url = "http://rexdb02.fnal.gov:8500/swhite/HdbHandler.py/loader"
    queryUrl = "http://dbweb6.fnal.gov:8088/QE/mu2e_hw_dev/app/SQ/query"
    group = "Straw Tables"
    password = "******"
    with open(data_file) as file_input:
        reader = csv.reader(file_input)
        for row in reader:
            dataLoader = DataLoader(password, url, group, table)
            dataLoader.addRow(createRow(row))
            retVal, code, text = dataLoader.send()
            if retVal:
                print(str(row[0]) + " successful upload")
                print(text)
            else:
                print(str(row[0]) + " FAILED upload")
                print(code)
                print(text)
            dataLoader.clearRows()
def example_many_to_one_hot_nn_optimization():
    # data = load_iris()
    data = load_mnist()
    train, val, test = create_train_val_test_data(data, 0.25, 0.10)
    print(data[0][0].shape[0])
    print(data[0][1].shape[0])

    # VERIFICATION PLOT OF MNIST
    pixels = data[0][0].reshape((28, 28))
    plt.imshow(pixels, cmap='gray')
    plt.show()

    data_gen = DataLoader(train)
    neural_predictor = lambda nn: predictor_fitness(nn.predict, data_gen.generator())
    simple_nn = NeuralNetwork(data[0][0].shape[0], [3, 3, 3], data[0][1].shape[0])
    x0w, x0b = simple_nn.get_weights_and_biases()
    print("Number of weights:", len(x0w))
    print("Number of biases:", len(x0b))
    x0 = x0w
    x0.extend(x0b)
    fitness = lambda ind: network_opt(ind, simple_nn, neural_predictor)

    # OPTIMIZE HERE
    best_ind = x0
    print("Best fitness is:", fitness(best_ind))
    print(simple_nn.get_weights_and_biases())
    # simple_nn.set_all(best_ind)

    data_gen = DataLoader(test)
    print("Fitness on test data:",
          predictor_fitness(simple_nn.predict, data_gen.generator(),
                            batch_size=test.shape[0]))
def resistanceupload():
    def createRow():
        return {'straw_barcode': str(row[0]),
                'create_time': str(row[1]),  # Website gets real time somehow.
                'worker_barcode': str(row[2]),
                'workstation_barcode': str(row[3]),
                'resistance': str(row[4]),
                'temperature': str(row[5]),
                'humidity': str(row[6]),
                'resistance_measurement_type': str(row[7]),
                'comments': str(row[8])}

    for row in upload_file:
        table = "straw_resistance_measurements"
        dataLoader = DataLoader(password, url, group, table)
        dataLoader.addRow(createRow())
        retVal, code, text = dataLoader.send()
        if retVal:
            print "upload resistance success!\n"
            print text
        else:
            print "upload resistance failed!\n"
            print code
            print text
        dataLoader.clearRows()
def train_network(sess, clear=True, continue_training=False):
    if not os.path.exists(cfg.DIR.tensorboard_log_dir):
        os.mkdir(cfg.DIR.tensorboard_log_dir)
    else:
        if clear:
            shutil.rmtree(cfg.DIR.tensorboard_log_dir)
    if not os.path.exists(cfg.DIR.model_save_dir):
        os.mkdir(cfg.DIR.model_save_dir)

    writer = tf.summary.FileWriter(cfg.DIR.tensorboard_log_dir)
    writer.add_graph(sess.graph)

    # initialize the global parameters
    sess.run(tf.global_variables_initializer())
    if continue_training:
        value_list = []
        value_list.extend(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                            scope='global/network_scope'))
        restore = tf.train.Saver(value_list)
        restore.restore(sess, tf.train.latest_checkpoint(cfg.DIR.model_save_dir))

    # Placeholders take None (not -1) for the variable batch dimension.
    X = tf.placeholder(shape=(None, 28, 28, 1), dtype=tf.float32)
    Y = tf.placeholder(shape=(None, 10), dtype=tf.float32)
    logits, layer = build_network(X)
    # Reduce the per-example cross-entropy to a scalar so it can be logged.
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=Y, logits=logits))
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)

    train_data, train_labels, validate_data, validate_labels = \
        split_train_validate_set(cfg.DIR.training_data)
    train_data_loader = DataLoader(train_data, train_labels)
    val_data_loader = DataLoader(validate_data, validate_labels)

    correct_prediction = tf.equal(tf.argmax(logits, 1), tf.argmax(Y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    for key, value in layer.items():
        tf.summary.histogram(key, value)
    tf.summary.scalar("train_loss", loss)
    tf.summary.scalar("train_accuracy", accuracy)
    summary_op = tf.summary.merge_all()

    val_loss_holder = tf.placeholder(tf.float32)
    val_loss_tensor = tf.summary.scalar("val_loss", val_loss_holder)
    val_accuracy_holder = tf.placeholder(tf.float32)
    val_accuracy_tensor = tf.summary.scalar("val_accuracy", val_accuracy_holder)

    saver = tf.train.Saver(max_to_keep=10)
    if not os.path.exists(cfg.DIR.model_save_dir):
        os.makedirs(cfg.DIR.model_save_dir)

    start_time = time.time()
    tf.get_default_graph().finalize()

    train_step = 0
    val_step = 0
    for epoch in range(1, cfg.TRAIN.EPOCHS + 1):
        while train_data_loader.hasNextBatch():
            train_step += 1
            batch_data, batch_label = train_data_loader.getNextBatch(cfg.TRAIN.BATCH_SIZE)
            _, _, _, summary = sess.run([loss, accuracy, optimizer, summary_op],
                                        feed_dict={X: batch_data, Y: batch_label})
            writer.add_summary(summary, train_step)
        print("Epoch %d finished." % epoch)
        train_data_loader.reset()

        if epoch % cfg.TRAIN.SAVE_STEPS == 0:
            filename = 'digit_recognizer_{:d}'.format(epoch)
            filename = os.path.join(cfg.DIR.model_save_dir, filename)
            saver.save(sess, filename, global_step=epoch)

        if epoch % cfg.TRAIN.VALIDATE_EPOCHES == 0:
            while val_data_loader.hasNextBatch():
                val_step += 1
                batch_val_data, batch_val_label = val_data_loader.getNextBatch(cfg.TRAIN.BATCH_SIZE)
                # sess.run over two fetches returns two values.
                val_loss, val_accuracy = sess.run([loss, accuracy],
                                                  feed_dict={X: batch_val_data, Y: batch_val_label})
                feed = {val_loss_holder: val_loss}
                val_loss_str = sess.run(val_loss_tensor, feed_dict=feed)
                writer.add_summary(val_loss_str, val_step)
                feed1 = {val_accuracy_holder: val_accuracy}
                val_accuracy_str = sess.run(val_accuracy_tensor, feed_dict=feed1)
                writer.add_summary(val_accuracy_str, val_step)
            val_data_loader.reset()

    filename = os.path.join(cfg.DIR.model_save_dir, 'digit_recognizer_final')
    saver.save(sess, filename)
    end_time = time.time()
    print("The total time used in training: {}".format(end_time - start_time))
def __init__(self):
    self.dataLoader = DataLoader()
    self.model = None
parser.add_argument('--delta', default="", dest="delta")
args = vars(parser.parse_args())

if args['delta'] == "":
    delta = 200
else:
    delta = int(args['delta'])

if args['end'] == "":
    end_date = datetime.datetime.today()
else:
    end_date = datetime.datetime.strptime(args['end'], DATE_FORMAT_STRING)

if args['start'] == "":
    start_date = end_date - datetime.timedelta(days=delta)
else:
    start_date = datetime.datetime.strptime(args['start'], DATE_FORMAT_STRING)

stock_id = args['stock_id']
end_date = end_date.strftime(DATE_FORMAT_STRING)
start_date = start_date.strftime(DATE_FORMAT_STRING)

data_loader = DataLoader(from_date=start_date, to_date=end_date, stock_list=stock_id)
data = data_loader.load()
bb = BollingerBand(stock_id, data)
bb.graph()
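# NOTE: `BollingerBand` is not shown here; presumably it computes the standard
# bands (a rolling mean plus or minus k rolling standard deviations) before
# plotting. A pandas sketch of that calculation, with window and k as
# illustrative defaults rather than this project's settings:
import pandas as pd

def bollinger_bands(close, window=20, k=2):
    # close: pandas Series of closing prices indexed by date.
    mid = close.rolling(window).mean()
    std = close.rolling(window).std()
    return pd.DataFrame({'middle': mid,
                         'upper': mid + k * std,
                         'lower': mid - k * std})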
from DataLoader import DataLoader
import Hyperparameter as param
from Batch import Batch
from Util import numpy_to_var, toData, decoder_initial
from Vocab import Vocab
import numpy as np
import math
import sys
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.autograd import Variable

targetfile1 = "target.txt"
inputfile1 = "ie_out.txt"
device = "cpu"
dataloader1 = DataLoader(inputfile1, targetfile1, device)
x, y = dataloader1.caseload()

vocab_size = 5000
vocab = Vocab(vocab_size)
vocab.w2i = dataloader1.word2idx
vocab.i2w = dataloader1.idx2word
print("vocab.i2w")
print(vocab.i2w)
vocab.count = len(vocab.w2i)

# Append the special tokens to the end of the vocabulary.
for w in ['<PAD>', '<UNK>', '<SOS>', '<EOS>']:
    vocab.w2i[w] = vocab.count
    vocab.i2w[vocab.count] = w
    vocab.count += 1
print("<unk>")
def main():
    '''Main Function'''
    parser = argparse.ArgumentParser(description='translate.py')
    parser.add_argument('-model', required=True, help='Path to model .pt file')
    '''
    parser.add_argument('-src', required=True,
                        help='Source sequence to decode (one line per sequence)')
    parser.add_argument('-vocab', required=True,
                        help='Source sequence to decode (one line per sequence)')
    '''
    parser.add_argument('-output', default='pred.txt',
                        help="""Path to output the predictions (each line will
                        be the decoded sequence)""")
    parser.add_argument('-beam_size', type=int, default=100, help='Beam size')
    parser.add_argument('-batch_size', type=int, default=1, help='Batch size')
    parser.add_argument('-n_best', type=int, default=1,
                        help="""If verbose is set, will output the n_best
                        decoded sentences""")
    parser.add_argument('-no_cuda', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda

    # Prepare DataLoader
    test_data = DataLoader(use_valid=True, batch_size=opt.batch_size, cuda=opt.cuda)

    translator = Translator(opt)
    translator.model.eval()

    numuser = test_data.user_size
    num_right = 0
    num_total = 0
    # macro precision/recall/F1
    avgF1 = 0
    avgPre = 0
    avgRec = 0
    avgF1_long = 0
    avgPre_long = 0
    avgRec_long = 0
    avgF1_short = 0
    avgPre_short = 0
    avgRec_short = 0
    numseq = 0  # number of test seqs
    # for micro pre rec f1
    right = 0.
    pred = 0.
    total = 0.
    right_long = 0.
    pred_long = 0.
    total_long = 0.
    right_short = 0.
    pred_short = 0.
    total_short = 0.

    with open(opt.output, 'w') as f:
        for batch in tqdm(test_data, mininterval=2, desc=' - (Test)', leave=False):
            all_samples = translator.translate_batch(batch).data

            for bid in range(batch.size(0)):
                numseq += 1.0

                # ground truth over the full sequence
                ground_truth = np.zeros([numuser])
                num_ground_truth = 0
                for user in batch.data[bid][1:-1]:
                    if user == Constants.EOS or user == Constants.PAD:
                        break
                    ground_truth[user] = 1.0
                    num_ground_truth += 1

                # soft prediction counts, averaged over the beam
                pred_cnt = np.zeros([numuser])
                for beid in range(opt.beam_size):
                    for pred_uid in all_samples[bid, beid, 1:num_ground_truth + 1]:
                        if pred_uid == Constants.EOS:
                            break
                        else:
                            pred_cnt[pred_uid] += 1.0 / opt.beam_size

                F1, pre, rec = getF1(ground_truth, pred_cnt)
                avgF1 += F1
                avgPre += pre
                avgRec += rec
                right += np.dot(ground_truth, pred_cnt)
                pred += np.sum(pred_cnt)
                total += np.sum(ground_truth)

                # "short" variant: only the first (at most) 5 ground-truth users
                ground_truth = np.zeros([numuser])
                num_ground_truth = 0
                for user in batch.data[bid][1:-1]:
                    if user == Constants.EOS or user == Constants.PAD:
                        break
                    ground_truth[user] = 1.0
                    num_ground_truth += 1
                    if num_ground_truth >= 5:
                        break

                pred_cnt = np.zeros([numuser])
                for beid in range(opt.beam_size):
                    for pred_uid in all_samples[bid, beid, 1:num_ground_truth + 1]:
                        if pred_uid == Constants.EOS:
                            break
                        else:
                            pred_cnt[pred_uid] += 1.0 / opt.beam_size

                F1, pre, rec = getF1(ground_truth, pred_cnt)
                avgF1_short += F1
                avgPre_short += pre
                avgRec_short += rec
                right_short += np.dot(ground_truth, pred_cnt)
                pred_short += np.sum(pred_cnt)
                total_short += np.sum(ground_truth)

    print('[Info] Finished.')
    print('Macro')
    print(avgF1 / numseq)
    print(avgPre / numseq)
    print(avgRec / numseq)
    print('Results for the first no more than 5 predictions')
    print(avgF1_short / numseq)
    print(avgPre_short / numseq)
    print(avgRec_short / numseq)
    print('Micro')
    pmi = right / pred
    rmi = right / total
    print(2 * pmi * rmi / (pmi + rmi))
    print(pmi)
    print(rmi)
    print('Results for the first no more than 5 predictions')
    pmi_short = right_short / pred_short
    rmi_short = right_short / total_short
    print(2 * pmi_short * rmi_short / (pmi_short + rmi_short))
    print(pmi_short)
    print(rmi_short)
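# NOTE: `getF1` is not defined in this section. From the way it is called, it
# presumably derives precision, recall, and F1 from the soft (beam-averaged)
# prediction counts against the binary ground-truth vector. A plausible sketch
# under that assumption (illustrative, not the project's actual code):
import numpy as np

def get_f1(ground_truth, pred_cnt):
    right = np.dot(ground_truth, pred_cnt)
    pre = right / pred_cnt.sum() if pred_cnt.sum() > 0 else 0.0
    rec = right / ground_truth.sum() if ground_truth.sum() > 0 else 0.0
    f1 = 2 * pre * rec / (pre + rec) if (pre + rec) > 0 else 0.0
    return f1, pre, rec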
import pandas as pd

from DataLoader import DataLoader
from StateForecaster import StateForecaster
from preprocessing import preprocess_dataframe_for_forecasting_training

if __name__ == "__main__":
    dataframe = DataLoader().load_dataframe_from_datapath()
    forecaster_dataframe = preprocess_dataframe_for_forecasting_training(dataframe)
    print(forecaster_dataframe)

    forecaster_model = StateForecaster(forecaster_dataframe)
    result = forecaster_model.fit(5)
    print(result.summary())

    lag_order = result.k_ar
    print(lag_order)
    forecast = result.forecast(forecaster_dataframe.values[-lag_order:], steps=300)
    df_forecast = pd.DataFrame(forecast,
                               index=forecaster_dataframe.index[-300:],
                               columns=forecaster_dataframe.columns)
    print(df_forecast)
def __init__(self):
    self.antigens = DataLoader().load_test_genes()
def transfer_learning(print_output=True):
    path = 'datasets/'
    names = {1: 'title', 4: 'abstract', 5: 'mesh', 'y': 6}
    transformed_data_sets = []

    files = [f for f in listdir(path) if isfile(join(path, f))]
    files.pop(0)
    data_loader = DataLoader(path)
    domains = data_loader.csv_files
    all_domains = copy.deepcopy(domains)
    training_domains = data_loader.csv_files

    all_domains_svm_wda_metrics_list = []
    all_domains_svm_metrics_list = []
    all_domains_svm_bow_mlp_list = []
    all_domains_mlp_fold_scores = []

    for i, held_out_domain in enumerate(domains):
        # Hold out one domain; train on the remaining n-1.
        training_domains.pop(i)

        svm_wda_metrics_list = []
        svm_metrics_list = []
        svm_bow_mlp_list = []

        domain_name = files[i].__str__()
        domain_name = domain_name.split('.')[0]
        folder_name = 'output' + '/' + domain_name

        output = "Dataset: {}".format(files[i])
        if print_output:
            print(output)

        data_loader.csv_files = training_domains
        data_sets = data_loader.csv_files
        domains = data_loader.get_feature_matrix(names)

        # Get one file out of the csv files in the dataloader; use this as the
        # held-out domain, and get the feature representation of the held-out data.
        held_out_x, held_out_y = data_loader.get_feature_matrix(names, held_out_domain)
        # Create the folds for the held-out data (the default is 5).
        folds = data_loader.cross_fold_valdation(held_out_x, held_out_y)
        # Total number of source domains, i.e. the number of files with documents.
        n_source_domains = len(data_sets)
        os.makedirs(folder_name)

        # Must convert the data type of the matrix for theano.
        feature_engineer = Feature_Engineer()

        # Start the 5-fold cross validation.
        for n_fold, fold in enumerate(folds):
            output = "Fold {}: \n".format(n_fold)
            if print_output:
                print(output)
            output = '{}/{}/fold_{}.csv'.format(os.getcwd(), folder_name, (n_fold + 1))
            file = open(output, 'w')
            csv_writer = csv.writer(file)

            # Each sample is a list that contains the x and y for the classifier.
            # Typically fold[0] would be the train sample, but the folds are
            # switched here to test the effectiveness of the domain adaptation.
            train_sample = fold[1]
            test_sample = fold[0]

            # These are the original copies to be copied over the augmented
            # feature matrix. Each sample contains the text and y labels from
            # the data before it is put into the sklearn count vectorizer.
            train_x, train_y = train_sample
            test_x, test_y = test_sample
            train_y[train_y == 0] = 2
            train_y[train_y == 1] = 3
            test_y[test_y == 0] = 2
            test_y[test_y == 1] = 3

            # Get the bag-of-words representation of the small 20% target
            # source data and transform the other 80% of the data.
            train_x = data_loader.get_transformed_features(train_x, True, False, True)
            test_x = data_loader.transform(test_x, True, True)

            transformed_domains = []
            # Transform the domains with respect to the training data.
            for domain in domains:
                domain_x, domain_y = domain
                transformed_domain_x = data_loader.transform(domain_x, True, True)
                transformed_domain_x, domain_y = data_loader.underSample(transformed_domain_x, domain_y)
                transformed_domains.append([transformed_domain_x, domain_y])

            augmented_feature_matrix_train, augmented_y_train = \
                feature_engineer.augmented_feature_matrix(transformed_domains, [train_x, train_y])
            augmented_feature_matrix_test, augmented_y_test = \
                feature_engineer.augmented_feature_matrix(held_out_domain=[test_x, test_y],
                                                          train_or_test=False,
                                                          n_source_domains=len(transformed_domains))
            augmented_y_test[augmented_y_test == 2] = 0
            augmented_y_test[augmented_y_test == 3] = 1

            # SVM with the augmented feature matrix for domain adaptation.
            svm_wda = SVM()
            svm_wda.train(augmented_feature_matrix_train, augmented_y_train)
            svm_wda.test(augmented_feature_matrix_test, augmented_y_test)
            output = "\nSVM with domain adaptation metrics:"
            csv_writer.writerow([output])
            if print_output:
                print(output)
                print(svm_wda)
                print("\n")
            svm_wda_metrics_list.append(svm_wda.metrics)

            classifier = NeuralNet(n_hidden_units=[250], output_size=4, batch_size=20,
                                   n_epochs=200, dropout=True, activation_function='relu',
                                   learning_rate=.3, momentum=True, momentum_term=.5)
            write_to_csv(svm_wda.metrics, csv_writer)

            y_for_mlp = []
            # Set up the x and y data for the MLP.
            for p, domain in enumerate(transformed_domains):
                domain_x, domain_y = domain
                domain_x = domain_x.todense()
                y_for_mlp.append(domain_y)
                if p == 0:
                    neural_net_x_train = domain_x
                    neural_net_y_train = domain_y
                else:
                    neural_net_x_train = numpy.vstack((neural_net_x_train, domain_x))
                    neural_net_y_train = numpy.hstack((neural_net_y_train, domain_y))

            neural_net_x_train = numpy.float_(neural_net_x_train)
            classifier.train(neural_net_x_train, neural_net_y_train)

            test_y[test_y == 2] = 0
            test_y[test_y == 3] = 1
            svm_y_train = neural_net_y_train
            svm_y_train[svm_y_train == 2] = 0
            svm_y_train[svm_y_train == 3] = 1

            # SVM without the domain adaptation.
            svm = SVM()
            svm.train(sparse.coo_matrix(neural_net_x_train), svm_y_train)
            svm.test(test_x, test_y)
            output = "\nSVM without domain adaptation"
            if print_output:
                print(output)
                print(svm)
                print("\n")
            csv_writer.writerow([output])
            svm_metrics_list.append(svm.metrics)
            write_to_csv(svm.metrics, csv_writer)

            # Transform the feature vectors of the held-out data to the learned
            # hidden-layer features of the previous MLP trained with all n-1 datasets.
            perceptron_train_x = theano.shared(neural_net_x_train)
            perceptron_test_x = theano.shared(test_x.todense())
            transformed_perceptron_train_x = classifier.transfer_learned_weights(perceptron_train_x)
            transformed_perceptron_test_x = classifier.transfer_learned_weights(perceptron_test_x)
            modified_transformed_perceptron_train_x = numpy.hstack((transformed_perceptron_train_x,
                                                                    neural_net_x_train))
            modified_transformed_perceptron_test_x = numpy.hstack((transformed_perceptron_test_x,
                                                                   test_x.todense()))

            output = "\nSVM with BoW and transformed features"
            csv_writer.writerow([output])
            if print_output:
                print(output)
            svm_mlp_bow = SVM()
            svm_mlp_bow.train(sparse.coo_matrix(modified_transformed_perceptron_train_x), svm_y_train)
            svm_mlp_bow.test(sparse.coo_matrix(modified_transformed_perceptron_test_x), test_y)
            write_to_csv(svm_mlp_bow.metrics, csv_writer)
            if print_output:
                print(svm_mlp_bow)
            svm_bow_mlp_list.append(svm_mlp_bow.metrics)

            output = "*********** End of fold {} ***********".format(n_fold)
            if print_output:
                print(output)

        training_domains = copy.deepcopy(all_domains)

        file_name = '{}/{}/fold_averages.csv'.format(os.getcwd(), folder_name)
        file = open(file_name, 'w+')
        csv_writer = csv.writer(file)
        if print_output:
            output = "----------------------------------------------------------------------------------------" \
                     "\nFold Scores\n" \
                     "SVM with domain adaptation"
            print_write_output(output, svm_wda_metrics_list, all_domains_svm_wda_metrics_list, csv_writer)
            output = "\nSVM without domain adaptation"
            print_write_output(output, svm_metrics_list, all_domains_svm_metrics_list, csv_writer)
            output = "SVM with BoW and transformed features"
            print_write_output(output, svm_bow_mlp_list, all_domains_svm_bow_mlp_list, csv_writer)

    file_name = '{}/output/all_fold_averages.csv'.format(os.getcwd())
    file = open(file_name, 'w+')
    csv_writer = csv.writer(file)
    if print_output:
        output = "*******************************************************************************************" \
                 "\nAll domain macro metric scores\n" \
                 "SVM with domain adaptation"
        print_macro_scores(output, all_domains_svm_wda_metrics_list, csv_writer)
        output = "\nSVM without domain adaptation"
        print_macro_scores(output, all_domains_svm_metrics_list, csv_writer)
        output = "SVM with BoW and transformed features"
        print_macro_scores(output, all_domains_svm_bow_mlp_list, csv_writer)
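# NOTE: the `augmented_feature_matrix` step above looks like the
# "frustratingly easy" feature-augmentation scheme for domain adaptation:
# each example carries a shared copy of its features plus a block reserved
# for its own domain. A minimal dense sketch, assuming that is the scheme
# Feature_Engineer implements (illustrative only):
import numpy as np

def augment_features(x, domain_index, n_domains):
    # x: (n_samples, d) features from one domain.
    # Layout: [shared | domain 0 | ... | domain n-1]; zeros everywhere
    # except the shared block and this domain's own block.
    n, d = x.shape
    out = np.zeros((n, d * (n_domains + 1)))
    out[:, :d] = x
    start = d * (domain_index + 1)
    out[:, start:start + d] = x
    return out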
def train_NN(self, nn):
    trainer.train_NN(self, nn)
    data_loader = DataLoader()
    datasets = data_loader.load_shared_data()

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    batch_size = self.batch_size
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    index = T.lscalar()  # index to a [mini]batch
    x = nn.input         # the data is presented as rasterized images
    y = T.ivector('y')

    cost = (nn.negative_log_likelihood_dropout(y)
            + self.L1_lambda * nn.L1
            + self.L2_lambda * nn.L2)

    train_err_model = theano.function(
        inputs=[index],
        outputs=nn.errors(y),
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        }
    )
    test_err_model = theano.function(
        inputs=[index],
        outputs=nn.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        }
    )
    validate_err_model = theano.function(
        inputs=[index],
        outputs=nn.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        }
    )

    gparams = [T.grad(cost, param) for param in nn.params]
    updates = [
        (param, param - self.learning_rate * gparam)
        for param, gparam in zip(nn.params, gparams)
    ]
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    def validate():
        validation_losses = [validate_err_model(i) for i in xrange(n_valid_batches)]
        return numpy.mean(validation_losses)

    def test():
        test_losses = [test_err_model(i) for i in xrange(n_test_batches)]
        return numpy.mean(test_losses)

    def train():
        train_losses = [train_err_model(i) for i in xrange(n_train_batches)]
        return numpy.mean(train_losses)

    print '... training'

    # Train in mini batches with early stopping on the validation loss.
    minEpochs = 4
    validationFrequency = n_train_batches
    iteration = 0
    bestValidationLoss = numpy.Inf
    directory = check_create_observations_dir()
    self.output_directory = directory
    max_epoch_reached = False

    while not max_epoch_reached:
        iteration = iteration + 1
        epochNo = (iteration / n_train_batches) + 1
        batchId = iteration % n_train_batches
        currentCost = train_model(batchId)

        if iteration % validationFrequency == 0:
            validation_err = validate()
            train_err = train()
            test_err = test()
            self.add_train_data(epochNo, train_err, validation_err, test_err)
            print "Epoch no: %d Validation Loss = %f" % (epochNo, validation_err * 100)
            if validation_err < bestValidationLoss:
                bestValidationLoss = validation_err
            if epochNo > minEpochs and validation_err * self.early_stopping_threshold > bestValidationLoss:
                break
        if epochNo >= self.n_epochs:
            max_epoch_reached = True

    testLoss = test()
    trainer.save_errors(self, directory)
    repfields_final_path = os.path.join(directory, "repFields.png")
    W_vals = nn.W1.get_value(borrow=True)
    display(W_vals, repfields_final_path)
    print "iteration %d complete. Cost = %f Best Validation Loss = %f Test Loss = %f" % \
          (iteration, currentCost, bestValidationLoss * 100, testLoss * 100)
def test_latitude_not_a_number(self):
    self.file.write("418|12|2012-07-18 14:43:38|37.6164|-122.386|41059b00f964a520850b1fe3|empty_message\n"
                    "418|12|2012-07-18 12:34:45|abcd|-122.386|41059b00f964a520850b1fe3|empty_message")
    self.file.seek(0)
    with self.assertRaises(ValueError) as cm:
        DataLoader.load_check_ins_from_file(self.file)
    self.assertEqual(cm.exception.message,
                     'Error in line 2: latitude should be a float number')
import math
import random
from collections import Counter

import numpy as np

from DataLoader import DataLoader
from Models import StanfordModel, NCGModel
from Utils import Utils

global_results_stanford = []
global_results_radiation = []

datasets = Utils.separate_dataset_by_days(
    DataLoader.load_check_ins_from_file(open("104665558.csv", "U")))
users = datasets.keys()
users = ["104665558"]

for user in users:
    for i in range(0, 1):
        days = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
        # days = ['Wednesday']
        results_stanford = {}
        results_radiation = {}
        for day in days:
            results_stanford[day] = []
            results_radiation[day] = []
def test_latitude_out_of_bounds(self):
    self.file.write("418|12|2012-07-18 14:43:38|37.6164|-122.386|41059b00f964a520850b1fe3|empty_message\n"
                    "418|12|2012-07-18 12:34:45|100|-122.386|41059b00f964a520850b1fe3|empty_message")
    self.file.seek(0)
    with self.assertRaises(ValueError) as cm:
        DataLoader.load_check_ins_from_file(self.file)
    self.assertEqual(cm.exception.message,
                     'Error in line 2: latitude should be between -90 and 90')
import os

from BootProcessor import BootProcessor
from FsOps import FsOps
from SchemaProcessor import SchemaProcessor
from Conversion import Conversion
from MigrationStateManager import MigrationStateManager
from StructureLoader import StructureLoader
from ReportGenerator import ReportGenerator
from DataLoader import DataLoader
from ConstraintsProcessor import ConstraintsProcessor
from DBAccess import DBAccess
from BinaryDataDecoder import BinaryDataDecoder

if __name__ == '__main__':
    print(BootProcessor.get_introduction_message())
    base_dir = os.getcwd()
    config = FsOps.read_config(base_dir)
    config = FsOps.read_extra_config(config, base_dir)
    conversion = Conversion(config)
    FsOps.create_logs_directory(conversion)
    BootProcessor.boot(conversion)
    FsOps.read_data_types_map(conversion)
    SchemaProcessor.create_schema(conversion)
    MigrationStateManager.create_state_logs_table(conversion)
    MigrationStateManager.create_data_pool_table(conversion)
    StructureLoader.load_structure(conversion)
    MigrationStateManager.read_data_pool(conversion)
    DataLoader.send_data(conversion)
    BinaryDataDecoder.decode(conversion)
    ConstraintsProcessor.process_constraints(conversion)
    DBAccess.close_connection_pools(conversion)
    ReportGenerator.generate_report(conversion, 'Migration is accomplished.')
batchSizeForOneThread = Config.getint("RBM", "batchSizeForOneThread")
M = Config.getint("RBM", "artistsNumber")
K = Config.getint("RBM", "ranksNumber")
F = Config.getint("RBM", "hiddenLayerSize")
learningRate = Config.getfloat("RBM", "learningRate")
wDecay = Config.getfloat("RBM", "wDecay")
updateFrequencyMAX = Config.getint("RBM", "updateFrequencyMin")
numberOfEpoch = Config.getint("RBM", "numberOfEpoch")
Verbose = Config.getboolean("RBM", "Verbose")
TrainingSetFile = Config.get("RBM", "trainingSetFile")
ValidationSetFile = Config.get("RBM", "validationSetFile")
ValidationFromTrainingSetFile = Config.get("RBM", "validationFromTrainingSetFile")
TestSetFile = Config.get("RBM", "testSetFile")

dataLoader = DataLoader(trainingSetFile=TrainingSetFile,
                        validationSetFile=ValidationSetFile,
                        validationFromTrainingSetFile=ValidationFromTrainingSetFile,
                        testSetFile=TestSetFile,
                        K=K, M=M,
                        batchSizeForOneThread=batchSizeForOneThread,
                        threadsNumber=threadsNumber,
                        verbose=Verbose)

# Clip update frequencies and guard the visible-bias initialization
# against zeros before building the RBM.
whereUpdateMax = np.where(dataLoader.updateFrequency > updateFrequencyMAX)
dataLoader.updateFrequency[whereUpdateMax] = updateFrequencyMAX
dataLoader.vBiasesInitialization[np.where(dataLoader.vBiasesInitialization < np.float64(0.1e-100))] = np.float64(0.1e-100)

momentum = 0.5
rbm = RBM(M, K, F, learningRate, momentum, wDecay,
          dataLoader.vBiasesInitialization, dataLoader.updateFrequency)
numberOfMiniSets = np.int(np.ma.floor(dataLoader.trainingSetSize / (threadsNumber * batchSizeForOneThread)))

with open("Outs/" + sys.argv[1] + "_validation_RMSE.txt", "a") as rmsesFile:
    dataLoader.StartNewValidationSet()
    GetVisiableLayer = dataLoader.GiveVisibleLayerForValidation
class CitationStream(Tkinter.Tk):
    '''
    This class is responsible for sending citation edge streams (loaded by
    using DataLoader) to Gephi's Master server by using GephiJsonClient.
    '''
    def __init__(self, url='http://localhost:8080/workspace0',
                 filename='Citation_Stream', on_gui=True):
        self.url = url
        self.filename = filename
        self.loader = DataLoader()              # create an instance of DataLoader
        self.loader.loadData(self.filename)     # load data from file
        self.g = GephiJsonClient(url=self.url)  # create an instance of GephiJsonClient
        self.g.cleanAll()
        self.degree_dict = {}
        self.cited_dict = {}
        # elements for GUI
        self.is_run = True
        self.is_gui = on_gui
        if self.is_gui == True:
            self.initializeUI()

    def run(self):
        self.loader.flush()
        self.g.cleanAll()
        self.degree_dict.clear()
        self.cited_dict.clear()
        self.streamIn(IN_THRESHOLD, OUT_THRESHOLD, BATCH_SPEED)

    def runForever(self):
        i = 1
        while self.is_run:
            print "ROUND #", i
            self.run()
            print "Waiting 10 seconds for the next round..."
            if self.is_gui == True:
                self.date_txt.set("Waiting 10 seconds for the next round...")
            time.sleep(10)
            i += 1

    def streamIn(self, in_threshold=10, out_threshold=30, timeout=1):
        '''
        Feed each batch into Gephi's pool. The maximum number of nodes
        displayed in the pool is set to be 1000. The nodes with in-degree >=
        in_threshold or out-degree >= out_threshold will be highlighted using
        different colors and sizes. timeout will be the time for sleeping
        between two batches.
        '''
        # at most 1000 nodes are displayed in Gephi's pool
        displayed_nodes = deque(maxlen=1000)
        displayed_dict = {}  # nodes which currently exist in Gephi's pool
        while True:
            tm, edgeset = self.loader.sendData()
            if tm == -1 or edgeset == None:
                break
            print "Batch: ", tm
            if self.is_gui == True:
                self.date_txt.set(tm)
            for fromnode, tonode in edgeset:
                # update the cited list of the tonode
                self.cited_dict[tonode] = self.cited_dict.get(tonode, [])
                self.cited_dict[tonode].append(fromnode)
                # update degrees: [in-degree, out-degree, date]
                if self.degree_dict.get(fromnode) == None:
                    self.degree_dict[fromnode] = [0, 1, tm]
                else:
                    self.degree_dict[fromnode][1] += 1
                if self.degree_dict.get(tonode) == None:
                    self.degree_dict[tonode] = [1, 0, tm]
                else:
                    self.degree_dict[tonode][0] += 1

                # add the fromnode to Gephi's pool
                node_attributes = NODE_ATTRIBUTE.copy()
                if displayed_dict.get(fromnode) == None:
                    displayed_dict[fromnode] = fromnode
                    # check the size
                    if len(displayed_nodes) >= displayed_nodes.maxlen:
                        deletenode = displayed_nodes.popleft()
                        del displayed_dict[deletenode]
                        self.g.deleteNode(deletenode)  # delete the node from Gephi's pool
                    self.g.addNode(fromnode, **node_attributes)
                    displayed_nodes.append(fromnode)
                # check fromnode's out-degree, and update it in Gephi's pool
                if self.degree_dict[fromnode][1] >= out_threshold:
                    sz = setSize(self.degree_dict[fromnode][1])
                    node_attributes['size'] = sz
                    node_attributes['r'] = 0.0 / 255
                    node_attributes['g'] = 200.0 / 255
                    node_attributes['b'] = 0.0 / 255
                    self.g.changeNode(fromnode, **node_attributes)

                # check tonode's degrees, and update it in Gephi's pool
                node_attributes = NODE_ATTRIBUTE.copy()
                if displayed_dict.get(tonode) == None:
                    if str(int(self.degree_dict[tonode][2][0:4]) + 5) + self.degree_dict[tonode][2][4:7] >= tm[0:7]:
                        displayed_dict[tonode] = tonode
                        # check the size
                        if len(displayed_nodes) >= displayed_nodes.maxlen:
                            deletenode = displayed_nodes.popleft()
                            del displayed_dict[deletenode]
                            self.g.deleteNode(deletenode)
                        if self.degree_dict[tonode][0] >= in_threshold and self.degree_dict[tonode][1] >= out_threshold:
                            sz = setSize(self.degree_dict[tonode][0])
                            node_attributes['size'] = sz
                            node_attributes['r'] = 0.0 / 255
                            node_attributes['g'] = 0.0 / 255
                            node_attributes['b'] = 100.0 / 255
                        elif self.degree_dict[tonode][0] >= in_threshold:
                            sz = setSize(self.degree_dict[tonode][0])
                            node_attributes['size'] = sz
                            node_attributes['r'] = 200.0 / 255
                            node_attributes['g'] = 0.0 / 255
                            node_attributes['b'] = 0.0 / 255
                        self.g.addNode(tonode, **node_attributes)
                        displayed_nodes.append(tonode)
                        # connect the tonode to those nodes that cite it and are already in Gephi's pool
                        for eachcit in self.cited_dict[tonode]:
                            if displayed_dict.get(eachcit) != None:
                                self.g.addEdge(str(eachcit + "->" + tonode), eachcit, tonode, directed=True)
                else:
                    if self.degree_dict[tonode][0] >= in_threshold and self.degree_dict[tonode][1] >= out_threshold:
                        sz = setSize(self.degree_dict[tonode][0])
                        node_attributes['size'] = sz
                        node_attributes['r'] = 0.0 / 255
                        node_attributes['g'] = 0.0 / 255
                        node_attributes['b'] = 100.0 / 255
                    elif self.degree_dict[tonode][0] >= in_threshold:
                        sz = setSize(self.degree_dict[tonode][0])
                        node_attributes['size'] = sz
                        node_attributes['r'] = 200.0 / 255
                        node_attributes['g'] = 0.0 / 255
                        node_attributes['b'] = 0.0 / 255
                    self.g.changeNode(tonode, **node_attributes)
                self.g.addEdge(str(fromnode + "->" + tonode), fromnode, tonode, directed=True)
            # sleep for seconds if one timestamp is done
            time.sleep(timeout)
        # clear
        displayed_nodes.clear()
        displayed_dict.clear()

    def clearData(self):
        '''
        Clear up.
        '''
        self.loader.clearData()
        self.degree_dict.clear()
        self.cited_dict.clear()

    # ----------------------
    # Functions for the UI
    # ----------------------
    def initializeUI(self):
        '''
        Initialize the components needed in the UI.
        '''
        Tkinter.Tk.__init__(self)
        self.title("Dynamic Citation Network")
        self.date_txt = Tkinter.StringVar()
        # part 1
        self.intro_lf = Tkinter.LabelFrame(self, text="INTRODUCTION", height=200, width=150)
        self.intro_lf.pack(fill=Tkinter.BOTH, expand=1)
        intro_lbl = Tkinter.Label(self.intro_lf, text=INTRO_INFO, wraplength=400,
                                  justify=Tkinter.LEFT, padx=10, pady=10)
        intro_lbl.pack(side=Tkinter.LEFT, expand=1)
        space_lbl_1 = Tkinter.Label(self, text="")
        space_lbl_1.pack(fill=Tkinter.BOTH, expand=1)
        # part 2
        self.legnd_lf = Tkinter.LabelFrame(self, text="LEGEND", height=200, width=150)
        self.legnd_lf.pack(fill=Tkinter.BOTH, expand=1)
        legnd_cvs = Tkinter.Canvas(self.legnd_lf, height=200, width=150)
        legnd_cvs.pack(fill=Tkinter.BOTH, expand=1)
        legnd_cvs.create_text(10, 2, anchor=Tkinter.NW, text="Node Color:")
        legnd_cvs.create_oval(15, 22, 30, 37, fill="gray")
        legnd_cvs.create_text(40, 22, text="an ordinary paper", anchor=Tkinter.NW)
        legnd_cvs.create_oval(15, 42, 30, 57, fill="green")
        legnd_cvs.create_text(40, 42, text="a paper cites >= " + str(OUT_THRESHOLD) + " papers", anchor=Tkinter.NW)
        legnd_cvs.create_oval(15, 62, 30, 77, fill="blue")
        legnd_cvs.create_text(40, 62, text="a paper cites >= " + str(OUT_THRESHOLD) + " papers and cited by >= " + str(IN_THRESHOLD) + " papers", anchor=Tkinter.NW)
        legnd_cvs.create_oval(15, 82, 30, 97, fill="red")
        legnd_cvs.create_text(40, 82, text="a paper cited by >= " + str(IN_THRESHOLD) + " papers", anchor=Tkinter.NW)
        legnd_cvs.create_text(10, 112, anchor=Tkinter.NW, text="Node Size:")
        legnd_cvs.create_text(15, 132, text="large:", anchor=Tkinter.NW)
        legnd_cvs.create_text(75, 132, text="in-degree(out-degree) >= 50", anchor=Tkinter.NW)
        legnd_cvs.create_text(15, 152, text="medium:", anchor=Tkinter.NW)
        legnd_cvs.create_text(75, 152, text="in-degree(out-degree) >= 30", anchor=Tkinter.NW)
        legnd_cvs.create_text(15, 172, text="small:", anchor=Tkinter.NW)
        legnd_cvs.create_text(75, 172, text="in-degree(out-degree) >= 10", anchor=Tkinter.NW)
        space_lbl_1 = Tkinter.Label(self, text="")
        space_lbl_1.pack(fill=Tkinter.BOTH, expand=1)
        # part 3
        self.date_lf = Tkinter.LabelFrame(self, text="CURRENT DATE", height=200, width=150)
        self.date_lf.pack(fill=Tkinter.BOTH, expand=1)
        date_lbl = Tkinter.Label(self.date_lf, textvariable=self.date_txt, padx=10)
        date_lbl.pack(side=Tkinter.LEFT)
        space_lbl_1 = Tkinter.Label(self, text="")
        space_lbl_1.pack(fill=Tkinter.BOTH, expand=1)
        # part 4
        self.btn_lf = Tkinter.LabelFrame(self, text="", height=100, width=150)
        self.btn_lf.pack(fill=Tkinter.BOTH)
        self.is_run = True
        self.start_btn = Tkinter.Button(self.btn_lf, text="START", command=self.pressStart)
        self.start_btn.pack(side=Tkinter.LEFT)
        self.quit_btn = Tkinter.Button(self.btn_lf, text="QUIT", command=self.pressQuit)
        self.quit_btn.pack(side=Tkinter.RIGHT)

    def pressStart(self):
        '''
        Function triggered by clicking the 'START' button. It starts a new
        thread to simulate the streaming fashion.
        '''
        thread.start_new(self.runForever, ())
        self.start_btn['state'] = Tkinter.DISABLED

    def pressQuit(self):
        '''
        Function triggered by clicking the 'QUIT' button. It stops the
        running application and quits it.
        '''
        self.is_run = False
        self.quit()
def train():
    """
    train model
    :return:
    """
    model, base_model, seq_step_len = build_model()
    print('input lengths ', seq_step_len, 'label length', config.max_seq_len)

    train_dataset = DataLoader(DataMode.Train).load_batch_from_tfrecords()
    val_dataset = DataLoader(DataMode.Val).load_batch_from_tfrecords()

    train_summary_writer = tf.summary.create_file_writer(
        os.path.join(TENSORBOARD_DIR, 'trainLogs'))
    val_summary_writer = tf.summary.create_file_writer(
        os.path.join(TENSORBOARD_DIR, 'valLogs'))

    latest_ckpt = tf.train.latest_checkpoint(CHECKPOINT_DIR)
    start_epoch = 0
    if latest_ckpt:
        start_epoch = int(latest_ckpt.split('-')[1].split('.')[0])
        model.load_weights(latest_ckpt)
        event_logger.info('model resumed from: {}, start at epoch: {}'.format(
            latest_ckpt, start_epoch))
    else:
        event_logger.info(
            'passing resume since weights not there. training from scratch')

    def _validation():
        """
        validate the model's acc
        :return: loss and acc
        """
        _val_losses = []
        _val_accuracy = []
        for _batch, _data in enumerate(val_dataset):
            _images, _labels = _data
            _input_length = np.array(np.ones(len(_images)) * int(seq_step_len))
            _label_length = np.array(np.ones(len(_images)) * config.max_seq_len)
            _loss = model.evaluate(
                [_images, _labels, _input_length, _label_length],
                _labels, verbose=0)
            _acc = _compute_acc(_images, _labels, _input_length)
            _val_losses.append(_loss)
            _val_accuracy.append(_acc)
        return np.mean(_val_losses), np.mean(_val_accuracy)

    def _compute_acc(_images, _labels, _input_length):
        """
        :param _images: a batch of images, [samples, w, h, c]
        :param _labels:
        :param _input_length:
        :return: acc
        """
        _y_pred = base_model.predict_on_batch(x=_images)  # e.g. shape (64, 9, 37)
        _decoded_dense, _ = ctc_decode(
            _y_pred, _input_length,
            greedy=config.ctc_greedy,
            beam_width=config.beam_width,
            top_paths=config.top_paths,
            merge_repeated=config.decode_merge_repeated)
        _error_count = 0
        for pred, real in zip(_decoded_dense[0], _labels):
            str_real = ''.join([config.characters[x] for x in real if x != -1])
            str_pred = ''.join([config.characters[x] for x in pred if x != -1])
            if str_pred != str_real:
                _error_count += 1
        _acc = (len(_labels) - _error_count) / len(_labels)
        return _acc

    # start training progress
    for epoch in range(start_epoch, config.epochs):
        train_acc_avg = []
        train_loss_avg = []
        start = time.time()
        for batch, data in enumerate(train_dataset):
            images, labels = data
            input_length = np.array(np.ones(len(images)) * int(seq_step_len))
            label_length = np.array(np.ones(len(images)) * config.max_seq_len)
            train_loss = model.train_on_batch(
                x=[images, labels, input_length, label_length], y=labels)
            train_acc = _compute_acc(images, labels, input_length)
            train_acc_avg.append(train_acc)
            train_loss_avg.append(train_loss)

        train_loss = np.mean(train_loss_avg)
        train_acc = np.mean(train_acc_avg)
        val_loss, val_acc = _validation()

        # write train and val logs
        with train_summary_writer.as_default():
            tf.summary.scalar('loss', train_loss, step=epoch)
            tf.summary.scalar('acc', train_acc, step=epoch)
        with val_summary_writer.as_default():
            tf.summary.scalar('loss', val_loss, step=epoch)
            tf.summary.scalar('acc', val_acc, step=epoch)

        print('Epoch: [{epoch}/{epochs}], train_loss: {train_loss}, train_acc: {train_acc}, '
              'val_loss: {val_loss}, val_acc: {val_acc}, '
              'one epoch costs time: {time} s, learning rate: {lr}'.format(
                  epoch=epoch + 1, epochs=config.epochs, train_loss=train_loss,
                  train_acc=train_acc, val_loss=val_loss, val_acc=val_acc,
                  time=time.time() - start, lr=config.lr))

        ckpt_path = os.path.join(
            CHECKPOINT_DIR,
            '{cnn}&{rnn}-{epoch}'.format(cnn=config.cnn_type, rnn=config.rnn_type, epoch=epoch + 1))
        model.save_weights(ckpt_path)

        if val_acc >= config.end_acc or val_loss <= config.end_cost:
            base_model.save(
                os.path.join(SVAED_MODEL_DIR, '{name}_model.h5'.format(name=config.dataset)))
            break
def test_invalid_date(self):
    self.file.write("418|12|2012-07-18 14:43:38|37.6164|-122.386|41059b00f964a520850b1fe3|empty_message\n"
                    "418|12|123asd|37.6164|-122.386|41059b00f964a520850b1fe3|empty_message")
    self.file.seek(0)
    with self.assertRaises(ValueError) as cm:
        DataLoader.load_check_ins_from_file(self.file)
    self.assertEqual(cm.exception.message,
                     'Error in line 2: invalid format of date, should be YYYY-MM-DD HH:MM:SS')
    for s in size:
        num_features *= s
    return num_features


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("enter new or continue or testonly")
        exit(0)
    MODE = sys.argv[1]
    if MODE != "new" and MODE != "continue" and MODE != "testonly":
        print("enter new or continue or testonly")
        exit(0)

    data_loader = DataLoader()
    trainloader = data_loader.get_trainloader()
    testloader = data_loader.get_testloader()

    net = Net()
    if MODE == "continue" or MODE == "testonly":
        net.load_state_dict(torch.load(PATH))

    criterion = nn.CrossEntropyLoss()
    # criterion = nn.MSELoss()
    optimizer = optim.SGD(net.parameters(), lr=0.0005, momentum=0.6)

    if MODE != "testonly":
        for epoch in range(TOTAL_EPOCH):
            running_loss = 0.0
            for i, data in enumerate(trainloader, 0):
def test_empty_strings_in_end(self):
    self.file.write("418|23|2012-07-18 14:43:38|37.6164|-122.386|41059b00f964a520850b1fe3|empty_message\n ")
    self.file.seek(0)
    with self.assertRaises(ValueError) as cm:
        DataLoader.load_check_ins_from_file(self.file)
    self.assertEqual(cm.exception.message,
                     "Error in line 2: the line should contain user_id, check-in_id, date, latitude, "
                     "longitude, venue_id and check-in_message, separated by |")
__author__ = 'ezequiel'

from MainController import MainController
from DataLoader import DataLoader

if __name__ == "__main__":
    # load sample data
    sample_data = DataLoader.load_sampledata()
    controller = MainController(sample_data)
    controller.menu_redirect()
from __future__ import print_function

import math
import random

import numpy as np
from six.moves import urllib
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf
import tensorflow.python.platform

from DataLoader import DataLoader

# Step 1: Download the data.
dataset = DataLoader()
filename = dataset.maybe_download('text8.zip', 31344016)
words = dataset.read_data(filename)
print('==> Data size', len(words))

# Step 2: Build the dictionary and replace rare words with UNK token.
vocabulary_size = 50000
data, count, dictionary, reverse_dictionary = dataset.build_dataset(
    words, vocabulary_size)
del words  # Hint to reduce memory.
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10])
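# NOTE: this mirrors the classic TensorFlow word2vec tutorial, where
# build_dataset keeps the (vocabulary_size - 1) most common words and maps
# everything else to UNK. A sketch of that function, assuming the DataLoader
# method follows the tutorial version:
import collections

def build_dataset(words, vocabulary_size):
    count = [['UNK', -1]]  # index 0 is reserved for the UNK token
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    dictionary = {word: i for i, (word, _) in enumerate(count)}
    data = []
    unk_count = 0
    for word in words:
        index = dictionary.get(word, 0)  # 0 -> UNK
        if index == 0:
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary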
def main():
    '''Main Function'''
    parser = argparse.ArgumentParser(description='translate.py')
    parser.add_argument('-model', required=True, help='Path to model .pt file')
    parser.add_argument('-src', required=True,
                        help='Source sequence to decode (one line per sequence)')
    parser.add_argument('-vocab', required=True,
                        help='preprocess file to provide vocabulary')
    parser.add_argument('-output', default='pred.txt',
                        help="""Path to output the predictions (each line will
                        be the decoded sequence)""")
    parser.add_argument('-beam_size', type=int, default=5, help='Beam size')
    parser.add_argument('-batch_size', type=int, default=30, help='Batch size')
    parser.add_argument('-lambda_1', type=float, default=2 / 3,
                        help='diversity factor for hamming diversity')
    parser.add_argument('-lambda_2', type=float, default=2 / 3,
                        help='diversity factor for bi-gram diversity')
    parser.add_argument('-lambda_3', type=float, default=2 / 3,
                        help='diversity factor for tri-gram diversity')
    parser.add_argument('-no_cuda', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda

    # Prepare DataLoader
    preprocess_data = torch.load(opt.vocab)
    preprocess_settings = preprocess_data['settings']
    test_src_word_insts = read_instances_from_file(
        opt.src, preprocess_settings.max_word_seq_len,
        preprocess_settings.keep_case)
    test_src_insts = convert_instance_to_idx_seq(
        test_src_word_insts, preprocess_data['dict']['src'])
    test_data = DataLoader(preprocess_data['dict']['src'],
                           preprocess_data['dict']['tgt'],
                           src_insts=test_src_insts,
                           cuda=opt.cuda,
                           shuffle=False,
                           batch_size=opt.batch_size)

    translator = Translator_idbs(opt)
    translator.model.eval()

    print('[Info] Start translating...')
    with open(opt.output, 'w') as f:
        for batch in tqdm(test_data, mininterval=2, desc=' - (Test)', leave=False):
            all_hyp = translator.translate_batch(batch)
            for idx_seq in all_hyp:
                # convert the predicted indices back to words and join them
                pred_line = ' '.join(
                    [test_data.tgt_idx2word[idx] for idx in idx_seq])
                f.write(pred_line + '\n')
                f.flush()
    print('[Info] Finished.')
    return patches


def display(data_row, patch_size):
    # Normalize to [0, 1] and reshape into an HWC RGB image for plotting.
    data_row = data_row - data_row.min()
    data_row = data_row / data_row.max()
    img = data_row.reshape(3, patch_size[0], patch_size[1]).astype("float32")
    img = np.rollaxis(img, 0, 3)
    return img


if __name__ == "__main__":
    data_loader = DataLoader()
    cifar_data = data_loader.load_cifar_data()
    images = cifar_data["data"].reshape((-1, 3, 32, 32)).astype("float32")
    # Convert from CHW/BGR storage to HWC/RGB.
    images = np.rollaxis(images, 1, 4)
    images = images[:, :, :, ::-1]
    num_patches = images.shape[0]
    patch_size = [12, 12]
    # kmeans = KMeans()
import os

import detectron2
from detectron2.evaluation import COCOEvaluator, inference_on_dataset
from detectron2.data import DatasetCatalog, MetadataCatalog, build_detection_test_loader
from detectron2.config import get_cfg
from detectron2 import model_zoo
from detectron2.engine import DefaultPredictor
from detectron2.engine import DefaultTrainer

from CustomVisualizer import CustomVisualizer
from DataLoader import DataLoader
from MetricsVisualizer import MetricsVisualizer
from ImageDisplayer import ImageDisplayer

# %%
root_dir = "./../Data"  # change this to download to a specific location on your pc
DataLoader().download_datasets(root_dir)
DataLoader().download_trained_models(root_dir)
DataLoader().generateAllJsonDataAnnotations(root_dir)

# %%
annotation_type = "system_measures"
json_path = os.path.join(root_dir, "CVC_muscima_" + annotation_type + ".json")
muscima_data = DataLoader().load_from_json(json_path)
json_path = os.path.join(root_dir, "AudioLabs_" + annotation_type + ".json")
audioLabs_data = DataLoader().load_from_json(json_path)

# %%
def registerDataset(data_name, d, data, classes):
    DatasetCatalog.register(data_name, lambda d=d: data)
read_hdf_lock.acquire() model.save("%s_final.hdf5" % train_name) read_hdf_lock.release() return fit_hist # In[13]: TauLosses.SetSFs(1, 2.5, 5, 1.5) print(TauLosses.Le_sf, TauLosses.Lmu_sf, TauLosses.Ltau_sf, TauLosses.Ljet_sf) compile_model(model, 1e-3) # In[14]: loader = DataLoader('N:/tau-ml/tuples-v2-training-v2-t1/training/part_*.h5', netConf_full, 100, 2000, validation_size=10000000, max_queue_size=40, n_passes=-1, return_grid=True) print(loader.file_entries) print(loader.total_size, loader.data_size, loader.validation_size) # In[ ]: fit_hist = run_training('step{}'.format(1), model_name, loader, 0, 10) # In[ ]:
import time

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

# MnasNet and dataset are project-local modules; their import paths are not
# shown in this snippet and would need to be filled in, e.g.:
# from mnasnet import MnasNet
# from dataset import dataset

trainlist = './cfg/trainlist_7.txt'
label = r'E:\Person_detection\Dataset\Yolov3_labels\labels'  # raw string keeps the backslashes literal
epoch = 1000
batch_size = 12

models = MnasNet().to('cuda')
models.train()
models.load_state_dict(torch.load('./checkpoints/pretrain/yolo3_32.pt'))

# Renamed from `dataset = dataset(...)` so the instance no longer shadows the class.
train_dataset = dataset(trainlist=trainlist, label=label, batch_size=batch_size)
dataloader = DataLoader(shuffle=True, dataset=train_dataset, batch_size=batch_size, num_workers=2)
optimizer = optim.Adam(models.parameters(), lr=0.0001)

def main():
    for i in range(epoch):
        t1 = time.time()
        for step, (image, tcoord) in enumerate(dataloader):
            outputs = models(image.to('cuda'))
            # The original prints 'loss:' with no value; the loss computation
            # (YOLO-style, from outputs and tcoord) is not shown in this snippet.
            print('epoch:', i, 'step:', step, 'loss:')
        t2 = time.time()
        print('epoch time:', (t2 - t1))
        print('*************save models**********************')
        torch.save(models.state_dict(), './checkpoints/yolo3_{}.pt'.format(i))
def test_single_file_happy_path(self): self.file.write("418|12|2012-07-18 14:43:38|37.6164|-122.386|41059b00f964a520850b1fe3|empty_message\n418|12|2012-07-18 14:43:38|37.6164|-122.386|41059b00f964a520850b1fe3|empty_message") self.file.seek(0) expected = {'418': [{'venue_id': '41059b00f964a520850b1fe3', 'latitude': 37.6164, 'check_in_message': 'empty_message', 'check_in_id': '12', 'longitude': -122.386, 'date': datetime.datetime(2012, 7, 18, 14, 43, 38)}, {'venue_id': '41059b00f964a520850b1fe3', 'latitude': 37.6164, 'check_in_message': 'empty_message', 'check_in_id': '12', 'longitude': -122.386, 'date': datetime.datetime(2012, 7, 18, 14, 43, 38)}]} actual = DataLoader.load_check_ins_from_file(self.file) self.assertDictEqual(expected, actual)
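# The fixture above implies the pipe-delimited record format that
# DataLoader.load_check_ins_from_file parses:
# user_id|check_in_id|date|latitude|longitude|venue_id|message.
# A minimal standalone sketch of parsing one such line; parse_check_in_line
# is a hypothetical helper, not the class's actual code. The empty-venue
# check mirrors the ValueError asserted in test_invalid_venue below.
import datetime

def parse_check_in_line(line):
    """Split one check-in record into (user_id, check-in dict)."""
    user_id, check_in_id, date, lat, lon, venue_id, message = line.rstrip('\n').split('|')
    if not venue_id:
        raise ValueError('venue_id can not be an empty string')
    return user_id, {'check_in_id': check_in_id,
                     'date': datetime.datetime.strptime(date, '%Y-%m-%d %H:%M:%S'),
                     'latitude': float(lat),
                     'longitude': float(lon),
                     'venue_id': venue_id,
                     'check_in_message': message}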
word_vocab = torch.load(args.vocab_file)
logging.info('load word vocab, size: %s' % len(word_vocab))
rel_vocab = torch.load(args.rel_vocab_file)
logging.info('load relation vocab, size: %s' % len(rel_vocab))
ent_vocab = torch.load(args.ent_vocab_file)
logging.info('load entity vocab, size: %s' % len(ent_vocab))

if args.atten_mode == "arsmcnn":
    train_loader = ArsmcnnLoader("./data/arsmcnn_train.pt", device)
    logging.info('load train data, batch_num: %d\tbatch_size: %d'
                 % (train_loader.batch_num, train_loader.batch_size))
    valid_loader = ArsmcnnLoader("./data/arsmcnn_valid.pt", device)
    logging.info('load valid data, batch_num: %d\tbatch_size: %d'
                 % (valid_loader.batch_num, valid_loader.batch_size))
else:
    train_loader = DataLoader(args.train_file, device)
    logging.info('load train data, batch_num: %d\tbatch_size: %d'
                 % (train_loader.batch_num, train_loader.batch_size))
    valid_loader = DataLoader(args.valid_file, device)
    logging.info('load valid data, batch_num: %d\tbatch_size: %d'
                 % (valid_loader.batch_num, valid_loader.batch_size))

os.makedirs(args.save_path, exist_ok=True)

#############################################
#                build model                #
#############################################
def get_models(args):
    if args.atten_mode in ["seq", "both"]:
        encoder_output_size = args.d_hidden * 2 + args.d_rel_embed
# sklearn.cross_validation was removed; model_selection is the current module.
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from pylab import rcParams
from DataLoader import DataLoader
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA, FastICA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.random_projection import SparseRandomProjection
from sklearn.neural_network import MLPClassifier

rcParams['figure.figsize'] = 10, 7

# load data
output_path = 'outputs\\Marketing'
dl_1 = DataLoader('data\\UCI-bank-marketing.csv', output_path, 'Marketing')
dl_1.load_data()
dl_1.scaled_data()
X, y = dl_1.get_data()

# k-means clustering
# Expectation Maximization
# PCA
# ICA
# Randomized Projections
# LDA
def clustering_algo(X, y, cluster, n_c=2, n_i=10):
    if cluster == 'KM':
        clf = KMeans(n_clusters=n_c, n_init=n_i).fit(X)
    elif cluster == 'EM':
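# clustering_algo breaks off at the EM branch above. A standalone sketch of
# the full dispatch, assuming the EM branch mirrors the KMeans call with the
# already-imported GaussianMixture (an assumption, not the original code):
def clustering_algo_sketch(X, cluster, n_c=2, n_i=10):
    """Hypothetical completed version of the dispatch above."""
    if cluster == 'KM':
        return KMeans(n_clusters=n_c, n_init=n_i).fit(X)
    elif cluster == 'EM':
        return GaussianMixture(n_components=n_c, n_init=n_i).fit(X)
    raise ValueError('unknown cluster type: %s' % cluster)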
def main(opts): # Set number of actions opts.A = opts.delta_M * opts.delta_N # Set random seeds set_random_seeds(opts.seed) if opts.dataset == 0: if opts.mean_subtract: opts.mean = [119.16, 107.68, 95.12] opts.std = [61.88, 61.72, 67.24] else: opts.mean = [0, 0, 0] opts.std = [0, 0, 0] opts.num_channels = 3 elif opts.dataset == 1: if opts.mean_subtract: opts.mean = [193.0162338615919] opts.std = [37.716024486312811] else: opts.mean = [0] opts.std = [0] opts.num_channels = 1 else: raise ValueError('Dataset %d does not exist!' % (opts.dataset)) # Create tensorboard writer writer = SummaryWriter(log_dir=opts.save_path_vis) loader = DataLoader(opts) agent = Agent(opts, mode='eval') loaded_state = torch.load(opts.load_model) agent.policy.load_state_dict(loaded_state['state_dict']) h5file = h5py.File(opts.save_path_h5, 'w') all_splits = ['train', 'val', 'test'] if opts.dataset == 1: all_splits.append('test_unseen') for split in all_splits: true_images, utility_images, utility_matrices = get_utility_maps( loader, agent, split, opts) reward_matrices = [] for i in range(len(true_images)): shape = true_images[i].shape reward_matrix = np.zeros((shape[0], opts.N, opts.M)) for j in range(shape[0]): optimal_views, utility_value = get_submodular_views( utility_matrices[i][j], 4) for k in optimal_views: for itera in [ a_val % opts.N for a_val in range(k[0] - opts.nms_nbd, k[0] + opts.nms_nbd + 1) ]: for iterb in [ b_val % opts.M for b_val in range(k[1] - opts.nms_nbd, k[1] + opts.nms_nbd + 1) ]: reward_matrix[j, itera, iterb] += 255.0 / 4.0**( max(abs(k[0] - itera), abs(k[1] - iterb))) reward_matrix = np.minimum(reward_matrix, 255.0) reward_matrices.append(reward_matrix) if opts.debug: num_batches = len(true_images) assert (len(utility_images) == num_batches) assert (len(utility_matrices) == num_batches) for i in range(num_batches): batch_size = true_images[i].shape[0] assert (utility_images[i].shape == (batch_size, opts.N, opts.M, opts.N, opts.M, opts.num_channels, 8, 8)) assert (utility_matrices[i].shape == (batch_size, opts.N, opts.M, opts.N, opts.M)) if split == 'val': images_count = 0 # Iterate through the different batches for i in range(len(true_images)): shape = true_images[i].shape true_images[i] = np.reshape( true_images[i].transpose(0, 3, 1, 4, 2, 5), (shape[0], 1, shape[3], shape[1] * shape[4], shape[2] * shape[5])) / 255.0 utility_images_normal = np.reshape( utility_images[i].transpose(0, 1, 2, 5, 3, 6, 4, 7), (shape[0], opts.N * opts.M, opts.num_channels, opts.N * 8, opts.M * 8)) for j in range(shape[0]): x = vutils.make_grid(torch.Tensor( utility_images_normal[j]), padding=3, normalize=False, scale_each=False, nrow=opts.M) images_count += 1 writer.add_image( 'Panorama #%5.3d utility' % (images_count), x, 0) # ---- Apply submodularity based greedy algorithm to get near-optimal views ---- optimal_views, utility_value = get_submodular_views( utility_matrices[i][j], 4) optimal_views_images = np.zeros( (opts.N, opts.M, opts.num_channels, 32, 32)) # Convert the scores into images for visualization for k in optimal_views: optimal_views_images[k[0], k[1]] = 1.0 optimal_views_images = np.reshape( optimal_views_images.transpose(2, 0, 3, 1, 4), (1, opts.num_channels, opts.N * 32, opts.M * 32)) # Get the reward image computed based on optimal_views reward_image = np.repeat(np.repeat(np.repeat( reward_matrices[i][j][:, :, np.newaxis, np.newaxis, np.newaxis], repeats=opts.num_channels, axis=2), repeats=32, axis=3), repeats=32, axis=4) reward_image = np.reshape( reward_image.transpose(2, 0, 3, 1, 4), (1, 
opts.num_channels, opts.N * 32, opts.M * 32)) / 255.0 # Concatenate the true image, optimal view image and reward image for display concatenated_images = np.concatenate([ true_images[i][j], optimal_views_images, reward_image ], axis=0) x = vutils.make_grid(torch.Tensor(concatenated_images), padding=3, normalize=False, scale_each=False, nrow=1) writer.add_image('Panorama #%5.3d image' % (images_count), x, 0) utility_matrices = np.concatenate(utility_matrices, axis=0) reward_matrices = np.concatenate(reward_matrices, axis=0) h5file.create_dataset('%s/utility_maps' % split, data=utility_matrices) h5file.create_dataset('%s/nms' % (split), data=reward_matrices) json.dump(vars(opts), open(opts.save_path_json, 'w')) writer.close() h5file.close()
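# get_submodular_views is not defined in this snippet. Selecting views whose
# combined utility covers the panorama is a monotone submodular maximization
# problem, for which the standard greedy algorithm achieves a (1 - 1/e)
# approximation. A sketch under assumed semantics: utility[a, b, c, d] is the
# benefit view (a, b) provides toward target view (c, d), and coverage of a
# target is the max over selected views. greedy_submodular_views is a
# hypothetical stand-in, not the original function.
import numpy as np

def greedy_submodular_views(utility, k):
    """Greedily pick k views maximizing max-over-selected coverage."""
    N, M = utility.shape[:2]
    covered = np.zeros(utility.shape[2:])
    selected = []
    for _ in range(k):
        best_gain, best_view = -np.inf, None
        for a in range(N):
            for b in range(M):
                if (a, b) in selected:
                    continue
                # Marginal gain of adding view (a, b) to the selection.
                gain = np.maximum(covered, utility[a, b]).sum() - covered.sum()
                if gain > best_gain:
                    best_gain, best_view = gain, (a, b)
        selected.append(best_view)
        covered = np.maximum(covered, utility[best_view])
    return selected, covered.sum()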
running_loss += loss.item() if __name__ == "__main__": if len(sys.argv) != 2: print("enter new or continue or testonly") exit(0) MODE = sys.argv[1] if MODE != "new" and MODE != "continue" and MODE != "testonly": print("enter new or continue or testonly") exit(0) ## input format + loss function data_loader = DataLoader(INPUT_PATH, labels_type="float") trainloader = data_loader.get_trainloader() testloader = data_loader.get_testloader() net = Net() net = net.cuda() if MODE == "continue" or MODE == "testonly": net.load_state_dict(torch.load(PATH)) criterion = nn.SmoothL1Loss() #criterion = nn.MSELoss() optimizer = optim.SGD(net.parameters(), lr=0.0004, momentum=0.7) if MODE != "testonly": for epoch in range(TOTAL_EPOCH): running_loss = 0.0
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()
    parser.add_argument('-data', required=True)
    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-batch_size', type=int, default=64)
    #parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=1024)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)
    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)
    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')
    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default=None)
    parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best')
    parser.add_argument('-no_cuda', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    #========= Loading Dataset =========#
    data = torch.load(opt.data)
    opt.max_token_seq_len = data['settings'].max_token_seq_len

    #========= Preparing DataLoader =========#
    training_data = DataLoader(data['dict']['src'],
                               data['dict']['tgt'],
                               src_insts=data['train']['src'],
                               tgt_insts=data['train']['tgt'],
                               batch_size=opt.batch_size,
                               cuda=opt.cuda)
    validation_data = DataLoader(data['dict']['src'],
                                 data['dict']['tgt'],
                                 src_insts=data['valid']['src'],
                                 tgt_insts=data['valid']['tgt'],
                                 batch_size=opt.batch_size,
                                 shuffle=False,
                                 test=True,
                                 cuda=opt.cuda)

    opt.src_vocab_size = training_data.src_vocab_size
    opt.tgt_vocab_size = training_data.tgt_vocab_size

    #========= Preparing Model =========#
    if opt.embs_share_weight and training_data.src_word2idx != training_data.tgt_word2idx:
        print('[Warning]',
              'The src/tgt word2idx tables differ, but sharing the word embeddings was requested.')

    print(opt)

    transformer = Transformer(opt.src_vocab_size,
                              opt.tgt_vocab_size,
                              opt.max_token_seq_len,
                              proj_share_weight=opt.proj_share_weight,
                              embs_share_weight=opt.embs_share_weight,
                              d_k=opt.d_k,
                              d_v=opt.d_v,
                              d_model=opt.d_model,
                              d_word_vec=opt.d_word_vec,
                              d_inner_hid=opt.d_inner_hid,
                              n_layers=opt.n_layers,
                              n_head=opt.n_head,
                              dropout=opt.dropout)
    #print(transformer)

    optimizer = ScheduledOptim(
        optim.Adam(transformer.get_trainable_parameters(), betas=(0.9, 0.98), eps=1e-09),
        opt.d_model, opt.n_warmup_steps)

    def get_criterion(vocab_size):
        ''' With PAD token zero weight '''
        weight = torch.ones(vocab_size)
        weight[Constants.PAD] = 0
        return nn.CrossEntropyLoss(weight, size_average=False)

    crit = get_criterion(training_data.tgt_vocab_size)

    if opt.cuda:
        transformer = transformer.cuda()
        crit = crit.cuda()

    print("===>TRAIN\n")
    train(transformer, training_data, validation_data, crit, optimizer, opt)
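# ScheduledOptim wraps Adam with a warmup learning-rate schedule keyed to
# d_model and n_warmup_steps. The canonical formula from "Attention Is All
# You Need" (assumed, not read from this file, to be what the wrapper
# applies): the rate grows linearly for n_warmup_steps, then decays as
# step**-0.5.
def transformer_lr(step, d_model=512, n_warmup_steps=4000):
    """lr(step) = d_model**-0.5 * min(step**-0.5, step * n_warmup_steps**-1.5)"""
    return d_model ** -0.5 * min(step ** -0.5, step * n_warmup_steps ** -1.5)

# With the defaults above, the rate peaks near step 4000 at about 7.0e-4.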
result = pd.merge(ratings, items[[itemID_column, itemName_column]], how='left', on=[itemID_column]) merged_data = result[[ userID_column, itemID_column, itemName_column, ratings_column ]] # - testUser = 78 k = 10 merged_data[merged_data['user_id'] == testUser].sort_values( by=['rating'], ascending=False)[:40] ml = DataLoader(items_path, ratings_path, userID_column, itemID_column, ratings_column, itemName_column, size_of_data) data = ml.loadData(rating_scale_min, rating_scale_max) trainSet = data.build_full_trainset() sim_options = {'name': 'cosine', 'user_based': False} model = KNNBasic(sim_options=sim_options) model.fit(trainSet) simsMatrix = model.compute_similarities() simsMatrix.shape testUserInnerID = trainSet.to_inner_uid(testUser) # Get the top K items we rated
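# The cell stops right after converting the raw user id to Surprise's inner
# id. The usual item-based KNN continuation (a sketch of the standard
# pattern, not necessarily this file's exact code): take the user's k
# highest-rated items, then score every other item by similarity-weighted
# ratings from simsMatrix.
import heapq
from collections import defaultdict

user_ratings = trainSet.ur[testUserInnerID]  # list of (item_inner_id, rating)
k_neighbors = heapq.nlargest(k, user_ratings, key=lambda t: t[1])

candidates = defaultdict(float)
for item_inner_id, rating in k_neighbors:
    for inner_id, sim_score in enumerate(simsMatrix[item_inner_id]):
        candidates[inner_id] += sim_score * (rating / rating_scale_max)  # normalize by the rating ceiling

# Drop items the user already rated, keep the 10 best-scoring candidates.
rated = set(item_id for item_id, _ in user_ratings)
top_items = [iid for iid, _ in sorted(candidates.items(), key=lambda t: -t[1])
             if iid not in rated][:10]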
import pandas as pd
import numpy as np
from math import log
from DataLoader import DataLoader

def buildTree(X):
    print(X)

# buildTree must be defined before this module-level call; in the original
# order the call preceded the def and raised a NameError.
[X, y, df] = DataLoader.getDataSet()
buildTree(X)
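# The `from math import log` import suggests an ID3-style tree built on
# information gain; buildTree above is only a stub. A minimal entropy helper
# such a tree would need (a sketch; `entropy` is a hypothetical name, not
# from the original):
from collections import Counter

def entropy(labels):
    """Shannon entropy of a label sequence, in bits."""
    counts = Counter(labels)
    total = float(len(labels))
    return -sum((c / total) * log(c / total, 2) for c in counts.values())

# entropy(['a', 'a', 'b', 'b'])  ->  1.0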
def train_LR(self, lr): trainer.train_LR(self, lr) dataloader = DataLoader() datasets =dataloader.load_shared_data() train_set_x, train_set_y = datasets[0] valid_set_x, valid_set_y = datasets[1] test_set_x, test_set_y = datasets[2] params=np.empty((28*28)*10+10); climin.initialize.randomize_normal(params,0,1) params = params/(28*28) lr.setParams(params); x=lr.x y=lr.y cost = ( lr.negative_log_likelihood(y) + self.L1_lambda * lr.L1 + self.L2_lambda * lr.L2_sqr ) g_W = T.grad(cost=cost, wrt=lr.W) g_b = T.grad(cost=cost, wrt=lr.b) g_W_model = theano.function( inputs=[x,y], outputs=g_W ) g_b_model = theano.function( inputs=[x,y], outputs=g_b ) batch_size = self.batch_size index = T.lscalar() test_err_model = theano.function( inputs=[index], outputs=lr.zeroOneLoss(y), givens={ x: test_set_x[index * batch_size:(index + 1) * batch_size], y: test_set_y[index * batch_size:(index + 1) * batch_size] } ) train_err_model = theano.function( inputs=[index], outputs=lr.zeroOneLoss(y), givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size:(index + 1) * batch_size] } ) validate_err_model = theano.function( inputs=[index], outputs=lr.zeroOneLoss(y), givens={ x: valid_set_x[index * batch_size:(index + 1) * batch_size], y: valid_set_y[index * batch_size:(index + 1) * batch_size] }) # compute number of minibatches for training, validation and testing batch_size = self.batch_size; n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size def train_error(): train_losses = [train_err_model(i) for i in xrange(n_train_batches)] this_train_losses = np.mean(train_losses) return this_train_losses; def validate_error(): validation_losses = [validate_err_model(i) for i in xrange(n_valid_batches)] this_validation_loss = np.mean(validation_losses) return this_validation_loss; def test_error(): test_losses = [test_err_model(i) for i in xrange(n_test_batches)] this_test_loss = np.mean(test_losses) return this_test_loss; def d_loss_wrt_pars(parameters, inpt, targets): lr.setParams(parameters) gwValue = g_W_model(inpt,targets) gbValue = g_b_model(inpt,targets) return np.concatenate([gwValue.flatten(),gbValue]) args = ((i, {}) for i in climin.util.iter_minibatches([train_set_x.eval(), train_set_y.eval()], self.batch_size, [0, 0])) opt = climin.rmsprop.RmsProp(params, d_loss_wrt_pars, step_rate=self.learning_rate,decay=self.decay, momentum=self.momentum, args=args) validation_frequency = n_train_batches directory=check_create_observations_dir() self.output_directory = directory bestValidationLoss = np.Inf; for info in opt: if info['n_iter'] % validation_frequency ==0: epoch_no = info['n_iter']/n_train_batches train_err=train_error() validation_err = validate_error() test_err = test_error() self.add_train_data(epoch_no, train_err, validation_err, test_err) if epoch_no % 10 ==0: repfields_path=os.path.join(directory,"repFields"+str(epoch_no).zfill(3)+'.png') W_vals=lr.W.get_value(borrow=True) display(W_vals,repfields_path) if epoch_no >= self.n_epochs: break if validation_err < bestValidationLoss: bestValidationLoss = validation_err # # if validation_err *0.95 > bestValidationLoss: # # print "Best Validation Error : %f Validation err:%f " %(bestValidationLoss,validation_err) # break; if epoch_no > 15 and train_err < 0.9* validation_err: break print "Iteration no: %d Validation error = %f" %(epoch_no,validation_err*100) 
trainer.save_errors(self, directory) repfields_final_path=os.path.join(directory,"repFields.png") W_vals=lr.W.get_value(borrow=True) display(W_vals,repfields_final_path)
""" mask = np.zeros( (input_image.shape[0], input_image.shape[1], self.num_classes)) image = color.convert_colorspace(input_image, "RGB", color_space)[:, :, 1] image = (image - np.min(image)) / (np.max(image) - np.min(image)) image = exposure.adjust_gamma(image, gamma=0.5) for i in range(self.window_size // 2, mask.shape[0] - (self.window_size // 2)): for j in range(self.window_size // 2, mask.shape[1] - (self.window_size // 2)): window = image[i - (self.window_size // 2):i + (self.window_size // 2), j - (self.window_size // 2):j + (self.window_size // 2)] mask[i, j, :] = self.sigmoid(window.reshape(1, -1), self.W, self.b) return mask if __name__ == "__main__": DM = DataLoader("ECE276A_HW1/trainset/", 0.9, ["barrel_blue", "non_barrel_blue", "rest"]) window_size = 10 gen = DM.data_generator("labeled_data/Stored_Values.pickle", window_size=window_size, step_size=2) model = LogisticRegression(window_size, num_classes=3) model.train(gen, epochs=1000, learning_rate=0.01) pickle_file = "trained_models/model2.pickle"
from Experiments import * from DataLoader import DataLoader if __name__ == '__main__': # load data dl_1 = DataLoader('data\\UCI-bank-marketing.csv', 'outputs\\Marketing', 'Marketing') dl_1.load_data() # run classifier ANN(dl_1) BOOST(dl_1) SVM_RBF(dl_1) SVM_linear(dl_1) KNN(dl_1) DT(dl_1) # load data dl_2 = DataLoader('data\\Heart.csv', 'outputs\\Heart', 'Heart') dl_2.load_data() # run classifier ANN(dl_2) BOOST(dl_2) SVM_linear(dl_2) KNN(dl_2) SVM_RBF(dl_2) DT(dl_2) # # load data # dl_3 = DataLoader('data\\Cancer.csv', 'outputs\\Cancer', 'Cancer') # dl_3.load_data()
import argparse

import numpy as np
import matplotlib.pyplot as plt

from Align2D import Align2D
# Project-local modules; import paths assumed from the usage below:
from DataLoader import DataLoader
from SDFScanMatcher import SDFScanMatcher

parser = argparse.ArgumentParser(
    description='visualizer for raw measurements from the intel dataset')
parser.add_argument('--laser_file', type=str,
                    default='../data/intel_LASER_.txt',
                    help='name of the laser scanner log file')
parser.add_argument('--odometry_file', type=str,
                    default='../data/intel_ODO.txt',
                    help='name of the odometry log file')
args = parser.parse_args()

loader = DataLoader(args.laser_file, args.odometry_file)
measurements = loader.measurements

disc = 0.25
matcher = SDFScanMatcher(discretization=disc)
matcher.AddScan(measurements[0].points)
matcher.AddScan(measurements[1].points)
matcher.AddScan(measurements[2].points)
matcher.AddScan(measurements[34].points)
matcher.AddScan(measurements[35].points)

res, J, grads = matcher.GetResidualAndJacobian(measurements[35].points, np.identity(3))
#sdf = SDFMap([10,10])

print("residual on next scan: {:f}".format(np.linalg.norm(res**2)))
fig = plt.figure()
def _predict_regression(self, x):
    return self.w.dot(x)

def evaluate_accuracy(Y_pred, Y_true):
    return np.mean(Y_pred == Y_true, axis=0)

def evaluate_squared_error(Y_pred, Y_true):
    # Root-mean-squared error, despite the function name.
    return np.sqrt(np.mean((Y_pred - Y_true) ** 2, axis=0))

if __name__ == '__main__':
    import sys
    from DataLoader import DataLoader
    try:
        if sys.argv[1] == 'mnist':
            data = DataLoader('mnist')
            training_size = 1000
            eta0 = 1
            gamma = 0.1
        elif sys.argv[1] == 'cs':
            data = DataLoader('cs')
            training_size = 15
            eta0 = 0.05
            gamma = 5e-1
        else:
            raise IndexError  # unknown dataset name: fall through to the usage message
    except IndexError:
        print 'usage: python %s [mnist|cs]' % sys.argv[0]
        sys.exit(1)
    if data.type == 'classification':
        evaluate = evaluate_accuracy
    if data.type == 'regression':
import math import numpy as np import random from collections import Counter from DataLoader import DataLoader from Exceptions import TooSmallSingularValueError from Models import StanfordModel, NCGModel, SocialModelStanford, CorrectSocialModelStanford, AdvancedSocialModel, SimpleSocialModel from Utils import Utils datasets = DataLoader.load_check_ins_from_directory("top_felix_users") users = datasets.keys() network = DataLoader.load_social_network(open("top_felix_users_connections.csv")) """friends = [] for user in datasets: if user == '104665558': continue friends.append(len(network[user])) print np.min(friends) print np.mean(friends) print np.max(friends) exit()""" #print users #exit() #users = ["10221"] #users = ['45474206', '276391406', '21913365', '27818171', '40557413', '19836108', '488667514', '94173972', '28668373', '33660680', '292750714', '104665558', '23209554', '549041707', '18488759', '82666753', '133067027', '30235429', '41234692', '29109326', '169585114', '14665537', '54670715', '258576072', '16332709', '83111133', '75911133', '573461782', '563315196', '111258523', '2365991', '24441491', '240102387']
class ASGCNModelEvaluation: def __init__(self, embedding_type, dataset, model_name, max_degree=696, learning_rate=0.001, weight_decay=5e-4, dropout=0.0, epochs=300, early_stopping=30, hidden1=16, rank=128, skip=0, var=0.5, sampler_device="cpu", gpu=None, recs=10): os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" os.environ["CUDA_VISIBLE_DEVICES"] = "-1" self.d = DataLoader() self.model = ASGCNModel(embedding_type, dataset, model_name, max_degree, learning_rate, weight_decay, dropout, epochs, early_stopping, hidden1, rank, skip, var, sampler_device, gpu, recs) def evaluate(self): # Load test data query_test, truth = self.d.evaluation_data_with_abstracts_citations() # Retrieve predictions recommendation = self.model.query_batch(query_test) # Evaluate print("Evaluating...") evaluation = EvaluationContainer() evaluation.evaluate(recommendation, truth) def main(): parser = argparse.ArgumentParser( description='Arguments for ASGCN model evaluation.') parser.add_argument('embedding_type', choices=["AVG_L", "AVG_2L", "AVG_SUM_L4", "AVG_SUM_ALL", "MAX_2L", "CONC_AVG_MAX_2L", "CONC_AVG_MAX_SUM_L4", "SUM_L", "SUM_2L" ], help="Type of embedding.") parser.add_argument('dataset', help='Name of the object file that stores the ' + 'training data.') parser.add_argument("model_name", choices=["gcn_adapt", "gcn_adapt_mix"], help="Model names.") parser.add_argument('--max_degree', type=int, default=696, help='Maximum degree for constructing the ' + 'adjacent matrix.') parser.add_argument('--learning_rate', type=float, default=0.001, help='Learning rate.') parser.add_argument('--weight_decay', type=float, default=5e-4, help='Weight decay.') parser.add_argument('--dropout', type=float, default=0.0, help='Dropout rate (1 - keep probability).') parser.add_argument('--epochs', type=int, default=300, help='Number of epochs to train.') parser.add_argument('--early_stopping', type=int, default=30, help='Tolerance for early stopping (# of epochs).') parser.add_argument("--hidden1", type=int, default=16, help="Number of units in hidden layer 1.") parser.add_argument("--rank", type=int, default=128, help="The number of nodes per layer.") parser.add_argument('--skip', type=float, default=0, help='If use skip connection.') parser.add_argument('--var', type=float, default=0.5, help='If use variance reduction.') parser.add_argument("--sampler_device", choices=["gpu", "cpu"], default="cpu", help="The device for sampling: cpu or gpu.") parser.add_argument('--gpu', type=int, help='Which gpu to use.') parser.add_argument('--recs', type=int, default=10, help='Number of recommendations.') args = parser.parse_args() from ASGCNModelEvaluation import ASGCNModelEvaluation print("Starting...") model = ASGCNModelEvaluation( args.embedding_type, args.dataset, args.model_name, args.max_degree, args.learning_rate, args.weight_decay, args.dropout, args.epochs, args.early_stopping, args.hidden1, args.rank, args.skip, args.var, args.sampler_device, args.gpu, args.recs) model.evaluate() print("Finished.") if __name__ == "__main__": main()
__email__ = "*****@*****.**" __maintainer__ = "Mathew Sam" from LogisticRegression import LogisticRegression from DataLoader import DataLoader import numpy as np from scipy import ndimage import matplotlib.pyplot as plt import pickle from skimage.morphology import disk, square, erosion, dilation from skimage import exposure, color import cv2 DM = DataLoader("ECE276A_HW1/trainset/", 0.8, ["barrel_blue", "non_barrel_blue", "rest"]) pickle_file = "trained_models/model1.pickle" with open(pickle_file, 'rb') as handle: model = pickle.load(handle) figure_num = 0 for file_name in DM.train_files: figure_num = figure_num + 1 plt.figure(figure_num) file_name = DM.root_location + file_name plt.subplot(2, 1, 1) image = plt.imread(file_name) plt.imshow(image), plt.xticks([]), plt.yticks( []), plt.title("Original image") mask = model.test_image(image)
def test_invalid_venue(self): self.file.write("418|12|2012-07-18 14:43:38|37.6164|-122.386|41059b00f964a520850b1fe3|empty_message\n418|12|2012-07-18 12:34:45|34|-122.386||empty_message") self.file.seek(0) with self.assertRaises(ValueError) as cm: DataLoader.load_check_ins_from_file(self.file) self.assertEqual(cm.exception.message, 'Error in line 2: venue_id can not be an empty string')