def load_goals(self, task, model): print 'Reading in raw data from the task.' read_task_data = DataReader_Task(task, model) raw_data, raw_num, raw_reference, self.raw_reference_options = read_task_data.reset_goals() read_data = DataReader(subject=subject, data_start=data_start, reference_options=self.raw_reference_options, data_finish=data_finish, model=model, task=task) # raw_data = read_data.get_raw_data() print 'Raw data is ready!' self.goal_data = read_data.generate_output_goals(test_goals=raw_data, test_number=raw_num, test_reference=raw_reference) # print 'Setting up openrave' # self.setup_openrave() # print 'I will now pick base locations to evaluate. They will share the same reachability score, but will have' \ # ' differing manipulability scores.' # print 'before sorting:' # for i in xrange(10): # print self.scores[i] self.scores = np.array(sorted(self.scores, key=lambda t: (t[1][1], t[1][2]), reverse=True)) # print 'after sorting:' # for i in xrange(10): # print self.scores[i] self.best_base = self.scores[0] if self.best_base[1][1] == 0: print 'There are no base locations with reachable goals. Something went wrong in the scoring or the setup' print 'The best base location is: \n', self.best_base visualize_best = True if visualize_best: self.visualize_base_config(self.best_base, self.goal_data, self.raw_reference_options)
def train_lm(testing=False):
    data = DataReader(token_to_id_path, segment_sepparator)

    # Create model nodes for the source and target inputs
    input_sequence, label_sequence = create_inputs(data.vocab_dim)

    # Create the model. It has three output nodes:
    # z: the input to softmax that provides the latent representation of the next token
    # cross_entropy: used as the training criterion
    # error: a binary indicator of whether the model predicts the correct token
    z, cross_entropy, error = create_model(input_sequence, label_sequence, data.vocab_dim, hidden_dim)

    # For measurement we use the (built-in) full softmax.
    full_ce = C.cross_entropy_with_softmax(z, label_sequence)

    # Print out some useful training information
    log_number_of_parameters(z)
    print()

    # Run the training loop
    num_trained_samples = 0
    num_trained_samples_since_last_report = 0

    # Instantiate the trainer object to drive the model training
    lr_schedule = C.learning_parameter_schedule_per_sample(learning_rate)
    momentum_schedule = C.momentum_schedule_per_sample(momentum_per_sample)
    gradient_clipping_with_truncation = True
    learner = momentum_sgd(z.parameters, lr_schedule, momentum_schedule,
                           gradient_clipping_threshold_per_sample=clipping_threshold_per_sample,
                           gradient_clipping_with_truncation=gradient_clipping_with_truncation)
    trainer = Trainer(z, (cross_entropy, error), learner)

    last_avg_ce = 0
    for epoch_count in range(num_epochs):
        for features, labels, token_count in data.minibatch_generator(train_file_path, sequence_length, sequences_per_batch):
            arguments = {input_sequence: features, label_sequence: labels}

            t_start = timeit.default_timer()
            trainer.train_minibatch(arguments)
            t_end = timeit.default_timer()

            samples_per_second = token_count / (t_end - t_start)

            # Print a progress report every num_samples_between_progress_report samples
            if num_trained_samples_since_last_report >= num_samples_between_progress_report or num_trained_samples == 0:
                av_ce = average_cross_entropy(full_ce, input_sequence, label_sequence, data)
                print_progress(samples_per_second, av_ce, num_trained_samples, t_start)
                num_trained_samples_since_last_report = 0
                last_avg_ce = av_ce

            num_trained_samples += token_count
            num_trained_samples_since_last_report += token_count

        if not testing:
            # After each epoch, save the model
            model_filename = "models/lm_epoch%d.dnn" % epoch_count
            z.save(model_filename)
            print("Saved model to '%s'" % model_filename)

    return last_avg_ce
def __init__(self):
    data_obj = DataReader()
    self.df = data_obj.get_pandas_df()
    self.pk = "NAGcode_1"
    self.selected_columns = ['statesup', 'defacto']
    self.write_columns = [
        "{}_dep_score".format(column) for column in self.selected_columns
    ]
def eval_ae(): from Models.logistic_regression import MultiClassLogisticRegression from Models.random_forest import RandomForest from Models.naive_bayes import NaiveBayes from Models.svm import SVM # load data data_reader = DataReader() df = data_reader.get_all_data() train_x_raw, train_y_raw, val_x_raw, val_y_raw, test_x_raw, test_y_raw = get_train_validate_test_split( df) train_x, train_y, val_x, val_y, test_x, test_y = bag_of_words_full_no_empty_val_no_num_no_short_no_repeat( train_x_raw, train_y_raw, val_x_raw, val_y_raw, test_x_raw, test_y_raw) # Train an auto encoder of size 4096 encoder = get_encoder(train_x, test_x, 4096) # use auto encoder to encode the train, validate and test sets encoded_train = encoder.predict(train_x) encoded_test = encoder.predict(test_x) encoded_val = encoder.predict(val_x) # train the neural network model and calculate the precision, recall, f1 score, and accuracy print('neural net ae') model = _get_nn_model_bag_of_words_simple_scratch( encoded_train, train_y, encoded_val, val_y, data_reader.get_region_labels()['Code'], epochs=100, batch_size=256) eval_nn(model, encoded_test, test_y) evaluate_model_nn(model, encoded_test, test_y) # train the logistic regression model and calculate the precision, recall, f1 score, and accuracy print('logistic regression ae') model = MultiClassLogisticRegression() model.train(encoded_train, train_y) model_obj = lambda: None model_obj.model = model eval_model(model_obj, encoded_test, test_y) evaluate_model(model, encoded_test, test_y) # train the random forest model and calculate the precision, recall, f1 score, and accuracy print('random forest ae') model = RandomForest() model.train(encoded_train, train_y) model_obj = lambda: None model_obj.model = model eval_model(model_obj, encoded_test, test_y) evaluate_model(model, encoded_test, test_y) # train the naive bayes model and calculate the precision, recall, f1 score, and accuracy print('naive bayes ae') model = NaiveBayes() model.train(encoded_train, train_y) model_obj = lambda: None model_obj.model = model eval_model(model_obj, encoded_test, test_y) evaluate_model(model, encoded_test, test_y)
def main(args): logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO) coord = tf.train.Coordinator() if args.mode == "train": with tf.compat.v1.name_scope('create_inputs'): data_reader = DataReader( data_dir=args.train_dir, data_list=args.train_list, mask_window=0.4, queue_size=args.batch_size * 3, coord=coord) if args.valid_list is not None: data_reader_valid = DataReader( data_dir=args.valid_dir, data_list=args.valid_list, mask_window=0.4, queue_size=args.batch_size * 2, coord=coord) logging.info( "Dataset size: train {}, valid {}".format(data_reader.num_data, data_reader_valid.num_data)) else: data_reader_valid = None logging.info("Dataset size: train {}".format(data_reader.num_data)) train_fn(args, data_reader, data_reader_valid) elif args.mode == "valid" or args.mode == "test": with tf.compat.v1.name_scope('create_inputs'): data_reader = DataReader_test( data_dir=args.data_dir, data_list=args.data_list, mask_window=0.4, queue_size=args.batch_size * 10, coord=coord) valid_fn(args, data_reader) elif args.mode == "pred": with tf.compat.v1.name_scope('create_inputs'): if args.input_mseed: data_reader = DataReader_mseed( data_dir=args.data_dir, data_list=args.data_list, queue_size=args.batch_size * 10, coord=coord, input_length=args.input_length) else: data_reader = DataReader_pred( data_dir=args.data_dir, data_list=args.data_list, queue_size=args.batch_size * 10, coord=coord, input_length=args.input_length) pred_fn(args, data_reader, log_dir=args.output_dir) else: print("mode should be: train, valid, test, pred or debug") return
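# Hedged usage sketch (not part of the original source): main() above reads its settings
# from an args object, so this builds a minimal argparse-style namespace for "pred" mode.
# The attribute names match the args.* accesses in main(); the values and paths below are
# illustrative assumptions, not the project's real defaults.
from argparse import Namespace

example_args = Namespace(
    mode="pred",
    data_dir="demo/waveforms",   # assumed data directory
    data_list="demo/fname.csv",  # assumed file list
    output_dir="results",
    batch_size=20,
    input_length=None,
    input_mseed=False,
)
# main(example_args)  # left commented: running it requires the full project and data on disk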
class LyricGenRunner: def __init__(self, model_load_path, artist_name, test, prime_text): self.sess = tf.Session() self.artist_name = artist_name print 'Process data...' self.data_reader = DataReader(self.artist_name) self.vocab = self.data_reader.get_vocab() print 'Init model...' self.model = LSTMModel(self.sess, self.vocab, c.BATCH_SIZE, c.SEQ_LEN, c.CELL_SIZE, c.NUM_LAYERS, test=test) print 'Init variables...' self.saver = tf.train.Saver(max_to_keep=None) self.sess.run(tf.initialize_all_variables()) if model_load_path is not None: self.saver.restore(self.sess, model_load_path) print 'Model restored from ' + model_load_path if test: self.test(prime_text) else: self.train() def train(self): while True: inputs, targets = self.data_reader.get_train_batch( c.BATCH_SIZE, c.SEQ_LEN) print 'Training model...' feed_dict = { self.model.inputs: inputs, self.model.targets: targets } global_step, loss, _ = self.sess.run( [self.model.global_step, self.model.loss, self.model.train_op], feed_dict=feed_dict) print 'Step: %d | loss: %f' % (global_step, loss) if global_step % c.MODEL_SAVE_FREQ == 0: print 'Saving model...' self.saver.save(self.sess, join(c.MODEL_SAVE_DIR, self.artist_name + '.ckpt'), global_step=global_step) def test(self, prime_text): sample = self.model.generate(prime=prime_text) print sample
def dimensionality_RF(instruction, dataset, target="", y="", n_features=10): global currLog global counter dataReader = DataReader("./data/" + get_last_file()[0]) if target == "": data = dataReader.data_generator() data.fillna(0, inplace=True) remove = get_similar_column(get_value_instruction(instruction), data) data = structured_preprocesser(data) y = data[remove] del data[remove] le = preprocessing.LabelEncoder() y = le.fit_transform(y) X_train, X_test, y_train, y_test = train_test_split(dataset, y, test_size=0.2, random_state=49) first_classifier = tree.DecisionTreeClassifier() first_classifier.fit(X_train, y_train) first_classifier_acc = accuracy_score(first_classifier.predict(X_test), y_test) accuracy_scores = [first_classifier_acc] columns = [] datas = [] datas.append(dataset) columns.append([]) for i, x in product(range(3, 10), range(4, len(dataset.columns))): feature_model = RandomForestRegressor(random_state=1, max_depth=i) feature_model.fit(X_train, y_train) importances = feature_model.feature_importances_ indices = np.argsort(importances)[-x:] columns.append(dataset.columns[indices]) X_temp_train = X_train[dataset.columns[indices]] X_temp_test = X_test[dataset.columns[indices]] val = pd.DataFrame(np.r_[X_temp_train, X_temp_test]) val[target] = np.r_[y_train, y_test] datas.append(val) vr = tree.DecisionTreeClassifier() vr.fit(X_temp_train, y_train) accuracy_scores.append(accuracy_score(vr.predict(X_temp_test), y_test)) the_index = accuracy_scores.index(max(accuracy_scores)) return datas[the_index], accuracy_scores[0], max(accuracy_scores), list( columns[the_index])
def main(_): config = tf.ConfigProto(allow_soft_placement=FLAGS.allow_soft_placement) with tf.Session(config=config) as sess: print('\n{} Model initializing'.format(datetime.now())) model = VistaNet(FLAGS.hidden_dim, FLAGS.att_dim, FLAGS.emb_size, FLAGS.num_images, FLAGS.num_classes) loss = loss_fn(model.labels, model.logits) train_op = train_fn(loss, model.global_step) accuracy = eval_fn(model.labels, model.logits) summary_op = tf.summary.merge_all() sess.run(tf.global_variables_initializer()) train_summary_writer.add_graph(sess.graph) saver = tf.train.Saver(max_to_keep=FLAGS.num_checkpoints) data_reader = DataReader(num_images=FLAGS.num_images, train_shuffle=True) print('\n{} Start training'.format(datetime.now())) epoch = 0 best_loss = float('inf') while epoch < FLAGS.num_epochs: epoch += 1 print('\n=> Epoch: {}'.format(epoch)) train(sess, data_reader, model, train_op, loss, accuracy, summary_op) print('=> Evaluation') print('best_loss={:.4f}'.format(best_loss)) valid_loss, valid_acc = evaluate( sess, data_reader.read_valid_set(batch_size=FLAGS.batch_size), model, loss, accuracy, summary_op) print('valid_loss={:.4f}, valid_acc={:.4f}'.format( valid_loss, valid_acc)) if valid_loss < best_loss: best_loss = valid_loss save_path = os.path.join( FLAGS.checkpoint_dir, 'epoch={}-loss={:.4f}-acc={:.4f}'.format( epoch, valid_loss, valid_acc)) saver.save(sess, save_path) print('Best model saved @ {}'.format(save_path)) print('=> Testing') result_file = open( os.path.join( FLAGS.log_dir, 'loss={:.4f},acc={:.4f},epoch={}'.format( valid_loss, valid_acc, epoch)), 'w') test(sess, data_reader, model, loss, accuracy, epoch, result_file) print("{} Optimization Finished!".format(datetime.now()))
def build_cnv_training_data(self, data_dir, outcome_file):
    excel_obj = ExcelReader()
    data_reader_obj = DataReader()
    outcome_dict = excel_obj.get_cyto_cnv_result(outcome_file)
    cnv_df = data_reader_obj.cnv_data_reader_pipeline(data_dir)
    data_df = data_reader_obj.combine_outcome_data(cnv_df, outcome_dict)
    return data_df
def __init__(self, folder, type, num, batchSize, steps, numFeatures):
    self.master_filepath = folder  # the master filepath in which all of the data is located
    self.len = int(np.ceil(num / float(batchSize)))
    self.num = num
    self.numFeatures = numFeatures
    self.batchSize = batchSize
    self.steps = steps
    reader = DataReader(self.master_filepath)
    self.data = reader.get_data(True, True)
    return
def __init__(self):
    # Load the answer dictionary
    self.dealer = DataDealer(ANSWERS_DICT_PATH)
    # Load the sample-set information
    self.reader = DataReader(TRAIN_DATA_TYPE)
    self.reader.set_pos()
    self.weight_vgg = None
    self.biase_vgg = None
    self.model_word2vec = gensim.models.KeyedVectors.load_word2vec_format(
        GLOVE_WIKI_GENSIM_DATA_PATH)
def createQuestionsDict():
    """Build the question dictionary (which includes the answer dictionary)."""
    reader = DataReader()
    reader.set_pos()
    dealer = DataDealer(ANSWERS_DICT_PATH)
    start_id = reader.get_next_pic_id()
    qa = reader.get_pic_qa(start_id)
    for q in qa:
        question = q['question']
        dealer.deal(question)
    now_id = reader.get_next_pic_id()
    i = 0
    while now_id != start_id:
        qa = reader.get_pic_qa(now_id)
        for q in qa:
            question = q['question']
            dealer.deal(question)
        now_id = reader.get_next_pic_id()
        i = i + 1
        if i % 1000 == 0:
            print('*', end='')
    dealer.saveData(QUESTIONS_DICT_PATH)
    print('over!')
def data_training():
    """Train using only the sample set."""
    sentences = []
    reader = DataReader(TRAIN_DATA_TYPE)
    reader.set_pos()
    start_id = reader.get_next_pic_id()
    qa = reader.get_pic_qa(start_id)
    for q in qa:
        question = q['question']
        question = question.replace('?', ' ?')
        question = question.replace(',', ' ,')
        question = question.replace('.', ' .')
        sentence = question.split(' ')
        sentences.append(sentence)
    now_id = reader.get_next_pic_id()
    i = 0
    while now_id != start_id:
        qa = reader.get_pic_qa(now_id)
        for q in qa:
            question = q['question']
            question = question.replace('?', ' ?')
            question = question.replace(',', ' ,')
            question = question.replace('.', ' .')
            sentence = question.split(' ')
            sentences.append(sentence)
        now_id = reader.get_next_pic_id()
        i = i + 1
        if i % 1000 == 0:
            print('*', end='')
    print('load data over!')
    model = gensim.models.Word2Vec(sentences, size=300, min_count=1)
    model.save(GENSIM_DATA_PATH)
def __init__(self):
    data_obj = DataReader()
    self.pk = "nagcode_1"
    self.df = data_obj.get_pandas_df()
    self.mode_map = {
        "active": "statesup",
        "defacto": "defacto"
    }
    self.memory = {}
    centralities = ["in-degree", "betweenness", "closeness"]
    self.selected_columns = ["{}_{}_centrality".format(mode, c)
                             for c in centralities
                             for mode in self.mode_map]
def nearest_neighbors(instruction=None,
                      dataset=None,
                      mca_threshold=None,
                      preprocess=True,
                      drop=None,
                      min_neighbors=3,
                      max_neighbors=10):
    logger("Reading in dataset....")
    # Reads in dataset
    # data = pd.read_csv(self.dataset)
    dataReader = DataReader(dataset)
    data = dataReader.data_generator()
    if drop is not None:
        data.drop(drop, axis=1, inplace=True)
    data, y, remove, full_pipeline = initial_preprocesser(
        data, instruction, preprocess, mca_threshold)
    logger("->", "Target Column Found: {}".format(remove))

    X_train = data['train']
    y_train = y['train']
    X_test = data['test']
    y_test = y['test']

    # classification_column = get_similar_column(getLabelwithInstruction(instruction), data)
    num_classes = len(np.unique(y))

    # encodes the label dataset into 0's and 1's
    y_vals = np.unique(pd.concat([y['train'], y['test']], axis=0))
    label_mappings = {}
    for i in range(len(y_vals)):
        label_mappings[y_vals[i]] = i
    y_train = y_train.apply(lambda x: label_mappings[x]).values
    y_test = y_test.apply(lambda x: label_mappings[x]).values

    models = []
    scores = []
    logger("Fitting Nearest Neighbor...")
    logger("Identifying optimal number of neighbors...")
    # Tries all neighbor possibilities, based on either defaults or user
    # specified values
    for x in range(min_neighbors, max_neighbors):
        knn = KNeighborsClassifier(n_neighbors=x)
        knn.fit(X_train, y_train)
        models.append(knn)
        scores.append(accuracy_score(knn.predict(X_test), y_test))

    logger("Stored model under 'nearest_neighbors' key")
    # Keep the classifier with the highest test accuracy and report that accuracy.
    knn = models[scores.index(max(scores))]
    clearLog()
    return {
        'id': generate_id(),
        "model": knn,
        "accuracy_score": max(scores),
        "preprocesser": full_pipeline,
        "interpreter": label_mappings,
        "target": remove,
        "cross_val_score": cross_val_score(knn, X_train, y_train, cv=3)
    }
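# Hedged usage sketch (not part of the original source): the dataset path and the
# instruction string below are assumptions; the dictionary keys read here are the ones
# constructed and returned by nearest_neighbors() above.
if __name__ == "__main__":
    result = nearest_neighbors(instruction="predict the target column",
                               dataset="data/sample.csv")   # path is an assumption
    print("best k-NN model:", result["model"])
    print("test accuracy:", result["accuracy_score"])
    print("label mapping:", result["interpreter"])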
def get_aggregate_columns(self, columns):
    data_obj = DataReader()
    df = data_obj.get_pandas_df()
    df = df.groupby(self.pk).agg(set).reset_index()
    df = df[columns]
    df['number_of_supporters'] = df['supporter'].apply(
        lambda x: len([i for i in x if not pd.isna(i)]))
    df['number_of_targets'] = df['target'].apply(
        lambda x: len([i for i in x if not pd.isna(i)]))
    df['support_target_ratio'] = df['number_of_supporters'] / df[
        'number_of_targets']
    return df
def prepareData():
    """Prepare your dataset here."""
    # with open('all.data', 'rb') as f:
    #     df = pickle.load(f)
    # df = df[~pd.isnull(df.is_trade)]
    reader = DataReader()
    # features = ["average_day_active_time", "average_login_interval", "average_spin_interval",
    #             "average_bonus_win", "spin_per_active_day", "bonus_per_active_day",
    #             "average_bet", "bonus_ratio", "free_spin_ratio", "coin"]
    df = reader.read("slot_purchase_profile_2017")
    return df
def load_exp_data(self, path, filename):
    """Load the experimental values from the file `filename` located in the
    directory `path`. If `filename` is a list or a tuple, only its first
    element is used.

    Parameters
    ----------
    path : str
        Path of the file to load.
    filename : str or list-like of str
        Name of the file to load.

    Returns
    -------
    Returns None if the read succeeded, otherwise returns the error.
    """
    try:
        reader = DataReader(os.path.join(path, filename))
    except FileNotFoundError as err:
        print(err)
        return err
    except ValueError as err:
        print("ValueError: ", err)
        return err
    except Exception as err:
        print(err)
        return err

    self.exptRaw = reader.get_t()
    self.expIRaw = reader.get_I()
    self.expt = self.exptRaw
    self.expI = self.expIRaw

    # For the interval adjustment
    self.valIntervalMin = (min(self.expt))
    self.valIntervalMax = (max(self.expt))

    self.mainGraph.set_experimental_data(self.expt, self.expI)

    # Recompute the theoretical values to match the range of the
    # experimental values
    self.t = cm.create_t(0, max(self.expt), 1000)
    self.I = cm.cottrell_curve_gen(self.valN, self.valS, self.valC, self.valDth, self.t)
    self.mainGraph.set_theoric_data(self.t, self.I)
    self.mainGraph.set_limit_interval()
    self.mainGraph.update()

    self.expDataLoaded = True
    return None
def test(self): batch_size = 4 num_unroll_steps = 3 char_vocab_size = 51 max_word_length = 11 char_embed_size = 3 _, _, word_data, char_data, _ = load_data('data/', max_word_length) dataset = char_data['train'] self.assertEqual(dataset.shape, (929589, max_word_length)) reader = DataReader(word_data['train'], char_data['train'], batch_size=batch_size, num_unroll_steps=num_unroll_steps) for x, y in reader.iter(): assert x.shape == (batch_size, num_unroll_steps, max_word_length) break self.assertAllClose(X, x) self.assertAllClose(Y, y) with self.test_session() as session: input_ = tf.placeholder( tf.int32, shape=[batch_size, num_unroll_steps, max_word_length], name="input") ''' First, embed characters ''' with tf.variable_scope('Embedding'): char_embedding = tf.get_variable( 'char_embedding', [char_vocab_size, char_embed_size]) # [batch_size x max_word_length, num_unroll_steps, char_embed_size] input_embedded = tf.nn.embedding_lookup(char_embedding, input_) input_embedded = tf.reshape( input_embedded, [-1, max_word_length, char_embed_size]) session.run(tf.assign(char_embedding, EMBEDDING)) ie = session.run(input_embedded, {input_: x}) #print(x.shape) #print(np.transpose(x, (1, 0, 2))) #print(ie.shape) ie = ie.reshape([ batch_size, num_unroll_steps, max_word_length, char_embed_size ]) ie = np.transpose(ie, (1, 0, 2, 3)) #print(ie[0,:,:,:]) self.assertAllClose(IE3, ie[0, :, :, :])
def __init__(self, model_name='model', test=False):
    self.session = tf.Session()

    print('Process data...')
    self.data_reader = DataReader()
    self.vocab = self.data_reader.get_vocab()

    print('Init model...')
    self.model = Model(self.session,
                       self.vocab,
                       c.BATCH_SIZE,
                       c.SEQ_LEN,
                       c.CELL_SIZE,
                       c.NUM_LAYERS,
                       test)

    print('Init variables...')
    self.test = test
    self.saver = tf.train.Saver(max_to_keep=None)
    self.session.run(tf.global_variables_initializer())

    self.model_name = model_name
def __init__(self, root_dir, up_level, save_dir='captions', bad_words_dict=set()):
    encoder = VQAMaskRCNNBenchmark()
    captioner = PythiaCaptioner(use_constrained=True)
    self.model = PythiaBUTD(encoder=encoder, captioner=captioner)
    self.model.to(device)
    self.data_iterator = DataReader(root_dir)
    self.bad_words_dict = bad_words_dict
    self.up_level = up_level
    self.captions = {}
    self.save_dir = save_dir
def __init__(self, fmt, filepath, sampling_interval):
    self.interval = datetime.timedelta(minutes=sampling_interval)
    reader = DataReader(fmt, filepath, self.interval)
    self.raw_data = reader.read()
    self.data = list(self.raw_data)
    print(f"Reading {len(self.data)} segments")
    self.sampling_horizon, self.prediction_horizon = 0, 0
    self.scale, self.train_test_ratio = 0, 0
    self.n, self.set_cutpoint = len(self.data), False
    self.train_x, self.train_y, self.train_weights = None, None, None
    self.test_x, self.test_y = None, None
    self.train_n, self.test_n = 0, 0
    self.train_idx = None
def data_gen(train_or_test='train') -> Tuple:
    data_reader = DataReader(dataset=scene_name,
                             context_size=CONTEXT_SIZE,
                             root=root_path,
                             mode=train_or_test)
    while True:
        data = data_reader.read(batch_size=12)
        query: Query = data[0]
        target_img_batch: np.ndarray = data[1]
        context: Context = query[0]
        query_camera_batch: np.ndarray = query[1]
        context_images: np.ndarray = context[0]
        context_cameras: np.ndarray = context[1]
        yield target_img_batch, target_img_batch
def main():
    # Prepare dataset from csv to npz files
    # DatasetPreparation.prepare('train_preprocessed.csv', 'test_preprocessed.csv')

    # Read the dataset, create batches, and one hot encode the targets
    batch_size = 100
    train_data = DataReader('train.npz', batch_size)
    validation_data = DataReader('validation.npz')
    test_data = np.load('test.npz')

    m = Model(train_data, validation_data)
    m.train()
    m.test(test_data)
def test(self):
    # Find the TESTDATA_FILE in the same directory as this script file.
    dir_path = os.path.dirname(os.path.realpath(__file__))
    testdata_path = os.path.join(dir_path, self.TESTDATA_FILE)

    # Read each test case (data chunk) and verify the expected schema.
    with open(testdata_path) as testdatafile:
        data_reader = DataReader(testdatafile)
        chunk_count = 0
        while True:
            chunk = data_reader.read_chunk()
            if chunk is None:
                break
            chunk_count += 1
            self.verify_data_chunk(chunk_count, chunk)
def indexView(): ''' Renders the template for the index. ''' # if 'pond_pic_visible' not in session: # session['pond_pic_visible']='visible' #http://runnable.com/UiPcaBXaxGNYAAAL/how-to-upload-a-uploaded_file-to-the-server-in-flask-for-python if request.method == 'POST': #true if the button "upload" is clicked # Get the name of the uploaded uploaded_file uploaded_file = request.files['uploaded_file'] # Check if the uploaded_file is one of the allowed types/extensions if uploaded_file and allowed_file(uploaded_file.filename): pond_file = request.files['uploaded_file'] try: reader = DataReader( "") #I don't plan on using this filename, thanks pond_list = reader.readFile( pond_file.read() ) #read method is http://werkzeug.pocoo.org/docs/0.10/datastructures/#werkzeug.datastructures.FileStorage, except Exception as e: print "error in getPondList" print str(e) return render_template(INTERNAL_SERVER_ERROR_TEMPLATE_ROUTE, error=str(e)) ################################################################## #let's try something. AARDVARK <--easy to search for this #(this might be more work than making Pond objects serializable) ################################################################## ##trying http://jsonpickle.github.io/ pickle_pond_list(pond_list) return redirect(url_for("primary_production")) else: error_message = "Apologies, that file extension is not allowed. Please try one of the allowed extensions." return render_template('home_with_error.html', template_file_route=TEMPLATE_FILE_ROUTE, example_file_route=EXAMPLE_FILE_ROUTE, error_message=error_message) return render_template('home.html', template_file_route=TEMPLATE_FILE_ROUTE, example_file_route=EXAMPLE_FILE_ROUTE)
def __init__(self, input_file, vocabulary_file, img_data_file, char2ix_file, output_dir, maxwordlength, emb_dimension, line_batch_size, sample_batch_size, neg_num, window_size, discard, epochs, initial_lr, seed): torch.manual_seed(seed) self.img_data = np.load(img_data_file) self.data = DataReader(input_file, vocabulary_file, char2ix_file, maxwordlength, discard, seed) dataset = Word2vecDataset(self.data, window_size, sample_batch_size, neg_num) self.dataloader = DataLoader(dataset, batch_size=line_batch_size, shuffle=True, num_workers=0, collate_fn=dataset.collate) self.output_dir = output_dir self.emb_size = len(self.data.word2id) self.char_size = len(self.data.char2id) + 1 #5031 self.emb_dimension = emb_dimension self.line_batch_size = line_batch_size self.epochs = epochs self.initial_lr = initial_lr self.VCWE_model = VCWEModel(self.emb_size, self.emb_dimension, self.data.wordid2charid, self.char_size) self.use_cuda = torch.cuda.is_available() self.device = torch.device("cuda" if self.use_cuda else "cpu") self.num_train_steps = int(len(self.dataloader) * self.epochs) if self.use_cuda: self.VCWE_model.cuda()
def __init__(self, input_file, antonym_file, output_file, emb_dimension=100, batch_size=32, window_size=5, iterations=3, initial_lr=0.001, min_count=12): print("Reading input file...") self.data = DataReader(input_file, min_count) dataset = Word2vecDataset(self.data, window_size) print("Creating data batches") self.dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=0, collate_fn=dataset.collate) self.antonym_file = open(antonym_file, 'r') self.output_file_name = output_file self.emb_size = len(self.data.word2id) self.emb_dimension = emb_dimension self.batch_size = batch_size self.iterations = iterations self.initial_lr = initial_lr self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension) self.use_cuda = torch.cuda.is_available() self.device = torch.device("cuda" if self.use_cuda else "cpu") if self.use_cuda: self.skip_gram_model.cuda()
def readData(self, path_to_data, path_to_energy): """ Reads in weather data from a file and stores it """ if path_to_data == None: weather_reader = RandomReader(365 * 24) else: weather_reader = DataReader(path_to_data, path_to_energy) while weather_reader.canGetForecast(): forecast = weather_reader.getForecast( ) #forecast = list of 24 tuples of (windSpeed, sunlight, energy_needed) # store raw numbers self.raw_data.append(copy.deepcopy(forecast[0])) self.energy_needed.append(forecast[0].ERCOT) self.energy_gained.append( (self.calculate_wind_power(forecast[0].windSpeed), self.calculate_solar_power(forecast[0].sunlight), self.calculate_hydro_power())) # calculate features wind_power = 0.0 solar_power = 0.0 hydro_power = 0.0 for weather_tuple in forecast: #convert weather to power wind_power += self.calculate_wind_power( weather_tuple.windSpeed) solar_power += self.calculate_solar_power( weather_tuple.sunlight) hydro_power += self.calculate_hydro_power() self.features.append((wind_power, solar_power, hydro_power)) weather_reader.advanceTime()
def __init__(self, input_file, output_file, emb_dimension=300, batch_size=64, window_size=5, iterations=5, initial_lr=1.0, min_count=5): self.data = DataReader(input_file, min_count) dataset = Word2vecDataset(self.data, window_size) self.dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=0, collate_fn=dataset.collate) self.output_file_name = output_file self.emb_size = len(self.data.word2id) self.emb_dimension = emb_dimension self.batch_size = batch_size self.iterations = iterations self.initial_lr = initial_lr self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension) self.use_cuda = torch.cuda.is_available() self.device = torch.device("cuda" if self.use_cuda else "cpu") if self.use_cuda: print("USING CUDA") self.skip_gram_model.cuda() else: print("CUDA FAIL")
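# Hedged usage sketch (not part of the original source): the enclosing class name is not
# shown in this snippet, so Word2VecTrainer below is a placeholder; corpus.txt,
# embeddings.vec, and the train() method are assumptions about how the trainer is driven.
if __name__ == "__main__":
    trainer = Word2VecTrainer(input_file="corpus.txt",       # assumed corpus path
                              output_file="embeddings.vec")  # assumed output path
    trainer.train()  # assumed entry point that iterates over self.dataloader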
def __init__(self, input_file, output_file, emb_dimension=500, batch_size=32, window_size=5, iterations=5, initial_lr=0.001, min_count=12): self.data = DataReader(input_file, min_count) dataset = PennDataset(self.data, window_size) self.dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=0, collate_fn=dataset.collate) self.output_file_name = output_file self.emb_size = len(self.data.word2id) self.emb_dimension = emb_dimension self.batch_size = batch_size self.iterations = iterations self.initial_lr = initial_lr self.penn_skip_gram_model = PennSkipGramModel(self.emb_size, self.emb_dimension) self.use_cuda = torch.cuda.is_available() self.device = torch.device("cuda" if self.use_cuda else "cpu") if self.use_cuda: self.penn_skip_gram_model.cuda()
def read_data(self): self._logger.info("Reading meta data...") self._reader = DataReader(self._logger) ( self._vocab, self._vocab_size, self._dictionary, self._reverse_dictionary, self._unigrams, self._arts_srcs, self._srcs_ents, self._ents_srcs, ) = self._reader.read_meta_files(self._args.data) with open(self._args.output + "-labels-dict.pkl", "wb") as f: cPickle.dump(self._reverse_dictionary, f, protocol=cPickle.HIGHEST_PROTOCOL) with open(self._args.output + "-vocab-dict.pkl", "wb") as f: cPickle.dump(self._dictionary, f, protocol=cPickle.HIGHEST_PROTOCOL) self._number_of_srcs = len(set(self._srcs_ents.keys())) self._sample_dist()
def run(self, generation): runned_generation = list() data_reader = DataReader.getInstance() X, Y, X_test, X_output = data_reader.read_data() #for each gene of this generation for i in range(0, len(generation)): this_gene = generation[i] # runner is which algorithm will I use: # 0 is XGBoost Classifier # 1 is XGBoost regressor # 2 is SVC # 3 is DecisionTreeClassifier # 4 is AdaBoost applied to DecisionTreeClassifier # 5 is GradientBoosting # 6 is KNeighbors # 7 is RandomForest # 8 is RandomForest but simplified (more defaults and less configuration) runner = None if (this_gene.way == 0): runner = TitanicBoostClassifier() else: if (this_gene.way == 1): runner = TitanicBoostRegressor() else: runner = VariousForests() runner.set_datasets(X, Y, X_test, X_output) runner.set_gene_to_model(this_gene) #here we configure the model this_gene.set_fitness_level(runner.run()) runned_generation.append(this_gene) return runned_generation
def __init__(self):
    self.data_reader = DataReader('data/training_data/training.data',
                                  'data/stopwords/stopwords.txt', True, 1000)
    self.perceptron = Perceptron()
    self.softmax = Softmax()
    # Let's create 5 classifiers
    universe_size = len(self.data_reader.universe)
    self.perceptron_classifiers = [np.zeros((universe_size + 1)) for i in range(5)]
    self.softmax_classifier = np.ones((5, universe_size + 1))
def run_test2(session, m, reader): state = session.run(m.initial_rnn_state) tokenNum = 0 word_vocab, char_vocab, word_tensors, char_tensors, max_word_length = \ load_data(FLAGS.data_dir, FLAGS.max_word_length, eos=FLAGS.EOS) train_reader = DataReader(word_tensors['train'], char_tensors['train'],1, 1) i = 1 for x, y in train_reader.iter(): state = session.run([m.final_rnn_state], { m.input: x, m.targets: y, m.initial_rnn_state: state }) # constructs the word_embedding (which is the input node to the LSTM) # NOTE: each element is an index to a character_embedding. thus, it's # actually a matrix word_embedding = x[0][0] output = "" for w in word_embedding: output = output + str(w) + " " output = output.rstrip() + "," #print ("char_embedding[1]:" + str(session.run(m.char_embedding[1]))) i = i + 1 layer1 = state[0][0] layer2 = state[0][1] layer1_hiddens = layer1[1][0] layer2_hiddens = layer2[1][0] for x in layer1_hiddens: output = output + str(x) + " " output = output.rstrip() + "," for x in layer2_hiddens: output = output + str(x) + " " output = output.rstrip() + "\n" print (output)
def run():
    lines = DataReader.read('car.data.txt')
    training_inputs = DataReader.parse_data(lines)

    print "Initializing Network..."
    my_network = Network(number_of_centers=NUMBER_OF_CENTERS,
                         training=TRAINING_ITERATIONS)
    print "Done."

    print "Starting training. {} centers / {} iterations".\
        format(NUMBER_OF_CENTERS, TRAINING_ITERATIONS)
    my_network.train(training_inputs)
    print "Done."

    # TODO(Accuracy): Test accuracy with non training data.
    right = 0
    total_tests = 100
    for i in range(total_tests):
        chosen = random.choice(training_inputs)
        response = my_network.classify(chosen['inputs'])
        if response == chosen['expected']:
            right += 1
    print "Accuracy => {}/{}".format(right, total_tests)
def generate_score(self): # for item in data: # print Bmat_to_pos_quat(item) num = np.ones([len(self.goals), 1]) reference_options = ['head'] reference = np.zeros([len(self.goals), 1]) print 'Starting to convert data!' run_data = DataReader(subject=self.subject, model=self.model, task=self.task) run_data.receive_input_data(self.goals, num, reference_options, reference) run_data.generate_output_goals() run_data.generate_score(viz_rviz=True, visualize=self.visualize, plot=False)
def __init__(self, visualize=False, subject='any_subject', task='yogurt', model='chair', tf_listener=None): self.model = model self.task = task self.subject = subject baselink_B_liftlink = createBMatrix([-0.05, 0.0, 0.8897], [0, 0, 0, 1]) goals = [[0.301033944729, 0.461276517595, 0.196885866571, 0.553557277528, 0.336724229346, -0.075691681684, 0.757932650828], [0.377839595079, 0.11569018662, 0.0419789999723, 0.66106069088, 0.337429642677, -0.519856214523, 0.422953367233], [0.2741387011303321, 0.005522571699560719, -0.011919598309888757, -0.023580897114171894, 0.7483633417869068, 0.662774596931439, 0.011228696415565394], [0.13608632401364894, 0.003540318703608347, 0.00607600258150498, -0.015224467044577382, 0.7345761465214938, 0.6783020152473445, -0.008513323454022942]] liftlink_B_goal = createBMatrix([0.5309877259429142, 0.4976163448816489, 0.16719537682372823], [0.7765742993649133, -0.37100605554316285, -0.27784851903166524, 0.42671660945891]) data = np.array([baselink_B_liftlink*createBMatrix(goals[0][0:3], goals[0][3:]), # In reference to base link baselink_B_liftlink*createBMatrix(goals[1][0:3], goals[1][3:]), # In reference to base link createBMatrix(goals[2][0:3], goals[2][3:]), createBMatrix(goals[3][0:3], goals[3][3:])]) # This one is in reference to the head for item in data: print Bmat_to_pos_quat(item) # For my information, these are the [xyz] and quaternion [x,y,z,w] for the PoseStamped messages for the goal # positions. The first two have parent tf /base_link. The third has parent link /head # (array([ 0.48098773, 0.49761634, 0.91837238]), array([ 0.7765743 , -0.37100606, -0.27784852, 0.42671661])) # (array([ 0.4598544 , 0.8806009 , 0.65371782]), array([ 0.45253993, 0.53399713, -0.17283745, 0.69295158])) # (array([ 0.2741387 , 0.05522572, -0.0119196 ]), array([-0.0235809 , 0.74836334, 0.6627746 , 0.0112287 ])) num = np.ones([len(data), 1]) reference_options = ['head', 'base_link'] reference = np.array([[1], [1], [0], [0]]) print 'Starting to convert data!' runData = DataReader(subject=self.subject, model=self.model, task=self.task) runData.receive_input_data(data, num, reference_options, reference) runData.generate_output_goals() runData.generate_score(viz_rviz=True, visualize=False, plot=False)
def test(self): batch_size = 4 num_unroll_steps = 3 char_vocab_size = 51 max_word_length = 11 char_embed_size = 3 _, _, word_data, char_data, _ = load_data('data/', max_word_length) dataset = char_data['train'] self.assertEqual(dataset.shape, (929589, max_word_length)) reader = DataReader(word_data['train'], char_data['train'], batch_size=batch_size, num_unroll_steps=num_unroll_steps) for x, y in reader.iter(): assert x.shape == (batch_size, num_unroll_steps, max_word_length) break self.assertAllClose(X, x) with self.test_session() as session: input_ = tf.placeholder(tf.int32, shape=[batch_size, num_unroll_steps, max_word_length], name="input") ''' First, embed characters ''' with tf.variable_scope('Embedding'): char_embedding = tf.get_variable('char_embedding', [char_vocab_size, char_embed_size]) # [batch_size x max_word_length, num_unroll_steps, char_embed_size] input_embedded = tf.nn.embedding_lookup(char_embedding, input_) input_embedded = tf.reshape(input_embedded, [-1, max_word_length, char_embed_size]) session.run(tf.assign(char_embedding, EMBEDDING)) ie = session.run(input_embedded, { input_: x }) output = tdnn(input_embedded, [2], [2], scope='TDNN') out = session.run(output, { input_embedded: ie, 'TDNN/kernel_2/w:0': np.reshape(np.transpose(KERNEL_2_W), [1, 2, num_unroll_steps, 2]), 'TDNN/kernel_2/b:0': KERNEL_2_B }) out = out.reshape([batch_size, num_unroll_steps, 2]) out = out.transpose([1, 0, 2]) # torch uses time-major order self.assertAllClose(out, np.array([ [[-0.04201929, 0.02275813], [-0.04060676, 0.02283999], [-0.04333816, 0.02333505], [-0.04131923, 0.02480407]], [[-0.04124087, 0.02429205], [-0.04117644, 0.02419558], [-0.04282973, 0.02318067], [-0.04131923, 0.02480407]], [[-0.03877186, 0.0243939 ], [-0.04173752, 0.02552123], [-0.04168687, 0.02385954], [-0.04201929, 0.02454825]]])) print(out.shape) print(out) assert False
class MyWord2Vec: def __init__(self, args): logging.basicConfig(level=logging.DEBUG, format="%(asctime)s : %(levelname)s : %(message)s") self._logger = logging.getLogger(__name__) self._logger.info("Initializing Model...") self._logger.info("Reading Args...") self._args = args self._lr = self._args.lr self._data_index = 0 self._context_tensor_size = 0 self._sampled_tensor_size = 0 def read_data(self): self._logger.info("Reading meta data...") self._reader = DataReader(self._logger) ( self._vocab, self._vocab_size, self._dictionary, self._reverse_dictionary, self._unigrams, self._arts_srcs, self._srcs_ents, self._ents_srcs, ) = self._reader.read_meta_files(self._args.data) with open(self._args.output + "-labels-dict.pkl", "wb") as f: cPickle.dump(self._reverse_dictionary, f, protocol=cPickle.HIGHEST_PROTOCOL) with open(self._args.output + "-vocab-dict.pkl", "wb") as f: cPickle.dump(self._dictionary, f, protocol=cPickle.HIGHEST_PROTOCOL) self._number_of_srcs = len(set(self._srcs_ents.keys())) self._sample_dist() def _load_model(self, file_path): with open(file_path, "rb") as f: embeddings = cPickle.load(f) return embeddings def _save_model(self, file_path, embeddings): with open(file_path, "wb") as f: cPickle.dump(embeddings, f, protocol=cPickle.HIGHEST_PROTOCOL) def _sample_dist(self): freq = np.power(self._unigrams / np.sum(self._unigrams), 0.75) # unigrams ^ 3/4 self._dist = freq * (1 / np.sum(freq)) # normalize probabs def _get_samples(self, size): samples = np.random.choice(range(self._vocab_size), size, p=self._dist) return samples def _plot(self, title, embeddings): self._logger.debug("Plotting...") pca = PCA(n_components=2) pca.fit(embeddings) low_dim_embs = pca.transform(embeddings) labels = [self._reverse_dictionary[key] for key in xrange(self._vocab_size)] for label, x, y in zip(labels, low_dim_embs[:, 0], low_dim_embs[:, 1]): plt.plot(x, y, "x") if title != "final": plt.annotate(label, xy=(x, y), fontsize="xx-small") else: plt.annotate(label, xy=(x, y)) if title is "final": plt.show() else: file = "fig-%s.eps" % title plt.savefig(file, format="eps", dpi=1200) plt.clf() def _build_graph(self): self._logger.info("Building tf graph...") self.graph = tf.Graph() with self.graph.as_default(): self.make_vars() self.build_expr() self.optimize() def make_vars(self): init_width = 0.5 / self._args.emb_size # Shared variables holding input and output embeddings self.inp_embeddings = tf.Variable( tf.random_uniform([self._vocab_size, self._args.emb_size], -init_width, init_width) ) self.out_embeddings = tf.Variable( tf.random_uniform([self._vocab_size, self._args.emb_size], -init_width, init_width) ) def build_expr(self): self.inp_ctx = tf.placeholder(tf.int32, shape=(None)) self.out_ctx = tf.placeholder(tf.int32, shape=(None)) self.inp_neg = tf.placeholder(tf.int32, shape=(None)) self.out_neg = tf.placeholder(tf.int32, shape=(None)) self.out_ents = tf.placeholder(tf.int32, shape=(None)) self.other_ents = tf.placeholder(tf.int32, shape=(None)) ctx_batch_size = tf.shape(self.inp_ctx)[0] neg_batch_size = tf.shape(self.out_ctx)[0] ents_constant = tf.shape(self.out_ents)[0] src_constnt = tf.constant(self._number_of_srcs, dtype=tf.float32) # embedding lookups to get vectors of specified indices (by placeholders) embed_inp_ctx = tf.nn.embedding_lookup(self.inp_embeddings, self.inp_ctx) embed_out_ctx = tf.nn.embedding_lookup(self.out_embeddings, self.out_ctx) embed_inp_neg = tf.nn.embedding_lookup(self.inp_embeddings, self.inp_neg) embed_out_neg = tf.nn.embedding_lookup(self.out_embeddings, 
self.out_neg) embed_entities = tf.nn.embedding_lookup(self.out_embeddings, self.out_ents) embed_other_entities = tf.nn.embedding_lookup(self.out_embeddings, self.other_ents) dot_ctx = tf.mul(embed_inp_ctx, embed_out_ctx) sum_ctx = tf.reduce_sum(dot_ctx, 1) ctx_expr = tf.log(tf.sigmoid(sum_ctx)) / tf.cast(ctx_batch_size, tf.float32) dot_neg = tf.mul(embed_inp_neg, embed_out_neg) sum_neg = tf.reduce_sum(dot_neg, 1) neg_expr = tf.log(tf.sigmoid(-sum_neg)) / tf.cast(neg_batch_size, tf.float32) avg_ents = tf.div(tf.reduce_sum(embed_other_entities, 1), src_constnt) ents_diff = tf.square(tf.sub(embed_entities, avg_ents)) reg_expr = self._args.regularizer * tf.reduce_sum(ents_diff) / tf.cast(ents_constant, tf.float32) self.loss = tf.reduce_sum(ctx_expr) + tf.reduce_sum(neg_expr) - reg_expr def optimize(self): optimizer = tf.train.GradientDescentOptimizer(self._lr) self.train = optimizer.minimize(-self.loss, gate_gradients=optimizer.GATE_NONE) def lr_decay(self): decay_factor = 10.0 * (5.0 / float(self._args.epochs)) lr = np.maximum(0.0001, self._lr / decay_factor) self._lr = round(lr, 4) def _ents_matrices(self): self._logger.info("Preparing named entites for this source") # get political entities source_entities = np.array(self._srcs_ents[self._current_source]) corresponding_ents = list() padding_index = self._dictionary["UNK"] # get corresponding entities and replace tokens by ids for ent in source_entities: base_ent = ent.split("_", -1)[0] temp = np.array(self._ents_srcs[base_ent]) """ TODO: Remve entity from its correspondings list """ for curr_ent in temp: temp[temp == curr_ent] = self._dictionary[curr_ent] temp = temp.astype(int).tolist() temp += [padding_index] * (self._number_of_srcs - len(temp)) corresponding_ents.append(temp) # replace entities' tokens by ids source_ents_ids = source_entities for ent in source_ents_ids: source_ents_ids[source_ents_ids == ent] = self._dictionary[ent] source_ents_ids = source_ents_ids.astype(int) self._current_entities = source_ents_ids self._corresponding_ents = corresponding_ents def generate_batch(self): context_words = [] sampled_words = [] # get current batch, curr_index: curr_index + batch_size current_data_batch = self._data[self._data_index : self._data_index + self._args.batch_size] self._data_index += self._args.batch_size % self._data_size # add extra UNKs for padding context windows padding_index = self._dictionary["UNK"] lpadded = ( self._args.window // 2 * [padding_index] + current_data_batch + self._args.window // 2 * [padding_index] ) for idx, word in enumerate(current_data_batch): context = lpadded[idx : (idx + self._args.window)] samples = self._get_samples(self._args.samples) context_words += zip([word] * len(context), context) sampled_words += zip([word] * len(samples), samples) inp_ctx, out_ctx = zip(*context_words) inp_neg, out_neg = zip(*sampled_words) feed_dict = { self.out_ents: self._current_entities, self.other_ents: self._corresponding_ents, self.inp_ctx: inp_ctx, self.out_ctx: out_ctx, self.inp_neg: inp_neg, self.out_neg: out_neg, } return feed_dict def _prepare_file(self, file_path): data = np.array(self._reader.read_file(file_path, self._dictionary)) self._data = data.astype(int).tolist() self._data_size = len(data) def train(self): if os.path.exists(self._args.output): embeddings = self._load_model(self._args.output) self._plot("final", embeddings) return self._build_graph() self._logger.info("Starting training ...") with tf.Session(graph=self.graph) as sess: tf.initialize_all_variables().run() first_start = time.time() 
start = time.time() for epoch in xrange(1, self._args.epochs + 1): self._logger.info( "[*] training, epoch num: %d, out of %d with learning rate: %f" % (epoch, self._args.epochs, self._lr) ) total_batches = 0 batches_so_far = 0 avg = 0 for file_path in self._arts_srcs: self._current_source = self._arts_srcs[file_path] self._ents_matrices() self._logger.info("Reading file %s" % file_path) self._prepare_file(file_path) file_batches = self._data_size / self._args.batch_size check_point = file_batches / 4 total_batches += file_batches for batch in xrange(file_batches): batches_so_far += 1 feed_dict = self.generate_batch() cost, _ = sess.run([self.loss, self.train], feed_dict=feed_dict) # if math.isnan(cost) or math.isinf(cost): # self._logger.info('[*] Encountered NaN or Inf, stopping training') # final_embeddings = prev_emb.eval() # break avg += cost # if batch % check_point == 0 and batch != 0: self._logger.info( "\t[*][*] batch %s out of %s, avg cost=%s, time so far: %ds" % (batch, file_batches, avg / batches_so_far, int(time.time() - start)) ) self._data_index = 0 self._logger.info( "[*] Done file %s, avg cost=%s, time taken: %ds " % (file_path, avg / file_batches, int(time.time() - start)) ) avg /= total_batches self._logger.info( "[*] Done epoch %s out of %s, avg cost=%s, time taken: %ds " % (epoch, self._args.epochs, avg, int(time.time() - start)) ) avg = 0 self.lr_decay() print "________________________________________________\n" self._logger.info("[*] Total training time: %ds" % int(time.time() - first_start)) final_embeddings = self.out_embeddings.eval() self._save_model(self._args.output, final_embeddings) self._plot("final", final_embeddings)
def __init__(self, visualize_best=False, train_subj=6, test_subj=6): output_raw_scores = False # compare = True self.visualize_best = visualize_best self.tf_listener = tf.TransformListener() self.train_subj = train_subj self.test_subj = test_subj print 'I will use data that was trained on subject ', self.train_subj print 'I will test on data from subject ', self.test_subj self.task = 'shaving' # options are: bathing, brushing, feeding, shaving, scratching_upper_arm/forearm/thigh/chest/knee self.model = 'chair' # options are: 'chair', 'bed', 'autobed' pos_clust = 2 ori_clust = 2 self.mc_simulation_number = None self.visualize = False data_start = 0 data_finish = 'end ' # 2000 # 4000 #'end' rospack = rospkg.RosPack() self.pkg_path = rospack.get_path('hrl_base_selection') print 'Loading scores.' self.loaded_scores = self.load_task(self.task, self.model, self.train_subj) if self.loaded_scores is None: print 'The scores do not exist. Must generate scores! This may take a long time...' self.generate_scores(data_start, data_finish, pos_clust, ori_clust) print 'Scores generated. I will now continue.' print 'Now loading the scores I just generated' self.loaded_scores = self.load_task(self.task, self.model, self.train_subj) if self.loaded_scores is None: print 'The scores still do not exist. This is bad. Fixes needed in code.' return headx = 0 heady = 0 self.scores = self.loaded_scores[headx, heady] if output_raw_scores: self.output_scores() subject = ''.join(['sub', str(self.test_subj), '_shaver']) print 'Reading in raw data from the task.' read_task_data = DataReader_Task(self.task, self.model) raw_data, raw_num, raw_reference, self.raw_reference_options = read_task_data.reset_goals() read_data = DataReader(subject=subject, data_start=data_start, reference_options=self.raw_reference_options, data_finish=data_finish, model=self.model, task=self.task, tf_listener=self.tf_listener) # raw_data = read_data.get_raw_data() print 'Raw data is ready!' self.goal_data = read_data.generate_output_goals(test_goals=raw_data, test_number=raw_num, test_reference=raw_reference) # print 'Setting up openrave' # self.setup_openrave() # print 'I will now pick base locations to evaluate. They will share the same reachability score, but will have' \ # ' differing manipulability scores.' # print 'before sorting:' # for i in xrange(10): # print self.scores[i] self.scores = np.array(sorted(self.scores, key=lambda t: (t[1][1], t[1][2]), reverse=True)) # print 'after sorting:' # for i in xrange(10): # print self.scores[i] self.best_base = self.scores[0] if self.best_base[1][1] == 0: print 'There are no base locations with reachable goals. Something went wrong in the scoring or the setup' print 'The best base location is: \n', self.best_base if visualize_best: read_data.pub_rviz() self.visualize_base_config(self.best_base, self.goal_data, self.raw_reference_options)
class APAProject(object): def __init__(self): self.data_reader = DataReader('data/training_data/training.data', 'data/stopwords/stopwords.txt', True, 1000) self.perceptron = Perceptron() self.softmax = Softmax() # Let's create 5 classifiers universe_size = len(self.data_reader.universe) self.perceptron_classifiers = [np.zeros((universe_size + 1)) for i in range(5)] self.softmax_classifier = np.ones((5, universe_size + 1)) def file_to_data_set(self, file): data_set = [] with open(file) as data: for line in data: _, score, sentence = line.split('|') score = float(score) # Calculating train target: # 0 if 0 < score <= 0.2, 1 if 0.2 < score <= 0.4, etc... class_number = math.floor(score * 5) sentence_vector = self.data_reader.get_sentence_coordinates(sentence) data_set.append((sentence_vector, class_number)) return data_set def train_perceptron(self): start_time = time.time() print "Starting training session ..." # We need to read data from datasmall and train the perceptron training_data_set = self.file_to_data_set('data/training_data/training.data') PERIODS = 5 for i in range(PERIODS): # For each period, reshuffle random.shuffle(training_data_set) # We train every classfier for (classifier_index, classifier) in enumerate(self.perceptron_classifiers): self.perceptron_classifiers[classifier_index], updates = self.perceptron.train_epoch(training_data_set, classifier_index, classifier) self.test_perceptron_multiclass() training_end_time = time.time() training_duration = training_end_time - start_time print "Training session finished: duration %s seconds" % training_duration def test_perceptron(self): print "Starting testing session..." test_data_set = self.file_to_data_set('data/test_data/test.data') for (classifier_index, classifier) in enumerate(self.perceptron_classifiers): error_count, success_count = self.perceptron.test_classifier(test_data_set, classifier, classifier_index) print "Classifier %s just finished. %s%% results are good" % ((classifier_index + 1), success_count * 100 / (success_count + error_count)) def test_perceptron_multiclass(self): print "Starting testing session..." test_data_set = self.file_to_data_set('data/test_data/test.data') success_count = 0 error_count = 0 for (sentence_vector, class_number) in test_data_set: results_classifiers = [] test_class = -1 for (classifier_index, classifier) in enumerate(self.perceptron_classifiers): results_classifiers.append(np.dot(classifier, sentence_vector)) if results_classifiers.index(max(results_classifiers)) == class_number: success_count += 1 else: error_count += 1 print "Classifier just finished. %s/%s ~= %s%% results are good" % (success_count, (error_count + success_count), success_count * 100 / (success_count + error_count)) def train_softmax(self): start_time = time.time() print "Starting softmax training session..." # We need to read data from datasmall and train the perceptron training_data_set = self.file_to_data_set('data/training_data/training.data') PERIODS = 10 for i in range(PERIODS): random.shuffle(training_data_set) # On apprend PERIODS fois et a chaque passage on test le classifier pour etudier l'evolution # Rappel : self.softmax_classifier = np.ones((5, universe_size)) self.softmax_classifier = self.softmax.train_epoch(self.softmax_classifier, training_data_set) self.test_softmax() training_end_time = time.time() training_duration = training_end_time - start_time print "Training session finished: duration %s seconds" % training_duration def test_softmax(self): print "Starting softmax testing session..." 
test_data_set = self.file_to_data_set('data/test_data/test.data') #test_data_set = self.file_to_data_set('data/training_data/training.data') error_count, success_count = self.softmax.test_classifier(self.softmax_classifier, test_data_set) print "Classifier just finished. %s/%s ~= %s%% results are good" % (success_count, (error_count + success_count), success_count * 100 / (success_count + error_count))
def main(_):
    ''' Trains model from data '''

    if not os.path.exists(FLAGS.train_dir):
        os.mkdir(FLAGS.train_dir)
        print('Created training directory', FLAGS.train_dir)

    word_vocab, char_vocab, word_tensors, char_tensors, max_word_length = \
        load_data(FLAGS.data_dir, FLAGS.max_word_length, eos=FLAGS.EOS)

    train_reader = DataReader(word_tensors['train'], char_tensors['train'],
                              FLAGS.batch_size, FLAGS.num_unroll_steps)
    valid_reader = DataReader(word_tensors['valid'], char_tensors['valid'],
                              FLAGS.batch_size, FLAGS.num_unroll_steps)
    test_reader = DataReader(word_tensors['test'], char_tensors['test'],
                             FLAGS.batch_size, FLAGS.num_unroll_steps)

    print('initialized all dataset readers')

    with tf.Graph().as_default(), tf.Session() as session:

        # tensorflow seed must be inside graph
        tf.set_random_seed(FLAGS.seed)
        np.random.seed(seed=FLAGS.seed)

        ''' build training graph '''
        initializer = tf.random_uniform_initializer(-FLAGS.param_init, FLAGS.param_init)
        with tf.variable_scope("Model", initializer=initializer):
            train_model = model.inference_graph(
                    char_vocab_size=char_vocab.size,
                    word_vocab_size=word_vocab.size,
                    char_embed_size=FLAGS.char_embed_size,
                    batch_size=FLAGS.batch_size,
                    num_highway_layers=FLAGS.highway_layers,
                    num_rnn_layers=FLAGS.rnn_layers,
                    rnn_size=FLAGS.rnn_size,
                    max_word_length=max_word_length,
                    kernels=eval(FLAGS.kernels),
                    kernel_features=eval(FLAGS.kernel_features),
                    num_unroll_steps=FLAGS.num_unroll_steps,
                    dropout=FLAGS.dropout)
            train_model.update(model.loss_graph(train_model.logits, FLAGS.batch_size, FLAGS.num_unroll_steps))

            # Scaling the loss by FLAGS.num_unroll_steps scales the gradients by the same factor.
            # This reproduces how the original Torch code optimizes: without it the gradients would be
            # num_unroll_steps times smaller (e.g. 35x), and learning_rate and max_grad_norm would have to
            # be rescaled to compensate. Scaling here keeps this trainer compatible with the original.
            train_model.update(model.training_graph(train_model.loss * FLAGS.num_unroll_steps,
                    FLAGS.learning_rate, FLAGS.max_grad_norm))

        # create saver before creating more graph nodes, so that we do not save any vars defined below
        saver = tf.train.Saver(max_to_keep=50)

        ''' build graph for validation and testing (shares parameters with the training graph!) '''
        with tf.variable_scope("Model", reuse=True):
            valid_model = model.inference_graph(
                    char_vocab_size=char_vocab.size,
                    word_vocab_size=word_vocab.size,
                    char_embed_size=FLAGS.char_embed_size,
                    batch_size=FLAGS.batch_size,
                    num_highway_layers=FLAGS.highway_layers,
                    num_rnn_layers=FLAGS.rnn_layers,
                    rnn_size=FLAGS.rnn_size,
                    max_word_length=max_word_length,
                    kernels=eval(FLAGS.kernels),
                    kernel_features=eval(FLAGS.kernel_features),
                    num_unroll_steps=FLAGS.num_unroll_steps,
                    dropout=0.0)
            valid_model.update(model.loss_graph(valid_model.logits, FLAGS.batch_size, FLAGS.num_unroll_steps))

        with tf.variable_scope("Model", reuse=True):
            test_model = model.inference_graph(
                    char_vocab_size=char_vocab.size,
                    word_vocab_size=word_vocab.size,
                    char_embed_size=FLAGS.char_embed_size,
                    batch_size=1,
                    num_highway_layers=FLAGS.highway_layers,
                    num_rnn_layers=FLAGS.rnn_layers,
                    rnn_size=FLAGS.rnn_size,
                    max_word_length=max_word_length,
                    kernels=eval(FLAGS.kernels),
                    kernel_features=eval(FLAGS.kernel_features),
                    num_unroll_steps=1,
                    dropout=0.0)
            test_model.update(model.loss_graph(test_model.logits, 1, 1))

        if FLAGS.load_model:
            saver.restore(session, FLAGS.load_model)
            print('Loaded model from', FLAGS.load_model, 'saved at global step', train_model.global_step.eval())
        else:
            tf.initialize_all_variables().run()
            print('Created and initialized fresh model. Size:', model.model_size())

        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, graph=session.graph)

        ''' take learning rate from CLI, not from saved graph '''
        session.run(tf.assign(train_model.learning_rate, FLAGS.learning_rate))

        def clear_char_embedding_padding():
            # zero out the embedding of the padding character (row 0)
            char_embedding = session.run(train_model.char_embedding)
            char_embedding[0, :] = 0.0
            session.run(tf.assign(train_model.char_embedding, char_embedding))

        char_embedding = session.run(train_model.char_embedding)
        clear_char_embedding_padding()

        run_test2(session, test_model, train_reader)
        #exit(1)

        ''' training starts here '''
        best_valid_loss = None
        rnn_state = session.run(train_model.initial_rnn_state)
        for epoch in range(FLAGS.max_epochs):

            avg_train_loss = 0.0
            count = 0
            for x, y in train_reader.iter():
                count += 1
                start_time = time.time()

                loss, _, rnn_state, gradient_norm, step = session.run([
                    train_model.loss,
                    train_model.train_op,
                    train_model.final_rnn_state,
                    train_model.global_norm,
                    train_model.global_step,
                ], {
                    train_model.input: x,
                    train_model.targets: y,
                    train_model.initial_rnn_state: rnn_state
                })

                clear_char_embedding_padding()

                # exponential moving average of the training loss
                avg_train_loss += 0.05 * (loss - avg_train_loss)

                time_elapsed = time.time() - start_time

                if count % FLAGS.print_every == 0:
                    print('%6d: %d [%5d/%5d], train_loss/perplexity = %6.8f/%6.7f, secs/batch = %.4fs, grad.norm=%6.8f' % (
                        step, epoch, count, train_reader.length, loss, np.exp(loss), time_elapsed, gradient_norm))

            # epoch done: time to evaluate
            avg_valid_loss = 0.0
            count = 0
            rnn_state = session.run(valid_model.initial_rnn_state)
            for x, y in valid_reader.iter():
                count += 1
                start_time = time.time()

                loss, rnn_state = session.run([
                    valid_model.loss,
                    valid_model.final_rnn_state
                ], {
                    valid_model.input: x,
                    valid_model.targets: y,
                    valid_model.initial_rnn_state: rnn_state,
                })

                if count % FLAGS.print_every == 0:
                    print("\t> validation loss = %6.8f, perplexity = %6.8f" % (loss, np.exp(loss)))
                avg_valid_loss += loss / valid_reader.length

            print("at the end of epoch:", epoch)
            print("train loss = %6.8f, perplexity = %6.8f" % (avg_train_loss, np.exp(avg_train_loss)))
            print("validation loss = %6.8f, perplexity = %6.8f" % (avg_valid_loss, np.exp(avg_valid_loss)))

            save_as = '%s/epoch%03d_%.4f.model' % (FLAGS.train_dir, epoch, avg_valid_loss)
            saver.save(session, save_as)
            print('Saved model', save_as)

            ''' write out summary events '''
            summary = tf.Summary(value=[
                tf.Summary.Value(tag="train_loss", simple_value=avg_train_loss),
                tf.Summary.Value(tag="valid_loss", simple_value=avg_valid_loss)
            ])
            summary_writer.add_summary(summary, step)

            ''' decide if need to decay learning rate '''
            if best_valid_loss is not None and np.exp(avg_valid_loss) > np.exp(best_valid_loss) - FLAGS.decay_when:
                print('** validation perplexity did not improve enough, decay learning rate')
                current_learning_rate = session.run(train_model.learning_rate)
                print('learning rate was:', current_learning_rate)
                current_learning_rate *= FLAGS.learning_rate_decay
                if current_learning_rate < 1.e-5:
                    print('learning rate too small - stopping now')
                    break
                session.run(train_model.learning_rate.assign(current_learning_rate))
                print('new learning rate is:', current_learning_rate)
            else:
                best_valid_loss = avg_valid_loss

            run_test2(session, test_model, train_reader)
            print("AGAIN")
            run_test2(session, test_model, train_reader)
def setUp(self):
    DataReader.createHostsFromFile()
    DataReader.createInstancesFromFile()
def main(_):
    ''' Loads trained model and evaluates it on test split '''

    if FLAGS.load_model is None:
        print('Please specify checkpoint file to load model from')
        return -1

    if not os.path.exists(FLAGS.load_model):
        print('Checkpoint file not found', FLAGS.load_model)
        return -1

    word_vocab, char_vocab, word_tensors, char_tensors, max_word_length = \
        load_data(FLAGS.data_dir, FLAGS.max_word_length, eos=FLAGS.EOS)

    test_reader = DataReader(word_tensors['test'], char_tensors['test'],
                             FLAGS.batch_size, FLAGS.num_unroll_steps)

    print('initialized test dataset reader')

    with tf.Graph().as_default(), tf.Session() as session:

        # tensorflow seed must be inside graph
        tf.set_random_seed(FLAGS.seed)
        np.random.seed(seed=FLAGS.seed)

        ''' build inference graph '''
        with tf.variable_scope("Model"):
            m = model.inference_graph(
                    char_vocab_size=char_vocab.size,
                    word_vocab_size=word_vocab.size,
                    char_embed_size=FLAGS.char_embed_size,
                    batch_size=FLAGS.batch_size,
                    num_highway_layers=FLAGS.highway_layers,
                    num_rnn_layers=FLAGS.rnn_layers,
                    rnn_size=FLAGS.rnn_size,
                    max_word_length=max_word_length,
                    kernels=eval(FLAGS.kernels),
                    kernel_features=eval(FLAGS.kernel_features),
                    num_unroll_steps=FLAGS.num_unroll_steps,
                    dropout=0)
            m.update(model.loss_graph(m.logits, FLAGS.batch_size, FLAGS.num_unroll_steps))

            global_step = tf.Variable(0, dtype=tf.int32, name='global_step')

        saver = tf.train.Saver()
        saver.restore(session, FLAGS.load_model)
        print('Loaded model from', FLAGS.load_model, 'saved at global step', global_step.eval())

        ''' evaluation starts here '''
        rnn_state = session.run(m.initial_rnn_state)
        count = 0
        avg_loss = 0
        start_time = time.time()
        for x, y in test_reader.iter():
            count += 1
            loss, rnn_state = session.run([
                m.loss,
                m.final_rnn_state
            ], {
                m.input: x,
                m.targets: y,
                m.initial_rnn_state: rnn_state
            })

            avg_loss += loss

        avg_loss /= count
        time_elapsed = time.time() - start_time

        print("test loss = %6.8f, perplexity = %6.8f" % (avg_loss, np.exp(avg_loss)))
        print("test samples:", count * FLAGS.batch_size, "time elapsed:", time_elapsed,
              "time per one batch:", time_elapsed / count)
def testCreateHostsFromFile(self):
    DataReader.createHostsFromFile(TestDataReader.TEST_HOSTS_FILE)
    self.assertEquals(len(DataReader.hosts), 8)
def testCreateHostsFileError(self):
    # the original called self.assertRaises(Exception) without a callable, which asserts nothing;
    # assuming the intent was that this read should raise, wrap the call itself
    with self.assertRaises(Exception):
        DataReader.createHostsFromFile(TestDataReader.TEST_HOSTS_FILE)
def testFindHostByID(self):
    host = DataReader.findHostByID(2)
    self.assertIsNotNone(host)
    host = DataReader.findHostByID(-1)
    self.assertIsNone(host)
def initData(hfile, ifile):
    setup()
    logging.info('INITIALIZING DATA: Reading host and instance files')
    DataReader.createHostsFromFile(hfile)
    DataReader.createInstancesFromFile(ifile)
def testCreateInstancesFromFile(self):
    DataReader.createInstancesFromFile(TestDataReader.TEST_INSTANCES_FILE)
    self.assertEquals(len(DataReader.instances), 15)
    for i in DataReader.instances:
        self.assertIsNotNone(i.host)
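The tests above assume a class-level DataReader API with createHostsFromFile, createInstancesFromFile, findHostByID and the hosts/instances lists. A minimal sketch of what that interface might look like, assuming simple line-based host and instance files in which each instance record names its host; the file formats, field names, and Host/Instance classes here are assumptions for illustration, not taken from the tests:

class Host(object):
    def __init__(self, host_id):
        self.id = host_id

class Instance(object):
    def __init__(self, instance_id, host):
        self.id = instance_id
        self.host = host

class DataReader(object):
    # class-level containers, matching how the tests access DataReader.hosts / DataReader.instances
    hosts = []
    instances = []

    @classmethod
    def createHostsFromFile(cls, hosts_file='hosts.txt'):
        # assumed format: one integer host id per line
        with open(hosts_file) as f:
            cls.hosts = [Host(int(line.strip())) for line in f if line.strip()]

    @classmethod
    def createInstancesFromFile(cls, instances_file='instances.txt'):
        # assumed format: "instance_id,host_id" per line; each instance is linked to its host
        cls.instances = []
        with open(instances_file) as f:
            for line in f:
                if not line.strip():
                    continue
                instance_id, host_id = line.strip().split(',')
                cls.instances.append(Instance(int(instance_id), cls.findHostByID(int(host_id))))

    @classmethod
    def findHostByID(cls, host_id):
        # returns None when no host matches, as testFindHostByID expects
        for host in cls.hosts:
            if host.id == host_id:
                return host
        return None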
# params for SVC
kernel = 'rbf'
C = 1
gamma = 10

# Choose a classifier
alg = RandomForestClassifier(n_estimators=number_of_trees, criterion='entropy', max_features='log2')
# alg = SVC(kernel=kernel, C=C, gamma=gamma)
# alg = SVR()
# alg = LinearSVR()

visualize_xyz_example = False
visualize_interpolated = False
training_enabled = True

data_reader = DataReader(xyz)

print "Parsing data..."
data, labels = data_reader.parse(fname)
labels = np.array(labels)
labels[labels < 1] = -1

if (visualize_xyz_example):
    timestamps = np.arange(train_frame_start, train_frame_end + train_sparseness, train_sparseness)
    visualize_count = 3
    visualized = 0
    for i in xrange(0, len(labels)):
        # plot a few positive examples (after remapping, labels are +1/-1, so test for +1 explicitly)
        if (labels[i] == 1 and visualized < visualize_count):
            plt.figure(figsize=(20, 10))
            plt.plot(timestamps, data[i][1], 'r')
            plt.plot(timestamps, data[i][2], 'g')
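The snippet breaks off before the training_enabled branch runs. A minimal, hypothetical sketch of how parsed windows like these could be fed to a RandomForestClassifier with cross-validation; the synthetic data shape, the flattening of per-axis series into one feature vector, and the 5-fold split are assumptions for illustration, not part of the original code:

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# stand-in for the parsed output: 100 windows, 3 axes x 50 samples each, labels in {-1, +1}
rng = np.random.RandomState(0)
data = rng.randn(100, 3, 50)
labels = rng.choice([-1, 1], size=100)

# flatten each window's per-axis series into a single feature vector
X = data.reshape(len(data), -1)

alg = RandomForestClassifier(n_estimators=100, criterion='entropy', max_features='log2')

# 5-fold cross-validated accuracy for the chosen classifier
scores = cross_val_score(alg, X, labels, cv=5)
print("cross-validated accuracy: %.3f +/- %.3f" % (scores.mean(), scores.std()))

# fit on all labeled data once the setup looks reasonable
alg.fit(X, labels)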