def main():
    args = parse_args()
    nn = network.NeuralNetwork(
        input_count=dataset.IMAGE_SIZE,
        hidden_count=args.hidden_count,
        output_count=dataset.LABEL_COUNT,
        learning_rate=args.learning_rate,
    )
    try:
        print('Training...')
        for epoch in range(1, args.epochs + 1):
            print(f'Epoch {epoch}')
            train_set = dataset.read_dataset(args.train_file)
            nn.train(itertools.islice(train_set.pairs, 0, args.train_limit))
    except KeyboardInterrupt:
        pass

    print('Testing...')
    test_set = dataset.read_dataset(args.test_file)
    results = []
    for img, label in itertools.islice(test_set.pairs, 0, args.test_limit):
        output = nn.query(img)
        output_label = np.argmax(output)
        results.append(output_label == np.argmax(label))
    # The mean of correct/incorrect predictions is the accuracy, not precision.
    print(f'Test set accuracy: {np.mean(results)}')

    print('Dumping neural net...')
    with open('model.bin', 'wb') as model_file:
        pickle.dump(nn, model_file)
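# Hypothetical companion sketch (not part of the original script): reload the
# model pickled above and classify a single test image. It assumes the same
# `network`/`dataset` modules are importable and uses only calls shown in the
# snippet; the test-file path is a placeholder.
import itertools
import pickle

import numpy as np

import dataset

with open('model.bin', 'rb') as model_file:
    nn = pickle.load(model_file)          # requires the `network` module on the path

test_set = dataset.read_dataset('test.csv')  # placeholder path
img, label = next(itertools.islice(test_set.pairs, 0, 1))
print('predicted:', np.argmax(nn.query(img)), 'actual:', np.argmax(label))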
def main(args): """ Entry point: train or test. """ json.dump(vars(args), open(os.path.join(args.output_dir, 'config.json'), 'w')) if args.gpu_id == -1: ctx = mx.cpu() else: ctx = mx.gpu(args.gpu_id) mx.random.seed(args.seed, ctx=ctx) if args.mode == 'train': train_dataset = read_dataset(args, 'train_file') val_dataset = read_dataset(args, 'test_file') vocab_path = os.path.join(args.output_dir, 'vocab.jsons') if os.path.exists(vocab_path): vocab = nlp.Vocab.from_json(open(vocab_path).read()) else: vocab = build_vocab(train_dataset) with open(vocab_path, 'w') as fout: fout.write(vocab.to_json()) glove = nlp.embedding.create(args.embedding, source=args.embedding_source) vocab.set_embedding(glove) train_data_loader = prepare_data_loader(args, train_dataset, vocab) val_data_loader = prepare_data_loader(args, val_dataset, vocab, test=True) model = NLIModel(len(vocab), args.embedding_size, args.hidden_size, args.dropout, args.intra_attention) train_model(model, train_data_loader, val_data_loader, vocab.embedding, ctx, args) elif args.mode == 'test': model_args = argparse.Namespace( **json.load(open(os.path.join(args.model_dir, 'config.json')))) vocab = nlp.Vocab.from_json( open(os.path.join(args.model_dir, 'vocab.jsons')).read()) val_dataset = read_dataset(args, 'test_file') val_data_loader = prepare_data_loader(args, val_dataset, vocab, test=True) model = NLIModel(len(vocab), model_args.embedding_size, model_args.hidden_size, 0., model_args.intra_attention) model.load_parameters(os.path.join(args.model_dir, 'checkpoints', 'valid_best.params'), ctx=ctx) loss_func = gluon.loss.SoftmaxCrossEntropyLoss() logger.info('Test on {}'.format(args.test_file)) loss, acc = test_model(model, val_data_loader, loss_func, ctx) logger.info('loss={:.4f} acc={:.4f}'.format(loss, acc))
def main():
    train_data = read_dataset('./penn.train.pos.gz')
    print(len(train_data))
    word_map = dict()
    tagger_map = dict()
    for d in train_data:
        for w in d[0]:
            if w not in word_map:
                word_map[w] = len(word_map)
        for t in d[1]:
            if t not in tagger_map:
                tagger_map[t] = len(tagger_map)
    word_map['BBBBB'] = len(word_map)
    word_map['UNKNOWNWORD'] = len(word_map)

    config = dict()
    config['window_size'] = 2
    config['embedding_size'] = 100
    config['layer_size'] = [1 + 2 * config['window_size'], len(tagger_map)]
    config['batch_size'] = 500
    config['dev_every_i'] = 10000

    train_data_x, train_data_y = construct_training_data(
        train_data, config['window_size'], word_map, tagger_map)
    dev_data_x, dev_data_y = construct_training_data(
        read_dataset('./penn.devel.pos.gz'), config['window_size'], word_map, tagger_map)
    test_data_x, _ = construct_training_data(
        read_dataset('./penn.devel.pos.gz'), config['window_size'], word_map, tagger_map)

    model = mlp_postagger_model(config)
    print('Train Size:', len(train_data_x))
    print('Dev Size:', len(dev_data_x))
    print("Initial dev accuracy %g" % model.develop(dev_data_x, dev_data_y))

    avg_loss = 0.0
    begin = 0
    done = False
    total_trained_instances = 0.0
    last_dev = 0.0
    while not done:
        end = begin + config['batch_size']
        if end > len(train_data_x):
            end = len(train_data_x)
        avg_loss += model.train(train_data_x[begin:end], train_data_y[begin:end])
        total_trained_instances += end - begin
        if total_trained_instances - last_dev > config['dev_every_i']:
            dev_accuracy = model.develop(dev_data_x, dev_data_y)
            print('Ratio %0.2f\tAccuracy %f' %
                  (total_trained_instances / len(train_data_x), dev_accuracy))
            labels = model.predict(test_data_x)
            last_dev = total_trained_instances
        begin = end
        if begin >= len(train_data_x):
            begin = 0
def train_and_save():
    x = tf.placeholder(tf.float32, shape=[None, INPUT_SAMPLE_LENGHT],
                       name=INPUT_TENSOR_NAME)
    y_ = tf.placeholder(tf.float32, shape=[None, NUMBER_OF_CLASSES],
                        name=LABEL_TENSOR_NAME)
    logits = create_computation_graph(x)
    output = add_predictor(logits)

    data = suffle_dataset(read_dataset(FEATURE_FILENAME))
    data.train_labels = to_one_hot(data.train_labels)
    train_op, loss_op = make_traingin_step(logits, y_)

    init = tf.global_variables_initializer()
    saver = tf.train.Saver()
    with tf.Session() as sess:
        sess.run(init)
        train_model(sess, x, y_, train_op, data, loss_op)
        saver.save(sess, TRAIN_MODEL_SAVE_NAME)
        tf.train.write_graph(sess.graph_def, "res", "tgraph.pb", as_text=False)
        save_labels(MODEL_METAFILE_NAME, data)
        data.train_labels = from_one_hot(data.train_labels)
        print_acc(sess, output, x, data)
def get_dataset(self, feat_path_i): adapt_dataset = dataset.read_dataset(feat_path_i) full_dataset = dataset.unify_datasets( [adapt_dataset, self.basic_dataset]) preproc(full_dataset, self.norm, self.n_feats) print(full_dataset.X.shape) return full_dataset
def __call__(self, dataset_path): single_dataset = dataset.read_dataset(dataset_path) results = [alg_i(single_dataset) for alg_i in self.algs] y_true = results[0][0] all_preds = [result_i[1] for result_i in results] y_pred = vote(all_preds) ensemble.single_cls.show_result(y_true, y_pred)
def get_data(self, mode="train_"):
    '''
    Return two lists:
    int_words_list: for each sentence, the list of int ids of its words
    int_labels_list: for each sentence, the list of int ids of its labels
    Example:
    int_words_list[0]: [57, 37, 306, 1098, 1771, 7, 16, 18, 1, 19, 27, 544,
                        12, 9135, 10629, 82, 16, 1, 1, 7651, 3, 1, 11, 1, 784,
                        2, 19, 1, 90, 8160, 80, 2, 3, 875, 7, 1, 2, 1932, 24,
                        8595, 1, 2, 31, 10310, 1530, 8, 1, 1, 6]
    int_labels_list[1]: [1, 3, 2, 8, 0, 1, 23, 3, 0, 24, 1, 2, 20, 2, 2, 35, 23,
                        15, 5, 18, 3, 0, 1, 2, 2, 6, 24, 0, 12, 5, 34, 6, 3, 0,
                        1, 2, 6, 15, 1, 2, 2, 6, 10, 9, 15, 13, 2, 2, 7]
    '''
    f_name = "penn." + mode.rstrip('_') + ".pos.gz"
    data_set = read_dataset(f_name)
    word_to_int = pkl.load(open(mode + "word_to_int_dict", 'r'))
    lab_to_int = pkl.load(open(mode + "lab_to_int_dic", 'r'))
    int_words_list = []
    int_labels_list = []
    for t in data_set:
        words = t[0]
        labels = t[1]
        int_words = []
        for w in words:
            if word_to_int.has_key(w):
                int_words.append(word_to_int[w])
            else:
                int_words.append(word_to_int['UNK'])
        int_labels = [lab_to_int[l] for l in labels]
        int_words_list.append(int_words)
        int_labels_list.append(int_labels)
    return int_words_list, int_labels_list
def load_dataset(): data = list(dataset.read_dataset('weighted')) train = list(extract_all_sentences(data[:16000])) dev = list(extract_all_sentences(data[16000:18000])) test = list(extract_all_sentences(data[18000:])) return train, dev, test
def load_dataset(): data = list(dataset.read_dataset('weighted')) train = data[:16000] dev = data[16000:18000] test = data[18000:] return train, dev, test
def main():
    # Constants
    lable_column_name = 'Survived'
    lable_class_vocab = ['SANK', 'SURVIVED']
    numerical_column_names = ['Age', 'Fare', 'Pclass']
    categorical_column_names = ['Sex']
    data_set_type = 'kaggle'  # should be either 'kaggle' or 'google'
    max_epoch = 70000
    batch_size = 256
    shuffle = True
    test_size = 80
    network_size = 'medium'
    fc_scenario_num = 2
    relevant_columns = numerical_column_names + categorical_column_names
    relevant_columns.append(lable_column_name)

    # Models for prediction on Titanic survival
    df_train, y_train, df_test, y_test, df_predict, y_predict = ds.read_dataset(
        test_size=test_size,
        data_set_type=data_set_type,
        relevant_columns=relevant_columns,
        lable_column_name=lable_column_name)
    create_feature_column = fc.CreateFeatureColumn(
        df=df_train.append(df_test.append(df_predict)),
        numerical_column_names=numerical_column_names,
        categorical_column_names=categorical_column_names)

    feature_column = None
    if fc_scenario_num == 2:
        feature_column = create_feature_column.fc_second_scenario()
    elif fc_scenario_num == 3:
        feature_column = create_feature_column.fc_third_scenario()
    else:
        feature_column = create_feature_column.fc_first_scenario()

    for f in feature_column:
        print(f)
    print('{} Features have been created based on Scenario {}'.format(
        len(feature_column), fc_scenario_num))

    tf_model = TfModels.TfModels(df_train, y_train, df_test, y_test, df_predict,
                                 batch_size, shuffle, lable_class_vocab,
                                 feature_column, network_size, y_predict)
    eval_accuracy, pred_accuracy = tf_model.tf_DNN_Classifier(
        optimizer=None,
        show_best_epoch_trend=False,
        max_epoch=max_epoch,
        show_pred_result=True,
        save_pred_result=True)
    print('\nEvaluation Accuracy is {}\nPrediction Accuracy is: {}'.format(
        eval_accuracy, pred_accuracy))
def simple_exp(in_path): if (type(in_path) == str): full_dataset = dataset.read_dataset(in_path) else: full_dataset = in_path print("dataset dim %i" % full_dataset.dim()) full_dataset.norm() y_true, y_pred = train_model(full_dataset) show_result(y_true, y_pred)
def word_frequency_dic(self, mode="train_"):
    "get a dict of word frequencies"
    c = Counter()
    f_name = "penn." + mode.rstrip('_') + ".pos.gz"
    train_dataset = read_dataset(f_name)
    # count how many times each word occurs
    for t in train_dataset:
        for word in t[0]:
            c[word] += 1
    return c
def train(): srcnn = model() low, label = dataset.read_dataset("./train.h5") val_low, val_label = dataset.read_dataset("./test.h5") checkpoint = ModelCheckpoint("SRCNN_check.h5", monitor='val_loss', verbose=1, save_best_only=True) callbacks_list = [checkpoint] srcnn.fit(low, label, validation_data=(val_low, val_label), callbacks=callbacks_list, batch_size=batch_size, epochs=epochs, shuffle=True, verbose=1)
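# Hypothetical usage sketch (not part of the original script): reload the best
# checkpoint written by ModelCheckpoint above and run it on the held-out pairs.
# Assumes the SRCNN model uses only built-in Keras layers and losses; otherwise
# custom_objects would have to be passed to load_model.
from keras.models import load_model

import dataset

srcnn = load_model("SRCNN_check.h5")
val_low, val_label = dataset.read_dataset("./test.h5")
pred = srcnn.predict(val_low, batch_size=8)
print(pred.shape, val_label.shape)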
def load_data(): # read the data X, y = read_dataset() # impute the missing data in the dataset X = impute(X) # normalizing the dataset X = normalizing(X) return X, y
def load_and_test_graph(): dataset = read_dataset(FEATURE_FILENAME) #persisted_sess.graph.as_default() load_graph() print_graph(tf.get_default_graph()) with tf.Session() as sess: #sess.run(tf.global_variables_initializer()) output_tensor = sess.graph.get_tensor_by_name("import/" + OUTPUT_TENSOR_NAME + ":0") input_tensor = sess.graph.get_tensor_by_name("import/" + INPUT_TENSOR_NAME + ":0") print_acc(sess, output_tensor, input_tensor, dataset)
def main(): data = read_dataset('train.json') data = prep_dataset(data) write_dataset('train_processed.json', data) data = None # Use less memory data = prep_dataset(read_dataset('test.json')) chunk_size = len(data) // 10 for i in range(10): if i < 9: print('Saving datapoints [', i * chunk_size, ':', (i + 1) * chunk_size, ']') write_dataset('test_processed_' + str(i) + '.json', data[i * chunk_size:(i + 1) * chunk_size]) else: print('Saving datapoints [', i * chunk_size, ':', len(data), ']') write_dataset('test_processed_' + str(i) + '.json', data[i * chunk_size:]) print('Saving entire dataset', len(data)) write_dataset('test_processed_full.json', data)
def read_sentences(self, out_filename='sentences.txt', mode="train_"):
    '''
    Read the data, write one sentence per line to <mode><out_filename>,
    and return the flat list of words and the flat list of labels.
    '''
    f_name = "penn." + mode.rstrip('_') + ".pos.gz"
    train_dataset = read_dataset(f_name)
    out = open(mode + out_filename, 'w')
    words = ' '.join([' '.join(tup[0]) for tup in train_dataset]).split()
    labels = ' '.join([' '.join(tup[1]) for tup in train_dataset]).split()
    out.write('\n'.join([' '.join(tup[0]) for tup in train_dataset]))
    out.close()
    return words, labels
def missing_icebergs(): original_ids = [it['id'] for it in read_dataset('test.json')] processed_ids = [] for file in [ 'test_processed_0', 'test_processed_1', 'test_processed_2', 'test_processed_3', 'test_processed_4', 'test_processed_5', 'test_processed_6', 'test_processed_7', 'test_processed_8', 'test_processed_9', ]: processed_ids += [it['id'] for it in read_dataset(file + '.json')] print(len(processed_ids), len(original_ids)) for i in range(len(original_ids)): print(i) if original_ids[i] != processed_ids[i]: print('Not equal!', original_ids[i], processed_ids[i], i)
def main(): """ Load mini-imagenet and train a model. """ # Parse arguments parser = argument_parser() args = parser.parse_args() t_kwargs = train_kwargs(args) e_kwargs = evaluate_kwargs(args) rng = np.random.RandomState(seed) torch.manual_seed(seed) model_name = args.model_name net = Model(args.classes) if torch.cuda.is_available(): net.cuda() #net = torch.nn.DataParallel(net).cuda() is_eval = args.test if is_eval: print "Evaluate mode" net.load_state_dict( torch.load("./models/model_r_{}.pth".format(model_name))) criterion = torch.nn.CrossEntropyLoss() optimizer = torch.optim.Adam(net.parameters(), lr=args.learning_rate, betas=(0, 0.999)) print net train_set, val_set, test_set = dset.read_dataset(DATA_DIR) print "Dataset loaded." if not is_eval: reptile.train(train_set, val_set, net, model_name, criterion, optimizer, **t_kwargs) torch.save(net.state_dict(), './models/model-{}.pth'.format(args.model_name)) # Final eval print "accuracy on train_set: {}".format( reptile.test(train_set, net, criterion, optimizer, **e_kwargs)) print "accuracy on val_set: {}".format( reptile.test(val_set, net, criterion, optimizer, **e_kwargs)) print "accuracy on test_set: {}".format( reptile.test(test_set, net, criterion, optimizer, **e_kwargs))
#%% import numpy as np from dataset import read_dataset from sklearn.linear_model import LinearRegression from sklearn.model_selection import train_test_split from sklearn.ensemble import RandomForestRegressor from sklearn.multioutput import MultiOutputRegressor from sklearn.svm import SVR, LinearSVR #%% DIMS = 4 X, y = read_dataset(DIMS) print(X.shape, y.shape) X = X / DIMS y = (y * 2) - 1 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42) #%% def train_and_score(model): print("Fitting {}".format(model)) model.fit(X_train, y_train) print("Score: {}".format(model.score(X_test, y_test))) return model rfr = train_and_score(RandomForestRegressor(n_estimators=20, max_depth=12, random_state=42)) lr = train_and_score(LinearRegression()) m_svr = train_and_score(MultiOutputRegressor(LinearSVR())) m_lr = train_and_score(MultiOutputRegressor(LinearRegression()))
def main(args):
    global verbose, encoding
    verbose = args.verbose
    encoding = args.encoding

    assert args.poly_degree >= 1, '--degree must be positive integer'
    poly_degree = args.poly_degree

    gpu = args.gpu
    if gpu >= 0:
        cuda.check_cuda_available()
        if verbose:
            logger.info('Use GPU {}'.format(gpu))
        cuda.get_device_from_id(gpu).use()

    df = read_dataset(args.path_input, args.flag_has_header)
    # agg = df.groupby('fact_en')['twa'].mean()
    # invalid_facts = set(agg[(agg == 1.0)|(agg == 0.0)].index)
    # if verbose:
    #     logger.info('Invalid facts: {}'.format(len(invalid_facts)))
    # df = df[~df['fact_en'].isin(invalid_facts)]
    # if verbose:
    #     logger.info('Remained {} lines'.format(len(df)))

    # Load vocabulary
    if verbose:
        logger.info('Load vocabulary')
    rel2id = Vocabulary()
    rel2id.read_from_file(args.path_rels)
    fact2id = Vocabulary()
    fact2id.read_from_list(np.unique(get_values(df, 'fact')))
    ja2id = Vocabulary()
    ja2id.read_from_list(np.unique(get_values(df, 'fact_ja')))
    en2id = Vocabulary()
    en2id.read_from_list(np.unique(get_values(df, 'fact_en')))

    df.index = df['fact']
    df.loc[:, 'fact'] = replace_by_dic(df['fact'], fact2id).astype(np.int32)
    df.loc[:, 'fact_ja'] = replace_by_dic(df['fact_ja'], ja2id).astype(np.int32)
    df.loc[:, 'fact_en'] = replace_by_dic(df['fact_en'], en2id).astype(np.int32)
    df.loc[:, 'rel'] = replace_by_dic(df['rel'], rel2id).astype(np.int32)

    en2ja = {en: set(df[df['fact_en'] == en]['fact'].unique())
             for en in sorted(df['fact_en'].unique())}
    idx2vec = get_idx2vec(df, poly_degree=poly_degree)
    if gpu >= 0:
        idx2vec = cuda.to_gpu(idx2vec)

    ss = df.drop_duplicates('fact_en')
    itr = FactIterator(ss, len(ss), ja2id, en2id, train=False, evaluate=True,
                       repeat=False, poly_degree=poly_degree)

    # Define a model
    model_type = args.model.lower()
    dim_in = len(COL_BASIC_FEATURES)
    rel_size = len(rel2id)
    if model_type.startswith('linear'):
        ensembler = LinearEnsembler(dim_in, rel_size, use_gpu=(gpu >= 0),
                                    poly_degree=poly_degree,
                                    flag_unifw=args.flag_unifw,
                                    verbose=verbose)
    elif model_type.startswith('mlp'):
        options = args.model.split(':')
        params = {}
        if len(options) > 1:
            params['dim_hid'] = int(options[1])
        if len(options) > 2:
            params['activation'] = options[2]
        ensembler = MLPEnsembler(
            dim_in, rel_size, use_gpu=(gpu >= 0), poly_degree=poly_degree,
            flag_unifw=args.flag_unifw, verbose=verbose, **params)
    else:
        raise ValueError('Invalid --model: {}'.format(model_type))
    ensembler.add_persistent('_mu', None)
    ensembler.add_persistent('_sigma', None)

    # load a trained model
    chainer.serializers.load_npz(args.path_model, ensembler)
    if ensembler._mu is not None:
        logger.info('standardize vectors: True')
        itr.standardize_vectors(mu=ensembler._mu, sigma=ensembler._sigma)
        idx2vec = standardize_vectors(idx2vec, ensembler._mu, ensembler._sigma)
    else:
        logger.info('standardize vectors: False')
    model = Classifier(ensembler, en2ja, idx2vec)

    # calculate probabilities for testing set
    buff = []
    for i, (rels, _, en_indices) in enumerate(itr, start=1):
        if i % 500 == 0:
            logger.info('Evaluating: {}'.format(i))
        buff.append((model(rels, en_indices), en_indices))
    scores = list(chain.from_iterable(t[0] for t in buff))

    if verbose:
        logger.info('Output results to ' + args.path_output)
    with open(args.path_output, 'w') as f:
        header = '\t'.join(['rel', 'start', 'end', 'start_en', 'end_en',
                            'score', 'label'])
        f.write(header + '\n')
        for row in sorted(scores, key=lambda t: t[2], reverse=True):
            idx_fact, idx_en, score = row
            fact = fact2id.id2word[idx_fact]
            fact_ja, fact_en = fact.split('@@@')
            rel, start_en, end_en = fact_en.split('|||')
            rel, start_ja, end_ja = fact_ja.split('|||')
            try:
                label = df.loc[fact, 'label']
            except KeyError:
                label = df.loc[fact, 'twa']
            f.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                rel, start_ja, end_ja, start_en, end_en, score, label))
def main(batch_size, ar_window_size, model_details, seed): train_dataset, val_dataset, test_dataset = read_dataset() torch.manual_seed(seed) torch.cuda.manual_seed_all(seed) batches_per_epoch = len(train_dataset) // batch_size # Start Training bot = LSTNetBot(train_dataset, test_dataset, val_dataset=val_dataset, cnn_hidden_size=model_details["cnn_hidden_size"], rnn_hidden_size=model_details["rnn_hidden_size"], skip_hidden_size=model_details["skip_hidden_size"], skip=model_details["skip"], cnn_kernel=model_details["cnn_kernel"], hdrop=model_details["hdrop"], edrop=model_details["edrop"], odrop=model_details["odrop"], avg_window=500, ar_window_size=ar_window_size, clip_grad=10, unit_type=model_details["unit_type"]) param_groups = [{"params": bot.model.parameters(), "lr": 5e-4}] optimizer = optim.RMSprop(param_groups) scheduler = ReduceLROnPlateau( optimizer, factor=0.25, patience=4, cooldown=0, threshold=2e-4, min_lr=[x["lr"] * 0.25**2 for x in param_groups]) _ = bot.train(optimizer, batch_size=batch_size, n_epochs=20, seed=seed, log_interval=batches_per_epoch // 50, snapshot_interval=batches_per_epoch // 50 * 5, early_stopping_cnt=15, scheduler=scheduler) timestamp = datetime.now().strftime("%Y%m%d_%H%M") timestamp = datetime.now().strftime("%Y%m%d_%H%M") val_pred = bot.predict_avg(is_test=False, k=8).cpu().numpy() weights = val_dataset.series_i[:, 0, -1] * .25 + 1 score = mean_squared_error(val_dataset.y, val_pred, sample_weight=weights) export_validation( "cache/preds/val/{}_{:.6f}_{}.csv".format(bot.name, score, timestamp), val_pred) test_pred = bot.predict_avg(is_test=True, k=8).cpu().numpy() export_test( "cache/preds/test/{}_{:.6f}_{}.csv".format(bot.name, score, timestamp), test_pred) bot.logger.info("Score: {:.6f}".format(score)) return score
def interactive_isfm(): matched_frames, _ = read_dataset(shortest_track_length=3) # Choose optimization method, BatchBundleAdjustment or IncrementalBundleAdjustment. optimizer = BatchBundleAdjustment() # Choose the two first frames for initialization frame_0 = matched_frames[0] frame_1 = matched_frames[1] # Initialize map from two-view geometry. sfm_map = initialize_map(frame_0, frame_1) # You can here choose which images to add to the map in add_new_frame(). next_frames = matched_frames[2::] # Callback for optimizing the map (press 'O') def optimize(vis): # Apply BA. optimizer.full_bundle_adjustment_update(sfm_map) vis.clear_geometries() for geom in get_geometry(): vis.add_geometry(geom, reset_bounding_box=False) # Callback for adding new frame to the map (press 'A') def add_new_frame(vis): if not next_frames: return # Get next frame frame_new = next_frames.pop(0) print("Adding frame " + str(frame_new.id())) # Find 2d-3d correspondences with map and compute initial pose with respect to the map. frame_map_corr, pose_w_new = track_map(sfm_map, frame_new) # Insert frame as keyframe into the map kf_new = add_as_keyframe_to_map(sfm_map, frame_new, pose_w_new, frame_map_corr) # Find new correspondences, triangulate and add as map points. find_and_add_new_map_points(sfm_map, kf_new) vis.clear_geometries() for geom in get_geometry(): vis.add_geometry(geom, reset_bounding_box=False) # Helper function for extracting the visualization elements from the map. def get_geometry(): poses = sfm_map.get_keyframe_poses() p, c = sfm_map.get_pointcloud() axes = [] for pose in poses: axes.append( o3d.geometry.TriangleMesh.create_coordinate_frame( size=1.0).transform(pose.to_matrix())) pcd = o3d.geometry.PointCloud() pcd.points = o3d.utility.Vector3dVector(p.T) pcd.colors = o3d.utility.Vector3dVector(c.T / 255) return [pcd] + axes # Create visualizer. key_to_callback = {} key_to_callback[ord("O")] = optimize key_to_callback[ord("A")] = add_new_frame o3d.visualization.draw_geometries_with_key_callbacks( get_geometry(), key_to_callback)
BATCH_SIZE = 2 NUM_WORKERS = 4 DEVICE = "cuda" if __name__ == "__main__": SAVE_FILE = "~/instrument_pitch_tracker/efficient/log_dummy.log" print("debug - we have the libraries") DATA_FOLDER = "./" data_sub_dirs = [ os.path.abspath(os.path.join(DATA_FOLDER, f)) for f in os.listdir(DATA_FOLDER) ] dataset = read_dataset(data_sub_dirs[0], window_s=2.56, step_ms=10) print("debug - dataset import success") train_set, dev_set, test_set = partition_dataset(dataset, dev_ratio=0.1, test_ratio=0.1) train_loader = data.DataLoader(train_set, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, shuffle=True) dev_loader = data.DataLoader(dev_set, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, shuffle=False) model = CREPE(pretrained=False).to(DEVICE)
def greedy_search(words, classifier):
    '''
    words: the word list
    classifier: PerceptronClassifier
        The classifier object.
    '''
    prev = '<s>'
    ret = []
    for i in range(len(words)):
        # Your code here, implement the greedy search,
        label = classifier.predict(words, i, prev)
        ret.append(classifier.labels[label])
        prev = classifier.labels[label]
    return ret


from dataset import read_dataset

print(time.strftime('%Y-%m-%d %H:%M:%S'))
train_dataset = read_dataset('./penn.train.pos.gz')
devel_dataset = read_dataset('./penn.devel.pos.gz')
print('%d is training sentences.' % len(train_dataset))
print('%d is development sentences.' % len(devel_dataset))
perceptron = PerceptronClassifier(max_iter=1, training_data=train_dataset,
                                  devel_data=devel_dataset)
print('========================TEST CASE1==========================')
n_corr, n_total = 0, 0
for devel_data in devel_dataset:
    devel_data_x, devel_data_y = devel_data
    pred_y = greedy_search(devel_data_x, perceptron)
    for pred_tag, corr_tag in zip(pred_y, devel_data_y):
        if pred_tag == corr_tag:
            n_corr += 1
def main(args):
    # Read the hyperparameter config json file
    with open(hyperparams_path) as _in_file:
        hyperparams_dict = json.load(_in_file)
    epochs = int(hyperparams_dict["epochs"])
    learning_rate = float(hyperparams_dict["learning_rate"])
    curr_dt = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    strategy = tf.distribute.MirroredStrategy()
    batch_size = float(hyperparams_dict["batch-size"]) * strategy.num_replicas_in_sync
    model_dir = bucket + args.model_name + curr_dt

    gpus = tf.config.experimental.list_physical_devices("GPU")
    if gpus:
        try:
            # Currently, memory growth needs to be the same across GPUs
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            logical_gpus = tf.config.experimental.list_logical_devices("GPU")
            print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
        except RuntimeError as e:
            # Memory growth must be set before GPUs have been initialized
            print(e)

    nb_train_samples = sum([len(files) for r, d, files in os.walk(args.train)])
    nb_validation_samples = sum([len(files) for r, d, files in os.walk(args.validation)])
    nb_test_samples = sum([len(files) for r, d, files in os.walk(args.eval)])
    logging.info("Number of training samples={}".format(nb_train_samples))

    train_data_generator = dataset.read_dataset(args.train,
                                                batch_size=batch_size,
                                                train_mode=True)
    validation_data_generator = dataset.read_dataset(args.validation,
                                                     batch_size=batch_size,
                                                     train_mode=False)
    test_data_generator = dataset.read_dataset(args.eval,
                                               batch_size=batch_size,
                                               train_mode=False)

    tensorboard_cb = CustomTensorBoardCallback(log_dir=model_dir + "/tensorboard/" + curr_dt)
    checkpoints_cb = ModelCheckpoint(
        filepath=os.path.join(model_dir + "/training_checkpoints", "ckpt_{epoch}"),
        save_weights_only=True,
    )
    callbacks = [tensorboard_cb, checkpoints_cb]

    logging.info("Configuring model")
    with strategy.scope():
        model = model_def.transfer_learning_model(
            dropout=args.dropout,
            model_name=args.model_name,
            learning_rate=learning_rate,
        )

    logging.info("Starting training")
    history = model.fit(
        train_data_generator,
        steps_per_epoch=nb_train_samples // batch_size,
        epochs=epochs,
        validation_data=validation_data_generator,
        validation_steps=nb_validation_samples // batch_size,
        callbacks=callbacks,
        verbose=1,
    )

    loss, acc, auc = tuple(
        model.evaluate(test_data_generator,
                       steps=nb_test_samples // batch_size,
                       verbose=1))
    print("Model {} with dropout {} had loss {} and acc {}".format(
        args.model_name, args.dropout, loss, acc))
    export_path = tf.saved_model.save(
        model, bucket + args.model_name + "/keras_export_" + curr_dt)
    print("Model exported to: ", export_path)
def main(): train_data = read_dataset('./penn.train.pos.gz') print(len(train_data)) word_map = dict() tagger_map = dict() for d in train_data: for w in d[0]: if w not in word_map: word_map[w] = len(word_map) for t in d[1]: if t not in tagger_map: tagger_map[t] = len(tagger_map) word_map['BBBBB'] = len(word_map) word_map['UNKNOWNWORD'] = len(word_map) window_size = 2 train_data_x, train_data_y = construct_training_data( train_data, window_size, word_map, tagger_map) dev_data_x, dev_data_y = construct_training_data( read_dataset('./penn.devel.pos.gz'), window_size, word_map, tagger_map) #dev_data_x=train_data_x[0:3000] #dev_data_y=train_data_y[0:3000] embedding_size = 100 embeddings = tf.Variable(tf.random_uniform( [len(word_map), embedding_size], -1.0, 1.0)) W = tf.Variable(tf.random_normal([embedding_size * ( window_size + 1 + window_size), len(tagger_map)])) b = tf.Variable(tf.random_normal([len(tagger_map)])) words = tf.placeholder("int32", [None, window_size + 1 + window_size]) array_x = tf.nn.embedding_lookup(embeddings, words) x = tf.reshape(array_x, [-1, (window_size + 1 + window_size) * embedding_size]) h = tf.add(tf.matmul(x, W), b) y = tf.nn.softmax(h) y_ = tf.placeholder("float32", [None, len(tagger_map)]) #loss = tf.reduce_mean(tf.square(y - y_)) loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(h, y_)) #loss=tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1])) optimizer = tf.train.AdamOptimizer(learning_rate=1.5).minimize(loss) correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1)) accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) predicted_label = tf.argmax(y, 1) init = tf.initialize_all_variables() print('Train Size:', len(train_data_x)) print('Dev Size:', len(dev_data_x)) with tf.Session() as sess: sess.run(init) print("test accuracy %g" % accuracy.eval(feed_dict={ words: dev_data_x, y_: dev_data_y })) for kkk in range(100): avg_loss = 0.0 for i in range(0, len(train_data_x), batch_size): sess.run(optimizer, feed_dict={words: train_data_x[i:i + batch_size], y_: train_data_y[i:i + batch_size]}) avg_loss += sess.run( loss, feed_dict={words: train_data_x[i:i + batch_size], y_: train_data_y[i:i + batch_size]}) #print(kkk, avg_loss) print("test accuracy %g" % accuracy.eval(feed_dict={ words: dev_data_x, y_: dev_data_y })) print(list(sess.run(predicted_label, feed_dict={words: dev_data_x[0:10]}))) print(numpy.argmax(dev_data_y[0:10], 1))
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D, BatchNormalization from keras import regularizers from features import LabelTrainer from dataset import read_dataset #Fourth approach with VGG-19 features dir = 'images' print('Loading VGG19 model...') base_model = VGG19() vgg19model = Model(inputs=base_model.input, outputs=base_model.get_layer('fc2').output) print('Loading datasets and pre-processing...') dataset, m = read_dataset() filenames = [f for f in listdir(dir) if isfile(join(dir, f))] positivitymatrix = [] positivity = [] print('Building matrix...') for index, row in dataset.iterrows(): filename = row.urlImage[row.urlImage.rfind('/') + 1:] if index % 1000 == 0: print(index) if not filename in filenames: continue #see tutorial on keras img = image.load_img(join(dir, filename), target_size=(224, 224)) x = image.img_to_array(img)
assert hmm.bigram['NN', 'VV'] == 2 assert hmm.cooc['dog', 'NN'] == 2 print("====================Test case2========================") testing_dataset = [['dog', 'chase', 'mouse'], ['I', 'chase', 'dog']] for testing_data in testing_dataset: tags = viterbi(testing_data, hmm) print(' '.join(testing_data)) print(tags) print("====================Test case3========================") from dataset import read_dataset print(time.strftime('%Y-%m-%d %H:%M:%S')) train_dataset = read_dataset('./penn.train.pos.gz') devel_dataset = read_dataset('./penn.devel.pos.gz') print('%d is training sentences.' % len(train_dataset)) print('%d is development sentences.' % len(devel_dataset)) hmm = HMM() hmm.fit(train_dataset) n_corr, n_total = 0, 0 for devel_data_x, devel_data_y in devel_dataset: pred_y = viterbi(devel_data_x, hmm) for pred_tag, corr_tag in zip(pred_y, devel_data_y): if pred_tag == corr_tag: n_corr += 1 n_total += 1 print("accuracy=%f" % (float(n_corr) / n_total)) print(time.strftime('%Y-%m-%d %H:%M:%S'))
import dataset import network import random import time start_time = time.time() training_dataset = dataset.read_dataset("./training/train-labels.idx1-ubyte", "./training/train-images.idx3-ubyte") test_dataset = dataset.read_dataset("./test_data/t10k-labels.idx1-ubyte", "./test_data/t10k-images.idx3-ubyte") print(training_dataset) print(test_dataset) # training_dataset.print_image(random.randrange(0, training_dataset.total_items)) # test_dataset.print_image(random.randrange(0, test_dataset.total_items)) # import mnist_loader # training_data, validation_data, test_data = mnist_loader.load_data_wrapper() # training_data = list(training_data) digit_nn = network.Network([784, 30, 10]) digit_nn.sgd(training_dataset.data, 30, 10, 3.0, test_dataset=list(test_dataset.test_zip)) # import networkoooo # net = networkoooo.Network([784, 30, 10]) # net.SGD(training_dataset.data, 30, 10, 3.0, test_data=test_dataset.test_zip)
import dataset import train import s3_utils prefix = "/opt/ml/" model_path = os.path.join(prefix, "model") bucket_path = "s3://${aws_s3_bucket.ml_bucket.id}/input_data/" hyperparams_path = prefix + "input/config/hyperparameters.json" with open(hyperparams_path) as _in_file: hyperparams_dict = json.load(_in_file) batch_size = float(hyperparams_dict["batch-size"]) test_data_generator = dataset.read_dataset( bucket_path + "test", batch_size=batch_size, train_mode=False, dataset=False, shuffle=False, binary_classification=True, ) class_indices = dict() for k, v in test_data_generator.class_indices.items(): class_indices[v] = k class ScoringService(object): model = None # Where we keep the model when it's loaded @classmethod def get_model(cls):
import open3d as o3d from dataset import read_dataset def visualize_map(map, axis_size=1): poses = map.get_keyframe_poses() p, c = map.get_pointcloud() axes = [] for pose in poses: axes.append( o3d.geometry.TriangleMesh.create_coordinate_frame( size=axis_size).transform(pose.to_matrix())) pcd = o3d.geometry.PointCloud() pcd.points = o3d.utility.Vector3dVector(p.T) pcd.colors = o3d.utility.Vector3dVector(c.T / 255) o3d.visualization.draw_geometries([pcd] + axes) if __name__ == '__main__': _, sfm_map = read_dataset() visualize_map(sfm_map, 25)
def cluster(self): labelTrainer = LabelTrainer() labelTrainer.read_features() labelTrainer.build_model(encoding_dim=100) labelTrainer.train(epch=15) print('Building matrix...') X = [] for filename in labelTrainer.filenames: _, encodedLabel = labelTrainer.get_data(filename) X.append(encodedLabel) X = np.array(X) print('Clustering...') kmeans = KMeans(n_clusters=50, random_state=0).fit(X) predictions = kmeans.predict(X) dictClassFiles = {} dictFileClass = {} for filename, pred in zip(labelTrainer.filenames, predictions): if pred in dictClassFiles: dictClassFiles[pred].append(filename) else: dictClassFiles[pred] = [filename] dictFileClass[filename] = {'class': pred} print('Loading datasets and pre-processing...') dataset, m = read_dataset() userGood = {} print('Building user performance metrics...') for index, row in dataset.iterrows(): filename = row.urlImage[row.urlImage.rfind('/') + 1:] if index % 1000 == 0: print(index) if not filename in labelTrainer.filenames: continue numberLikes = row.numberLikes meanNumberLikes = m.loc[m.index == row.alias].numberLikes[0] pos = numberLikes / meanNumberLikes dictFileClass[filename]['likes'] = numberLikes dictFileClass[filename]['mean'] = meanNumberLikes dictFileClass[filename]['pos'] = pos cl = dictFileClass[filename]['class'] if not row.alias in userGood: userGood[row.alias] = [(cl, pos)] else: userGood[row.alias].append((cl, pos)) #Read 5 images for each cluster in the list #for i in [0,1,10,16,38]: # images = dictClassFiles[i][:5] # for image in images: # img = misc.imread(join('images', image)) # misc.imshow(img) alpha = 0.5 userArrays = {} for user in userGood: userClasses = list(set((cl for (cl, pos) in userGood[user]))) arr = np.zeros(50) userGoodRestricted = userGood[user][::2] for userClass in userClasses: s = [ pos - 1 for (cl, pos) in userGoodRestricted if cl == userClass ] l = len(s) if l > 0: s = sum(s) / l else: s = 0 arr[userClass] = s for i in range(len(arr)): if arr[i] > alpha: arr[i] = 1 elif arr[i] < -alpha: arr[i] = -1 else: arr[i] = 0 userArrays[user] = arr self.userArrays = userArrays self.dictFileClass = dictFileClass