def load_MNIST():
    data_path = '../data/MNIST_data'
    data = input_data.read_data_sets(data_path, one_hot=False)
    x_train_aux = data.train.images
    x_test = data.test.images
    data_dim = data.train.images.shape[1]
    n_train = data.train.images.shape[0]

    train_size = int(n_train * 0.8)
    valid_size = n_train - train_size
    x_valid, x_train = merge_datasets(x_train_aux, data_dim, train_size, valid_size)
    print('Data loaded. ', time.localtime().tm_hour, ':', time.localtime().tm_min, 'h')
    # logs.write('\tData loaded ' + str(time.localtime().tm_hour) + ':' + str(time.localtime().tm_min) + 'h\n')

    x_train = np.reshape(x_train, [-1, 28, 28, 1])
    x_valid = np.reshape(x_valid, [-1, 28, 28, 1])
    x_test = np.reshape(x_test, [-1, 28, 28, 1])

    train_dataset = Dataset(x_train, data.train.labels)
    valid_dataset = Dataset(x_valid, data.train.labels)
    test_dataset = Dataset(x_test, data.test.labels)

    print('Train Data: ', train_dataset.x.shape)
    print('Valid Data: ', valid_dataset.x.shape)
    print('Test Data: ', test_dataset.x.shape)

    return train_dataset, valid_dataset, test_dataset
def load_data(self):
    data = {}
    data['train'] = Dataset('data/preprocessed_data/train-shard-*.tar')
    data['dev'] = Dataset('data/preprocessed_data/dev.tar')
    data['test'] = Dataset('data/preprocessed_data/test.tar')
    return data
def execute_demo(language, size=0):
    data = Dataset(language)
    if size:
        data.trainset = data.trainset[0:size]
    print("{}: {} training - {} dev - {} test".format(language, len(data.trainset), len(data.devset), len(data.testset)))

    improved = Improved(language)
    improved.train(data.trainset)

    predictions_dev = improved.test(data.devset)
    predictions_test = improved.test(data.testset)

    gold_labels_dev = [sent['gold_label'] for sent in data.devset]
    gold_labels_test = [sent['gold_label'] for sent in data.testset]

    if size:
        print("dev score size = " + str(size))
        report_score(gold_labels_dev, predictions_dev)
        print("test score size = " + str(size))
        report_score(gold_labels_test, predictions_test)
        print('-' * 50)
    else:
        print("dev score")
        report_score(gold_labels_dev, predictions_dev)
        print("test score")
        report_score(gold_labels_test, predictions_test)
        print('-' * 50)
def train(self, dataset=None, initial_epoch=0):
    callbacks = get_callbacks(self.model_name, self.tb_path, self.model_path_new_train,
                              self.config['lr_dec'], self.config['lr'])

    if dataset is None:
        dataset = Dataset(self.data_name, self.config_path)

    dataset_train, dataset_val = dataset.get_tf_data()

    self.model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=self.config['lr']),
                       loss=[marginLoss, 'mse'],
                       loss_weights=[1., self.config['lmd_gen']],
                       metrics={'Original_CapsNet': 'accuracy'})

    print('-' * 30 + f'{self.data_name} train' + '-' * 30)

    history = self.model.fit(dataset_train,
                             epochs=self.config['epochs'],
                             validation_data=(dataset_val),
                             batch_size=self.config['batch_size'],
                             initial_epoch=initial_epoch,
                             callbacks=callbacks,
                             workers=self.config['num_workers'])

    self.model.save_weights(os.path.join(self.config['saved_model_dir'],
                                         f"{self.model_name}",
                                         f"{self.model_name}_{self.data_name}.h5"))

    return history
def main(datadir, batchsize=16, workers=0, epochs=100, lr=1e-3, snapshot=None, checkpoint_dir=None):
    traindataset = Dataset(datadir, tileids="tileids/train_fold0.tileids")
    testdataset = Dataset(datadir, tileids="tileids/test_fold0.tileids")

    nclasses = len(traindataset.classes)

    traindataloader = torch.utils.data.DataLoader(traindataset, batch_size=batchsize, shuffle=True, num_workers=workers)
    testdataloader = torch.utils.data.DataLoader(testdataset, batch_size=batchsize, shuffle=False, num_workers=workers)

    logger = Logger(columns=["loss"], modes=["train", "test"])
    vizlogger = VisdomLogger()

    network = LSTMSequentialEncoder(48, 48, nclasses=nclasses)

    optimizer = torch.optim.Adam(network.parameters(), lr=lr)
    loss = torch.nn.NLLLoss()

    if torch.cuda.is_available():
        network = torch.nn.DataParallel(network).cuda()
        loss = loss.cuda()

    start_epoch = 0
    if snapshot is not None:
        state = resume(snapshot, model=network, optimizer=optimizer)
        if "epoch" in state.keys():
            start_epoch = state["epoch"]
        if "data" in state.keys():
            logger.resume(state["data"])

    for epoch in range(start_epoch, epochs):
        logger.update_epoch(epoch)

        print("\nEpoch {}".format(epoch))
        print("train")
        train_epoch(traindataloader, network, optimizer, loss, loggers=(logger, vizlogger))
        print("\ntest")
        test_epoch(testdataloader, network, loss, loggers=(logger, vizlogger))

        data = logger.get_data()
        vizlogger.update(data)

        if checkpoint_dir is not None:
            checkpoint_name = os.path.join(checkpoint_dir, "model_{:02d}.pth".format(epoch))
            save(checkpoint_name, network, optimizer, epoch=epoch, data=data)
def __init__(self):
    self.dataset = Dataset(args)
    self.dataset_test = Dataset(args, mode="test")
    self.embedding_matrix = build_embedding_matrix(self.dataset.vocab.word2id, args.embed_dim)
    self.model = Model(args)
    self.optimizer = optim.Adam(self.model.parameters())
    self._epoch = 0
    self._iter = 0
    self.max_acc = None
    self.load_model()
def main(args):
    # I/O
    config_file = args.config_file
    config = imp.load_source('config', config_file)
    if args.name:
        config.name = args.name

    trainset = Dataset(config.train_dataset_path)

    network = Network()
    network.initialize(config, trainset.num_classes)

    # Initialization for running
    log_dir = utils.create_log_dir(config, config_file)
    summary_writer = tf.summary.FileWriter(log_dir, network.graph)
    if config.restore_model:
        network.restore_model(config.restore_model, config.restore_scopes)

    proc_func = lambda images: preprocess(images, config, True)
    trainset.start_batch_queue(config.batch_format, proc_func=proc_func)

    # Main Loop
    print('\nStart Training\nname: {}\n# epochs: {}\nepoch_size: {}\nbatch_size: {}\n'.format(
        config.name, config.num_epochs, config.epoch_size, config.batch_format['size']))
    global_step = 0
    start_time = time.time()
    for epoch in range(config.num_epochs):
        # Training
        for step in range(config.epoch_size):
            # Prepare input
            learning_rate = utils.get_updated_learning_rate(global_step, config)
            batch = trainset.pop_batch_queue()

            wl, sm, global_step = network.train(batch['mu'].reshape(config.batch_format['size'], -1),
                                                batch['conv_final'].reshape(config.batch_format['size'], -1),
                                                batch['label'],
                                                learning_rate,
                                                config.keep_prob)

            wl['lr'] = learning_rate

            # Display
            if step % config.summary_interval == 0:
                duration = time.time() - start_time
                start_time = time.time()
                utils.display_info(epoch, step, duration, wl)
                summary_writer.add_summary(sm, global_step=global_step)

        # Save the model
        network.save_model(log_dir, global_step)
def process_data(X, y=None, test_size=0.20, dummies=False):
    if y is None:
        km = dask_ml.cluster.KMeans(n_clusters=10, init_max_iter=100)
        km.fit(X.flatten().reshape(-1, 1))
        y = km.labels_

    y_uniqs = np.unique(y)
    len_ = X.shape[0]
    X = prepare_dataset(X)

    if dummies:
        y = dd.get_dummies(y)

    shape_ = list(X.shape[1:])
    samples = list()
    samples_labels = list()

    print('Preparing samples ...')
    for _ in range(2):
        for y_uniq in y_uniqs:
            sample = list()
            label = list()
            for xa, ya in zip(chunks(X, 100), chunks(y, 100)):
                try:
                    sample.append([xa[ya == y_uniq][random.randint(0, len(xa[ya == y_uniq]) - 1)]])
                    label.append(y_uniq)
                    if len(sample) >= 10:
                        break
                except Exception:
                    # chunk contains no instance of this class; skip it
                    pass
            samples += sample
            samples_labels += label

    samples = da.vstack(samples)
    samples_labels = da.vstack(samples_labels)

    X_train, X_test, y_train, y_test = train_test_split(X.flatten().reshape(len_, -1), y,
                                                        test_size=test_size, random_state=4891)

    X_train = X_train.reshape([X_train.shape[0]] + shape_)
    X_test = X_test.reshape([X_test.shape[0]] + shape_)

    print('Training dataset shape: ', X_train.shape)
    print('Validation dataset shape: ', X_test.shape)

    train_dataset = Dataset(X_train, y_train)
    test_dataset = Dataset(X_test, y_test)

    train_dataset.samples = samples
    train_dataset.samples_labels = samples_labels

    print('Sample dataset shape: ', train_dataset.samples.shape)
    return train_dataset, test_dataset
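# The snippet above calls a `chunks` helper that is not defined here. Below is a
# minimal sketch of what such a helper could look like, assuming it simply yields
# consecutive fixed-size slices along the first axis (an assumption about the
# missing helper, not its original implementation).
def chunks(array, size):
    """Yield consecutive slices of `array` containing at most `size` rows each."""
    for start in range(0, array.shape[0], size):
        yield array[start:start + size]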
def load_MNIST():
    data_path = '../data/MNIST_data'
    data = input_data.read_data_sets(data_path, one_hot=False)
    x_train_aux = data.train.images
    x_test = data.test.images
    data_dim = data.train.images.shape[1]
    n_train = data.train.images.shape[0]

    print("TTTTTTTTTTTTTTTTTTTTTTTTTT")
    print("data.train.images.shape: ", data.train.images.shape)
    print("data_dim: ", data_dim)
    print("n_train: ", n_train)

    train_size = int(n_train * 0.8)
    valid_size = n_train - train_size
    x_valid, x_train = merge_datasets(x_train_aux, data_dim, train_size, valid_size)
    print('Data loaded. ', time.localtime().tm_hour, ':', time.localtime().tm_min, 'h')
    # logs.write('\tData loaded ' + str(time.localtime().tm_hour) + ':' + str(time.localtime().tm_min) + 'h\n')

    print('x_train.shape: ', x_train.shape)

    x_train = np.reshape(x_train, [-1, 28, 28, 1])
    x_valid = np.reshape(x_valid, [-1, 28, 28, 1])
    x_test = np.reshape(x_test, [-1, 28, 28, 1])

    print("!!!!!!!!!!!!!!!!!!!")
    print("Dataset test after reshape: ")
    print('x_train.shape: ', x_train.shape)
    print("type(x_train): ", type(x_train))
    print('type(data.train.labels): ', type(data.train.labels))
    print("!!!!!!!!!!!!!!!!!!!")

    train_dataset = Dataset(x_train, data.train.labels)
    valid_dataset = Dataset(x_valid, data.train.labels)
    test_dataset = Dataset(x_test, data.test.labels)

    print("```````````")
    print("train_dataset.height: ", train_dataset.height)
    print("train_dataset.width: ", train_dataset.width)
    print("train_dataset.num_channels: ", train_dataset.num_channels)
    print("```````````")

    print('Train Data: ', train_dataset.x.shape)
    print('Valid Data: ', valid_dataset.x.shape)
    print('Test Data: ', test_dataset.x.shape)
    print("train_dataset.labels: ", train_dataset.labels)
    print("data.train.labels.shape: ", data.train.labels.shape)

    return train_dataset, valid_dataset, test_dataset
def execute_demo(language):
    data = Dataset(language)

    print("{}: {} training - {} dev".format(language, len(data.trainset), len(data.devset)))

    # for sent in data.trainset:
    #     print(sent['sentence'], sent['target_word'], sent['gold_label'])

    baseline = Baseline(language)

    baseline.train(data.trainset, data.bigram_dic)

    predictions = baseline.test(data.devset, data.bigram_dic)

    gold_labels = [sent['gold_label'] for sent in data.devset]

    report_score(gold_labels, predictions)

    print("{} test".format(language))
    predictions = baseline.test(data.testset, data.bigram_dic)
    gold_labels = [sent['gold_label'] for sent in data.testset]
    report_score(gold_labels, predictions)
def validate(network, config, log_dir, step):
    # Initialize testing
    if not hasattr(validate, 'images'):
        testset = Dataset(config.test_dataset_path, prefix=config.data_prefix, isDebug=config.isDebug)
        random_indices = np.random.permutation(np.where(testset.is_photo)[0])[:64]
        validate.images = testset.images[random_indices].astype(np.object)
        validate.images = preprocess(validate.images, config, is_training=False)

    output_dir = os.path.join(log_dir, 'samples')
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    # scales = np.indices((8, 8), dtype=np.float32)[1] * 5
    scales = np.ones((8, 8))
    scales = scales.flatten()

    test_results = network.generate_BA(validate.images, scales, config.batch_size)
    utils.save_manifold(test_results, os.path.join(output_dir, '{}.jpg'.format(step)))
def find_max_nlabels(opt, cfg, new_anchors):
    model = PreprocessTargets(cfg.model, new_anchors)
    ipu_opts = ipu_options(cfg, model)
    dataset = Dataset(opt.data, cfg)
    loader = DataLoader(ipu_opts,
                        dataset,
                        batch_size=cfg.model.micro_batch_size,
                        num_workers=cfg.system.num_workers,
                        mode=DataLoaderMode.Async)
    inference_model = inferenceModel(model.eval(), ipu_opts)
    max_nlabels = [torch.tensor([0])] * len(cfg.model.strides)
    pbar = tqdm(loader, desc='Finding the maximum number of labels after preprocessing')
    for _, (_, label, _, _) in enumerate(pbar):
        n_labels = inference_model(label)
        for j, old_max_nlabels in enumerate(max_nlabels):
            max_nlabels[j] = torch.maximum(n_labels[j], old_max_nlabels)
    return max_nlabels
def discretization_evaluation(tree, discretized_tree):
    # Load data
    # data_dir = '../data/discrete_'
    data_dir = '../data/cartpole_greedy_ppo_'
    data_path = data_dir + 'state.npy'
    label_path = data_dir + 'action.npy'

    # a data loader with all data in the dataset
    test_loader = torch.utils.data.DataLoader(
        Dataset(data_path, label_path, partition='test', ToTensor=True),
        batch_size=int(1e4),
        shuffle=True)

    accuracy_list = []
    accuracy_list_ = []
    correct = 0.
    correct_ = 0.
    for batch_idx, (data, target) in enumerate(test_loader):
        # data, target = data.to(device), target.to(device)
        target_onehot = onehot_coding(target, tree.args['output_dim'])
        prediction, _, _ = tree.forward(data)
        prediction_, _, _ = discretized_tree.forward(data)
        with torch.no_grad():
            pred = prediction.data.max(1)[1]
            correct += pred.eq(target.view(-1).data).sum()
            pred_ = prediction_.data.max(1)[1]
            correct_ += pred_.eq(target.view(-1).data).sum()
    accuracy = 100. * float(correct) / len(test_loader.dataset)
    accuracy_ = 100. * float(correct_) / len(test_loader.dataset)
    print('Original Tree Accuracy: {:.4f} | Discretized Tree Accuracy: {:.4f}'.format(accuracy, accuracy_))
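# `onehot_coding` is used above but not shown. A minimal sketch under the
# assumption that it maps a batch of integer class targets to one-hot vectors
# of width `output_dim` (a hypothetical helper, not necessarily the original):
import torch

def onehot_coding(target, output_dim):
    """Convert a batch of integer class labels to one-hot vectors of size output_dim."""
    onehot = torch.zeros(target.shape[0], output_dim, device=target.device)
    onehot.scatter_(1, target.view(-1, 1).long(), 1.0)
    return onehot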
def word_freq(self, language, unigram=True, bigram=True, trigram=True):
    data = Dataset(language)
    word_count = Counter()
    uni_count = Counter()
    bi_count = Counter()
    tri_count = Counter()
    text = []

    for line in data.trainset:
        if line['sentence'] not in text:
            text.append(line['sentence'])

    # Building unigram word counts
    words = ' '.join(text)
    for word in words.split(" "):
        word_count[word] += 1

        # Building letter unigram counts
        if unigram == True:
            for i in range(len(word) - 1):
                uni_count[word[i]] += 1

        # Building letter bigram counts
        if bigram == True:
            for i in range(len(word) - 1):
                bi_count[word[i:i + 2]] += 1

        # Building letter trigram counts
        if trigram == True:
            for i in range(len(word) - 2):
                tri_count[word[i:i + 3]] += 1

    return word_count, uni_count, bi_count, tri_count
def execute_demo(language, flag):
    data = Dataset(language)
    if flag == 0:
        # data.trainset is the training portion returned by Dataset; data.devset is used for evaluation
        print("{}: {} training - {} dev".format(language, len(data.trainset), len(data.devset)))
    if flag == 1:
        print("{}: {} training - {} test".format(language, len(data.trainset), len(data.testset)))

    # for sent in data.trainset:
    #     # print(sent['sentence'], sent['target_word'], sent['gold_label'])
    #     print(sent)

    baseline = Baseline(language)

    baseline.train(data.trainset)

    predictions_devset = baseline.test(data.devset)
    predictions_testset = baseline.test(data.testset)

    gold_labels_devset = [sent['gold_label'] for sent in data.devset]  # gold labels are binary (0/1)
    gold_labels_testset = [sent['gold_label'] for sent in data.testset]

    if flag == 0:
        print("Test by using dev set:")
        report_score(gold_labels_devset, predictions_devset)
    if flag == 1:
        print("Test by using test set:")
        report_score(gold_labels_testset, predictions_testset)
def main(data_root, result_root, split, seed, feat_window_size):
    result_root += "-s-%d-%d" % (split, seed)

    ### read label2index mapping and index2label mapping ###########################
    label2index = dict()
    index2label = dict()
    with open(os.path.join(data_root, 'mapping.txt'), 'r') as f:
        content = f.read().split('\n')[0:-1]
        for line in content:
            label2index[line.split()[1]] = int(line.split()[0])
            index2label[int(line.split()[0])] = line.split()[1]

    ### read test data #############################################################
    # with open('data/split1.test', 'r') as f:
    with open(os.path.join(data_root, 'split%d.test' % split), 'r') as f:
        video_list = f.read().split('\n')[0:-1]
    dataset = Dataset(data_root, video_list, label2index, shuffle=False)

    # load prior, length model, grammar, and network
    load_iteration = NUM_ITERS
    log_prior = np.log(np.loadtxt('%s/prior.iter-' % result_root + str(load_iteration) + '.txt'))
    grammar = PathGrammar('%s/grammar.txt' % result_root, label2index)
    length_model = PoissonModel('%s/lengths.iter-' % result_root + str(load_iteration) + '.txt', max_length=2000)
    forwarder = Forwarder(dataset.input_dimension, dataset.n_classes, feat_window_size=feat_window_size)
    forwarder.load_model('%s/network.iter-' % result_root + str(load_iteration) + '.net')

    # parallelization
    n_threads = 4

    # Viterbi decoder
    viterbi_decoder = Viterbi(grammar, length_model, frame_sampling=30, max_hypotheses=np.inf)

    # forward each video
    log_probs = dict()
    queue = mp.Queue()
    for i, data in enumerate(dataset):
        sequence, _ = data
        video = list(dataset.features.keys())[i]
        queue.put(video)
        log_probs[video] = forwarder.forward(sequence) - log_prior
        log_probs[video] = log_probs[video] - np.max(log_probs[video])

    # Viterbi decoding
    procs = []
    for i in range(n_threads):
        p = mp.Process(target=decode, args=(queue, log_probs, viterbi_decoder, index2label, result_root))
        procs.append(p)
        p.start()
    for p in procs:
        p.join()
def run(datapath, model, saved_model_path=None):
    # Construct dataset pipeline
    dataset = Dataset(datapath).data

    # Base name for files
    basename = model.name + '_adam_lr{}_relu_dropout_epochs30'.format(str(config.learning_rate))

    if saved_model_path is not None:
        test_gen = dataset['test']
        num_instances = dataset['num_test']
    else:
        # Train model on dataset
        model.summary()
        model.train(dataset)

        # Save model
        modelname = basename + '.h5'
        saved_model_path = model.save(modelname)

        # Save plots
        plotname = basename + '_plt'  # + additional tags
        plotutils.get_plots(model, smooth=True, name=plotname)

        test_gen = dataset['valtest']
        num_instances = dataset['num_val']

    # Save confusion matrix
    mtxname = basename + '_mtx'
    model.evaluate(saved_model_path, test_gen, num_instances, name=mtxname)
def main(args):
    paths = Dataset(args.dataset_path)['abspath']
    print('%d images to load.' % len(paths))
    assert (len(paths) > 0)

    # Load model files and config file
    network = Network()
    network.load_model(args.model_dir)
    images = preprocess(paths, network.config, False)

    # Run forward pass to calculate embeddings
    mu, sigma_sq = network.extract_feature(images, args.batch_size, verbose=True)
    feat_pfe = np.concatenate([mu, sigma_sq], axis=1)

    # test
    print('mu:', mu.shape)

    lfwtest = LFWTest(paths)
    lfwtest.init_standard_proto(args.protocol_path)

    accuracy, threshold = lfwtest.test_standard_proto(mu, utils.pair_euc_score)
    print('Euclidean (cosine) accuracy: %.5f threshold: %.5f' % (accuracy, threshold))
    accuracy, threshold = lfwtest.test_standard_proto(feat_pfe, utils.pair_MLS_score)
    print('MLS accuracy: %.5f threshold: %.5f' % (accuracy, threshold))
def test(args):
    sys.setrecursionlimit(7000)
    is_ensemble = args['--ensemble']
    model_path = args['MODEL_FILE']
    test_set_path = args['TEST_DATA_FILE']

    extra_config = None
    if args['--extra-config']:
        extra_config = args['--extra-config']
        extra_config = json.loads(extra_config)

    print(f'loading model from [{model_path}]', file=sys.stderr)
    model_cls = EnsembleModel if is_ensemble else RenamingModel
    if is_ensemble:
        model_path = model_path.split(',')
    model = model_cls.load(model_path, use_cuda=args['--cuda'], new_config=extra_config)
    model.eval()

    test_set = Dataset(test_set_path)
    eval_results, decode_results = Evaluator.decode_and_evaluate(model, test_set, model.config, return_results=True)

    print(eval_results, file=sys.stderr)

    save_to = args['--save-to'] if args['--save-to'] else args['MODEL_FILE'] + f'.{test_set_path.split("/")[-1]}.decode_results.bin'
    print(f'Save decode results to {save_to}', file=sys.stderr)
    pickle.dump(decode_results, open(save_to, 'wb'))
def dataset():
    vocab = Vocabulary(args)
    dataset = Dataset(args, vocab)

    source_files = sorted(glob.glob(args.dataset_file_path + 'train_source*.dat'))
    target_files = sorted(glob.glob(args.dataset_file_path + 'train_target*.dat'))

    print('========== Begin something about vocabulary:')
    print('Vocab Size:', dataset.vocab.vocab_size)
    print('First 10 Word2cnt:', list(dataset.vocab._word2cnt.items())[:10])
    print()

    print('========== Begin something about dataset:')
    X_lens = [len(sen.split()) for source_file in source_files for sen in open(source_file)]
    y_lens = [len(sen.split()) for target_file in target_files for sen in open(target_file)]
    print('Number of Source Sentences:', len(X_lens))
    print('Number of Target Sentences:', len(y_lens))
    print()
    print('Mean Length of Source Sentences:', np.mean(X_lens))
    print('Max Length of Source Sentences:', np.max(X_lens))
    print('Min Length of Source Sentences:', np.min(X_lens))
    print()
    print('Mean Length of Target Sentences:', np.mean(y_lens))
    print('Max Length of Target Sentences:', np.max(y_lens))
    print('Min Length of Target Sentences:', np.min(y_lens))
    print()
def main(args):
    paths = Dataset(args.dataset_path)['abspath']
    print('%d images to load.' % len(paths))
    assert (len(paths) > 0)

    all_path = []
    for j, image_dir in enumerate(paths):
        image_path = listdir(image_dir)
        # one_image = random.choice(image_path)
        one_image = image_path[1]
        one_image = os.path.join(image_dir, one_image)
        all_path.append(one_image)
        # sec_image = random.choice(image_path)
        # sec_image = os.path.join(image_dir, sec_image)
        # all_path.append(sec_image)

    # Load model files and config file
    network = Network()
    network.load_model(args.model_dir)
    images = preprocess(all_path, network.config, False)

    # Run forward pass to calculate embeddings
    mu, sigma_sq = network.extract_feature(images, args.batch_size, verbose=True)
    feat_pfe = np.concatenate([mu, sigma_sq], axis=1)

    ytftest = YTFTest(all_path)
    ytftest.init_standard_proto(args.protocol_path)

    accuracy, threshold = ytftest.test_standard_proto(mu, utils.pair_euc_score)
    print('Euclidean (cosine) accuracy: %.5f threshold: %.5f' % (accuracy, threshold))
    accuracy, threshold = ytftest.test_standard_proto(feat_pfe, utils.pair_MLS_score)
    print('MLS accuracy: %.5f threshold: %.5f' % (accuracy, threshold))
def test_read():
    dataset = Dataset(path=data_path,
                      shape=[256, 256],
                      augmentation=True,
                      collapse_length=2,
                      is_raw=True)
    assert len(dataset) > 0
    events, timestamps, images, augmentation_parameters = dataset[0]
    assert isinstance(events, dict)
    assert set.intersection(set(events.keys()),
                            {'x', 'y', 'timestamp', 'polarity', 'element_index'})
    assert isinstance(events['x'], np.ndarray)
    assert isinstance(events['y'], np.ndarray)
    assert isinstance(events['timestamp'], np.ndarray)
    assert isinstance(events['polarity'], np.ndarray)
    assert isinstance(events['element_index'], np.ndarray)
    assert events['x'].dtype == np.int64
    assert events['y'].dtype == np.int64
    assert events['timestamp'].dtype == np.float32
    assert events['polarity'].dtype == np.int64
    assert events['element_index'].dtype == np.int64
    n = events['x'].size
    for k, v in events.items():
        assert v.size == n, k
    assert (events['element_index'] != 0).sum() == 0, 'Sample is a sequence ' \
                                                      'of more than 1 element'
    assert images.ndim == 3
    assert images.shape == (2, 256, 256)
    assert timestamps.shape == (2,)
    assert timestamps[0] < timestamps[1]
def execute_demo(language):
    data = Dataset(language)

    if test == True:
        print("{}: {} training - {} test".format(language, len(data.trainset), len(data.testset)))
    else:
        print("{}: {} training - {} dev".format(language, len(data.trainset), len(data.devset)))

    if Base == True:
        baseline = Baseline(language)
    else:
        baseline = MyLine(language)

    baseline.train(data.trainset)

    if test == True:
        predictions = baseline.test(data.testset)
        gold_labels = [sent['gold_label'] for sent in data.testset]
    else:
        predictions = baseline.test(data.devset)
        gold_labels = [sent['gold_label'] for sent in data.devset]

    report_score(gold_labels, predictions)
def infer(label2index, index2label, n_threads):
    # load models
    log_prior = np.log(np.loadtxt('results/prior'))
    grammar = PathGrammar('results/grammar', label2index)
    length_model = PoissonModel('results/mean_lengths', max_length=2000)
    forwarder = Forwarder('results/net.model')

    # Viterbi decoder (max_hypotheses = n: at each time step, prune all hypotheses worse than the top n)
    viterbi_decoder = Viterbi(grammar, length_model, frame_sampling=30, max_hypotheses=50000)

    # create list of test videos
    with open('data/split1.test', 'r') as f:
        video_list = f.read().split('\n')[0:-1]

    # forward each video
    log_probs = dict()
    queue = mp.Queue()
    for video in video_list:
        queue.put(video)
        dataset = Dataset('data', [video], label2index)
        log_probs[video] = forwarder.forward(dataset) - log_prior
        log_probs[video] = log_probs[video] - np.max(log_probs[video])

    # Viterbi decoding
    procs = []
    for i in range(n_threads):
        p = mp.Process(target=decode, args=(queue, log_probs, viterbi_decoder, index2label))
        procs.append(p)
        p.start()
    for p in procs:
        p.join()
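# The `decode` target handed to mp.Process above (and in the earlier main()) is
# not included in these snippets. A hypothetical sketch of such a worker follows;
# it assumes the Viterbi decoder exposes a `decode(log_probs)` method returning a
# per-frame label sequence and that results are written under 'results/'. Both the
# decoder interface and the output path are assumptions, not the original code.
import multiprocessing as mp
import queue as queue_lib

def decode(video_queue, log_probs, viterbi_decoder, index2label):
    """Drain the shared queue and decode each video's framewise log-probabilities."""
    while True:
        try:
            video = video_queue.get(timeout=3)
        except queue_lib.Empty:
            return
        labels = viterbi_decoder.decode(log_probs[video])
        recognition = [index2label[l] for l in labels]
        with open('results/' + video, 'w') as f:
            f.write(' '.join(recognition) + '\n')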
def execute_demo(language):
    if language == 'english':
        word_emb = load_word_embeddings('english')
    elif language == 'spanish':
        word_emb = load_word_embeddings('spanish')

    data = Dataset(language)

    print("{}: {} training - {} dev".format(language, len(data.trainset), len(data.devset)))

    # for sent in data.trainset:
    #     # Gold label -> 0 if the word is not complex, 1 if the word is complex.
    #     print(sent['sentence'], sent['target_word'], sent['gold_label'])

    baseline = Baseline(language)
    model = Model(language)

    model.train(data.trainset, word_emb)

    predictions = model.test(data.devset, word_emb)

    gold_labels = [sent['gold_label'] for sent in data.devset]

    report_score(gold_labels, predictions)
def test_tree(tree, epochs=10):
    criterion = nn.CrossEntropyLoss()

    # Utility variables
    best_testing_acc = 0.
    testing_acc_list = []

    # Load data
    data_dir = '../data/discrete_'
    data_path = data_dir + 'state.npy'
    label_path = data_dir + 'action.npy'
    test_loader = torch.utils.data.DataLoader(
        Dataset(data_path, label_path, partition='test'),
        batch_size=learner_args['batch_size'],
        shuffle=True)

    for epoch in range(epochs):
        # Testing stage
        tree.eval()
        correct = 0.
        for batch_idx, (data, target) in enumerate(test_loader):
            data, target = data.to(device), target.to(device)
            batch_size = data.size()[0]
            prediction, _, _, _ = tree.forward(data)
            pred = prediction.data.max(1)[1]
            correct += pred.eq(target.view(-1).data).sum()
        accuracy = 100. * float(correct) / len(test_loader.dataset)
        if accuracy > best_testing_acc:
            best_testing_acc = accuracy
        testing_acc_list.append(accuracy)

        print('\nEpoch: {:02d} | Testing Accuracy: {}/{} ({:.3f}%) | Historical Best: {:.3f}%\n'
              .format(epoch, correct, len(test_loader.dataset), accuracy, best_testing_acc))
def execute_demo(language): data = Dataset(language) print("{}: {} training - {} test".format(language, len(data.trainset), len(data.testset))) baseline = Baseline(language) word_frequence = baseline.word_frequences(data.trainset) char_frequence = baseline.char_frequence(data.trainset) lengh_trainset = baseline.lengh_trainset(data.trainset) bigram_counts_word = baseline.bigram_counts_word(data.trainset) pos_dictionary = baseline.pos_dictionary(data.trainset) lengh_char = baseline.lengh_char(data.trainset) bigram_counts_char = baseline.bigram_counts_char(data.trainset) baseline.train(data.trainset, word_frequence, pos_dictionary, bigram_counts_word, lengh_trainset, char_frequence, lengh_char, bigram_counts_char) predictions = baseline.test(data.testset, word_frequence, pos_dictionary, bigram_counts_word, lengh_trainset, char_frequence, lengh_char, bigram_counts_char) gold_labels = [sent['gold_label'] for sent in data.testset] report_score(gold_labels, predictions)
def execute_demo(language): data = Dataset(language) print("{}: {} training - {} test".format(language, len(data.trainset), len(data.testset))) # for sent in data.trainset: # print(sent['target_word'])#sent['sentence'], sent['target_word'], sent['gold_label']) baseline = Baseline(language) baseline.train(data.trainset) predictions_dev = baseline.test(data.devset) predictions_test = baseline.test(data.testset) gold_labels_dev = [sent['gold_label'] for sent in data.devset] gold_labels_test = [sent['gold_label'] for sent in data.testset] print("DEV result:") report_score(gold_labels_dev, predictions_dev, detailed=True) print("TEST result:") report_score(gold_labels_test, predictions_test, detailed=True)
def train(label2index, index2label):
    print("begin of training")
    # list of train videos
    with open('data/split1.train', 'r') as f:
        video_list = f.read().split('\n')[0:-1]

    # read train set
    dataset = Dataset('data', video_list, label2index)

    # train the network
    trainer = Trainer(dataset)
    print("Training trainer\n")
    trainer.train(batch_size=512, n_epochs=2, learning_rate=0.1)
    # trainer.train(batch_size=512, n_epochs=6, learning_rate=0.01)

    # save training model
    trainer.save_model('results/net.model')
    print("Training Done")

    # estimate prior, loss-based lengths, and monte-carlo grammar
    print("Preparing Prior")
    prior = estimate_prior(dataset)
    mean_lengths = loss_based_lengths(dataset)
    grammar = monte_carlo_grammar(dataset, mean_lengths, index2label)
    print("Grammar Done")
    np.savetxt('results/prior', prior)
    np.savetxt('results/mean_lengths', mean_lengths, fmt='%.3f')
    with open('results/grammar', 'w') as f:
        f.write('\n'.join(grammar) + '\n')
    print('All Done!')
def sloss_train(label2index, index2label):
    print('Start Sloss Train!')
    print("! ! ! Cut loss' weight to 0.1")
    print('! ! ! Test rbf affinity')
    print('Change Net')

    # list of train videos
    with open('data/split1.train', 'r') as f:
        video_list = f.read().split('\n')[0:-1]

    # read train set
    dataset = Dataset('data', video_list, label2index)

    # train the network
    trainer = Trainer(dataset)
    print("Training trainer\n")
    trainer.s_train(batch_size=1024, n_epochs=2, learning_rate=0.1)
    trainer.s_train(batch_size=1024, n_epochs=2, learning_rate=0.01)
    trainer.s_train(batch_size=1024, n_epochs=1, learning_rate=0.001)

    # save training model
    trainer.save_model('results/net_test_tensorboard.model')
    print("Training Done")

    # estimate prior, loss-based lengths, and monte-carlo grammar
    print("Preparing Prior")
    prior = estimate_prior(dataset)
    mean_lengths = loss_based_lengths(dataset)
    grammar = monte_carlo_grammar(dataset, mean_lengths, index2label)
    print("Grammar Done")
    np.savetxt('results/prior', prior)
    np.savetxt('results/mean_lengths', mean_lengths, fmt='%.3f')
    with open('results/grammar', 'w') as f:
        f.write('\n'.join(grammar) + '\n')
    print('All Done!')
    print('building embedding_matrix:', embedding_matrix_file_name)
    for word, i in word2idx.items():
        vec = word2vec.get(word)
        if vec is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = vec
    print('saving embedding_matrix')
    pickle.dump(embedding_matrix, open(embedding_matrix_file_name, 'wb'))
    return embedding_matrix


if __name__ == '__main__':
    # Test: load all word vectors with load_word_vec
    all_word2vec = load_word_vec()
    print(len(all_word2vec.keys()))      # wiki_6b: 400001; twitter_27b: 1193515
    print(all_word2vec['hello'].shape)   # (100,)

    # Test: load_word_vec restricted to the words that appear in the corpus
    from utils.dataset import Dataset
    dataset = Dataset()
    dataset.load_dataset()
    print(len(dataset.vocab._word2idx))  # 58
    word2vec = load_word_vec(word2idx=dataset.vocab._word2idx)
    print(word2vec.keys())
    # dict_keys(['the', ',', '.', 'and', 'in', 'is', 'it', 'his', 'but', 'new', 'united', 'during', 'states', 'our', 'my', 'march', 'never', 'least', 'june', 'july', 'your', 'april', 'september', 'january', 'december', 'november', 'california', 'paris', 'sometimes', 'usually', 'spring', 'hot', 'jersey', 'cold', 'favorite', 'orange', 'apple', 'warm', 'quiet', 'fruit', 'busy', 'liked', 'autumn', 'mild', 'freezing', 'lemon', 'grape', 'chilly', 'relaxing', 'snowy'])

    # Test: build_embedding_matrix
    embedding_matrix = build_embedding_matrix(dataset.vocab._word2idx, opt.embed_dim)
    print(embedding_matrix.shape)        # (60, 100)
    print(embedding_matrix[-6])
    print(dataset.vocab._word2idx.keys())