def cross_validation(data_set, k):
    data = dataset.read_data(data_set)
    n, _ = data.shape
    if k == n - 1:
        fold = 1
    else:
        fold = n / k
    variants = [("full", "kmeans"), ("diag", "kmeans")]
    train_results, test_results, models = defaultdict(
        float), defaultdict(float), defaultdict(list)
    for i in xrange(0, n, fold):
        print("Running for {}".format(i))
        test_likelihoods = 0.0
        test_example = data[i:i + fold, :]
        train_examples = np.concatenate((data[:i, :], data[i + k + 1:, :]),
                                        axis=0)
        for config, model_data in average_likelihood(data_set).iteritems():
            t = test(model_data.model_obj, data)
            train_results[config] += t
            models[config].append((t, model_data.model_obj))
    ranked = sorted(train_results.iteritems(), key=lambda x: x[1],
                    reverse=True)
    print "Best model is {}".format(ranked[0])
    best_conf = ranked[0][0]
    best_model = sorted(models[best_conf], key=lambda x: x[0],
                        reverse=True)[0][1]
    large_data = dataset.read_data(data_set[0:-5] + "large")
    t = test(best_model, large_data)
    print "Test results", t
    plotMOG(large_data,
            params((best_model.pi, best_model.mu, best_model.sigma)),
            title="Selected model for cross-validation k = {}".format(k))
    pl.show()
    return ranked[0][1] / (n / fold), t
def main(args):
    logger.info(f"Args: {json.dumps(args.__dict__, indent=2, sort_keys=True)}")

    logger.info("Reading train dataset")
    train_data = read_data(os.path.join(args.data_dir, f"train.query.txt"),
                           min_len=args.min_len)
    logger.info(f" Number of train data: {len(train_data):8d}")
    seen_set = set(train_data)

    if not args.train and os.path.isfile(args.model_path):
        logger.info(f"Loading trie at {args.model_path}")
        trie = pickle.load(open(args.model_path, 'rb'))
    else:
        logger.info("Making trie")
        trie = Trie(train_data)
        os.makedirs(os.path.dirname(args.model_path), exist_ok=True)
        logger.info(f"Saving trie at {args.model_path}")
        sys.setrecursionlimit(100000)
        pickle.dump(trie, open(args.model_path, 'wb'))

    logger.info("Reading test dataset")
    test_data = read_data(os.path.join(args.data_dir, f"test.query.txt"),
                          min_len=args.min_len)
    logger.info(f" Number of test data: {len(test_data):8d}")

    logger.info("Evaluating MPC")
    test_dataset = PrefixDataset(test_data, args.min_prefix_len,
                                 args.min_suffix_len)
    seens = []
    ranks = []
    pranks = []
    rls = []
    for query, prefix in tqdm(test_dataset):
        seen = int(query in seen_set)
        completions = trie.get_mpc(prefix, n_candidates=args.n_candidates,
                                   min_freq=args.min_freq)
        rank = calc_rank(query, completions)
        prank = calc_partial_rank(query, completions)

        rl = [0 for _ in range(args.n_candidates + 1)]
        if seen:
            for i in range(1, len(query) + 1):
                r = calc_rank(query, trie.get_mpc(query[:-i]))
                if r == 0:
                    break
                else:
                    for j in range(r, args.n_candidates + 1):
                        rl[j] += 1

        seens.append(seen)
        ranks.append(rank)
        pranks.append(prank)
        rls.append(rl)

    mrr_logs = mrr_summary(ranks, pranks, seens, args.n_candidates)
    mrl_logs = mrl_summary(rls, seens, args.n_candidates)
    for log in mrr_logs + mrl_logs:
        logger.info(log)
def predict(path):
    # path = config.VALIDATE_PATH
    # path = config.TESTING_PATH
    # path = args.testpath
    data = dataset.read_data(path)

    processed_data = dataset.clean_test_data(data)
    valid_dataset = dataset.TagTestDataset(processed_data)

    processed_data = dataset.clean_test_data(data, False)
    predict_dataset = dataset.PredictTestDataset(processed_data)

    valid_dataloader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE)
    predict_dataloader = torch.utils.data.DataLoader(
        predict_dataset, batch_size=config.VALID_BATCH_SIZE)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    tag_model = BERTBaseJapanese()
    tag_model = nn.DataParallel(tag_model)
    tag_model.load_state_dict(torch.load(config.TAG_MODEL_PATH))
    tag_model = tag_model.to(device)

    predict_model = BERTBaseJapanese()
    predict_model = nn.DataParallel(predict_model)
    predict_model.load_state_dict(torch.load(config.PREDICT_MODEL_PATH))
    predict_model = predict_model.to(device)

    function.predict_fn(valid_dataloader, predict_dataloader, tag_model,
                        predict_model, device)
def run_adaboost(train_or_test, test_file, model_file):
    predicted_labels = []
    if (train_or_test == "test"):
        file = open(model_file, "rb")
        file.seek(0)
        pk_data = pickle.load(file)
        test = read_data(test_file)
        models = pk_data["models"]
        predictions = np.zeros((np.size(test.pixel_values, 0), 2))
        count = 0
        for model in models:
            tmp_predictions = model.predict(test.pixel_values)
            for index in range(0, np.size(test.pixel_values, 0)):
                if (tmp_predictions[index, 0] == 1):
                    if (tmp_predictions[index][1] > predictions[index][1]):
                        predictions[index][0] = class_label[count]
                        predictions[index][1] = tmp_predictions[index][1]
            count += 1
        correct = 0
        for index in range(test.size):
            predicted_labels.append(int(predictions[index][0]))
            if (str(int(predictions[index][0])) == test.orientations[index]):
                correct += 1
        return list(zip(test.image_names, predicted_labels))
    else:
        train = read_data("train-data.txt")
        n_train = np.size(train.pixel_values, 0)
        models = []
        X = train.pixel_values
        for label in class_label:
            y = np.zeros(n_train, dtype=int)
            y[np.array(train.orientations) == label] = 1
            model = AdaboostClassifier(n_hypotheses=200)
            model.fit(X, y)
            models.append(model)
        adaboost_file = open(model_file, "wb")
        pickle.dump({"models": models}, adaboost_file)
        adaboost_file.close()
def main():
    start_time = time()
    for dataset in ['A', 'B', 'C']:
        print 'Dataset:', dataset
        trainX, trainY = read_data(dataset, 'train')
        testX, testY = read_data(dataset, 'test')
        # plt.scatter(trainX[:,0], trainX[:,1], c=trainY, cmap=plt.cm.Paired)
        # plt.show()
        # continue
        # optimal_C = find_optimal_C(trainX, trainY)
        C = 0.001
        while C <= 1000:
            # lamda = 0.001
            # learning_rate = 0.0001
            # while lamda <= 1000:
            # clf = logistic_regression.LogisticRegression(learning_rate=learning_rate, lamda=lamda)
            clf = mysvm.SVC(C=C, is_dual=False, kernel='linear')
            # clf = svm.SVC(C=C, kernel='linear')
            clf.fit(trainX, trainY)
            train_Y = clf.predict(trainX)
            train_score = score(train_Y, trainY)
            test_Y = clf.predict(testX)
            test_score = score(test_Y, testY)
            print 'C:', C
            print 'Number of SVs:', clf.n_support_
            # print 'Margin:', clf.margin
            print 'Dataset:', dataset
            # print 'Lambda:', lamda
            # print 'Learning rate:', learning_rate
            print '---------------------------------------'
            print 'Training/test accuracy:', str(round(train_score*100, 2)) + '%', '/', str(round(test_score*100, 2)) + '%'
            print '---------------------------------------'
            print
            # lamda *= 10
            C *= 10
    print '----------' + str(round(time() - start_time, 2)) + ' seconds.---------------'
def main(args):
    # load textfile
    source_dataset, target_dataset, vocab, vocab_inv = read_data(
        args.source_filename, args.target_filename,
        train_split_ratio=args.train_split, dev_split_ratio=args.dev_split,
        seed=args.seed)
    source_dataset_train, source_dataset_dev, source_dataset_test = source_dataset
    target_dataset_train, target_dataset_dev, target_dataset_test = target_dataset
    print_bold("data #")
    print("train {}".format(len(source_dataset_train)))
    print("dev {}".format(len(source_dataset_dev)))
    print("test {}".format(len(source_dataset_test)))

    vocab_source, vocab_target = vocab
    vocab_inv_source, vocab_inv_target = vocab_inv
    print("vocab {} (source)".format(len(vocab_source)))
    print("vocab {} (target)".format(len(vocab_target)))

    # split into buckets
    source_buckets_train, target_buckets_train = make_buckets(
        source_dataset_train, target_dataset_train)
    if args.buckets_limit is not None:
        source_buckets_train = source_buckets_train[:args.buckets_limit+1]
        target_buckets_train = target_buckets_train[:args.buckets_limit+1]
    print_bold("buckets #data (train)")
    for size, data in zip(bucket_sizes, source_buckets_train):
        print("{} {}".format(size, len(data)))

    print_bold("buckets #data (dev)")
    source_buckets_dev, target_buckets_dev = make_buckets(
        source_dataset_dev, target_dataset_dev)
    if args.buckets_limit is not None:
        source_buckets_dev = source_buckets_dev[:args.buckets_limit+1]
        target_buckets_dev = target_buckets_dev[:args.buckets_limit+1]
    for size, data in zip(bucket_sizes, source_buckets_dev):
        print("{} {}".format(size, len(data)))

    print_bold("buckets #data (test)")
    source_buckets_test, target_buckets_test = make_buckets(
        source_dataset_test, target_dataset_test)
    if args.buckets_limit is not None:
        source_buckets_test = source_buckets_test[:args.buckets_limit+1]
        target_buckets_test = target_buckets_test[:args.buckets_limit+1]
    for size, data in zip(bucket_sizes, source_buckets_test):
        print("{} {}".format(size, len(data)))

    model = load_model(args.model_dir)
    assert model is not None
    if args.gpu_device >= 0:
        cuda.get_device(args.gpu_device).use()
        model.to_gpu()

    print_bold("WER (train)")
    wer_train = compute_mean_wer(model, source_buckets_train,
                                 target_buckets_train, len(vocab_inv_target),
                                 batchsize=args.batchsize, argmax=True)
    print(wer_train)

    print_bold("WER (dev)")
    wer_dev = compute_mean_wer(model, source_buckets_dev, target_buckets_dev,
                               len(vocab_inv_target),
                               batchsize=args.batchsize, argmax=True)
    print(wer_dev)

    print_bold("WER (test)")
    wer_test = compute_mean_wer(model, source_buckets_test,
                                target_buckets_test, len(vocab_inv_target),
                                batchsize=args.batchsize, argmax=True)
    print(wer_test)
def gmm_with_kmeans(k, data):
    data = dataset.read_data(data)
    kmeans = KMeans(k)
    centroids, labels = kmeans.fit(data)
    # plotKMeans(data, kmeans.mu, labels)
    # print centroids.shape
    gmm = GMM(k, mu=centroids)
    plotMOG(data, params(gmm.fit(data)),
            title="GMM with KMeans likelihood={}".format(gmm.likelihood))
    gmm = GMM(k)
    plotMOG(data, params(gmm.fit(data)),
            title="GMM general likelihood={}".format(gmm.likelihood))
    pl.show()
def average_likelihood(data_set, variants=None, mixtures=5):
    models = {}
    data = dataset.read_data(data_set)
    n = len(data)
    if not variants:
        variants = gmm_variants
    ModelData = namedtuple('ModelData', ['avg_likelihood', 'model_obj'])
    for variant, init in variants:
        for i in xrange(1, mixtures + 1):
            gmm = fit_retry(i, data, variant, init)
            models[(variant, init, i)] = ModelData(
                avg_likelihood=gmm.likelihood / n, model_obj=gmm)
    return models
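# A minimal usage sketch (not part of the original source): pick the
# (variant, init, mixtures) configuration returned by average_likelihood()
# above with the highest average log likelihood. Assumes the module-level
# gmm_variants list referenced in average_likelihood().
def pick_best_config(data_set):
    models = average_likelihood(data_set)
    best_config = max(models.iteritems(),
                      key=lambda item: item[1].avg_likelihood)[0]
    return best_config, models[best_config].model_obj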
def main(cfg):
    net = SPRINSeg(6, cfg.fps_n).cuda()
    if len(cfg.resume_path) > 0:
        net.load_state_dict(
            torch.load(hydra.utils.to_absolute_path(cfg.resume_path)))
    opt = radam.RAdam(net.parameters(), cfg.lr, weight_decay=cfg.weight_decay)
    pcs_train, segs_centered_train, segs_train = read_data(
        hydra.utils.to_absolute_path('shapenet_part_seg_hdf5_data'),
        r'ply_data_(train|val).*\.h5')
    pcs_test, segs_centered_test, segs_test = read_data(
        hydra.utils.to_absolute_path('shapenet_part_seg_hdf5_data'),
        r'ply_data_test.*\.h5')
    print(len(pcs_train))
    print(len(pcs_test))
    for e in range(1, cfg.max_epoch):
        run_epoch(net, pcs_train, segs_centered_train, segs_train, opt, e,
                  ds=cfg.npoints, batchsize=cfg.batch_size)
        if e % 10 == 0:
            run_epoch(net, pcs_test, segs_centered_test, segs_test, opt, e,
                      train=False, ds=cfg.npoints, batchsize=cfg.batch_size,
                      rand_rot=True)
            torch.save(net.state_dict(), 'epoch{}.pt'.format(e))
def main(args):
    logger.info(f"Args: {json.dumps(args.__dict__, indent=2, sort_keys=True)}")

    spm_path = os.path.join('spm', args.spm, "spm.model")
    logger.info(f"Loading tokenizer from {spm_path}")
    tokenizer = Tokenizer(spm_path)
    args.ntoken = ntoken = len(tokenizer)
    args.branching_factor = min([args.branching_factor, args.ntoken])
    logger.info(f" Vocab size: {ntoken}")

    n_queries_str = f"{f'only {args.n_queries} samples' if args.n_queries else 'all'} queries from"
    logger.info(f"Reading a dataset ({n_queries_str} test.query.txt)")
    seen_set = set(
        read_data(os.path.join(args.data_dir, "train.query.txt"),
                  min_len=args.min_len))
    test_data = read_data(os.path.join(args.data_dir, "test.query.txt"),
                          min_len=args.min_len)
    if args.n_queries:
        random.seed(args.seed)
        test_data = random.sample(test_data, args.n_queries)
    n_seen_test_data = len([x for x in test_data if x in seen_set])
    n_unseen_test_data = len(test_data) - n_seen_test_data
    logger.info(
        f" Number of test data: {len(test_data):8d} (seen {n_seen_test_data}, unseen {n_unseen_test_data})"
    )

    logger.info(f"Loading model from {args.model_dir}")
    model = model_load(args.model_dir)
    model = model.to(device)

    logger.info('Generation starts!')
    with torch.no_grad():
        generate(model, tokenizer, test_data, args, seen_set=seen_set,
                 calc_mrl=args.calc_mrl)
def train():
    if gfile.Exists('corpus/mapping') and gfile.Exists('corpus/SAD.csv.token'):
        print('Files have already been formed!')
    else:
        dataset.form_vocab_mapping(50000)
        vocab_map, _ = dataset.read_map('corpus/mapping')
        dataset.file_to_token('corpus/SAD.csv', vocab_map)

    d = dataset.read_data('corpus/SAD.csv.token')
    random.seed(SEED)
    random.shuffle(d)
    train_set = d[:int(0.9 * len(d))]
    valid_set = d[int(-0.1 * len(d)):]

    sess = tf.Session()
    Model = create_model(sess, 'train')
    #Model = create_model(sess, 'valid')
    step = 0
    loss = 0
    while (True):
        step += 1
        encoder_input, encoder_length, target = Model.get_batch(train_set)
        '''
        print(encoder_input)
        print(encoder_length)
        print(target)
        exit()
        '''
        loss_train = Model.step(sess, encoder_input, encoder_length, target)
        loss += loss_train / CHECK_STEP

        if step % CHECK_STEP == 0:
            Model.mode = 'valid'
            temp_loss = 0
            for _ in range(100):
                encoder_input, encoder_length, target = Model.get_batch(
                    valid_set)
                loss_valid = Model.step(sess, encoder_input, encoder_length,
                                        target)
                temp_loss += loss_valid / 100.
            Model.mode = 'train'
            print("Train Loss: %s" % loss)
            print("Valid Loss: %s" % temp_loss)
            checkpoint_path = os.path.join('saved_model/', 'dis.ckpt')
            Model.saver.save(sess, checkpoint_path, global_step=step)
            print("Model Saved!")
            loss = 0
def main():
    gtzan_dir = args.root_dir + '/genres/'
    song_samples = 660000
    genres = {
        'metal': 0, 'disco': 1, 'classical': 2, 'hiphop': 3, 'jazz': 4,
        'country': 5, 'pop': 6, 'blues': 7, 'reggae': 8, 'rock': 9
    }

    # Read the data
    print("Reading in the data..")
    if os.path.isfile(os.path.join(
            args.root_dir, "x_gtzan_npy.npy")) and os.path.isfile(
                os.path.join(args.root_dir, "y_gtzan_npy.npy")):
        X = np.load("x_gtzan_npy.npy")
        y = np.load("y_gtzan_npy.npy")
        print("Using saved training data..")
    else:
        X, y = read_data(gtzan_dir, genres, song_samples, to_melspectrogram,
                         debug=False)
        np.save('x_gtzan_npy.npy', X)
        np.save('y_gtzan_npy.npy', y)
        print("Saved data not found, reading again..")
    print("Completed reading the data!")

    print("Splitting into train test and converting to desired shape..")
    X_train, X_test, y_train, y_test = get_train_test(X, y)

    print("Training the model..")
    model = train(X_train, y_train, args.batch_size, args.epochs)
    print("Training completed!")

    print("Doing the inference on the test set..")
    test_acc = test(X_test, y_test, model)
    print('Accuracy of the network on the 5240 test images: %d %%' % (test_acc))
def gmm_log_plot(d, type="full", title=""):
    ks, ll = [], []
    for i in xrange(1, 5):
        data = dataset.read_data(d)
        if type == "kmeans":
            gmm = GMM(i, mu=KMeans(i).fit(data)[0])
        elif type == "diag":
            gmm = GMM(i, variant="diag")
        else:
            gmm = GMM(i)
        gmm.fit(data)
        ks.append(i)
        ll.append(-gmm.likelihood)
        print("Likelihood for k = {} => {}".format(i, gmm.likelihood))
    plot_loglikelihood(ks, ll, label=type, title=title)
    pl.ylabel("Log Likelihood")
    pl.xlabel("Number of mixtures")
    pl.draw()
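# A minimal usage sketch (not part of the original source): draw the log
# likelihood curve of gmm_log_plot() for each covariance/initialisation
# option on one data set. The file name "data_1_small" is an assumption
# borrowed from rank_models() below.
for gmm_type in ("full", "diag", "kmeans"):
    gmm_log_plot("data_1_small", type=gmm_type, title="data_1_small")
pl.show()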
def __init__(self, data_dir='train.txt', num_classes=10, mode='train',
             height=256, width=256):
    """
    init
    :param data_dir: str
    :param mode: str, train or test
    """
    self.curr = 0
    self.mode = mode
    self.height = height
    self.width = width
    self.num = num_classes
    self.img_paths, self.label = read_data(data_dir, mode)
    self.n = len(self.img_paths)
def main(cfg):
    net = SPRINSeg(6, cfg.fps_n).to("cuda")
    net.load_state_dict(
        torch.load(hydra.utils.to_absolute_path('sprin/epoch250.pt')))
    pcs_test, segs_centered_test, segs_test = read_data(
        hydra.utils.to_absolute_path(
            "shapenet_part_seg_hdf5_data/ply_data_test*"))
    print(len(pcs_test))
    run_epoch(net, pcs_test, segs_centered_test, segs_test, None, 1,
              train=False, ds=cfg.npoints, batchsize=1)
def rank_models():
    """ Rank models based on average log likelihood """
    data_sets = ["data_1_small", "data_2_small", "data_3_small"]
    for data_set in data_sets:
        avg_performance = defaultdict(float)
        models, test_results = dict(), {}
        for config, model_data in average_likelihood(data_set).iteritems():
            avg_performance[config] += model_data.avg_likelihood
            models[config] = model_data.model_obj
        ranked = sorted(avg_performance.iteritems(), key=lambda x: x[1],
                        reverse=True)
        large_data = dataset.read_data(data_set[0:-5] + "large")
        print "best model is {}".format(ranked[0])
        for config, _ in ranked:
            test_results[config] = test(models[config], large_data)
        test_ranked = sorted(test_results.iteritems(), key=lambda x: x[1],
                             reverse=True)
        best_model = models[ranked[0][0]]
        # plotMOG(large_data, params((best_model.pi, best_model.mu, best_model.sigma)))
        # pl.show()
        plot_rankings(avg_performance, test_results, title=data_set[0:-5])
def main(_):
    builder = Cifar10DatasetBuilder(buffer_size=FLAGS.shuffle_buffer_size)
    labels, images = read_data(FLAGS.data_path, training=True)
    dataset = builder.build_dataset(
        labels, images, FLAGS.batch_size, training=True)
    model = ResNetCifar10(FLAGS.num_layers,
                          shortcut_connection=FLAGS.shortcut_connection,
                          weight_decay=FLAGS.weight_decay,
                          batch_norm_momentum=FLAGS.batch_norm_momentum)
    optimizer = build_optimizer(init_lr=FLAGS.init_lr, momentum=FLAGS.momentum)
    ckpt = tf.train.Checkpoint(model=model, optimizer=optimizer)
    trainer = ResNetCifar10Trainer(model)
    trainer.train(dataset, optimizer, ckpt, FLAGS.batch_size,
                  FLAGS.num_iterations, FLAGS.log_per_iterations,
                  FLAGS.ckpt_path)
def main(args): logger.info(f"Args: {json.dumps(args.__dict__, indent=2, sort_keys=True)}") spm_path = os.path.join('spm', args.spm, "spm.model") args.sample = parse_sample_options(args.sample) logger.info(f"Loading tokenizer from {spm_path}") tokenizer = Tokenizer(spm_path) args.ntoken = ntoken = len(tokenizer) logger.info(f" Vocabulary size: {ntoken}") logger.info("Reading dataset") data = {} for x in ['train', 'valid', 'test']: data[x] = read_data(os.path.join(args.data_dir, f"{x}.query.txt"), min_len=args.min_len) logger.info(f" Number of {x:>5s} data: {len(data[x]):8d}") logger.info("Preparing model and optimizer") config = LMConfig(ntoken, args.ninp, args.nhid, args.nlayers, args.dropouti, args.dropoutr, args.dropouth, args.dropouto) model = LanguageModel(config).to(device) params = get_params(model) logger.info( f" Number of model parameters: {sum(p.numel() for p in params)}") optimizer = torch.optim.Adam(params) if args.resume: logger.info(f"Loading model from {args.resume}") model_load(args.resume, model, optimizer) model = model.to(device) if n_gpu > 1: logger.info(f"Making model as data parallel") model = torch.nn.DataParallel(model, dim=1) train(model, optimizer, tokenizer, data['train'], data['valid'], args) test(model, tokenizer, data['test'], args)
def main(_):
    builder = Cifar10DatasetBuilder()
    labels, images = read_data(FLAGS.data_path, training=False)
    dataset = builder.build_dataset(labels, images, batch_size=10000,
                                    training=False)
    model = ResNetCifar10(FLAGS.num_layers,
                          shortcut_connection=FLAGS.shortcut_connection)
    ckpt = tf.train.Checkpoint(model=model)
    evaluator = ResNetCifar10Evaluator(model)
    latest_ckpt = tf.train.latest_checkpoint(FLAGS.ckpt_path)
    if latest_ckpt:
        print('loading checkpoint %s ' % latest_ckpt)
        ckpt.restore(latest_ckpt).expect_partial()
    loss, acc = evaluator.evaluate(dataset, 10000)
    print('Eval loss: %s, eval accuracy: %s' % (loss, acc))
import os
from typing import List

from absl import app
from absl import flags

import dataset
import model

FLAGS = flags.FLAGS
flags.DEFINE_string("logs_dir", "logs", "Where the log file is saved.")
flags.DEFINE_string("logs_file", "logs.txt", "Where the logs are saved.")
flags.DEFINE_string("loss_binary", "./loss.py", "Binary to loss file")
flags.DEFINE_integer("plot_every_n_iterations", 10,
                     "How often to plot learning progress.")

dataset = dataset.MaskingDataset()
for filename in os.listdir("data"):
    if filename.startswith("masker") and filename.endswith(".txt"):
        dataset.read_data("data", filename)

masking_frequency = float(os.environ["MASK_FREQ"])
probe_level = int(os.environ["PROBE_LEVEL"])
masking_level = int(os.environ["MASKING_LEVEL"])

data = dataset.get_curve_data(masking_frequency=masking_frequency,
                              probe_level=probe_level,
                              masking_level=masking_level)
actual_frequencies, actual_amplitudes = zip(*data)
model_class = model.Model(masking_frequency, probe_level, masking_level)


def calculate_model_output(inputs: List[float], pars: List[float]) -> float:
    model_vars = inputs + model_class.parameters_from_learned(pars)
    output = model_class.function(*model_vars)
    return output
def eval_bleu(args, model, tokenizer, file_type='test', num=99999999):
    dataset = CodeChangeDataset(tokenizer, args, logger, file_type=file_type,
                                block_size=args.block_size, mode='test')
    test_sampler = SequentialSampler(dataset)
    test_dataloader = DataLoader(dataset, sampler=test_sampler, batch_size=1)
    model.to(args.device)
    model.zero_grad()
    model.eval()

    preds = []
    for step, (batch, token_labels) in enumerate(
            tqdm(test_dataloader, total=min(num, len(dataset)))):
        if step >= num:
            break
        inputs = batch.to(args.device)
        with torch.no_grad():
            beam_size = args.beam_size
            m = torch.nn.LogSoftmax(dim=-1)
            outputs = model(inputs)[1]
            p = []
            zero = torch.cuda.LongTensor(1).fill_(0)
            for i in range(inputs.shape[0]):
                past_hidden = []
                for x in outputs:
                    _p = x[:, i:i + 1]
                    _q = _p.expand(-1, beam_size, -1, -1, -1)
                    past_hidden.append(_q)
                # context_mask=source_mask[i:i+1,:].expand(beam_size,-1)
                beam = Beam(beam_size, tokenizer.bos_token_id,
                            tokenizer.eos_token_id)
                input_ids = None
                for _ in range(162):
                    if beam.done():
                        break
                    input_ids = beam.getCurrentState()
                    transformer_outputs = model(input_ids, past=past_hidden)
                    out = m(transformer_outputs[0][:, -1, :]).data
                    beam.advance(out)
                    past_hidden = [
                        x.data.index_select(1, beam.getCurrentOrigin())
                        for x in transformer_outputs[1]
                    ]
                hyp = beam.getHyp(beam.getFinal())
                pred = beam.buildTargetTokens(hyp)[:beam_size]
                pred = [
                    torch.cat([x.view(-1) for x in p] +
                              [zero] * (162 - len(p))).view(1, -1)
                    for p in pred
                ]
                p.append(torch.cat(pred, 0).unsqueeze(0))
            p = torch.cat(p, 0)
            for pred in p:
                t = pred[0].cpu().numpy()
                t = list(t)
                if 0 in t:
                    t = t[:t.index(0)]
                text = tokenizer.decode(t, clean_up_tokenization_spaces=False)
                preds.append(text)

    golds = []
    datas = read_data(data_dir=args.data_dir, file_type=file_type)
    for (src, tgt) in datas[:num]:
        golds.append(tgt)

    assert len(preds) == len(golds), 'Pred %d\tGold %d' % (len(preds),
                                                           len(golds))

    EM = []
    with open(os.path.join(args.output_dir, f"{file_type}.output"), 'w',
              encoding='utf-8') as f, \
         open(os.path.join(args.output_dir, f"{file_type}.gold"), 'w',
              encoding='utf-8') as f1:
        for pred, gold in zip(preds, golds):
            f.write(pred + '\n')
            f1.write(gold + '\n')
            EM.append(pred.split() == gold.split())

    bleu_score = round(
        _bleu(os.path.join(args.output_dir, f"{file_type}.gold"),
              os.path.join(args.output_dir, f"{file_type}.output")), 2)
    EM = round(np.mean(EM) * 100, 2)
    return bleu_score, EM
import os
import math

import numpy as np
import pyopencl as cl

from dataset import read_data

os.environ["PYOPENCL_CTX"] = "0"

train_m, train_u, train_r, train_J, test_m, test_u, test_r, test_J, u2uid, m2mid = read_data(
)

dimuser = 50
dimmovie = 10
round_number = 0

if os.path.isfile(f"model_uvec_{round_number}.npy"):
    uvec = np.load(f"model_uvec_{round_number}.npy")
else:
    uvec = np.random.rand(len(u2uid) * dimuser).astype(np.float32)
    uvec *= np.float32(0.1)

if os.path.isfile(f"model_mvec_{round_number}.npy"):
    mvec = np.load(f"model_mvec_{round_number}.npy")
else:
    mvec = np.random.rand(len(m2mid) * dimmovie).astype(np.float32)
    mvec *= np.float32(0.1)

uvecd = np.zeros((len(u2uid) * dimuser)).astype(np.float32)
mvecd = np.zeros((len(m2mid) * dimmovie)).astype(np.float32)
def run_train(ps_hosts, worker_hosts, job_name, task_index, model_f,
              data_path, output_path, param_path):
    # ======================================
    # Variables
    ps_hosts = ps_hosts.split(",")
    worker_hosts = worker_hosts.split(",")
    param_dict = load_json(param_path)

    # Create a cluster from the parameter server and worker hosts.
    cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})

    # Create and start a server for the local task.
    server = tf.train.Server(cluster, job_name=job_name,
                             task_index=task_index)

    if job_name == "ps":
        server.join()
    elif job_name == "worker":
        # Load Data
        (X_train, Y_train, X_valid, Y_valid, _, _) = read_data(
            data_path, param_dict['train_ratio'], param_dict['valid_ratio'])

        print("=" * 30)
        print("X_train shape: {}".format(X_train.shape))
        print("Y_train shape: {}".format(Y_train.shape))
        print("X_valid shape: {}".format(X_valid.shape))
        print("Y_valid shape: {}".format(Y_valid.shape))
        print("=" * 30)

        # Inference output dimension
        output_dim = len(Y_train[0])

        # Check is_chief
        is_chief = task_index == 0

        # Assigns ops to the local worker by default.
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device="/job:worker/task:%d" % task_index,
                    cluster=cluster)):
            # Build model...

            # Datasets
            train_X_dataset = tf.data.Dataset.from_tensor_slices(X_train)
            train_Y_dataset = tf.data.Dataset.from_tensor_slices(Y_train)
            train_dataset = tf.data.Dataset.zip(
                (train_X_dataset, train_Y_dataset))
            train_dataset = train_dataset.shuffle(
                param_dict['dataset_shuffle_buffer_size']).batch(
                    param_dict['batch_size']).repeat(param_dict['n_epoch'])

            if is_chief:
                valid_X_dataset = tf.data.Dataset.from_tensor_slices(X_valid)
                valid_Y_dataset = tf.data.Dataset.from_tensor_slices(Y_valid)
                valid_dataset = tf.data.Dataset.zip(
                    (valid_X_dataset, valid_Y_dataset))
                valid_dataset = valid_dataset.shuffle(
                    param_dict['dataset_shuffle_buffer_size']).batch(
                        param_dict['batch_size'])

            # Feedable Iterator
            handle = tf.placeholder(tf.string, shape=[])
            iterator = tf.data.Iterator.from_string_handle(
                handle, train_dataset.output_types,
                train_dataset.output_shapes)

            # Iterators
            train_iterator = train_dataset.make_one_shot_iterator()
            train_handle_tensor = train_iterator.string_handle()
            if is_chief:
                valid_iterator = valid_dataset.make_initializable_iterator()
                valid_handle_tensor = valid_iterator.string_handle()

            X, Y = iterator.get_next()
            is_training = tf.placeholder_with_default(False, shape=None,
                                                      name="is_training")

            global_step = tf.contrib.framework.get_or_create_global_step()

            logits = mlp(X=X, output_dim=output_dim, is_training=is_training,
                         **param_dict['model_param'])
            Y_pred = slim.softmax(logits)
            loss = slim.losses.softmax_cross_entropy(logits, Y)
            accuracy, correct = calc_metric(Y, Y_pred)

            train_op = tf.train.AdamOptimizer(
                param_dict['learning_rate']).minimize(
                    loss, global_step=global_step)

            tf.add_to_collection('X', X)
            tf.add_to_collection('Y_pred', Y_pred)

            #saved_model_tensor_dict = build_saved_model_graph(X,
            #                                                  Y_pred,
            #                                                  saved_model_path)

        # The StopAtStepHook handles stopping after running given steps.
        # hooks = [tf.train.StopAtStepHook(last_step=1000000)]

        # The MonitoredTrainingSession takes care of session initialization,
        # restoring from a checkpoint, saving to a checkpoint, and closing
        # when done or an error occurs.
        with tf.train.MonitoredTrainingSession(
                master=server.target,
                is_chief=is_chief,
                checkpoint_dir=output_path,
                # hooks=hooks,
        ) as mon_sess:
            # Get dataset handle
            train_handle = mon_sess.run(train_handle_tensor)
            valid_handle = mon_sess.run(valid_handle_tensor)

            # Metric window
            acc_window = [0.] * TRAIN_METRIC_WINDOW
            loss_window = [0.] * TRAIN_METRIC_WINDOW

            batch_i = 0
            while not mon_sess.should_stop():
                # Run a training step asynchronously.
                mon_sess.run(train_op, feed_dict={
                    is_training: True,
                    handle: train_handle,
                })

                if is_chief:
                    train_accuracy, train_loss = mon_sess.run(
                        [accuracy, loss],
                        feed_dict={
                            is_training: False,
                            handle: train_handle,
                        })
                    acc_window = acc_window[1:] + [train_accuracy]
                    loss_window = loss_window[1:] + [train_loss]

                    if batch_i % VERBOSE_INTERVAL == 0:
                        recent_mean_train_accuracy = sum(acc_window) / len(
                            acc_window)
                        recent_mean_train_loss = sum(loss_window) / len(
                            loss_window)

                        valid_i = 0
                        valid_correct = 0
                        valid_loss = 0
                        valid_total_num = 0
                        mon_sess.run(valid_iterator.initializer)
                        while True:
                            try:
                                (batch_Y_pred, batch_valid_correct,
                                 batch_valid_loss) = mon_sess.run(
                                     [Y_pred, correct, loss],
                                     feed_dict={
                                         is_training: False,
                                         handle: valid_handle,
                                     })
                                curr_batch_num = batch_Y_pred.shape[0]
                                valid_correct += batch_valid_correct.sum()
                                valid_loss += batch_valid_loss * curr_batch_num
                                valid_total_num += curr_batch_num
                                valid_i += 1
                            except tf.errors.OutOfRangeError:
                                break

                        valid_accuracy = valid_correct / valid_total_num
                        valid_loss = valid_loss / valid_total_num

                        print("-" * 30)
                        print("recent_mean_train_accuracy : {}".format(
                            recent_mean_train_accuracy))
                        print("recent_mean_train_loss : {}".format(
                            recent_mean_train_loss))
                        print("valid_accuracy : {}".format(valid_accuracy))
                        print("valid_loss : {}".format(valid_loss))

                batch_i += 1
saver = tf.train.Saver(max_to_keep=None)
epoch = int(FLAGS.epoch)

with tf.Session() as sess:
    merged = tf.summary.merge_all()
    init = tf.initialize_all_variables()
    sess.run(init)
    file_writer = tf.summary.FileWriter(FLAGS.log_path,
                                        tf.get_default_graph())
    validation_image, validation_msg = read_data(FLAGS.input_path,
                                                 FLAGS.train_size,
                                                 FLAGS.input_size)

    if tf.train.get_checkpoint_state(FLAGS.checkpoint_path):
        ckpt = tf.train.latest_checkpoint(FLAGS.checkpoint_path)
        saver.restore(sess, ckpt)
        start_point = int(ckpt.split('-')[-1])
        print("\nLoad success, continuing from checkpoint: epoch-%d" %
              (start_point + 1))
    else:
        print("\nNo previous checkpoint. Training from scratch..")
        start_point = 0

    for j in range(start_point, epoch):
        if j + 1 > epoch / 3:  # reduce learning rate
            lr_ = FLAGS.learning_rate * 0.1
import preprocess as pp
import dataset as dataset

# extract features and store them to a txt file
pp.Make_feature_set()
flower = dataset.read_data("data/feature.txt")
def leave_one_out_validation(data_set):
    data = dataset.read_data(data_set)
    return cross_validation(data_set, data.shape[0] - 1)
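# A minimal usage sketch (not part of the original source): run leave-one-out
# validation on one of the small data sets; "data_1_small" is an assumption
# borrowed from rank_models().
avg_train_result, test_result = leave_one_out_validation("data_1_small")
print "LOO avg train result {}, test result {}".format(avg_train_result,
                                                       test_result)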
def main(args):
    vocab, vocab_inv = load_vocab(args.model_dir)
    vocab_source, vocab_target = vocab
    vocab_inv_source, vocab_inv_target = vocab_inv

    source_dataset, target_dataset = read_data(vocab_source, vocab_target,
                                               args.source_train,
                                               args.target_train,
                                               args.source_dev,
                                               args.target_dev,
                                               args.source_test,
                                               args.target_test,
                                               reverse_source=True)

    source_dataset_train, source_dataset_dev, source_dataset_test = source_dataset
    target_dataset_train, target_dataset_dev, target_dataset_test = target_dataset

    printb("data #")
    if len(source_dataset_train) > 0:
        print("train {}".format(len(source_dataset_train)))
    if len(source_dataset_dev) > 0:
        print("dev {}".format(len(source_dataset_dev)))
    if len(source_dataset_test) > 0:
        print("test {}".format(len(source_dataset_test)))
    print("vocab {} (source)".format(len(vocab_source)))
    print("vocab {} (target)".format(len(vocab_target)))

    # split into buckets
    source_buckets_train = None
    if len(source_dataset_train) > 0:
        printb("buckets #data (train)")
        source_buckets_train, target_buckets_train = make_buckets(
            source_dataset_train, target_dataset_train)
        if args.buckets_slice is not None:
            source_buckets_train = source_buckets_train[:args.buckets_slice + 1]
            target_buckets_train = target_buckets_train[:args.buckets_slice + 1]
        for size, data in zip(bucket_sizes, source_buckets_train):
            print("{} {}".format(size, len(data)))

    source_buckets_dev = None
    if len(source_dataset_dev) > 0:
        printb("buckets #data (dev)")
        source_buckets_dev, target_buckets_dev = make_buckets(
            source_dataset_dev, target_dataset_dev)
        if args.buckets_slice is not None:
            source_buckets_dev = source_buckets_dev[:args.buckets_slice + 1]
            target_buckets_dev = target_buckets_dev[:args.buckets_slice + 1]
        for size, data in zip(bucket_sizes, source_buckets_dev):
            print("{} {}".format(size, len(data)))

    source_buckets_test = None
    if len(source_dataset_test) > 0:
        printb("buckets #data (test)")
        source_buckets_test, target_buckets_test = make_buckets(
            source_dataset_test, target_dataset_test)
        if args.buckets_slice is not None:
            source_buckets_test = source_buckets_test[:args.buckets_slice + 1]
            target_buckets_test = target_buckets_test[:args.buckets_slice + 1]
        for size, data in zip(bucket_sizes, source_buckets_test):
            print("{} {}".format(size, len(data)))

    model = load_model(args.model_dir)
    assert model is not None

    if args.gpu_device >= 0:
        cuda.get_device(args.gpu_device).use()
        model.to_gpu()

    def mean(l):
        return sum(l) / len(l)

    with chainer.using_config("train", False):
        if source_buckets_train is not None:
            printb("WER (train)")
            wer_train = compute_error_rate_buckets(model,
                                                   source_buckets_train,
                                                   target_buckets_train,
                                                   len(vocab_target),
                                                   args.beam_width, args.alpha)
            print(mean(wer_train), wer_train)

        if source_buckets_dev is not None:
            printb("WER (dev)")
            wer_dev = compute_error_rate_buckets(model, source_buckets_dev,
                                                 target_buckets_dev,
                                                 len(vocab_target),
                                                 args.beam_width, args.alpha)
            print(mean(wer_dev), wer_dev)

        if source_buckets_test is not None:
            printb("WER (test)")
            wer_test = compute_error_rate_buckets(model, source_buckets_test,
                                                  target_buckets_test,
                                                  len(vocab_target),
                                                  args.beam_width, args.alpha)
            print(mean(wer_test), wer_test)
def run_submit(args):
    augment = ['null']
    out_dir = args.out_dir + f'/{args.model_name}'
    initial_checkpoint = args.initial_checkpoint
    batch_size = args.batch_size

    ## setup out_dir
    os.makedirs(out_dir + '/submit', exist_ok=True)

    log = Logger()
    log.open(out_dir + '/log.submit.txt', mode='a')
    log.write('\n--- [START %s] %s\n\n' % (IDENTIFIER, '-' * 64))
    log.write('\t%s\n' % COMMON_STRING)
    log.write('\n')
    log.write('\tSEED = %u\n' % SEED)
    log.write('\t__file__ = %s\n' % __file__)
    log.write('\tout_dir = %s\n' % out_dir)
    log.write('\n')
    log.write('submitting .... @ %s\n' % str(augment))
    log.write('initial_checkpoint = %s\n' % initial_checkpoint)
    log.write('\n')

    if 1:  # save
        log.write('** dataset setting **\n')

        files_train = [f'train_image_data_{fid}.feather' for fid in range(4)]
        data = read_data(args.data_dir, files_train)
        df = pd.read_csv(args.df_path)
        valid_split = np.load(args.data_dir +
                              '/valid_b_fold1_15985.npy').tolist()
        valid_df = df[df['image_id'].isin(valid_split)]

        test_dataset = KaggleDataset(
            df=df,
            data=data,
            idx=valid_df.index.values,
            augment=valid_augment,
        )
        log.write('\n')

        ## net
        log.write('** net setting **\n')
        if args.model_name == 'serex50':
            net = Serex50_Net().cuda()
        elif args.model_name == 'effnetb3':
            net = EfficientNet_3().cuda()
        else:
            raise NotImplementedError

        net.load_state_dict(torch.load(
            initial_checkpoint,
            map_location=lambda storage, loc: storage), strict=True)

        image_id, truth, probability = do_evaluate(net, test_dataset,
                                                   batch_size, augment)

    if 1:  # save
        write_list_to_file(out_dir + '/submit/image_id.txt', image_id)
        write_pickle_to_file(out_dir + '/submit/probability.pickle',
                             probability)
        write_pickle_to_file(out_dir + '/submit/truth.pickle', truth)

    if 1:
        image_id = read_list_from_file(out_dir + '/submit/image_id.txt')
        probability = read_pickle_from_file(out_dir +
                                            '/submit/probability.pickle')
        truth = read_pickle_from_file(out_dir + '/submit/truth.pickle')
        num_test = len(image_id)

    if 1:
        recall, avgerage_recall = compute_kaggle_metric(probability, truth)
        log.write('avgerage_recall : %f\n' % (avgerage_recall))

        for i, name in enumerate(TASK_NAME):
            log.write('%28s %f\n' % (name, recall[i]))
        log.write('\n')
# classification classes
classes = ['dogs', 'cats']
num_classes = len(classes)

# validation split ratio
validation_rate = 0.25
# input image size
img_size = 64
# number of input image channels
num_channels = 3
# training data path
train_data_path = 'training_data'

# read the data and preprocess it
data = dataset.read_data(train_data_path, img_size, classes, validation_rate)
# print(format(len(data.train.labels)))

# set the parameters and build the network
# placeholders
x = tf.placeholder(tf.float32,
                   shape=[None, img_size, img_size, num_channels], name='x')
y_true = tf.placeholder(tf.float32, shape=[None, num_classes], name='y_true')
y_true_cls = tf.argmax(y_true, dimension=1)

# network structure parameters
# first layer
# convolution
def main(args):
    # load textfile
    train_dataset, dev_dataset, test_dataset, vocab, vocab_inv = read_data(
        args.text_filename, train_split_ratio=args.train_split,
        dev_split_ratio=args.dev_split, seed=args.seed)
    save_vocab(args.model_dir, vocab, vocab_inv)
    vocab_size = len(vocab)
    print_bold("data # hash")
    print("train {} {}".format(len(train_dataset), hash(str(train_dataset))))
    print("dev {} {}".format(len(dev_dataset), hash(str(dev_dataset))))
    print("test {} {}".format(len(test_dataset), hash(str(test_dataset))))
    print("vocab {}".format(vocab_size))

    # split into buckets
    train_buckets = make_buckets(train_dataset)

    print_bold("buckets #data (train)")
    if args.buckets_limit is not None:
        train_buckets = train_buckets[:args.buckets_limit + 1]
    for size, data in zip(bucket_sizes, train_buckets):
        print("{} {}".format(size, len(data)))

    print_bold("buckets #data (dev)")
    dev_buckets = make_buckets(dev_dataset)
    if args.buckets_limit is not None:
        dev_buckets = dev_buckets[:args.buckets_limit + 1]
    for size, data in zip(bucket_sizes, dev_buckets):
        print("{} {}".format(size, len(data)))

    print_bold("buckets #data (test)")
    test_buckets = make_buckets(test_dataset)
    for size, data in zip(bucket_sizes, test_buckets):
        print("{} {}".format(size, len(data)))

    # to maintain equilibrium
    min_num_data = 0
    for data in train_buckets:
        if min_num_data == 0 or len(data) < min_num_data:
            min_num_data = len(data)
    repeats = []
    for data in train_buckets:
        repeat = len(data) // min_num_data
        repeat = repeat + 1 if repeat == 0 else repeat
        repeats.append(repeat)

    num_updates_per_iteration = 0
    for repeat, data in zip(repeats, train_buckets):
        num_updates_per_iteration += repeat * args.batchsize
    num_iteration = len(train_dataset) // num_updates_per_iteration + 1

    # init
    model = load_model(args.model_dir)
    if model is None:
        model = RNNModel(vocab_size, args.ndim_embedding, args.num_layers,
                         ndim_h=args.ndim_h, kernel_size=args.kernel_size,
                         pooling=args.pooling, zoneout=args.zoneout,
                         dropout=args.dropout, wgain=args.wgain,
                         densely_connected=args.densely_connected,
                         ignore_label=ID_PAD)
    if args.gpu_device >= 0:
        chainer.cuda.get_device(args.gpu_device).use()
        model.to_gpu()

    # setup an optimizer
    if args.eve:
        optimizer = Eve(alpha=args.learning_rate, beta1=0.9)
    else:
        optimizer = optimizers.Adam(alpha=args.learning_rate, beta1=0.9)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(args.grad_clip))
    optimizer.add_hook(chainer.optimizer.WeightDecay(args.weight_decay))
    min_learning_rate = 1e-7
    prev_ppl = None
    total_time = 0

    def mean(l):
        return sum(l) / len(l)

    # training
    for epoch in xrange(1, args.epoch + 1):
        print("Epoch", epoch)
        start_time = time.time()
        for itr in xrange(1, num_iteration + 1):
            sys.stdout.write("\r{} / {}".format(itr, num_iteration))
            sys.stdout.flush()

            for repeat, dataset in zip(repeats, train_buckets):
                for r in xrange(repeat):
                    batch = sample_batch_from_bucket(dataset, args.batchsize)
                    source, target = make_source_target_pair(batch)
                    if model.xp is cuda.cupy:
                        source = cuda.to_gpu(source)
                        target = cuda.to_gpu(target)
                    model.reset_state()
                    Y = model(source)
                    loss = softmax_cross_entropy(Y, target,
                                                 ignore_label=ID_PAD)
                    optimizer.update(lossfun=lambda: loss)

            if itr % args.interval == 0 or itr == num_iteration:
                save_model(args.model_dir, model)

        # show log
        sys.stdout.write("\r" + stdout.CLEAR)
        sys.stdout.flush()
        print_bold(" accuracy (sampled train)")
        acc_train = compute_random_accuracy(model, train_buckets,
                                            args.batchsize)
        print(" ", mean(acc_train), acc_train)
        print_bold(" accuracy (dev)")
        acc_dev = compute_accuracy(model, dev_buckets, args.batchsize)
        print(" ", mean(acc_dev), acc_dev)
        print_bold(" ppl (sampled train)")
        ppl_train = compute_random_perplexity(model, train_buckets,
                                              args.batchsize)
        print(" ", mean(ppl_train), ppl_train)
        print_bold(" ppl (dev)")
        ppl_dev = compute_perplexity(model, dev_buckets, args.batchsize)
        ppl_dev_mean = mean(ppl_dev)
        print(" ", ppl_dev_mean, ppl_dev)

        elapsed_time = (time.time() - start_time) / 60.
        total_time += elapsed_time
        print(" done in {} min, lr = {}, total {} min".format(
            int(elapsed_time), optimizer.alpha, int(total_time)))

        # decay learning rate
        if prev_ppl is not None and ppl_dev_mean >= prev_ppl and \
                optimizer.alpha > min_learning_rate:
            optimizer.alpha *= 0.5
        prev_ppl = ppl_dev_mean
import unittest

from embeddings import batcher
from dataset import Dataset
import dataset


class MyTestCase(unittest.TestCase):
    def test_something(self):
        text = [0, 1, 2, 3, 4, 5, 6, 7, 8]
        batchconfig = batcher.BatcherConfig(8, 2, 1)
        batch = batcher.Batcher(text, batchconfig)
        batches, labels = batch.CBOW_batch()
        print batches
        print labels
        # self.assertEqual(batches, ['originated', 'originated', 'as', 'as', 'a', 'a', 'term', 'term'])


if __name__ == '__main__':
    unittest.main()
    text = dataset.read_data("text")
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.tensorboard import SummaryWriter

### INITIALIZE PATHS ###
try:
    os.mkdir(config.DAE_CHECKPOINT)
    os.mkdir(config.BVAE_CHECKPOINT)
    os.mkdir(config.SCAN_CHECKPOINT)
    os.mkdir(config.RECOMB_CHECKPOINT)
    os.mkdir(config.VIS_RECON_PATH)
    os.mkdir(config.VIS_LATENT_TRAVERSAL)
except FileExistsError:
    # except clause assumed: the original snippet is truncated after the
    # mkdir calls, so this guard is an assumption to keep the fragment valid
    pass

# Read RGB data from data folder and save
dataset.read_data(config.DATA_PATH, config.UNNORM_DATA_PATH)
# Turn data into hsv and normalize to mean=0 std=1
channel_mean, channel_std = dataset.normalize_data(NORM_DATA_PATH,
                                                   UNNORM_DATA_PATH)
# load into array
data_set = dataset.load_data(NORM_DATA_PATH)
# create one hot array
one_hots = dataloader.generate_one_hots(data_set.shape[0])
# create shuffle indexes
perm = dataset.index_generate_random(data_set)
# shuffle the data
data_set = data_set[perm]
one_hots = one_hots[perm]

train_data, test_data = dataset.split_train_test(data_set, config.TRAIN_SIZE)
oh_train_data, oh_test_data = dataset.split_train_test(one_hots,
                                                       config.TRAIN_SIZE)