def train(args):
    device = args.device
    load_path = args.load_path

    # load data
    train_data = load_data('train')
    val_data = load_data('validation')

    # load model
    with tf.device('/gpu:%d' % device):
        model = get_model('policy')

    # trainer init
    optimizer = Config.optimizer
    train_step = optimizer.minimize(model.loss)

    # init session and saver
    sess = tf.InteractiveSession()
    saver = tf.train.Saver()
    if load_path is None:
        sess.run(tf.global_variables_initializer())
    else:
        saver.restore(sess, load_path)
        print("Model restored from %s" % load_path)

    # accuracy
    pred = tf.reshape(model.pred, [-1, 9 * 10 * 16])
    label = tf.reshape(model.label, [-1, 9 * 10 * 16])
    correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(label, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    logging.basicConfig(filename='log.txt', level=logging.DEBUG)

    # train steps
    for i in range(Config.n_epoch):
        # training step
        batch_data, batch_label = train_data.next_batch(Config.minibatch_size)
        input_dict = {model.label: batch_label}
        for var, data in zip(model.inputs, batch_data):
            input_dict[var] = data
        sess.run(train_step, feed_dict=input_dict)

        # evaluation step
        if (i + 1) % Config.evalue_point == 0:
            batch_data, batch_label = val_data.next_batch(Config.minibatch_size)
            val_dict = {model.label: batch_label}
            for var, data in zip(model.inputs, batch_data):
                val_dict[var] = data
            score = accuracy.eval(feed_dict=val_dict)
            print("epoch %d, accuracy is %.2f" % (i, score))
            logging.info("epoch %d, accuracy is %.2f" % (i, score))

        # save step
        if (i + 1) % Config.check_point == 0:
            save_path = saver.save(sess, "%s/epoch-%d" % (Config.save_path, i))
            print("Model saved in file: %s" % save_path)
            logging.info("Model saved in file: %s" % save_path)
def main(args):
    model = utils.get_models(
        bert_config=args.bert_config,
        pred_n_labels=args.pred_n_labels,
        arg_n_labels=args.arg_n_labels,
        n_arg_heads=args.n_arg_heads,
        n_arg_layers=args.n_arg_layers,
        pos_emb_dim=args.pos_emb_dim,
        use_lstm=args.use_lstm,
        device=args.device)

    if torch.cuda.is_available():
        map_location = lambda storage, loc: storage.cuda()
    else:
        map_location = 'cpu'
    model.load_state_dict(
        torch.load(args.model_path, map_location=map_location))
    model.zero_grad()
    model.eval()

    loader = load_data(data_path=args.test_data_path,
                       batch_size=args.batch_size,
                       tokenizer_config=args.bert_config,
                       train=False)

    start = time.time()
    extract(args, model, loader, args.save_path)
    print("TIME: ", time.time() - start)

    test_results = do_eval(args.save_path, args.test_gold_path)
    utils.print_results("TEST RESULT", test_results,
                        ["F1 ", "PREC", "REC ", "AUC "])
def main():
    '''
    # Use these three to get the data loaded, targets loaded, and the accessions
    # stripped (otherwise use dataset.py load_data()):

    # get classifications
    type_dict = get_targets()
    # load data
    scores = get_data()
    # get arrays of scores and targets
    data, targets = get_arrays(type_dict, scores)
    '''
    data, targets = load_data()

    # tune model parameters
    # tune_model_parameters(data, targets)

    # get ROC curves
    # get_roc(data, targets)

    # get confusion matrix
    get_conf_matrix(data, targets)

    # TODO: re-run the ROC curves and the confusion matrix using predictions
    # from cross-validation rather than a train/test split.
def _load_data(self, opts, seed):
    if opts['dataset'].lower() in ('mnist', 'fashion', 'cifar10', 'svhn'):
        (self.data, self.labels), (self.test_data, self.test_labels) = \
            load_data(opts['dataset'], seed, imbalance=True)
        if 'augment_x' in opts and opts['augment_x']:
            self.data, self.labels = self.oversampling(
                opts, self.data, self.labels, seed)
        self.num_points = len(self.data)
    else:
        raise ValueError('Unknown %s' % opts['dataset'])

    self.class_counts = [
        np.count_nonzero(self.labels == c)
        for c in range(opts['n_classes'])
    ]
    print("[ statistic ]")
    print("Total train: ", self.num_points)
    print(self.class_counts)
    print("Total test: ", len(self.test_labels))
    print([
        np.count_nonzero(self.test_labels == c)
        for c in range(opts['n_classes'])
    ])
def main(args):
    data_train, data_eval, concept2idx, tag2idx, word2concept = dataset.load_data()
    m = args[0]
    model_map = {
        "baseline": models.Baseline,
        "word": models.Word,
        "tag": models.Tag,
        "concept": models.Concept,
        "tag_word": models.TagWord,
        "concept_tag": models.ConceptTag,
        "concept_word": models.ConceptWord,
        "concept_tag_word": models.ConceptTagWord
    }
    if m in model_map:
        lstm_dim, hidden_dim, epoch, batch, early_stop, patience = args[1:]
        model = model_map[m](concept2idx=concept2idx,
                             tag2idx=tag2idx,
                             word2concept=word2concept,
                             lstm_dim=int(lstm_dim),
                             hidden_dim=int(hidden_dim))
        model.train(data_train, int(epoch), int(batch), early_stop == "1",
                    int(patience))
        accuracy, preds = model.evaluate(data_eval)
        logging.info("Accuracy: {}".format(accuracy))
        log_result(
            data_eval, preds,
            "out/{}_{}_{}_{}.txt".format(m, lstm_dim, hidden_dim, accuracy))
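# A hypothetical invocation sketch for the main(args) above. The argument order
# (model name, lstm_dim, hidden_dim, epoch, batch, early_stop flag, patience) is
# inferred from the unpacking of args[1:]; the concrete values below are
# illustrative assumptions, not taken from the original script.
if __name__ == "__main__":
    import sys
    # e.g. python run.py concept_tag 128 64 10 32 1 3
    main(sys.argv[1:])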
def train(dataset_dir, emb_file, epoch, batch_size):
    (train_data, test_data, text_field, label_field) = dataset.load_data(dataset_dir, emb_file)
    class_size = len(label_field.vocab)

    nbow = nbow_model.NBoW(class_size, text_field.vocab.vectors)
    nbow.train()

    optimizer = torch.optim.Adam(nbow.parameters())
    train_iter = Iterator(train_data, batch_size)
    for n in range(epoch):
        for batch in train_iter:
            optimizer.zero_grad()
            logit = nbow(batch.text.t())
            loss = F.cross_entropy(logit, batch.label)
            loss.backward()
            optimizer.step()

        nbow.eval()
        (accuracy, num_correct) = compute_accuracy(nbow, test_data)
        print('Epoch: {} Accuracy: {:.2f}% ({}/{})'.format(
            n + 1, accuracy * 100, num_correct, len(test_data)))
        nbow.train()
def test():
    for epoch in range(args.epochs):
        # Load the model
        output_path = args.output_path + str(epoch)
        args.model_path = os.path.join(output_path, "pytorch_model.bin")
        tokenizer, model = load_model(args)

        # Load the test set
        data_path = args.data_path + str(epoch)
        test_data_path = os.path.join(data_path, 'test.csv')
        test_loader = load_data(tokenizer, args, test_data_path, "test")  # 3263
        logger.info("Testing data has been loaded!")

        # Get the test results
        running = Running(model, args)
        outputs = running.test(test_loader)

        # Write out the submission
        outputs_df = pd.read_csv(
            os.path.join(args.raw_data_path, "sample_submission.csv"))
        outputs_df['target_0'] = outputs[:, 0]
        outputs_df['target_1'] = outputs[:, 1]
        outputs_df['target_2'] = outputs[:, 2]
        outputs_df[['id', 'target_0', 'target_1', 'target_2']].to_csv(
            os.path.join(output_path, "sub.csv"), index=False)
        logger.info('sub ' + str(epoch) + ' has been written.')
def reload_state(checkpoint, training_state, config, args):
    """
    Reload state when resuming training.
    """
    model, id_to_token, id_to_char = BidafModel.from_checkpoint(
        config['bidaf'], checkpoint)
    if torch.cuda.is_available() and args.cuda:
        model.cuda()
    model.train()

    optimizer = get_optimizer(model, config, training_state)

    token_to_id = {tok: id_ for id_, tok in id_to_token.items()}
    char_to_id = {char: id_ for id_, char in id_to_char.items()}

    len_tok_voc = len(token_to_id)
    len_char_voc = len(char_to_id)

    with open(args.data) as f_o:
        data, _ = load_data(json.load(f_o), span_only=True, answered_only=True)
    limit_passage = config.get('training', {}).get('limit')
    data = tokenize_data(data, token_to_id, char_to_id, limit_passage)
    data = get_loader(data, config)

    assert len(token_to_id) == len_tok_voc
    assert len(char_to_id) == len_char_voc

    return model, id_to_token, id_to_char, optimizer, data
def train_main(args):
    global loader_train, loader_val
    loader_train, loader_val, loader_test = load_data(train_bath_size=BATCH_SIZE,
                                                      args=args,
                                                      RANDOM_SEED=RANDOM_SEED,
                                                      val_batch_size=BATCH_SIZE)
    device = set_device()
    setup_seed(RANDOM_SEED)  # random seed
    # model = googleNet()
    model = resnet18()
    # model = load_model(model, args.pretrained_model_path, device=device)
    model = nn.DataParallel(model)  # multi-GPU
    criterion = nn.CrossEntropyLoss()
    params = net_lr(model, FC_LR, NET_LR)
    if OPTIMIZER == 'adam':
        optimizer = torch.optim.Adam(params,
                                     betas=(0.9, 0.999),
                                     weight_decay=0,
                                     eps=1e-08)
    else:
        optimizer = torch.optim.SGD(params,
                                    momentum=MOMENTUM,
                                    nesterov=True,
                                    weight_decay=WEIGHT_DECAY)
    print(model)

    start_epoch = 0
    if Load_model:
        start_epoch = 25
        filepath = 'load_model_path'
        model = load_model(model, filepath, device=device)
        model = model.to(device=device)
        optimizer = load_optimizer(optimizer, filepath, device=device)

    train(model, optimizer, criterion, device=device, epochs=EPOCH, start=start_epoch)
def main(_):
    # Import data
    data = load_data(FLAGS.dataset, one_hot=True, validation_size=10000)

    ################################ DEFINE MODEL ################################
    # x is a value that we'll input when TensorFlow is asked to run a computation.
    # None means the dimension can be of any length (any number of MNIST images).
    x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS])

    # Variables are modifiable tensors that live in the graph of interacting
    # operations. Typical to use this type for model parameters - initially set
    # to 0 as they'll be learnt.
    W = tf.Variable(tf.zeros([IMAGE_PIXELS, NUM_CLASSES]))
    b = tf.Variable(tf.zeros([NUM_CLASSES]))

    # First multiply x by W and then add the bias before applying the softmax layer:
    # y = tf.nn.softmax(tf.matmul(x, W) + b)
    # The softmax layer is dropped here because the raw cross-entropy formulation
    # below is numerically unstable; the loss op applies softmax internally.
    y = tf.matmul(x, W) + b

    # Placeholder to input the correct answers
    y_ = tf.placeholder(tf.float32, [None, NUM_CLASSES])

    ################################## TRAINING ##################################
    #### DEFINE LOSS AND OPTIMISER ####
    # The raw formulation of cross-entropy,
    #
    #   tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(tf.nn.softmax(y)), reduction_indices=[1]))
    #
    # can be numerically unstable. So here we use
    # tf.nn.softmax_cross_entropy_with_logits_v2 on the raw outputs of 'y' and
    # then average across the batch; it computes the softmax activation internally.
    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(labels=y_, logits=y))

    #### APPLY OPTIMISATION ####
    # In one line: compute gradients, compute parameter update steps and apply
    # the update steps to the parameters.
    train_step = tf.train.GradientDescentOptimizer(FLAGS.learn_rate).minimize(cross_entropy)

    # Launch the model in an interactive session
    sess = tf.InteractiveSession()
    tf.global_variables_initializer().run()

    #### TRAINING ####
    # Stochastic GD, as it's less expensive than using all available data for
    # every training step.
    for _ in range(1000):
        # Get a batch of `batch_size` random data points from the training set
        batch_xs, batch_ys = data.train.next_batch(FLAGS.batch_size)
        # Run train_step, feeding in the batch data to replace the placeholders
        sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})

    ################################### TESTING ##################################
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    print(sess.run(accuracy, feed_dict={x: data.test.images, y_: data.test.labels}))
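# A minimal, self-contained sketch (TF 1.x assumed, values illustrative) showing
# that the stable softmax_cross_entropy_with_logits_v2 op matches the raw
# formulation for moderate logits; for large logits the raw version can overflow,
# which is why the model above feeds raw logits to the op instead.
import tensorflow as tf

logits = tf.constant([[2.0, 1.0, 0.1]])
labels = tf.constant([[1.0, 0.0, 0.0]])

raw = tf.reduce_mean(
    -tf.reduce_sum(labels * tf.log(tf.nn.softmax(logits)), axis=1))
stable = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels, logits=logits))

with tf.Session() as sess:
    print(sess.run([raw, stable]))  # both ~0.417 for these logits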
def main():
    print('Loading data...')
    train, valid, test = load_data(args.dataset_path, valid_portion=0)

    train_data = RecSysDataset(train)
    valid_data = RecSysDataset(valid)
    test_data = RecSysDataset(test)
    train_loader = DataLoader(train_data,
                              batch_size=args.batch_size,
                              shuffle=True,
                              collate_fn=collate_fn)
    valid_loader = DataLoader(valid_data,
                              batch_size=args.batch_size,
                              shuffle=False,
                              collate_fn=collate_fn)
    test_loader = DataLoader(test_data,
                             batch_size=args.batch_size,
                             shuffle=False,
                             collate_fn=collate_fn)

    # Load transition matrix
    M2 = pd.read_csv('datasets/transition/final2_transition_gowalla_narm.csv')
    M2 = M2.T[1:].T
    M2.index = M2.columns

    n_items = 38575  # 38575, 3271, 8487

    model = NARM(n_items, M2, args.hidden_size, args.embed_dim,
                 args.batch_size).to(device)

    optimizer = optim.Adam(model.parameters(), args.lr)
    criterion = nn.CrossEntropyLoss()
    scheduler = StepLR(optimizer, step_size=args.lr_dc_step, gamma=args.lr_dc)

    for epoch in tqdm(range(args.epoch)):
        # train for one epoch
        scheduler.step(epoch=epoch)
        trainForEpoch(train_loader, model, optimizer, epoch, args.epoch,
                      criterion, log_aggr=512)

        recall10, mrr10, recall20, mrr20, recall50, mrr50 = validate(
            test_loader, model)
        print('Epoch {} validation: Recall@{}: {:.4f}, MRR@{}: {:.4f}, '
              'Recall@{}: {:.4f}, MRR@{}: {:.4f}, Recall@{}: {:.4f}, MRR@{}: {:.4f}\n'
              .format(epoch, 10, recall10, 10, mrr10, 20, recall20, 20, mrr20,
                      50, recall50, 50, mrr50))

        # store best loss and save a model checkpoint
        ckpt_dict = {
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }
        if epoch % 10 == 0:
            torch.save(ckpt_dict, 'latest_checkpoint.pth.tar')
def _kmeans(self, n, k, init='random', iteration=2, a=-9, b=9):
    _iter = 0
    # load data points
    _points = load_data(n)

    # Select the method for generating the k initial centers
    if init == 'geo_init':
        # generate centers by the geometric initialisation method
        _centers = geometric_init(_points, k)
        print(_centers)
    else:
        # generate centers randomly
        _centers = generate_centers(a, b, n, k)
        print(_centers)

    # compute the euclidean distance between data points and centers
    _distances = cdist(_points, _centers, 'euclidean')
    print(_distances)

    while _iter < iteration:
        _iter += 1
        intra = [0]  # accumulator for the intra-cluster criterion

        # create an ndarray to classify clusters
        _clusters = np.ones((k, n)) * 100
        for i in range(n):
            index_min = np.argmin(_distances[i])
            _clusters.put((index_min, i), index_min)
            _distances.put((i, index_min), 100)

        # compute the intra-cluster criterion
        squared_distances = np.square(_distances)
        for i in range(k):
            s = j = 0
            while j < n:
                if _clusters[i][j] < 100:
                    s += squared_distances[j][i]
                j += 1
            intra.append(s)
        inter = np.sum(intra)

        # recompute centers
        _centers = [[]]
        for i in range(k):
            temp_list = [[]]
            for j in range(n):
                if _clusters[i][j] < 100:
                    temp_list.append(_points[j])
            del temp_list[0]
            _centers.append(np.median(temp_list, axis=0))
        del _centers[0]

        _distances = cdist(_points, _centers, 'euclidean')

    return inter, intra, _clusters
def make_model():
    model = RandomForestClassifier(n_estimators=185)
    # Alternative: label_propagation.LabelSpreading(gamma=0.25, max_iter=5)
    sensor_data = dataset.load_data()
    X, y = sensor_data.data[:200], sensor_data.target[:200]
    model.fit(X, y)
    np.savetxt("X.csv", X, delimiter=",", fmt='%10.5f')
    np.savetxt("y_train.csv", y, delimiter=",", fmt='%10.1f')
    return model
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('dir', help='Path to the dataset')
    args = parser.parse_args()

    X_train, Y_train, X_test, Y_test = dataset.load_data(args.dir)
    dataset.save_data(args.dir, X_train, Y_train, X_test, Y_test)
def make_model3():
    model = label_propagation.LabelSpreading(kernel='knn', n_neighbors=15)
    sensor_data = dataset.load_data()
    X, y = sensor_data.data[:200], sensor_data.target[:200]
    model.fit(X, y)
    np.savetxt("X.csv", X, delimiter=",", fmt='%10.5f')
    np.savetxt("y_train.csv", y, delimiter=",", fmt='%10.1f')
    return model
def prepro_bow(name, filename, ngram, encoding='utf-8'):
    print('Process {} dataset...'.format(name))
    word_count = {}

    def link_words(samples):
        '''Link the tokenised sentences back together.

        Returns x (joined sentences) and y (labels).
        '''
        x, y = list(), list()
        # line: (sentence, label)
        for line in samples:
            sentence = line[0]
            for w in sentence:
                if w in word_count:
                    word_count[w] += 1
                else:
                    word_count[w] = 0
        for line in samples:
            sentence = []
            for w in line[0]:
                if word_count[w] >= 20:
                    sentence.append(w)
            x.append(' '.join(sentence))
            y.append(line[1])
        vectorizer = CountVectorizer(ngram_range=(1, ngram), min_df=20)
        print(sum([1 if word_count[i] >= 20 else 0 for i in word_count]))
        x = vectorizer.fit_transform(x).toarray()
        print(len(vectorizer.get_feature_names()))
        return x, y

    # load dataset
    train_set, _ = load_data(os.path.join('dataset', 'raw', name, filename + '_train.txt'),
                             encoding=encoding)
    # dev_set, _ = load_data(os.path.join('dataset', 'raw', 'books', 'books.task.dev'))
    test_set, _ = load_data(os.path.join('dataset', 'raw', name, filename + '_test.txt'),
                            encoding=encoding)
    # train_set.extend(dev_set)

    train_len = len(train_set)
    train_set.extend(test_set)
    x_train, y_train = link_words(train_set)
    x_test, y_test = x_train[train_len:], y_train[train_len:]
    x_train, y_train = x_train[:train_len], y_train[:train_len]
    print('Process {} dataset done'.format(name))
    return x_train, y_train, x_test, y_test
def main(_):
    model_path = 'models/' + FLAGS.name
    data = load_data(FLAGS.dataset, one_hot=True, validation_size=10000)

    # Define and instantiate VAE model
    if FLAGS.vae_type == 'vae':
        vae = VAE(network_architecture=network_architecture(FLAGS.vae_type, FLAGS.latent_dim),
                  batch_size=FLAGS.batch_size,
                  learn_rate=FLAGS.learn_rate)
    elif FLAGS.vae_type == 'conv':
        vae = ConvVAE(network_architecture=network_architecture(FLAGS.vae_type, FLAGS.latent_dim),
                      batch_size=FLAGS.batch_size,
                      learn_rate=FLAGS.learn_rate)
    else:
        raise ValueError(
            "Autoencoder type should be either conv or vae. Received: {}.".format(FLAGS.vae_type))

    with tf.Session() as sess:
        np.random.seed(FLAGS.seed)
        tf.set_random_seed(FLAGS.seed)

        saver = tf.train.Saver()
        saver.restore(sess, model_path)
        print("Model restored from: %s" % model_path)

        # Sample a test input and see how well the VAE can reconstruct these samples
        x_sample = data.test.next_batch(FLAGS.batch_size)[0]
        x_reconstruct = vae.reconstruct(sess, x_sample)

        plt.figure(figsize=(8, 12))
        for i in range(5):
            plt.subplot(5, 2, 2 * i + 1)
            plt.imshow(x_sample[i].reshape(IMAGE_SIZE, IMAGE_SIZE),
                       vmin=0, vmax=1, cmap='gray')
            plt.title("Test input")
            plt.colorbar()
            plt.subplot(5, 2, 2 * i + 2)
            plt.imshow(x_reconstruct[i].reshape(IMAGE_SIZE, IMAGE_SIZE),
                       vmin=0, vmax=1, cmap='gray')
            plt.title("Reconstruction")
            plt.colorbar()
        plt.tight_layout()
        plt.show()

        visualise_latent_space(sess, vae, data.test)

        if FLAGS.latent_dim == 2:
            plot_reconstructions(sess, vae, FLAGS.batch_size)
def load_data(params: NakdimonParams):
    data = {}
    for stage_name, stage_dataset_filenames in params.corpus.items():
        np.random.seed(2)
        data[stage_name] = dataset.load_data(
            dataset.read_corpora(stage_dataset_filenames),
            validation_rate=params.validation_rate,
            maxlen=params.maxlen)
    return data
def evaluate(split):
    print('Loading {} set...'.format(split))
    x, y_true = load_data(split)
    x = preprocess_input(x)
    y_pred = model.predict(x)
    y_pred = y_pred.argmax(axis=1)
    print("{} set statistics:".format(split))
    print("Top-1-accuracy: {:.4f}".format(np.mean(y_true == y_pred)))
    print(metrics.classification_report(y_true, y_pred))
def __init__(self, source_type="keras", name="mnist", **kwargs):
    # if source_type != "keras" and source_type:
    #     raise NotImplementedError("temporarily only keras datasets available")

    # load train data
    flatten = config.global_config["network_type"] == "dense"
    self.X, self.y = load_data(source_type, name, flatten=flatten, **kwargs)
    self.input_shape = self.X[0].shape
    self.noutputs = self.y.shape[1]
def train():
    # Train for args.epochs rounds
    for epoch in range(args.epochs):
        # Load the model
        tokenizer, model = load_model(args)

        # Read the data
        data_path = args.data_path + str(epoch)
        train_data_path = os.path.join(data_path, 'train.csv')
        train_loader = load_data(tokenizer, args, train_data_path, "train")
        evaluate_data_path = os.path.join(data_path, 'dev.csv')
        evaluate_loader = load_data(tokenizer, args, evaluate_data_path, "evaluate")
        logger.info("Training data has been loaded!")

        # Train
        running = Running(model, args)
        running.train(train_loader, evaluate_loader, epoch)

        # Free GPU memory
        torch.cuda.empty_cache()
        # Garbage collection
        gc.collect()
def ts_method(model):
    sensor_data = dataset.load_data()
    X, y = sensor_data.data[4000:], sensor_data.target[4000:]
    predicted_labels = model.predict(X)
    # predicted_labels = [1 if x > 0.5 else 0 for x in p[:, 1]]
    cm = confusion_matrix(y, predicted_labels, labels=model.classes_)
    print('&' * 70)
    print(cm)
    print(classification_report(y, predicted_labels))
def train_script(input_path, output_dir):
    output_path = output_dir + "train_intermediate.csv"
    gather_translations(input_path, output_path)
    df, encode_dict, nb_class, weight_list = load_data(input_path=output_path, weight=True)
    json.dump(encode_dict, open(output_dir + "mapping.json", "w"))
    train(df,
          nb_class,
          validation=True,
          output_model_file=output_dir + 'pytorch_beto_news.bin',
          output_vocab_file=output_dir + 'vocab_beto_news.bin',
          weight_list=weight_list)
def __init__(self, config, **opt):
    # Load the config used for training and merge it with the testing options
    self.config = yaml.load(open(config, "r"))
    self.config = Namespace(**{**self.config, **opt})

    # Load training data.pkl for the src and tgt vocabs
    self.data = load_data(self.config)

    # Load the trained model checkpoints
    device, devices_ids = misc_utils.set_cuda(self.config)
    self.model, _ = build_model(None, self.config, device)
    self.model.eval()
def Main(args):
    if not os.path.exists(args.data_dir):
        os.makedirs(args.data_dir)

    if args.do_train or args.do_eval or args.split_dataset or args.create_submission_file:
        articles, ref_articles_id, ref_span_starts, ref_span_ends, labels = load_data(
            args.train_data_folder, args.labels_path)
        train_file_path = os.path.join(args.data_dir, args.train_file)
        dev_file_path = os.path.join(args.data_dir, args.dev_file)
        if not os.path.exists(train_file_path) or not os.path.exists(
                dev_file_path) or args.overwrite_cache:
            logger.info("Creating train/dev files: %s, %s", train_file_path,
                        dev_file_path)
            get_train_dev_files(articles, ref_articles_id, ref_span_starts,
                                ref_span_ends, labels, train_file_path,
                                dev_file_path, args.split_by_ids, args.dev_size,
                                args.random_state, args.balance, args.shuffle)

    if args.do_predict or args.create_submission_file or args.eval_submission:
        test_file_path = os.path.join(args.data_dir, args.test_file)
        test_articles, test_articles_id, test_span_starts, test_span_ends, test_labels = load_data(
            args.test_data_folder, args.test_template_labels_path)
        if not os.path.exists(test_file_path) or args.overwrite_cache:
            logger.info("Creating roberta-type test file: %s", test_file_path)
            get_test_file(test_articles, test_articles_id, test_span_starts,
                          test_span_ends, test_labels, test_file_path)

    if args.do_train or args.do_eval or args.do_predict:
        transformers_clf(args)

    if args.create_submission_file:
        if not os.path.exists('results'):
            os.makedirs('results')
        output_file = os.path.join('results', args.output_file)
        logger.info("Creating the submission file: %s", output_file)
        create_submission_file(args.predicted_logits_files, train_file_path,
                               dev_file_path, test_file_path, test_articles_id,
                               test_span_starts, test_span_ends, output_file,
                               args.weights, args.data_dir)

    if args.eval_submission:
        output_file = os.path.join('results', args.output_file)
        logger.info("Evaluating the submission file: %s", output_file)
        if args.test_labels_path is None:
            acc, f1 = eval_submission(output_file, test_file_path)
            logger.info('accuracy: %f', acc)
            print('f1-macro:', f1)
        else:
            cmd = "python tools/task-TC_scorer.py -s {} -r {} -p {}".format(
                output_file, args.test_labels_path, args.propaganda_techniques_file)
            subprocess.run(cmd, shell=True)
def __init__(self, model_folder, checkpoint_file):
    sys.path.append(model_folder)
    from model import get_model
    from dataset import load_data

    self.dataset = load_data('validation')
    self.sess = tf.InteractiveSession()
    self.model = get_model('policy')
    saver = tf.train.Saver()
    saver.restore(self.sess, checkpoint_file)
def init_state(logger, config, args):
    logger.log('Loading data...')
    with open(args.data) as f_o:
        data, _ = load_data(args.data)
    limit_passage = config.get('training', {}).get('limit')
    vocab_size = config.get('training', {}).get('vocab_size', None)

    logger.log('Tokenizing data...')
    data, token_to_id, char_to_id = tokenize_data(logger, data, vocab_size,
                                                  True, limit_passage)
    data = get_loader(data, config)

    id_to_token = {id_: tok for tok, id_ in token_to_id.items()}
    id_to_char = {id_: char for char, id_ in char_to_id.items()}

    assert token_to_id[C.SOS_TOKEN] == C.SOS_INDEX
    assert token_to_id[C.UNK_TOKEN] == C.UNK_INDEX
    assert token_to_id[C.EOS_TOKEN] == C.EOS_INDEX
    assert token_to_id[C.PAD_TOKEN] == C.PAD_INDEX

    logger.log('Creating model...')
    model = BidafModel.from_config(config['bidaf'], id_to_token, id_to_char)

    if args.word_rep:
        logger.log('Loading pre-trained embeddings...')
        with open(args.word_rep) as f_o:
            pre_trained = SymbolEmbSourceText(
                f_o,
                set(tok for id_, tok in id_to_token.items() if id_ != 0))
        mean, cov = pre_trained.get_norm_stats(args.use_covariance)
        rng = np.random.RandomState(2)
        oovs = SymbolEmbSourceNorm(mean, cov, rng, args.use_covariance)

        model.embedder.embeddings[0].embeddings.weight.data = torch.from_numpy(
            symbol_injection(
                id_to_token, 0,
                model.embedder.embeddings[0].embeddings.weight.data.numpy(),
                pre_trained, oovs))
    else:
        pass  # No pretraining, just keep the random values.

    # Char embeddings are already random, so we don't need to update them.

    if torch.cuda.is_available() and args.cuda:
        model.cuda()
    model.train()

    optimizer = get_optimizer(model, config, state=None)
    return model, id_to_token, id_to_char, optimizer, data
def __init__(self, data_src, seed, batch_size=32, dataset='MNIST'):
    self.batch_size = batch_size
    self.data_src = data_src

    # Load data
    ((x, y), (x_test, y_test)) = load_data(dataset, seed=seed, imbalance=True)
    # tf.keras.datasets.cifar10.load_data()

    if self.data_src == self.TRAIN:
        self.dataset_x = x
        self.dataset_y = y
    else:
        self.dataset_x = x_test
        self.dataset_y = y_test

    # Arrange x: channel first
    self.dataset_x = np.transpose(self.dataset_x, axes=(0, 3, 1, 2))

    # Normalize between -1 and 1
    # self.dataset_x = self.dataset_x / 255 - 0.5

    # Y 1D format
    # self.dataset_y = self.dataset_y[:, 0]

    assert (self.dataset_x.shape[0] == self.dataset_y.shape[0])

    # Compute per-class instance count.
    classes = np.unique(self.dataset_y)
    self.classes = classes
    per_class_count = list()
    for c in classes:
        per_class_count.append(np.sum(np.array(self.dataset_y == c)))

    # Recount after pruning
    per_class_count = list()
    for c in classes:
        per_class_count.append(np.sum(np.array(self.dataset_y == c)))
    self.per_class_count = per_class_count

    # List of labels
    self.label_table = [str(c) for c in range(len(self.classes))]

    # Preload all the labels.
    self.labels = self.dataset_y[:]

    # Per-class ids
    self.per_class_ids = dict()
    ids = np.array(range(len(self.dataset_x)))
    for c in classes:
        self.per_class_ids[c] = ids[self.labels == c]
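# A hedged sketch of how the per_class_ids index built above could be used to
# draw a class-balanced batch from the imbalanced data. The sampling strategy
# (np.random.choice with replacement, equal share per class) is an illustrative
# assumption, not part of the original generator.
def balanced_batch(self, batch_size=32):
    per_class = max(1, batch_size // len(self.classes))
    chosen = np.concatenate([
        np.random.choice(self.per_class_ids[c], per_class, replace=True)
        for c in self.classes
    ])
    # Return images and labels for the sampled indices
    return self.dataset_x[chosen], self.labels[chosen]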
def main():
    model = Net()
    # print(sum(p.numel() for p in model.parameters() if p.requires_grad))

    image_fnames, data_fnames = dataset.find_images()
    images, landmarks_2d, landmarks_3d = dataset.load_data(image_fnames, data_fnames)
    dataset.augment_flip(images, landmarks_2d, landmarks_3d)

    images = np.array(images)
    landmarks_2d = np.array(landmarks_2d)
    landmarks_3d = np.array(landmarks_3d)

    X_train, X_val, Y_train, Y_val = train_test_split(images,
                                                      landmarks_2d,
                                                      train_size=0.8,
                                                      test_size=0.2)

    from torch.utils.data import DataLoader, TensorDataset
    BATCH_SIZE = 20
    train_dataset = TensorDataset(torch.tensor(X_train), torch.tensor(Y_train))
    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    valid_dataset = TensorDataset(torch.tensor(X_val), torch.tensor(Y_val))
    valid_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

    # defining the optimizer
    optimizer = optim.Adam(model.parameters(), lr=0.07)
    # defining the loss function
    criterion = nn.MSELoss()

    # checking if GPU is available
    # if torch.cuda.is_available():
    #     model = model.cuda()
    #     criterion = criterion.cuda()

    n_epochs = 5
    # empty list to store training losses
    train_losses = []
    # empty list to store validation losses
    val_losses = []

    # training the model
    for epoch in range(n_epochs):
        train(model, optimizer, criterion, epoch, train_dataloader,
              valid_dataloader, train_losses, val_losses)
def load_data(partition, xtokenizer, char_vocab, tag_vocab, feats_vocab):
    logging.info('Loading data')
    tensor_data = {}
    data = dataset.load_data(partition, xtokenizer, char_vocab, tag_vocab, feats_vocab)
    for part in partition:
        token_data, token_char_data, form_token_char_data, lemma_token_char_data, morph_token_data = data[part]
        token_data = torch.tensor(token_data, dtype=torch.long)
        token_char_data = torch.tensor(token_char_data, dtype=torch.long)
        token_form_char_data = torch.tensor(form_token_char_data, dtype=torch.long)
        token_lemma_char_data = torch.tensor(lemma_token_char_data, dtype=torch.long)
        token_morph_data = torch.tensor(morph_token_data, dtype=torch.long)
        ds = TensorDataset(token_data, token_char_data, token_form_char_data,
                           token_lemma_char_data, token_morph_data)
        tensor_data[part] = ds
    return tensor_data
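# A minimal consumption sketch for the load_data() above, assuming the standard
# torch.utils.data.DataLoader API. The batch size and the idea of shuffling only
# a 'train' partition are illustrative assumptions, not taken from the original.
def make_loaders(tensor_data, batch_size=32):
    from torch.utils.data import DataLoader
    loaders = {}
    for part, ds in tensor_data.items():
        # shuffle only the training partition
        loaders[part] = DataLoader(ds, batch_size=batch_size, shuffle=(part == 'train'))
    return loaders

# Each batch then yields the five tensors in TensorDataset order:
# token_data, token_char_data, form_char_data, lemma_char_data, morph_data.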
def main():
    print('Loading data...')
    train, valid, test = load_data(args.dataset_path, valid_portion=args.valid_portion)

    train_data = RecSysDataset(train)
    valid_data = RecSysDataset(valid)
    test_data = RecSysDataset(test)
    train_loader = DataLoader(train_data,
                              batch_size=args.batch_size,
                              shuffle=True,
                              collate_fn=collate_fn)
    valid_loader = DataLoader(valid_data,
                              batch_size=args.batch_size,
                              shuffle=False,
                              collate_fn=collate_fn)
    test_loader = DataLoader(test_data,
                             batch_size=args.batch_size,
                             shuffle=False,
                             collate_fn=collate_fn)

    if args.dataset_path.split('/')[-2] == 'diginetica':
        n_items = 43098
    elif args.dataset_path.split('/')[-2] in ['yoochoose1_64', 'yoochoose1_4']:
        n_items = 37484
    else:
        raise Exception('Unknown Dataset!')

    model = DPAN(n_items, args.hidden_size, args.embed_dim, args.batch_size,
                 args.alpha_pool, args.beta_pool).to(device)

    if args.test:
        ckpt = torch.load('latest_checkpoint.pth.tar')
        model.load_state_dict(ckpt['state_dict'])
        recall, mrr = validate(test_loader, model)
        print("Test: Recall@{}: {:.4f}, MRR@{}: {:.4f}".format(
            args.topk, recall, args.topk, mrr))
        return

    optimizer = optim.Adam(model.parameters(), args.lr)
    criterion = nn.CrossEntropyLoss()
    scheduler = StepLR(optimizer, step_size=args.lr_dc_step, gamma=args.lr_dc)

    for epoch in tqdm(range(args.epoch)):
        # train for one epoch
        scheduler.step(epoch=epoch)
        begin_time = time.time()
        trainForEpoch(train_loader, model, optimizer, epoch, args.epoch,
                      criterion, log_aggr=200)
        end_time = time.time()
        run_time = end_time - begin_time
        print('Epoch {} run time: {:.4f}s\n'.format(epoch, run_time))

        recall, mrr = validate(valid_loader, model)
        print('Epoch {} validation: Recall@{}: {:.4f}, MRR@{}: {:.4f}\n'.format(
            epoch, args.topk, recall, args.topk, mrr))

        # store best loss and save a model checkpoint
        ckpt_dict = {
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }
        torch.save(ckpt_dict, 'latest_checkpoint.pth.tar')
def model():
    """TODO: ML model (logistic regression on domain features)."""
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score

    data = dataset.load_data()
    print("all samples =", data.shape)
    print("dataY contains:", np.unique(data[:, 1]))

    data = pd.DataFrame(data, columns=['domain', 'label'])
    data = data.drop_duplicates(subset='domain')
    data = np.array(data)

    trainX = data[:30000, 0]
    trainY = data[:30000, 1].astype(int)
    testX = data[30000:30500, 0]
    testY = data[30000:30500, 1].astype(int)
    # print(trainX)
    print("trainY contains:", np.unique(trainY))
    # print(testX)
    print("testY contains:", np.unique(testY))

    feature_table = get_feature(trainX)
    LR = LogisticRegression()
    LR = LR.fit(feature_table, trainY)

    pred = LR.predict(get_feature(testX))
    acc = accuracy_score(testY, pred)
    print("acc stage 1:", acc)

    joblib.dump(LR, './models/LR.pkl')

    algorithm_domains = dataset.load_simple_data()
    algorithm_domains = list(set(algorithm_domains))
    algorithm_y = [0] * len(algorithm_domains)
    pred_feature = get_feature(algorithm_domains)
    pred = LR.predict(pred_feature)
    acc = accuracy_score(algorithm_y, pred)
    print("acc stage 2:", acc)


# if __name__ == '__main__':
#     model()
def make_model2():
    sensor_data = dataset.load_data()
    rng = np.random.RandomState(0)
    indices = np.arange(len(sensor_data.data))
    rng.shuffle(indices)
    print(len(sensor_data.data))

    sm = SMOTE(random_state=42)
    X, y = sm.fit_sample(sensor_data.data[indices[:2000]],
                         sensor_data.target[indices[:2000]])
    n_total_samples = len(y)
    print(len(y))
    n_labeled_points = 200
    max_iterations = 50

    unlabeled_indices = np.arange(n_total_samples)[n_labeled_points:]
    lp_model = label_propagation.LabelSpreading(kernel='knn', n_neighbors=15)

    for i in range(max_iterations):
        if len(unlabeled_indices) == 0:
            print("No unlabeled items left to label.")
            break
        y_train = np.copy(y)
        y_train[unlabeled_indices] = -1
        lp_model.fit(X, y_train)

        p = lp_model.predict_proba(X[unlabeled_indices])
        # predicted_labels = [1 if x > 0.57 else 0 for x in p[:, 1]]
        predicted_labels = lp_model.predict(X[unlabeled_indices])
        true_labels = y[unlabeled_indices]
        # print("#" * 20 + "Iteration :: " + str(i) + "#" * 20)
        # print(classification_report(true_labels, predicted_labels))

        pred_entropies = stats.distributions.entropy(
            lp_model.label_distributions_.T)
        uncertainty_index = np.argsort(pred_entropies)[::-1]
        uncertainty_index = uncertainty_index[
            np.in1d(uncertainty_index, unlabeled_indices)][:40]

        delete_indices = np.array([])
        for index in uncertainty_index:
            delete_index, = np.where(unlabeled_indices == index)
            delete_indices = np.concatenate((delete_indices, delete_index))
        unlabeled_indices = np.delete(unlabeled_indices, delete_indices)
        n_labeled_points += len(uncertainty_index)

    np.savetxt("X.csv", X, delimiter=",", fmt='%10.5f')
    np.savetxt("y_train.csv", y_train, delimiter=",", fmt='%10.1f')
    return lp_model
def load_and_run_all_models(config, emails_list_file, emails_data_base_dir):
    emails = get_emails(emails_list_file)
    total_err = 0.0
    for mail in emails:
        X_data, Y_data = dataset.load_data(mail, emails_data_base_dir)
        model_path = os.path.join(config.save_model_dir, mail, "full")
        model_object_path = os.path.join(model_path, "model.json")
        model_weights_path = os.path.join(model_path, "weights.h5")
        threshold = get_threshold(config.thresholds_dir, mail)
        err = load_and_run_model(config, model_object_path, model_weights_path,
                                 threshold, X_data, Y_data)
        print("Error for %s with threshold %f: %f" % (mail, threshold, err))
        total_err = (total_err + err) / 2
    print("Total error:", total_err)
def init_state(config, args):
    token_to_id = {'': 0}
    char_to_id = {'': 0}
    print('Loading data...')
    with open(args.data) as f_o:
        data, _ = load_data(json.load(f_o), span_only=True, answered_only=True)
    print('Tokenizing data...')
    data = tokenize_data(data, token_to_id, char_to_id)
    data = get_loader(data, config)

    id_to_token = {id_: tok for tok, id_ in token_to_id.items()}
    id_to_char = {id_: char for char, id_ in char_to_id.items()}

    print('Creating model...')
    model = BidafModel.from_config(config['bidaf'], id_to_token, id_to_char)

    if args.word_rep:
        print('Loading pre-trained embeddings...')
        with open(args.word_rep) as f_o:
            pre_trained = SymbolEmbSourceText(
                f_o,
                set(tok for id_, tok in id_to_token.items() if id_ != 0))
        mean, cov = pre_trained.get_norm_stats(args.use_covariance)
        rng = np.random.RandomState(2)
        oovs = SymbolEmbSourceNorm(mean, cov, rng, args.use_covariance)

        model.embedder.embeddings[0].embeddings.weight.data = torch.from_numpy(
            symbol_injection(
                id_to_token, 0,
                model.embedder.embeddings[0].embeddings.weight.data.numpy(),
                pre_trained, oovs))
    else:
        pass  # No pretraining, just keep the random values.

    # Char embeddings are already random, so we don't need to update them.

    if torch.cuda.is_available() and args.cuda:
        model.cuda()
    model.train()

    optimizer = get_optimizer(model, config, state=None)
    return model, id_to_token, id_to_char, optimizer, data
def fit(self):
    """
    Computes the holdout and enqueues the job to the cluster queue.
    """
    X, y = load_data(dataset=self.dataset, data_home=self.data_home)
    holdout = check_holdout(self.holdout, X, y, classifier=True)
    params = {'dataset': self.dataset,
              'data_home': self.data_home,
              'holdout': holdout,
              'estimator': self.estimator,
              'parameters': ParameterGrid(self.param_grid),
              'scorer': self.scorer,
              'verbose': self.verbose,
              'fit_params': self.fit_params,
              'error_score': self.error_score,
              'split': holdout}
    self.queue.enqueue(params, 1)
def train_and_evaluate(config, emails_list_file, emails_data_base_dir):
    emails = get_emails(emails_list_file)
    totalFP = 0
    totalFN = 0
    totalTP = 0
    totalTN = 0
    with open(config.results_file, "w+") as f:
        f.write("")

    for mail in emails:
        print('Loading data...')
        X_data, Y_data = dataset.load_data(mail, emails_data_base_dir)

        falsePositives = 0
        falseNegatives = 0
        truePositives = 0
        trueNegatives = 0
        predictions = []
        shouldBe = []

        # Leave-one-out: swap sample i to the front and hold it out as the test set
        for i in range(len(X_data)):
            tmp = X_data[i]
            X_data[i] = X_data[0]
            X_data[0] = tmp
            tmp = Y_data[i]
            Y_data[i] = Y_data[0]
            Y_data[0] = tmp

            X_train = X_data[1:]
            y_train = Y_data[1:]
            X_test = X_data[0:1]
            y_test = Y_data[0:1]

            print(len(X_train), 'train sequences')
            print("Pad sequences (samples x time)")
            X_train = sequence.pad_sequences(X_train, maxlen=config.max_seq_len)
            X_test = sequence.pad_sequences(X_test, maxlen=config.max_seq_len)
            X_train, X_test = config.additional_data_transform(X_train, X_test)
            max_value = max(X_train.max(), X_test.max()) + 1
            print('X_train shape:', X_train.shape)
            print('X_test shape:', X_test.shape)

            print('Build model...')
            model = config.build_model(max_value)
            model.compile(loss=config.loss_function,
                          optimizer=config.optimizer,
                          class_mode=config.class_mode)

            print("Train...")
            model.fit(X_train, y_train,
                      batch_size=config.batch_size,
                      nb_epoch=config.epochs,
                      validation_data=(X_test, y_test),
                      show_accuracy=True)
            score, acc = model.evaluate(X_test, y_test,
                                        batch_size=config.batch_size,
                                        show_accuracy=True)
            prediction = model.predict(X_test)
            predicted_class = round(abs(float(prediction)))
            print("Model prediction:", prediction, "Should be:", y_test)
            print('Test score:', score)
            print('Test accuracy:', acc)

            predictions.append(float(prediction))
            shouldBe.append(y_test[0])
            if predicted_class == 0 and y_test[0] == 1:
                falseNegatives += 1
            elif predicted_class == 1 and y_test[0] == 0:
                falsePositives += 1
            elif predicted_class == 1 and y_test[0] == 1:
                truePositives += 1
            elif predicted_class == 0 and y_test[0] == 0:
                trueNegatives += 1

            # Swap the held-out sample back to its original position
            tmp = X_data[i]
            X_data[i] = X_data[0]
            X_data[0] = tmp
            tmp = Y_data[i]
            Y_data[i] = Y_data[0]
            Y_data[0] = tmp

        totalFP += falsePositives
        totalFN += falseNegatives
        totalTP += truePositives
        totalTN += trueNegatives

        model_path = os.path.join(config.save_model_dir, mail, str(i))
        model_object_path = os.path.join(model_path, "model.json")
        model_weights_path = os.path.join(model_path, "weights.h5")
        save_model(model, model_object_path, model_weights_path)
        train_on_full_data(config, mail, X_data, Y_data)

        result_string = "\n".join(["For: %s FP: %d FN: %d TP: %d TN: %d",
                                   "Predictions: %s",
                                   "Should be: %s"])
        result_string = result_string % (mail, falsePositives, falseNegatives,
                                         truePositives, trueNegatives,
                                         str(predictions), str(shouldBe))
        appendResults(result_string, config.results_file)
        print(result_string)

    result_string = "TotalFP: " + str(totalFP) + " TotalFN: " + str(totalFN) + \
                    " TotalTP: " + str(totalTP) + " TotalTN: " + str(totalTN)
    appendResults(result_string, config.results_file)
    print(result_string)

    gather_results.from_file(config.results_file, config.out_results_dir)

    print("Looking for FP-free threshold...")
    compute_thresholds(config.out_results_dir, config.thresholds_dir, emails_list_file)
    for k in range(len(args)):
        mat[:, :, k] = mat_temp[:, :, args[k]]
    return mat


def shuffle_args():
    args = []
    for i in range(5):
        a = [0, 1]
        random.shuffle(a)
        args.append(a)
    seq = [0, 1, 2, 3, 4]
    random.shuffle(seq)
    args.append(seq)
    return args


if __name__ == '__main__':
    from dataset import load_data
    dataset = load_data('validation')
    data, label = dataset.next_batch(1000)
    for selfpos, emypos, selfmove, emymove, selfprot, emyprot in zip(*data):
        new_selfmove, new_emymove, new_selfprot, new_emyprot = gentensor(selfpos, emypos)
        # from IPython import embed; embed()
        assert all((selfmove.sum(axis=2) == new_selfmove.sum(axis=2)).reshape(-1))
        assert all((emymove.sum(axis=2) == new_emymove.sum(axis=2)).reshape(-1))
        assert all((selfprot.sum(axis=2) == new_selfprot.sum(axis=2)).reshape(-1))
        assert all((emyprot.sum(axis=2) == new_emyprot.sum(axis=2)).reshape(-1))
from sklearn.externals import joblib
from sklearn.svm import LinearSVC
from HOG import HOG
import dataset
import argparse

# Set up the argument parser which will get the CSV file and the location
# where the model is to be stored
argparser = argparse.ArgumentParser()
argparser.add_argument("-d", "--dataset", required=True,
                       help="path to the dataset file")
argparser.add_argument("-m", "--model", required=True,
                       help="path to where the model will be stored")
args = vars(argparser.parse_args())

(digits, labels) = dataset.load_data(args["dataset"])

hog = HOG(orientations=18, pixelsPerCell=(10, 10), cellsPerBlock=(1, 1),
          normalise=True)

data = []
# Add the histogram for each digit to a list
for digit in digits:
    digit = dataset.deskew(digit)
    hist = hog.describe(digit.reshape((28, 28)))
    data.append(hist)

# Set up and train the model
SVC_model = LinearSVC()
SVC_model.fit(data, labels)

# Save the model to the file given by --model (joblib is imported above for this)
joblib.dump(SVC_model, args["model"])
from sklearn import datasets
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import numpy as np
from sklearn.utils import shuffle

# This is where your clf will be (model)
scores, targets, acc = load_data()

'''
# Loading the Digits dataset
digits = datasets.load_digits()

# To apply a classifier on this data, we need to flatten the images, to
# turn the data into a (samples, features) matrix:
n_samples = len(digits.images)
'''
stats.register("std", np.std) stats.register("min", np.min) stats.register("max", np.max) pop, log = alg.myEAMuCommaLambda(pop, start_gen, toolbox, Config.MU, Config.LAMBDA, cxpb=0.6, mutpb=0.2, ngen=Config.ngen, stats=stats, halloffame=hof, logbook=logbook, verbose=True, id=id) return pop, log, hof if __name__ == "__main__": # load the whole data X_train, y_train = load_data("data/"+trainset_name) X_test, y_test = load_data("data/"+testset_name) # set cfg Config.input_shape = X_train[0].shape Config.noutputs = y_train.shape[1] # print(Config.input_shape, Config.noutputs) if checkpoint_file is None: pop, log, hof = main(id) else: pop, log, hof = main(id, checkpoint_file) network = hof[0].createNetwork() network.summary() print( hof[0] )
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding

NFOLDS = 10
BATCH = 32
EPOCHS = 5
CLASSES = 2
EMBDIMS = 100
MAXLEN = 100
MAXFEATURES = 10000

if __name__ == "__main__":
    np.random.seed(1337)

    dataset = dataset.Dataset(MAXFEATURES)
    x, y = dataset.load_data()

    labels_one_hot = k.utils.np_utils.to_categorical(np.array(y), CLASSES)
    x = sequence.pad_sequences(x, maxlen=MAXLEN)

    scores = []
    folds = sk.cross_validation.KFold(len(y), n_folds=NFOLDS)
    for train_indices, test_indices in folds:
        print('starting new fold...')
        train_x = x[train_indices]
        train_y = labels_one_hot[train_indices]
        test_x = x[test_indices]
        test_y = labels_one_hot[test_indices]

        model = k.models.Sequential()