def train_embedding(preprocess, datatype):
    if preprocess:
        embedding_path = PRE_ALL_PATH
    else:
        embedding_path = NOPRE_ALL_PATH
    encode_time = True
    if datatype is not DataType.ALL:
        word_sequences = reader.load_data(preprocess, datatype, encode_time)
        sequences_for_training = []
        for idx, words in word_sequences.items():
            sentences = [s for s in ' '.join(words).replace(NULL_TOKEN, '').split(' ') if s != '']
            for s in sentences:
                sequences_for_training.append(s)
    else:
        sequences_for_training = []
        word_sequences = reader.load_data(preprocess, DataType.VISITOR, encode_time)
        for idx, words in word_sequences.items():
            sentences = [s for s in ' '.join(words).replace(NULL_TOKEN, '').split(' ') if s != '']
            for s in sentences:
                sequences_for_training.append(s)
        word_sequences = reader.load_data(preprocess, DataType.AGENT, encode_time)
        for idx, words in word_sequences.items():
            sentences = [s for s in ' '.join(words).replace(NULL_TOKEN, '').split(' ') if s != '']
            for s in sentences:
                sequences_for_training.append(s)
    print('data load completed. start training.')
    print(sequences_for_training[0:2])
    cores = multiprocessing.cpu_count()
    model = gensim.models.Word2Vec(sentences=SequenceIterator(sequences_for_training),
                                   size=50, sg=1, min_count=5, window=5, workers=cores)
    model.save(embedding_path)
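# Hedged sketch (not part of the original file): `SequenceIterator` is referenced above but
# not defined here. gensim's Word2Vec expects an iterable of token lists, so a minimal
# stand-in could look like the class below, assuming each element of
# `sequences_for_training` is a whitespace-separated sentence string; if the elements are
# already single tokens, each one would simply become a one-word sentence.
class SequenceIterator(object):
    def __init__(self, sequences):
        self.sequences = sequences

    def __iter__(self):
        # yield one token list per sequence, as gensim expects
        for sentence in self.sequences:
            yield sentence.split(' ')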
def main(_):
    raw_data = reader.load_data(FLAGS.parse_data_path)
    # items in ids
    train_sents, train_trees, dev_sents, dev_trees, vocab_dict, pos_dict, label_dict = raw_data
    config = get_config(FLAGS.parse_lang)
    with tf.Session() as session:
        with tf.variable_scope(FLAGS.parse_scope_name):
            m = NNParser(config=config)

        # CheckPoint State
        if not os.path.exists(FLAGS.parse_train_dir):
            os.makedirs(FLAGS.parse_train_dir)
        ckpt = tf.train.get_checkpoint_state(FLAGS.parse_train_dir)
        if ckpt:
            print("Loading model parameters from %s" % ckpt.model_checkpoint_path)
            m.saver.restore(session, tf.train.latest_checkpoint(FLAGS.parse_train_dir))
        else:
            print("Created model with fresh parameters.")
            session.run(tf.global_variables_initializer())

        # train dataset should be generated only once and called by run_epoch function
        for i in range(config.max_max_epoch):
            lr_decay = config.lr_decay ** max(i - config.max_epoch, 0.0)
            m.assign_lr(session, config.learning_rate * lr_decay)
            print("Epoch: %d Learning rate: %.4f" % (i + 1, session.run(m.lr)))
            # new iterator
            train_dataset = transition_system.generate_examples(
                train_sents, train_trees, m.batch_size, label_dict)
            train_perplexity = run_epoch(session, m, m.train_op, train_dataset)
            print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity))
def sub_proc(sub_filenames, q, idx):
    collector = dict()  # accumulate samples across all files handled by this worker
    for _, a_file in enumerate(sub_filenames):
        df = reader.load_data(a_file)
        vid = np.asarray(df.vid.values, dtype=np.int64)
        cid = np.asarray(df.cid.values, dtype=np.int64)
        title_length = np.asarray(df.title_length.values, dtype=np.int64)
        class_id = np.asarray(df.class_id.values, dtype=np.int64)
        second_class = np.asarray(df.second_class.values, dtype=np.int64)
        is_intact = np.asarray(df.is_intact.values, dtype=np.int64)
        stars = df.stars.values
        sample_member = [
            vid, cid, title_length, class_id, second_class, is_intact, stars
        ]
        sub_item_map = dict()
        for i, k in enumerate(vid):
            sample = [
                vid[i], cid[i], title_length[i], class_id[i], second_class[i], is_intact[i]
            ]
            sample = np.asarray(sample, dtype=np.int64)
            sample = np.concatenate([sample, stars[i]])
            # print(sample, type(sample), sample.dtype)
            sub_item_map[k] = sample
        collector.update(sub_item_map)
    q.put(collector, block=True, timeout=False)
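# Hedged driver sketch (an assumption, not part of the original file): `sub_proc` is shaped
# like a multiprocessing worker that puts one dict per process onto a queue, so a plausible
# caller splits the file list across workers and merges the per-worker dicts like this.
import multiprocessing as mp

def build_item_map(filenames, num_workers=4):
    q = mp.Queue()
    chunks = [filenames[i::num_workers] for i in range(num_workers)]
    procs = [mp.Process(target=sub_proc, args=(chunk, q, idx))
             for idx, chunk in enumerate(chunks)]
    for p in procs:
        p.start()
    item_map = dict()
    for _ in procs:
        item_map.update(q.get())  # drain one collected dict per worker before joining
    for p in procs:
        p.join()
    return item_map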
def calibrate(reck_heaters, calib, heater_name, heater_index, input, outputs):
    datafilename = take_fringe(reck_heaters, heater_index)
    data_for_fitting = load_data(datafilename, outputs)
    calib.datafilename = data_for_fitting
    calib.fit(heater_name)
def functor(idx, q, sub_filenames):
    collector = dict()
    full_uid_vid_map = dict()
    for i, a_file in enumerate(sub_filenames):
        df = reader.load_data(a_file)
        uids = df.did.values
        watches = df.watch.values
        uid_vid_map = get_user_watch_map(uids, watches)
        full_uid_vid_map.update(uid_vid_map)
        collector.update(full_uid_vid_map)
    status, output = getstatusoutput('free -g')
    print('done with sub_files: {}, mem:\n{}'.format(sub_filenames, output))
    q.put(collector, block=True)
    status, output = getstatusoutput('free -g')
    print('put into queue, mem: {}'.format(output))
def get_full_user_map(path, num_parallel_reads=None):
    assert isinstance(num_parallel_reads, int), "invalid type of num_parallel_reads."
    if num_parallel_reads > 1:
        full_uid_vid_map = _get_full_user_map_parallel(path, num_parallel_reads)
    else:
        filenames = utils.path_to_list(path, key_word='user')
        full_uid_vid_map = dict()
        for i, a_file in enumerate(filenames):
            df = reader.load_data(a_file)
            uids = df.did.values
            watches = df.watch.values
            uid_vid_map = get_user_watch_map(uids, watches)
            full_uid_vid_map.update(uid_vid_map)
    return full_uid_vid_map
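# Hedged sketch (an assumption, not part of the original file): `get_user_watch_map` is
# used above but not defined here. Given that `uids` and `watches` are parallel columns of
# the loaded dataframe, a minimal implementation could simply zip them into a dict, with
# later rows overwriting earlier ones for duplicate user ids.
def get_user_watch_map(uids, watches):
    return {uid: watch for uid, watch in zip(uids, watches)}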
def main():
    parser = argparse.ArgumentParser(description='Generates some cards for bunker')
    parser.add_argument('--xlsx', help='path to excel file')
    args = parser.parse_args()
    fname = args.xlsx
    data = load_data(fname)
    gen = SimpleGenerator(data)
    while True:
        print('Your card is:')
        card = gen.generate()
        print_card(card)
        print('\n\n')
        inp = input("Press Enter to continue...\n")
        if inp == 'Q':
            break
def __init__(self, config, data_path=None, vocabulary=None, name=None):
    self.batch_size = batch_size = config.batch_size
    self.num_steps = num_steps = config.num_steps
    raw_context, raw_questions, raw_choices, raw_labels, self.choices_map = \
        read.load_data(data_path)
    all_choices = read.build_choices(raw_choices)
    self.epoch_size = ((len(raw_context) // batch_size) - 1) // num_steps
    # build vocab for train data
    if not vocabulary:
        self.vocabulary = read.get_vocab(raw_questions, raw_context, min_frequency=500)
    else:
        self.vocabulary = vocabulary
    raw_choices = [" ".join(x) for x in raw_choices]
    self.all_choices = read.vocab_transform(all_choices, self.vocabulary)
    self.questions = read.vocab_transform(raw_questions, self.vocabulary)
    self.context = read.vocab_transform(raw_context, self.vocabulary)
    self.labels = read.vocab_transform(raw_labels, self.vocabulary)
    # raw_choices has already been joined into strings above, so pass it through directly
    self.choices = read.vocab_transform(raw_choices, self.vocabulary)
    filename=datetime.now().strftime('mylogfile_%H_%M_%d_%m_%Y.log'),
    filemode='w')

if __name__ == "__main__":
    # tf.reset_default_graph()
    options = get_params()
    root = "/Users/liuhongbing/Documents/tensorflow/data/snli_1.0/"
    train = [l.strip().split('\t') for l in open(root + 'snli_1.0_train.txt')]
    dev = [l.strip().split('\t') for l in open(root + 'snli_1.0_dev.txt')]
    test = [l.strip().split('\t') for l in open(root + 'snli_1.0_test.txt')]

    vocab = get_vocab(train)
    print("vocab (incr. maxfeatures accordingly):", len(vocab))
    X_train, Y_train, Z_train = load_data(train, vocab)
    X_dev, Y_dev, Z_dev = load_data(dev, vocab)
    X_test, Y_test, Z_test = load_data(test, vocab)

    print('Build model...')
    model = build_model(options)
    config_str = getConfig(options)
    MODEL_ARCH = root + "/Attention_neural/arch_att" + config_str + ".yaml"
    MODEL_WGHT = root + "/Attention_neural/weights_att" + config_str + ".weights"

    MAXLEN = options.xmaxlen
    X_train = pad_sequences(X_train, maxlen=MAXLEN, value=vocab["unk"], padding='pre')
import json
import os

from keras.models import load_model
from numpy import argmax

import CONFIG
from reader import KerasBatchGenerator, load_data

_, _, _total_words, reversed_dictionary, dictionary = load_data()
_model = load_model(os.path.join(os.getcwd(), 'model', 'model.h5'))

while True:
    input_string = input('\n\nEnter 3 words: \n')
    input_string = input_string.split()
    input_string = input_string[:3]  # keep only the first three words
    idx = []
    for i in input_string:
        if i == '.':
            i = '<eos>'
        try:
            idx.append(dictionary[i])
        except KeyError:
            print('Word ', i, ' does not exist')
            i = '<unk>'
            idx.append(dictionary[i])
    string = ''
def preprocess():
    # Load training and eval data
    samples, labels, categories = reader.load_data(
        "../data_set/train2014/",
        "../data_set/annotations/instances_train2014.json")
def main(_):
    data_path = FLAGS.data_path
    if FLAGS.clear_save and os.path.exists(FLAGS.save_path):
        shutil.rmtree(FLAGS.save_path)
    if FLAGS.testing:
        train_path = os.path.join(data_path, 'test')
    else:
        train_path = os.path.join(data_path, 'train')
    val_path = os.path.join(data_path, 'val')
    test_path = os.path.join(data_path, 'test')

    if not os.path.exists(FLAGS.save_path):
        os.makedirs(FLAGS.save_path)
    log_fi = os.path.join(FLAGS.save_path, 'output.log')
    lg.basicConfig(filename=log_fi, level=lg.DEBUG,
                   format='%(asctime)s %(message)s')

    print("Loading train data from %s" % train_path)
    train = RawInput(rn.load_data(train_path, return_entities=True))
    print("Loading val data from %s" % val_path)
    val = RawInput(rn.load_data(val_path, return_entities=True),
                   vocabulary=train.vocab)
    print("Loading test data from %s" % test_path)
    test = RawInput(rn.load_data(test_path, return_entities=True),
                    vocabulary=train.vocab)

    if FLAGS.use_glove:
        embedding = rn.glove_embedding(FLAGS.glove_path, train.vocab)
    else:
        embedding = None

    with tf.Graph().as_default():
        initializer = tf.random_uniform_initializer(-FLAGS.init_scale,
                                                    FLAGS.init_scale)
        print("Loading model..")
        with tf.name_scope("Train"):
            with tf.variable_scope("Model", reuse=None, initializer=initializer):
                if FLAGS.use_glove:
                    m = Model(vocab_size=train.vocab_size,
                              choices_idx=train.transformed_labels_idx,
                              pre_embedding=embedding)
                else:
                    m = Model(vocab_size=train.vocab_size,
                              choices_idx=train.transformed_labels_idx)

        with tf.Session() as session:
            saver = tf.train.Saver(tf.all_variables())
            ckpt = tf.train.get_checkpoint_state(FLAGS.save_path)
            if ckpt and tf.gfile.Exists(ckpt.model_checkpoint_path):
                print("Loading parameters from %s" % ckpt.model_checkpoint_path)
                lg.info("Loading parameters from %s" % ckpt.model_checkpoint_path)
                saver.restore(session, ckpt.model_checkpoint_path)
            else:
                print("New session.")
                lg.info("New session.")
                session.run(tf.initialize_all_variables())

            all_st = time.time()
            for i in range(FLAGS.max_epoch):
                train_iter = rn.batch_iter(
                    train.contexts, train.questions, train.choices, train.labels,
                    train.choices_map, train.context_lens, train.qs_lens,
                    batch_size=FLAGS.batch_size, entity_inds=train.entities)
                train_cost, train_acc = run_epoch(
                    session, m, train_iter, train_op=m.train_op, verbose=False,
                    vocab=train.vocab)
                print("Train cost: after " + str(i) + " epoch is " + str(train_cost))
                print("Train acc: after " + str(i) + " epoch is " + str(train_acc))
                lg.info("Train cost: after " + str(i) + " epoch is " + str(train_cost))
                lg.info("Train acc: after " + str(i) + " epoch is " + str(train_acc))

                if i % FLAGS.ckpt_steps == 0:
                    checkpoint_path = os.path.join(FLAGS.save_path, "wdw.ckpt")
                    saver.save(session, checkpoint_path, global_step=i)

                val_iter = rn.batch_iter(
                    val.contexts, val.questions, val.choices, val.labels,
                    val.choices_map, val.context_lens, val.qs_lens,
                    batch_size=FLAGS.batch_size, entity_inds=val.entities)
                val_cost, val_acc = run_epoch(
                    session, m, val_iter, train_op=None, verbose=False,
                    vocab=train.vocab, is_testing=True)
                lg.info("Val cost: after " + str(i) + " epoch is " + str(val_cost))
                lg.info("Val acc: after " + str(i) + " epoch is " + str(val_acc))
                print("Val cost: after " + str(i) + " epoch is " + str(val_cost))
                print("Val acc: after " + str(i) + " epoch is " + str(val_acc))

            test_iter = rn.batch_iter(
                test.contexts, test.questions, test.choices, test.labels,
                test.choices_map, test.context_lens, test.qs_lens,
                batch_size=FLAGS.batch_size, entity_inds=test.entities)
            print("Checking on test set.")
            test_cost, test_acc = run_epoch(
                session, m, test_iter, train_op=None, verbose=False,
                vocab=train.vocab, is_testing=True)
            test_str = ("Test Accuracy: %s\n" % test_acc)
            print(test_str)
            lg.info(test_str)
def main(_):
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)
    logger = logging.getLogger()

    # Do what you need to load datasets from FLAGS.data_dir
    dataset = None
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    # load data sets
    Q_train, P_train, A_start_train, A_end_train, A_len_train, P_raw_train, A_raw_train, Q_len_train, P_len_train = load_data(
        FLAGS.data_dir, "train")
    Q_dev, P_dev, A_start_dev, A_end_dev, A_len_dev, P_raw_dev, A_raw_dev, Q_len_dev, P_len_dev = load_data(
        FLAGS.data_dir, "val")
    # Q_test, P_test, A_start_test, A_end_test = load_data(FLAGS.data_dir, "test")

    # see some data
    logger.info("Training samples read... %s" % (len(Q_train)))
    logger.info("Dev samples read... %s" % (len(Q_dev)))
    # logger.info("Before Padding: \n Q_train[0]: %s \n P_train[0]: %s \n A_start_train[0]: %s \n A_end_train[0]: %s" % (Q_train[0], P_train[0], A_start_train[0], A_end_train[0]))

    # pad the data at load-time. So, we don't need to do any masking later!!!
    # ref: https://keras.io/preprocessing/sequence/
    # if len < maxlen, pad with specified val
    # elif len > maxlen, truncate
    QMAXLEN = FLAGS.QMAXLEN
    PMAXLEN = FLAGS.PMAXLEN
    Q_train = pad_sequences(Q_train, maxlen=QMAXLEN, value=PAD_ID, padding='post')
    P_train = pad_sequences(P_train, maxlen=PMAXLEN, value=PAD_ID, padding='post')
    A_start_train = pad_sequences(A_start_train, maxlen=PMAXLEN, value=0, padding='post')
    A_end_train = pad_sequences(A_end_train, maxlen=PMAXLEN, value=0, padding='post')
    train_data = zip(P_train, Q_train, P_len_train, Q_len_train, A_start_train,
                     A_end_train, A_len_train, P_raw_train, A_raw_train)
    # see the effect of padding
    # logger.info("After Padding: \n Q_train[0]: %s \n P_train[0]: %s \n A_start_train[0]: %s \n A_end_train[0]: %s" % (Q_train[0], P_train[0], A_start_train[0], A_end_train[0]))

    # repeat on dev and test set
    Q_dev = pad_sequences(Q_dev, maxlen=QMAXLEN, value=PAD_ID, padding='post')
    P_dev = pad_sequences(P_dev, maxlen=PMAXLEN, value=PAD_ID, padding='post')
    A_start_dev = pad_sequences(A_start_dev, maxlen=PMAXLEN, value=0, padding='post')
    A_end_dev = pad_sequences(A_end_dev, maxlen=PMAXLEN, value=0, padding='post')
    dev_data = zip(P_dev, Q_dev, P_len_dev, Q_len_dev, A_start_dev, A_end_dev,
                   A_len_dev, P_raw_dev, A_raw_dev)

    global_train_dir = '/tmp/cs224n-squad-train'
    # Adds symlink to {train_dir} from /tmp/cs224n-squad-train to canonicalize the
    # file paths saved in the checkpoint. This allows the model to be reloaded even
    # if the location of the checkpoint files has moved, allowing usage with CodaLab.
    # This must be done on both train.py and qa_answer.py in order to work.
    if os.path.exists(global_train_dir):
        os.unlink(global_train_dir)
    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    os.symlink(os.path.abspath(FLAGS.train_dir), global_train_dir)
    train_dir = global_train_dir

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Graph().as_default():
        with tf.Session() as sess:
            logger.info("Loading embeddings")
            embeddings = np.load(FLAGS.data_dir + '/glove.trimmed.'
                                 + str(FLAGS.embedding_size) + '.npz')
            pretrained_embeddings = embeddings['glove']
            logger.info("Embeddings loaded with shape: %s %s" % (pretrained_embeddings.shape))
            qa = QASystem(FLAGS, pretrained_embeddings, vocab_dim=len(vocab.keys()))
            initialize_model(sess, qa, train_dir)

            # a reasonable model should perhaps give decent results (f1 in double digits)
            # even with training on a smaller set of train_data
            if FLAGS.tiny_sample:
                sample_pct = FLAGS.tiny_sample_pct
                # sample sample_pct % from train and test for local dev
                sam_train = np.random.choice(
                    range(len(train_data)), int(sample_pct / 100 * len(train_data)))
                # no need to sample dev
                sam_dev = range(len(dev_data))  # np.random.choice(range(len(dev_data)), int(FLAGS.dev_tiny_sample_pct/100*len(dev_data)))
                # small sample
                train_data = [train_data[i] for i in sam_train]
                dev_data = [dev_data[i] for i in sam_dev]

            qa.train(sess, train_data, dev_data)
from reader import load_data
from reader import get_vocab
from reader import vocab_transform
from reader import batch_iter

contexts, questions, choices, labels, choices_map, context_lens, qs_lens, entities = \
    load_data(data_path="wdw/test", return_entities=True)

# # 2. Fit vocabulary with questions and context.
# vocab = get_vocab(contexts, questions)
#
# # 3. Transform context and questions
# contexts = vocab_transform(contexts, vocab)
# questions = vocab_transform(questions, vocab)
#
# # 4. Give to batch_iter
# readers = batch_iter(contexts, questions, choices, labels, choices_map,
#                      context_lens, qs_lens)
#
# for q, c, ch, lab, ch_map, c_lens, q_lens in readers:
#     print(c.shape)
#     break
def main(_):
    train_path = os.path.join(FLAGS.data_wdw, 'train')
    val_path = os.path.join(FLAGS.data_wdw, 'val')
    test_path = os.path.join(FLAGS.data_wdw, 'test')

    print("Loading train data from %s" % train_path)
    train = RawInput(rn.load_data(train_path))

    print("Loading val data from %s" % val_path)
    val = RawInput(rn.load_data(val_path), vocabulary=train.vocab)
    if len(train.labels_idx) < len(val.labels_idx):
        print("More validation choices than train")

    print("Loading test data from %s" % test_path)
    test = RawInput(rn.load_data(test_path), vocabulary=train.vocab)
    if len(train.labels_idx) < len(test.labels_idx):
        print("More test choices than train")

    with tf.Graph().as_default():
        initializer = tf.random_uniform_initializer(-FLAGS.init_scale, FLAGS.init_scale)
        print("Loading model..")
        with tf.name_scope("Train"):
            with tf.variable_scope("Model", reuse=None, initializer=initializer):
                m = Model(is_training=True, vocab_size=train.vocab_size,
                          labels_idx=train.labels_idx)

        sv = tf.train.Supervisor(logdir=FLAGS.save_path)
        with sv.managed_session() as session:
            for i in range(FLAGS.max_epoch):
                train_iter = rn.batch_iter(
                    train.contexts, train.questions, train.choices, train.labels,
                    train.choices_map, train.context_lens, train.qs_lens,
                    batch_size=FLAGS.batch_size,
                    context_num_steps=FLAGS.context_steps,
                    question_num_steps=FLAGS.question_steps)
                # lr_decay = config.lr_decay ** max(i - config.max_epoch, 0.0)
                # m.assign_lr(session, config.learning_rate * lr_decay)
                val_iter = rn.batch_iter(
                    val.contexts, val.questions, val.choices, val.labels,
                    val.choices_map, val.context_lens, val.qs_lens,
                    batch_size=FLAGS.batch_size,
                    context_num_steps=FLAGS.context_steps,
                    question_num_steps=FLAGS.question_steps)
                print("Epoch: %d" % (i + 1))
                run_epoch(session, m, train_iter, eval_op=m.train_op, verbose=True)
                print("Checking on validation set.")
                ave_cost, ave_acc = run_epoch(session, m, val_iter, eval_op=None,
                                              verbose=False)
                print("Avg. Val Accuracy: %s" % ave_acc)
                print("Avg. Val Cost: %s" % ave_cost)

            test_iter = rn.batch_iter(
                test.contexts, test.questions, test.choices, test.labels,
                test.choices_map, test.context_lens, test.qs_lens,
                batch_size=FLAGS.batch_size,
                context_num_steps=FLAGS.context_steps,
                question_num_steps=FLAGS.question_steps)
            print("\nChecking on test set.")
            test_cost, test_acc = run_epoch(session, m, test_iter, eval_op=None,
                                            verbose=False)
            print("\nAvg. Test Accuracy: %s\n" % test_acc)

            if FLAGS.save_path:
                print("Saving model to %s." % FLAGS.save_path)
                sv.saver.save(session, FLAGS.save_path, global_step=sv.global_step)
from reader import load_data
from reader import get_vocab
from reader import vocab_transform
from reader import batch_iter

contexts, questions, choices, labels, choices_map, context_lens, qs_lens = \
    load_data(data_path="wdw/test")

# 2. Fit vocabulary with questions and context.
vocab = get_vocab(contexts, questions)

# 3. Transform context and questions
contexts = vocab_transform(contexts, vocab)
questions = vocab_transform(questions, vocab)

# 4. Give to batch_iter
readers = batch_iter(contexts, questions, choices, labels, choices_map,
                     context_lens, qs_lens)

# for q, c, ch, lab, ch_map, c_lens, q_lens in readers:
#     print(c.shape)
#     break
Author: Gabriela Tavares, [email protected]

Adapted from jmetzen.github.io/2015-11-27/vae.html
"""
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

np.random.seed(0)
tf.set_random_seed(0)

# Load data.
import reader
dataset = reader.load_data('juri_train.csv', 'juri_test.csv')
n_samples = dataset.train.num_examples


def xavier_init(fan_in, fan_out, constant=1):
    """Xavier initialization of network weights."""
    # https://stackoverflow.com/questions/33640581/how-to-do-xavier-initialization-on-tensorflow
    low = -constant * np.sqrt(6.0 / (fan_in + fan_out))
    high = constant * np.sqrt(6.0 / (fan_in + fan_out))
    return tf.random_uniform((fan_in, fan_out), minval=low, maxval=high,
                             dtype=tf.float32)


class VariationalAutoencoder(object):
def main():
    raw_data = rdr.load_data("input/")
    train_data, valid_data, test_data, vocabulary, reversed_dictionary = raw_data
    test(MODEL_PATH, test_data, reversed_dictionary, vocabulary)
def test():
    data_path = "./data/zh"
    print("Data Path: " + data_path)
    train_sents, train_trees, dev_sents, dev_trees, vocab_dict, pos_dict, label_dict = reader.load_data(
        data_path)
    print("Vocab Dict Size %d" % len(vocab_dict))
    print("POS Dict Size %d" % len(pos_dict))
    print("Label Dict Size %d" % len(label_dict))  # unique labels size, Nl, not arc label num

    train_dataset = generate_examples(train_sents, train_trees, 1, label_dict)
    # Unknown feature index
    for step, (x, y) in enumerate(train_dataset):
        if step <= 10:
            print("Step id: %d" % step)
            print(x)
            print(y)
        else:
            break
def main(_):
    vocab, rev_vocab = initialize_vocab(FLAGS.vocab_path)
    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))

    global_train_dir = '/tmp/cs224n-squad-train'
    # Adds symlink to {train_dir} from /tmp/cs224n-squad-train to canonicalize the
    # file paths saved in the checkpoint. This allows the model to be reloaded even
    # if the location of the checkpoint files has moved, allowing usage with CodaLab.
    # This must be done on both train.py and qa_answer.py in order to work.
    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    if os.path.exists(global_train_dir):
        os.unlink(global_train_dir)
    os.symlink(os.path.abspath(FLAGS.train_dir), global_train_dir)
    train_dir = global_train_dir

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    # ========= Download Dataset json =========
    # You can change this code to load dataset in your own way
    dev_dirname = os.path.dirname(os.path.abspath(FLAGS.dev_path))
    dev_filename = os.path.basename(FLAGS.dev_path)
    _, _, _ = prepare_dev(dev_dirname, dev_filename, vocab)

    # ========= Process input json =========
    prefix = os.path.join("data", "squad")
    # writes dev.answer, dev.context, dev.question, dev.span
    dev_path = FLAGS.dev_path
    dev_filename = FLAGS.dev_path.split("/")[-1]
    dev_data = data_from_json(os.path.join(prefix, dev_filename))
    dev_num_questions, dev_num_answers = read_write_dataset(dev_data, 'dev', prefix)
    print("Processed {} questions and {} answers in dev".format(
        dev_num_questions, dev_num_answers))

    # writes dev.ids.context, dev.ids.question
    vocab_path = pjoin(os.path.join("data", "squad"), "vocab.dat")
    dev_deposit_path = pjoin(os.path.join("data", "squad"), "dev")
    x_dis_path = dev_deposit_path + ".ids.context"
    y_ids_path = dev_deposit_path + ".ids.question"
    data_to_token_ids(dev_deposit_path + ".context", x_dis_path, vocab_path)
    data_to_token_ids(dev_deposit_path + ".question", y_ids_path, vocab_path)

    # load data sets; for our purposes this is the test set.
    Q_test, P_test, A_start_test, A_end_test, A_len_test, P_raw_test, A_raw_test, Q_len_test, P_len_test = load_data(
        os.path.join("data", "squad"), "dev")
    question_uuid_data = []
    with open(os.path.join("data", "squad") + "/dev.quid") as f:
        for line in f:
            question_uuid_data.append((line))

    # pad the data at load-time. So, we don't need to do any masking later!!!
    # ref: https://keras.io/preprocessing/sequence/
    # if len < maxlen, pad with specified val
    # elif len > maxlen, truncate
    QMAXLEN = FLAGS.QMAXLEN
    PMAXLEN = FLAGS.PMAXLEN
    Q_test = pad_sequences(Q_test, maxlen=QMAXLEN, value=PAD_ID, padding='post')
    P_test = pad_sequences(P_test, maxlen=PMAXLEN, value=PAD_ID, padding='post')
    A_start_test = pad_sequences(A_start_test, maxlen=PMAXLEN, value=0, padding='post')
    A_end_test = pad_sequences(A_end_test, maxlen=PMAXLEN, value=0, padding='post')
    test_data = zip(P_test, Q_test, P_len_test, Q_len_test, A_start_test, A_end_test,
                    A_len_test, P_raw_test, A_raw_test, question_uuid_data)

    # ========= Model-specific =========
    # You must change the following code to adjust to your model
    with tf.Graph().as_default():
        with tf.Session() as sess:
            embeddings = np.load(FLAGS.data_dir + '/glove.trimmed.'
                                 + str(FLAGS.embedding_size) + '.npz')
            pretrained_embeddings = embeddings['glove']
            qa = QASystem(FLAGS, pretrained_embeddings, vocab_dim=len(vocab.keys()))
            initialize_model(sess, qa, train_dir)

            # get predicted start-end indices
            a_s = []  # store all start index preds
            a_e = []  # store all end index preds
            a_s_l = []
            a_e_l = []
            f1 = exact_match = total = 0
            answers = {}
            prog = Progbar(target=1 + int(len(test_data) / FLAGS.batch_size))
            for i, batch in enumerate(minibatches(test_data, FLAGS.batch_size, shuffle=False)):
                batch_test = batch[:4]
                (ys, ye) = qa.predict_on_batch(sess, *batch_test)
                a_s = (np.argmax(ys, axis=1))
                a_e = (np.argmax(ye, axis=1))
                a_s_l = a_s_l + list(a_s)
                a_e_l = a_e_l + list(a_e)
                for j in range(len(a_s)):
                    p_raw = batch[7][j]
                    a_raw = batch[8][j]
                    s = a_s[j]
                    e = a_e[j]
                    pred_raw = ' '.join(p_raw.split()[s:e + 1])
                    f1 += f1_score(pred_raw, a_raw)
                    exact_match += exact_match_score(pred_raw, a_raw)
                    total += 1
                    answers[batch[9][j].strip("\n")] = pred_raw.strip("\n")
                prog.update(i + 1, [("processed", i + 1)])

            exact_match = 100.0 * exact_match / total
            f1 = 100.0 * f1 / total
            print("First Answer Entity level F1/EM: %.2f/%.2f" % (f1, exact_match))
            # answers = generate_answers(question_uuid_data, a_s_l, a_e_l, context_data, rev_vocab)

            # write to json file to root dir
            with io.open('dev-prediction.json', 'w', encoding='utf-8') as f:
                f.write(unicode(json.dumps(answers, ensure_ascii=False)))
import os

from keras.callbacks import ModelCheckpoint, TensorBoard

import CONFIG
from keras_model import model
from reader import KerasBatchGenerator, load_data

train_data, valid_data, _total_words, reversed_dictionary, dictionary = load_data()

train_data_generator = KerasBatchGenerator(train_data,
                                           CONFIG._num_steps,
                                           CONFIG._batch_size,
                                           _total_words,
                                           skip_step=CONFIG._num_steps)
valid_data_generator = KerasBatchGenerator(valid_data,
                                           CONFIG._num_steps,
                                           CONFIG._batch_size,
                                           _total_words,
                                           skip_step=CONFIG._num_steps)

_model = model(total_words=_total_words,
               hidden_size=CONFIG._hidden_size,
               num_steps=CONFIG._num_steps,
               optimizer='adam')
print(_model.summary())

checkpointer = ModelCheckpoint(filepath=os.path.join(os.getcwd(), 'model', 'checkpoint',
def train(config, evaluator, restore=False):
    data, num_emb = reader.load_data(config)
    train_set, dev_set, test_set = data['train'], data['valid'], data['test']

    if not os.path.exists(config.model_dir):
        os.mkdir(config.model_dir)
    if not os.path.exists(config.log_dir):
        os.mkdir(config.log_dir)
    if not os.path.exists(config.log_train_dir):
        os.mkdir(config.log_train_dir)
    if restore == False:
        train_files = glob.glob(config.log_train_dir + '/*')
        for train_file in train_files:
            os.remove(train_file)

    if len(config.gpu_chosen) > 0:
        gpu_options = tf.GPUOptions(
            visible_device_list=",".join(map(str, config.gpu_chosen)),
            per_process_gpu_memory_fraction=config.gup_per_fraction)
    else:
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=config.gup_per_fraction)

    with tf.Graph().as_default(), \
            tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        # with tf.variable_scope("model", reuse=None):
        model = config.model_func(config)
        init = tf.global_variables_initializer()
        saver = tf.train.Saver()
        sess.run(init)

        if not config.DEBUG:
            word_embedding = np.loadtxt(config.word_vec_path, dtype=np.float32)
            # with tf.variable_scope("model", reuse=True):
            with tf.variable_scope("Embed", reuse=True):
                embedding = tf.get_variable(
                    "embedding", [config.vocab_size, config.wordvec_size])
                ea = embedding.assign(word_embedding)
                sess.run(ea)

        best_valid_score = 0.0
        best_valid_epoch = 0
        if restore:
            saver.restore(sess, config.model_path)

        with open(config.log_train_acc_path, "w") as train_acc_fp, \
                open(config.log_valid_acc_path, "w") as valid_acc_fp:
            for epoch in range(config.num_epoch):
                start_time = time.time()
                if epoch > config.decay_epoch:
                    learning_rate = sess.run(model.learning_rate)
                    lr_decay = config.lr_decay
                    # learning_rate = config.learning_rate
                    # lr_decay = config.lr_decay**max(epoch-config.decay_epoch, 0.0)
                    sess.run(tf.assign(model.learning_rate, learning_rate * lr_decay))
                print('=' * 40)
                print(("Epoch %d, Learning rate: %.4f") %
                      (epoch + 1, sess.run(model.learning_rate)))

                avg_loss = evaluator.train(train_set, model, sess)
                print(('\ntrain loss: %.4f') % avg_loss)

                if (epoch + 1) % 5 == 0:
                    train_score = evaluator.evaluate(train_set, model, sess)[0]
                    print(('train top1 acc: %.4f') % train_score)
                    train_acc_fp.write("%d: %.4f\n" % (epoch + 1, train_score))
                    valid_score = evaluator.evaluate(dev_set, model, sess)[0]
                    print(('valid top1 acc: %.4f') % valid_score)
                    valid_acc_fp.write("%d: %.4f\n" % (epoch + 1, valid_score))

                    if valid_score > best_valid_score:
                        best_valid_score = valid_score
                        best_valid_epoch = epoch
                        if config.model_save_by_best_valid:
                            saver.save(sess, config.model_path)

                if not config.model_save_by_best_valid and \
                        (epoch + 1) % config.model_save_period == 0:
                    saver.save(sess, config.model_path)
                if config.model_save_by_best_valid and \
                        epoch - best_valid_epoch > config.early_stop_epoch:
                    break
                print("time per epoch is %.2f min" %
                      ((time.time() - start_time) / 60.0))

            if not config.model_save_by_best_valid:
                saver.save(sess, config.model_path)

            print(("\nbest valid top1 acc: %.4f") % best_valid_score)
            test_score = evaluator.evaluate(test_set, model, sess)[0]
            print(('*' * 10 + 'test top1 acc: %.4f') % test_score)
        self.class_ = key.split('/')[-2]

    def __str__(self):
        self.normalize_beans()
        out_str = ','.join([str(i) for i in self.norm_beans])
        out_str += "," + self.class_
        return out_str

    def normalize_beans(self):
        s = sum(self.beans)
        self.norm_beans = [i * 1e0 / s for i in self.beans]


if __name__ == "__main__":
    files = reader.load_data('../../../1/data')
    descriptors = []
    k = 0
    for filename in files:
        img = cv2.imread(filename)
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        sift = cv2.SIFT()
        kp, des = sift.detectAndCompute(gray, None)
        for index in xrange(len(kp)):
            descriptors.append(SiftDescriptor(kp[index], des[index], filename))
        k += 1
        print k
    kmeans_input_des = np.vstack([x.des for x in descriptors])
test_batch_size = 80
period = [1, 4, 16, 64]
epoch = 200
parameter_configs = {
    "learning_rate": 0.001,
    "lstm_hidden_size": 192,
    "temperature": 1,
    "period": [1, 4, 16, 64]
}

print('Loading data...')
# ----------------------------data------------------------------------------
train_filepath = './dataset/train_data.p'
val_filepath = './dataset/val_data.p'
test_filepath = './dataset/test_data.p'
train_data, train_labels = reader.load_data(train_filepath, time_length, 1)
val_data, val_labels = reader.load_data(val_filepath, time_length, 1)
test_data, test_labels = reader.load_data(test_filepath, time_length, 1)
print train_data.shape
print val_data.shape
print test_data.shape

test_nums = test_data.shape[0]
test_batch_nums = int(math.ceil(test_nums / (test_batch_size + 0.0)))

# shuffle
train_data_tmp = np.zeros(train_data.shape)
train_labels_tmp = np.zeros(train_labels.shape)
count = 0
samples_train_nums = train_data.shape[0]
config = Config()
model = LSTM_RBM(config)

# load data
if config.new_data:
    pitches = reader.data2index('./pitches.pkl')
    config.n_visible = pitches[3]
    inputs_data = pitches[0]
    index_to_data = pitches[1]
    data_to_index = pitches[2]
    reader.save_data('pitches_i2d.pkl', pitches[1])
    reader.save_data('pitches_d2i.pkl', pitches[2])
    reader.save_data('pitches_len.pkl', pitches[3])
    print('information of new data has been saved.')
else:
    data_to_index = reader.load_data('./pitches_d2i.pkl')
    index_to_data = reader.load_data('./pitches_i2d.pkl')
    raw_data = reader.load_data('./pitches.pkl')
    inputs_data = reader.convert_to_index(raw_data, data_to_index)
    len_pitches = reader.load_data('./pitches_len.pkl')
    config.n_visible = len_pitches
    print('information of needed data has been loaded.')

outputs = []
with tf.Session() as sess, tf.device('/cpu:0'):
    if config.new_data:
        sess.run(tf.initialize_all_variables())
        print('check point: initialize variables')
    else:
        model.load_params(sess)
        print('check point: load_params')
def read_train_eval(testid, preprocess, maxseq, modelType, encodeTime, dropout,
                    earlyStop, seedNum, batchSize, maxEpoch, topn):
    '''
    :param testid: identifier
    :param preprocess: whether sequences are stemmed or not
    :param maxseq: the maximum sequence length
    :param modelType: one of SIMPLE_RNN | LSTM_RNN | GRU_RNN
    :param encodeTime:
    :param dropout:
    :param earlyStop: whether training stops when errors are saturated
    :param seedNum: random seed
    :param batchSize:
    :param maxEpoch:
    :param topn: how frequently used word tokens are considered
    :return:
    '''
    N = 1000
    TRAIN_INSTANCE_DIR = os.path.join(
        'log', '{}_{}_{}_{}_{}_{}_{}_{}_{}'.format(testid, preprocess, maxseq,
                                                   modelType, dropout, earlyStop,
                                                   seedNum, batchSize, maxEpoch))
    if not os.path.isdir(TRAIN_INSTANCE_DIR):
        os.mkdir(TRAIN_INSTANCE_DIR)
    log_csvfile = os.path.join(TRAIN_INSTANCE_DIR, 'log.csv')
    result_file = os.path.join(TRAIN_INSTANCE_DIR, 'results.txt')

    print('Load data')
    session_data = load_data(preprocess=preprocess, maxseq=maxseq, encodeTime=encodeTime)
    label_data = load_label()
    topN_words = load_topn_words(session_data, N)
    sequences, labels = filter_labeled_data(session_data, label_data)

    print('Load embedding')
    if preprocess:
        w2v_model = load_embedding(embeddingType=EmbeddingType.PRE_ALL)
    else:
        w2v_model = load_embedding(embeddingType=EmbeddingType.NOPRE_ALL)

    print('Pre-processing sequences')
    print(' - Get word vectors')
    vocab_size, embedding_dim, word_indices, embedding_matrix = \
        get_wordvectors_from_keyedvectors(w2v_model, seed=seedNum)

    print(' - Transform sequences')
    if topn is False:
        transformed_seq = transform_sequence(sequences, word_indices=word_indices)
    else:
        transformed_seq = transform_sequence_using_topn(
            sequences, word_indices, w2v_model, topN_words)

    print(' - Transform labels')
    transformed_labels = transform_label(label_data)

    print(' - Transform seq data to list')
    X, y = transform_labeled_data_listform(transformed_seq, transformed_labels)

    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=seedNum)
    for train_index, test_index in sss.split(X, y):
        pass
    X_train, X_test = [X[i] for i in train_index], [X[i] for i in test_index]
    y_train, y_test = [y[i] for i in train_index], [y[i] for i in test_index]

    X_train, y_train = random_oversampling(X_train, y_train, seed=seedNum)
    X_test, y_test = random_oversampling(X_test, y_test, seed=seedNum)

    X_train = sequence.pad_sequences(X_train, maxlen=maxseq)
    X_test = sequence.pad_sequences(X_test, maxlen=maxseq)

    list_callbacks = [CSVLogger(log_csvfile, separator=',', append=False)]
    if earlyStop:
        earlyStopping = EarlyStopping(monitor='val_loss', patience=10, verbose=1,
                                      mode='auto')
        list_callbacks.append(earlyStopping)

    if modelType is ModelType.GRU_RNN:
        model = GRU_RNN(vocab_size=vocab_size, maxlen=maxseq, dropout=dropout,
                        embedding=embedding_matrix, embedding_dim=embedding_dim)()
    elif modelType is ModelType.LSTM_RNN:
        model = LSTM_RNN(vocab_size=vocab_size, maxlen=maxseq, dropout=dropout,
                         embedding=embedding_matrix, embedding_dim=embedding_dim)()
    elif modelType is ModelType.SIMPLE_RNN:
        model = SIMPLE_RNN(vocab_size=vocab_size, maxlen=maxseq, dropout=dropout,
                           embedding=embedding_matrix, embedding_dim=embedding_dim)()
    else:
        print('This function should be set for XXX_single modeltype.')
        exit()

    model.fit({'text': X_train}, y_train,
              validation_data=({'text': X_test}, y_test),
              batch_size=batchSize, epochs=maxEpoch, verbose=1,
              callbacks=list_callbacks)
    y_pred = model.predict({'text': X_test}, batch_size=batchSize, verbose=1)

    print('Evaluation..')
    with open(result_file, 'wt') as f:
        writer.eval(y_pred, y_test, file=f)
import collections

import reader
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import GRU, RepeatVector, TimeDistributed, Dense
from keras.models import Model, Sequential
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

eng_sen = reader.load_data('data/small_vocab_en')
fre_sen = reader.load_data('data/small_vocab_fr')


def tokenize(x):
    x_tk = Tokenizer(char_level=False)
    x_tk.fit_on_texts(x)
    return x_tk.texts_to_sequences(x), x_tk


def pad(x, length=None):
    if length is None:
        length = max([len(sentence) for sentence in x])
    return pad_sequences(x, maxlen=length, padding="post")


def preprocess(x, y):
    preprocess_x, x_tk = tokenize(x)
    preprocess_y, y_tk = tokenize(y)
    preprocess_x = pad(preprocess_x)
# def main(_):
#     if (os.path.exists(FLAGS.save_path)):
#         shutil.rmtree(FLAGS.save_path)
#     os.makedirs(FLAGS.save_path)
#     t_log = open(os.path.join(FLAGS.save_path, 'train.txt'), 'w')
#     v_log = open(os.path.join(FLAGS.save_path, 'val.txt'), 'w')
#     te_log = open(os.path.join(FLAGS.save_path, 'test.txt'), 'w')

data_path = "/home/manoj/oogie-boogie/wdw"
train_path = os.path.join(data_path, 'test')
val_path = os.path.join(data_path, 'test')
test_path = os.path.join(data_path, 'test')

config = Config()

print("Loading train data from %s" % train_path)
train = RawInput(rn.load_data(train_path))

# print("Loading val data from %s" % val_path)
# val = RawInput(rn.load_data(val_path), vocabulary=train.vocab, c_len=train.c_len,
#                q_len=train.q_len)
# if len(train.labels_idx) < len(val.labels_idx):
#     print("More validation choices than train")
#
# print("Loading test data from %s" % test_path)
# test = RawInput(rn.load_data(test_path), vocabulary=train.vocab, c_len=train.c_len,
#                 q_len=train.q_len)
# if len(train.labels_idx) < len(test.labels_idx):
#     print("More test choices than train")

with tf.Graph().as_default():
    initializer = tf.random_uniform_initializer(-config.init_scale,