import sys

import ioutils
import cPickle

print "Reading corpus from", sys.argv[1]
useful_data = ioutils.read_corpus(sys.argv[1], False)
print "Writing..."
with open(sys.argv[2], "wb") as f:
    cPickle.dump(useful_data, f)
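# A minimal sketch of reading the pickled corpus back (the output path is
# hypothetical; cPickle is the Python 2 module name used above):
#
#     import cPickle
#
#     with open("corpus.pkl", "rb") as f:
#         useful_data = cPickle.load(f)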
if not args.load:
    timestamp = (datetime.datetime
                 .fromtimestamp(time.time())
                 .strftime('%Y_%m_%d_%H_%M_%S'))
    args.save = args.save + "/" + timestamp
    os.mkdir(args.save)
    # record the arguments used for this run
    with open(args.save + "/argument.txt", "w") as f:
        for i in vars(args):
            f.write(str(i) + "\t" + str(vars(args)[i]) + "\n")
else:
    args.save = args.load

utils.config_logger(args.verbose)
logger = utils.get_logger('train')

logger.info('Reading training data')
train_pairs, train_max = ioutils.read_corpus(args.train, args.lower,
                                             args.lang, args.ratio)
logger.info('Reading validation data')
valid_pairs, valid_max = ioutils.read_corpus(args.validation, args.lower,
                                             args.lang)
logger.info('Reading test data')
test_pairs, test_max = ioutils.read_corpus(args.test, args.lower, args.lang)

logger.info('Reading word embeddings')
word_dict, embeddings = ioutils.load_embeddings(args.embeddings, args.vocab)
max_len = None
# embeddings = utils.normalize_embeddings(embeddings)
logger.debug('Embeddings have shape {} (including unknown, padding and null)'
             .format(embeddings.shape))

logger.info('Converting words to indices')
# find out which labels are there in the data
# (more flexible to different datasets)
label_dict = utils.create_label_dict(train_pairs)
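# create_label_dict is defined in utils; a minimal sketch of what such a
# helper might look like (an assumption, not this repo's actual code). It
# assumes each training pair carries its gold label in position 2 and maps
# every distinct label to a contiguous integer index:
#
#     def create_label_dict(pairs):
#         labels = sorted(set(pair[2] for pair in pairs))
#         return {label: i for i, label in enumerate(labels)}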
session_config = tf.ConfigProto(allow_soft_placement=True)
session_config.gpu_options.allow_growth = True
# session_config.gpu_options.per_process_gpu_memory_fraction = 0.5
sess = tf.InteractiveSession(config=session_config)

model_class = utils.get_model_class(params)
model, _ = model_class.load(args.model, sess)
word_dict, embeddings = ioutils.load_embeddings(args.embeddings,
                                                args.vocabulary,
                                                generate=False,
                                                load_extra_from=args.model,
                                                normalize=True)
model.initialize_embeddings(sess, embeddings)
label_dict = ioutils.load_label_dict(args.model)

pairs = ioutils.read_corpus(args.dataset, params['lowercase'],
                            params['language'])
dataset = utils.create_dataset(pairs, word_dict, label_dict)

genres = None
if args.genres is not None:
    genres = utils.read_genres(args.genres)

loss, acc, answers, logits = model.evaluate(sess, dataset, True)
print('# problems: %s' % dataset.num_items)
print('Loss: %f' % loss)
print('Accuracy: %f' % acc)
if args.genres:
    print_acc_per_genre(pairs, answers, logits, label_dict, genres)
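# print_acc_per_genre is used above but defined elsewhere; a minimal sketch
# of the idea (an assumption about its implementation). It assumes `genres`
# is a list of genre names parallel to `pairs`, that `answers` are predicted
# label indices, and that each pair carries its gold label string last:
#
#     import collections
#
#     def print_acc_per_genre(pairs, answers, logits, label_dict, genres):
#         hits = collections.defaultdict(int)
#         totals = collections.defaultdict(int)
#         for pair, answer, genre in zip(pairs, answers, genres):
#             totals[genre] += 1
#             if answer == label_dict[pair[-1]]:
#                 hits[genre] += 1
#         for genre in sorted(totals):
#             print('%s: %f' % (genre, float(hits[genre]) / totals[genre]))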
# Note: unlike the evaluation script above, embeddings here are loaded
# without extra vectors from the model directory and without normalization.
word_dict, embeddings = ioutils.load_embeddings(args.embeddings,
                                                args.vocabulary,
                                                generate=False,
                                                load_extra_from=None,
                                                normalize=False)
model.initialize_embeddings(sess, embeddings)
label_dict = ioutils.load_label_dict(args.model)

pairs, wordpairs = ioutils.read_corpus(args.dataset, True,
                                       params['language'])
dataset, _, _ = utils.create_dataset(pairs, wordpairs, word_dict, label_dict,
                                     max_len1=model.maxlen1,
                                     max_len2=model.maxlen2)
print("Test Dataset Size: %d" % dataset.num_items)

loss, acc, answers, logits = model.evaluate(sess, dataset, True,
                                            batch_size=64)
print(np.array(logits).shape)
label_dict_inverse = {}
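# The snippet ends as the inverse label map is being built; a minimal sketch
# of the likely continuation (an assumption, not the repo's actual code):
# invert label_dict and decode the argmax of each logit row into a label.
for label, index in label_dict.items():
    label_dict_inverse[index] = label
predictions = [label_dict_inverse[i] for i in np.argmax(logits, axis=1)]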
                    default=0.0)
parser.add_argument('--report', help='Number of batches between '
                                     'performance reports',
                    default=100, type=int)
parser.add_argument('-v', help='Verbose', action='store_true',
                    dest='verbose')
parser.add_argument('--optim', help='Optimizer algorithm',
                    default='adagrad',
                    choices=['adagrad', 'adadelta', 'adam'])
args = parser.parse_args()

utils.config_logger(args.verbose)
logger = utils.get_logger('train')
logger.debug('Training with following options: %s' % ' '.join(sys.argv))

train_pairs = ioutils.read_corpus(args.train, args.lower, args.lang)
valid_pairs = ioutils.read_corpus(args.validation, args.lower, args.lang)

# whether to generate embeddings for unknown, padding, null
word_dict, embeddings = ioutils.load_embeddings(args.embeddings, args.vocab,
                                                True, normalize=True)

logger.info('Converting words to indices')
# find out which labels are there in the data
# (more flexible to different datasets)
label_dict = utils.create_label_dict(train_pairs)
train_data = utils.create_dataset(train_pairs, word_dict, label_dict)
valid_data = utils.create_dataset(valid_pairs, word_dict, label_dict)

ioutils.write_params(args.save, lowercase=args.lower, language=args.lang,
                     model=args.model)
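# write_params lives in ioutils; a minimal sketch of the idea (an assumption
# about its implementation): persist the preprocessing options next to the
# model so evaluation can reload them later as `params`.
#
#     import json
#     import os
#
#     def write_params(dirname, **kwargs):
#         with open(os.path.join(dirname, 'params.json'), 'w') as f:
#             json.dump(kwargs, f)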
                    default=0, type=int)
parser.add_argument('--continue', help='Continue training.',
                    action='store_true', dest='cont')
parser.add_argument('--warm-start', help='Use pre-trained model.',
                    dest='warm')
args = parser.parse_args()

utils.config_logger(args.verbose)
logger = utils.get_logger('train')
logger.debug('Training with following options: %s' % ' '.join(sys.argv))

train_pairs = ioutils.read_corpus(args.train, args.lower, args.lang)
valid_pairs = ioutils.read_corpus(args.validation, args.lower, args.lang)
if args.additional_training is not None:
    train_pairs += ioutils.read_corpus(args.additional_training, args.lower,
                                       args.lang)

assert not args.cont  # resuming an interrupted run is not implemented yet

is_really_cont = args.warm is not None or (
    args.cont and os.path.exists(os.path.join(args.save, "model.meta")))
warmup_model = args.warm
if is_really_cont:
    logger.info('Found a model. Fine-tuning...')
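# A minimal sketch of how the warm start might proceed from here (an
# assumption using standard TF 1.x checkpointing, not this repo's actual
# code): restore the pre-trained variables into the session before training
# continues. Assumes `sess` exists and the directory holds a checkpoint.
#
#     import tensorflow as tf
#
#     saver = tf.train.Saver()
#     checkpoint = tf.train.latest_checkpoint(warmup_model or args.save)
#     saver.restore(sess, checkpoint)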