                    dest='verbose')
parser.add_argument('--optim', help='Optimizer algorithm',
                    default='adagrad',
                    choices=['adagrad', 'adadelta', 'adam'])
args = parser.parse_args()

utils.config_logger(args.verbose)
logger = utils.get_logger('train')
logger.info('Training with following options: %s' % ' '.join(sys.argv))

# whether to generate embeddings for unknown, padding and null
word_dict, embeddings = ioutils.load_embeddings(args.embeddings, args.vocab,
                                                True, normalize=True)

logger.info('Converting words to indices')
# find out which labels are there in the data
# (more flexible to different datasets)
label_dict = utils.create_label_dict_SSQA()
train_data, labels, sents1, sents2 = utils.create_dataset_SSQA(
    args.train, word_dict, label_dict)
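# --- illustrative sketch (not part of the original script) -----------------
# The generate=True flag above asks ioutils.load_embeddings to create extra
# vectors for the unknown, padding and null tokens. A minimal version of that
# step could look like the following; the helper name, the ordering of the
# three extra rows and the init range are assumptions, not the repo's code.
import numpy as np

def generate_extra_embeddings(embeddings, seed=42):
    """Prepend random vectors for the unknown, padding and null tokens."""
    rng = np.random.RandomState(seed)
    dim = embeddings.shape[1]
    extra = rng.uniform(-0.1, 0.1, (3, dim))  # unknown, padding, null
    return np.concatenate([extra, embeddings], axis=0)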
    for name, value in vars(args).items():
        f.write('%s\t%s\n' % (name, value))
    f.close()
else:
    args.save = args.load

utils.config_logger(args.verbose)
logger = utils.get_logger('train')

logger.info('Reading training data')
train_pairs, train_max = ioutils.read_corpus(args.train, args.lower,
                                             args.lang, args.ratio)
logger.info('Reading validation data')
valid_pairs, valid_max = ioutils.read_corpus(args.validation, args.lower,
                                             args.lang)
logger.info('Reading test data')
test_pairs, test_max = ioutils.read_corpus(args.test, args.lower, args.lang)

logger.info('Reading word embeddings')
word_dict, embeddings = ioutils.load_embeddings(args.embeddings, args.vocab)
max_len = None
logger.debug('Embeddings have shape {} (including unknown, padding and null)'
             .format(embeddings.shape))

logger.info('Converting words to indices')
# find out which labels are there in the data
# (more flexible to different datasets)
label_dict = utils.create_label_dict(train_pairs)
train_data = utils.create_dataset(train_pairs, word_dict, label_dict,
                                  max_len, max_len)
valid_data = utils.create_dataset(valid_pairs, word_dict, label_dict,
                                  max_len, max_len)
test_data = utils.create_dataset(test_pairs, word_dict, label_dict,
                                 max_len, max_len)

ioutils.write_extra_embeddings(embeddings, args.save)
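# --- illustrative sketch (not part of the original script) -----------------
# write_extra_embeddings above persists the generated unknown/padding/null
# rows next to the model so evaluation can reload them via load_extra_from.
# A possible implementation, assuming the three extra rows sit at the start
# of the matrix and an 'extra-embeddings.npy' filename (both assumptions):
import os
import numpy as np

def write_extra_embeddings_sketch(embeddings, save_dir):
    """Save only the generated rows; the pretrained ones stay on disk."""
    np.save(os.path.join(save_dir, 'extra-embeddings.npy'), embeddings[:3])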
train_pairs += ioutils.read_corpus(args.additional_training, args.lower,
                                   args.lang)
assert not args.cont  # not implemented yet

# whether to generate embeddings for unknown, padding and null
is_really_cont = args.warm is not None or (
    args.cont and os.path.exists(os.path.join(args.save, "model.meta")))
warmup_model = args.warm
if is_really_cont:
    logger.info('Found a model. Fine-tuning...')
    word_dict, embeddings = ioutils.load_embeddings(
        args.embeddings, args.vocab, generate=False,
        normalize=True, load_extra_from=warmup_model)
    params = ioutils.load_params(warmup_model)
else:
    word_dict, embeddings = ioutils.load_embeddings(args.embeddings,
                                                    args.vocab,
                                                    generate=True,
                                                    normalize=True)
    ioutils.write_params(args.save, lowercase=args.lower,
                         language=args.lang, model=args.model)
    ioutils.write_extra_embeddings(embeddings, args.save)
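# --- illustrative sketch (not part of the original script) -----------------
# The write_params / load_params pair above round-trips the preprocessing
# options (lowercase, language, model) so that fine-tuning and evaluation
# reuse the same settings as training. A JSON-based version, under an
# assumed 'params.json' filename inside the model directory:
import json
import os

def write_params_sketch(save_dir, **params):
    with open(os.path.join(save_dir, 'params.json'), 'w') as f:
        json.dump(params, f)

def load_params_sketch(model_dir):
    with open(os.path.join(model_dir, 'params.json')) as f:
        return json.load(f)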
                    dest='verbose')
parser.add_argument('-e', help='Print pairs and labels that got a wrong answer',
                    action='store_true', dest='errors')
args = parser.parse_args()

utils.config_logger(verbose=args.verbose)
params = ioutils.load_params(args.model)

sess = tf.InteractiveSession()
model_class = utils.get_model_class(params)
model = model_class.load(args.model, sess)
word_dict, embeddings = ioutils.load_embeddings(args.embeddings,
                                                args.vocabulary,
                                                generate=False,
                                                load_extra_from=args.model,
                                                normalize=True)
model.initialize_embeddings(sess, embeddings)
label_dict = ioutils.load_label_dict(args.model)

dataset, labels, sents1, sents2 = utils.create_dataset_SSQA(
    args.dataset, word_dict, label_dict)

loss, acc, answers = model.evaluate(sess, dataset, True)
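# --- illustrative sketch (not part of the original script) -----------------
# With the -e/--errors flag, the wrongly answered pairs could be reported
# roughly as below; that `answers` holds one predicted label index per pair
# is an assumption about model.evaluate's return value.
if args.errors:
    inverse_dict = {index: label for label, index in label_dict.items()}
    for sent1, sent2, gold, pred in zip(sents1, sents2, labels, answers):
        if pred != gold:
            print('%s\t%s\tgold: %s\tpredicted: %s'
                  % (sent1, sent2, inverse_dict[gold], inverse_dict[pred]))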
                    action='store_true', dest='verbose', default=True)
parser.add_argument('--optim', help='Optimizer algorithm',
                    default='adagrad',
                    choices=['adagrad', 'adadelta', 'adam'])
args = parser.parse_args()

utils.config_logger(args.verbose)
logger = utils.get_logger('train')
logger.debug('Training with following options: %s' % ' '.join(sys.argv))

# hwwang: changed normalize to False
word_dict, embeddings = ioutils.load_embeddings(args.embeddings, args.vocab,
                                                False, normalize=False)

gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.6)
sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

logger.info('Creating model')
vocab_size = embeddings.shape[0]
embedding_size = embeddings.shape[1]

# there are no word pairs here before lemmatization
train_pairs, _ = ioutils.read_corpus(args.train, args.lower, args.lang)
valid_pairs, _ = ioutils.read_corpus(args.validation, args.lower, args.lang)
test_pairs, _ = ioutils.read_corpus(args.test, args.lower, args.lang)
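# --- illustrative alternative (not part of the original script) ------------
# per_process_gpu_memory_fraction=0.6 above reserves a fixed 60% of GPU
# memory up front. If the fixed slice is too rigid, the TF 1.x API can
# instead grow the allocation on demand:
#
#     gpu_options = tf.GPUOptions(allow_growth=True)
#     sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))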