def test_main():
    """Command-line entry point: dump contextual embeddings for a corpus.

    Parses ``sys.argv[2:]``, restores the model stored in the ``--model``
    directory, runs it over ``--input`` and writes the requested layers to
    files named ``<output_prefix>.ly<output_layer>.<output_format>`` (HDF5
    and/or tab-separated text). Every distinct sentence is written once;
    repeated sentences are skipped.

    Raises:
        ValueError: if ``config['encoder']['name']`` is neither ``lstm`` nor
            ``elmo`` when a sentence has to be extracted.
    """
    # Configurations
    cmd = argparse.ArgumentParser('The testing components of')
    cmd.add_argument('--gpu', default=-1, type=int, help='use id of gpu, -1 if cpu.')
    cmd.add_argument('--input_format', default='plain',
                     choices=('plain', 'conll', 'conll_char', 'conll_char_vi'),
                     help='the input format.')
    cmd.add_argument("--input", help="the path to the raw text file.")
    cmd.add_argument("--output_format", default='hdf5',
                     help='the output format. Supported format includes (hdf5, txt).'
                          ' Use comma to separate the format identifiers,'
                          ' like \'--output_format=hdf5,plain\'')
    cmd.add_argument("--output_prefix",
                     help='the prefix of the output file. The output file is in the format of '
                          '<output_prefix>.<output_layer>.<output_format>')
    cmd.add_argument("--output_layer",
                     help='the target layer to output. 0 for the word encoder, 1 for the first LSTM '
                          'hidden layer, 2 for the second LSTM hidden layer, -1 for an average'
                          'of 3 layers.')
    cmd.add_argument("--model", required=True, help="path to save model")
    cmd.add_argument("--batch_size", "--batch", type=int, default=1, help='the batch size.')

    args = cmd.parse_args(sys.argv[2:])
    if args.output_layer is None:
        # Fail fast: otherwise the run only dies after the model has been
        # loaded, with an AttributeError on ``None.split``.
        cmd.error('--output_layer is required (comma-separated layer ids, e.g. -1)')

    if args.gpu >= 0:
        torch.cuda.set_device(args.gpu)
    use_cuda = args.gpu >= 0 and torch.cuda.is_available()

    def read_lexicon(filename):
        """Load a ``token<TAB>index`` dictionary file from the model directory."""
        lexicon = {}
        with codecs.open(os.path.join(args.model, filename), 'r', encoding='utf-8') as fpi:
            for line in fpi:
                tokens = line.strip().split('\t')
                if len(tokens) == 1:
                    # Only the index survived strip(); presumably the token
                    # itself was whitespace, so U+3000 stands in for it.
                    tokens.insert(0, '\u3000')
                token, i = tokens
                lexicon[token] = int(i)
        return lexicon

    # load the model configurations (the handle is closed deterministically)
    with codecs.open(os.path.join(args.model, 'config.json'), 'r', encoding='utf-8') as fin:
        args2 = dict2namedtuple(json.load(fin))

    with open(args2.config_path, 'r') as fin:
        config = json.load(fin)

    # For the model trained with character-based word encoder.
    if config['token_embedder']['char_dim'] > 0:
        char_lexicon = read_lexicon('char.dic')
        char_emb_layer = EmbeddingLayer(config['token_embedder']['char_dim'], char_lexicon,
                                        fix_emb=False, embs=None)
        logging.info('char embedding size: ' + str(len(char_emb_layer.word2id)))
    else:
        char_lexicon = None
        char_emb_layer = None

    # For the model trained with word form word encoder.
    if config['token_embedder']['word_dim'] > 0:
        word_lexicon = read_lexicon('word.dic')
        word_emb_layer = EmbeddingLayer(config['token_embedder']['word_dim'], word_lexicon,
                                        fix_emb=False, embs=None)
        logging.info('word embedding size: ' + str(len(word_emb_layer.word2id)))
    else:
        word_lexicon = None
        word_emb_layer = None

    # instantiate the model
    model = Model(config, word_emb_layer, char_emb_layer, use_cuda)
    if use_cuda:
        model.cuda()

    logging.info(str(model))
    model.load_model(args.model)

    # read test data according to input format
    if args.input_format == 'plain':
        read_function = read_corpus
    elif args.input_format == 'conll':
        read_function = read_conll_corpus
    elif args.input_format == 'conll_char':
        read_function = read_conll_char_corpus
    else:
        read_function = read_conll_char_vi_corpus

    if config['token_embedder']['name'].lower() == 'cnn':
        test, text = read_function(args.input, config['token_embedder']['max_characters_per_token'])
    else:
        test, text = read_function(args.input)

    # create test batches from the input data.
    test_w, test_c, test_lens, test_masks, test_text = create_batches(
        test, args.batch_size, word_lexicon, char_lexicon, config, use_cuda=use_cuda, text=text)

    # configure the model to evaluation mode.
    model.eval()

    sent_set = set()
    cnt = 0
    encoder_name = config['encoder']['name'].lower()

    output_formats = args.output_format.split(',')
    # Materialise the layer ids: on Python 3 ``map`` is a one-shot iterator, so
    # without ``list`` only the first output format would get any handlers.
    output_layers = list(map(int, args.output_layer.split(',')))

    handlers = {}
    try:
        for output_format in output_formats:
            if output_format not in ('hdf5', 'txt'):
                print('Unknown output_format: {0}'.format(output_format))
                continue
            for output_layer in output_layers:
                filename = '{0}.ly{1}.{2}'.format(args.output_prefix, output_layer, output_format)
                handlers[output_format, output_layer] = \
                    h5py.File(filename, 'w') if output_format == 'hdf5' else open(filename, 'w')

        for w, c, lens, masks, texts in zip(test_w, test_c, test_lens, test_masks, test_text):
            output = model.forward(w, c, masks)
            for i, tokens in enumerate(texts):
                # Sentence text, tab-separated; '.' and '/' are escaped,
                # presumably so the key is usable as an HDF5 dataset name.
                sent = '\t'.join(tokens)
                sent = sent.replace('.', '$period$')
                sent = sent.replace('/', '$backslash$')
                if sent in sent_set:
                    continue
                sent_set.add(sent)

                # Positions 0 and lens[i]-1 are dropped (presumably the
                # sentence boundary symbols -- confirm against create_batches).
                if encoder_name == 'lstm':
                    data = output[i, 1:lens[i] - 1, :].data
                elif encoder_name == 'elmo':
                    data = output[:, i, 1:lens[i] - 1, :].data
                else:
                    raise ValueError('Unknown encoder name: {0}'.format(encoder_name))
                if use_cuda:
                    data = data.cpu()
                data = data.numpy()

                for (output_format, output_layer) in handlers:
                    fout = handlers[output_format, output_layer]
                    if output_layer == -1:
                        payload = np.average(data, axis=0)
                    else:
                        payload = data[output_layer]
                    if output_format == 'hdf5':
                        fout.create_dataset(sent, payload.shape, dtype='float32', data=payload)
                    else:
                        # word: current token of the sentence; row: the vector
                        # for that token (1024-dim according to the original note).
                        for word, row in zip(tokens, payload):
                            print('{0}\t{1}'.format(
                                word, '\t'.join(['{0:.8f}'.format(elem) for elem in row])), file=fout)
                        print('', file=fout)

                cnt += 1
                if cnt % 1000 == 0:
                    logging.info('Finished {0} sentences.'.format(cnt))
    finally:
        # Close every output file even when inference fails half-way.
        for handler in handlers.values():
            handler.close()
def test_main():
    """Command-line entry point: write averaged / layer-1 embeddings to HDF5.

    Parses ``sys.argv[2:]``, restores the model stored in the ``--model``
    directory and runs it over ``--input``. For every distinct sentence the
    layer average is stored in ``--output_ave`` and ``data[1]`` in
    ``--output_lstm``; either output is skipped when its path is not given.

    Raises:
        ValueError: if the token embedder name is neither ``cnn`` nor
            ``lstm``, or the encoder name is neither ``lstm`` nor ``elmo``.
    """
    cmd = argparse.ArgumentParser('The testing components of')
    cmd.add_argument('--gpu', default=-1, type=int, help='use id of gpu, -1 if cpu.')
    cmd.add_argument('--input_format', default='plain',
                     choices=('plain', 'conll', 'conll_char', 'conll_char_vi'),
                     help='the input format.')
    cmd.add_argument("--input", help="the path to the raw text file.")
    cmd.add_argument('--output_ave', help='the path to the average embedding file.')
    cmd.add_argument('--output_lstm', help='the path to the 1st lstm-output embedding file.')
    cmd.add_argument("--model", required=True, help="path to save model")
    cmd.add_argument("--batch_size", "--batch", type=int, default=1, help='the batch size.')

    args = cmd.parse_args(sys.argv[2:])

    if args.gpu >= 0:
        torch.cuda.set_device(args.gpu)
    use_cuda = args.gpu >= 0 and torch.cuda.is_available()

    def read_lexicon(filename):
        """Load a ``token<TAB>index`` dictionary file from the model directory."""
        lexicon = {}
        with codecs.open(os.path.join(args.model, filename), 'r', encoding='utf-8') as fpi:
            for line in fpi:
                tokens = line.strip().split('\t')
                if len(tokens) == 1:
                    # Only the index survived strip(); presumably the token
                    # itself was whitespace, so U+3000 stands in for it.
                    tokens.insert(0, '\u3000')
                token, i = tokens
                lexicon[token] = int(i)
        return lexicon

    # Training-time options; the handle is closed instead of being leaked.
    with codecs.open(os.path.join(args.model, 'config.json'), 'r', encoding='utf-8') as fin:
        args2 = dict2namedtuple(json.load(fin))

    with open(args2.config_path, 'r') as fin:
        config = json.load(fin)

    if config['token_embedder']['char_dim'] > 0:
        char_lexicon = read_lexicon('char.dic')
        char_emb_layer = EmbeddingLayer(config['token_embedder']['char_dim'], char_lexicon,
                                        fix_emb=False, embs=None)
        logging.info('char embedding size: ' + str(len(char_emb_layer.word2id)))
    else:
        char_lexicon = None
        char_emb_layer = None

    if config['token_embedder']['word_dim'] > 0:
        word_lexicon = read_lexicon('word.dic')
        word_emb_layer = EmbeddingLayer(config['token_embedder']['word_dim'], word_lexicon,
                                        fix_emb=False, embs=None)
        logging.info('word embedding size: ' + str(len(word_emb_layer.word2id)))
    else:
        word_lexicon = None
        word_emb_layer = None

    model = Model(config, word_emb_layer, char_emb_layer, use_cuda)
    if use_cuda:
        model.cuda()

    logging.info(str(model))
    model.load_model(args.model)

    # Reject an unsupported embedder explicitly; previously ``test`` was simply
    # left undefined and the run died later with a NameError.
    embedder_name = config['token_embedder']['name'].lower()
    if embedder_name not in ('cnn', 'lstm'):
        raise ValueError('Unknown token embedder name: {}'.format(embedder_name))

    if args.input_format == 'plain':
        read_function = read_corpus
    elif args.input_format == 'conll':
        read_function = read_conll_corpus
    elif args.input_format == 'conll_char':
        read_function = read_conll_char_corpus
    else:
        read_function = read_conll_char_vi_corpus

    if embedder_name == 'cnn':
        test, text = read_function(
            args.input, config['token_embedder']['max_characters_per_token'])
    else:
        test, text = read_function(args.input)

    test_w, test_c, test_lens, test_masks, test_text = create_batches(
        test, args.batch_size, word_lexicon, char_lexicon, config, use_cuda=use_cuda, text=text)

    # Diagnostic: length of the longest input sentence (max() of an empty
    # sequence raises, hence the guard for an empty corpus).
    if test:
        print(max([len(x) for x in test]))

    model.eval()

    sent_set = set()
    cnt = 0
    encoder_name = config['encoder']['name'].lower()

    fout_ave = None
    fout_lstm = None
    try:
        if args.output_ave is not None:
            fout_ave = h5py.File(args.output_ave, 'w')
        if args.output_lstm is not None:
            fout_lstm = h5py.File(args.output_lstm, 'w')

        for w, c, lens, masks, texts in zip(test_w, test_c, test_lens, test_masks, test_text):
            output = model.forward(w, c, masks)
            for i, tokens in enumerate(texts):
                # Tab-joined sentence used as the dataset key; '.' and '/' are
                # escaped, presumably to keep it a valid HDF5 dataset name.
                sent = '\t'.join(tokens)
                sent = sent.replace('.', '$period$')
                sent = sent.replace('/', '$backslash$')
                if sent in sent_set:
                    continue
                sent_set.add(sent)

                # Positions 0 and lens[i]-1 are dropped (presumably the
                # sentence boundary symbols -- confirm against create_batches).
                if encoder_name == 'lstm':
                    data = output[i, 1:lens[i] - 1, :].data
                elif encoder_name == 'elmo':
                    data = output[:, i, 1:lens[i] - 1, :].data
                else:
                    raise ValueError('Unknown encoder name: {0}'.format(encoder_name))
                if use_cuda:
                    data = data.cpu()
                data = data.numpy()

                if fout_ave is not None:
                    data_ave = np.average(data, axis=0)
                    fout_ave.create_dataset(sent, data_ave.shape, dtype='float32', data=data_ave)

                if fout_lstm is not None:
                    data_lstm = data[1]
                    fout_lstm.create_dataset(sent, data_lstm.shape, dtype='float32', data=data_lstm)

                cnt += 1
                if cnt % 1000 == 0:
                    logging.info('Finished {0} sentences.'.format(cnt))
    finally:
        # Close the HDF5 files even when inference fails half-way.
        if fout_ave is not None:
            fout_ave.close()
        if fout_lstm is not None:
            fout_lstm.close()
def test():
    """Command-line entry point: report the perplexity of a trained model.

    Parses ``sys.argv[2:]``, restores the model stored in the ``--model``
    directory, evaluates it on ``--input`` with ``eval_model`` and logs the
    result as ``test_ppl``.

    Raises:
        ValueError: if the configured token embedder is neither ``cnn`` nor
            ``lstm``.
    """
    cmd = argparse.ArgumentParser('The testing components of')
    cmd.add_argument('--gpu', default=-1, type=int, help='use id of gpu, -1 if cpu.')
    cmd.add_argument("--input", help="the path to the raw text file.")
    cmd.add_argument("--model", required=True, help="path to save model")
    cmd.add_argument("--batch_size", "--batch", type=int, default=1, help='the batch size.')
    args = cmd.parse_args(sys.argv[2:])

    # NOTE: the device is deliberately not pinned with torch.cuda.set_device
    # here (that call was disabled in the original); --gpu only decides whether
    # CUDA is used at all.
    use_cuda = args.gpu >= 0 and torch.cuda.is_available()

    def read_lexicon(filename):
        """Load a ``token<TAB>index`` dictionary file from the model directory."""
        lexicon = {}
        with codecs.open(os.path.join(args.model, filename), 'r', encoding='utf-8') as fpi:
            for line in fpi:
                tokens = line.strip().split('\t')
                if len(tokens) == 1:
                    # Only the index survived strip(); presumably the token
                    # itself was whitespace, so U+3000 stands in for it.
                    tokens.insert(0, '\u3000')
                token, i = tokens
                lexicon[token] = int(i)
        return lexicon

    # Training-time options; the handle is closed instead of being leaked.
    with codecs.open(os.path.join(args.model, 'config.json'), 'r', encoding='utf-8') as fin:
        args2 = dict2namedtuple(json.load(fin))

    with open(args2.config_path, 'r') as fin:
        config = json.load(fin)

    if config['token_embedder']['char_dim'] > 0:
        char_lexicon = read_lexicon('char.dic')
        char_emb_layer = EmbeddingLayer(config['token_embedder']['char_dim'], char_lexicon,
                                        fix_emb=False)
        logging.info('char embedding size: ' + str(len(char_emb_layer.word2id)))
    else:
        char_lexicon = None
        char_emb_layer = None

    # The word lexicon is always needed: its size is the number of output
    # classes handed to Model, even when no word embedding layer is used.
    word_lexicon = read_lexicon('word.dic')

    if config['token_embedder']['word_dim'] > 0:
        word_emb_layer = EmbeddingLayer(config['token_embedder']['word_dim'], word_lexicon,
                                        fix_emb=False, embs=None)
        logging.info('word embedding size: ' + str(len(word_emb_layer.word2id)))
    else:
        word_emb_layer = None

    model = Model(config, word_emb_layer, char_emb_layer, len(word_lexicon), use_cuda)
    if use_cuda:
        model.cuda()

    logging.info(str(model))
    model.load_model(args.model)

    embedder_name = config['token_embedder']['name'].lower()
    if embedder_name == 'cnn':
        test_data = read_corpus(args.input, config['token_embedder']['max_characters_per_token'],
                                max_sent_len=10000)
    elif embedder_name == 'lstm':
        test_data = read_corpus(args.input, max_sent_len=10000)
    else:
        # The original raised ValueError('') with no hint about the cause.
        raise ValueError('Unknown token embedder name: {}'.format(embedder_name))

    # sort/shuffle are disabled so the batches follow the input order.
    test_w, test_c, test_lens, test_masks = create_batches(
        test_data, args.batch_size, word_lexicon, char_lexicon, config,
        sort=False, shuffle=False, use_cuda=use_cuda)

    test_result = eval_model(model, (test_w, test_c, test_lens, test_masks))

    logging.info("test_ppl={:.6f}".format(test_result))
def train():
    """Command-line entry point: train the bidirectional language model.

    Parses ``sys.argv[2:]``, reads the corpora, builds the word and character
    lexicons, writes ``char.dic`` / ``word.dic`` / ``config.json`` into the
    ``--model`` directory and calls ``train_model`` once per epoch. The best
    perplexities are logged at the end.

    Raises:
        ValueError: if the token embedder name is neither ``cnn`` nor
            ``lstm``, or the optimizer name is not recognised.
    """
    cmd = argparse.ArgumentParser(sys.argv[0], conflict_handler='resolve')
    cmd.add_argument('--seed', default=1, type=int, help='The random seed.')
    cmd.add_argument('--gpu', default=-1, type=int, help='Use id of gpu, -1 if cpu.')

    cmd.add_argument('--train_path', required=True, help='The path to the training file.')
    cmd.add_argument('--valid_path', help='The path to the development file.')
    cmd.add_argument('--test_path', help='The path to the testing file.')

    cmd.add_argument('--config_path', required=True, help='the path to the config file.')
    cmd.add_argument("--word_embedding", help="The path to word vectors.")

    cmd.add_argument('--optimizer', default='sgd', choices=['sgd', 'adam', 'adagrad'],
                     help='the type of optimizer: valid options=[sgd, adam, adagrad]')
    cmd.add_argument("--lr", type=float, default=0.01, help='the learning rate.')
    cmd.add_argument("--lr_decay", type=float, default=0, help='the learning rate decay.')

    cmd.add_argument("--model", required=True, help="path to save model")

    cmd.add_argument("--batch_size", "--batch", type=int, default=32, help='the batch size.')
    cmd.add_argument("--max_epoch", type=int, default=100, help='the maximum number of iteration.')

    cmd.add_argument("--clip_grad", type=float, default=5, help='the tense of clipped grad.')

    cmd.add_argument('--max_sent_len', type=int, default=20, help='maximum sentence length.')

    cmd.add_argument('--min_count', type=int, default=5, help='minimum word count.')

    cmd.add_argument('--max_vocab_size', type=int, default=150000, help='maximum vocabulary size.')

    cmd.add_argument('--save_classify_layer', default=False, action='store_true',
                     help="whether to save the classify layer")

    cmd.add_argument('--valid_size', type=int, default=0,
                     help="size of validation dataset when there's no valid.")
    cmd.add_argument('--eval_steps', required=False, type=int, help='report every xx batches.')

    opt = cmd.parse_args(sys.argv[2:])

    with open(opt.config_path, 'r') as fin:
        config = json.load(fin)

    # Dump configurations
    print(opt)
    print(config)

    # set seed.
    torch.manual_seed(opt.seed)
    random.seed(opt.seed)
    if opt.gpu >= 0:
        torch.cuda.set_device(opt.gpu)
        if opt.seed > 0:
            torch.cuda.manual_seed(opt.seed)

    use_cuda = opt.gpu >= 0 and torch.cuda.is_available()

    token_embedder_name = config['token_embedder']['name'].lower()
    token_embedder_max_chars = config['token_embedder'].get('max_characters_per_token', None)

    def load_corpus(path):
        """Read one corpus file with the arguments the token embedder needs."""
        if token_embedder_name == 'cnn':
            return read_corpus(path, token_embedder_max_chars, opt.max_sent_len)
        if token_embedder_name == 'lstm':
            # Pass the limit by keyword: the second positional parameter of
            # read_corpus is the per-token character limit (see the 'cnn'
            # call), so the former positional call put the sentence-length
            # limit into the wrong slot.
            return read_corpus(path, max_sent_len=opt.max_sent_len)
        raise ValueError('Unknown token embedder name: {}'.format(token_embedder_name))

    def count_tokens(data):
        """Number of tokens reported in the logs (one less per sentence)."""
        return sum([len(s) - 1 for s in data])

    train_data = load_corpus(opt.train_path)
    logging.info('training instance: {}, training tokens: {}.'.format(
        len(train_data), count_tokens(train_data)))

    if opt.valid_path is not None:
        valid_data = load_corpus(opt.valid_path)
        logging.info('valid instance: {}, valid tokens: {}.'.format(
            len(valid_data), count_tokens(valid_data)))
    elif opt.valid_size > 0:
        # No development file: carve the validation set out of the training data.
        train_data, valid_data = divide(train_data, opt.valid_size)
        logging.info('training instance: {}, training tokens after division: {}.'.format(
            len(train_data), count_tokens(train_data)))
        logging.info('valid instance: {}, valid tokens: {}.'.format(
            len(valid_data), count_tokens(valid_data)))
    else:
        valid_data = None

    if opt.test_path is not None:
        test_data = load_corpus(opt.test_path)
        logging.info('testing instance: {}, testing tokens: {}.'.format(
            len(test_data), count_tokens(test_data)))
    else:
        test_data = None

    if opt.word_embedding is not None:
        embs = load_embedding(opt.word_embedding)
        word_lexicon = {word: i for i, word in enumerate(embs[0])}
    else:
        embs = None
        word_lexicon = {}

    # Maintain the vocabulary. vocabulary is used in either WordEmbeddingInput
    # or softmax classification
    vocab = get_truncated_vocab(train_data, opt.min_count)

    # Special symbols are registered before the corpus words, so '<oov>' gets
    # index 0 when the lexicon starts empty; with pre-trained embeddings they
    # are appended after the embedding vocabulary instead.
    for special_word in ['<oov>', '<bos>', '<eos>', '<pad>']:
        if special_word not in word_lexicon:
            word_lexicon[special_word] = len(word_lexicon)

    for word, _ in vocab:
        if word not in word_lexicon:
            word_lexicon[word] = len(word_lexicon)

    # Word Embedding
    if config['token_embedder']['word_dim'] > 0:
        word_emb_layer = EmbeddingLayer(config['token_embedder']['word_dim'], word_lexicon,
                                        fix_emb=False, embs=embs)
        logging.info('Word embedding size: {0}'.format(len(word_emb_layer.word2id)))
    else:
        word_emb_layer = None
        logging.info('Vocabulary size: {0}'.format(len(word_lexicon)))

    # Character Lexicon
    if config['token_embedder']['char_dim'] > 0:
        char_lexicon = {}
        for sentence in train_data:
            for word in sentence:
                for ch in word:
                    if ch not in char_lexicon:
                        char_lexicon[ch] = len(char_lexicon)

        for special_char in ['<bos>', '<eos>', '<oov>', '<pad>', '<bow>', '<eow>']:
            if special_char not in char_lexicon:
                char_lexicon[special_char] = len(char_lexicon)

        char_emb_layer = EmbeddingLayer(config['token_embedder']['char_dim'], char_lexicon,
                                        fix_emb=False)
        logging.info('Char embedding size: {0}'.format(len(char_emb_layer.word2id)))
    else:
        char_lexicon = None
        char_emb_layer = None

    train_batches = create_batches(
        train_data, opt.batch_size, word_lexicon, char_lexicon, config, use_cuda=use_cuda)

    # Default: evaluate once per pass over the training batches.
    if opt.eval_steps is None:
        opt.eval_steps = len(train_batches[0])
    logging.info('Evaluate every {0} batches.'.format(opt.eval_steps))

    if valid_data is not None:
        valid_batches = create_batches(
            valid_data, opt.batch_size, word_lexicon, char_lexicon, config,
            sort=False, shuffle=False, use_cuda=use_cuda)
    else:
        valid_batches = None

    if test_data is not None:
        test_batches = create_batches(
            test_data, opt.batch_size, word_lexicon, char_lexicon, config,
            sort=False, shuffle=False, use_cuda=use_cuda)
    else:
        test_batches = None

    # The prediction targets are the words themselves.
    label_to_ix = word_lexicon
    logging.info('vocab size: {0}'.format(len(label_to_ix)))

    nclasses = len(label_to_ix)

    model = Model(config, word_emb_layer, char_emb_layer, nclasses, use_cuda)
    logging.info(str(model))
    if use_cuda:
        model = model.cuda()

    # Only parameters that require gradients are handed to the optimizer.
    need_grad = lambda x: x.requires_grad
    optimizer_name = opt.optimizer.lower()
    if optimizer_name == 'adam':
        optimizer = optim.Adam(filter(need_grad, model.parameters()), lr=opt.lr)
    elif optimizer_name == 'sgd':
        optimizer = optim.SGD(filter(need_grad, model.parameters()), lr=opt.lr)
    elif optimizer_name == 'adagrad':
        optimizer = optim.Adagrad(filter(need_grad, model.parameters()), lr=opt.lr)
    else:
        raise ValueError('Unknown optimizer {}'.format(opt.optimizer.lower()))

    # Create the model directory; an already existing one is fine.
    try:
        os.makedirs(opt.model)
    except OSError as exception:
        if exception.errno != errno.EEXIST:
            raise

    if config['token_embedder']['char_dim'] > 0:
        with codecs.open(os.path.join(opt.model, 'char.dic'), 'w', encoding='utf-8') as fpo:
            for ch, i in char_emb_layer.word2id.items():
                print('{0}\t{1}'.format(ch, i), file=fpo)

    with codecs.open(os.path.join(opt.model, 'word.dic'), 'w', encoding='utf-8') as fpo:
        for w, i in word_lexicon.items():
            print('{0}\t{1}'.format(w, i), file=fpo)

    # Write the options through a context manager so config.json is flushed
    # and closed before the (long) training loop starts.
    with codecs.open(os.path.join(opt.model, 'config.json'), 'w', encoding='utf-8') as fpo:
        json.dump(vars(opt), fpo)

    best_train = 1e+8
    best_valid = 1e+8
    test_result = 1e+8

    for epoch in range(opt.max_epoch):
        best_train, best_valid, test_result = train_model(
            epoch, opt, model, optimizer, train_batches, valid_batches, test_batches,
            best_train, best_valid, test_result)
        if opt.lr_decay > 0:
            optimizer.param_groups[0]['lr'] *= opt.lr_decay

    if valid_data is None:
        logging.info("best train ppl: {:.6f}.".format(best_train))
    elif test_data is None:
        logging.info("best train ppl: {:.6f}, best valid ppl: {:.6f}.".format(
            best_train, best_valid))
    else:
        logging.info("best train ppl: {:.6f}, best valid ppl: {:.6f}, test ppl: {:.6f}.".format(
            best_train, best_valid, test_result))
def get_model(self):
    """Restore the model stored in ``self.model_dir`` and return it in eval mode.

    Side effects: sets ``self.use_cuda``, ``self.char_lexicon`` and
    ``self.word_lexicon`` (a lexicon is ``None`` when the matching embedding
    dimension in the config is not positive).

    Returns:
        tuple: ``(model, config)`` -- the loaded model and the parsed model
        configuration.

    Raises:
        FileNotFoundError: if the model config cannot be found under any of
            the candidate locations.
    """
    # torch.cuda.set_device(1)
    self.use_cuda = torch.cuda.is_available()

    def read_lexicon(filename):
        """Load a ``token<TAB>index`` dictionary file from the model directory."""
        lexicon = {}
        with codecs.open(os.path.join(self.model_dir, filename), 'r', encoding='utf-8') as fpi:
            for line in fpi:
                tokens = line.strip().split('\t')
                if len(tokens) == 1:
                    # Only the index survived strip(); presumably the token
                    # itself was whitespace, so U+3000 stands in for it.
                    tokens.insert(0, '\u3000')
                token, i = tokens
                lexicon[token] = int(i)
        return lexicon

    # load the model configurations (the handle is closed instead of leaked)
    with codecs.open(os.path.join(self.model_dir, 'config.json'), 'r', encoding='utf-8') as fin:
        args2 = dict2namedtuple(json.load(fin))

    config_path = os.path.join(self.model_dir, args2.config_path)
    print("config_patch##:", config_path)

    # Some of the available models may have the config in the
    # model dir, but the path given in the config directory was an
    # absolute path.
    if not os.path.exists(config_path):
        config_path = os.path.join(self.model_dir, os.path.split(config_path)[1])
        logger.warning("Could not find config. Trying " + config_path)

    # In many cases, such as the publicly available English model,
    # the config is one of the default provided configs in
    # elmoformanylangs/configs
    if not os.path.exists(config_path):
        config_path = os.path.join(
            os.path.split(__file__)[0], "configs", os.path.split(config_path)[1])
        logger.warning("Could not find config. Trying " + config_path)

    if not os.path.exists(config_path):
        raise FileNotFoundError(
            "Could not find the model config in either the model directory "
            "or the default configs. Path in config file: %s" % args2.config_path)

    with open(config_path, 'r') as fin:
        config = json.load(fin)

    # For the model trained with character-based word encoder.
    if config['token_embedder']['char_dim'] > 0:
        self.char_lexicon = read_lexicon('char.dic')
        char_emb_layer = EmbeddingLayer(
            config['token_embedder']['char_dim'], self.char_lexicon, fix_emb=False, embs=None)
        logger.info('char embedding size: ' + str(len(char_emb_layer.word2id)))
    else:
        self.char_lexicon = None
        char_emb_layer = None

    # For the model trained with word form word encoder.
    if config['token_embedder']['word_dim'] > 0:
        self.word_lexicon = read_lexicon('word.dic')
        word_emb_layer = EmbeddingLayer(
            config['token_embedder']['word_dim'], self.word_lexicon, fix_emb=False, embs=None)
        logger.info('word embedding size: ' + str(len(word_emb_layer.word2id)))
    else:
        self.word_lexicon = None
        word_emb_layer = None

    # instantiate the model
    model = Model(config, word_emb_layer, char_emb_layer, self.use_cuda)

    if self.use_cuda:
        model.cuda()

    logger.info(str(model))
    model.load_model(self.model_dir)

    # configure the model to evaluation mode.
    model.eval()
    return model, config
def test():
    """Command-line entry point: tag a test file with a trained labeller.

    Parses ``sys.argv[2:]``, restores the model saved in the ``--model``
    directory (``config.json``, ``word.dic``, ``label.dic``, ``model.pkl``),
    predicts a tag for every token of ``--input`` and prints one tag per
    line, with a blank line after each sentence, to ``--output`` or to
    standard output. Sentences are emitted in the order given by the
    ``order`` value returned from ``create_batches``.
    """
    cmd = argparse.ArgumentParser('The testing components of')
    cmd.add_argument('--gpu', default=-1, type=int, help='use id of gpu, -1 if cpu.')
    cmd.add_argument("--input", help="the path to the test file.")
    cmd.add_argument('--output', help='the path to the output file.')
    # The option used to be registered only as "--models" while the code read
    # ``args.model`` (AttributeError). Listing "--model" first makes argparse
    # store the value as ``args.model``; "--models" stays accepted as an alias.
    cmd.add_argument("--model", "--models", required=True, help="path to save model")
    cmd.add_argument("--lexicon", required=True, help='path to the lexicon (hdf5) file.')

    args = cmd.parse_args(sys.argv[2:])

    if args.gpu >= 0:
        torch.cuda.set_device(args.gpu)

    lexicon = h5py.File(args.lexicon, 'r')
    try:
        # The '#info' dataset stores the embedding dimension and layer count.
        dim, n_layers = lexicon['#info'][0].item(), lexicon['#info'][1].item()
        logging.info('dim: {}'.format(dim))
        logging.info('n_layers: {}'.format(n_layers))

        model_path = args.model

        # Training-time options; the handle is closed instead of being leaked.
        with codecs.open(os.path.join(model_path, 'config.json'), 'r', encoding='utf-8') as fin:
            args2 = dict2namedtuple(json.load(fin))

        word_lexicon = {}
        with codecs.open(os.path.join(model_path, 'word.dic'), 'r', encoding='utf-8') as fpi:
            for line in fpi:
                tokens = line.strip().split('\t')
                if len(tokens) == 1:
                    # Only the index survived strip(); presumably the token
                    # itself was whitespace, so U+3000 stands in for it.
                    tokens.insert(0, '\u3000')
                token, i = tokens
                word_lexicon[token] = int(i)

        word_emb_layer = EmbeddingLayer(args2.word_dim, word_lexicon, fix_emb=False, embs=None)
        # Was ``word_emb_layers[0]`` on a list that is always empty (IndexError).
        logging.info('word embedding size: ' + str(len(word_emb_layer.word2id)))

        label2id, id2label = {}, {}
        with codecs.open(os.path.join(model_path, 'label.dic'), 'r', encoding='utf-8') as fpi:
            for line in fpi:
                token, i = line.strip().split('\t')
                label2id[token] = int(i)
                id2label[int(i)] = token
        logging.info('number of labels: {0}'.format(len(label2id)))

        use_cuda = args.gpu >= 0 and torch.cuda.is_available()

        # Same argument list as the training entry point, which passes
        # ``consider_word_piece`` before ``use_cuda``. The flag comes from the
        # saved options and defaults to False for configs that lack it.
        consider_word_piece = getattr(args2, 'consider_word_piece', False)
        model = Model(args2, word_emb_layer, dim, n_layers, len(label2id),
                      consider_word_piece, use_cuda)
        # Was ``os.path.join(path, ...)`` with ``path`` undefined (NameError).
        # map_location keeps every tensor on the CPU while loading.
        model.load_state_dict(
            torch.load(os.path.join(model_path, 'model.pkl'),
                       map_location=lambda storage, loc: storage))
        if use_cuda:
            model = model.cuda()

        raw_test_data, raw_test_labels = read_corpus(args.input)
        label_to_index(raw_test_labels, label2id, incremental=False)

        test_data, test_embed, test_labels, test_lens, order = create_batches(
            dim, n_layers, raw_test_data, raw_test_labels, lexicon, word_lexicon,
            args2.batch_size, shuffle=False, sort=True, keep_full=True, use_cuda=use_cuda)

        to_file = args.output is not None
        if to_file:
            fpo = codecs.open(args.output, 'w', encoding='utf-8')
        else:
            # The UTF-8 writer emits bytes, so on Python 3 it must wrap the
            # binary buffer of stdout rather than the text stream itself.
            fpo = codecs.getwriter('utf-8')(getattr(sys.stdout, 'buffer', sys.stdout))

        try:
            model.eval()
            tagset = []
            for x, p, y, lens in zip(test_data, test_embed, test_labels, test_lens):
                output, loss = model.forward(x, p, y)
                output_data = output.data
                for bid in range(len(x)):
                    # Keep only the first lens[bid] predictions of the row.
                    tags = [id2label[int(output_data[bid][k])] for k in range(lens[bid])]
                    tagset.append(tags)

            for l in order:
                for tag in tagset[l]:
                    print(tag, file=fpo)
                print(file=fpo)
        finally:
            # Close a real output file, but only flush standard output.
            if to_file:
                fpo.close()
            else:
                fpo.flush()
    finally:
        lexicon.close()
def train():
    """Command-line entry point: train the sequence labeller.

    Parses ``sys.argv[2:]``, reads the training/validation/(optional) test
    corpora, builds the label and word lexicons, writes ``word.dic``,
    ``label.dic`` and ``config.json`` into the ``--model`` directory and
    calls ``train_model`` once per epoch. The best validation accuracy and
    the matching test accuracy are logged at the end.
    """
    cmd = argparse.ArgumentParser(sys.argv[0], conflict_handler='resolve')
    cmd.add_argument('--seed', default=1, type=int, help='the random seed.')
    cmd.add_argument('--gpu', default=-1, type=int, help='use id of gpu, -1 if cpu.')
    cmd.add_argument('--encoder', default='gal_lstm', choices=['lstm', 'gal_lstm'],
                     help='the type of encoder: valid options=[lstm]')
    cmd.add_argument('--optimizer', default='sgd', choices=['sgd', 'adam'],
                     help='the type of optimizer: valid options=[sgd, adam]')
    cmd.add_argument('--train_path', required=True, help='the path to the training file.')
    cmd.add_argument('--valid_path', required=True, help='the path to the validation file.')
    cmd.add_argument('--test_path', required=False, help='the path to the testing file.')
    cmd.add_argument('--lexicon', required=True, help='the path to the hdf5 file.')

    cmd.add_argument('--gold_valid_path', type=str, help='the path to the validation file.')
    cmd.add_argument('--gold_test_path', type=str, help='the path to the testing file.')

    cmd.add_argument("--model", required=True, help="path to save model")
    cmd.add_argument("--batch_size", "--batch", type=int, default=32, help='the batch size.')
    cmd.add_argument("--hidden_dim", "--hidden", type=int, default=128,
                     help='the hidden dimension.')
    cmd.add_argument("--max_epoch", type=int, default=100, help='the maximum number of iteration.')
    cmd.add_argument("--word_dim", type=int, default=128, help='the input dimension.')
    cmd.add_argument("--dropout", type=float, default=0.0, help='the dropout rate')
    cmd.add_argument("--depth", type=int, default=2, help='the depth of lstm')

    cmd.add_argument("--word_cut", type=int, default=5,
                     help='remove the words that is less frequent than')
    cmd.add_argument("--eval_steps", type=int, help='eval every x batches')
    cmd.add_argument("--l2", type=float, default=0.00001, help='the l2 decay rate.')
    cmd.add_argument("--lr", type=float, default=0.01, help='the learning rate.')
    cmd.add_argument("--lr_decay", type=float, default=0, help='the learning rate decay.')
    cmd.add_argument("--clip_grad", type=float, default=1, help='the tense of clipped grad.')
    cmd.add_argument("--consider_word_piece", default=False, action='store_true',
                     help='use word piece.')
    cmd.add_argument('--output', help='The path to the output file.')
    cmd.add_argument("--script", required=True, help="The path to the evaluation script")

    opt = cmd.parse_args(sys.argv[2:])

    print(opt)
    torch.manual_seed(opt.seed)
    random.seed(opt.seed)
    if opt.gpu >= 0:
        torch.cuda.set_device(opt.gpu)
        if opt.seed > 0:
            torch.cuda.manual_seed(opt.seed)

    # The gold files default to the files that are being tagged.
    if opt.gold_valid_path is None:
        opt.gold_valid_path = opt.valid_path

    if opt.gold_test_path is None and opt.test_path is not None:
        opt.gold_test_path = opt.test_path

    use_cuda = opt.gpu >= 0 and torch.cuda.is_available()

    lexicon = h5py.File(opt.lexicon, 'r')
    try:
        # The '#info' dataset stores the embedding dimension and layer count.
        dim, n_layers = lexicon['#info'][0].item(), lexicon['#info'][1].item()
        logging.info('dim: {}'.format(dim))
        logging.info('n_layers: {}'.format(n_layers))

        raw_training_data, raw_training_labels = read_corpus(opt.train_path)
        raw_valid_data, raw_valid_labels = read_corpus(opt.valid_path)
        if opt.test_path is not None:
            raw_test_data, raw_test_labels = read_corpus(opt.test_path)
        else:
            raw_test_data, raw_test_labels = [], []

        logging.info('training instance: {}, validation instance: {}, test instance: {}.'.format(
            len(raw_training_labels), len(raw_valid_labels), len(raw_test_labels)))
        logging.info('training tokens: {}, validation tokens: {}, test tokens: {}.'.format(
            sum([len(seq) for seq in raw_training_labels]),
            sum([len(seq) for seq in raw_valid_labels]),
            sum([len(seq) for seq in raw_test_labels])))

        if not opt.consider_word_piece:
            label_to_ix = {'<pad>': 0}
        else:
            label_to_ix = {'<pad>': 0, '-word-piece-': 1}

        # Only the training labels are registered without incremental=False.
        label_to_index(raw_training_labels, label_to_ix)
        label_to_index(raw_valid_labels, label_to_ix, incremental=False)
        label_to_index(raw_test_labels, label_to_ix, incremental=False)
        logging.info('number of tags: {0}'.format(len(label_to_ix)))

        word_count = collections.Counter()
        for x in raw_training_data:
            for w in x:
                word_count[w] += 1

        # Keep the words seen at least --word_cut times in the training data.
        word_lexicon = {}
        for w in word_count:
            if word_count[w] >= opt.word_cut:
                word_lexicon[w] = len(word_lexicon)

        for special_word in ['<oov>', '<pad>']:
            if special_word not in word_lexicon:
                word_lexicon[special_word] = len(word_lexicon)
        logging.info('training vocab size: {}'.format(len(word_lexicon)))

        word_emb_layer = EmbeddingLayer(opt.word_dim, word_lexicon, fix_emb=False, embs=None)
        logging.info('Word embedding size: {0}'.format(len(word_emb_layer.word2id)))

        n_classes = len(label_to_ix)
        ix2label = {ix: label for label, ix in label_to_ix.items()}

        word2id = word_emb_layer.word2id

        training_payload = create_batches(dim, n_layers, raw_training_data, raw_training_labels,
                                          lexicon, word2id, opt.batch_size, use_cuda=use_cuda)

        if opt.eval_steps is None or opt.eval_steps > len(raw_training_data):
            opt.eval_steps = len(training_payload[0])

        valid_payload = create_batches(dim, n_layers, raw_valid_data, raw_valid_labels,
                                       lexicon, word2id, opt.batch_size,
                                       shuffle=False, sort=True, keep_full=True,
                                       use_cuda=use_cuda)

        if opt.test_path is not None:
            test_payload = create_batches(dim, n_layers, raw_test_data, raw_test_labels,
                                          lexicon, word2id, opt.batch_size,
                                          shuffle=False, sort=True, keep_full=True,
                                          use_cuda=use_cuda)
        else:
            test_payload = None

        model = Model(opt, word_emb_layer, dim, n_layers, n_classes,
                      opt.consider_word_piece, use_cuda)

        logging.info(str(model))
        if use_cuda:
            model = model.cuda()

        # Only parameters that require gradients are handed to the optimizer.
        need_grad = lambda x: x.requires_grad
        if opt.optimizer.lower() == 'adam':
            optimizer = torch.optim.Adam(filter(need_grad, model.parameters()), lr=opt.lr)
        else:
            optimizer = torch.optim.SGD(filter(need_grad, model.parameters()), lr=opt.lr)

        # Create the model directory; an already existing one is fine.
        try:
            os.makedirs(opt.model)
        except OSError as exception:
            if exception.errno != errno.EEXIST:
                raise

        with codecs.open(os.path.join(opt.model, 'word.dic'), 'w', encoding='utf-8') as fpo:
            for w, i in word_emb_layer.word2id.items():
                print('{0}\t{1}'.format(w, i), file=fpo)

        with codecs.open(os.path.join(opt.model, 'label.dic'), 'w', encoding='utf-8') as fpo:
            for label, i in label_to_ix.items():
                print('{0}\t{1}'.format(label, i), file=fpo)

        # Write the options through a context manager so config.json is
        # flushed and closed before the (long) training loop starts.
        with codecs.open(os.path.join(opt.model, 'config.json'), 'w', encoding='utf-8') as fpo:
            json.dump(vars(opt), fpo)

        best_valid, test_result = -1e8, -1e8
        for epoch in range(opt.max_epoch):
            best_valid, test_result = train_model(epoch, model, optimizer,
                                                  training_payload, valid_payload, test_payload,
                                                  ix2label, best_valid, test_result)
            if opt.lr_decay > 0:
                optimizer.param_groups[0]['lr'] *= opt.lr_decay

            # NOTE(review): reported once per epoch, which is what the
            # ``epoch + 1`` divisor suggests -- confirm against the original
            # layout. Inside the loop ``epoch`` is always bound, so a
            # non-positive --max_epoch cannot raise a NameError here.
            logging.info('Total encoder time: {:.2f}s'.format(model.eval_time / (epoch + 1)))
            logging.info('Total embedding time: {:.2f}s'.format(model.emb_time / (epoch + 1)))
            logging.info('Total classify time: {:.2f}s'.format(
                model.classify_time / (epoch + 1)))

        weights = model.weights
        if use_cuda:
            weights = weights.cpu()
        logging.info("weights: {}".format(weights.data.numpy()))

        logging.info("best_valid_acc: {:.6f}".format(best_valid))
        logging.info("test_acc: {:.6f}".format(test_result))
    finally:
        # The lexicon stays open for the whole run because the batches are
        # built from it; release the HDF5 handle on the way out.
        lexicon.close()