Example #1
def test_main():
  # Configurations
  cmd = argparse.ArgumentParser('The testing component of the model.')
  cmd.add_argument('--gpu', default=-1, type=int, help='use id of gpu, -1 if cpu.')
  cmd.add_argument('--input_format', default='plain', choices=('plain', 'conll', 'conll_char', 'conll_char_vi'),
                   help='the input format.')
  cmd.add_argument("--input", help="the path to the raw text file.")
  cmd.add_argument("--output_format", default='hdf5', help='the output format. Supported format includes (hdf5, txt).'
                                                           ' Use comma to separate the format identifiers,'
                                                           ' like \'--output_format=hdf5,plain\'')
  cmd.add_argument("--output_prefix", help='the prefix of the output file. The output file is in the format of '
                                           '<output_prefix>.<output_layer>.<output_format>')
  cmd.add_argument("--output_layer", help='the target layer to output. 0 for the word encoder, 1 for the first LSTM '
                                          'hidden layer, 2 for the second LSTM hidden layer, -1 for an average'
                                          'of 3 layers.')
  cmd.add_argument("--model", required=True, help="path to save model")
  cmd.add_argument("--batch_size", "--batch", type=int, default=1, help='the batch size.')
  args = cmd.parse_args(sys.argv[2:])

  if args.gpu >= 0:
    torch.cuda.set_device(args.gpu)
  use_cuda = args.gpu >= 0 and torch.cuda.is_available()
  # load the model configurations
  args2 = dict2namedtuple(json.load(codecs.open(os.path.join(args.model, 'config.json'), 'r', encoding='utf-8')))

  with open(args2.config_path, 'r') as fin:
    config = json.load(fin)

  # For the model trained with character-based word encoder.
  if config['token_embedder']['char_dim'] > 0:
    char_lexicon = {}
    with codecs.open(os.path.join(args.model, 'char.dic'), 'r', encoding='utf-8') as fpi:
      for line in fpi:
        tokens = line.strip().split('\t')
        if len(tokens) == 1:
          tokens.insert(0, '\u3000')
        token, i = tokens
        char_lexicon[token] = int(i)
    char_emb_layer = EmbeddingLayer(config['token_embedder']['char_dim'], char_lexicon, fix_emb=False, embs=None)
    logging.info('char embedding size: ' + str(len(char_emb_layer.word2id)))
  else:
    char_lexicon = None
    char_emb_layer = None

  # For the model trained with word form word encoder.
  if config['token_embedder']['word_dim'] > 0:
    word_lexicon = {}
    with codecs.open(os.path.join(args.model, 'word.dic'), 'r', encoding='utf-8') as fpi:
      for line in fpi:
        tokens = line.strip().split('\t')
        if len(tokens) == 1:
          tokens.insert(0, '\u3000')
        token, i = tokens
        word_lexicon[token] = int(i)
    word_emb_layer = EmbeddingLayer(config['token_embedder']['word_dim'], word_lexicon, fix_emb=False, embs=None)
    logging.info('word embedding size: ' + str(len(word_emb_layer.word2id)))
  else:
    word_lexicon = None
    word_emb_layer = None

  # instantiate the model
  model = Model(config, word_emb_layer, char_emb_layer, use_cuda)

  if use_cuda:
    model.cuda()

  logging.info(str(model))
  model.load_model(args.model)

  # read test data according to input format
  read_function = read_corpus if args.input_format == 'plain' else (
    read_conll_corpus if args.input_format == 'conll' else (
      read_conll_char_corpus if args.input_format == 'conll_char' else read_conll_char_vi_corpus))

  if config['token_embedder']['name'].lower() == 'cnn':
    test, text = read_function(args.input, config['token_embedder']['max_characters_per_token'])
  else:
    test, text = read_function(args.input)

  # create test batches from the input data.
  test_w, test_c, test_lens, test_masks, test_text = create_batches(
    test, args.batch_size, word_lexicon, char_lexicon, config, use_cuda=use_cuda, text=text)

  # configure the model to evaluation mode.
  model.eval()

  sent_set = set()
  cnt = 0

  output_formats = args.output_format.split(',')
  # use a list, not map(): the layers are iterated once per output format below,
  # and a map iterator would be exhausted after the first pass
  output_layers = [int(x) for x in args.output_layer.split(',')]

  handlers = {}
  for output_format in output_formats:
    if output_format not in ('hdf5', 'txt'):
      print('Unknown output_format: {0}'.format(output_format))
      continue
    for output_layer in output_layers:
      filename = '{0}.ly{1}.{2}'.format(args.output_prefix, output_layer, output_format)
      handlers[output_format, output_layer] = \
        h5py.File(filename, 'w') if output_format == 'hdf5' else open(filename, 'w')

  for w, c, lens, masks, texts in zip(test_w, test_c, test_lens, test_masks, test_text):
    output = model.forward(w, c, masks)
    for i, text in enumerate(texts):
      sent = '\t'.join(text)
      # escape characters that h5py treats specially in dataset names ('/' opens a group)
      sent = sent.replace('.', '$period$')
      sent = sent.replace('/', '$backslash$')
      if sent in sent_set:
        continue
      sent_set.add(sent)  # the sentence text, tab-separated
      if config['encoder']['name'].lower() == 'lstm':
        data = output[i, 1:lens[i]-1, :].data
        if use_cuda:
          data = data.cpu()
        data = data.numpy()
      elif config['encoder']['name'].lower() == 'elmo':
        data = output[:, i, 1:lens[i]-1, :].data
        if use_cuda:
          data = data.cpu()
        data = data.numpy()
      else:
        raise ValueError('Unknown encoder name: {0}'.format(config['encoder']['name']))

      for (output_format, output_layer) in handlers:
        fout = handlers[output_format, output_layer]
        if output_layer == -1:
          payload = np.average(data, axis=0)
        else:
          payload = data[output_layer]
        if output_format == 'hdf5':
          fout.create_dataset(sent, payload.shape, dtype='float32', data=payload)
        else:
          for word, row in zip(text, payload):
            # word: the current token in the sentence; row: its 1024-dim vector
            print('{0}\t{1}'.format(word, '\t'.join(['{0:.8f}'.format(elem) for elem in row])), file=fout)
          print('', file=fout)

      cnt += 1
      if cnt % 1000 == 0:
        logging.info('Finished {0} sentences.'.format(cnt))
  for _, handler in handlers.items():
    handler.close()
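
The HDF5 files written above use the tab-joined, escaped sentence as the dataset key. A minimal read-back sketch (the file name is hypothetical, matching --output_prefix=output --output_layer=-1 --output_format=hdf5):

import h5py

with h5py.File('output.ly-1.hdf5', 'r') as fin:
  for key in fin.keys():
    # undo the escaping applied before create_dataset above
    sentence = key.replace('$period$', '.').replace('$backslash$', '/')
    words = sentence.split('\t')
    vectors = fin[key][()]  # shape: (len(words), hidden_dim)
    print(len(words), vectors.shape)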
Example #2
def test_main():
    cmd = argparse.ArgumentParser('The testing component of the model.')
    cmd.add_argument('--gpu',
                     default=-1,
                     type=int,
                     help='use id of gpu, -1 if cpu.')
    cmd.add_argument('--input_format',
                     default='plain',
                     choices=('plain', 'conll', 'conll_char', 'conll_char_vi'),
                     help='the input format.')
    cmd.add_argument("--input", help="the path to the raw text file.")
    cmd.add_argument('--output_ave',
                     help='the path to the average embedding file.')
    cmd.add_argument('--output_lstm',
                     help='the path to the 1st lstm-output embedding file.')

    cmd.add_argument("--model", required=True, help="path to save model")
    cmd.add_argument("--batch_size",
                     "--batch",
                     type=int,
                     default=1,
                     help='the batch size.')

    args = cmd.parse_args(sys.argv[2:])

    if args.gpu >= 0:
        torch.cuda.set_device(args.gpu)
    use_cuda = args.gpu >= 0 and torch.cuda.is_available()

    args2 = dict2namedtuple(
        json.load(
            codecs.open(os.path.join(args.model, 'config.json'),
                        'r',
                        encoding='utf-8')))

    with open(args2.config_path, 'r') as fin:
        config = json.load(fin)

    if config['token_embedder']['char_dim'] > 0:
        char_lexicon = {}
        with codecs.open(os.path.join(args.model, 'char.dic'),
                         'r',
                         encoding='utf-8') as fpi:
            for line in fpi:
                tokens = line.strip().split('\t')
                if len(tokens) == 1:
                    tokens.insert(0, '\u3000')
                token, i = tokens
                char_lexicon[token] = int(i)
        char_emb_layer = EmbeddingLayer(config['token_embedder']['char_dim'],
                                        char_lexicon,
                                        fix_emb=False,
                                        embs=None)
        logging.info('char embedding size: ' +
                     str(len(char_emb_layer.word2id)))
    else:
        char_lexicon = None
        char_emb_layer = None

    if config['token_embedder']['word_dim'] > 0:
        word_lexicon = {}
        with codecs.open(os.path.join(args.model, 'word.dic'),
                         'r',
                         encoding='utf-8') as fpi:
            for line in fpi:
                tokens = line.strip().split('\t')
                if len(tokens) == 1:
                    tokens.insert(0, '\u3000')
                token, i = tokens
                word_lexicon[token] = int(i)
        word_emb_layer = EmbeddingLayer(config['token_embedder']['word_dim'],
                                        word_lexicon,
                                        fix_emb=False,
                                        embs=None)
        logging.info('word embedding size: ' +
                     str(len(word_emb_layer.word2id)))
    else:
        word_lexicon = None
        word_emb_layer = None

    model = Model(config, word_emb_layer, char_emb_layer, use_cuda)

    if use_cuda:
        model.cuda()

    logging.info(str(model))
    model.load_model(args.model)
    if config['token_embedder']['name'].lower() == 'cnn':
        if args.input_format == 'plain':
            test, text = read_corpus(
                args.input,
                config['token_embedder']['max_characters_per_token'])
        elif args.input_format == 'conll':
            test, text = read_conll_corpus(
                args.input,
                config['token_embedder']['max_characters_per_token'])
        elif args.input_format == 'conll_char':
            test, text = read_conll_char_corpus(
                args.input,
                config['token_embedder']['max_characters_per_token'])
        else:
            test, text = read_conll_char_vi_corpus(
                args.input,
                config['token_embedder']['max_characters_per_token'])
    elif config['token_embedder']['name'].lower() == 'lstm':
        if args.input_format == 'plain':
            test, text = read_corpus(args.input)
        elif args.input_format == 'conll':
            test, text = read_conll_corpus(args.input)
        elif args.input_format == 'conll_char':
            test, text = read_conll_char_corpus(args.input)
        else:
            test, text = read_conll_char_vi_corpus(args.input)
    else:
        raise ValueError('Unknown token embedder name: {0}'.format(
            config['token_embedder']['name']))

    test_w, test_c, test_lens, test_masks, test_text = create_batches(
        test,
        args.batch_size,
        word_lexicon,
        char_lexicon,
        config,
        use_cuda=use_cuda,
        text=text)

    # print the length of the longest test sentence, in tokens
    print(max([len(x) for x in test]))

    model.eval()

    sent_set = set()

    cnt = 0

    fout_ave = h5py.File(args.output_ave,
                         'w') if args.output_ave is not None else None
    fout_lstm = h5py.File(args.output_lstm,
                          'w') if args.output_lstm is not None else None

    for w, c, lens, masks, texts in zip(test_w, test_c, test_lens, test_masks,
                                        test_text):
        output = model.forward(w, c, masks)
        for i, text in enumerate(texts):
            sent = '\t'.join(text)
            sent = sent.replace('.', '$period$')
            sent = sent.replace('/', '$backslash$')
            if sent in sent_set:
                continue
            sent_set.add(sent)
            if config['encoder']['name'].lower() == 'lstm':
                data = output[i, 1:lens[i] - 1, :].data
                if use_cuda:
                    data = data.cpu()
                data = data.numpy()
            elif config['encoder']['name'].lower() == 'elmo':
                data = output[:, i, 1:lens[i] - 1, :].data
                if use_cuda:
                    data = data.cpu()
                data = data.numpy()
            else:
                raise ValueError('Unknown encoder name: {0}'.format(
                    config['encoder']['name']))
            if fout_ave is not None:
                data_ave = np.average(data, axis=0)
                fout_ave.create_dataset(sent,
                                        data_ave.shape,
                                        dtype='float32',
                                        data=data_ave)
            if fout_lstm is not None:
                # layer index 1: the first LSTM layer (index 0 is the token encoder)
                data_lstm = data[1]
                fout_lstm.create_dataset(sent,
                                         data_lstm.shape,
                                         dtype='float32',
                                         data=data_lstm)
            cnt += 1
            if cnt % 1000 == 0:
                logging.info('Finished {0} sentences.'.format(cnt))
    if fout_ave is not None:
        fout_ave.close()
    if fout_lstm is not None:
        fout_lstm.close()
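
Both examples call a dict2namedtuple helper that is not shown. A minimal sketch of what it is assumed to do, exposing the keys of the loaded config.json as attributes (args2.config_path); the key/value below are illustrative:

import collections

def dict2namedtuple(dic):
    # assumed helper: wrap a dict so its keys read as attributes
    return collections.namedtuple('Namespace', dic.keys())(**dic)

ns = dict2namedtuple({'config_path': 'configs/elmo.json'})
print(ns.config_path)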
Example #3
def test():
  cmd = argparse.ArgumentParser('The testing component of the model.')
  cmd.add_argument('--gpu', default=-1, type=int, help='use id of gpu, -1 if cpu.')
  cmd.add_argument("--input", help="the path to the raw text file.")
  cmd.add_argument("--model", required=True, help="path to save model")
  cmd.add_argument("--batch_size", "--batch", type=int, default=1, help='the batch size.')
  args = cmd.parse_args(sys.argv[2:])

  # if args.gpu >= 0:
  #   torch.cuda.set_device(args.gpu)
  use_cuda = args.gpu >= 0 and torch.cuda.is_available()
  
  args2 = dict2namedtuple(json.load(codecs.open(os.path.join(args.model, 'config.json'), 'r', encoding='utf-8')))

  with open(args2.config_path, 'r') as fin:
    config = json.load(fin)

  if config['token_embedder']['char_dim'] > 0:
    char_lexicon = {}
    with codecs.open(os.path.join(args.model, 'char.dic'), 'r', encoding='utf-8') as fpi:
      for line in fpi:
        tokens = line.strip().split('\t')
        if len(tokens) == 1:
          tokens.insert(0, '\u3000')
        token, i = tokens
        char_lexicon[token] = int(i)
    char_emb_layer = EmbeddingLayer(config['token_embedder']['char_dim'], char_lexicon, fix_emb=False)
    logging.info('char embedding size: ' + str(len(char_emb_layer.word2id)))
  else:
    char_lexicon = None
    char_emb_layer = None

  word_lexicon = {}
  with codecs.open(os.path.join(args.model, 'word.dic'), 'r', encoding='utf-8') as fpi:
    for line in fpi:
      tokens = line.strip().split('\t')
      if len(tokens) == 1:
        tokens.insert(0, '\u3000')
      token, i = tokens
      word_lexicon[token] = int(i)

  if config['token_embedder']['word_dim'] > 0:
    word_emb_layer = EmbeddingLayer(config['token_embedder']['word_dim'], word_lexicon, fix_emb=False, embs=None)
    logging.info('word embedding size: ' + str(len(word_emb_layer.word2id)))
  else:
    word_emb_layer = None
  
  model = Model(config, word_emb_layer, char_emb_layer, len(word_lexicon), use_cuda)

  if use_cuda:
    model.cuda()

  logging.info(str(model))
  model.load_model(args.model)
  if config['token_embedder']['name'].lower() == 'cnn':
    test = read_corpus(args.input, config['token_embedder']['max_characters_per_token'], max_sent_len=10000)
  elif config['token_embedder']['name'].lower() == 'lstm':
    test = read_corpus(args.input, max_sent_len=10000)
  else:
    raise ValueError('Unknown token embedder name: {0}'.format(config['token_embedder']['name']))

  test_w, test_c, test_lens, test_masks = create_batches(
    test, args.batch_size, word_lexicon, char_lexicon, config, sort=False, shuffle=False, use_cuda=use_cuda)

  test_result = eval_model(model, (test_w, test_c, test_lens, test_masks))

  logging.info("test_ppl={:.6f}".format(test_result))
Example #4
def train():
  cmd = argparse.ArgumentParser(sys.argv[0], conflict_handler='resolve')
  cmd.add_argument('--seed', default=1, type=int, help='The random seed.')
  cmd.add_argument('--gpu', default=-1, type=int, help='Use id of gpu, -1 if cpu.')

  cmd.add_argument('--train_path', required=True, help='The path to the training file.')
  cmd.add_argument('--valid_path', help='The path to the development file.')
  cmd.add_argument('--test_path', help='The path to the testing file.')

  cmd.add_argument('--config_path', required=True, help='the path to the config file.')
  cmd.add_argument("--word_embedding", help="The path to word vectors.")

  cmd.add_argument('--optimizer', default='sgd', choices=['sgd', 'adam', 'adagrad'],
                   help='the type of optimizer: valid options=[sgd, adam, adagrad]')
  cmd.add_argument("--lr", type=float, default=0.01, help='the learning rate.')
  cmd.add_argument("--lr_decay", type=float, default=0, help='the learning rate decay.')

  cmd.add_argument("--model", required=True, help="path to save model")
  
  cmd.add_argument("--batch_size", "--batch", type=int, default=32, help='the batch size.')
  cmd.add_argument("--max_epoch", type=int, default=100, help='the maximum number of iteration.')
  
  cmd.add_argument("--clip_grad", type=float, default=5, help='the tense of clipped grad.')

  cmd.add_argument('--max_sent_len', type=int, default=20, help='maximum sentence length.')

  cmd.add_argument('--min_count', type=int, default=5, help='minimum word count.')

  cmd.add_argument('--max_vocab_size', type=int, default=150000, help='maximum vocabulary size.')

  cmd.add_argument('--save_classify_layer', default=False, action='store_true',
                   help="whether to save the classify layer")

  cmd.add_argument('--valid_size', type=int, default=0,
                   help='number of training sentences held out for validation when no validation file is given.')
  cmd.add_argument('--eval_steps', required=False, type=int, help='evaluate every N batches.')

  opt = cmd.parse_args(sys.argv[2:])

  with open(opt.config_path, 'r') as fin:
    config = json.load(fin)

  # Dump configurations
  print(opt)
  print(config)

  # set seed.
  torch.manual_seed(opt.seed)
  random.seed(opt.seed)
  if opt.gpu >= 0:
    torch.cuda.set_device(opt.gpu)
    if opt.seed > 0:
      torch.cuda.manual_seed(opt.seed)

  use_cuda = opt.gpu >= 0 and torch.cuda.is_available()

  token_embedder_name = config['token_embedder']['name'].lower()
  token_embedder_max_chars = config['token_embedder'].get('max_characters_per_token', None)
  if token_embedder_name == 'cnn':
    train_data = read_corpus(opt.train_path, token_embedder_max_chars, opt.max_sent_len)
  elif token_embedder_name == 'lstm':
    train_data = read_corpus(opt.train_path, opt.max_sent_len)
  else:
    raise ValueError('Unknown token embedder name: {}'.format(token_embedder_name))

  logging.info('training instance: {}, training tokens: {}.'.format(len(train_data),
                                                                    sum([len(s) - 1 for s in train_data])))

  if opt.valid_path is not None:
    if token_embedder_name == 'cnn':
      valid_data = read_corpus(opt.valid_path, token_embedder_max_chars, opt.max_sent_len)
    elif token_embedder_name == 'lstm':
      valid_data = read_corpus(opt.valid_path, opt.max_sent_len)
    else:
      raise ValueError('Unknown token embedder name: {}'.format(token_embedder_name))
    logging.info('valid instance: {}, valid tokens: {}.'.format(len(valid_data),
                                                                sum([len(s) - 1 for s in valid_data])))
  elif opt.valid_size > 0:
    train_data, valid_data = divide(train_data, opt.valid_size)
    logging.info('training instance: {}, training tokens after division: {}.'.format(
      len(train_data), sum([len(s) - 1 for s in train_data])))
    logging.info('valid instance: {}, valid tokens: {}.'.format(
      len(valid_data), sum([len(s) - 1 for s in valid_data])))
  else:
    valid_data = None

  if opt.test_path is not None:
    if token_embedder_name == 'cnn':
      test_data = read_corpus(opt.test_path, token_embedder_max_chars, opt.max_sent_len)
    elif token_embedder_name == 'lstm':
      test_data = read_corpus(opt.test_path, opt.max_sent_len)
    else:
      raise ValueError('Unknown token embedder name: {}'.format(token_embedder_name))
    logging.info('testing instance: {}, testing tokens: {}.'.format(
      len(test_data), sum([len(s) - 1 for s in test_data])))
  else:
    test_data = None

  if opt.word_embedding is not None:
    embs = load_embedding(opt.word_embedding)
    word_lexicon = {word: i for i, word in enumerate(embs[0])}  
  else:
    embs = None
    word_lexicon = {}

  # Maintain the vocabulary. vocabulary is used in either WordEmbeddingInput or softmax classification
  vocab = get_truncated_vocab(train_data, opt.min_count)

  # Ensure the special tokens are present; with an empty lexicon, '<oov>' gets index 0.
  for special_word in ['<oov>', '<bos>', '<eos>', '<pad>']:
    if special_word not in word_lexicon:
      word_lexicon[special_word] = len(word_lexicon)

  for word, _ in vocab:
    if word not in word_lexicon:
      word_lexicon[word] = len(word_lexicon)

  # Word Embedding
  if config['token_embedder']['word_dim'] > 0:
    word_emb_layer = EmbeddingLayer(config['token_embedder']['word_dim'], word_lexicon, fix_emb=False, embs=embs)
    logging.info('Word embedding size: {0}'.format(len(word_emb_layer.word2id)))
  else:
    word_emb_layer = None
    logging.info('Vocabulary size: {0}'.format(len(word_lexicon)))

  # Character Lexicon
  if config['token_embedder']['char_dim'] > 0:
    char_lexicon = {}
    for sentence in train_data:
      for word in sentence:
        for ch in word:
          if ch not in char_lexicon:
            char_lexicon[ch] = len(char_lexicon)

    for special_char in ['<bos>', '<eos>', '<oov>', '<pad>', '<bow>', '<eow>']:
      if special_char not in char_lexicon:
        char_lexicon[special_char] = len(char_lexicon)

    char_emb_layer = EmbeddingLayer(config['token_embedder']['char_dim'], char_lexicon, fix_emb=False)
    logging.info('Char embedding size: {0}'.format(len(char_emb_layer.word2id)))
  else:
    char_lexicon = None
    char_emb_layer = None

  train = create_batches(
    train_data, opt.batch_size, word_lexicon, char_lexicon, config, use_cuda=use_cuda)

  if opt.eval_steps is None:
    opt.eval_steps = len(train[0])
  logging.info('Evaluate every {0} batches.'.format(opt.eval_steps))

  if valid_data is not None:
    valid = create_batches(
      valid_data, opt.batch_size, word_lexicon, char_lexicon, config, sort=False, shuffle=False, use_cuda=use_cuda)
  else:
    valid = None

  if test_data is not None:
    test = create_batches(
      test_data, opt.batch_size, word_lexicon, char_lexicon, config, sort=False, shuffle=False, use_cuda=use_cuda)
  else:
    test = None

  label_to_ix = word_lexicon
  logging.info('vocab size: {0}'.format(len(label_to_ix)))
  
  nclasses = len(label_to_ix)

  model = Model(config, word_emb_layer, char_emb_layer, nclasses, use_cuda)
  logging.info(str(model))
  if use_cuda:
    model = model.cuda()

  need_grad = lambda x: x.requires_grad
  if opt.optimizer.lower() == 'adam':
    optimizer = optim.Adam(filter(need_grad, model.parameters()), lr=opt.lr)
  elif opt.optimizer.lower() == 'sgd':
    optimizer = optim.SGD(filter(need_grad, model.parameters()), lr=opt.lr)
  elif opt.optimizer.lower() == 'adagrad':
    optimizer = optim.Adagrad(filter(need_grad, model.parameters()), lr=opt.lr)
  else:
    raise ValueError('Unknown optimizer {}'.format(opt.optimizer.lower()))

  try:
    os.makedirs(opt.model)
  except OSError as exception:
    if exception.errno != errno.EEXIST:
      raise

  if config['token_embedder']['char_dim'] > 0:
    with codecs.open(os.path.join(opt.model, 'char.dic'), 'w', encoding='utf-8') as fpo:
      for ch, i in char_emb_layer.word2id.items():
        print('{0}\t{1}'.format(ch, i), file=fpo)

  with codecs.open(os.path.join(opt.model, 'word.dic'), 'w', encoding='utf-8') as fpo:
    for w, i in word_lexicon.items():
      print('{0}\t{1}'.format(w, i), file=fpo)

  with codecs.open(os.path.join(opt.model, 'config.json'), 'w', encoding='utf-8') as fpo:
    json.dump(vars(opt), fpo)

  best_train = 1e+8
  best_valid = 1e+8
  test_result = 1e+8

  for epoch in range(opt.max_epoch):
    best_train, best_valid, test_result = train_model(epoch, opt, model, optimizer,
                                                      train, valid, test, best_train, best_valid, test_result)
    if opt.lr_decay > 0:
      optimizer.param_groups[0]['lr'] *= opt.lr_decay

  if valid_data is None:
    logging.info("best train ppl: {:.6f}.".format(best_train))
  elif test_data is None:
    logging.info("best train ppl: {:.6f}, best valid ppl: {:.6f}.".format(best_train, best_valid))
  else:
    logging.info("best train ppl: {:.6f}, best valid ppl: {:.6f}, test ppl: {:.6f}.".format(best_train, best_valid, test_result))
Example #5
    def get_model(self):
        # torch.cuda.set_device(1)
        self.use_cuda = torch.cuda.is_available()
        # load the model configurations
        args2 = dict2namedtuple(
            json.load(
                codecs.open(os.path.join(self.model_dir, 'config.json'),
                            'r',
                            encoding='utf-8')))

        config_path = os.path.join(self.model_dir, args2.config_path)
        print("config_patch##:", config_path)
        # Some of the available models may have the config in the
        # model dir, but the path given in the config directory was an
        # absolute path.

        if not os.path.exists(config_path):
            config_path = os.path.join(self.model_dir,
                                       os.path.split(config_path)[1])
            logger.warning("Could not find config.  Trying " + config_path)
        # In many cases, such as the publicly available English model,
        # the config is one of the default provided configs in
        # elmoformanylangs/configs
        if not os.path.exists(config_path):
            config_path = os.path.join(
                os.path.split(__file__)[0], "configs",
                os.path.split(config_path)[1])
            logger.warning("Could not find config.  Trying " + config_path)

        if not os.path.exists(config_path):
            raise FileNotFoundError(
                "Could not find the model config in either the model directory "
                "or the default configs.  Path in config file: %s" %
                args2.config_path)

        with open(config_path, 'r') as fin:
            config = json.load(fin)

        # For the model trained with character-based word encoder.
        if config['token_embedder']['char_dim'] > 0:
            self.char_lexicon = {}
            with codecs.open(os.path.join(self.model_dir, 'char.dic'),
                             'r',
                             encoding='utf-8') as fpi:
                for line in fpi:
                    tokens = line.strip().split('\t')
                    if len(tokens) == 1:
                        tokens.insert(0, '\u3000')
                    token, i = tokens
                    self.char_lexicon[token] = int(i)
            char_emb_layer = EmbeddingLayer(
                config['token_embedder']['char_dim'],
                self.char_lexicon,
                fix_emb=False,
                embs=None)
            logger.info('char embedding size: ' +
                        str(len(char_emb_layer.word2id)))
        else:
            self.char_lexicon = None
            char_emb_layer = None

        # For the model trained with word form word encoder.
        if config['token_embedder']['word_dim'] > 0:
            self.word_lexicon = {}
            with codecs.open(os.path.join(self.model_dir, 'word.dic'),
                             'r',
                             encoding='utf-8') as fpi:
                for line in fpi:
                    tokens = line.strip().split('\t')
                    if len(tokens) == 1:
                        tokens.insert(0, '\u3000')
                    token, i = tokens
                    self.word_lexicon[token] = int(i)
            word_emb_layer = EmbeddingLayer(
                config['token_embedder']['word_dim'],
                self.word_lexicon,
                fix_emb=False,
                embs=None)
            logger.info('word embedding size: ' +
                        str(len(word_emb_layer.word2id)))
        else:
            self.word_lexicon = None
            word_emb_layer = None

        # instantiate the model
        model = Model(config, word_emb_layer, char_emb_layer, self.use_cuda)

        if self.use_cuda:
            model.cuda()

        logger.info(str(model))
        model.load_model(self.model_dir)

        # read test data according to input format

        # configure the model to evaluation mode.
        model.eval()
        return model, config
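
get_model resolves its config in three steps: the path from config.json as given, then that file's basename inside the model directory, then the packaged configs directory. A consolidated sketch of that fallback, with package_dir standing in for os.path.split(__file__)[0]:

import os

def resolve_config_path(model_dir, config_path, package_dir):
    # mirror the three-step search in get_model above
    candidates = [
        os.path.join(model_dir, config_path),
        os.path.join(model_dir, os.path.split(config_path)[1]),
        os.path.join(package_dir, 'configs', os.path.split(config_path)[1]),
    ]
    for path in candidates:
        if os.path.exists(path):
            return path
    raise FileNotFoundError(
        'Could not find the model config in either the model directory '
        'or the default configs: %s' % config_path)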
Example #6
def test():
    cmd = argparse.ArgumentParser('The testing component of the model.')
    cmd.add_argument('--gpu',
                     default=-1,
                     type=int,
                     help='use id of gpu, -1 if cpu.')
    cmd.add_argument("--input", help="the path to the test file.")
    cmd.add_argument('--output', help='the path to the output file.')
    cmd.add_argument("--models", required=True, help="path to save model")
    cmd.add_argument("--lexicon",
                     required=True,
                     help='path to the lexicon (hdf5) file.')

    args = cmd.parse_args(sys.argv[2:])

    if args.gpu >= 0:
        torch.cuda.set_device(args.gpu)

    lexicon = h5py.File(args.lexicon, 'r')
    dim, n_layers = lexicon['#info'][0].item(), lexicon['#info'][1].item()
    logging.info('dim: {}'.format(dim))
    logging.info('n_layers: {}'.format(n_layers))

    model_path = args.model

    args2 = dict2namedtuple(
        json.load(
            codecs.open(os.path.join(model_path, 'config.json'),
                        'r',
                        encoding='utf-8')))

    word_lexicon = {}
    with codecs.open(os.path.join(model_path, 'word.dic'),
                     'r',
                     encoding='utf-8') as fpi:
        for line in fpi:
            tokens = line.strip().split('\t')
            if len(tokens) == 1:
                tokens.insert(0, '\u3000')
            token, i = tokens
            word_lexicon[token] = int(i)

    word_emb_layer = EmbeddingLayer(args2.word_dim,
                                    word_lexicon,
                                    fix_emb=False,
                                    embs=None)

    logging.info('word embedding size: ' +
                 str(len(word_emb_layer.word2id)))

    label2id, id2label = {}, {}
    with codecs.open(os.path.join(model_path, 'label.dic'),
                     'r',
                     encoding='utf-8') as fpi:
        for line in fpi:
            token, i = line.strip().split('\t')
            label2id[token] = int(i)
            id2label[int(i)] = token
    logging.info('number of labels: {0}'.format(len(label2id)))

    use_cuda = args.gpu >= 0 and torch.cuda.is_available()

    model = Model(args2, word_emb_layer, dim, n_layers, len(label2id),
                  use_cuda)
    model.load_state_dict(
        torch.load(os.path.join(model_path, 'model.pkl'),
                   map_location=lambda storage, loc: storage))
    if use_cuda:
        model = model.cuda()

    raw_test_data, raw_test_labels = read_corpus(args.input)
    label_to_index(raw_test_labels, label2id, incremental=False)

    test_data, test_embed, test_labels, test_lens, order = create_batches(
        dim,
        n_layers,
        raw_test_data,
        raw_test_labels,
        lexicon,
        word_lexicon,
        args2.batch_size,
        shuffle=False,
        sort=True,
        keep_full=True,
        use_cuda=use_cuda)

    if args.output is not None:
        fpo = codecs.open(args.output, 'w', encoding='utf-8')
    else:
        # wrap the underlying byte stream; sys.stdout itself is text-mode
        fpo = codecs.getwriter('utf-8')(sys.stdout.buffer)

    model.eval()
    tagset = []
    for x, p, y, lens in zip(test_data, test_embed, test_labels, test_lens):
        output, loss = model.forward(x, p, y)
        output_data = output.data
        for bid in range(len(x)):
            tags = []
            for k in range(lens[bid]):
                tag = id2label[int(output_data[bid][k])]
                tags.append(tag)
            tagset.append(tags)

    for l in order:
        for tag in tagset[l]:
            print(tag, file=fpo)
        print(file=fpo)

    fpo.close()
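
The script expects the lexicon HDF5 file to carry a '#info' dataset whose first two entries are the embedding dimension and the number of layers; the per-sentence datasets themselves are consumed inside create_batches, which is not shown. A sketch of writing and reading just that '#info' record (values are illustrative):

import h5py
import numpy as np

dim, n_layers = 1024, 3  # illustrative values
with h5py.File('lexicon.hdf5', 'w') as fout:
    fout.create_dataset('#info', data=np.array([dim, n_layers], dtype='int64'))

with h5py.File('lexicon.hdf5', 'r') as fin:
    print(fin['#info'][0].item(), fin['#info'][1].item())  # 1024 3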
Example #7
def train():
    cmd = argparse.ArgumentParser(sys.argv[0], conflict_handler='resolve')
    cmd.add_argument('--seed', default=1, type=int, help='the random seed.')
    cmd.add_argument('--gpu',
                     default=-1,
                     type=int,
                     help='use id of gpu, -1 if cpu.')
    cmd.add_argument('--encoder',
                     default='gal_lstm',
                     choices=['lstm', 'gal_lstm'],
                     help='the type of encoder: valid options=[lstm, gal_lstm]')
    cmd.add_argument('--optimizer',
                     default='sgd',
                     choices=['sgd', 'adam'],
                     help='the type of optimizer: valid options=[sgd, adam]')
    cmd.add_argument('--train_path',
                     required=True,
                     help='the path to the training file.')
    cmd.add_argument('--valid_path',
                     required=True,
                     help='the path to the validation file.')
    cmd.add_argument('--test_path',
                     required=False,
                     help='the path to the testing file.')
    cmd.add_argument('--lexicon',
                     required=True,
                     help='the path to the hdf5 file.')
    cmd.add_argument('--gold_valid_path',
                     type=str,
                     help='the path to the validation file.')
    cmd.add_argument('--gold_test_path',
                     type=str,
                     help='the path to the testing file.')
    cmd.add_argument("--model", required=True, help="path to save model")
    cmd.add_argument("--batch_size",
                     "--batch",
                     type=int,
                     default=32,
                     help='the batch size.')
    cmd.add_argument("--hidden_dim",
                     "--hidden",
                     type=int,
                     default=128,
                     help='the hidden dimension.')
    cmd.add_argument("--max_epoch",
                     type=int,
                     default=100,
                     help='the maximum number of training epochs.')
    cmd.add_argument("--word_dim",
                     type=int,
                     default=128,
                     help='the input dimension.')
    cmd.add_argument("--dropout",
                     type=float,
                     default=0.0,
                     help='the dropout rate')
    cmd.add_argument("--depth", type=int, default=2, help='the depth of lstm')
    cmd.add_argument("--word_cut",
                     type=int,
                     default=5,
                     help='drop words occurring fewer than this many times.')
    cmd.add_argument("--eval_steps", type=int, help='eval every x batches')
    cmd.add_argument("--l2",
                     type=float,
                     default=0.00001,
                     help='the l2 decay rate.')
    cmd.add_argument("--lr",
                     type=float,
                     default=0.01,
                     help='the learning rate.')
    cmd.add_argument("--lr_decay",
                     type=float,
                     default=0,
                     help='the learning rate decay.')
    cmd.add_argument("--clip_grad",
                     type=float,
                     default=1,
                     help='the threshold for gradient clipping.')
    cmd.add_argument("--consider_word_piece",
                     default=False,
                     action='store_true',
                     help='use word piece.')
    cmd.add_argument('--output', help='The path to the output file.')
    cmd.add_argument("--script",
                     required=True,
                     help="The path to the evaluation script")

    opt = cmd.parse_args(sys.argv[2:])

    print(opt)
    torch.manual_seed(opt.seed)
    random.seed(opt.seed)
    if opt.gpu >= 0:
        torch.cuda.set_device(opt.gpu)
        if opt.seed > 0:
            torch.cuda.manual_seed(opt.seed)

    if opt.gold_valid_path is None:
        opt.gold_valid_path = opt.valid_path

    if opt.gold_test_path is None and opt.test_path is not None:
        opt.gold_test_path = opt.test_path

    use_cuda = opt.gpu >= 0 and torch.cuda.is_available()

    lexicon = h5py.File(opt.lexicon, 'r')
    dim, n_layers = lexicon['#info'][0].item(), lexicon['#info'][1].item()
    logging.info('dim: {}'.format(dim))
    logging.info('n_layers: {}'.format(n_layers))

    raw_training_data, raw_training_labels = read_corpus(opt.train_path)
    raw_valid_data, raw_valid_labels = read_corpus(opt.valid_path)
    if opt.test_path is not None:
        raw_test_data, raw_test_labels = read_corpus(opt.test_path)
    else:
        raw_test_data, raw_test_labels = [], []

    logging.info(
        'training instance: {}, validation instance: {}, test instance: {}.'.
        format(len(raw_training_labels), len(raw_valid_labels),
               len(raw_test_labels)))
    logging.info(
        'training tokens: {}, validation tokens: {}, test tokens: {}.'.format(
            sum([len(seq) for seq in raw_training_labels]),
            sum([len(seq) for seq in raw_valid_labels]),
            sum([len(seq) for seq in raw_test_labels])))

    if not opt.consider_word_piece:
        label_to_ix = {'<pad>': 0}
    else:
        label_to_ix = {'<pad>': 0, '-word-piece-': 1}
    label_to_index(raw_training_labels, label_to_ix)
    label_to_index(raw_valid_labels, label_to_ix, incremental=False)
    label_to_index(raw_test_labels, label_to_ix, incremental=False)

    logging.info('number of tags: {0}'.format(len(label_to_ix)))

    word_count = collections.Counter()
    for x in raw_training_data:
        for w in x:
            word_count[w] += 1

    word_lexicon = {}
    for w in word_count:
        if word_count[w] >= opt.word_cut:
            word_lexicon[w] = len(word_lexicon)

    for special_word in ['<oov>', '<pad>']:
        if special_word not in word_lexicon:
            word_lexicon[special_word] = len(word_lexicon)
    logging.info('training vocab size: {}'.format(len(word_lexicon)))

    word_emb_layer = EmbeddingLayer(opt.word_dim,
                                    word_lexicon,
                                    fix_emb=False,
                                    embs=None)
    logging.info('Word embedding size: {0}'.format(len(
        word_emb_layer.word2id)))

    n_classes = len(label_to_ix)
    ix2label = {ix: label for label, ix in label_to_ix.items()}

    word2id = word_emb_layer.word2id

    training_payload = create_batches(dim,
                                      n_layers,
                                      raw_training_data,
                                      raw_training_labels,
                                      lexicon,
                                      word2id,
                                      opt.batch_size,
                                      use_cuda=use_cuda)

    if opt.eval_steps is None or opt.eval_steps > len(raw_training_data):
        opt.eval_steps = len(training_payload[0])

    valid_payload = create_batches(dim,
                                   n_layers,
                                   raw_valid_data,
                                   raw_valid_labels,
                                   lexicon,
                                   word2id,
                                   opt.batch_size,
                                   shuffle=False,
                                   sort=True,
                                   keep_full=True,
                                   use_cuda=use_cuda)

    if opt.test_path is not None:
        test_payload = create_batches(dim,
                                      n_layers,
                                      raw_test_data,
                                      raw_test_labels,
                                      lexicon,
                                      word2id,
                                      opt.batch_size,
                                      shuffle=False,
                                      sort=True,
                                      keep_full=True,
                                      use_cuda=use_cuda)
    else:
        test_payload = None

    model = Model(opt, word_emb_layer, dim, n_layers, n_classes,
                  opt.consider_word_piece, use_cuda)

    logging.info(str(model))
    if use_cuda:
        model = model.cuda()

    need_grad = lambda x: x.requires_grad
    if opt.optimizer.lower() == 'adam':
        optimizer = torch.optim.Adam(filter(need_grad, model.parameters()),
                                     lr=opt.lr)
    else:
        optimizer = torch.optim.SGD(filter(need_grad, model.parameters()),
                                    lr=opt.lr)

    try:
        os.makedirs(opt.model)
    except OSError as exception:
        if exception.errno != errno.EEXIST:
            raise

    with codecs.open(os.path.join(opt.model, 'word.dic'),
                     'w',
                     encoding='utf-8') as fpo:
        for w, i in word_emb_layer.word2id.items():
            print('{0}\t{1}'.format(w, i), file=fpo)

    with codecs.open(os.path.join(opt.model, 'label.dic'),
                     'w',
                     encoding='utf-8') as fpo:
        for label, i in label_to_ix.items():
            print('{0}\t{1}'.format(label, i), file=fpo)

    json.dump(
        vars(opt),
        codecs.open(os.path.join(opt.model, 'config.json'),
                    'w',
                    encoding='utf-8'))
    best_valid, test_result = -1e8, -1e8
    for epoch in range(opt.max_epoch):
        best_valid, test_result = train_model(epoch, model, optimizer,
                                              training_payload, valid_payload,
                                              test_payload, ix2label,
                                              best_valid, test_result)
        if opt.lr_decay > 0:
            optimizer.param_groups[0]['lr'] *= opt.lr_decay
        logging.info('Total encoder time: {:.2f}s'.format(model.eval_time /
                                                          (epoch + 1)))
        logging.info('Total embedding time: {:.2f}s'.format(model.emb_time /
                                                            (epoch + 1)))
        logging.info('Total classify time: {:.2f}s'.format(
            model.classify_time / (epoch + 1)))

    weights = model.weights
    if use_cuda:
        weights = weights.cpu()
    logging.info("weights: {}".format(weights.data.numpy()))
    logging.info("best_valid_acc: {:.6f}".format(best_valid))
    logging.info("test_acc: {:.6f}".format(test_result))