Code example #1
def make_dict_vocab_to_tensor(bert_config):
  """Map every vocabulary entry to a BERT output tensor."""
  # Load the vocab file and build the vocabulary list.
  vocab = tokenization.load_vocab(flags_vocab_file)
  # Build input_ids: one single-token sequence per vocabulary id (0 .. len(vocab)-1).
  input_ids = tf.constant([[i] for i in range(len(vocab))], dtype=tf.int32)
  # input_mask is 1 for every real token.
  input_mask = tf.ones_like(input_ids)
  # segment_ids are all 0 (single-segment input).
  segment_ids = tf.zeros_like(input_ids)

  # Build the BERT model once and feed the whole vocabulary through it.
  model = modeling.BertModel(
    config=bert_config,
    is_training=False,
    input_ids=input_ids,
    input_mask=input_mask,
    token_type_ids=segment_ids,
    use_one_hot_embeddings=False)

  final_hidden = model.get_sequence_output()  # BERT's final layer

  # For debugging; remove when done.
  # final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3)
  # batch_size = final_hidden_shape[0]
  # seq_length = final_hidden_shape[1]
  # hidden_size = final_hidden_shape[2]

  return final_hidden
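
A minimal usage sketch for the function above, assuming the standard google-research/bert modeling module and a hypothetical bert_config.json path; restore a checkpoint first if pretrained (rather than randomly initialized) weights are needed.

import tensorflow as tf
import modeling

bert_config = modeling.BertConfig.from_json_file('bert_config.json')  # hypothetical path
final_hidden = make_dict_vocab_to_tensor(bert_config)
with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())  # random weights unless a checkpoint is restored
  embeddings = sess.run(final_hidden)  # shape: [vocab_size, 1, hidden_size]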
Code example #2
def main(_):
	tf.logging.set_verbosity(tf.logging.INFO)

	output_files = FLAGS.output_file.split(",")
	writers = [tf.python_io.TFRecordWriter(out) for out in output_files]

	rng = random.Random(FLAGS.random_seed)

	tokenizer = tokenization.WordpieceTokenizer(
			vocab=tokenization.load_vocab(FLAGS.vocab_file))

	estimator = get_embedding_estimator()

	sample = get_sample(FLAGS.input_sentence_file, FLAGS.input_mapping_file,
		rng, FLAGS.sample_size)
	batches = list(range(0, len(sample), 3000)) + [len(sample)]

	for brange in zip(batches, batches[1:]):
		batch_sample = sample[brange[0]:brange[1]]

		instances = create_training_instances(
			FLAGS.input_sentence_file, FLAGS.input_mapping_file, tokenizer,
			FLAGS.max_seq_length, rng, FLAGS.do_lower_case, batch_sample)

		write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length,
																		writers, estimator)

	for writer in writers:
		writer.close()
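
The zip(batches, batches[1:]) idiom in main() above pairs consecutive offsets into half-open (start, end) ranges; a standalone sketch of the same pattern:

def batch_ranges(n, batch_size):
    # Offsets 0, batch_size, 2*batch_size, ... plus the endpoint n,
    # paired into half-open (start, end) ranges as in main() above.
    edges = list(range(0, n, batch_size)) + [n]
    return list(zip(edges, edges[1:]))

# batch_ranges(7000, 3000) -> [(0, 3000), (3000, 6000), (6000, 7000)]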
Code example #3
def is_ontology_id(id):
  vocab = tokenization.load_vocab(flags_vocab_file)
  inv_vocab = {v: k for k, v in vocab.items()}
  ontology_tokens = []
  for token in vocab.keys():
    if len(token) >= 2:
      # Ontology tokens are wrapped in $...$ or %...% markers.
      if ((token[0] == '$' and token[-1] == '$')
          or (token[0] == '%' and token[-1] == '%')):
        ontology_tokens.append(token)
  return tokenization.convert_ids_to_tokens(inv_vocab, [id])[0] in ontology_tokens
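
Since is_ontology_id reloads the vocabulary from disk on every call, a hedged sketch of a cached variant (hypothetical helper names, same condition as above):

import functools

@functools.lru_cache(maxsize=1)
def _ontology_ids():
    # Precompute the ids of tokens wrapped in $...$ or %...% markers.
    vocab = tokenization.load_vocab(flags_vocab_file)
    return {i for token, i in vocab.items()
            if len(token) >= 2 and token[0] == token[-1] and token[0] in ('$', '%')}

def is_ontology_id_cached(id):
    return id in _ontology_ids()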
Code example #4
def main(args):
    vocabs = tokenization.load_vocab(args.vocab_file)
    # tokenizer = tokenization.BasicTokenizer(do_lower_case=False)
    phases = ['train']
    if args.need_dev:
        phases.append('dev')
    if not args.no_test:
        phases.append('test')

    for phase in phases:
        print('phase:', phase)
        prepare_ner(args, vocabs, phase)
Code example #5
File: mytest_loss_calc.py Project: Shinya-Kouda/kgc
def is_special_id(id):
    vocab = tokenization.load_vocab(vocab_pass)
    inv_vocab = {v: k for k, v in vocab.items()}
    special_tokens = []
    for token in vocab.keys():
        if len(token) >= 2:
            if token[0] == '[' and token[-1] == ']':
                special_tokens.append(token)
    return tokenization.convert_ids_to_tokens(inv_vocab, [id])[0] in special_tokens
Code example #6
def Memory_level_model_init(init_checkpoint):
    vocab_file = 'vocab.txt'
    embedding_dim = 128
    dropout_prob = 0.1
    processor = MemoryProcessor()
    label_list = processor.get_labels()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    vocab_dim = len(tokenization.load_vocab(vocab_file))

    model = SequenceClassification(vocab_dim, embedding_dim, dropout_prob,
                                   len(label_list), device)

    if init_checkpoint is not None:
        model.load_state_dict(torch.load(init_checkpoint, map_location='cpu'))

    return model
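
A short, hedged usage sketch for the initializer above; the checkpoint path is hypothetical, and the returned model still has to be moved to a device before inference:

import torch

model = Memory_level_model_init('model/Memory_model.bin')  # hypothetical checkpoint path
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()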
Code example #7
def __init__(self, vocab_file, do_lower_case=False):
    self.vocab = tokenization.load_vocab(vocab_file)
    self.inv_vocab = {v: k for k, v in self.vocab.items()}
    self.wordpiece_tokenizer = tokenization.WordpieceTokenizer(
        vocab=self.vocab)
    self.do_lower_case = do_lower_case
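
A hedged round-trip sketch, assuming this __init__ belongs to a tokenizer wrapper class (hypothetically named VocabTokenizer here) and a local vocab.txt:

tok = VocabTokenizer('vocab.txt')  # hypothetical class name and vocab path
pieces = tok.wordpiece_tokenizer.tokenize('unaffable')  # e.g. ['un', '##aff', '##able']
ids = [tok.vocab[p] for p in pieces]
tokens_back = [tok.inv_vocab[i] for i in ids]  # inv_vocab inverts the token->id mapping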
Code example #8
File: apply_opcode.py Project: zhanzq/PIE
  parser.add_argument("--input_tokens", type=str, default=None, help="path to possibly incorrect token file")
  parser.add_argument("--edit_ids", type=str, default=None, help="path to edit ids to be applied on input_tokens")
  parser.add_argument("--output_tokens", type=str, default=None, help="path to edited (hopefully corrected) file")
  parser.add_argument("--infer_mode", type=str, default="conll", help="post processing mode bea or conll")
  parser.add_argument("--path_common_inserts",type=str,default=None,help="path of common unigram inserts")
  parser.add_argument("--path_common_multitoken_inserts",type=str,default=None,help="path of common bigram inserts")
  parser.add_argument("--path_common_deletes",type=str,default=None,help="path to common deletions observed in train data")

parser = argparse.ArgumentParser()
add_arguments(parser)
FLAGS, unparsed = parser.parse_known_args()

DO_PARALLEL = False
INFER_MODE = FLAGS.infer_mode

vocab = tokenization.load_vocab(FLAGS.vocab_path)
basic_tokenizer = tokenization.BasicTokenizer(do_lower_case=False, vocab=vocab)
vocab_words = set(vocab)
with open(FLAGS.path_common_deletes, "rb") as f:
    common_deletes = pickle.load(f)
path_common_inserts = FLAGS.path_common_inserts
path_common_multitoken_inserts = FLAGS.path_common_multitoken_inserts
opcodes = opcodes.Opcodes(path_common_inserts, path_common_multitoken_inserts)

if __name__ == '__main__':
    class config:
        INPUT_UNCORRECTED_WORDS = FLAGS.input_tokens
        INPUT_EDITS = FLAGS.edit_ids
        OUTPUT_CORRECTED_WORDS = FLAGS.output_tokens


Code example #9
# counts = pair_counts(lines, maps)
# most_frequent_pairs(counts)

pairs = [
    ('year', 'ano'),
    ('wanted', 'queria'),
    ('question', 'questão'),
    ('I', 'eu'),
    ('opportunity', 'oportunidade'),
    ('problem', 'problema'),
    ('love', 'amor'),
]
K = 10

tokenizer = tokenization.WordpieceTokenizer(vocab=tokenization.load_vocab(
    "/home/arthur/Projects/bert/models/multi_aligned_cased_L-12_H-768_A-12/vocab.txt"
))

rng = random.Random(1234)
sample = set(get_sample(sent_path, map_path, rng, 50000))
lines = [l for i, l in enumerate(lines) if i in sample]

sents = get_sentences(lines, pairs, tokenizer, k=K)
embs = get_embeddings(sents)

from sklearn.manifold import TSNE
X = embs.reshape((-1, embs.shape[-1]))
X_embedded = TSNE(n_components=2, perplexity=20,
                  metric='cosine').fit_transform(X)
X_embedded.shape
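
A natural follow-up sketch, assuming matplotlib is available: scatter-plot the 2-D t-SNE projection computed above.

import matplotlib.pyplot as plt

plt.figure(figsize=(8, 8))
plt.scatter(X_embedded[:, 0], X_embedded[:, 1], s=5)
plt.title('t-SNE of BERT embeddings for the sampled word pairs')
plt.show()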
Code example #10
    def __init__(self):
        vocab_file = 'vocab.txt'
        vocab = tokenization.load_vocab(vocab_file=vocab_file)
        tokenizer = tokenization.WordpieceTokenizer(vocab=vocab)
        path = 'train_processed.txt'

        with open(path, 'r', encoding='utf-8') as train_file:
            lines = train_file.read().split('\n')

        max_length = 0

        # Note: max_length is measured in characters, which upper-bounds the
        # WordPiece token count used below.
        for i in range(len(lines)):
            TK = lines[i].split(' \t')

            if max_length < len(TK[0]):
                max_length = len(TK[0])

        max_length += 1  # one extra position for the leading [CLS] token

        self.input_ids = np.zeros(shape=[len(lines), max_length],
                                  dtype=np.int32)
        self.input_mask = np.zeros(shape=[len(lines), max_length],
                                   dtype=np.int32)
        self.label = np.zeros(shape=[len(lines)], dtype=np.int32)

        for i in range(len(lines) - 1):  # -1 skips the trailing empty line from split('\n')
            TK = lines[i].split(' \t')
            if len(TK) != 2:
                TK = lines[i].split('\t')  # fall back to a plain tab separator

            sentence = TK[0]
            token = tokenizer.tokenize(sentence)
            tk_ids = tokenization.convert_tokens_to_ids(vocab=vocab,
                                                        tokens=token)

            for j in range(len(tk_ids)):
                self.input_ids[i, j + 1] = tk_ids[j]
                self.input_mask[i, j + 1] = 1
            self.input_ids[i, 0] = tokenization.convert_tokens_to_ids(
                vocab=vocab, tokens=['[CLS]'])[0]
            self.input_mask[i, 0] = 1
            self.label[i] = int(TK[1])

        path = 'test_processed.txt'

        with open(path, 'r', encoding='utf-8') as test_file:
            lines = test_file.read().split('\n')

        max_length = 0

        for i in range(len(lines)):
            TK = lines[i].split(' \t')

            if max_length < len(TK[0]):
                max_length = len(TK[0])

        print(max_length)
        max_length += 1

        self.test_input_ids = np.zeros(shape=[len(lines), max_length],
                                       dtype=np.int32)
        self.test_input_ids_masking = np.zeros(shape=[len(lines), max_length],
                                               dtype=np.int32)
        self.test_label = np.zeros(shape=[len(lines)], dtype=np.int32)

        for i in range(len(lines) - 1):
            TK = lines[i].split(' \t')
            if len(TK) != 2:
                TK = lines[i].split('\t')

            sentence = TK[0]
            token = tokenizer.tokenize(sentence)
            tk_ids = tokenization.convert_tokens_to_ids(vocab=vocab,
                                                        tokens=token)

            for j in range(len(tk_ids)):
                self.test_input_ids[i, j + 1] = tk_ids[j]
                self.test_input_ids_masking[i, j + 1] = 1
            self.test_input_ids[i, 0] = tokenization.convert_tokens_to_ids(
                vocab=vocab, tokens=['[CLS]'])[0]
            self.test_input_ids_masking[i, 0] = 1

            self.test_label[i] = int(TK[1])

        self.Batch_Size = 8

        self.random_idx = np.array(range(self.label.shape[0]), dtype=np.int32)
        np.random.shuffle(self.random_idx)

        self.Batch_Idx = 0
        self.Test_Batch_Idx = 0
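
The constructor above ends by preparing batching state (Batch_Size, random_idx, Batch_Idx), but the batch-drawing method is not shown; a minimal sketch of what it presumably looks like:

    def next_batch(self):
        # Presumed implementation: walk the shuffled index in Batch_Size steps,
        # reshuffling once the epoch is exhausted.
        idx = self.random_idx[self.Batch_Idx:self.Batch_Idx + self.Batch_Size]
        self.Batch_Idx += self.Batch_Size
        if self.Batch_Idx >= self.label.shape[0]:
            self.Batch_Idx = 0
            np.random.shuffle(self.random_idx)
        return self.input_ids[idx], self.input_mask[idx], self.label[idx]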
Code example #11
def main():
    parser = argparse.ArgumentParser()

    # model
    parser.add_argument('--model', type=str, default='wordrnn')
    parser.add_argument('--dir', type=str, default=None)
    parser.add_argument('--tokenizer',
                        type=str,
                        default='nltk',
                        help='Only effective when model set to wordrnn')
    parser.add_argument('--criterion', type=str, default='full')

    # data
    parser.add_argument('--set', type=str, default='msr')
    parser.add_argument('--partition', type=str, default='va')
    parser.add_argument('--no-move-cached', action='store_true')

    parser.add_argument('--log-dir', type=str, default='train/noname')
    parser.add_argument('--save-pred', action='store_true')

    args = parser.parse_args()

    problem_set = ProblemSet.load(args.set)
    examples = problem_set.get_examples(args.partition)

    logger.info("Evaluating models saved in {} on {}-{}".format(
        args.dir, args.set, args.partition))

    if not os.path.exists(args.log_dir):
        logger.info("Creating directory at {}".format(args.log_dir))
        os.makedirs(args.log_dir)

    args_path = os.path.join(args.log_dir, 'args.json')
    with open(args_path, 'w') as f:
        logger.info("Saving arguments at {}".format(args_path))
        json.dump(vars(args), f, indent=2)

    log_path = os.path.join(args.log_dir, 'log.txt')
    file_handler = logging.FileHandler(log_path, mode='w')
    file_handler.setLevel(logging.INFO)
    logger.addHandler(file_handler)

    model_type = args.model.lower()
    if model_type == 'wordrnn':
        args_path = osp.join(args.dir, 'args.json')
        with open(args_path, 'r') as f:
            arg_dict = json.load(f)

        vocab_path = osp.join(args.dir, 'vocab.txt')
        vocab = load_vocab(vocab_path)
        if args.tokenizer.lower() == 'nltk':
            tokenizer = NLTKTokenizer(vocab, arg_dict['lower'])
        elif args.tokenizer.lower() == 'wordpiece':
            tokenizer = BertTokenizer(vocab_path, arg_dict['lower'])
        model = WordRNN(len(vocab), len(vocab), arg_dict['rnncell'],
                        arg_dict['emsize'], arg_dict['outsize'],
                        arg_dict['nhid'],
                        arg_dict['nlayers'], arg_dict['bidirec'],
                        arg_dict.get('autoenc',
                                     False), arg_dict['decoder_bias'])
        logger.info(model)

        ckpt_paths = glob.glob(osp.join(args.dir, '*.pt'))
        ckpt_paths.sort(key=osp.getmtime)
        for path in ckpt_paths:
            model.load_state_dict(torch.load(path))
            direction = 'autoenc' if model.autoenc else (
                'bidirec' if model.bidirec else 'forward')
            evaluate(examples, model, tokenizer, direction, args.criterion,
                     str(osp.basename(path.split('.')[0])))
            if args.save_pred:
                save_fn = osp.basename(path).replace('.pt', '.csv')
                save_preds(examples, osp.join(args.log_dir, save_fn))
    elif model_type == 'lm1b':
        lm1b_dir = settings['lm1b_dir']

        for e in examples:
            e.context[0] = ' '.join(['<S>', e.context[0]])
            e.context[-1] = ' '.join([e.context[-1], '</S>'])

        vocab = load_vocab(osp.join(lm1b_dir, 'vocab-2016-09-10.txt'))
        special_tokens = ['<S>', '</S>', '<UNK>']
        tokenizer = BaseTokenizer(vocab, False, '<UNK>', special_tokens)
        in_vocab = load_vocab(osp.join(lm1b_dir, args.dir, 'vocab.txt'))

        out_to_in = [in_vocab['<UNK>']] * 800000
        for i, token in tokenizer.ids_to_tokens.items():
            out_to_in[i] = in_vocab.get(token, in_vocab['<UNK>'])

        tf_path = osp.join(lm1b_dir, 'ckpt-*')
        npy_path = osp.join(lm1b_dir, args.dir, 'embeddings.npy')
        model = LM1B.from_tf(tf_path, npy_path, out_to_in, 8)
        logger.info(model)

        evaluate(examples, model, tokenizer, 'forward', args.criterion)
        if args.save_pred:
            save_preds(examples, osp.join(args.log_dir, 'preds.csv'))
    else:
        cache_dir = settings['pretrans_dir']
        bert_dir = osp.join(settings['pretrans_dir'], args.dir)
        model_or_dir = bert_dir if osp.exists(bert_dir) else args.dir

        config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type]
        config = config_class.from_pretrained(model_or_dir,
                                              cache_dir=cache_dir)
        tokenizer = tokenizer_class.from_pretrained(
            model_or_dir,
            cache_dir=cache_dir,
            max_len=config.max_position_embeddings,
            do_lower_case='-uncased' in model_or_dir)
        model = model_class.from_pretrained(model_or_dir,
                                            cache_dir=cache_dir,
                                            config=config)

        direction = 'forward'
        if model_type == 'bert':
            direction = 'autoenc'

        evaluate(examples, model, tokenizer, direction, args.criterion)
        if args.save_pred:
            save_preds(examples, osp.join(args.log_dir, 'preds.csv'))

        if not args.no_move_cached and not osp.exists(bert_dir):
            logger.info("Creating directory at {}".format(bert_dir))
            os.mkdir(bert_dir)

            model_url = model.pretrained_model_archive_map[model_or_dir]
            model_path = osp.join(bert_dir, WEIGHTS_NAME)
            move_cached(model_url, cache_dir, model_path)

            config_url = model.config.pretrained_config_archive_map[
                model_or_dir]
            config_path = osp.join(bert_dir, CONFIG_NAME)
            move_cached(config_url, cache_dir, config_path)

            for k, url_map in tokenizer.pretrained_vocab_files_map.items():
                vocab_path = osp.join(bert_dir, tokenizer.vocab_files_names[k])
                move_cached(url_map[model_or_dir], cache_dir, vocab_path)
Code example #12
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument(
        "--vocab_file",
        default=None,
        type=str,
        required=True,
        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--do_lower_case",
        default=False,
        action='store_true',
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--dev_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for develop")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--save_checkpoints_steps",
                        default=3000,
                        type=int,
                        help="How often to save the model checkpoint.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        "--accumulate_gradients",
        type=int,
        default=1,
        help=
        "Number of steps to accumulate gradient on (divide the batch_size and accumulate)"
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass."
    )
    parser.add_argument('--model_path',
                        type=str,
                        default='./model',
                        help='save model path')
    parser.add_argument('--load_model', type=str, default=None)
    parser.add_argument('--embedding_dim', type=int, default=300)
    parser.add_argument('--dropout_prob', type=float, default=0.2)

    args = parser.parse_args()
    processors = {"memory": MemoryProcessor, "logic": LogicalProcessor}

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = "cuda"
        n_gpu = torch.cuda.device_count()
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        dist.init_process_group(backend='nccl')
        torch.backends.cudnn.benchmark = True

    if args.accumulate_gradients < 1:
        raise ValueError(
            "Invalid accumulate_gradients parameter: {}, should be >= 1".
            format(args.accumulate_gradients))

    args.train_batch_size = int(args.train_batch_size /
                                args.accumulate_gradients)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()

    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_steps = int(
            len(train_examples) / args.train_batch_size *
            args.num_train_epochs)

    vocab_dim = len(tokenization.load_vocab(args.vocab_file))

    model = SequenceClassification(vocab_dim, args.embedding_dim,
                                   args.dropout_prob, len(label_list), device)

    if args.load_model is not None:
        model.load_state_dict(torch.load(args.load_model, map_location='cpu'))

    model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)

    global_step = 0

    if args.local_rank != -1:
        model = DDP(model)
        optimizer = FP16_Optimizer(optimizer)
        '''
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[args.local_rank],
                                                          output_device=args.local_rank)
        '''
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if args.do_train:
        #train feature
        train_features = convert_to_ids(train_examples, label_list,
                                        args.max_seq_length, tokenizer)

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        all_q_ids = torch.tensor([f.que_ids for f in train_features],
                                 dtype=torch.long)
        all_d_ids = torch.tensor([f.des_ids for f in train_features],
                                 dtype=torch.long)
        all_sd_ids = torch.tensor([f.scene_ids for f in train_features],
                                  dtype=torch.long)
        #all_Ld_ids = torch.tensor([f.local_scene_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.long)

        train_data = TensorDataset(all_q_ids, all_d_ids, all_sd_ids,
                                   all_label_ids)

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      num_workers=1,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        #developset feature
        dev_examples = processor.get_dev_examples(args.data_dir)
        dev_features = convert_to_ids(dev_examples, label_list,
                                      args.max_seq_length, tokenizer)

        all_dev_q_ids = torch.tensor([f.que_ids for f in dev_features],
                                     dtype=torch.long)
        all_dev_d_ids = torch.tensor([f.des_ids for f in dev_features],
                                     dtype=torch.long)
        all_dev_sd_ids = torch.tensor([f.scene_ids for f in dev_features],
                                      dtype=torch.long)
        #all_dev_Ld_ids = torch.tensor([f.local_scene_ids for f in dev_features], dtype=torch.long)
        all_dev_label_ids = torch.tensor([f.label_id for f in dev_features],
                                         dtype=torch.long)

        dev_data = TensorDataset(all_dev_q_ids, all_dev_d_ids, all_dev_sd_ids,
                                 all_dev_label_ids)
        if args.local_rank == -1:
            dev_sampler = RandomSampler(dev_data)
        else:
            dev_sampler = DistributedSampler(dev_data)

        dev_dataloader = DataLoader(dev_data,
                                    num_workers=1,
                                    sampler=dev_sampler,
                                    batch_size=args.eval_batch_size)

        model.train()
        losses = []
        dev_accuracy_list = []
        dev_losses = []
        for epoch in range(int(args.num_train_epochs)):

            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for q_ids, d_ids, sd_ids, label_ids in train_dataloader:

                optimizer.zero_grad()

                q_ids = q_ids.to(device)
                d_ids = d_ids.to(device)
                sd_ids = sd_ids.to(device)
                #Ld_ids = Ld_ids.to(device)
                label_ids = label_ids.to(device)

                loss, _ = model.forward(q_ids, d_ids, sd_ids, label_ids)

                tr_loss += loss.item()
                nb_tr_examples += q_ids.size(0)
                nb_tr_steps += 1

                loss.backward()
                optimizer.step()

                global_step += 1
            if (epoch + 1) % 10 == 0:
                if args.task_name == 'memory':
                    torch.save(
                        model.state_dict(),
                        os.path.join(
                            args.model_path,
                            'non_crossPassage_res_memory_model' +
                            str(epoch + 1) + '.bin'))
                else:
                    torch.save(
                        model.state_dict(),
                        os.path.join(
                            args.model_path,
                            'non_crossPassage_res_logic_model' +
                            str(epoch + 1) + '.bin'))
            losses.append(tr_loss / nb_tr_steps)

            #develop dataset evaluation
            dev_accuracy, nb_dev_examples = 0, 0
            for q_ids, d_ids, sd_ids, label_ids in dev_dataloader:

                q_ids = q_ids.to(device)
                d_ids = d_ids.to(device)
                sd_ids = sd_ids.to(device)
                #Ld_ids = Ld_ids.to(device)
                label_ids = label_ids.to(device)

                dev_loss, logits = model.forward(q_ids, d_ids, sd_ids,
                                                 label_ids)

                label_ids = label_ids.to('cpu').numpy()
                logits = logits.to('cpu').detach().numpy()

                tmp_dev_accuracy = accuracy(logits, label_ids)
                dev_accuracy += tmp_dev_accuracy

                nb_dev_examples += q_ids.size(0)

            print('-' * 20)
            print("Epochs : {}".format(epoch + 1))
            print("dev_accuracy : {}".format(dev_accuracy / nb_dev_examples))
            print("train Loss : {}".format(tr_loss / nb_tr_steps))
            print("validataion Loss : {}".format(dev_loss.item()))
            dev_losses.append(dev_loss.item())
            print('-' * 20)

    if args.do_eval:
        eval_examples = processor.get_test_examples(args.data_dir)
        eval_features = convert_to_ids(eval_examples, label_list,
                                       args.max_seq_length, tokenizer)

        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)

        all_q_vectors = torch.tensor([f.que_ids for f in eval_features],
                                     dtype=torch.long)
        all_d_vectors = torch.tensor([f.des_ids for f in eval_features],
                                     dtype=torch.long)
        all_sd_vectors = torch.tensor([f.scene_ids for f in eval_features],
                                      dtype=torch.long)
        #all_Ld_vectors = torch.tensor([f.local_scene_ids for f in eval_features], dtype=torch.long)
        all_label_vectors = torch.tensor([f.label_id for f in eval_features],
                                         dtype=torch.long)

        eval_data = TensorDataset(all_q_vectors, all_d_vectors, all_sd_vectors,
                                  all_label_vectors)

        if args.local_rank == -1:
            eval_sampler = SequentialSampler(eval_data)
        else:
            eval_sampler = DistributedSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     num_workers=1,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        logit_label_list = []
        for step, (q_vec, d_vec, sd_vec, label_vec) in enumerate(
                tqdm(eval_dataloader, desc="Iteration")):

            q_vec = q_vec.to(device)
            d_vec = d_vec.to(device)
            sd_vec = sd_vec.to(device)
            #Ld_vec = Ld_vec.to(device)
            label_vec = label_vec.to(device)

            tmp_eval_loss, logits = model.forward(q_vec, d_vec, sd_vec,
                                                  label_vec)

            label_ids = label_vec.to('cpu').numpy()
            logits = logits.to('cpu').detach().numpy()

            tmp_eval_accuracy = accuracy(logits, label_ids)

            output = np.argmax(logits, axis=1)

            logit_label_list.append([output, label_ids])

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += q_vec.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps  # len(eval_dataloader)
        eval_accuracy = eval_accuracy / nb_eval_examples  # len(eval_dataloader)

        result = {
            'eval_loss': eval_loss,
            'eval_accuracy': eval_accuracy,
            'global_step': global_step
        }
        #'loss': tr_loss / nb_tr_steps}  # 'loss': loss.item()}

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open('[memory]align_epoch20_output', 'w') as f:
            logit_output_list = []
            Gold_output_list = []
            for labels in logit_label_list:
                for logit in labels[0]:
                    logit_output = convert_id_to_label(logit, label_list)
                    logit_output_list.append(logit_output)
                for Gold in labels[1]:
                    Gold_output = convert_id_to_label(Gold, label_list)
                    Gold_output_list.append(Gold_output)
            for logit, gold in zip(logit_output_list, Gold_output_list):
                f.write(str(logit) + '\t' + str(gold) + '\n')

        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
Code example #13
    def create_embedding_table(self, embedding_type, name):
        oov_vocab, in_vocab = set(), set()
        if embedding_type == 'random':
            embedding_table = tf.Variable(tf.random_uniform(
                [self.vocab_size, self.embed_size], -1.0, 1.0),
                                          name='embed_w')
            return embedding_table

        elif re.search('word2vec', embedding_type) is not None:
            embedding_file = self.params[
                'word2vec_file']  ##https://github.com/Embedding/Chinese-Word-Vectors
            embedding_vocab = gensim.models.KeyedVectors.load_word2vec_format(
                embedding_file,
                binary=True,
                encoding='utf-8',
                unicode_errors='ignore')
            embedding_table = np.zeros((self.vocab_size, self.embed_size))
            self.vocab, index_vocab = tokenization.load_vocab(
                vocab_file=os.path.join(self.params['data_dir'],
                                        'vocab_word.txt'),
                params=self.params)

            for word, i in self.vocab.items():
                if word in embedding_vocab.vocab:
                    embedding_table[i] = embedding_vocab[word]
                    in_vocab.add(word)
                else:
                    embedding_table[i] = np.random.random(self.embed_size)
                    oov_vocab.add(word)
            tf.logging.info('OOV:%f' % (len(oov_vocab) /
                                        (len(oov_vocab) + len(in_vocab))))

            if embedding_type == 'word2vec_finetune':
                trainable = True
            elif embedding_type == 'word2vec_static':
                trainable = False
            else:
                trainable = False
                print("word2vec embedding type must be 'word2vec_static' or "
                      "'word2vec_finetune'; defaulting to static.")
            embedding_table2 = tf.get_variable(
                name=name + 'embedding_w',
                shape=[self.vocab_size, self.embed_size],
                initializer=tf.constant_initializer(embedding_table),
                trainable=trainable)
            return embedding_table2

        elif re.search('fasttext', embedding_type) is not None:
            #https://fasttext.cc/docs/en/crawl-vectors.html
            embedding_vocab = self._load_embedding_pretrained(
                embedding_file=self.params['fasttext_file'])
            embedding_table = np.zeros((self.vocab_size, self.embed_size))
            self.vocab, index_vocab = tokenization.load_vocab(
                vocab_file=os.path.join(self.params['data_dir'],
                                        'vocab_word.txt'),
                params=self.params)

            for word, i in self.vocab.items():
                if word in embedding_vocab.keys():
                    embedding_table[i] = embedding_vocab[word]
                    in_vocab.add(word)
                else:
                    embedding_table[i] = np.random.random(self.embed_size)
                    oov_vocab.add(word)

            if embedding_type == 'fasttext_finetune':
                trainable = True
            elif embedding_type == 'fasttext_static':
                trainable = False
            else:
                trainable = False
                print("fasttext embedding type must be 'fasttext_static' or "
                      "'fasttext_finetune'; defaulting to static.")
            embedding_table2 = tf.get_variable(
                name=name + 'embedding_w',
                shape=[self.vocab_size, self.embed_size],
                initializer=tf.constant_initializer(embedding_table),
                trainable=trainable)
            tf.logging.info('OOV:%f' % (len(oov_vocab) /
                                        (len(oov_vocab) + len(in_vocab))))
            return embedding_table2

        elif re.search('glove', embedding_type) is not None:
            pass

        elif re.search('elmo', embedding_type) is not None:
            print('Unsupported embedding type: %s' % self.params['embedding_type'])
            print('For ELMo, see the HIT-SCIR/ELMoForManyLangs repository.')
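
A hedged consumption sketch for the returned table (model, input_ids, and the embedding-type string are assumptions; tf.nn.embedding_lookup is the standard way to use such a table):

embedding_table = model.create_embedding_table('word2vec_static', 'word_')  # hypothetical call site
embedded_inputs = tf.nn.embedding_lookup(embedding_table, input_ids)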
Code example #14
def Logic_level_model(question, clip_description, scene_description):

    # environment
    max_sequence_length = 128
    vocab_file = 'vocab.txt'
    embedding_dim = 200
    dropout_prob = 0.2
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    vocab_dim = len(tokenization.load_vocab(vocab_file))

    processor = LogicalProcessor()
    label_list = processor.get_labels()
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=False)

    model = SequenceClassification(vocab_dim, embedding_dim, dropout_prob,
                                   len(label_list), device)

    init_checkpoint = 'model/Logic_model.bin'

    #Future save model Load code

    if init_checkpoint is not None:
        model.load_state_dict(torch.load(init_checkpoint, map_location='cpu'))

    model.to(device)

    eval_example = processor._create_examples(question, clip_description,
                                              scene_description)
    eval_feature = convert_to_ids(eval_example, label_list,
                                  max_sequence_length, tokenizer)

    que_ids = torch.tensor([f.que_ids for f in eval_feature], dtype=torch.long)
    des_ids = torch.tensor([f.des_ids for f in eval_feature], dtype=torch.long)
    scene_ids = torch.tensor([f.scene_ids for f in eval_feature],
                             dtype=torch.long)

    if eval_feature[0].label_id is None:
        label_ids = None
    else:
        label_ids = torch.tensor([f.label_id for f in eval_feature],
                                 dtype=torch.long)

    #eval_data = TensorDataset(input_ids, input_mask, segment_ids, label_ids)

    #eval_dataloader = DataLoader(eval_data)

    model.eval()

    que_ids = que_ids.to(device)
    des_ids = des_ids.to(device)
    scene_ids = scene_ids.to(device)
    if label_ids is None:
        logits = model(que_ids, des_ids, scene_ids, label_ids)
    else:
        label_ids = label_ids.to(device)
        loss, logits = model(que_ids, des_ids, scene_ids, label_ids)

    logits = logits.detach().cpu().numpy()

    output = np.argmax(logits, axis=1)[0]  # batch of one: take its argmax over the labels
    output = convert_id_to_label(output, label_list)
    return output
Code example #15
File: mytest_loss_calc.py Project: Shinya-Kouda/kgc
import tensorflow as tf
import tokenization

vocab_pass = '******'

p_tokens = [
    '[Subject]', 'John', 'and', 'Michael', '[equalTo]', 'genius', '[when]',
    'morning'
]
predict_ids = tokenization.convert_tokens_to_ids(
    tokenization.load_vocab(vocab_pass), p_tokens)
r_tokens = [
    '[Subject]', 'John', 'and', 'Michael', '[equalTo]', 'smart', '[when]',
    'afternoon', 'and', 'evening'
]
real_ids = tokenization.convert_tokens_to_ids(
    tokenization.load_vocab(vocab_pass), r_tokens)
predict_tensor = tf.constant([[1, 2, 3], [98, 1, 6], [1, 2, 4], [22, 1, 6],
                              [3, 2, 3], [7, 1, 6], [0, 2, 3], [11, 1, 9]],
                             dtype=float)
real_tensor = tf.constant(
    [[1, 2, 3], [12, 8, 1], [1, 2, 4], [12, 8, 1], [3, 2, 3], [12, 8, 1],
     [0, 2, 3], [12, 8, 1], [1, 2, 4], [12, 8, 1]],
    dtype=float)


def is_special_id(id):
    vocab = tokenization.load_vocab(vocab_pass)
    inv_vocab = {v: k for k, v in vocab.items()}
    special_tokens = []
    for token in vocab.keys():
        if len(token) >= 2:
            if token[0] == '[' and token[-1] == ']':
                special_tokens.append(token)
    return tokenization.convert_ids_to_tokens(inv_vocab, [id])[0] in special_tokens
Code example #16
def main():
    parser = argparse.ArgumentParser()

    # model structure
    parser.add_argument('--rnncell', type=str, default='LSTM')
    parser.add_argument('--emsize', type=int, default=200)
    parser.add_argument('--nhid', type=int, default=600)
    parser.add_argument('--outsize', type=int, default=400)
    parser.add_argument('--nlayers', type=int, default=2)
    parser.add_argument('--bidirec', action='store_true')
    parser.add_argument('--autoenc', action='store_true')
    parser.add_argument('--forget-bias', type=float, default=False)
    parser.add_argument('--decoder-bias', action='store_true')

    # data
    parser.add_argument('--corpus', type=str, default='guten')
    parser.add_argument('--min-len', type=int, default=10)
    parser.add_argument('--max-len', type=int, default=80)

    # vocabulary
    parser.add_argument('--vocab', type=str, default=None)
    parser.add_argument('--lower', action='store_true')
    parser.add_argument('--min-cnt', type=int, default=6)

    # training
    parser.add_argument('--dropout', type=float, default=0.1)
    parser.add_argument('--seed', type=int, default=3333)
    parser.add_argument('--batch-size', type=int, default=20)
    parser.add_argument('--eval-batch-size', type=int, default=10)

    # optimizer
    parser.add_argument('--optim', type=str, default='SGD')
    parser.add_argument('--lr', type=float, default=.5)
    parser.add_argument('--clip', type=float, default=5.0)
    parser.add_argument('--decay-after', type=int, default=5)
    parser.add_argument('--decay-rate', type=float, default=0.5)
    parser.add_argument('--decay-period', type=int, default=1)
    parser.add_argument('--epochs', type=int, default=10)

    # save and log
    parser.add_argument('--save-dir', type=str, default='train/noname')
    parser.add_argument('--log-interval', type=int, default=10000)
    parser.add_argument('--eval-interval', type=int, default=10000)
    parser.add_argument('--save-all', action='store_false')
    parser.add_argument('--save-period', type=int, default=1)

    args = parser.parse_args()
    logger.debug("Running {}".format(__file__))

    if not os.path.exists(args.save_dir):
        logger.debug("Creating directory at {}".format(args.save_dir))
        os.makedirs(args.save_dir)

    args_path = os.path.join(args.save_dir, 'args.json')
    with open(args_path, 'w') as f:
        logger.debug("Saving arguments at {}".format(args_path))
        json.dump(vars(args), f, indent=2)

    log_path = os.path.join(args.save_dir, 'log.txt')
    file_handler = logging.FileHandler(log_path, mode='w')
    file_handler.setLevel(logging.INFO)
    logger.addHandler(file_handler)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Set the random seed manually for reproducibility.
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Use pre built vocabulary if it exists
    if args.vocab and os.path.exists(args.vocab):
        vocab = load_vocab(args.vocab)
        update = False
    else:
        vocab = Vocabulary()
        update = True
    tokenizer = Tokenizer(vocab, args.lower)

    tr_txts = get_txts(args.corpus, 'train')
    va_txts = get_txts(args.corpus, 'valid')

    tr_input = LineInput(tr_txts, tokenizer, update, args.min_len,
                         args.max_len)
    va_input = LineInput(va_txts, tokenizer, update, args.min_len,
                         args.max_len)
    va_batches = va_input.batchify(args.eval_batch_size, False)

    if update:
        vocab.build_from_counter(args.min_cnt)
        logger.debug("Built vocab of size {}".format(len(vocab)))

    # Build the model
    model = WordRNN(len(vocab), len(vocab), args.rnncell, args.emsize,
                    args.outsize, args.nhid, args.nlayers, args.bidirec,
                    args.autoenc, args.decoder_bias, args.forget_bias,
                    args.dropout)
    logger.debug(model)
    model.to(device)

    learnables = list(filter(lambda p: p.requires_grad, model.parameters()))
    optimizer = getattr(optim, args.optim)(learnables, lr=args.lr)

    save_vocab(vocab, os.path.join(args.save_dir, 'vocab.txt'))
    model_path = os.path.join(args.save_dir, 'model.pt')

    # At any point you can hit Ctrl + C to break out of training early.
    try:
        # Loop over epochs.
        best_val_loss = None

        logger.info('-' * 79)
        for epoch in range(1, args.epochs + 1):
            epoch_start_time = time.time()
            tr_batches = tr_input.batchify(args.batch_size, True)
            train(model, tr_batches, learnables, optimizer, device, args)

            val_loss = evaluate(model, va_batches, device)
            logger.info('-' * 79)
            logger.info('| end of epoch {:2d} | time: {:5.2f}s '
                        '| valid loss {:5.2f} | valid ppl {:8.2f} |'.format(
                            epoch, (time.time() - epoch_start_time), val_loss,
                            math.exp(val_loss)))
            logger.info('-' * 79)

            updated_best = not best_val_loss or val_loss < best_val_loss
            if epoch >= args.decay_after > 0:
                if (epoch - args.decay_after) % args.decay_period == 0:
                    for group in optimizer.param_groups:
                        group['lr'] *= args.decay_rate

            if (epoch % args.save_period == 0) and (updated_best
                                                    or args.save_all):
                if args.save_all:
                    model_path = os.path.join(args.save_dir,
                                              'ep{}.pt'.format(epoch))
                torch.save(model.state_dict(), model_path)

                if updated_best:
                    best_val_loss = val_loss

        logger.debug("Completed training and saved to {}".format(
            args.save_dir))
    except KeyboardInterrupt:
        logger.debug('-' * 79)
        logger.debug("Exiting from training early")