Example #1
def do_infer_sent(args):
    if not os.path.exists(args.name + '.token'):
        logging.error('missing {} file (run preprocess mode)'.format(args.name + '.token'))
        sys.exit()
    if not os.path.exists(args.name + '.vocab'):
        logging.error('missing {} file (run preprocess mode)'.format(args.name + '.vocab'))
        sys.exit()
    if len(glob.glob(args.name + '.model.?????????.pth')) == 0:
        logging.error('no model available: {}'.format(args.name + '.model.?????????.pth'))
        sys.exit()

    token = OpenNMTTokenizer(args.name + '.token')
    vocab = Vocab()
    vocab.read(args.name + '.vocab')
    args.embedding_size, args.pooling = read_params(args)
    model = Word2Vec(len(vocab), args.embedding_size, args.pooling, vocab.idx_unk)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=(args.beta1,args.beta2), eps=args.eps)
    n_steps, model, optimizer = load_model_optim(args.name, args.embedding_size, vocab, model, optimizer)
    if args.cuda:
        model.cuda()

    dataset = Dataset(args, token, vocab, 'infer_sent', skip_subsampling=True)
    with torch.no_grad():
        model.eval()
        for batch in dataset:
            snts = model.SentEmbed(batch[0], batch[1], 'iEmb').cpu().detach().numpy().tolist()
            for i in range(len(snts)):
                sentence = ["{:.6f}".format(w) for w in snts[i]]
                print('{}\t{}'.format(batch[2][i]+1, ' '.join(sentence) ))
Example #2
def do_infer_word(args):
    if not os.path.exists(args.name + '.token'):
        logging.error('missing {} file (run preprocess mode)'.format(args.name + '.token'))
        sys.exit()
    if not os.path.exists(args.name + '.vocab'):
        logging.error('missing {} file (run preprocess mode)'.format(args.name + '.vocab'))
        sys.exit()
    if len(glob.glob(args.name + '.model.?????????.pth')) == 0:
        logging.error('no model available: {}'.format(args.name + '.model.?????????.pth'))
        sys.exit()

    token = OpenNMTTokenizer(args.name + '.token')
    vocab = Vocab()
    vocab.read(args.name + '.vocab')
    args.embedding_size, args.pooling = read_params(args)
    model = Word2Vec(len(vocab), args.embedding_size, args.pooling, vocab.idx_unk)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=(args.beta1,args.beta2), eps=args.eps)
    n_steps, model, optimizer = load_model_optim(args.name, args.embedding_size, vocab, model, optimizer)
    if args.cuda:
        model.cuda()

    if args.sim == 'cos':
        distance = nn.CosineSimilarity(dim=1, eps=1e-6)
    elif args.sim == 'pairwise':
        distance = nn.PairwiseDistance(eps=1e-6)
    else:
        logging.error('bad -sim option {}'.format(args.sim))
        sys.exit()

    dataset = Dataset(args, token, vocab, 'infer_word', skip_subsampling=True)
    with torch.no_grad():
        model.eval()
        voc_i = [i for i in range(0,len(vocab))]
        voc_e = model.Embed(voc_i,'iEmb')
        for batch in dataset:
            #batch[0] batch_wrd
            #batch[1] batch_isnt
            #batch[2] batch_iwrd
            wrd_i = batch[0]
            wrd_e = model.Embed(wrd_i, 'iEmb') #.cpu().detach().numpy().tolist()

            for i in range(len(wrd_i)): ### for each word, find its closest vocabulary entries
                ind_snt = batch[1][i]
                ind_wrd = batch[2][i]
                wrd = vocab[wrd_i[i]]
                out = []
                out.append("{}:{}:{}".format(ind_snt,ind_wrd,wrd))

                dist_wrd_voc = distance(wrd_e[i].unsqueeze(0),voc_e)
                mininds = torch.argsort(dist_wrd_voc,dim=0,descending=True)
                for k in range(1,len(mininds)):
                    ind = mininds[k].item() #cpu().detach().numpy()
                    if i != ind:
                        dis = dist_wrd_voc[ind].item()
                        wrd = vocab[ind]
                        out.append("{:.6f}:{}".format(dis,wrd))
                        if len(out)-1 == args.k:
                            break
                print('\t'.join(out))
Example #3
 def __init__(self, **vocab_kwargs):
     self.vocab = {
         'rel': vocab_kwargs.get('rel', Vocab()),
         'ner': vocab_kwargs.get('ner', Vocab(unk='O')),
         'dep': vocab_kwargs.get('dep', Vocab()),
         'pos': vocab_kwargs.get('pos', Vocab(unk='.')),
         'word': vocab_kwargs.get('word', Vocab(unk='UNKNOWN')),
     }
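The Vocab class itself is not included in any of these snippets and differs from project to project. As a point of reference only, a minimal sketch that is consistent with the keyword-argument style of Example #3 and with the len(vocab) / vocab.idx_unk / vocab[i] usage in Examples #1 and #2 might look like the following (class layout, names and defaults are assumptions, not code from the projects above):

class Vocab:
    # illustrative sketch only -- not taken from any of the projects above
    def __init__(self, unk='<unk>'):
        # index 0 is reserved for the unknown token
        self.idx_unk = 0
        self.idx_to_token = [unk]
        self.token_to_idx = {unk: self.idx_unk}

    def add(self, token):
        # register a token if unseen and return its index
        if token not in self.token_to_idx:
            self.token_to_idx[token] = len(self.idx_to_token)
            self.idx_to_token.append(token)
        return self.token_to_idx[token]

    def __getitem__(self, key):
        # str -> index (falling back to unk), int -> token string
        if isinstance(key, str):
            return self.token_to_idx.get(key, self.idx_unk)
        return self.idx_to_token[key]

    def __len__(self):
        return len(self.idx_to_token)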
Example #4
def load_data(model_type, pd):
    multi_sense, n_sense = set_sense_paras(model_type, pd)
    x_vocab = Vocab(pd['x_vocab_file'], multi_sense, n_sense)
    y_vocab = Vocab(pd['y_vocab_file'], False, 1)
    train_data = RelationData(pd['train_data_file'], multi_sense, n_sense)
    test_data = RelationData(pd['test_data_file'], multi_sense, n_sense)
    train_data.gen_multinomial_dist(y_vocab.size())
    return train_data, test_data, x_vocab, y_vocab
Example #5
def do_train(args):
    if not os.path.exists(args.name + '.token'):
        logging.error('missing {} file (run preprocess mode)'.format(args.name + '.token'))
        sys.exit()
    if not os.path.exists(args.name + '.vocab'):
        logging.error('missing {} file (run preprocess mode)'.format(args.name + '.vocab'))
        sys.exit()

    token = OpenNMTTokenizer(args.name + '.token')
    vocab = Vocab()
    vocab.read(args.name + '.vocab')
    if os.path.exists(args.name + '.param'):
        args.embedding_size, args.pooling = read_params(args)
    else:
        write_params(args)        

    model = Word2Vec(len(vocab), args.embedding_size, args.pooling, vocab.idx_unk)
    if args.cuda:
        model.cuda()
#    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=(args.beta1,args.beta2), eps=args.eps)
#    optimizer = torch.optim.SGD(model.parameters(), lr=0.05)
    optimizer = torch.optim.AdamW(model.parameters(), lr=args.learning_rate, betas=(args.beta1, args.beta2), eps=args.eps, weight_decay=0.01, amsgrad=False)
    n_steps, model, optimizer = load_model_optim(args.name, args.embedding_size, vocab, model, optimizer)
    dataset = Dataset(args, token, vocab, args.method)

    n_epochs = 0
    losses = []
    while True:
        n_epochs += 1
        for batch in dataset:
            model.train()
            if args.method == 'skipgram':
                loss = model.forward_skipgram(batch)
            elif args.method == 'cbow':
                loss = model.forward_cbow(batch)
            elif args.method == 'sbow':
                loss = model.forward_sbow(batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            n_steps += 1
            losses.append(loss.data.cpu().detach().numpy())
            if n_steps % args.report_every_n_steps == 0:
                accum_loss = np.mean(losses)
                logging.info('{} n_epoch={} n_steps={} Loss={:.6f}'.format(args.method, n_epochs,n_steps,accum_loss))
                losses = []
            if n_steps % args.save_every_n_steps == 0:
                save_model_optim(args.name, model, optimizer, n_steps, args.keep_last_n)
        if n_epochs >= args.max_epochs:
            logging.info('Stop (max epochs reached)')
            break
    save_model_optim(args.name, model, optimizer, n_steps, args.keep_last_n)
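The load_model_optim and save_model_optim helpers used in Examples #1, #2 and #5 are not shown. Judging from the name.model.?????????.pth glob pattern and the keep_last_n argument, checkpoints appear to be tagged with a zero-padded step count and rotated; the sketch below only illustrates that idea under those assumptions and is not the project's actual implementation:

import glob
import os
import torch

def save_model_optim(name, model, optimizer, n_steps, keep_last_n):
    # illustrative sketch only -- assumed behaviour, not the original helper
    # write a checkpoint tagged with the zero-padded step count (9 digits, matching the glob)
    path = '{}.model.{:09d}.pth'.format(name, n_steps)
    torch.save({'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'n_steps': n_steps}, path)
    # keep only the most recent keep_last_n checkpoints
    for old in sorted(glob.glob(name + '.model.?????????.pth'))[:-keep_last_n]:
        os.remove(old)

def load_model_optim(name, embedding_size, vocab, model, optimizer):
    # illustrative sketch only; embedding_size and vocab are kept for interface
    # compatibility -- the real helper may use them to validate the checkpoint
    files = sorted(glob.glob(name + '.model.?????????.pth'))
    if not files:
        return 0, model, optimizer
    checkpoint = torch.load(files[-1], map_location='cpu')
    model.load_state_dict(checkpoint['model'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    return checkpoint['n_steps'], model, optimizer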
Example #6
    def inference(self):
        self.dropout = 0.0
        self.seq_size = 0
        if not self.epoch:
            for e in range(999, 0, -1):
                if os.path.exists(self.mdir+"/epoch{}.index".format(e)):
                    self.epoch = e
                    break
            if not self.epoch:
                sys.stderr.write("error: Cannot find epoch in mdir '{}'\n{}".format(self.mdir, self.usage))
                sys.exit(1)
        check_dataset(self.tst)
        if self.output == '-':
            self.output = sys.stdout
        else:
            self.output = open(self.output, "wb")
        if not os.path.exists('{}/epoch{}.index'.format(self.mdir, self.epoch)):
            sys.stderr.write('error: -epoch file {}/epoch{}.index cannot be found\n'.format(self.mdir, self.epoch))
            sys.exit(1)
        if not os.path.exists(self.mdir + '/topology'):
            sys.stderr.write('error: topology file: {} cannot be found\n'.format(self.mdir + '/topology'))
            sys.exit(1)
        src_voc = 'vocab_src'
        tgt_voc = 'vocab_tgt'
        if os.path.exists(self.mdir + '/tokenization_src.json'):
            with open(self.mdir + '/tokenization_src.json') as jsonfile:
                self.tok_src = json.load(jsonfile)
            src_voc = self.tok_src["vocabulary"]
        else:
            self.tok_src = None
        if not os.path.exists(self.mdir + '/' + src_voc):
            sys.stderr.write('error: vocab src file: {} cannot be found\n'.format(self.mdir + '/' + src_voc))
            sys.exit(1)
        if os.path.exists(self.mdir + '/tokenization_tgt.json'):
            with open(self.mdir + '/tokenization_tgt.json') as jsonfile:
                self.tok_tgt = json.load(jsonfile)
            tgt_voc = self.tok_tgt["vocabulary"]
        else:
            self.tok_tgt = None
        argv = []
        with open(self.mdir + "/topology", 'r') as f:
            for line in f:
                opt, val = line.split()
                argv.append('-'+opt)
                argv.append(val)
        # overrides options passed in command line
        self.parse(argv)

        # read vocabularies
        self.voc_src = Vocab(self.mdir + "/" + src_voc)
        self.voc_tgt = Vocab(self.mdir + "/" + tgt_voc)
        return
Example #7
    def __init__(self, args):
        # get the dir with pre-trained model

        load_dir = os.path.join(args.experiment_dir, args.old_model_dir)

        # initialize, and load vocab
        self.vocab = Vocab()
        vocab_filename = os.path.join(load_dir, "vocab.json")
        self.vocab.load_from_dict(vocab_filename)

        # load configuration
        with open(os.path.join(load_dir, "config.json"), "r") as f:
            config = json.load(f)

        args.response_len = config["response_len"]
        args.history_len = config["history_len"]

        # initialize an empty dataset. used to get input features
        self.dataset = DialogueDataset(None,
                                       history_len=config["history_len"],
                                       response_len=config["response_len"],
                                       vocab=self.vocab,
                                       update_vocab=False)

        # set device
        self.device = torch.device(args.device)

        # initialize model
        model = Transformer(config["vocab_size"],
                            config["vocab_size"],
                            config["history_len"],
                            config["response_len"],
                            d_word_vec=config["embedding_dim"],
                            d_model=config["model_dim"],
                            d_inner=config["inner_dim"],
                            n_layers=config["num_layers"],
                            n_head=config["num_heads"],
                            d_k=config["dim_k"],
                            d_v=config["dim_v"],
                            dropout=config["dropout"],
                            pretrained_embeddings=None).to(self.device)

        # load checkpoint
        checkpoint = torch.load(os.path.join(load_dir, args.old_model_name),
                                map_location=self.device)
        model.load_state_dict(checkpoint['model'])

        # create chatbot
        self.chatbot = Chatbot(args, model)

        self.args = args
Example #8
 def write_error_results(self, opt):
     model_a_name, model_b_name = opt['cmp_model_type_list']
     model_a_file = self.instance_analysis_path + model_a_name + '.txt'
     model_b_file = self.instance_analysis_path + model_b_name + '.txt'
     model_a_indicator = pd.read_table(model_a_file, header=None).iloc[:, 0]
     model_b_indicator = pd.read_table(model_b_file, header=None).iloc[:, 0]
     test_instances = pd.read_table(self.test_data_file,
                                    header=None).to_numpy()
     vocab = Vocab(self.x_vocab_file, n_sense=1, id_offset=0)
     instances = self.select_better_instances(test_instances,
                                              model_a_indicator,
                                              model_b_indicator)
     # where model b makes correct predictions but model a does not
     output_file = self.instance_analysis_path + model_a_name + '-0-' + model_b_name + '-1-.txt'
     self.write_model_instances(instances, vocab, output_file)
     instances = self.select_better_instances(test_instances,
                                              model_b_indicator,
                                              model_a_indicator)
     # where model a makes correct predictions but model b does not
     output_file = self.instance_analysis_path + model_a_name + '-1-' + model_b_name + '-0-.txt'
     self.write_model_instances(instances, vocab, output_file)
     # where both model a and b make correct predictions
     instances = self.select_equal_instances(test_instances,
                                             model_a_indicator,
                                             model_b_indicator, 1)
     output_file = self.instance_analysis_path + model_a_name + '-1-' + model_b_name + '-1-.txt'
     self.write_model_instances(instances, vocab, output_file)
     # where both model a and b make wrong predictions
     instances = self.select_equal_instances(test_instances,
                                             model_a_indicator,
                                             model_b_indicator, 0)
     output_file = self.instance_analysis_path + model_a_name + '-0-' + model_b_name + '-0-.txt'
     self.write_model_instances(instances, vocab, output_file)
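select_better_instances and select_equal_instances are not shown in Example #8. Going by the comments, they filter the test instances using the per-instance 0/1 correctness indicators of the two models; a standalone sketch of that filtering (an assumption, not the original methods) could be:

import numpy as np

def select_better_instances(test_instances, wrong_indicator, right_indicator):
    # illustrative sketch only: rows where the first model is wrong (0) and the second is right (1)
    mask = (np.asarray(wrong_indicator) == 0) & (np.asarray(right_indicator) == 1)
    return test_instances[mask]

def select_equal_instances(test_instances, indicator_a, indicator_b, value):
    # illustrative sketch only: rows where both models have the same 0/1 outcome
    mask = (np.asarray(indicator_a) == value) & (np.asarray(indicator_b) == value)
    return test_instances[mask]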
Example #9
    def inference(self):
        self.dropout = 0.0
        self.seq_size = 0
        if not self.epoch:
            sys.stderr.write("error: Missing -epoch option\n{}".format(
                self.usage))
            sys.exit()
        if not os.path.exists(self.tst):
            sys.stderr.write('error: -tst file {} cannot be found\n'.format(
                self.tst))
            sys.exit()
        if not os.path.exists(self.mdir + '/epoch' + self.epoch + '.index'):
            sys.stderr.write(
                'error: -epoch file {} cannot be found\n'.format(self.mdir +
                                                                '/epoch' +
                                                                self.epoch +
                                                                '.index'))
            sys.exit()
        if not os.path.exists(self.mdir + '/topology'):
            sys.stderr.write(
                'error: topology file: {} cannot be found\n'.format(
                    self.mdir + '/topology'))
            sys.exit()
        if not os.path.exists(self.mdir + '/vocab_src'):
            sys.stderr.write(
                'error: vocab_src file: {} cannot be found\n'.format(
                    self.mdir + '/vocab_src'))
            sys.exit()
        if not os.path.exists(self.mdir + '/vocab_tgt'):
            sys.stderr.write(
                'error: vocab_tgt file: {} cannot be found\n'.format(
                    self.mdir + '/vocab_tgt'))
            sys.exit()
        argv = []
        with open(self.mdir + "/topology", 'r') as f:
            for line in f:
                opt, val = line.split()
                argv.append('-' + opt)
                argv.append(val)
        self.parse(argv)  ### this overrides options passed in command line

        ### read vocabularies
        self.voc_src = Vocab(self.mdir + "/vocab_src")
        self.voc_tgt = Vocab(self.mdir + "/vocab_tgt")
        return
Example #10
    def __init__(self, model_file_path):

        model_name = os.path.basename(model_file_path)
        self._test_dir = os.path.join(config.log_root,
                                      'decode_%s' % (model_name))
        self._rouge_ref_dir = os.path.join(self._test_dir, 'rouge_ref')
        self._rouge_dec_dir = os.path.join(self._test_dir, 'rouge_dec')
        for p in [self._test_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
            if not os.path.exists(p):
                os.mkdir(p)

        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.decode_data_path,
                               self.vocab,
                               mode='decode',
                               batch_size=config.beam_size,
                               single_pass=True)
        time.sleep(15)

        self.model = Model(model_file_path, is_eval=True)
Example #11
    def sampling_decode(self, vocab: Vocab, example: LMExample,
                        begin_symbol: int = 2, end_symbol: int = 5,
                        initial_hidden: Optional[HiddenState] = None, warm_up: Optional[int] = None,
                        max_length: int = 200, greedy: bool = False, topk: Optional[int] = None,
                        print_info: bool = True, color_outputs: bool = False, **_kwargs) \
            -> SampledOutput:
        tensor = functools.partial(sample_utils.tensor, device=self.device)
        sample = functools.partial(sample_utils.sample,
                                   greedy=greedy,
                                   topk=topk)

        self.eval()
        self.init_hidden(1, None)

        if warm_up is None:
            inputs = [begin_symbol]
            hidden = initial_hidden
            total_log_prob = 0.0
        else:
            inputs = list(vocab.numericalize(example.sentence[:warm_up]))
            total_log_prob, hidden = self.forward(tensor(inputs[:-1]),
                                                  target=tensor(inputs[1:]))
            total_log_prob = -torch.sum(total_log_prob).item() * (len(inputs) -
                                                                  1)

        while len(inputs) < max_length and inputs[-1] != end_symbol:
            # full copy of the forward pass, including dropouts, but they are not applied because the model is in .eval() mode.
            # Run LSTM over the word
            word_log_probs, new_hidden = self.forward(tensor(inputs[-1]),
                                                      hidden)
            word_id, word_log_prob = sample(word_log_probs)
            inputs.append(word_id)
            hidden = new_hidden
            total_log_prob += word_log_prob

        sample_loss = -total_log_prob / (len(inputs) - 1)
        if print_info:
            print(
                f"Sample loss: {sample_loss:.3f}, PPL: {math.exp(sample_loss):.3f}"
            )

        # Format the output
        words = [vocab.i2w[token] for token in inputs]
        if color_outputs and warm_up is not None:
            words[:warm_up] = [
                Logging.color('yellow', w) for w in words[:warm_up]
            ]

        output = SampledOutput(sentence=words,
                               sample_loss=sample_loss,
                               complete_copies=0,
                               incomplete_copies=0)
        return output
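sample_utils.sample in Example #11 is not shown; it receives the word log-probabilities plus the greedy/topk options bound via functools.partial and returns a (word_id, word_log_prob) pair. A plausible standalone version, written here purely for illustration, is:

import torch

def sample(word_log_probs, greedy=False, topk=None):
    # illustrative sketch only; assumes word_log_probs is a 1-D tensor of
    # log-probabilities over the vocabulary
    if greedy:
        word_id = int(torch.argmax(word_log_probs))
        return word_id, float(word_log_probs[word_id])
    if topk is not None:
        # restrict sampling to the k most probable tokens
        values, indices = torch.topk(word_log_probs, topk)
        probs = torch.softmax(values, dim=0)
        word_id = int(indices[int(torch.multinomial(probs, 1))])
    else:
        probs = torch.softmax(word_log_probs, dim=0)
        word_id = int(torch.multinomial(probs, 1))
    return word_id, float(word_log_probs[word_id])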
Example #12
    def __init__(self, model_path):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.eval_data_path, self.vocab, mode='eval',
                               batch_size=config.batch_size, single_pass=True)
        time.sleep(15)
        model_name = os.path.basename(model_path)

        eval_dir = os.path.join(config.log_root, 'eval_%s' % (model_name))
        if not os.path.exists(eval_dir):
            os.mkdir(eval_dir)
        self.summary_writer = tf.summary.FileWriter(eval_dir)

        self.model = Model(model_path, is_eval=True)
Example #13
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(self.vocab,
                               config.train_data_path,
                               config.batch_size,
                               single_pass=False,
                               mode='train')
        time.sleep(10)

        train_dir = os.path.join(config.log_root,
                                 'train_%d' % (int(time.time())))
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)

        self.model_dir = os.path.join(train_dir, 'models')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

        self.summary_writer = tf.summary.FileWriter(train_dir)
Example #14
def main(train_data_path, test_data_path, answer_data_path, vocab_data_path,
         embedding_data_path, batch_size, learning_rate, hidden_size, margin,
         epoch, save_path, pretrained_path, use_cuda):
    # load qa data
    answer_data = AnswerData(answer_data_path)
    train_data = QaData(train_data_path)
    test_data = QaData(test_data_path)

    # load pretrained embedding
    pretrained_embedding = gensim.models.KeyedVectors.load_word2vec_format(
        embedding_data_path, binary=True)
    vocab = Vocab(vocab_data_path, answer_data.lexicon + train_data.lexicon)
    pretrained_weights = np.zeros((len(vocab) + 1, 300))  # TODO magic number
    for wid, surf in vocab.wid2surf.items():
        if surf in pretrained_embedding.vocab:
            pretrained_weights[wid] = pretrained_embedding.wv[surf]

    # create dataset / data loader
    train_dataset = InsuranceQaDataset(train_data, answer_data, vocab)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        shuffle=True,
        batch_size=batch_size,
        collate_fn=train_dataset.collate)
    test_dataset = InsuranceQaDataset(test_data, answer_data, vocab)

    # train model
    if pretrained_path is not None:
        model = torch.load(pretrained_path)['model']
    else:
        model = SentenceEncoder(pretrained_weights, hidden_size)
    optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
    criterion = QaLoss(margin=margin)
    if use_cuda:
        model = model.cuda()
    train(model, train_data_loader, test_dataset, optimizer, criterion, epoch,
          use_cuda)

    # save model
    torch.save({'model': model, 'vocab': vocab}, save_path)
Example #15
 def __init__(
     self,
     train_folder: str,
     test_folder: str,
     alpha: float = 0.01,
     beta: float = 1.0,
     predicted_poses: int = 20,
     previous_poses: int = 10,
     stride: int = None,
     batch_size: int = 50,
     with_context: bool = False,
     embedding: str = None,
     text_folder: str = None,
     *args,
     **kwargs
 ):
     super().__init__()
     self.save_hyperparameters()
     self.encoder = Encoder(26, 150, 2, with_context)
     self.decoder = Decoder(45, 150, 300, max_gen=predicted_poses)
     self.predicted_poses = predicted_poses
     self.previous_poses = previous_poses
     self.loss = MSELoss()
     self.train_folder = train_folder
     self.test_folder = test_folder
     self.alpha = alpha
     self.beta = beta
     self.stride = predicted_poses if stride is None else stride
     self.batch_size = batch_size
     self.with_context = with_context
     if embedding is not None:
         self.vocab = Vocab(embedding)
         self.word_embedder = nn.Embedding(len(self.vocab.token_to_idx), len(self.vocab.weights[0]),
                                           _weight=torch.FloatTensor(self.vocab.weights))
         self.word_encoder = nn.GRU(len(self.vocab.weights[0]), 100, bidirectional=True)
     else:
         self.vocab = None
     self.text_folder = text_folder
Example #16
def do_preprocess(args):

    if args.tok_conf is None:
        opts = {}
        opts['mode'] = 'space'
        with open(args.name + '.token', 'w') as yamlfile:
            _ = yaml.dump(opts, yamlfile)
    else:
        with open(args.tok_conf) as yamlfile: 
            opts = yaml.load(yamlfile, Loader=yaml.FullLoader)
            #cp bpe args.name+'.bpe'
            #replace in opts the bpe path
            with open(args.name + '.token', 'w') as ofile:
                yaml.dump(opts, ofile)
    logging.info('built tokenizer config file')

    token = OpenNMTTokenizer(args.name + '.token')
    vocab = Vocab()
    vocab.build(args.data,token,min_freq=args.voc_minf,max_size=args.voc_maxs)
    vocab.dump(args.name + '.vocab')
    logging.info('built vocab')
Example #17
        mask = np.triu(np.ones(shape), k=1).astype('uint8')
        mask = torch.from_numpy(mask) == 0
        return mask


### for test ###
if __name__ == '__main__':
    parser = argparse.ArgumentParser(prog=sys.argv[0],
                                     usage='python3 {}'.format(sys.argv[0]),
                                     add_help=True)
    parser.add_argument('-v', '--vocab', help='vocab file', required=True)
    parser.add_argument('-i', '--input', help='input file', required=True)
    parser.add_argument('-t', '--target', help='target file', required=True)
    args = parser.parse_args()

    vocab = Vocab(args.vocab)
    dataset = AspecDataset(args.input, args.target, vocab)
    loader = DataLoader(dataset,
                        collate_fn=MiniBatchProcess(),
                        shuffle=False,
                        batch_size=50)

    for i, batch in enumerate(loader):
        # test1
        #if i>=2: break
        #print('--- batch_idx={} ---'.format(i))
        #print('id: {}'.format(batch[0]))
        #print('input: {}\n{}'.format(batch[1].shape, batch[1]))
        #print('input mask: {}\n{}'.format(batch[2].shape, batch[2]))
        #print('target: {}\n{}'.format(batch[3].shape, batch[3]))
        #print('target mask: {}\n{}'.format(batch[4].shape, batch[4]))
        pass  # loop body is commented out; pass keeps the test loop syntactically valid
Example #18
    def __init__(self, args):

        # set up output directory
        self.output_dir = os.path.join(args.experiment_dir, args.run_name)
        if not os.path.exists(args.experiment_dir):
            os.mkdir(args.experiment_dir)
        if not os.path.exists(self.output_dir):
            os.mkdir(self.output_dir)
        if not os.path.exists(os.path.join(args.experiment_dir,"runs/")):
            os.mkdir(os.path.join(args.experiment_dir,"runs/"))

        # initialize tensorboard writer
        self.runs_dir = os.path.join(args.experiment_dir,"runs/",args.run_name)
        self.writer = SummaryWriter(self.runs_dir)

        # initialize global steps
        self.train_gs = 0
        self.val_gs = 0

        # initialize model config
        self.config = ModelConfig(args)

        # check if there is a model to load
        if args.old_model_dir is not None:
            self.use_old_model = True
            self.load_dir = args.old_model_dir
            self.config.load_from_file(
                os.path.join(self.load_dir, "config.json"))

            # create vocab
            self.vocab = Vocab()
            self.vocab.load_from_dict(os.path.join(self.load_dir, "vocab.json"))
            self.update_vocab = False
            self.config.min_count=1
        else:
            self.use_old_model = False

            self.vocab = None
            self.update_vocab = True

        # create data sets
        self.dataset_filename = args.dataset_filename

        # train
        self.train_dataset = DialogueDataset(
            os.path.join(self.dataset_filename, "train_data.json"),
            self.config.sentence_len,
            self.vocab,
            self.update_vocab)
        self.data_loader_train = torch.utils.data.DataLoader(
            self.train_dataset, self.config.train_batch_size, shuffle=True)
        self.config.train_len = len(self.train_dataset)

        self.vocab = self.train_dataset.vocab

        # eval
        self.val_dataset = DialogueDataset(
            os.path.join(self.dataset_filename, "val_data.json"),
            self.config.sentence_len,
            self.vocab,
            self.update_vocab)
        self.data_loader_val = torch.utils.data.DataLoader(
            self.val_dataset, self.config.val_batch_size, shuffle=True)
        self.config.val_len = len(self.val_dataset)

        # update, and save vocab
        self.vocab = self.val_dataset.vocab
        self.train_dataset.vocab = self.vocab
        if (self.config.min_count > 1):
            self.config.old_vocab_size = len(self.vocab)
            self.vocab.prune_vocab(self.config.min_count)
        self.vocab.save_to_dict(os.path.join(self.output_dir, "vocab.json"))
        self.vocab_size = len(self.vocab)
        self.config.vocab_size = self.vocab_size

        # load embeddings
        if self.config.pretrained_embeddings_dir is not None:
            pretrained_embeddings = get_pretrained_embeddings(self.config.pretrained_embeddings_dir, self.vocab)
        else:
            pretrained_embeddings = None

        # print and save the config file
        self.config.print_config(self.writer)
        self.config.save_config(os.path.join(self.output_dir, "config.json"))

        # set device
        self.device = torch.device('cuda')

        # create model
        self.model = Transformer(
            self.config.vocab_size,
            self.config.label_len,
            self.config.sentence_len,
            d_word_vec=self.config.embedding_dim,
            d_model=self.config.model_dim,
            d_inner=self.config.inner_dim,
            n_layers=self.config.num_layers,
            n_head=self.config.num_heads,
            d_k=self.config.dim_k,
            d_v=self.config.dim_v,
            dropout=self.config.dropout,
            pretrained_embeddings=pretrained_embeddings
        ).to(self.device)

        # create optimizer
        self.optimizer = torch.optim.Adam(
            filter(lambda x: x.requires_grad, self.model.parameters()),
            betas=(0.9, 0.98), eps=1e-09)

        # load old model, optimizer if there is one
        if self.use_old_model:
            self.model, self.optimizer = load_checkpoint(
                os.path.join(self.load_dir, "model.bin"),
                self.model, self.optimizer, self.device)


        # create a scheduled optimizer object
        self.optimizer = ScheduledOptim(
            self.optimizer, self.config.model_dim, self.config.warmup_steps)
Example #19
class ModelOperator:
    def __init__(self, args):

        # set up output directory
        self.output_dir = os.path.join(args.experiment_dir, args.run_name)
        if not os.path.exists(args.experiment_dir):
            os.mkdir(args.experiment_dir)
        if not os.path.exists(self.output_dir):
            os.mkdir(self.output_dir)
        if not os.path.exists(os.path.join(args.experiment_dir,"runs/")):
            os.mkdir(os.path.join(args.experiment_dir,"runs/"))

        # initialize tensorboard writer
        self.runs_dir = os.path.join(args.experiment_dir,"runs/",args.run_name)
        self.writer = SummaryWriter(self.runs_dir)

        # initialize global steps
        self.train_gs = 0
        self.val_gs = 0

        # initialize model config
        self.config = ModelConfig(args)

        # check if there is a model to load
        if args.old_model_dir is not None:
            self.use_old_model = True
            self.load_dir = args.old_model_dir
            self.config.load_from_file(
                os.path.join(self.load_dir, "config.json"))

            # create vocab
            self.vocab = Vocab()
            self.vocab.load_from_dict(os.path.join(self.load_dir, "vocab.json"))
            self.update_vocab = False
            self.config.min_count=1
        else:
            self.use_old_model = False

            self.vocab = None
            self.update_vocab = True

        # create data sets
        self.dataset_filename = args.dataset_filename

        # train
        self.train_dataset = DialogueDataset(
            os.path.join(self.dataset_filename, "train_data.json"),
            self.config.sentence_len,
            self.vocab,
            self.update_vocab)
        self.data_loader_train = torch.utils.data.DataLoader(
            self.train_dataset, self.config.train_batch_size, shuffle=True)
        self.config.train_len = len(self.train_dataset)

        self.vocab = self.train_dataset.vocab

        # eval
        self.val_dataset = DialogueDataset(
            os.path.join(self.dataset_filename, "val_data.json"),
            self.config.sentence_len,
            self.vocab,
            self.update_vocab)
        self.data_loader_val = torch.utils.data.DataLoader(
            self.val_dataset, self.config.val_batch_size, shuffle=True)
        self.config.val_len = len(self.val_dataset)

        # update, and save vocab
        self.vocab = self.val_dataset.vocab
        self.train_dataset.vocab = self.vocab
        if (self.config.min_count > 1):
            self.config.old_vocab_size = len(self.vocab)
            self.vocab.prune_vocab(self.config.min_count)
        self.vocab.save_to_dict(os.path.join(self.output_dir, "vocab.json"))
        self.vocab_size = len(self.vocab)
        self.config.vocab_size = self.vocab_size

        # load embeddings
        if self.config.pretrained_embeddings_dir is not None:
            pretrained_embeddings = get_pretrained_embeddings(self.config.pretrained_embeddings_dir, self.vocab)
        else:
            pretrained_embeddings = None

        # print and save the config file
        self.config.print_config(self.writer)
        self.config.save_config(os.path.join(self.output_dir, "config.json"))

        # set device
        self.device = torch.device('cuda')

        # create model
        self.model = Transformer(
            self.config.vocab_size,
            self.config.label_len,
            self.config.sentence_len,
            d_word_vec=self.config.embedding_dim,
            d_model=self.config.model_dim,
            d_inner=self.config.inner_dim,
            n_layers=self.config.num_layers,
            n_head=self.config.num_heads,
            d_k=self.config.dim_k,
            d_v=self.config.dim_v,
            dropout=self.config.dropout,
            pretrained_embeddings=pretrained_embeddings
        ).to(self.device)

        # create optimizer
        self.optimizer = torch.optim.Adam(
            filter(lambda x: x.requires_grad, self.model.parameters()),
            betas=(0.9, 0.98), eps=1e-09)

        # load old model, optimizer if there is one
        if self.use_old_model:
            self.model, self.optimizer = load_checkpoint(
                os.path.join(self.load_dir, "model.bin"),
                self.model, self.optimizer, self.device)


        # create a scheduled optimizer object
        self.optimizer = ScheduledOptim(
            self.optimizer, self.config.model_dim, self.config.warmup_steps)

        #self.optimizer.optimizer.to(torch.device('cpu'))


    def train(self, num_epochs):
        metrics = {"best_epoch":0, "highest_f1":0}

        # output an example
        self.output_example(0)

        for epoch in range(num_epochs):
            #self.writer.add_graph(self.model)
            #self.writer.add_embedding(
            #    self.model.encoder.src_word_emb.weight, global_step=epoch)

            epoch_metrics = dict()

            # train
            epoch_metrics["train"] = self.execute_phase(epoch, "train")
            # save metrics
            metrics["epoch_{}".format(epoch)] = epoch_metrics
            with open(os.path.join(self.output_dir, "metrics.json"), "w") as f:
                json.dump(metrics, f, indent=4)

            # validate
            epoch_metrics["val"] = self.execute_phase(epoch, "val")
            # save metrics
            metrics["epoch_{}".format(epoch)] = epoch_metrics
            with open(os.path.join(self.output_dir, "metrics.json"), "w") as f:
                json.dump(metrics, f, indent=4)

            # save checkpoint
            #TODO: fix this b
            if epoch_metrics["val"]["avg_results"]["F1"] > metrics["highest_f1"]:
            #if epoch_metrics["train"]["loss"] < metrics["lowest_loss"]:
            #if epoch % 100 == 0:
                self.save_checkpoint(os.path.join(self.output_dir, "model.bin"))
                metrics["lowest_f1"] = epoch_metrics["val"]["avg_results"]["F1"]
                metrics["best_epoch"] = epoch

                test_results = self.get_test_predictions(
                    os.path.join(self.dataset_filename, "test_data.json"),
                    os.path.join(self.output_dir, "predictions{}.json".format(epoch)))

            # record metrics to tensorboard
            self.writer.add_scalar("training loss total",
                epoch_metrics["train"]["loss"], global_step=epoch)
            self.writer.add_scalar("val loss total",
                epoch_metrics["val"]["loss"], global_step=epoch)


            self.writer.add_scalar("training time",
                epoch_metrics["train"]["time_taken"], global_step=epoch)
            self.writer.add_scalar("val time",
                epoch_metrics["val"]["time_taken"], global_step=epoch)

            self.writer.add_scalars("train_results", epoch_metrics["train"]["avg_results"], global_step=epoch)
            self.writer.add_scalars("val_results", epoch_metrics["val"]["avg_results"],
                                    global_step=epoch)
            # output an example
            self.output_example(epoch+1)

        self.writer.close()

    def execute_phase(self, epoch, phase):
        if phase == "train":
            self.model.train()
            dataloader = self.data_loader_train
            batch_size = self.config.train_batch_size
            train = True
        else:
            self.model.eval()
            dataloader = self.data_loader_val
            batch_size = self.config.val_batch_size
            train = False

        start = time.clock()
        phase_metrics = dict()
        epoch_loss = list()
        epoch_metrics = list()
        results = {"accuracy": list(), "precision": list(), "recall": list(), "F1": list()}

        average_epoch_loss = None
        for i, batch in enumerate(tqdm(dataloader,
                          mininterval=2, desc=phase, leave=False)):
            # prepare data
            src_seq, src_pos, src_seg, tgt= map(
                lambda x: x.to(self.device), batch[:4])

            ids = batch[4]
            start_end_idx = batch[5]

            # forward
            if train:
                self.optimizer.zero_grad()
            pred = self.model(src_seq, src_pos, src_seg, tgt)

            loss = F.cross_entropy(self.prepare_pred(pred).view(-1, 2), tgt.view(-1))

            average_loss = float(loss)
            epoch_loss.append(average_loss)
            average_epoch_loss = np.mean(epoch_loss)

            if train:
                self.writer.add_scalar("train_loss",
                    average_loss, global_step=i + epoch * self.config.train_batch_size)
                # backward
                loss.backward()

                # update parameters
                self.optimizer.step_and_update_lr()
            output = torch.argmax(self.prepare_pred(pred), 3)
            get_results(tgt.view(-1).cpu(), output.view(-1).cpu(), results)

        phase_metrics["avg_results"] = {key: np.mean(value) for key, value in results.items()}
        phase_metrics["loss"] = average_epoch_loss

        phase_metrics["time_taken"] = time.clock() - start
        string = ' {} loss: {:.3f} '.format(phase, average_epoch_loss)
        print(string, end='\n')
        return phase_metrics

    def get_test_predictions(self, test_filename, save_filename):
        test_dataset = DialogueDataset(
            test_filename,
            self.config.sentence_len,
            self.vocab,
            False)

        test_data_loader = torch.utils.data.DataLoader(
            test_dataset, self.config.val_batch_size, shuffle=True)

        with open(test_filename, 'r') as f:
            data = json.load(f)

        start = time.clock()
        phase_metrics = dict()
        epoch_loss = list()
        epoch_metrics = list()
        results = {"accuracy": list(), "precision": list(), "recall": list(),
                   "F1": list()}
        average_epoch_loss = None
        for i, batch in enumerate(tqdm(test_data_loader,
                                       mininterval=2, desc='test', leave=False)):
            # prepare data
            src_seq, src_pos, src_seg, tgt = map(
                lambda x: x.to(self.device), batch[:4])

            ids = batch[4]
            start_end_idx = batch[5]

            # forward
            pred = self.model(src_seq, src_pos, src_seg, tgt)

            loss = F.cross_entropy(self.prepare_pred(pred).view(-1, 2),
                                   tgt.view(-1))

            average_loss = float(loss)
            epoch_loss.append(average_loss)
            average_epoch_loss = np.mean(epoch_loss)

            output = torch.argmax(self.prepare_pred(pred), 3)

            record_predictions(output, data, ids, start_end_idx)

            get_results(tgt.view(-1).cpu(), output.view(-1).cpu(), results)

        phase_metrics["avg_results"] = {key: np.mean(value) for key, value in
                                        results.items()}
        phase_metrics["loss"] = average_epoch_loss

        phase_metrics["time_taken"] = time.clock() - start
        string = ' {} loss: {:.3f} '.format('test', average_epoch_loss)
        print(string, end='\n')

        data["results"] = phase_metrics

        with open(save_filename, 'w') as f:
            json.dump(data, f)

        return phase_metrics



    def save_checkpoint(self, filename):
        state = {
            'model': self.model.state_dict(),
            'optimizer': self.optimizer.optimizer.state_dict()
        }
        torch.save(state, filename)

    def output_example(self, epoch):
        random_index = random.randint(0, len(self.val_dataset) - 1)
        example = self.val_dataset[random_index]

        # prepare data
        src_seq, src_pos, src_seg, tgt_seq = map(
            lambda x: torch.from_numpy(x).to(self.device).unsqueeze(0), example[:4])

        # take out first token from target for some reason
        gold = tgt_seq[:, 1:]

        # forward
        pred = self.model(src_seq, src_pos, src_seg, tgt_seq)
        output = self.prepare_pred(pred).squeeze(0)

        words = src_seq.tolist()[0]
        target_strings = labels_2_mention_str(tgt_seq.squeeze(0))
        output_strings = labels_2_mention_str(torch.argmax(output, dim=2))

        # get history text
        string = "word: output - target\n"

        for word, t, o in zip(words, target_strings, output_strings):
            token = self.vocab.id2token[word]
            if token != "<blank>":
                string += "[{}: {} - {}], \n".format(token, o, t)

        # print
        print("\n------------------------\n")
        print(string)
        print("\n------------------------\n")

        # add result to tensorboard
        self.writer.add_text("example_output", string, global_step=epoch)
        self.writer.add_histogram("example_vocab_ranking", pred, global_step=epoch)
        self.writer.add_histogram("example_vocab_choice", output,global_step=epoch)

    def prepare_pred(self, pred):
        temp = pred
        pred = pred.view(-1)
        size = pred.size()
        nullclass = torch.ones(size, dtype=pred.dtype, device=self.device)
        nullclass -= pred
        pred = torch.stack((nullclass, pred), 1).view(-1,
                                                       self.config.sentence_len,
                                                       self.config.label_len,
                                                       2)
        return pred
Example #20
class BeamSearch(object):
    def __init__(self, model_file_path):

        model_name = os.path.basename(model_file_path)
        self._test_dir = os.path.join(config.log_root,
                                      'decode_%s' % (model_name))
        self._rouge_ref_dir = os.path.join(self._test_dir, 'rouge_ref')
        self._rouge_dec_dir = os.path.join(self._test_dir, 'rouge_dec')
        for p in [self._test_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
            if not os.path.exists(p):
                os.mkdir(p)

        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.decode_data_path,
                               self.vocab,
                               mode='decode',
                               batch_size=config.beam_size,
                               single_pass=True)
        time.sleep(15)

        self.model = Model(model_file_path, is_eval=True)

    def sort_beams(self, beams):
        return sorted(beams, key=lambda h: h.avg_log_prob, reverse=True)

    def beam_search(self, batch):
        # single example repeated across the batch
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t, coverage = \
            get_input_from_batch(batch, use_cuda)

        enc_out, enc_fea, enc_h = self.model.encoder(enc_batch, enc_lens)
        s_t = self.model.reduce_state(enc_h)

        dec_h, dec_c = s_t  # b x hidden_dim
        dec_h = dec_h.squeeze()
        dec_c = dec_c.squeeze()

        # decoder batch preparation: it starts with beam_size hypotheses that are initially identical (everything is repeated)
        beams = [
            Beam(tokens=[self.vocab.word2id(config.BOS_TOKEN)],
                 log_probs=[0.0],
                 state=(dec_h[0], dec_c[0]),
                 context=c_t[0],
                 coverage=(coverage[0] if config.is_coverage else None))
            for _ in range(config.beam_size)
        ]

        steps = 0
        results = []
        while steps < config.max_dec_steps and len(results) < config.beam_size:
            latest_tokens = [h.latest_token for h in beams]
            latest_tokens = [t if t < self.vocab.size() else self.vocab.word2id(config.UNK_TOKEN) \
                             for t in latest_tokens]
            y_t = Variable(torch.LongTensor(latest_tokens))
            if use_cuda:
                y_t = y_t.cuda()
            all_state_h = [h.state[0] for h in beams]
            all_state_c = [h.state[1] for h in beams]
            all_context = [h.context for h in beams]

            s_t = (torch.stack(all_state_h,
                               0).unsqueeze(0), torch.stack(all_state_c,
                                                            0).unsqueeze(0))
            c_t = torch.stack(all_context, 0)

            coverage_t = None
            if config.is_coverage:
                all_coverage = [h.coverage for h in beams]
                coverage_t = torch.stack(all_coverage, 0)

            final_dist, s_t, c_t, attn_dist, p_gen, coverage_t = self.model.decoder(
                y_t, s_t, enc_out, enc_fea, enc_padding_mask, c_t, extra_zeros,
                enc_batch_extend_vocab, coverage_t, steps)
            log_probs = torch.log(final_dist)
            topk_log_probs, topk_ids = torch.topk(log_probs,
                                                  config.beam_size * 2)

            dec_h, dec_c = s_t
            dec_h = dec_h.squeeze()
            dec_c = dec_c.squeeze()

            all_beams = []
            # On the first step, we only had one original hypothesis (the initial hypothesis). On subsequent steps, all original hypotheses are distinct.
            num_orig_beams = 1 if steps == 0 else len(beams)
            for i in range(num_orig_beams):
                h = beams[i]
                state_i = (dec_h[i], dec_c[i])
                context_i = c_t[i]
                coverage_i = (coverage[i] if config.is_coverage else None)

                for j in range(config.beam_size *
                               2):  # for each of the top 2*beam_size hyps:
                    new_beam = h.extend(token=topk_ids[i, j].item(),
                                        log_prob=topk_log_probs[i, j].item(),
                                        state=state_i,
                                        context=context_i,
                                        coverage=coverage_i)
                    all_beams.append(new_beam)

            beams = []
            for h in self.sort_beams(all_beams):
                if h.latest_token == self.vocab.word2id(config.EOS_TOKEN):
                    if steps >= config.min_dec_steps:
                        results.append(h)
                else:
                    beams.append(h)
                if len(beams) == config.beam_size or len(
                        results) == config.beam_size:
                    break

            steps += 1

        if len(results) == 0:
            results = beams

        beams_sorted = self.sort_beams(results)

        return beams_sorted[0]

    def run(self):

        counter = 0
        start = time.time()
        batch = self.batcher.next_batch()
        while batch is not None:
            # Run beam search to get best Hypothesis
            best_summary = self.beam_search(batch)

            # Extract the output ids from the hypothesis and convert back to words
            output_ids = [int(t) for t in best_summary.tokens[1:]]
            decoded_words = utils.outputids2words(
                output_ids, self.vocab,
                (batch.art_oovs[0] if config.pointer_gen else None))

            # Remove the [STOP] token from decoded_words, if necessary
            try:
                fst_stop_idx = decoded_words.index(dataset.EOS_TOKEN)
                decoded_words = decoded_words[:fst_stop_idx]
            except ValueError:
                decoded_words = decoded_words

            original_abstract_sents = batch.original_abstracts_sents[0]

            write_for_rouge(original_abstract_sents, decoded_words, counter,
                            self._rouge_ref_dir, self._rouge_dec_dir)
            counter += 1
            if counter % 1000 == 0:
                print('%d example in %d sec' % (counter, time.time() - start))
                start = time.time()

            batch = self.batcher.next_batch()

        print("Decoder has finished reading dataset for single_pass.")
        print("Now starting ROUGE eval...")
        results_dict = rouge_eval(self._rouge_ref_dir, self._rouge_dec_dir)
        rouge_log(results_dict, self._test_dir)
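The Beam hypothesis class used by beam_search above (tokens, log_probs, latest_token, avg_log_prob, extend) is not part of the snippet. A minimal sketch matching those calls, with the implementation assumed rather than taken from the project, is:

class Beam(object):
    # illustrative sketch only -- interface inferred from the calls in Example #20
    def __init__(self, tokens, log_probs, state, context, coverage):
        self.tokens = tokens        # token ids decoded so far
        self.log_probs = log_probs  # per-token log-probabilities
        self.state = state          # decoder (h, c) state
        self.context = context      # attention context vector
        self.coverage = coverage    # coverage vector, or None

    def extend(self, token, log_prob, state, context, coverage):
        # return a new hypothesis grown by one decoding step
        return Beam(tokens=self.tokens + [token],
                    log_probs=self.log_probs + [log_prob],
                    state=state,
                    context=context,
                    coverage=coverage)

    @property
    def latest_token(self):
        return self.tokens[-1]

    @property
    def avg_log_prob(self):
        return sum(self.log_probs) / len(self.tokens)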
Example #21
    model_dir = os.path.join(data_root_folder, 'models')
    mkdir_if_missing(model_dir)

    if mode == MODE_MRT:
        model_name = cur_cfg.name + '_mrt'
    elif mode == MODE_OBJ:
        model_name = cur_cfg.name + '_obj'
    else:
        model_name = cur_cfg.name

    model_path = os.path.join(model_dir, model_name + '.state')
    print('Model path:', model_path)

    jieba_base_v = Vocab(
        os.path.join(data_root_folder, 'vocab', 'title_summ.vocab.pkl'),
        os.path.join(data_root_folder, 'vocab', 'title_summ.emb.pkl'))

    jieba_sgns_v = Vocab(
        os.path.join(data_root_folder, 'vocab', 'title_summ.vocab.pkl'),
        os.path.join(data_root_folder, 'vocab', 'title_summ.emb.pkl'))
    jieba_flag_v = Vocab(
        os.path.join(data_root_folder, 'vocab', 'title_summ.vocab.pkl'),
        os.path.join(data_root_folder, 'vocab', 'title_summ.emb.pkl'))

    # jieba_sgns_v = Vocab(os.path.join(data_root_folder, 'vocab', 'useless.vocab.pkl'),
    #                      os.path.join(data_root_folder, 'vocab', 'useless.emb.pkl'))
    # jieba_flag_v = Vocab(os.path.join(data_root_folder, 'vocab', 'useless.vocab.pkl'),
    #                      os.path.join(data_root_folder, 'vocab', 'useless.emb.pkl'))

    trainset_roots = [os.path.join(data_root_folder, 'val.txt')]
Example #22
import os
import pickle
from models import VariationalModels
import re


def load_pickle(path):
    with open(path, 'rb') as f:
        return pickle.load(f)


if __name__ == '__main__':
    config = get_config(mode='test')

    print('Loading Vocabulary...')
    vocab = Vocab()
    vocab.load(config.word2id_path, config.id2word_path)
    print(f'Vocabulary size: {vocab.vocab_size}')

    config.vocab_size = vocab.vocab_size

    data_loader = get_loader(
        sentences=load_pickle(config.sentences_path),
        conversation_length=load_pickle(config.conversation_length_path),
        sentence_length=load_pickle(config.sentence_length_path),
        vocab=vocab,
        batch_size=config.batch_size,
        shuffle=False)

    if config.model in VariationalModels:
        solver = VariationalSolver(config, None, data_loader, vocab=vocab, is_train=False)
Example #23
File: train.py Project: eguilg/mrc
    if mode == MODE_MRT:
        model_name = cur_cfg.name + '_mrt'
    elif mode == MODE_OBJ:
        model_name = cur_cfg.name + '_obj'
    else:
        model_name = cur_cfg.name

    if switch:
        model_name += '_switch'
    if use_data1:
        model_name += '_full_data'
    model_path = os.path.join(model_dir, model_name + '.state')
    print('Model path:', model_path)

    jieba_base_v = Vocab('./data/embed/base_token_vocab_jieba.pkl',
                         './data/embed/base_token_embed_jieba.pkl')
    jieba_sgns_v = Vocab('./data/embed/train_sgns_vocab_jieba.pkl',
                         './data/embed/train_sgns_embed_jieba.pkl')
    jieba_flag_v = Vocab('./data/embed/base_flag_vocab_jieba.pkl',
                         './data/embed/base_flag_embed_jieba.pkl')

    if switch:
        pyltp_base_v = Vocab('./data/embed/base_token_vocab_pyltp.pkl',
                             './data/embed/base_token_embed_pyltp.pkl')
        pyltp_sgns_v = Vocab('./data/embed/train_sgns_vocab_pyltp.pkl',
                             './data/embed/train_sgns_embed_pyltp.pkl')
        pyltp_flag_v = Vocab('./data/embed/base_flag_vocab_pyltp.pkl',
                             './data/embed/base_flag_embed_pyltp.pkl')

        transform = MaiIndexTransform(jieba_base_v, jieba_sgns_v, jieba_flag_v,
                                      pyltp_base_v, pyltp_sgns_v, pyltp_flag_v)
Example #24
  val_file = os.path.join(data_root_folder, 'preprocessed', 'dev-%s.preprocessed.json' % version)

  model_dir = os.path.join(data_root_folder, 'models', version)
  mkdir_if_missing(model_dir)

  if mode == MODE_MRT:
    model_name = cur_cfg.name + '_mrt'
  elif mode == MODE_OBJ:
    model_name = cur_cfg.name + '_obj'
  else:
    model_name = cur_cfg.name

  model_path = os.path.join(model_dir, model_name + '.state')
  print('Model path:', model_path)

  jieba_base_v = Vocab(os.path.join(data_root_folder, 'vocab', 'squad-%s.vocab.pkl' % version),
                       os.path.join(data_root_folder, 'vocab', 'squad-%s.emb.pkl' % version))

  jieba_sgns_v = Vocab(os.path.join(data_root_folder, 'vocab', 'useless.vocab.pkl'),
                       os.path.join(data_root_folder, 'vocab', 'useless.emb.pkl'))
  jieba_flag_v = Vocab(os.path.join(data_root_folder, 'vocab', 'useless.vocab.pkl'),
                       os.path.join(data_root_folder, 'vocab', 'useless.emb.pkl'))

  trainset_roots = [
    os.path.join(data_root_folder, 'val.txt')
  ]

  embed_lists = {
    'jieba': [jieba_base_v.embeddings, jieba_sgns_v.embeddings, jieba_flag_v.embeddings],
    'pyltp': []
  }
Example #25
dev_src, dev_trg, dev_num, dev_src_max_len, dev_trg_max_len = process(
    dev_src_path, dev_trg_path)
test_src, test_trg, test_num, test_src_max_len, test_trg_max_len = process(
    test_src_path, test_trg_path)

log.write('train_num', train_num)
log.write('train_src_max_len', train_src_max_len)
log.write('train_trg_max_len', train_trg_max_len)
log.write('dev_num', dev_num)
log.write('dev_src_max_len', dev_src_max_len)
log.write('dev_trg_max_len', dev_trg_max_len)
log.write('test_num', test_num)
log.write('test_src_max_len', test_src_max_len)
log.write('test_trg_max_len', test_trg_max_len)

vocab = Vocab()

for i in range(train_num):
    vocab.add_list(train_src[i])
    vocab.add_list(train_trg[i])

for i in range(dev_num):
    vocab.add_list(dev_src[i])
    vocab.add_list(dev_trg[i])

for i in range(test_num):
    vocab.add_list(test_src[i])
    vocab.add_list(test_trg[i])

word2index, index2word = vocab.get_vocab(min_freq=4)
total_words = len(word2index)
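The Vocab in this snippet is frequency-based: add_list accumulates counts and get_vocab(min_freq=...) returns word2index/index2word maps with rare words dropped. A minimal sketch of that behaviour (special tokens and other details are assumptions) would be:

from collections import Counter

class Vocab:
    # illustrative sketch only -- not the project's implementation
    def __init__(self):
        self.counter = Counter()

    def add_list(self, tokens):
        # accumulate frequency counts for a list of tokens
        self.counter.update(tokens)

    def get_vocab(self, min_freq=1):
        # reserve ids for padding/unknown, then add words meeting the frequency cutoff
        word2index = {'<pad>': 0, '<unk>': 1}
        for word, freq in self.counter.most_common():
            if freq < min_freq:
                break
            word2index[word] = len(word2index)
        index2word = {index: word for word, index in word2index.items()}
        return word2index, index2word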
Example #26
def train():
    parser = argparse.ArgumentParser()

    parser.add_argument("-c",
                        "--dataset",
                        required=True,
                        type=str,
                        help="dataset")
    # parser.add_argument("-c", "--train_dataset", required=True,
    #                     type=str, help="train dataset for train bert")
    # parser.add_argument("-t", "--test_dataset", type=str,
    #                     default=None, help="test set for evaluate train set")
    # parser.add_argument("-v", "--vocab_path", required=True,
    #                     type=str, help="built vocab model path with bert-vocab")
    parser.add_argument("-o",
                        "--output_path",
                        required=True,
                        type=str,
                        help="ex)output/bert.model")

    parser.add_argument("-hs",
                        "--hidden",
                        type=int,
                        default=256,
                        help="hidden size of transformer model")
    parser.add_argument("-l",
                        "--layers",
                        type=int,
                        default=8,
                        help="number of layers")
    parser.add_argument("-a",
                        "--attn_heads",
                        type=int,
                        default=8,
                        help="number of attention heads")
    parser.add_argument("-s",
                        "--seq_len",
                        type=int,
                        default=64,
                        help="maximum sequence len")

    parser.add_argument("-b",
                        "--batch_size",
                        type=int,
                        default=64,
                        help="number of batch_size")
    parser.add_argument("-e",
                        "--epochs",
                        type=int,
                        default=10,
                        help="number of epochs")
    parser.add_argument("-w",
                        "--num_workers",
                        type=int,
                        default=5,
                        help="dataloader worker size")
    parser.add_argument("--duplicate",
                        type=int,
                        default=5,
                        help="dataloader worker size")

    parser.add_argument("--with_cuda",
                        type=bool,
                        default=True,
                        help="training with CUDA: true, or false")
    parser.add_argument("--log_freq",
                        type=int,
                        default=10,
                        help="printing loss every n iter: setting n")
    parser.add_argument("--corpus_lines",
                        type=int,
                        default=None,
                        help="total number of lines in corpus")
    parser.add_argument("--cuda_devices",
                        type=int,
                        nargs='+',
                        default=None,
                        help="CUDA device ids")
    parser.add_argument("--on_memory",
                        type=bool,
                        default=True,
                        help="Loading on memory: true or false")

    parser.add_argument("--lr",
                        type=float,
                        default=1e-3,
                        help="learning rate of adam")
    parser.add_argument("--adam_weight_decay",
                        type=float,
                        default=0.01,
                        help="weight_decay of adam")
    parser.add_argument("--adam_beta1",
                        type=float,
                        default=0.9,
                        help="adam first beta value")
    parser.add_argument("--adam_beta2",
                        type=float,
                        default=0.999,
                        help="adam first beta value")
    parser.add_argument("--dropout",
                        type=float,
                        default=0.2,
                        help="dropout value")

    args = parser.parse_args()

    print("Load Data", args.dataset)
    data_reader = DataReader(args.dataset, seq_len=args.seq_len)
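    # a second reader over the same graphs, shuffled and duplicated, supplies negative samples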
    neg_data_reader = DataReader(args.dataset,
                                 graphs=data_reader.graphs,
                                 shuffle=True,
                                 duplicate=args.duplicate,
                                 seq_len=args.seq_len)
    # print("Loading Vocab", args.vocab_path)
    print("Loading Vocab")
    vocab = Vocab(data_reader.graphs)
    # vocab = WordVocab.load_vocab(args.vocab_path)
    print("Vocab Size: ", len(vocab))

    print("Shuffle Data")
    'TODO'

    print("Loading Train Dataset", args.dataset)
    train_dataset = CustomBERTDataset(
        data_reader.graphs[:int(len(data_reader) * 0.8)],
        vocab,
        seq_len=args.seq_len,
        on_memory=args.on_memory,
        n_neg=args.duplicate)
    # pdb.set_trace()
    neg_train_dataset = CustomBERTDataset(
        neg_data_reader.graphs[:args.duplicate * len(train_dataset)],
        vocab,
        seq_len=args.seq_len,
        on_memory=args.on_memory,
        n_neg=args.duplicate)
    # pdb.set_trace()
    assert len(neg_train_dataset) == args.duplicate * len(train_dataset)
    # print("Loading Test Dataset", args.test_dataset)
    print("Loading Dev Dataset", args.dataset)
    test_dataset = CustomBERTDataset(
        data_reader.graphs[int(len(data_reader) * 0.8):],
        vocab,
        seq_len=args.seq_len,
        on_memory=args.on_memory,
        n_neg=args.duplicate)  # \
    neg_test_dataset = CustomBERTDataset(
        neg_data_reader.graphs[-args.duplicate * len(test_dataset):],
        vocab,
        seq_len=args.seq_len,
        on_memory=args.on_memory,
        n_neg=args.duplicate)  # \
    assert len(neg_test_dataset) == args.duplicate * len(test_dataset)
    # if args.test_dataset is not None else None
    # pdb.set_trace()
    print("Creating Dataloader")
    train_data_loader = DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   num_workers=args.num_workers,
                                   collate_fn=my_collate)
    neg_train_data_loader = DataLoader(neg_train_dataset,
                                       batch_size=args.batch_size *
                                       args.duplicate,
                                       num_workers=args.num_workers,
                                       collate_fn=my_collate)

    test_data_loader = DataLoader(test_dataset,
                                  batch_size=args.batch_size,
                                  num_workers=args.num_workers,
                                  collate_fn=my_collate)  # \
    neg_test_data_loader = DataLoader(neg_test_dataset,
                                      batch_size=args.batch_size *
                                      args.duplicate,
                                      num_workers=args.num_workers,
                                      collate_fn=my_collate)  # \
    # if test_dataset is not None else None
    # assert False
    print("Building BERT model")
    bert = BERT(len(vocab),
                hidden=args.hidden,
                n_layers=args.layers,
                attn_heads=args.attn_heads,
                dropout=args.dropout)

    print("Creating BERT Trainer")
    # trainer = BERTTrainer(bert, len(vocab), train_dataloader=train_data_loader, test_dataloader=test_data_loader,
    #                       lr=args.lr, betas=(
    #                           args.adam_beta1, args.adam_beta2), weight_decay=args.adam_weight_decay,
    #                       with_cuda=args.with_cuda, cuda_devices=args.cuda_devices, log_freq=args.log_freq, pad_index=vocab.pad_index)
    trainer = BERTTrainer(bert,
                          len(vocab),
                          train_dataloader=train_data_loader,
                          test_dataloader=test_data_loader,
                          lr=args.lr,
                          betas=(args.adam_beta1, args.adam_beta2),
                          weight_decay=args.adam_weight_decay,
                          with_cuda=args.with_cuda,
                          cuda_devices=args.cuda_devices,
                          log_freq=args.log_freq,
                          pad_index=vocab.pad_index)
    # raise NotImplementedError
    print("Training Start")
    best_loss = None
    for epoch in range(args.epochs):
        # test_loss = trainer.test(epoch)

        train_loss = trainer.train(epoch)
        torch.cuda.empty_cache()

        # if test_data_loader is not None:
        test_loss = trainer.test(epoch)
        if best_loss is None or test_loss < best_loss:
            best_loss = test_loss
            trainer.save(epoch, args.output_path)

        torch.cuda.empty_cache()
Example #27
0
 hidden_size = 128
 char_embedding_dim=128 # character-level embedding dim
 word_embedding_dim=50  # word-level embedding dim
 max_sent_length=35
 max_word_length=16
 kernel_n=3 # convolution kernel size
 padding=2 # padding size
 lr = 3e-3
 weight_decay = 1e-3 # weight decay coefficient
 gradient_clipping = 5 # gradient clipping threshold
 output_per_batchs = 1
 test_per_batchs = 5
 test_batchs = 1
 ITORS = 100
 # load the vocabularies
 vocab = Vocab(vocab_path)
 char_vocab = CharVocab(char_vocab_path)
 entity_vocab = EntityVocab(entity_vocab_path)
 # create the datasets
 train_data_set = DataSet(path=train_data_path,
                     vocab=vocab,
                     entity_vocab=entity_vocab,
                     entity_padding_len=max_sent_length)
 test_data_set = DataSet(path=test_data_path,
                     vocab=vocab,
                     entity_vocab=entity_vocab,
                     entity_padding_len=max_sent_length)
 # create the data loaders
 train_data_loader = DataLoader(train_data_set, shuffle=True, batch_size=BATCH_SIZE)
 test_data_loader = DataLoader(test_data_set, shuffle=True, batch_size=BATCH_SIZE)
 # whether to use the GPU
Example #28
0
batch_size = opt.batch_size

# read dataset
if os.path.exists('dataset.pickle'):
    with open('dataset.pickle', 'rb') as f:
        train_iter, dev_iter, test_iter, vocab = pickle.load(f)
else:
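    # no cache yet: build the vocab and SICK iterators from the token files, then pickle them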
    root_dir = opt.data
    segments = ['train', 'dev', 'test']
    token_files = [
        os.path.join(root_dir, seg, '%s.toks' % tok) for tok in ['a', 'b']
        for seg in segments
    ]

    vocab = Vocab(filepaths=token_files, embedpath=opt.word_embed)

    train_iter, dev_iter, test_iter = [
        SICKDataIter(os.path.join(root_dir, segment), vocab, num_classes)
        for segment in segments
    ]
    with open('dataset.pickle', 'wb') as f:
        pickle.dump([train_iter, dev_iter, test_iter, vocab], f)

logging.info('==> SICK vocabulary size : %d ' % vocab.size)
logging.info('==> Size of train data   : %d ' % len(train_iter))
logging.info('==> Size of dev data     : %d ' % len(dev_iter))
logging.info('==> Size of test data    : %d ' % len(test_iter))

# get network
net = SimilarityTreeLSTM(sim_hidden_size, rnn_hidden_size, vocab.size,
Example #29
0
def main():
    print("Hello!")
    voca = Vocab(args.vocab_fname)
    model = Model(args, voca)
    batcher = Batcher(voca, args)

    with tf.Session(config=GPU_config()) as sess:
        model.build_graph()

        if args.mode == 'train':
            sess.run(tf.global_variables_initializer())
            if not os.path.exists(args.train_logdir):
                os.makedirs(args.train_logdir)
            if not os.path.exists(args.valid_logdir):
                os.makedirs(args.valid_logdir)
            train_writer, valid_writer = tf.summary.FileWriter(
                args.train_logdir,
                sess.graph), tf.summary.FileWriter(args.valid_logdir,
                                                   sess.graph)

            t = trange(args.max_step, leave=True)
            for i in t:
                sample, label = batcher.next_data()
                _, loss, step, summaries = model.run_train_step(sample, sess)
                t.set_description('Train loss: {}'.format(round(loss, 3)))
                train_writer.add_summary(summaries, step)

                if step % 5e3 == 0:
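                    # save a checkpoint every 5,000 training steps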
                    model.saver.save(sess, args.model_path, step)

                if step % 5 == 0:
                    valid_sample, valid_label = batcher.next_data(
                        is_valid=True)
                    loss, step, summaries = model.run_eval_step(
                        valid_sample, sess)
                    valid_writer.add_summary(summaries, step)
                    t.set_description('Valid loss: {}'.format(round(loss, 3)))

                if step % 100 == 0:
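                    # every 100 steps, inspect the nearest words per aspect and log their coherence score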
                    near_ids, near_words = model.get_nearest_words(
                        sess, args.near_K)
                    pprint(near_words)
                    score = coherence_score(args.test_bin_fname, voca,
                                            near_ids)
                    summary = tf.Summary()
                    summary.value.add(tag='coherence_score_{}k'.format(
                        args.near_K),
                                      simple_value=score)
                    valid_writer.add_summary(summary, step)

        else:
            load_ckpt(args.model_path, sess, model.saver)
            near_words_dict = {i: [] for i in range(args.aspect_num)}
            for k in range(5, 50, 5):
                near_ids, near_words = model.get_nearest_words(sess, k)
                score = coherence_score(args.test_bin_fname, voca, near_ids)
                print(k, score)
                for asp_idx in near_words:
                    for word in near_words[asp_idx]:
                        if word not in near_words_dict[asp_idx]:
                            near_words_dict[asp_idx].append(word)

            with open(args.nearword_fname, 'w') as f:
                for idx in range(len(list(near_words_dict.keys()))):
                    print(near_words_dict[idx])
                    f.write(str(idx) + '   ')
                    f.write(' '.join(near_words_dict[idx][:5]))
                    f.write('\n')
Example #30
0
    def learn(self):
        check_dataset(self.trn)
        if self.dev is not None:
            check_dataset(self.dev)

        ###
        #  continuation
        ###
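        # resume training: reload tokenization configs, vocabularies and topology from the existing model dir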
        if os.path.exists(
                self.mdir) and os.path.exists(self.mdir + '/topology'):
            src_voc = 'vocab_src'
            tgt_voc = 'vocab_tgt'
            if os.path.exists(self.mdir + '/tokenization_src.json'):
                with open(self.mdir + '/tokenization_src.json') as jsonfile:
                    self.tok_src = json.load(jsonfile)
                src_voc = self.tok_src["vocabulary"]
            else:
                self.tok_src = None
            if not os.path.exists(self.mdir + '/' + src_voc):
                sys.stderr.write(
                    'error: vocab src file: {} cannot be found\n'.format(
                        self.mdir + '/' + src_voc))
                sys.exit(1)
            if os.path.exists(self.mdir + '/tokenization_tgt.json'):
                with open(self.mdir + '/tokenization_tgt.json') as jsonfile:
                    self.tok_tgt = json.load(jsonfile)
                tgt_voc = self.tok_tgt["vocabulary"]
            else:
                self.tok_tgt = None
            if not os.path.exists(self.mdir + '/' + tgt_voc):
                sys.stderr.write(
                    'error: vocab tgt file: {} cannot be found\n'.format(
                        self.mdir + '/' + tgt_voc))
                sys.exit(1)
            if not os.path.exists(self.mdir + '/checkpoint'):
                sys.stderr.write(
                    'error: checkpoint file: {} cannot be found\ndelete dir {} ???\n'
                    .format(self.mdir + '/checkpoint', self.mdir))
                sys.exit(1)

            argv = []
            with open(self.mdir + "/topology", 'r') as f:
                for line in f:
                    opt, val = line.split()
                    argv.append('-' + opt)
                    argv.append(val)
            # overrides options passed in command line
            self.parse(argv)
            # read vocabularies
            self.voc_src = Vocab(self.mdir + "/" + src_voc)
            self.voc_tgt = Vocab(self.mdir + "/" + tgt_voc)
            # update last epoch
            for e in range(999, 0, -1):
                if os.path.exists(self.mdir + "/epoch{}.index".format(e)):
                    self.last_epoch = e
                    break
            print("learning continuation: last epoch is {}".format(
                self.last_epoch))
        ###
        #   learning from scratch
        ###
        else:
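            # fresh start: build vocabularies and embeddings from the given files and copy them into mdir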
            # read file or config/vocab_src if file is not set
            if self.src_tok:
                if not os.path.exists(self.src_tok):
                    sys.stderr.write(
                        'error: cannot find -src_tok file: {}\n'.format(
                            self.src_tok))
                    sys.exit(1)
                with open(self.src_tok) as jsonfile:
                    self.tok_src = json.load(jsonfile)
                if not self.src_voc:
                    self.src_voc = self.tok_src["vocabulary"]
            else:
                self.tok_src = None

            self.voc_src = Vocab(self.src_voc)

            if self.tgt_tok:
                if not os.path.exists(self.tgt_tok):
                    sys.stderr.write(
                        'error: cannot find -tgt_tok file: {}\n'.format(
                            self.tgt_tok))
                    sys.exit(1)
                with open(self.tgt_tok) as jsonfile:
                    self.tok_tgt = json.load(jsonfile)
                if not self.tgt_voc:
                    self.tgt_voc = self.tok_tgt["vocabulary"]
            else:
                self.tok_tgt = None

            self.voc_tgt = Vocab(self.tgt_voc)

            self.src_voc_size = self.voc_src.length
            self.tgt_voc_size = self.voc_tgt.length

            if not os.path.exists(self.mdir):
                os.makedirs(self.mdir)
            # copy vocabularies
            if self.src_tok:
                copyfile(self.src_voc,
                         self.mdir + "/" + self.tok_src["vocabulary"])
                copyfile(self.src_tok, self.mdir + "/tokenization_src.json")
            else:
                copyfile(self.src_voc, self.mdir + "/vocab_src")

            if self.tgt_tok:
                copyfile(self.tgt_voc,
                         self.mdir + "/" + self.tok_tgt["vocabulary"])
                copyfile(self.tgt_tok, self.mdir + "/tokenization_tgt.json")
            else:
                copyfile(self.tgt_voc, self.mdir + "/vocab_tgt")

            # read embeddings
            # read file or use emb_src.length if file is not set
            self.emb_src = Embeddings(self.src_emb, self.voc_src,
                                      self.src_emb_size)
            self.src_emb_size = self.emb_src.dim
            # read file or use emb_tgt.length if file is not set
            self.emb_tgt = Embeddings(self.tgt_emb, self.voc_tgt,
                                      self.tgt_emb_size)
            self.tgt_emb_size = self.emb_tgt.dim
            # write topology file
            with open(self.mdir + "/topology", 'w') as f:
                for opt, val in vars(self).items():
                    if opt.startswith("src") or opt.startswith("tgt") or \
                            opt == "aggr" or opt == "mode":
                        f.write("{} {}\n".format(opt, val))
            print("learning from scratch")
        return