Example #1
def test_empty_vocab():
    """
    Nothing is present in an empty word list
    """
    vocab = Vocab( [ ] )
    assert vocab.as_list() == [ ]
    assert not vocab.has("sheep")
Example #2
def train(corpus_file, out_file, mode, dim_size, window, min_count,
          negative, epoch, pool_size, chunk_size):
    with bz2.BZ2File(corpus_file) as f:
        sentences = LineSentence(f)
        sg = int(mode == 'sg')

        model = Word2Vec(sentences, size=dim_size, window=window, min_count=min_count,
                         workers=pool_size, iter=epoch, negative=negative, sg=sg)

    words = []
    entities = []
    for (w, _) in model.vocab.iteritems():
        if w.startswith(MARKER):
            entities.append(w[len(MARKER):].replace(u'_', u' '))
        else:
            words.append(w)

    vocab = Vocab(Trie(words), Trie(entities))

    word_embedding = np.zeros((len(words), dim_size), dtype=np.float32)
    entity_embedding = np.zeros((len(entities), dim_size), dtype=np.float32)
    for word in words:
        word_embedding[vocab.get_word_index(word)] = model[word]
    for entity in entities:
        entity_embedding[vocab.get_entity_index(entity)] = model[MARKER + entity.replace(u' ', u'_')]

    ret = dict(
        word_embedding=word_embedding,
        entity_embedding=entity_embedding,
        vocab=vocab,
    )
    joblib.dump(ret, out_file, compress=False)
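The dict that train() dumps above can be read back with joblib and queried through the saved Vocab. A minimal follow-up sketch, assuming only the get_word_index/get_entity_index API already used in this example; the file name and lookup keys are illustrative:

import joblib

# Hypothetical reload of the dump written by train(); 'embeddings.joblib'
# stands in for whatever out_file was passed above.
data = joblib.load('embeddings.joblib')
vocab = data['vocab']
word_vec = data['word_embedding'][vocab.get_word_index('tokyo')]
entity_vec = data['entity_embedding'][vocab.get_entity_index('Tokyo Tower')]
print(word_vec.shape, entity_vec.shape)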
Example #3
def prepare(args):
    """
    checks data, creates the directories, prepare the vocabulary and embeddings
    """
    logger = logging.getLogger("brc")
    logger.info('Checking the data files...')
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(data_path)
    logger.info('Preparing the directories...')
    for dir_path in [args.vocab_dir, args.model_dir, args.result_dir, args.summary_dir]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

    logger.info('Building vocabulary...')
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.train_files, args.dev_files, args.test_files)
    vocab = Vocab(lower=True)
    for word in brc_data.word_iter('train'):
        vocab.add(word)

    unfiltered_vocab_size = vocab.size()
    vocab.filter_tokens_by_cnt(min_cnt=2)
    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filtering {} tokens, the final vocab size is {}'.format(filtered_num,
                                                                            vocab.size()))

    logger.info('Assigning embeddings...')
    vocab.randomly_init_embeddings(args.embed_size)

    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)

    logger.info('Done with preparing!')
Example #4
def train_worker(vec_size, window_size, k, alpha, queue, results_queue, sent_dic, sent_vecs, vocab_dic, vocab_vecs, table, win_count_dic, lock):
    # change shared Array to numpy array
    sent_vecs = Arr(np.frombuffer(sent_vecs.get_obj()), vec_size)
    vocab_vecs = Arr(np.frombuffer(vocab_vecs.get_obj()), vec_size)
    # init objects
    sent = Sent(vec_size, sent_dic, sent_vecs)
    vocab = Vocab(vec_size, vocab_dic, vocab_vecs)
    window_table = WindowTable(vocab=vocab, 
            size=window_size, 
            table=table, 
            win_count_dic=win_count_dic)
    # get a task
    sentence = queue.get()
    while sentence is not None:
        if sentence == CURRENT_TURN_END_TOKEN:
            results_queue.put(None)
            sentence = queue.get()
            continue
        Jn = 0
        windows = gen_windows_from_sentence(sentence, window_size)
        v = sent[sentence]
        for wn, window in enumerate(windows):
            window_key = "-".join([str(vocab.vocab[hash(w)]) for w in window])
            h = vocab.get_window_vec(word_index=window_key)
            # noises
            noises = window_table.get_samples(k)
            e_vT_h = np.e**np.dot(v.T, h)
            update_v = h / (1. + e_vT_h)
            update_h = v / (1. + e_vT_h)
            # add positive window's loss
            Jn += math.log( 1. / ( 1. + 1./e_vT_h))
            update_window(vocab, window_key, update_h, lock, alpha)
            for idx, key in noises:
                n_h = vocab.get_window_vec(word_index=key)
                e_vT_h = np.e ** np.dot(v, n_h)
                frac_e_v_h = 1 - \
                        1 / (1 + e_vT_h)
                # accumulate the gradient
                update_v += - n_h * frac_e_v_h
                update_n_h = - v * frac_e_v_h
                update_window(vocab, key, update_n_h, lock, alpha)
                # add noise's loss
                Jn += math.log( 1/ (1+e_vT_h))

            update_v /= ( 1 + k)
            update_sent_vec(v, update_v, lock, alpha)
        #return Jn
        results_queue.put(Jn)
        current = mp.current_process()
        #print "%s Jn: %f" % (current.name, Jn)
        sentence = queue.get()
        show_status(results_queue)
    print "process %s exit!" % current.name
    logging.warning("process %s exit!" % current.name)
Example #5
def load_data(small=True, char_based=False, batch_size=20, vocab_size=10000, history_len=5, max_tokens=50, null_mark=False):
    vocab_path = os.path.join(resource_dir, "ptb.train.txt")
    valid_path = os.path.join(resource_dir, "ptb.valid.txt")
    if small:
        train_path = os.path.join(resource_dir, "ptb.train.10k.txt")
    else:
        train_path = os.path.join(resource_dir, "ptb.train.txt")
    vocab = Vocab(char_based=char_based, null_mark=null_mark)
    vocab.load(vocab_path, max_size=vocab_size)

    lmdata = LMDataset(vocab, train_path, valid_path, history_len=-1, char_based=char_based, max_tokens=max_tokens)
    batch = BunchSequences(lmdata, batch_size=batch_size, fragment_length=history_len)
    return vocab, batch
Example #6
    def initialize_vocab_and_tags(tags, vocab):
        if not vocab:
            vocab = Vocab()
            vocab.add("#OOV")
            learn_vocab = True
        else:
            learn_vocab = False
        if not tags:
            tags = Vocab()
            learn_tags = True
        else:
            learn_tags = False

        return learn_tags, learn_vocab, tags, vocab
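A short illustration of how this helper might be called; passing None (or empty vocabularies) for both arguments builds fresh ones seeded with the '#OOV' token and turns learning on:

# Hypothetical call: no pre-built vocabularies yet, so both are created here
# and both learn_* flags come back True.
learn_tags, learn_vocab, tags, vocab = initialize_vocab_and_tags(None, None)
assert learn_vocab and learn_tags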
Example #7
    def __init__(self, path="", vec_size=50, k=20, alpha=0.1, n_workers=1):
        '''
        :parameters:

            @path: string
                path to dataset, should be a single file

            @vec_size: int
                size of sentence vector and word vector

            @k: int 
                number of negative samples for a window

            @alpha: float
                learning rate
        '''
        self.k = k
        self.vec_size = vec_size
        self.n_workers = n_workers
        self.alpha = alpha
        self.vocab = Vocab()
        self.sent = Sent()
        self.window_table = WindowTable(self.vocab, SIZE)
        self.dataset = Dataset(path)

        if path:
            self.create_vocab()
            self.create_sent()
            self.create_window_table()
Example #8
    def test_add(self):
        v = Vocab()
        v.add('a')
        v.add('b')
        v.add('c')
        v.add('d')
        v.add('a')

        self.assertEqual(len(v), 4)
        self.assertEqual(v['a'], 0)

        self.assertEqual(v.rev(3), 'd')
        self.assertEqual(v.rev(0), 'a')
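A minimal sketch of a Vocab that would satisfy this test, assuming only the add/__getitem__/__len__/rev interface exercised above (an illustration, not the tested project's implementation):

class Vocab(object):
    """Maps symbols to consecutive integer ids; rev() maps an id back to its symbol."""

    def __init__(self):
        self._index = {}
        self._symbols = []

    def add(self, symbol):
        # Re-adding an existing symbol is a no-op, as the duplicate 'a' in the test expects.
        if symbol not in self._index:
            self._index[symbol] = len(self._symbols)
            self._symbols.append(symbol)
        return self._index[symbol]

    def __getitem__(self, symbol):
        return self._index[symbol]

    def __len__(self):
        return len(self._symbols)

    def rev(self, idx):
        return self._symbols[idx]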
Example #9
    def initialize_vocab_and_tags(tags, vocab, alphabet):
        if not vocab:
            vocab = Vocab()
            vocab.add('#OOV')
            alphabet = Vocab()
            alphabet.add('#OOA')

            learn_vocab = True
        else:
            learn_vocab = False
        if not tags:
            tags = Vocab()
            tags.add("#OOT")
            learn_tags = True
        else:
            learn_tags = False


        return learn_tags, learn_vocab, tags, vocab, alphabet
Example #10
def getembd():
    vocab_file = 'data/vocab.txt'
    vocab = Vocab(filename=vocab_file)
    emb_file = os.path.join('data/', 'webkbb_embed.pth')
    if os.path.isfile(emb_file):
        emb = torch.load(emb_file)
        print(emb.size())
    else:
        # load glove embeddings and vocab
        glove_vocab, glove_emb = load_word_vectors(os.path.join('data/glove', 'glove.6B.200d'))
        print('==> GLOVE vocabulary size: %d ' % glove_vocab.size())

        emb = torch.zeros(vocab.size(), glove_emb.size(1))

        for word in vocab.labelToIdx.keys():
            if glove_vocab.getIndex(word):
                emb[vocab.getIndex(word)] = glove_emb[glove_vocab.getIndex(word)]
            else:
                emb[vocab.getIndex(word)] = torch.Tensor(emb[vocab.getIndex(word)].size()).normal_(-0.05, 0.05)
        torch.save(emb, emb_file)
        is_preprocessing_data = True  # flag to quit
        print('done creating emb, quit')
Example #11
def test_small_vocab():
    l = ["eeny", "moe", "miney", "meeny"];
    vocab = Vocab(l)
    assert vocab.has("moe")
    assert vocab.has("eeny")
    assert vocab.has("miney")
    assert vocab.has("meeny")
    assert not vocab.has("many")
    assert sorted(vocab.as_list()) == sorted(l)
Example #12
def main(unused_args):
    ''' Generates data from a trained model (fun!) '''
    
    if not FLAGS.load_model:
        print('--load_model is required')
        return -1
  
    with tf.Graph().as_default(), tf.Session() as session:
        
        ''' load parameters of the model '''    
        with tf.variable_scope("params"):
            num_layers_var = tf.Variable(0, name='num_layers')
            hidden_size_var = tf.Variable(0, name='hidden_size')
            vocab_size_var = tf.Variable(0, name='vocab_size')
            tf.train.Saver([num_layers_var, hidden_size_var, vocab_size_var]).restore(session, FLAGS.load_model)
            vocab_var = tf.Variable([0] * vocab_size_var.eval(), name='vocab')
            tf.train.Saver([vocab_var]).restore(session, FLAGS.load_model)
            
            FLAGS.num_layers = np.asscalar(num_layers_var.eval())
            FLAGS.hidden_size = np.asscalar(hidden_size_var.eval())
            
            vocab = Vocab.from_array(vocab_var.eval())
            
            print('Loaded model from file', FLAGS.load_model)
            print('\tnum_layers:', FLAGS.num_layers)
            print('\thidden_size:', FLAGS.hidden_size)
            print('\tvocab_size', vocab.size)
        
        ''' load inference graph '''
        with tf.variable_scope("model", reuse=None):
            m = graph.inference_graph(vocab.size, FLAGS.num_layers, FLAGS.hidden_size)
          
        tf.train.Saver().restore(session, FLAGS.load_model)
        
        logits = np.ones((vocab.size,))
        state = session.run(m.initial_state)
        for i in range(FLAGS.sample_size):
            logits = logits / FLAGS.temperature
            prob = np.exp(logits)
            prob /= np.sum(prob)
            prob = prob.ravel()
            ix = np.random.choice(range(len(prob)), p=prob)
            
            print(vocab.decode(ix), end='')
        
            logits, state = session.run([m.logits, m.final_state],
                                         {m.input_data: np.array([[ix]]),
                                          m.initial_state: state})
Example #13
def read_datasets(input_data, train_fraction=0.95, valid_fraction=0.05, vocab=None, vocab_size=128):

    print('Reading data from', input_data, '...')
    
    with open(input_data, 'rb') as f:
        data = f.read()

    if vocab is None:
        vocab = Vocab.from_data(data, vocab_size=vocab_size)
    
    train_size = int(math.floor(len(data) * train_fraction))
    valid_size = int(math.floor(len(data) * valid_fraction))
    train_data = data[:train_size]
    
    valid_data = data[train_size:train_size+valid_size]
    test_data = data[train_size+valid_size:]
    
    return [vocab.encode(c) for c in train_data], [vocab.encode(c) for c in valid_data], [vocab.encode(c) for c in test_data], vocab
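A hedged usage sketch of read_datasets(); the input path and vocabulary size are illustrative:

# Hypothetical call: build the vocab from the file itself and encode all three splits.
train_ids, valid_ids, test_ids, vocab = read_datasets('input.txt', vocab_size=128)
print(len(train_ids), len(valid_ids), len(test_ids))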
Example #14
def test_from_simulated_file():
    from io import StringIO
    l = StringIO(initial_value="""
        #comment
        # another comment line
        sheep

        rats
        #comment
        squirrels
        """)
    vocab = Vocab(l)
    assert sorted(vocab.as_list()) == ["rats", "sheep", "squirrels"]
    assert vocab.has("sheep")
    assert vocab.has("rats")
    assert vocab.has("squirrels")
    assert not vocab.has("#comment")
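Taken together, the tests in Examples 1, 11, and 14 pin down a small interface: the constructor accepts any iterable of lines or words, blank lines and '#' comments are skipped, and membership is exposed through has() and as_list(). A minimal sketch consistent with those tests (an illustration under those assumptions, not the project's actual class):

class Vocab(object):
    """Word list built from an iterable of strings (a list or a file-like object)."""

    def __init__(self, source):
        self._words = set()
        for line in source:
            word = line.strip()
            # Skip blank lines and comment lines, as test_from_simulated_file expects.
            if not word or word.startswith("#"):
                continue
            self._words.add(word)

    def has(self, word):
        return word in self._words

    def as_list(self):
        return list(self._words)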
Example #16
def main():
    args = parse_args()
    print(args)

    num_classes = 7

    data_dir = args.data_dir
    train_file = os.path.join(data_dir, 'train_data.pth')
    val_file = os.path.join(data_dir, 'val_data.pth')

    vocab_file = "../data/vocab.txt"
    vocab = Vocab(filename=vocab_file)

    if os.path.isfile(train_file):
        train_dataset = torch.load(train_file)
    else:
        train_dataset = WebKbbDataset(vocab, num_classes,
                                      os.path.join(data_dir, 'train_texts.blk'),
                                      os.path.join(data_dir, 'train_labels.blk'))
        torch.save(train_dataset, train_file)

    if os.path.isfile(val_file):
        val_dataset = torch.load(val_file)
    else:
        val_dataset = WebKbbDataset(vocab, num_classes,
                                    os.path.join(data_dir, 'val_texts.blk'),
                                    os.path.join(data_dir, 'val_labels.blk'))
        torch.save(val_dataset, val_file)

    vocab_size = vocab.size()
    in_dim = 200
    mem_dim = 200
    hidden_dim = 200
    num_classes = 7
    sparsity = True
    freeze = args.freeze_emb
    epochs = args.epochs
    lr = args.lr
    pretrain = args.pretrain

    cuda_flag = torch.cuda.is_available()

    model = DomTreeLSTM(vocab_size, in_dim, mem_dim, hidden_dim, num_classes, sparsity, freeze)
    criterion = nn.CrossEntropyLoss()

    if pretrain:
        emb_file = os.path.join('../data', 'emb.pth')
        if os.path.isfile(emb_file):
            emb = torch.load(emb_file)
            print(emb.size())
            print("Embedding weights loaded")
        else:
            print("Embedding file not found")

        model.emb.weight.data.copy_(emb)

    optimizer = optim.Adagrad(filter(lambda p: p.requires_grad, model.parameters()), lr=lr)

    trainer = Trainer(model, criterion, optimizer, train_dataset, val_dataset, cuda_flag=cuda_flag)

    for epoch in range(epochs):
        trainer.train()
        trainer.test()
Example #17
def main():
    args = parse_args()

    # input files
    train_file = args.data_dir + '/train.json'
    test_file = args.data_dir + '/test.json'

    # output files
    # token
    vocab_tok_file = args.vocab_dir + '/vocab_tok.vocab'
    # position
    vocab_post_file = args.vocab_dir + '/vocab_post.vocab'
    # pos_tag
    vocab_pos_file = args.vocab_dir + '/vocab_pos.vocab'
    # dep_rel
    vocab_dep_file = args.vocab_dir + '/vocab_dep.vocab'
    # polarity
    vocab_pol_file = args.vocab_dir + '/vocab_pol.vocab'

    # load files
    print("loading files...")
    train_tokens, train_pos, train_dep, train_max_len = load_tokens(train_file)
    test_tokens, test_pos, test_dep, test_max_len = load_tokens(test_file)

    # lower tokens
    if args.lower:
        train_tokens, test_tokens = [[t.lower() for t in tokens] for tokens in\
                (train_tokens, test_tokens)]

    # counters
    token_counter = Counter(train_tokens + test_tokens)
    pos_counter = Counter(train_pos + test_pos)
    dep_counter = Counter(train_dep + test_dep)
    max_len = max(train_max_len, test_max_len)
    post_counter = Counter(list(range(-max_len, max_len)))
    pol_counter = Counter(['positive', 'negative', 'neutral'])

    # build vocab
    print("building vocab...")
    token_vocab = Vocab(token_counter, specials=['<pad>', '<unk>'])
    pos_vocab = Vocab(pos_counter, specials=['<pad>', '<unk>'])
    dep_vocab = Vocab(dep_counter, specials=['<pad>', '<unk>'])
    post_vocab = Vocab(post_counter, specials=['<pad>', '<unk>'])
    pol_vocab = Vocab(pol_counter, specials=[])
    print(
        "token_vocab: {}, pos_vocab: {}, dep_vocab: {}, post_vocab: {}, pol_vocab: {}"
        .format(len(token_vocab), len(pos_vocab), len(dep_vocab),
                len(post_vocab), len(pol_vocab)))

    print("dumping to files...")
    token_vocab.save_vocab(vocab_tok_file)
    pos_vocab.save_vocab(vocab_pos_file)
    dep_vocab.save_vocab(vocab_dep_file)
    post_vocab.save_vocab(vocab_post_file)
    pol_vocab.save_vocab(vocab_pol_file)
    print("all done.")
Example #18
import os

from collections import namedtuple
import sys
import json
import dynet

config = sys.argv[1]
model = sys.argv[2]
testfile = sys.argv[3]
vocabfile = os.path.dirname(model) + "/vocab.txt"

d = json.load(open(config))
config = namedtuple("options", d.keys())(*d.values())

vocab = Vocab(vocabfile)

if "embeds" in config:
    tagger = SimpleBiltyTagger(
        config.in_dim,
        config.h_dim,
        config.c_in_dim,
        config.h_layers,
        embeds_file=config.embeds,
        word2id=vocab.word2id,
    )
else:
    tagger = SimpleBiltyTagger(config.in_dim,
                               config.h_dim,
                               config.c_in_dim,
                               config.h_layers,
                               word2id=vocab.word2id)
Example #19
import argparse
import json
import os

import tensorflow as tf

from batcher import Dataset
from char2vec import CharCNN as Char2Vec
from vocab import Vocab

parser = argparse.ArgumentParser()
parser.add_argument('expdir')
args = parser.parse_args()

config = tf.ConfigProto(inter_op_parallelism_threads=10,
                        intra_op_parallelism_threads=10)

dataset = Dataset(10, preshuffle=False)
dataset.ReadData('../data/tweetlid/training.tsv.gz', 'all', 'tweet')

input_vocab = Vocab.MakeFromData(dataset.GetSentences(), min_count=1)
char_vocab = Vocab.Load(os.path.join(args.expdir, 'char_vocab.pickle'))

max_word_len = max([len(x) for x in input_vocab.GetWords()]) + 2
print 'max word len {0}'.format(max_word_len)

with open(os.path.join(args.expdir, 'model_params.json'), 'r') as f:
    model_params = json.load(f)

c2v = Char2Vec(char_vocab, model_params, max_sequence_len=max_word_len)
the_words, word_lengths = c2v.MakeMat(input_vocab, pad_len=max_word_len)

saver = tf.train.Saver(tf.all_variables())
session = tf.Session(config=config)

saver.restore(session, os.path.join(args.expdir, 'model.bin'))
Example #20
def prepare(args):
    """
    Checks the data files, creates the directories, and prepares the vocabulary and embeddings.
    """
    logger = logging.getLogger("brc")
    logger.info('Checking the data files...')
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(
            data_path)
    logger.info('Preparing the directories...')
    for dir_path in [
            args.vocab_dir, args.model_dir, args.result_dir, args.summary_dir
    ]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

    logger.info('Building vocabulary...')
    brc_data = Dataset(args.max_p_num, args.max_p_len, args.max_q_len,
                       args.max_w_len, args.train_files, args.dev_files,
                       args.test_files)
    vocab = Vocab(lower=True)
    for word in brc_data.word_iter('train'):
        vocab.add(word)
    for word in brc_data.word_iter('dev'):
        vocab.add(word)
    for word in brc_data.word_iter('test'):
        vocab.add(word)

    logger.info('Assigning embeddings...')
    vocab.load_pretrained_char_embeddings(args.char_embed)
    vocab.load_pretrained_word_embeddings(args.word_embed)
    vocab.randomly_init_embeddings(args.pos_embed_dim)

    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)

    logger.info('Done with preparing!')
Example #21
def test():
    embed = None
    if args.embed_path is not None and os.path.exists(args.embed_path):
        print('Loading pretrained word embedding...')
        embed = {}
        with open(args.embed_path, 'r') as f:
            f.readline()
            for line in f.readlines():
                line = line.strip().split()
                vec = [float(_) for _ in line[1:]]
                embed[line[0]] = vec
    vocab = Vocab(args, embed)

    train_data, val_data, test_data = [], [], []
    fns = os.listdir(args.train_dir)
    fns.sort(key=lambda p: int(p.split('.')[0]))
    for fn in tqdm(fns):
        f = open(args.train_dir + fn, 'r')
        train_data.append(json.load(f))
        f.close()
        vocab.add_sentence(train_data[-1]['reviewText'].split())
        vocab.add_sentence(train_data[-1]['summary'].split())
        vocab.add_user(train_data[-1]['userID'])
        vocab.add_product(train_data[-1]['productID'])
    fns = os.listdir(args.valid_dir)
    fns.sort(key=lambda p: int(p.split('.')[0]))
    for fn in tqdm(fns):
        f = open(args.valid_dir + fn, 'r')
        val_data.append(json.load(f))
        f.close()
        vocab.add_sentence(val_data[-1]['reviewText'].split())
        vocab.add_sentence(val_data[-1]['summary'].split())
        vocab.add_user(val_data[-1]['userID'])
        vocab.add_product(val_data[-1]['productID'])
    fns = os.listdir(args.test_dir)
    fns.sort(key=lambda p: int(p.split('.')[0]))
    for fn in tqdm(fns):
        f = open(args.test_dir + fn, 'r')
        test_data.append(json.load(f))
        f.close()
        vocab.add_sentence(test_data[-1]['reviewText'].split())
        vocab.add_sentence(test_data[-1]['summary'].split())
        vocab.add_user(test_data[-1]['userID'])
        vocab.add_product(test_data[-1]['productID'])
    embed = vocab.trim()
    args.embed_num = len(embed)
    args.embed_dim = len(embed[0])
    test_dataset = Dataset(test_data)
    test_iter = DataLoader(dataset=test_dataset,
                           batch_size=args.batch_size,
                           shuffle=False)

    print('Loading model...')
    checkpoint = torch.load(args.save_path + args.load_model)
    net = EncoderDecoder(checkpoint['args'], embed)
    net.load_state_dict(checkpoint['model'])
    if args.use_cuda:
        net.cuda()
    criterion = nn.NLLLoss(ignore_index=vocab.PAD_IDX, reduction='sum')

    print('Begin testing...')
    loss, r1, r2, rl = evaluate(net, criterion, vocab, test_iter, False)
    print('Loss: %f Rouge-1: %f Rouge-2: %f Rouge-l: %f' % (loss, r1, r2, rl))
Example #22
tf.set_random_seed(666)

baseline = False

batch_size = 25
dataset = Dataset(batch_size, preshuffle=mode == 'train')
und_symbol = 'und'

dataset.ReadData(args.data, mode, args.model)

# Make the input vocabulary (words that appear in data)
if baseline:
    # The baseline is to use fixed word embeddings.
    if mode == 'train':
        # The input vocab is fixed during training.
        input_vocab = Vocab.MakeFromData(dataset.GetSentences(), min_count=2)
        input_vocab.Save(os.path.join(args.expdir, 'input_vocab.pickle'))
    else:
        # During testing we need to load the saved input vocab.
        input_vocab = Vocab.Load(
            os.path.join(args.expdir, 'input_vocab.pickle'))
else:
    # The open vocabulary can be regenerated with each run.
    min_count = 1
    if mode == 'debug':
        min_count = 10  # When visualizing word embeddings hide rare words
    maxlens = {'word': 40, 'char': 150, 'tweet': 40}
    input_vocab = Vocab.MakeFromData(dataset.GetSentences(),
                                     min_count=min_count,
                                     max_length=maxlens[args.model])
Example #23
def train(args: Dict):
    train_data_src = read_corpus(args['--train-src'], source='src')
    train_data_tgt = read_corpus(args['--train-tgt'], source='tgt')

    dev_data_src = read_corpus(args['--dev-src'], source='src')
    dev_data_tgt = read_corpus(args['--dev-tgt'], source='tgt')

    train_data = list(zip(train_data_src, train_data_tgt))
    dev_data = list(zip(dev_data_src, dev_data_tgt))

    train_batch_size = int(args['--batch-size'])
    clip_grad = float(args['--clip-grad'])
    valid_niter = int(args['--valid-niter'])
    log_every = int(args['--log-every'])
    model_save_path = args['--save-to']

    vocab = Vocab.load(args['--vocab'])

    model = NMT(embed_size=int(args['--embed-size']),
                hidden_size=int(args['--hidden-size']),
                dropout_rate=float(args['--dropout']),
                input_feed=args['--input-feed'],
                label_smoothing=float(args['--label-smoothing']),
                vocab=vocab)
    model.train()

    uniform_init = float(args['--uniform-init'])
    if np.abs(uniform_init) > 0.:
        print('uniformly initialize parameters [-%f, +%f]' % (uniform_init, uniform_init), file=sys.stderr)
        for p in model.parameters():
            p.data.uniform_(-uniform_init, uniform_init)

    vocab_mask = torch.ones(len(vocab.tgt))
    vocab_mask[vocab.tgt['<pad>']] = 0

    device = torch.device("cuda:0" if args['--cuda'] else "cpu")
    print('use device: %s' % device, file=sys.stderr)

    model = model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=float(args['--lr']))

    num_trial = 0
    train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0
    cum_examples = report_examples = epoch = valid_num = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()
    print('begin Maximum Likelihood training')

    while True:
        epoch += 1

        for src_sents, tgt_sents in batch_iter(train_data, batch_size=train_batch_size, shuffle=True):
            train_iter += 1

            optimizer.zero_grad()

            batch_size = len(src_sents)

            # (batch_size)
            example_losses = -model(src_sents, tgt_sents)
            batch_loss = example_losses.sum()
            loss = batch_loss / batch_size

            loss.backward()

            # clip gradient
            grad_norm = torch.nn.utils.clip_grad_norm(model.parameters(), clip_grad)

            optimizer.step()

            batch_losses_val = batch_loss.item()
            report_loss += batch_losses_val
            cum_loss += batch_losses_val

            tgt_words_num_to_predict = sum(len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
            report_tgt_words += tgt_words_num_to_predict
            cum_tgt_words += tgt_words_num_to_predict
            report_examples += batch_size
            cum_examples += batch_size

            if train_iter % log_every == 0:
                print('epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f ' \
                      'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' % (epoch, train_iter,
                                                                                         report_loss / report_examples,
                                                                                         math.exp(report_loss / report_tgt_words),
                                                                                         cum_examples,
                                                                                         report_tgt_words / (time.time() - train_time),
                                                                                         time.time() - begin_time), file=sys.stderr)

                train_time = time.time()
                report_loss = report_tgt_words = report_examples = 0.

            # perform validation
            if train_iter % valid_niter == 0:
                print('epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d' % (epoch, train_iter,
                                                                                         cum_loss / cum_examples,
                                                                                         np.exp(cum_loss / cum_tgt_words),
                                                                                         cum_examples), file=sys.stderr)

                cum_loss = cum_examples = cum_tgt_words = 0.
                valid_num += 1

                print('begin validation ...', file=sys.stderr)

                # compute dev. ppl and bleu
                dev_ppl = evaluate_ppl(model, dev_data, batch_size=128)   # dev batch size can be a bit larger
                valid_metric = -dev_ppl

                print('validation: iter %d, dev. ppl %f' % (train_iter, dev_ppl), file=sys.stderr)

                is_better = len(hist_valid_scores) == 0 or valid_metric > max(hist_valid_scores)
                hist_valid_scores.append(valid_metric)

                if is_better:
                    patience = 0
                    print('save currently the best model to [%s]' % model_save_path, file=sys.stderr)
                    model.save(model_save_path)

                    # also save the optimizers' state
                    torch.save(optimizer.state_dict(), model_save_path + '.optim')
                elif patience < int(args['--patience']):
                    patience += 1
                    print('hit patience %d' % patience, file=sys.stderr)

                    if patience == int(args['--patience']):
                        num_trial += 1
                        print('hit #%d trial' % num_trial, file=sys.stderr)
                        if num_trial == int(args['--max-num-trial']):
                            print('early stop!', file=sys.stderr)
                            exit(0)

                        # decay lr, and restore from previously best checkpoint
                        lr = optimizer.param_groups[0]['lr'] * float(args['--lr-decay'])
                        print('load previously best model and decay learning rate to %f' % lr, file=sys.stderr)

                        # load model
                        params = torch.load(model_save_path, map_location=lambda storage, loc: storage)
                        model.load_state_dict(params['state_dict'])
                        model = model.to(device)

                        print('restore parameters of the optimizers', file=sys.stderr)
                        optimizer.load_state_dict(torch.load(model_save_path + '.optim'))

                        # set new lr
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr

                        # reset patience
                        patience = 0

                if epoch == int(args['--max-epoch']):
                    print('reached maximum number of epochs!', file=sys.stderr)
                    exit(0)
Example #24
def main(unused_args):
    ''' Trains model from data '''

    if not FLAGS.input_data:
        raise ValueError("Must set --input_data to the filename of input dataset")

    if not FLAGS.train_dir:
        raise ValueError("Must set --train_dir to the directory where training files will be saved")

    if not os.path.exists(FLAGS.train_dir):
        os.mkdir(FLAGS.train_dir)

    with tf.Graph().as_default(), tf.Session() as session:

        ''' To make tf.train.Saver write parameters as part of the saved file, add params to the graph as variables (hackish? - MK)'''
        with tf.variable_scope("params", reuse=None):
            num_layers_var = tf.Variable(FLAGS.num_layers, trainable=False, name='num_layers')
            hidden_size_var = tf.Variable(FLAGS.hidden_size, trainable=False, name='hidden_size')
        
            ''' If pre-trained model loaded from file, use loaded vocabulary and NN geometry. Else, compute vocabulary and use command-line params for num_layers and hidden_size ''' 
            if FLAGS.load_model:
                vocab_size_var = tf.Variable(0, trainable=False, name='vocab_size')
                tf.train.Saver([num_layers_var, hidden_size_var, vocab_size_var]).restore(session, FLAGS.load_model)
                vocab_var = tf.Variable([0] * vocab_size_var.eval(), trainable=False, name='vocab')
                tf.train.Saver([vocab_var]).restore(session, FLAGS.load_model)
    
                FLAGS.num_layers = np.asscalar(num_layers_var.eval())  # need np.asscalar to upcast np.int32 to Python int
                FLAGS.hidden_size = np.asscalar(hidden_size_var.eval())
                
                vocab = Vocab.from_array(vocab_var.eval())
                train_data, valid_data, test_data, vocab = reader.read_datasets(FLAGS.input_data, FLAGS.train_fraction, FLAGS.valid_fraction, vocab=vocab)
            else:
                train_data, valid_data, test_data, vocab = reader.read_datasets(FLAGS.input_data, FLAGS.train_fraction, FLAGS.valid_fraction, vocab_size=FLAGS.vocab_size)
                vocab_size_var = tf.Variable(vocab.size, trainable=False, name='vocab_size')
                vocab_var = tf.Variable(vocab.to_array(), trainable=False, name='vocab')

        ''' build training graph '''
        initializer = tf.random_uniform_initializer(-FLAGS.init_scale, FLAGS.init_scale)
        with tf.variable_scope("model", initializer=initializer):
            m = graph.inference_graph(vocab.size, FLAGS.num_layers, FLAGS.hidden_size, FLAGS.batch_size, FLAGS.num_steps, FLAGS.dropout_rate)
            m.update(graph.cost_graph(m.logits, FLAGS.batch_size, FLAGS.num_steps, vocab.size))
            m.update(graph.training_graph(m.cost, FLAGS.grad_clip))

        # create saver before creating more graph nodes, so that we do not save any vars defined below      
        saver = tf.train.Saver(max_to_keep=50)

        ''' build graph for validation and testing (shares parameters with the training graph!) '''
        with tf.variable_scope("model", reuse=True):
            mvalid = graph.inference_graph(vocab.size, FLAGS.num_layers, FLAGS.hidden_size, FLAGS.batch_size, FLAGS.num_steps)
            mvalid.update(graph.cost_graph(mvalid.logits, FLAGS.batch_size, FLAGS.num_steps, vocab.size))

        if FLAGS.load_model:
            saver.restore(session, FLAGS.load_model)
            print('Loaded model from', FLAGS.load_model)
        else:
            print('Created model')
        
        print('\tnum_layers:', FLAGS.num_layers)
        print('\thidden_size:', FLAGS.hidden_size)
        print('\tvocab_size:', vocab.size)
        print()
        print('Training parameters')
        print('\tbatch_size:', FLAGS.batch_size)
        print('\tnum_steps:', FLAGS.num_steps)
        print('\tlearning_rate:', FLAGS.learning_rate)
        print('\tbeta1:', FLAGS.beta1)
        print('\tbeta2:', FLAGS.beta2)
        print()
        print('Datasets')
        print('\ttraining dataset size:', len(train_data))
        print('\tvalidation dataset size:', len(valid_data))
        print('\ttest dataset size:', len(test_data))
        print()
        
        ''' create two summaries: training cost and validation cost '''
        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, graph=session.graph)
        summary_train = summary_graph('Training cost', ema_decay=0.95)
        summary_valid = summary_graph('Validation cost')
        
        session.run([
            m.lr.initializer,
            m.beta1.initializer,
            m.beta2.initializer,
        ])

        tf.initialize_all_variables().run()
        
        session.run([
            tf.assign(m.lr, FLAGS.learning_rate),
            tf.assign(m.beta1, FLAGS.beta1),
            tf.assign(m.beta2, FLAGS.beta2),
        ])

        state = session.run(m.initial_state)
        iterations = len(train_data) // FLAGS.batch_size // FLAGS.num_steps * FLAGS.max_epochs
        for i, (x, y) in enumerate(reader.next_batch(train_data, FLAGS.batch_size, FLAGS.num_steps)):
            if i >= iterations:
                break
        
            start_time = time.time()
        
            cost, state, _ = session.run([m.cost, m.final_state, m.train_op], {
                    m.input_data: x,
                    m.targets: y,
                    m.initial_state: state
            })
        
            epoch = float(i) / (len(train_data) // FLAGS.batch_size // FLAGS.num_steps)
            time_elapsed = time.time() - start_time
            print('%d/%d (epoch %.3f), train_loss = %6.8f, time/batch = %.4fs' % (i+1, iterations, epoch, cost, time_elapsed))
            
            session.run([summary_train.update], {summary_train.x: cost})
        
            if (i+1) % FLAGS.eval_val_every == 0 or i == iterations-1:
                # evaluate loss on validation data
                cost = run_test(session, mvalid, valid_data, FLAGS.batch_size, FLAGS.num_steps)
                print("validation cost = %6.8f" % cost)
                save_as = '%s/epoch%.2f_%.4f.model' % (FLAGS.train_dir, epoch, cost)
                saver.save(session, save_as)

                ''' write out summary events '''
                buffer, = session.run([summary_train.summary])
                summary_writer.add_summary(buffer, i)
                
                session.run([summary_valid.update], {summary_valid.x: cost})
                buffer, = session.run([summary_valid.summary])
                summary_writer.add_summary(buffer, i)
                
                summary_writer.flush()
        
        if len(test_data) > FLAGS.batch_size * FLAGS.num_steps:
            cost = run_test(session, mvalid, test_data, FLAGS.batch_size, FLAGS.num_steps)
            print("Test cost: %.3f" % test_loss)
Example #25
def main():
    global args
    args = parse_args()
    # global logger
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)
    formatter = logging.Formatter("[%(asctime)s] %(levelname)s:%(name)s:%(message)s")
    # file logger
    fh = logging.FileHandler(os.path.join(args.save, args.expname)+'.log', mode='w')
    fh.setLevel(logging.INFO)
    fh.setFormatter(formatter)
    logger.addHandler(fh)
    # console logger
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    # argument validation
    args.cuda = args.cuda and torch.cuda.is_available()
    if args.sparse and args.wd != 0:
        logger.error('Sparsity and weight decay are incompatible, pick one!')
        exit()
    logger.debug(args)
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)
        torch.backends.cudnn.benchmark = True
    if not os.path.exists(args.save):
        os.makedirs(args.save)

    train_dir = os.path.join(args.data, 'train/')
    dev_dir = os.path.join(args.data, 'dev/')
    test_dir = os.path.join(args.data, 'test/')

    # write unique words from all token files
    sick_vocab_file = os.path.join(args.data, 'sick.vocab')
    if not os.path.isfile(sick_vocab_file):
        token_files_b = [os.path.join(split, 'b.toks') for split in [train_dir, dev_dir, test_dir]]
        token_files_a = [os.path.join(split, 'a.toks') for split in [train_dir, dev_dir, test_dir]]
        token_files = token_files_a + token_files_b
        sick_vocab_file = os.path.join(args.data, 'sick.vocab')
        build_vocab(token_files, sick_vocab_file)

    # get vocab object from vocab file previously written
    vocab = Vocab(filename=sick_vocab_file, data=[Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD, Constants.EOS_WORD])
    logger.debug('==> SICK vocabulary size : %d ' % vocab.size())

    # load SICK dataset splits
    train_file = os.path.join(args.data, 'sick_train.pth')
    if os.path.isfile(train_file):
        train_dataset = torch.load(train_file)
    else:
        train_dataset = SICKDataset(train_dir, vocab, args.num_classes)
        torch.save(train_dataset, train_file)
    logger.debug('==> Size of train data   : %d ' % len(train_dataset))
    dev_file = os.path.join(args.data, 'sick_dev.pth')
    if os.path.isfile(dev_file):
        dev_dataset = torch.load(dev_file)
    else:
        dev_dataset = SICKDataset(dev_dir, vocab, args.num_classes)
        torch.save(dev_dataset, dev_file)
    logger.debug('==> Size of dev data     : %d ' % len(dev_dataset))
    test_file = os.path.join(args.data, 'sick_test.pth')
    if os.path.isfile(test_file):
        test_dataset = torch.load(test_file)
    else:
        test_dataset = SICKDataset(test_dir, vocab, args.num_classes)
        torch.save(test_dataset, test_file)
    logger.debug('==> Size of test data    : %d ' % len(test_dataset))

    # initialize model, criterion/loss_function, optimizer
    model = SimilarityTreeLSTM(
                vocab.size(),
                args.input_dim,
                args.mem_dim,
                args.hidden_dim,
                args.num_classes,
                args.sparse,
                args.freeze_embed)
    criterion = nn.KLDivLoss()
    if args.cuda:
        model.cuda(), criterion.cuda()
    if args.optim == 'adam':
        optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, weight_decay=args.wd)
    elif args.optim == 'adagrad':
        optimizer = optim.Adagrad(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, weight_decay=args.wd)
    elif args.optim == 'sgd':
        optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, weight_decay=args.wd)
    metrics = Metrics(args.num_classes)

    # for words common to dataset vocab and GLOVE, use GLOVE vectors
    # for other words in dataset vocab, use random normal vectors
    emb_file = os.path.join(args.data, 'sick_embed.pth')
    if os.path.isfile(emb_file):
        emb = torch.load(emb_file)
    else:
        # load glove embeddings and vocab
        glove_vocab, glove_emb = load_word_vectors(os.path.join(args.glove, 'glove.840B.300d'))
        logger.debug('==> GLOVE vocabulary size: %d ' % glove_vocab.size())
        emb = torch.Tensor(vocab.size(), glove_emb.size(1)).normal_(-0.05, 0.05)
        # zero out the embeddings for padding and other special words if they are absent in vocab
        for idx, item in enumerate([Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD, Constants.EOS_WORD]):
            emb[idx].zero_()
        for word in vocab.labelToIdx.keys():
            if glove_vocab.getIndex(word):
                emb[vocab.getIndex(word)] = glove_emb[glove_vocab.getIndex(word)]
        torch.save(emb, emb_file)
    # plug these into embedding matrix inside model
    if args.cuda:
        emb = emb.cuda()
    model.emb.weight.data.copy_(emb)

    # create trainer object for training and testing
    trainer = Trainer(args, model, criterion, optimizer)

    best = -float('inf')
    for epoch in range(args.epochs):
        train_loss             = trainer.train(train_dataset)
        train_loss, train_pred = trainer.test(train_dataset)
        dev_loss, dev_pred     = trainer.test(dev_dataset)
        test_loss, test_pred   = trainer.test(test_dataset)

        train_pearson = metrics.pearson(train_pred, train_dataset.labels)
        train_mse = metrics.mse(train_pred, train_dataset.labels)
        logger.info('==> Epoch {}, Train \tLoss: {}\tPearson: {}\tMSE: {}'.format(epoch, train_loss, train_pearson, train_mse))
        dev_pearson = metrics.pearson(dev_pred, dev_dataset.labels)
        dev_mse = metrics.mse(dev_pred, dev_dataset.labels)
        logger.info('==> Epoch {}, Dev \tLoss: {}\tPearson: {}\tMSE: {}'.format(epoch, dev_loss, dev_pearson, dev_mse))
        test_pearson = metrics.pearson(test_pred, test_dataset.labels)
        test_mse = metrics.mse(test_pred, test_dataset.labels)
        logger.info('==> Epoch {}, Test \tLoss: {}\tPearson: {}\tMSE: {}'.format(epoch, test_loss, test_pearson, test_mse))

        if best < test_pearson:
            best = test_pearson
            checkpoint = {
                'model': trainer.model.state_dict(), 
                'optim': trainer.optimizer,
                'pearson': test_pearson, 'mse': test_mse,
                'args': args, 'epoch': epoch
                }
            logger.debug('==> New optimum found, checkpointing everything now...')
            torch.save(checkpoint, '%s.pt' % os.path.join(args.save, args.expname))
Example #26
def train(args: Dict):
    """ Train the NMT Model.
    @param args (Dict): args from cmd line
    """
    train_data_src = read_corpus(args['--train-src'], source='src')
    train_data_tgt = read_corpus(args['--train-tgt'], source='tgt')

    dev_data_src = read_corpus(args['--dev-src'], source='src')
    dev_data_tgt = read_corpus(args['--dev-tgt'], source='tgt')

    train_data = list(zip(train_data_src, train_data_tgt))
    dev_data = list(zip(dev_data_src, dev_data_tgt))

    train_batch_size = int(args['--batch-size'])

    clip_grad = float(args['--clip-grad'])
    valid_niter = int(args['--valid-niter'])
    log_every = int(args['--log-every'])
    model_save_path = args['--save-to']

    vocab = Vocab.load(args['--vocab'])

    model = NMT(embed_size=int(args['--embed-size']),
                hidden_size=int(args['--hidden-size']),
                dropout_rate=float(args['--dropout']),
                vocab=vocab,
                no_char_decoder=args['--no-char-decoder'])
    model.train()

    uniform_init = float(args['--uniform-init'])
    if np.abs(uniform_init) > 0.:
        print('uniformly initialize parameters [-%f, +%f]' %
              (uniform_init, uniform_init),
              file=sys.stderr)
        for p in model.parameters():
            p.data.uniform_(-uniform_init, uniform_init)

    vocab_mask = torch.ones(len(vocab.tgt))
    vocab_mask[vocab.tgt['<pad>']] = 0

    device = torch.device("cuda:0" if args['--cuda'] else "cpu")
    print('use device: %s' % device, file=sys.stderr)

    model = model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=float(args['--lr']))

    num_trial = 0
    train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0
    cum_examples = report_examples = epoch = valid_num = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()
    print('begin Maximum Likelihood training')

    while True:
        epoch += 1

        for src_sents, tgt_sents in batch_iter(train_data,
                                               batch_size=train_batch_size,
                                               shuffle=True):
            train_iter += 1

            optimizer.zero_grad()

            batch_size = len(src_sents)

            example_losses = -model(src_sents, tgt_sents)  # (batch_size,)
            batch_loss = example_losses.sum()
            loss = batch_loss / batch_size

            loss.backward()

            # clip gradient
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       clip_grad)

            optimizer.step()

            batch_losses_val = batch_loss.item()
            report_loss += batch_losses_val
            cum_loss += batch_losses_val

            tgt_words_num_to_predict = sum(
                len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
            report_tgt_words += tgt_words_num_to_predict
            cum_tgt_words += tgt_words_num_to_predict
            report_examples += batch_size
            cum_examples += batch_size

            if train_iter % log_every == 0:
                print(
                    'epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f '
                    'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec'
                    % (epoch, train_iter, report_loss / report_examples,
                       math.exp(report_loss / report_tgt_words), cum_examples,
                       report_tgt_words /
                       (time.time() - train_time), time.time() - begin_time),
                    file=sys.stderr)

                train_time = time.time()
                report_loss = report_tgt_words = report_examples = 0.

            # perform validation
            if train_iter % valid_niter == 0:
                print(
                    'epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d'
                    % (epoch, train_iter, cum_loss / cum_examples,
                       np.exp(cum_loss / cum_tgt_words), cum_examples),
                    file=sys.stderr)

                cum_loss = cum_examples = cum_tgt_words = 0.
                valid_num += 1

                print('begin validation ...', file=sys.stderr)

                # compute dev. ppl and bleu
                # dev batch size can be a bit larger
                dev_ppl = evaluate_ppl(model, dev_data, batch_size=128)
                valid_metric = -dev_ppl

                print('validation: iter %d, dev. ppl %f' %
                      (train_iter, dev_ppl),
                      file=sys.stderr)

                is_better = len(hist_valid_scores
                                ) == 0 or valid_metric > max(hist_valid_scores)
                hist_valid_scores.append(valid_metric)

                if is_better:
                    patience = 0
                    print('save currently the best model to [%s]' %
                          model_save_path,
                          file=sys.stderr)
                    model.save(model_save_path)

                    # also save the optimizers' state
                    torch.save(optimizer.state_dict(),
                               model_save_path + '.optim')
                elif patience < int(args['--patience']):
                    patience += 1
                    print('hit patience %d' % patience, file=sys.stderr)

                    if patience == int(args['--patience']):
                        num_trial += 1
                        print('hit #%d trial' % num_trial, file=sys.stderr)
                        if num_trial == int(args['--max-num-trial']):
                            print('early stop!', file=sys.stderr)
                            exit(0)

                        # decay lr, and restore from previously best checkpoint
                        lr = optimizer.param_groups[0]['lr'] * \
                            float(args['--lr-decay'])
                        print(
                            'load previously best model and decay learning rate to %f'
                            % lr,
                            file=sys.stderr)

                        # load model
                        params = torch.load(
                            model_save_path,
                            map_location=lambda storage, loc: storage)
                        model.load_state_dict(params['state_dict'])
                        model = model.to(device)

                        print('restore parameters of the optimizers',
                              file=sys.stderr)
                        optimizer.load_state_dict(
                            torch.load(model_save_path + '.optim'))

                        # set new lr
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr

                        # reset patience
                        patience = 0

            if epoch == int(args['--max-epoch']):
                print('reached maximum number of epochs!', file=sys.stderr)
                exit(0)
Example #27
        return preds

    def decode(self, code: List[int]):
        code = [int(c.detach().cpu().numpy()) for c in code]
        result = self.tokenizer.decode(code, remove_special_token=False)
        return result

    def encode(self, doc, maxlen):
        code = self.tokenizer.encode(doc)
        if len(code) > maxlen:
            code = code[:maxlen]
            code[-1] = self.tokenizer.eos_id
        elif len(code) < maxlen:
            pad_size = maxlen - len(code)
            code = code + pad_size * [self.tokenizer.pad_token_id]
        assert len(code) == maxlen
        return code


if __name__ == "__main__":

    # from pprint import pprint
    inf = Inference(tokenizer=Vocab.from_pretrained('./model/vocab.txt'),
                    model_path='model/28.ckpt',
                    config_file='model/config.json',
                    device='cpu')
    print(
        inf.infer([
            '國際中心/綜合報導英國薩福克郡32歲的巴萊塔(Becky[UNK]Barletta)在新婚不久後,被診斷出罹患失智症,是英國最年輕的失智症患者之一,壽命可能只剩5年。巴萊塔的父親難過的表示,據《每日郵報》報導,巴萊塔目前住在自己的娘家,因為她已經無法自理生活,需要家人全天候的照顧。巴萊塔2015年10月結婚,但在2016年性情大變,當年8月確診罹患「上額顳葉失智症」(Frontotemporal[UNK]dementia)。巴萊塔罹病後,外在的行為表現、情緒、社交及語言能力都受影響。事實上,巴萊塔原是一名滑雪教練,學生們都非常喜歡她,怎料結婚後突然改變,讓家人不能接受。據了解,巴萊塔的叔叔及母親的表弟都死於失智症,因此家人非常擔心她的狀況。巴萊塔的妹妹蘇菲(Sophie)難過的表示,其實姊姊在結婚前,,「她以前是個很棒的老師,尤其對孩子特別好,大家都很喜歡她。」專家表示,若是巴萊塔的病情持續惡化,未來就連吃飯、說話都會有問題,甚至活不過10年。蘇菲補充,「姊姊會突然向街上的人說話,問他們能不能發出些好笑的聲音。大家都不明白為什麼她的外表看起來如此正常,卻會對人如此沒禮貌。」蘇菲目前正在向各方發起募款活動,希望能夠讓外界更'
        ]))
Example #28
class Sent2Vec(object):
    def __init__(self, path="", vec_size=50, k=20, alpha=0.1, n_workers=1):
        '''
        :parameters:

            @path: string
                path to dataset, should be a single file

            @vec_size: int
                size of sentence vector and word vector

            @k: int 
                number of negative samples for a window

            @alpha: float
                learning rate
        '''
        self.k = k
        self.vec_size = vec_size
        self.n_workers = n_workers
        self.alpha = alpha
        self.vocab = Vocab()
        self.sent = Sent()
        self.window_table = WindowTable(self.vocab, SIZE)
        self.dataset = Dataset(path)

        if path:
            self.create_vocab()
            self.create_sent()
            self.create_window_table()

    def create_vocab(self):
        for sent in self.dataset.sents:
            sent = sent.split()
            self.vocab.add_from_sent(sent)
        self.vocab.init_vecs()

    def create_sent(self):
        for sent in self.dataset.sents:
            self.sent.add(sent)
        self.sent.init_vecs()

    def create_window_table(self):
        '''
        for negative sampling
        '''
        self.window_table(self.dataset.sents)

    
    def multi_thread_train(self):
        '''
        use mini-batch to train
        '''
        jobs = Queue(maxsize=9 * self.n_workers)
        lock = threading.Lock()

        start, next_report = time.time(), [1.0]

        self.Js = []

        def worker_train():
            while True:
                # get sentence
                sent = jobs.get()
                if sent is None:
                    break

                Jn = self.train_sent(sent, lock)
                self.Js.append(Jn)

        workers = [threading.Thread(target=worker_train) for _ in xrange(self.n_workers)]

        for thread in workers:
            thread.daemon = True  # make interrupting the process with ctrl+c easier
            thread.start()
        # put dataset to Queue
        for sent in self.dataset.sents:
            jobs.put(sent)
        # put None to tell all threads to exit
        for _ in xrange(self.n_workers):
            jobs.put(None)  

        for thread in workers:
            thread.join()
        print 'Js: ', np.mean(self.Js)
        elapsed = time.time() - start
        print 'used time', elapsed

    def train(self):
        '''
        single-threaded training over every sentence in the dataset
        '''
        Js = []
        for no, sent in enumerate(self.dataset.sents):
            Jn = self.train_sent(sent)
            Js.append(Jn)
        # calculate Jn for this sentence
        mean_Js = np.mean( np.array(Js))
        print 'total J', mean_Js
        return mean_Js

    def train_sent(self, sent, lock=None):
        # the loss
        Jn = 0
        #print no, 
        #print 'training sent: ', no, sent
        # get windows from the sent
        windows = gen_windows_from_sentence(sent, SIZE)
        #print 'gen windows', windows
        # get sentence vector
        v = self.sent[sent]
        
        for wn, window in enumerate(windows):
            #print '.', 
            #assert( type(window) == type([]), "window is %s" % str(window))
            #print 'window', window
            window_key = "-".join([str(self.vocab.vocab[hash(w)]) for w in window])
            h = self.vocab.get_window_vec(word_index=window_key)
            # noises
            noises = self.window_table.get_samples(self.k)
            #n_hs = [self.vocab.get_window_vec(s[1]) for s in noises ]
            # for a positive sample
            #print 'h:', h
            #print 'v:', v
            e_vT_h = np.e**np.dot(v.T, h)
            #print "dot(v,h)", np.dot(v, h)
            #print "e_vT_h", e_vT_h
            #sys.exit(0);
            update_v = h / (1 + e_vT_h)
            update_h = v / (1 + e_vT_h)
            # add positive window's loss
            Jn += math.log( 1 / ( 1 + 1/e_vT_h))

            self.update_window(window_key, update_h, lock)
            # for each negative window sample
            for idx, key in noises:
                n_h = self.vocab.get_window_vec(word_index=key)
                e_vT_h = np.e ** np.dot(v, n_h)
                frac_e_v_h = 1 - \
                        1 / (1 + e_vT_h)
                # accumulate the gradient
                update_v += - n_h * frac_e_v_h
                update_n_h = - v * frac_e_v_h
                self.update_window(key, update_n_h, lock)
                # add noise's loss
                Jn += math.log( 1/ (1+e_vT_h))

            update_v /= ( 1 + self.k)
            # update sentence vector for each window
            # TODO change to a single turn?
            self.update_sent_vec(v, update_v, lock)
            # add loss to total Jn
        #print 
        return Jn

    def update_sent_vec(self, sent_vec, grad, lock=None):
        if lock:
            with lock:
                sent_vec += self.alpha * grad
                sent_vec /= LA.norm(sent_vec)
        else:
            sent_vec += self.alpha * grad
            sent_vec /= LA.norm(sent_vec)

    def update_window(self, key, grad, lock=None):
        '''
        update each word's vector in a window
            and norm the vectors

        :parameters:
            @key: string
                like '19-32-2'
            @grad: numpy.array
                the gradient
        '''
        word_ids = [int(id) for id in key.split('-')]
        for id in word_ids:
            word_vec = self.vocab.vecs[id]
            if lock:
                with lock:
                    word_vec += self.alpha * grad
                    word_vec /= LA.norm(word_vec)
            else:
                word_vec += self.alpha * grad
                word_vec /= LA.norm(word_vec)


    def tofile(self, path):
        '''
        save model to file
        '''
        mod2file(self, path)

    @staticmethod
    def fromfile(path):
        return mod_from_file(path)
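
A hedged usage sketch for the class above; the corpus path and hyper-parameters are placeholders, and it assumes the surrounding module provides Dataset, Vocab, Sent, WindowTable, SIZE and the mod2file/mod_from_file helpers exactly as referenced in the class:

# hypothetical driver for Sent2Vec as defined above (paths and settings are made up)
if __name__ == '__main__':
    model = Sent2Vec(path='corpus.txt',       # assumed format: one sentence per line
                     vec_size=50, k=20, alpha=0.1, n_workers=4)
    for epoch in range(5):
        mean_J = model.train()                # or model.multi_thread_train() for threads
    model.tofile('sent2vec.model')            # reload later with Sent2Vec.fromfile(...)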
Ejemplo n.º 29
0
def train():
    embed = None
    if args.embed_path is not None and os.path.exists(args.embed_path):
        print('Loading pretrained word embedding...')
        embed = {}
        with open(args.embed_path, 'r') as f:
            f.readline()
            for line in f.readlines():
                line = line.strip().split()
                vec = [float(_) for _ in line[1:]]
                embed[line[0]] = vec
    vocab = Vocab(args, embed)
    print('Loading datasets...')
    train_data, val_data, test_data = [], [], []
    fns = os.listdir(args.train_dir)
    fns.sort(key=lambda p: int(p.split('.')[0]))
    for fn in tqdm(fns):
        f = open(args.train_dir + fn, 'r')
        train_data.append(json.load(f))
        f.close()
        vocab.add_sentence(train_data[-1]['reviewText'].split())
        vocab.add_sentence(train_data[-1]['summary'].split())
        vocab.add_user(train_data[-1]['userID'])
        vocab.add_product(train_data[-1]['productID'])
    fns = os.listdir(args.valid_dir)
    fns.sort(key=lambda p: int(p.split('.')[0]))
    for fn in tqdm(fns):
        f = open(args.valid_dir + fn, 'r')
        val_data.append(json.load(f))
        f.close()
        vocab.add_sentence(val_data[-1]['reviewText'].split())
        vocab.add_sentence(val_data[-1]['summary'].split())
        vocab.add_user(val_data[-1]['userID'])
        vocab.add_product(val_data[-1]['productID'])
    fns = os.listdir(args.test_dir)
    fns.sort(key=lambda p: int(p.split('.')[0]))
    for fn in tqdm(fns):
        f = open(args.test_dir + fn, 'r')
        test_data.append(json.load(f))
        f.close()
        vocab.add_sentence(test_data[-1]['reviewText'].split())
        vocab.add_sentence(test_data[-1]['summary'].split())
        vocab.add_user(test_data[-1]['userID'])
        vocab.add_product(test_data[-1]['productID'])

    print('Deleting rare words...')
    embed = vocab.trim()

    args.embed_num = len(embed)
    args.embed_dim = len(embed[0])

    train_dataset = Dataset(train_data)
    val_dataset = Dataset(val_data)
    train_iter = DataLoader(dataset=train_dataset,
                            batch_size=args.batch_size,
                            shuffle=True)
    val_iter = DataLoader(dataset=val_dataset,
                          batch_size=args.batch_size,
                          shuffle=False)

    net = EncoderDecoder(args, embed)
    if args.use_cuda:
        net.cuda()
    criterion = nn.NLLLoss(ignore_index=vocab.PAD_IDX, reduction='sum')
    optim = torch.optim.Adam(net.parameters(), lr=args.lr)
    print('Begin training...')
    for epoch in range(1, args.epochs + 1):
        if epoch >= args.lr_decay_start:
            adjust_learning_rate(optim, epoch - args.lr_decay_start + 1)
        for i, batch in enumerate(train_iter):
            src, trg, src_embed, trg_embed, src_mask, src_lens, trg_lens, _1, _2 = vocab.read_batch(
                batch)
            output = net(src, trg, src_embed, trg_embed, vocab.word_num,
                         src_mask, src_lens, trg_lens)
            output = torch.log(output.view(-1, output.size(-1)) + 1e-20)
            trg_output = trg.view(-1)
            loss = criterion(output, trg_output) / len(src_lens)
            loss.backward()
            clip_grad_norm_(net.parameters(), args.max_norm)
            optim.step()
            optim.zero_grad()
            cnt = (epoch - 1) * len(train_iter) + i

            if cnt % args.print_every == 0:
                print('EPOCH [%d/%d]: BATCH_ID=[%d/%d] loss=%f' %
                      (epoch, args.epochs, i, len(train_iter), loss.data))

            if cnt % args.valid_every == 0 and cnt / args.valid_every >= 0:
                print('Begin valid... Epoch %d, Batch %d' % (epoch, i))
                cur_loss, r1, r2, rl = evaluate(net, criterion, vocab,
                                                val_iter, True)
                save_path = args.save_path + 'valid_%d_%.4f_%.4f_%.4f_%.4f' % (
                    cnt / args.valid_every, cur_loss, r1, r2, rl)
                net.save(save_path)
                print(
                    'Epoch: %2d Cur_Val_Loss: %f Rouge-1: %f Rouge-2: %f Rouge-l: %f'
                    % (epoch, cur_loss, r1, r2, rl))

    return
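
The loss above sums token-level NLL while skipping padded positions (ignore_index) and then divides by the batch size; a small self-contained illustration of that pattern with made-up shapes and a pad index of 0:

import torch
import torch.nn as nn

PAD_IDX = 0  # assumed padding id; in the script it is vocab.PAD_IDX
criterion = nn.NLLLoss(ignore_index=PAD_IDX, reduction='sum')

# 6 flattened target positions over a vocabulary of 10 (shapes are invented)
log_probs = torch.log_softmax(torch.randn(6, 10), dim=-1)
targets = torch.tensor([4, 2, 0, 7, 0, 1])         # the zeros are padding and contribute nothing
batch_size = 2
loss = criterion(log_probs, targets) / batch_size  # sum over real tokens, averaged per example
print(loss.item())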
Ejemplo n.º 30
0
import logging
import os
from argparse import ArgumentParser

from vocab import Vocab
from lmdataset import LMDataset
from lm import NeuralLM
from deepy.dataset import SequentialMiniBatches
from deepy.trainers import SGDTrainer, LearningRateAnnealer
from deepy.layers import RNN, Dense


logging.basicConfig(level=logging.INFO)

resource_dir = os.path.abspath(os.path.dirname(__file__)) + os.sep + "resources"

vocab_path = os.path.join(resource_dir, "ptb.train.txt")
train_path = os.path.join(resource_dir, "ptb.train.txt")
valid_path = os.path.join(resource_dir, "ptb.valid.txt")
vocab = Vocab(char_based=True)
vocab.load(vocab_path, max_size=1000)

model = NeuralLM(input_dim=vocab.size, input_tensor=3)
model.stack(
    RNN(hidden_size=100, output_type="sequence"),
    RNN(hidden_size=100, output_type="sequence"),
    Dense(vocab.size, "softmax"),
)


if __name__ == "__main__":
    ap = ArgumentParser()
    ap.add_argument("--model", default=os.path.join(os.path.dirname(__file__), "models", "char_rnn_model1.gz"))
    ap.add_argument("--sample", default="")
    args = ap.parse_args()
Ejemplo n.º 31
0
    return strip_eos(sents)


def calc_ppl(sents, m):
    batches, _ = get_batches(sents, vocab, args.batch_size, device)
    total_nll = 0
    with torch.no_grad():
        for inputs, targets in batches:
            total_nll += model.nll_is(inputs, targets, m).sum().item()
    n_words = sum(len(s) + 1 for s in sents)  # include <eos>
    return total_nll / len(sents), np.exp(total_nll / n_words)


if __name__ == '__main__':
    args = parser.parse_args()
    vocab = Vocab(os.path.join(args.checkpoint, 'vocab.txt'))
    set_seed(args.seed)
    cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device("cuda" if cuda else "cpu")
    model = get_model(os.path.join(args.checkpoint, 'model.pt'))

    if args.evaluate:
        sents = load_sent(args.data)
        batches, _ = get_batches(sents, vocab, args.batch_size, device)
        meters = evaluate(model, batches)
        print(' '.join([
            '{} {:.2f},'.format(k, meter.avg) for k, meter in meters.items()
        ]))

    if args.ppl:
        sents = load_sent(args.data)
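
calc_ppl above returns the mean per-sentence NLL and the exponentiated per-word NLL; a toy numeric sketch of that perplexity computation (the NLL values are invented):

import numpy as np

# invented per-sentence negative log-likelihoods and sentence lengths
sent_nlls = [12.0, 20.0, 8.0]
sent_lens = [3, 5, 2]

total_nll = sum(sent_nlls)
n_words = sum(l + 1 for l in sent_lens)   # +1 per sentence for <eos>, as in calc_ppl
nll_per_sent = total_nll / len(sent_nlls)
ppl = np.exp(total_nll / n_words)         # perplexity = exp(average NLL per word)
print(nll_per_sent, ppl)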
Ejemplo n.º 32
0
parser.add_argument('--hidden-dim', default=50, type=int,
                    help='hidden node dimensionality')
parser.add_argument('--l2-penalty', default=0.0001, type=float,
                    help='l2 penalty for params')
parser.add_argument('--gru-initial-bias', default=2, type=int,
                    help='initial gru bias for r & z. higher => more like SimpleRnn')
opts = parser.parse_args()
print >>sys.stderr, opts

NUM_LABELS = 3

def log(s):
    print >>sys.stderr, util.dts(), s

# slurp training data, including converting of tokens -> ids
vocab = Vocab()
train_x, train_y, train_stats = util.load_data(opts.train_set, vocab,
                                               update_vocab=True,
                                               max_egs=int(opts.num_from_train))
log("train_stats %s %s" % (len(train_x), train_stats))
dev_x, dev_y, dev_stats = util.load_data(opts.dev_set, vocab,
                                         update_vocab=False,
                                         max_egs=int(opts.num_from_dev))
log("dev_stats %s %s" % (len(dev_x), dev_stats))

# input/output example vars
s1_idxs = T.ivector('s1')  # sequence for sentence one
s2_idxs = T.ivector('s2')  # sequence for sentence two
actual_y = T.ivector('y')  # single for sentence pair label; 0, 1 or 2

# keep track of different "layers" that handle their own gradients.
Ejemplo n.º 33
0
def run(test_dir, 
    test_srcs,
    test_src_caps,
    checkpoint, 
    vocab_src,
    vocab_tgt, 
    out="captions.out.txt",
    batch_size=16, 
    max_seq_len=MAX_LEN,
    hidden_dim=HIDDEN_DIM,
    emb_dim=EMB_DIM,
    enc_seq_len=ENC_SEQ_LEN,
    enc_dim=ENC_DIM,
    attn_activation="relu",
    deep_out=False,
    decoder=2,
    attention=3):

    if decoder == 1:
        decoder = mmt.AttentionDecoder_1
    elif decoder == 2:
        decoder = mmt.AttentionDecoder_2
    elif decoder == 3:
        decoder = mmt.AttentionDecoder_3
    elif decoder == 4:
        decoder = mmt.AttentionDecoder_4

    if attention == 1:
        attention = attentions.AdditiveAttention
    elif attention == 2:
        attention = attentions.GeneralAttention
    elif attention == 3:
        attention = attentions.ScaledGeneralAttention

    # load vocabulary
    vocabulary_src = Vocab()
    vocabulary_src.load(vocab_src)

    vocabulary_tgt = Vocab()
    vocabulary_tgt.load(vocab_tgt)

    # load test instances file paths
    srcs = open(test_srcs).read().strip().split('\n')
    srcs = [os.path.join(test_dir, s) for s in srcs]

    src_caps = open(test_src_caps, encoding='utf-8').read().strip().split('\n')


    # load model
    net = MMTNetwork(
        src_emb_dim=emb_dim,
        tgt_emb_dim=emb_dim,
        enc_dim=hidden_dim,
        dec_dim=hidden_dim,
        src_dim=vocabulary_src.n_words, 
        out_dim=vocabulary_tgt.n_words,
        img_attn_dim=512,
        src_cap_attn_dim=512,
        sos_token=0, eos_token=1, pad_token=2,
        max_seq_len=max_seq_len,
        deep_out=deep_out,
        attention=attention, decoder=decoder)
    net.to(DEVICE)

    net.load_state_dict(torch.load(checkpoint))
   
    net.eval()

    with torch.no_grad():

        # run inference
        num_instances = len(srcs)

        i = 0
        captions = []
        while i < num_instances:
            srcs_batch = srcs[i:i + batch_size]
            batch = _load_batch(srcs_batch)
            batch = batch.to(DEVICE)

            caps_in = src_caps[i:i + batch_size]
            caps_in = [vocabulary_src.sentence_to_tensor(y, max_seq_len) for y in caps_in]
            caps_in = torch.stack(caps_in, dim=0)
            caps_in = caps_in.permute(1, 0, 2)
            caps_in = caps_in.to(DEVICE)

            tokens, _ = net(source_captions=caps_in,
                image_features=batch, 
                targets=None, 
                max_len=max_seq_len)
            
            tokens = tokens.permute(1, 0, 2).detach()
            _, topi = tokens.topk(1, dim=2)
            topi = topi.squeeze(2)

            # decode token output from the model
            for j in range(len(srcs_batch)):
                c = vocabulary_tgt.tensor_to_sentence(topi[j])
                c = ' '.join(c)
                captions.append(c)

            i += len(srcs_batch)

    out_f = open(out, mode='w', encoding='utf-8')
    for c in captions:
        out_f.write(c + '\n')

    return
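
A hypothetical invocation of run() above; every path and file name is a placeholder, and the decoder/attention ids follow the mapping at the top of the function:

# hypothetical call; all paths below are placeholders
if __name__ == '__main__':
    run(test_dir='data/test_images',
        test_srcs='data/test_images.txt',             # one image file name per line
        test_src_caps='data/test_source_captions.txt',
        checkpoint='checkpoints/mmt_best.pt',
        vocab_src='vocab/src.vocab',
        vocab_tgt='vocab/tgt.vocab',
        out='captions.out.txt',
        batch_size=16,
        decoder=2,                                    # mmt.AttentionDecoder_2
        attention=3)                                  # attentions.ScaledGeneralAttention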
Ejemplo n.º 34
0
    def preprocess_data(self):
        print('Preprocessing data')
        raw_data = json.loads(open(self.raw_data_path).read().lower())
        db_data = self.db_json
        sw_ent, mw_ent = self._value_key_map(db_data)
        vocab = Vocab(cfg.vocab_size, self.otlg.special_tokens)
        # delexicalization
        dialogs = {}
        for dial_id, dial in enumerate(raw_data):
            dialogs[dial_id] = {}
            dialogs[dial_id]['goal'] = dial['goal']
            turns = []
            for turn in dial['dial']:
                turn_num = turn['turn']
                constraint = dict((slot, []) for slot in self.informable_slots)
                constraint_flat, user_request, sys_request = [], [], []
                for slot_values in turn['usr']['slu']:
                    if slot_values['act'] == 'inform':
                        slot, value = slot_values['slots'][0][0], slot_values['slots'][0][1]
                        slot = 'restaurant-' + slot
                        if slot != 'restaurant-slot' and value not in ['dontcare', 'none']:
                            constraint[slot].extend(self.word_tokenize(value))
                            constraint_flat.extend(self.word_tokenize(value))
                        if value == 'dontcare':
                            constraint[slot].extend(['dontcare'])
                            constraint_flat.extend(['dontcare'])
                    elif slot_values['act'] == 'request':
                        user_request.append('[value_%s]'%slot_values['slots'][0][1])
                            # constraint[slot].extend(['do', "n't", 'care'])
                if turn['sys']['da']:
                    for s in turn['sys']['da']:
                        s = ['price', 'range'] if s == 'pricerange' else [s]
                        if s == [["area, centre"]]:
                            s = ['area']
                        sys_request.extend(s)
                user = self.word_tokenize(turn['usr']['transcript'])
                resp = ' '.join(self.word_tokenize(turn['sys']['sent']))
                resp = self._replace_entity(resp, sw_ent, mw_ent, constraint_flat)
                resp = resp.replace('[value_phone].', '[value_phone] .').replace('ok.', 'ok .')
                resp = resp.split()
                # try:
                turns.append({
                    'turn': turn_num,
                    'user': ' '.join(user),
                    'response': ' '.join(resp),
                    'constraint': json.dumps(constraint),
                    'user_request': ' '.join(user_request),
                    'sys_request': ' '.join(sys_request),
                    'db_match': len(self.db_json_search(constraint)),
                })
                for word in user + resp:
                    vocab.add_word(word)
            dialogs[dial_id]['log'] = turns

        # save preprocessed data
        with open(self.data_path, 'w') as f:
            json.dump(dialogs, f, indent=2)

        # construct vocabulary
        vocab.construct()
        vocab.save_vocab(self.dataset_path + 'vocab')
        return dialogs
Ejemplo n.º 35
0
def prepare(args):
    """
    checks the data, creates the directories, and prepares the vocabulary and embeddings
    """
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(
            data_path)

    for dir_path in [args.vocab_dir, args.model_dir]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.train_files)
    vocab = Vocab(lower=True)
    for word in brc_data.word_iter('train'):
        vocab.add(word)

    # unfiltered_vocab_size = vocab.size()
    print("vocab size is ", vocab.size())
    vocab.filter_tokens_by_cnt(min_cnt=2)
    print("after filtered vocab size is ", vocab.size())
    # filtered_num = unfiltered_vocab_size - vocab.size()

    vocab.randomly_init_embeddings(args.embed_size)
    if args.use_pre_train:
        vocab.load_pretrained_embeddings(args.pre_train_file)

    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)
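
filter_tokens_by_cnt(min_cnt=2) above drops tokens seen fewer than min_cnt times before the embeddings are initialised; a minimal sketch of that kind of count-based pruning, assuming nothing about the real Vocab internals:

from collections import Counter

# minimal sketch of count-based vocabulary pruning (not the project's Vocab class)
def filter_tokens_by_cnt(token_counts, min_cnt=2, specials=('<pad>', '<unk>')):
    kept = [t for t, c in token_counts.items() if c >= min_cnt]
    return list(specials) + sorted(kept)

counts = Counter('the cat sat on the mat the end'.split())
print(filter_tokens_by_cnt(counts, min_cnt=2))   # only 'the' survives the count threshold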
Ejemplo n.º 36
0
def main():
    global args
    args = parse_args()
    # global logger
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)
    formatter = logging.Formatter("[%(asctime)s] %(levelname)s:%(name)s:%(message)s")
    # file logger
    fh = logging.FileHandler(os.path.join(args.save, args.expname)+'.log', mode='w')
    fh.setLevel(logging.INFO)
    fh.setFormatter(formatter)
    logger.addHandler(fh)
    # console logger
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    # argument validation
    args.cuda = args.cuda and torch.cuda.is_available()
    if args.sparse and args.wd!=0:
        logger.error('Sparsity and weight decay are incompatible, pick one!')
        exit()
    logger.debug(args)
    #torch.manual_seed(args.seed)
    #random.seed(args.seed)
    if args.cuda:
        #torch.cuda.manual_seed(args.seed)
        torch.backends.cudnn.benchmark = True
    if not os.path.exists(args.save):
        os.makedirs(args.save)

    train_dir = os.path.join(args.data,'train/')
    dev_dir = os.path.join(args.data,'dev/')
    test_dir = os.path.join(args.data,'test/')

    # write unique words from all token files
    sick_vocab_file = os.path.join(args.data,'sick.vocab')
    if not os.path.isfile(sick_vocab_file):
        token_files_a = [os.path.join(split,'a.toks') for split in [train_dir,dev_dir,test_dir]]
        token_files_b = [os.path.join(split,'b.toks') for split in [train_dir,dev_dir,test_dir]]
        token_files = token_files_a+token_files_b
        sick_vocab_file = os.path.join(args.data,'sick.vocab')
        build_vocab(token_files, sick_vocab_file)

    # get vocab object from vocab file previously written
    vocab = Vocab(filename=sick_vocab_file, data=[Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD, Constants.EOS_WORD])
    logger.debug('==> SICK vocabulary size : %d ' % vocab.size())

    # load SICK dataset splits
    train_file = os.path.join(args.data,'sick_train.pth')
    if os.path.isfile(train_file):
        train_dataset = torch.load(train_file)
    else:
        train_dataset = SICKDataset(train_dir, vocab, args.num_classes)
        torch.save(train_dataset, train_file)
    logger.debug('==> Size of train data   : %d ' % len(train_dataset))
    dev_file = os.path.join(args.data,'sick_dev.pth')
    if os.path.isfile(dev_file):
        dev_dataset = torch.load(dev_file)
    else:
        dev_dataset = SICKDataset(dev_dir, vocab, args.num_classes)
        torch.save(dev_dataset, dev_file)
    logger.debug('==> Size of dev data     : %d ' % len(dev_dataset))
    test_file = os.path.join(args.data,'sick_test.pth')
    if os.path.isfile(test_file):
        test_dataset = torch.load(test_file)
    else:
        test_dataset = SICKDataset(test_dir, vocab, args.num_classes)
        torch.save(test_dataset, test_file)
    logger.debug('==> Size of test data    : %d ' % len(test_dataset))

    # initialize model, criterion/loss_function, optimizer
    model = SimilarityTreeLSTM(
                vocab.size(),
                args.input_dim,
                args.mem_dim,
                args.hidden_dim,
                args.num_classes,
                args.sparse)
    criterion = nn.KLDivLoss()
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    if args.cuda:
        model.cuda(), criterion.cuda()
    if args.optim=='adam':
        optimizer   = optim.Adam(parameters, lr=args.lr, weight_decay=args.wd)
    elif args.optim=='adagrad':
        optimizer   = optim.Adagrad(parameters, lr=args.lr, weight_decay=args.wd)
    elif args.optim=='sgd':
        optimizer   = optim.SGD(parameters, lr=args.lr, weight_decay=args.wd)
    elif args.optim == 'adadelta':
        optimizer = optim.Adadelta(parameters, lr=args.lr, weight_decay=args.wd)
    metrics = Metrics(args.num_classes)

    # for words common to dataset vocab and GLOVE, use GLOVE vectors
    # for other words in dataset vocab, use random normal vectors

    emb_file = os.path.join(args.data, 'sick_embed.pth')
    if os.path.isfile(emb_file):
        emb = torch.load(emb_file)
    else:
        # load glove embeddings and vocab
        glove_vocab, glove_emb = load_word_vectors(os.path.join(args.glove, 'glove.840B.300d'))
        logger.debug('==> GLOVE vocabulary size: %d ' % glove_vocab.size())
        emb = torch.Tensor(vocab.size(), glove_emb.size(1)).normal_(-0.05, 0.05)
        # zero out the embeddings for padding and other special words if they are absent in vocab
        for idx, item in enumerate([Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD, Constants.EOS_WORD]):
            emb[idx].zero_()
        for word in vocab.labelToIdx.keys():
            if glove_vocab.getIndex(word):
                emb[vocab.getIndex(word)] = glove_emb[glove_vocab.getIndex(word)]
        torch.save(emb, emb_file)
        # plug these into embedding matrix inside model

    if args.cuda:
        emb = emb.cuda()
    model.emb.weight.data.copy_(emb)

    # create trainer object for training and testing
    trainer = Trainer(args, model, criterion, optimizer)

    best = -float('inf')
    for epoch in range(args.epochs):
        train_loss             = trainer.train(train_dataset)
        train_loss, train_pred,act_idxs_train = trainer.test(train_dataset)
        dev_loss, dev_pred,act_idxs_dev     = trainer.test(dev_dataset)
        test_loss, test_pred,act_idxs_test   = trainer.test(test_dataset)

        train_pred = torch.index_select(train_pred,0,act_idxs_train)
        train_dataset_labels = torch.index_select(train_dataset.labels,0,act_idxs_train)
        train_pearson = metrics.pearson(train_pred,train_dataset_labels)
        train_spearmann = metrics.spearmann(train_pred, train_dataset_labels)
        train_mse = metrics.mse(train_pred,train_dataset_labels)
        logger.info('==> Epoch {}, Train \tLoss: {}\tPearson: {}\tSpearman: {}\tMSE: {}'.format(epoch, train_loss, train_pearson, train_spearmann, train_mse))

        dev_pred = torch.index_select(dev_pred, 0, act_idxs_dev)
        dev_dataset_labels = torch.index_select(dev_dataset.labels, 0, act_idxs_dev)
        dev_pearson = metrics.pearson(dev_pred,dev_dataset_labels)
        dev_spearmann = metrics.spearmann(dev_pred, dev_dataset_labels)
        dev_mse = metrics.mse(dev_pred,dev_dataset_labels)
        logger.info('==> Epoch {}, Dev \tLoss: {}\tPearson: {}\tSpearman: {}\tMSE: {}'.format(epoch, dev_loss, dev_pearson, dev_spearmann, dev_mse))

        test_pred = torch.index_select(test_pred, 0, act_idxs_test)
        test_dataset_labels = torch.index_select(test_dataset.labels, 0, act_idxs_test)
        test_pearson = metrics.pearson(test_pred,test_dataset_labels)
        test_spearmann = metrics.spearmann(test_pred, test_dataset_labels)
        test_mse = metrics.mse(test_pred,test_dataset_labels)
        logger.info('==> Epoch {}, Test \tLoss: {}\tPearson: {}\tSpearman: {}\tMSE: {}'.format(epoch, test_loss, test_pearson, test_spearmann,test_mse))

        if best < test_pearson:
            best = test_pearson
            checkpoint = {'model': trainer.model.state_dict(), 'optim': trainer.optimizer,
                          'pearson': test_pearson, 'mse': test_mse,
                          'args': args, 'epoch': epoch }
            logger.debug('==> New optimum found, checkpointing everything now...')
            torch.save(checkpoint, '%s.pt' % os.path.join(args.save, args.expname))
            np.savetxt("test_pred.csv", test_pred.numpy(), delimiter=",")
Ejemplo n.º 37
0
  def __init__(self, model, *args, **kwargs):
    """"""
    if args:
      if len(args) > 1:
        raise TypeError('Parser takes at most one argument')
    
    kwargs['name'] = kwargs.pop('name', model.__name__)
    super(Network, self).__init__(*args, **kwargs)
    if not os.path.isdir(self.save_dir):
      os.mkdir(self.save_dir)
    with open(os.path.join(self.save_dir, 'config.cfg'), 'w') as f:
      self._config.write(f)

    # objectives = ['pos_loss', 'trigger_loss', 'actual_parse_loss', 'srl_loss', 'multitask_loss_sum']
    # self._global_steps = {o: tf.Variable(0., trainable=False) for o in objectives}
    self._global_step = tf.Variable(0., trainable=False)
    self._global_epoch = tf.Variable(0., trainable=False)

    # todo what is this??
    # self._model = model(self._config, global_step=self.global_step)
    self._model = model(self._config)

    self._vocabs = []

    if self.conll:
      vocab_files = [(self.word_file, 1, 'Words'),
                     (self.tag_file, [3, 4], 'Tags'),
                     (self.rel_file, 7, 'Rels')]
    elif self.conll2012:
      vocab_files = [(self.word_file, 3, 'Words'),
                     (self.tag_file, [5, 4], 'Tags'), # auto, gold
                     (self.rel_file, 7, 'Rels'),
                     (self.srl_file, range(14, 50), 'SRLs'),
                     (self.trig_file, [10, 4] if self.joint_pos_predicates else 10, 'Trigs'),
                     (self.domain_file, 0, 'Domains')]

    print("Loading vocabs")
    sys.stdout.flush()
    for i, (vocab_file, index, name) in enumerate(vocab_files):
      vocab = Vocab(vocab_file, index, self._config,
                    name=name,
                    cased=self.cased if not i else True,
                    use_pretrained=(not i))
      self._vocabs.append(vocab)

    print("Predicates vocab: ")
    for l, i in sorted(self._vocabs[4].iteritems(), key=operator.itemgetter(1)):
      print("%s: %d" % (l, i))
    print("predicate_true_start_idx", self._vocabs[4].predicate_true_start_idx)

    print("Loading data")
    sys.stdout.flush()
    self._trainset = Dataset(self.train_file, self._vocabs, model, self._config, name='Trainset')
    self._validset = Dataset(self.valid_file, self._vocabs, model, self._config, name='Validset')
    self._testset = Dataset(self.test_file, self._vocabs, model, self._config, name='Testset')

    self._ops = self._gen_ops()
    self._save_vars = filter(lambda x: u'Pretrained' not in x.name, tf.global_variables())
    self.history = {
      'train_loss': [],
      'train_accuracy': [],
      'valid_loss': [],
      'valid_accuracy': [],
      'test_acuracy': 0
    }
    return
Ejemplo n.º 38
0
def prepro(args):
    logger = logging.getLogger("QANet")
    logger.info("====== preprocessing ======")
    logger.info('Checking the data files...')
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(data_path)

    logger.info('Preparing the directories...')
    for dir_path in [args.vocab_dir, args.model_dir, args.result_dir, args.summary_dir]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

    logger.info('Building vocabulary...')
    dataloader = DataLoader(args.max_p_num, args.max_p_len, args.max_q_len, args.max_ch_len, 
                          args.train_files, args.dev_files, args.test_files)

    vocab = Vocab(lower=True)
    for word in dataloader.word_iter('train'):
        vocab.add_word(word)
        [vocab.add_char(ch) for ch in word]

    unfiltered_vocab_size = vocab.word_size()
    vocab.filter_words_by_cnt(min_cnt=2)
    filtered_num = unfiltered_vocab_size - vocab.word_size()
    logger.info('After filter {} tokens, the final vocab size is {}, char size is {}'.format(filtered_num,
                                                                            vocab.word_size(), vocab.char_size()))

    unfiltered_vocab_char_size = vocab.char_size()
    vocab.filter_chars_by_cnt(min_cnt=2)
    filtered_char_num = unfiltered_vocab_char_size - vocab.char_size()
    logger.info('After filter {} chars, the final char vocab size is {}'.format(filtered_char_num,
                                                                            vocab.char_size()))

    logger.info('Assigning embeddings...')
    if args.pretrained_word_path is not None:
        vocab.load_pretrained_word_embeddings(args.pretrained_word_path)
    else:
        vocab.randomly_init_word_embeddings(args.word_embed_size)
    
    if args.pretrained_char_path is not None:
        vocab.load_pretrained_char_embeddings(args.pretrained_char_path)
    else:
        vocab.randomly_init_char_embeddings(args.char_embed_size)

    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)

    logger.info('====== Done with preparing! ======')
Ejemplo n.º 39
0
class Captcha:
    '''
    size: width, height in pixel
    font: font family(string), size (unit pound) and font color (in "#rrggbb" format)
    bgcolor: in "#rrggbb" format
    '''
    def __init__(self, size, font, bgcolor, length = 4):
        #todo: add param check and transform here
        path = r'D:\AI\font\arial\Arial.ttf'
        self.width, self.height = size
        self.font_family, self.font_size, self.font_color = font
        self.bgcolor = bgcolor
        self.len = length
        self.vocab = Vocab()
        self.font = ImageFont.truetype(path, self.font_size)
    def get_text(self):
        return self.vocab.rand_string(self.len)
    # by default, draw center align text
    def draw_text(self, str):
        dr = ImageDraw.Draw(self.im)
        font_width, font_height = self.font.getsize(str)
        # not sure why, but for this font dividing by 3 centres the text better than the usual division by 2
        dr.text(((self.width - font_width) / 3, (self.height - font_height) / 3), str, fill = self.font_color, font = self.font)
    def draw_background(self):
        pass
    def transform(self):
        params = [1 - float(random.randint(1, 2)) / 100,
                  0,
                  0,
                  0,
                  1 - float(random.randint(1, 10)) / 100,
                  float(random.randint(1, 2)) / 500,
                  0.001,
                  float(random.randint(1, 2)) / 500
                  ]
        self.im = self.im.transform((self.width, self.height), Image.PERSPECTIVE, params)
    def filter(self):
        self.im.filter(ImageFilter.EDGE_ENHANCE_MORE)
    # by default, add no noises
    def add_noise(self):
        pass
    # get the captcha (both the image and its text)
    def get_captcha(self):
        self.im = Image.new("RGB", (self.width, self.height), (self.bgcolor))
        self.draw_background()
        str = self.get_text()
        self.draw_text(str)
        self.add_noise()
        self.transform()
        self.filter()
        #self.im.save("D:\pic\pic2.jpg")
        return self.im, str
    # custom helper: load an image, convert the PNG's four channels (RGBA) to three (RGB),
    # and return the image together with its text content
    def get_myImage(self):
        # the image file lives on drive D
        # ims = Image.open("D:\AI\pythonPack\pic\反馈意见.png")  # alternatives: "stay signed in for ten days", "QR code login"
        ims = Image.open("D:\AI\pythonPack\pic4\邮箱中心.png")
        bg = Image.new("RGB", ims.size, (255, 255, 255))
        #print('bg',bg)
        #bg.paste(ims, ims)

        return bg,'邮箱中心';
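
A hypothetical usage of the Captcha class above; it only assumes Pillow is installed and that the font path hard-coded in __init__ exists:

# hypothetical usage (sizes, colours and the output name are made up)
if __name__ == '__main__':
    captcha = Captcha(size=(120, 40),
                      font=('Arial', 24, '#000000'),   # family, point size, colour
                      bgcolor='#ffffff',
                      length=4)
    image, text = captcha.get_captcha()
    image.save('captcha_%s.jpg' % text)
    print('generated captcha text:', text)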
Ejemplo n.º 40
0
    sentence_str = '_sentence' if args.split_sentences else ''
    tokenized_data_fn = '{}{}{}.json'.format(args.tokenized_fp, debug_str,
                                             sentence_str)
    with open(tokenized_data_fn, 'r') as fd:
        tokenized_data = json.load(fd)
    token_counts_fn = '{}{}{}.json'.format(args.token_counts_fp, debug_str,
                                           sentence_str)
    with open(token_counts_fn, 'r') as fd:
        token_counts = json.load(fd)
    N = float(token_counts['__ALL__'])
    print('Subsampling {} tokens'.format(N))

    # Store subsampled data
    tokenized_subsampled_data = []
    # And vocabulary with word counts
    vocab = Vocab()
    num_docs = len(tokenized_data)
    sections = set()
    categories = set()
    for doc_idx in tqdm(range(num_docs)):
        tokenized_doc_str = tokenized_data[doc_idx]
        subsampled_doc = []
        prev_token = 'dummy'
        doc_tokens = tokenized_doc_str.split()
        for tidx, token in enumerate(doc_tokens):
            if prev_token == token:
                continue
            wc = token_counts[token]
            is_section_header = 'header=' in token
            is_doc_header = 'document=' in token
            if is_section_header:
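
The snippet is cut off before the keep/drop decision, but the wc/N bookkeeping matches word2vec-style subsampling of frequent tokens; a hedged sketch of that rule (the threshold t and the keep-probability formula are assumptions, not necessarily what the original script does):

import random

def keep_token(word_count, total_count, t=1e-5):
    # keep probability sqrt(t / f), where f is the token's relative frequency
    freq = word_count / total_count
    keep_prob = min(1.0, (t / freq) ** 0.5)
    return random.random() < keep_prob

# a token making up 1% of the corpus is kept only ~3% of the time with t=1e-5
print(keep_token(word_count=10_000, total_count=1_000_000))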
Ejemplo n.º 41
0
    def preprocess_data(self):
        """
        Something to note: We define requestable and informable slots as below in further experiments
        (including other baselines):
        :param raw_data:
        :param add_to_vocab:
        :param data_type:
        :return:
        """
        vocab = Vocab(cfg.vocab_size, self.otlg.special_tokens)
        for data_type in ['train', 'dev', 'test']:
            print('Preprocessing %s data'%data_type)
            raw_data =  json.loads(open(self.raw_data_path[data_type], 'r').read().lower())
            precessed_dialogs = {}
            state_dump = {}
            for dial_id, raw_dial in enumerate(raw_data):
                precessed_dialog = []
                prev_utter = ''
                single_turn = {}
                constraint_flat = []
                constraint_dict = {}
                intent = raw_dial['scenario']['task']['intent']
                if cfg.domain != 'all' and cfg.domain != intent:
                    if intent not in ['navigate','weather','schedule']:
                        raise ValueError('what is %s intent bro?' % intent)
                    else:
                        continue
                for turn_num,dial_turn in enumerate(raw_dial['dialogue']):
                    state_dump[(dial_id, turn_num)] = {}
                    if dial_turn['turn'] == 'driver':
                        u = self._lemmatize(self._tokenize(dial_turn['data']['utterance']))
                        u = re.sub(r'(\d+) ([ap]m)', lambda x: x.group(1) + x.group(2), u)
                        single_turn['user'] = u
                        prev_utter += u
                    elif dial_turn['turn'] == 'assistant':
                        s = dial_turn['data']['utterance']
                        # find entities and replace them
                        s = re.sub(r'(\d+) ([ap]m)', lambda x: x.group(1) + x.group(2), s)
                        s = self._replace_entity(s, prev_utter, intent)
                        single_turn['response'] = s

                        # get constraints
                        for s,v in dial_turn['data']['slots'].items():
                            constraint_dict[intent + '-' + s] = v

                        constraint_dict = self._clean_constraint_dict(constraint_dict, intent)
                        constraint_flat = list(constraint_dict.values())

                        single_turn['constraint'] = json.dumps(constraint_dict)
                        single_turn['turn_num'] = len(precessed_dialog)
                        single_turn['dial_id'] = dial_id

                        if 'user' in single_turn:
                            state_dump[(dial_id, len(precessed_dialog))]['constraint'] = constraint_dict
                            precessed_dialog.append(single_turn)
                        single_turn = {}

                for single_turn in precessed_dialog:
                    for word_token in constraint_flat + \
                            single_turn['user'].split() + single_turn['response'].split():
                        vocab.add_word(word_token)
                precessed_dialogs[dial_id] = precessed_dialog

            with open(self.data_path[data_type],'w') as f:
                json.dump(precessed_dialogs,f,indent=2)

        # construct vocabulary
        vocab.construct()
        vocab.save_vocab(self.dataset_path + 'vocab')

        return
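
The am/pm regex used twice above glues the digits to the meridiem so that times survive tokenization as a single token; a quick illustration:

import re

u = 'set a reminder for 5 pm and another at 11 am'
u = re.sub(r'(\d+) ([ap]m)', lambda x: x.group(1) + x.group(2), u)
print(u)   # -> 'set a reminder for 5pm and another at 11am'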
Ejemplo n.º 42
0
# furthermore these are only valid if tied embeddings (at least for now that's all
# implemented)
if opts.vocab_file and not opts.tied_embeddings:
    raise Exception("must set --tied-embeddings if using pre initialised embeddings")

# sanity check other opts
assert opts.keep_prob >= 0.0 and opts.keep_prob <= 1.0

NUM_LABELS = 3

def log(s):
    print >>sys.stderr, util.dts(), s

# slurp training data, including converting of tokens -> ids
# if opts.vocab_file set read from that file, otherwise populate lookups as used
vocab = Vocab(opts.vocab_file)
train_x, train_y, train_stats = util.load_data(opts.train_set, vocab,
                                               update_vocab=True,
                                               max_egs=int(opts.num_from_train),
                                               parse_mode=opts.parse_mode)
log("train_stats %s %s" % (len(train_x), train_stats))
dev_x, dev_y, dev_stats = util.load_data(opts.dev_set, vocab,
                                         update_vocab=False,
                                         max_egs=int(opts.num_from_dev),
                                         parse_mode=opts.parse_mode)
log("dev_stats %s %s" % (len(dev_x), dev_stats))

# input/output example vars
s1_idxs = T.ivector('s1')  # sequence for sentence one
s2_idxs = T.ivector('s2')  # sequence for sentence two
actual_y = T.ivector('y')  # single for sentence pair label; 0, 1 or 2
Ejemplo n.º 43
0
                    type=str,
                    default='./saved_models',
                    help='Root dir for saving models.')
parser.add_argument('--seed', type=int, default=0)
args = parser.parse_args()

# set random seed
torch.manual_seed(args.seed)
np.random.seed(args.seed)
random.seed(args.seed)
torch.cuda.manual_seed(args.seed)
helper.print_arguments(args)

# load vocab
print("Loading vocab...")
token_vocab = Vocab.load_vocab(args.vocab_dir + '/vocab_tok.vocab')  # token
post_vocab = Vocab.load_vocab(args.vocab_dir + '/vocab_post.vocab')  # position
pos_vocab = Vocab.load_vocab(args.vocab_dir + '/vocab_pos.vocab')  # POS
dep_vocab = Vocab.load_vocab(args.vocab_dir + '/vocab_dep.vocab')  # deprel
pol_vocab = Vocab.load_vocab(args.vocab_dir + '/vocab_pol.vocab')  # polarity
vocab = (token_vocab, post_vocab, pos_vocab, dep_vocab, pol_vocab)
print(
    "token_vocab: {}, post_vocab: {}, pos_vocab: {}, dep_vocab: {}, pol_vocab: {}"
    .format(len(token_vocab), len(post_vocab), len(pos_vocab), len(dep_vocab),
            len(pol_vocab)))
args.tok_size = len(token_vocab)
args.post_size = len(post_vocab)
args.pos_size = len(pos_vocab)

# load pretrained word emb
print("Loading pretrained word emb...")
Ejemplo n.º 44
0
def main():
    global args
    args = parse_args()

    # local directory
    if not os.path.exists(args.save):
        os.makedirs(args.save)

    # global logger
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)
    formatter = logging.Formatter("[%(asctime)s] %(levelname)s:%(name)s:%(message)s")

    # file logger
    fh = logging.FileHandler(os.path.join(args.save, args.expname)+'.log', mode='w')
    fh.setLevel(logging.INFO)
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    # console logger
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    ch.setFormatter(formatter)
    logger.addHandler(ch)

    # argument validation
    # single gpu running, if too slow switch multiple-gpu running
    args.cuda = args.cuda and torch.cuda.is_available()
    device = parallel.get_device(args.device) if args.cuda else torch.device("cpu")
    args.shard_size = len(args.device) if args.cuda else 1

    # control randomness
    logger.debug(args)
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)
        torch.backends.cudnn.benchmark = True

    train_dir = os.path.join(args.data, 'train/')
    dev_dir = os.path.join(args.data, 'dev/')
    test_dir = os.path.join(args.data, 'test/')

    # prepare vocabularies
    vocab_file = os.path.join(args.data, 'vocab.txt')
    assert os.path.isfile(vocab_file)
    src_vocab_file = os.path.join(args.data, 'src_vocab.txt')
    if args.use_src:
        assert os.path.isfile(src_vocab_file)

    # get vocab object from vocab file previously written
    vocab = Vocab(vocab_file)
    logger.debug('==> vocabulary size : %d ' % vocab.size())
    if args.use_src:
        src_vocab = Vocab(src_vocab_file)
        logger.debug('==> source vocabulary size : %d ' % src_vocab.size())

    # initialize model, criterion/loss_function, optimizer
    model = TreeLSTMAutoEncoder(
        vocab.size(),       # vocabulary size, word embeddings
        args.input_dim,     # word embedding siz
        args.mem_dim,       # hidden size in tree

        bits_number=args.bit_number,                # the number of hashing bits
        filter_size=args.filter_size,               # the internal state size for unbottleneck
        max_num_children=args.max_num_children,     # maximum allowed children number

        noise_dev=args.noise_dev,                   # the deviation of injected noise
        startup_steps=args.startup_size,            # warmup step
        discrete_mix=args.discrete_mix,             # mix ratio for discretization
        use_bottleneck=args.use_bottleneck,         # whether use the discretization model

        src_vocab_size=src_vocab.size() if args.use_src else None,  # source vocabulary size
        atn_num_layer=args.num_layer,                               # transformer layer number
        atn_num_heads=args.num_head,                                # attention heads
        atn_dp=args.atn_dp,                                         # dropout for transformer
    )
    logger.info(model)

    model.to(device)

    if args.optim == 'adam':
        optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                      model.parameters()), lr=args.lr, weight_decay=args.wd)
    elif args.optim == 'adagrad':
        optimizer = optim.Adagrad(filter(lambda p: p.requires_grad,
                                         model.parameters()), lr=args.lr, weight_decay=args.wd)
    elif args.optim == 'sgd':
        optimizer = optim.SGD(filter(lambda p: p.requires_grad,
                                     model.parameters()), lr=args.lr, weight_decay=args.wd)
    else:
        raise Exception("Unrecognized/Unsupported model optimizer {}".format(args.optim))

    start_epoch = 0
    global_step = 0
    best = -float('inf')

    # backup from saved checkpoints
    saved_checkpoints = '%s.pt' % os.path.join(args.save, args.expname)
    tmp_saved_checkpoints = '%s.tmp.pt' % os.path.join(args.save, args.expname)
    if os.path.isfile(saved_checkpoints):
        saved_states = torch.load(saved_checkpoints, map_location=device)

        logger.info("detected and loaded")
        model.load_state_dict(saved_states['model'])
        optimizer.load_state_dict(saved_states['optim'])
        start_epoch = saved_states['epoch'] + 1
        global_step = saved_states['global_step']
        best = saved_states['best']
    elif os.path.isfile(tmp_saved_checkpoints):
        saved_states = torch.load(tmp_saved_checkpoints, map_location=device)

        logger.info("temporary checkpoint detected and loaded")
        model.load_state_dict(saved_states['model'])
        optimizer.load_state_dict(saved_states['optim'])
        start_epoch = saved_states['epoch'] + 1
        global_step = saved_states['global_step']

    zglobal.global_update("global_step", global_step)

    # create trainer object for training and testing
    trainer = Trainer(args, model, optimizer, device, logger, epoch=start_epoch)

    if args.mode == "train":

        # load dataset splits
        train_dataset = TreeDataset(
            train_dir,
            vocab,
            src_path=train_dir if args.use_src else None,
            src_vocab=src_vocab if args.use_src else None,
            tree_depth_limit=args.max_depth,
            tree_size_limit=args.max_tree_size,
            src_len_limit=args.max_src_len,
        )
        logger.debug('==> Loading train data')

        dev_dataset = TreeDataset(
            dev_dir,
            vocab,
            src_path=dev_dir if args.use_src else None,
            src_vocab=src_vocab if args.use_src else None,
            tree_depth_limit=args.max_depth,
            tree_size_limit=args.max_tree_size,
            src_len_limit=args.max_src_len,
        )
        logger.debug('==> Loading dev data')

        logger.debug('Start training from epoch {}'.format(start_epoch))

        for epoch in range(start_epoch, args.epochs):
            train_loss = trainer.train(train_dataset)
            dev_loss, dev_acc, dev_pred, dev_repr = trainer.test(dev_dataset)

            global_step = zglobal.global_get('global_step')
            logger.info('==> Epoch {}, Step {}, Train \tLoss: {}'.format(epoch, global_step, train_loss))
            logger.info('==> Epoch {}, Step {}, Dev \tLoss: {}, ACC: {}'.format(epoch, global_step, dev_loss, dev_acc))

            # select best model according to accuracy, rather than development loss
            dev_score = dev_acc    # - dev_loss

            if best < dev_score:
                best = dev_score
                checkpoint = {
                    'model': trainer.model.state_dict(),
                    'optim': trainer.optimizer.state_dict(),
                    'args': args, 'epoch': epoch,
                    'global_step': zglobal.global_get('global_step'),
                    'best': best,
                }
                logger.debug('==> New optimum found, checkpointing everything now...')
                torch.save(checkpoint, '%s.pt' % os.path.join(args.save, args.expname))
                torch.save(checkpoint, '%s.%spt' % (os.path.join(args.save, args.expname), epoch))
                torch.save(dev_pred, '%s.dev.pred.%spt' % (os.path.join(args.save, args.expname), epoch))
                torch.save(dev_repr, '%s.dev.repr.%spt' % (os.path.join(args.save, args.expname), epoch))

    elif args.mode == "eval":

        test_dataset = TreeDataset(
            test_dir,
            vocab,
            src_path=test_dir if args.use_src else None,
            src_vocab=src_vocab if args.use_src else None,
            tree_depth_limit=args.max_depth,
            tree_size_limit=args.max_tree_size,
            src_len_limit=args.max_src_len,
        )
        logger.debug('==> Loading test data')

        # evaluating the test set
        test_loss, test_acc, test_pred, test_repr = trainer.test(test_dataset)
        torch.save(test_pred, '%s.test.pred.th' % os.path.join(args.save, args.expname))
        torch.save(test_repr, '%s.test.repr.th' % os.path.join(args.save, args.expname))

    else:
        raise Exception("Invalid training mode {}".format(args.mode))

    logger.debug('Ending')
Ejemplo n.º 45
0
def prepare(args):
    """
    checks the data, creates the directories, and prepares the vocabulary and embeddings
    """
    logger = logging.getLogger(args.algo)
    logger.info('Preparing the directories...')
    for dir_path in [
            args.vocab_dir, args.model_dir, args.result_dir, args.summary_dir
    ]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

    logger.info('Building vocabulary...')
    data = Dataset(args.train_files, args.dev_files, args.test_files,
                   args.max_p_len, args.max_q_len)
    vocab = Vocab()
    for word in data.word_iter('train'):
        vocab.add(word)
    unfiltered_vocab_size = vocab.size()
    vocab.filtered_tokens(min_cnt=2)
    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filter {} tokens, the final vocab size is {}'.format(
        filtered_num, vocab.size()))
    logger.info('Assigning embeddings...')
    if args.use_embe:
        vocab.load_pretrained_embeddings(
            embedding_path=
            '/home/home1/dmyan/codes/tensorflow/data/word2vec/300_ver_not_pure.bin'
        )
    else:
        vocab.random_init_embeddings(args.embed_size)

    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)

    logger.info('Done with preparing!')
Ejemplo n.º 46
0
    phrase_str = '_phrase' if args.combine_phrases else ''
    tokenized_data_fn = '{}{}{}.json'.format(args.tokenized_fp, debug_str,
                                             phrase_str)
    with open(tokenized_data_fn, 'r') as fd:
        tokenized_data = json.load(fd)
    token_counts_fn = '{}{}{}.json'.format(args.token_counts_fp, debug_str,
                                           phrase_str)
    with open(token_counts_fn, 'r') as fd:
        token_counts = json.load(fd)
    N = float(token_counts['__ALL__'])
    print('Subsampling {} tokens'.format(N))

    # Store subsampled data
    tokenized_subsampled_data = []
    # And vocabulary with word counts
    vocab = Vocab()
    num_docs = len(tokenized_data)
    sections = set()
    for doc_idx in tqdm(range(num_docs)):
        category, tokenized_doc_str = tokenized_data[doc_idx]
        subsampled_doc = []
        for token in tokenized_doc_str.split():
            wc = token_counts[token]
            is_section_header = re.match(r'header=[A-Z]+', token)

            is_phrase = '_' in token
            if is_section_header:
                subsampled_doc.append(token)
                sections.add(token)
            else:
                threshold = args.min_phrase_count if is_phrase else args.min_token_count
Ejemplo n.º 47
0
    if char in input_vocab_counter:
      input_vocab_counter[char] += 1
    else:
      input_vocab_counter[char] = 1

for target_text in data_target:
  target_text = target_text.strip()
  target_texts.append(target_text)
  for char in target_text:
    if char in target_vocab_counter:
      target_vocab_counter[char] += 1
    else:
      target_vocab_counter[char] = 1


input_vocab = Vocab(list(input_vocab_counter.keys()))
target_vocab = Vocab(list(target_vocab_counter.keys()))

# Generate train, eval, and test batches
seed = 1
zipped_texts = list(zip(input_texts, target_texts))
random.Random(seed).shuffle(zipped_texts)


# train - 90%, eval - 7%, test - 3%
train_texts = zipped_texts[0:int(len(zipped_texts)*0.9)]
eval_texts = zipped_texts[int(
    len(zipped_texts)*0.9):int(len(zipped_texts)*0.97)]
test_texts = zipped_texts[int(len(zipped_texts)*0.97):]
# prepare batches
Ejemplo n.º 48
0
def prepare(args):
    """
    checks the data, creates the directories, and prepares the vocabulary and embeddings
    """
    logger = logging.getLogger("brc")
    logger.info('Checking the data files...')
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(
            data_path)

    logger.info('Preparing the directories...')

    logger.info('Building vocabulary...')
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.train_files, args.dev_files, args.test_files)
    vocab = Vocab(lower=True)
    for word in brc_data.word_iter('train'):  # the vocabulary is built from the training set only
        vocab.add(word)

    unfiltered_vocab_size = vocab.size()
    vocab.filter_tokens_by_cnt(min_cnt=2)
    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filter {} tokens, the final vocab size is {}'.format(
        filtered_num, vocab.size()))

    logger.info('Assigning embeddings...')
    # vocab.randomly_init_embeddings(args.embed_size)#TODO-load_pretrained_embeddings
    vocab.load_pretrained_embeddings(args.embedding_path)  #glove pre-trained

    logger.info('Saving vocab...')
    # with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
    with open(args.vocab_path, 'wb') as fout:  # no distinction between search & zhidao
        pickle.dump(vocab, fout)

    logger.info('Done with preparing!')
Ejemplo n.º 49
0
    config = dict()

    # User set parameters#
    config['maxastnodes'] = 100
    config['asthops'] = 2

    if modeltype is None:
        modeltype = modelfile.split('_')[0].split('/')[-1]

    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu

    data_dir = '../data_set/' + args.lang

    # load vocab
    vocab = Vocab(data_dir=data_dir)
    vocab.load_vocab()

    # load data
    test_code_data, test_ast_data, test_edges, test_nl = load_data(
        data_dir, 'test')
    test_ids = list(range(len(test_code_data)))

    # code vocab size
    config['tdatvocabsize'] = len(vocab.code2index)
    # comment vocab size
    config['comvocabsize'] = len(vocab.nl2index)
    # ast vocab size
    config['smlvocabsize'] = len(vocab.ast2index)

    # set sequence lengths
Ejemplo n.º 50
0
    traindocuments = parserNcbiTxtFile_simple(opt.train_file)
    devdocuments = parserNcbiTxtFile_simple(opt.dev_file)
    testdocuments = parserNcbiTxtFile_simple(opt.test_file)

    entityAbbres = loadAbbreviations(opt.abbre_file)
    preprocessMentions(traindocuments, devdocuments, testdocuments, entityAbbres)
    dict = load_dict(opt.dict_file)
    meshlabels, meshlabel_to_ix, dict_words = utils.parser_dict(dict)

    corpus_words = utils.parser_corpus(traindocuments, devdocuments, testdocuments)
    word_to_ix, all_words, char_to_ix = utils.generate_word_alphabet(corpus_words, dict_words)

    if opt.random_emb:
        opt.emb_filename = ''
    vocab = Vocab(word_to_ix, opt.emb_filename, opt.word_emb_size)

    dict_instances = norm_dataset.getDictInstance(dict, vocab, meshlabel_to_ix,char_to_ix)
    train_instances = norm_dataset.getNormInstance(traindocuments,vocab, meshlabel_to_ix,char_to_ix )
    dev_instances = norm_dataset.getNormInstance(devdocuments, vocab, meshlabel_to_ix,char_to_ix)
    test_instances = norm_dataset.getNormInstance(testdocuments, vocab, meshlabel_to_ix,char_to_ix)

    logging.info('dict_instances_len {}'.format(len(dict_instances)))
    logging.info('train_instance_len {}'.format(len(train_instances)))

    my_collate = utils.sorted_collate

    dict_loader = DataLoader(dict_instances, opt.batch_size, shuffle=True, collate_fn=my_collate)
    train_loader = DataLoader(train_instances, opt.batch_size, shuffle=True, collate_fn = my_collate)
    dev_loader = DataLoader(dev_instances, opt.batch_size, shuffle=False, collate_fn = my_collate)
    test_loader = DataLoader(test_instances, opt.batch_size, shuffle=False, collate_fn = my_collate)
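
utils.sorted_collate itself is not shown; a hedged sketch of what a length-sorting collate function typically looks like, assuming each instance is a (token_ids, ...) tuple:

# sketch of a length-sorting collate_fn (an assumption about utils.sorted_collate, not its source)
def sorted_collate(batch):
    # sort instances by descending token length so sequences can be packed downstream
    return sorted(batch, key=lambda inst: len(inst[0]), reverse=True)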
Ejemplo n.º 51
0
###
# Globals
###
app = flask.Flask(__name__)

CONFIG = config.configuration()
app.secret_key = CONFIG.SECRET_KEY  # Should allow using session variables

#
# One shared 'Vocab' object, read-only after initialization,
# shared by all threads and instances.  Otherwise we would have to
# store it in the browser and transmit it on each request/response cycle,
# or else read it from the file on each request/response cycle,
# neither of which would be suitable for responding keystroke by keystroke.

WORDS = Vocab(CONFIG.VOCAB)

###
# Pages
###


@app.route("/")
@app.route("/index")
def index():
    """The main page of the application"""
    flask.g.vocab = WORDS.as_list()
    flask.session["target_count"] = min(
        len(flask.g.vocab), CONFIG.SUCCESS_AT_COUNT)
    flask.session["jumble"] = jumbled(
        flask.g.vocab, flask.session["target_count"])
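

# Hypothetical addition (not in the original app): an AJAX endpoint that lets the
# page check a candidate word on each keystroke, which is why WORDS is kept in
# memory rather than re-read per request. The route name and query parameter are
# assumptions for illustration; flask is already imported above.
@app.route("/_check_word")
def check_word():
    """Return whether the typed text is in the shared vocabulary."""
    text = flask.request.args.get("text", type=str, default="")
    return flask.jsonify(in_vocab=WORDS.has(text))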
Ejemplo n.º 52
0
def main():
    wandb.init(project="nlp_course", config=args)
    #data preprocessing
    # train_path = './data/train.txt'
    # dev_path = './data/dev.txt'
    # dp = DataProcessor(train_path, dev_path)
    # dp.tokenize()

    # additional data processing
    train_file = Path('./tmp/train.txt')
    vocab_file = Path('./tmp/vocab.txt')

    if not vocab_file.exists():
        train_corpus = (line.strip() for line in train_file.open())
        vocab = Vocab.from_text(train_corpus,
                                max_types=MAX_TYPES,
                                min_freq=MIN_FREQ)
        vocab.save(vocab_file)
    else:
        vocab = Vocab.load(vocab_file)

    log.info(f'Vocab has {len(vocab)} types')

    train_data = TextDataset(vocab=vocab, path=train_file)
    dev_data = TextDataset(vocab=vocab, path=Path('./tmp/dev.txt'))

    # model = FNN_LM(vocab_size=len(vocab), n_class=len(vocab))
    # losses = train(model, n_epochs=5, batch_size=BATCH_SIZE, train_data=train_data,
    #                valid_data=dev_data)

    if args.model == "RNN":
        model = RNN_LM(vocab_size=len(vocab),
                       n_class=len(vocab),
                       emb_dim=args.emb_dim,
                       hid=args.hid,
                       dropout_rate=args.dropout_ratio,
                       num_layers=args.num_layers)
        losses = train(model,
                       n_epochs=args.n_epochs,
                       batch_size=BATCH_SIZE,
                       train_data=train_data,
                       valid_data=dev_data)
        # torch.save needs a file path, not a directory; write into the wandb run dir
        torch.save(model.state_dict(), Path(wandb.run.dir) / 'rnn_lm.pt')

    elif args.model == "LSTM":
        model = LSTM_LM(vocab_size=len(vocab),
                        n_class=len(vocab),
                        emb_dim=args.emb_dim,
                        hid=args.hid,
                        dropout_rate=args.dropout_ratio,
                        num_layers=args.num_layers)
        losses = train(model,
                       n_epochs=args.n_epochs,
                       batch_size=BATCH_SIZE,
                       train_data=train_data,
                       valid_data=dev_data)
        torch.save(model.state_dict(), Path(wandb.run.dir) / 'lstm_lm.pt')  # file path, not directory

    elif args.model == "BiLSTM-ATT":
        model = BiLSTM_ATT_LM(vocab_size=len(vocab),
                              n_class=len(vocab),
                              emb_dim=args.emb_dim,
                              hid=args.hid,
                              dropout_rate=args.dropout_ratio,
                              num_layers=args.num_layers)
        losses = train(model,
                       n_epochs=args.n_epochs,
                       batch_size=BATCH_SIZE,
                       train_data=train_data,
                       valid_data=dev_data)
        torch.save(model.state_dict(), Path(wandb.run.dir) / 'bilstm_att_lm.pt')  # file path, not directory
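

# Illustrative only: reloading one of the checkpoints written above (e.g.
# 'rnn_lm.pt', 'lstm_lm.pt', 'bilstm_att_lm.pt') for later evaluation. The model
# must be rebuilt with the same hyper-parameters from `args`; torch and Path are
# already imported in this module.
def load_checkpoint(model, run_dir, name):
    """Load a saved state dict back into a freshly constructed model."""
    state = torch.load(Path(run_dir) / name, map_location='cpu')
    model.load_state_dict(state)
    return model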
Ejemplo n.º 53
0
opts = parser.parse_args()
print >>sys.stderr, opts
seq_len = int(opts.hack_max_len)
hidden_dim = int(opts.hidden_dim)
embedding_dim = int(opts.embedding_dim)
batch_size = int(opts.batch_size)

# check that if one of --vocab-file or --initial-embeddings is set, they are both set.
assert not ((opts.vocab_file is None) ^ (opts.initial_embeddings is None)), "must set both --vocab-file & --initial-embeddings"

def log(s):
    print >>sys.stderr, util.dts(), s

# build vocab and load data
log("loading data")
vocab = Vocab(opts.vocab_file)
train_x, train_y, train_stats = util.load_data(opts.train_set, vocab,
                                               update_vocab=True,
                                               max_records=opts.num_from_train,
                                               max_len=seq_len,
                                               batch_size=batch_size)
log("train_stats %s %s" % (len(train_x), train_stats))
dev_x, dev_y, dev_stats = util.load_data(opts.dev_set, vocab,
                                         update_vocab=False,
                                         max_records=opts.num_from_dev,
                                         max_len=seq_len,
                                         batch_size=batch_size)
log("dev_stats %s %s" % (len(dev_x), dev_stats))
log("|VOCAB| %s" % vocab.size())

log("building model")
Ejemplo n.º 54
0
def test_single_vocab():
    vocab = Vocab([ "moe" ])
    assert vocab.as_list() == [ "moe" ]
    assert vocab.has("moe")
    assert not vocab.has("meeny")
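

# A natural companion test, assuming Vocab accepts several words and reports
# membership for each of them; the sorted() comparison avoids assuming that
# as_list() preserves insertion order. Not part of the original test file.
def test_multiple_vocab():
    vocab = Vocab([ "eeny", "meeny", "miny" ])
    assert sorted(vocab.as_list()) == [ "eeny", "meeny", "miny" ]
    assert vocab.has("meeny")
    assert not vocab.has("moe")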