Example #1
def test(encdec):
    # Loads vocab.
    src_vocab = make_vocab(SRC_TRAIN_FILE, SRC_VOCAB_SIZE)
    trg_vocab = make_vocab(TRG_TRAIN_FILE, TRG_VOCAB_SIZE)
    inv_trg_vocab = make_inv_vocab(trg_vocab)

    for line in sys.stdin:
        trg_ids = test_batch(encdec, src_vocab, trg_vocab, [line_to_sent(line.strip(), src_vocab)])[0]
        # Prints the result.
        print(" ".join(inv_trg_vocab[wid] for wid in trg_ids))
Example #2
def test(encdec, args):
    # Loads vocab.
    src_vocab = make_vocab(SRC_TRAIN_FILE, args.src_vocab)
    trg_vocab = make_vocab(TRG_TRAIN_FILE, args.trg_vocab)
    inv_trg_vocab = make_inv_vocab(trg_vocab)

    for line in sys.stdin:
        sent = [line_to_sent(line.strip(), src_vocab)]
        trg_ids = test_batch(encdec, src_vocab, trg_vocab, sent,
                             args.generation_limit)[0]
        # Prints the result.
        print(" ".join(inv_trg_vocab[wid] for wid in trg_ids))
Example #3
 def __init__(self, applyDict=False, **kwargs):
     """
     :param applyDict: whether to build the corpus from an already saved Dictionary pickle
     :param kwargs: 'train_path', 'dev_path', 'test_path', 'dict_path', 'output',
         'emotion_special_processing', 'emotion_type', 'use_emotion_supervision'.
         'dict_path' is only read when applyDict is True; 'output' is only used otherwise.
     """
     self.debug = False
     # if 'debug' in kwargs.keys():
     #     self.debug = kwargs['debug']
     self.keywords_dist = {}
     emotion_special_processing = kwargs['emotion_special_processing']
     emotion_type = kwargs['emotion_type']
     use_emotion_supervision = kwargs['use_emotion_supervision']
     if applyDict:
         print("[CORPUS]: Loading dictionary from provided path : ", kwargs['dict_path'])
         self.dictionary = load_pickle(kwargs['dict_path'])  # a previously saved pickle of a Dictionary
         print('[dictionary]: len(dictionary.word2idx) = ', len(self.dictionary.word2idx))
         if 'train_path' in kwargs.keys():
             self.train = self.tokenize(kwargs['train_path'],applyDict=applyDict,
                                        emotion_special_processing=emotion_special_processing,
                                        emotion_type=emotion_type,
                                        use_emotion_supervision=use_emotion_supervision)
         if 'dev_path' in kwargs.keys():
             self.valid = self.tokenize(kwargs['dev_path'],applyDict=applyDict,
                                        emotion_special_processing=emotion_special_processing,
                                        emotion_type=emotion_type,
                                        use_emotion_supervision=use_emotion_supervision)
         if 'test_path' in kwargs.keys():
             self.test = self.tokenize(kwargs['test_path'],applyDict=applyDict,
                                        emotion_special_processing=emotion_special_processing,
                                        emotion_type=emotion_type,
                                        use_emotion_supervision=use_emotion_supervision)
     else:
         self.dictionary = Dictionary()
         if 'train_path' in kwargs.keys():
             self.train = self.tokenize(kwargs['train_path'],
                                        emotion_special_processing=emotion_special_processing,
                                        emotion_type=emotion_type,
                                        use_emotion_supervision=use_emotion_supervision)
         if 'dev_path' in kwargs.keys():
             self.valid = self.tokenize(kwargs['dev_path'],
                                        emotion_special_processing=emotion_special_processing,
                                        emotion_type=emotion_type,
                                        use_emotion_supervision=use_emotion_supervision)
         if 'test_path' in kwargs.keys():
             self.test = self.tokenize(kwargs['test_path'],
                                        emotion_special_processing=emotion_special_processing,
                                        emotion_type=emotion_type,
                                       use_emotion_supervision=use_emotion_supervision)
         # save file when done
         make_vocab(self.dictionary, kwargs['output'])
Example #4
    def __init__(self, dataset_path: str, max_length: int):
        """
        :param dataset_path: dataset root path
        :param max_length: maximum length of a string
        """
        self.dataset_path = dataset_path

        with open(os.path.join(dataset_path, 'sample_data'),
                  'r',
                  encoding='utf8') as f:
            self.train_sentences, self.train_labels = get_data(f)

        with open(os.path.join(dataset_path, 'test_data'),
                  'r',
                  encoding='utf8') as f:
            self.test_sentences, self.test_labels = get_data(f)
        print('data loading complete!')

        if os.path.isfile('./data/vocab.txt'):
            self.vocab = read_vocab()
        else:
            self.vocab = make_vocab(self.train_sentences)

        print('make vocab complete! vocab size = {}'.format(len(self.vocab)))

        self.sentences = preprocess(self.vocab, self.train_sentences,
                                    max_length)
        self.labels = [np.float32(x) for x in self.train_labels]
        print('training sentences :', len(self.sentences))
Example #5
def preprocess(data_dir="./data"):
    print("begin to preprocess...")
    train_data_path = os.path.join(data_dir, "train.csv")
    new_train_data_path = os.path.join(data_dir, "train_prcssd.csv")
    test_data_path = os.path.join(data_dir, "test.csv")
    new_test_data_path = os.path.join(data_dir, "test_prcssd.csv")
    vocab_path = os.path.join(data_dir, "vocab.txt")
    # Read the data
    logging.info("loading data...")
    train_data = pd.read_csv(train_data_path)
    test_data = pd.read_csv(test_data_path)
    # Preprocess
    train_data["tag"] = "train"
    test_data["tag"] = "test"
    data = train_data.append(test_data)
    logging.info("replacing bad words...")
    data["comment_text"] = data.apply(lambda d : my_utils.replace(d["comment_text"]), axis=1)
    logging.info("tokenizing...")
    data["tokens"] = data.apply(lambda d: my_utils.tokenize(d["comment_text"]), axis=1)
    logging.info("making vocabulary...")
    vocab = my_utils.make_vocab(data["tokens"])
    data["tokens"] = data.apply(lambda d: " ".join(d["tokens"]))
    train_data = data[data.tag == "train"]
    test_data = data[data.tag == "test"]
    # Save
    logging.info("saving...")
    train_data.to_csv(new_train_data_path)
    test_data.to_csv(new_test_data_path)
    my_utils.dump_vocab(vocab, vocab_path)
    logging.info("preprocess finished!")

    return train_data, test_data
Example #6
def main(args):
    # create data batcher, vocabulary
    # batcher

    with open(join(DATA_DIR, 'vocab_cnt.pkl'), 'rb') as f:
        wc = pkl.load(f)
    word2id = make_vocab(wc, args.vsize)

    train_batcher, val_batcher = build_batchers(word2id, args.cuda, args.debug)

    # make net
    print('vocab size:', len(word2id))
    ids = list(word2id.values())
    print(max(ids))
    print(min(ids))
    net, net_args = configure_net(len(word2id), args.emb_dim, args.n_hidden,
                                  args.bi, args.n_layer, args.load_from)

    # configure training setting
    criterion, train_params = configure_training('adam', args.lr, args.clip,
                                                 args.decay, args.batch)

    # save experiment setting
    if not exists(args.path):
        os.makedirs(args.path)
    with open(join(args.path, 'vocab.pkl'), 'wb') as f:
        pkl.dump(word2id, f, pkl.HIGHEST_PROTOCOL)
    meta = {}
    meta['net'] = 'base_abstractor'
    meta['net_args'] = net_args
    meta['traing_params'] = train_params
    with open(join(args.path, 'meta.json'), 'w') as f:
        json.dump(meta, f, indent=4)

    # prepare trainer
    if args.cuda:
        net = net.cuda()

    val_fn = basic_validate(net, criterion)

    grad_fn = get_basic_grad_fn(net, args.clip)

    optimizer = optim.AdamW(net.parameters(), **train_params['optimizer'][1])

    #optimizer = optim.Adagrad(net.parameters(), **train_params['optimizer'][1])
    scheduler = ReduceLROnPlateau(optimizer,
                                  'min',
                                  verbose=True,
                                  factor=args.decay,
                                  min_lr=0,
                                  patience=args.lr_p)

    pipeline = BasicPipeline(meta['net'], net, train_batcher, val_batcher,
                             args.batch, val_fn, criterion, optimizer, grad_fn)
    trainer = BasicTrainer(pipeline, args.path, args.ckpt_freq, args.patience,
                           scheduler)

    print('start training with the following hyper-parameters:')
    trainer.train()
Example #7
def main(args):
    assert args.net_type in ['ff', 'rnn']
    # create data batcher, vocabulary
    # batcher
    with open(join(DATA_DIR, 'vocab_cnt.pkl'), 'rb') as f:
        wc = pkl.load(f)
    word2id = make_vocab(wc, args.vsize)
    train_batcher, val_batcher = build_batchers(args.net_type, word2id,
                                                args.cuda, args.debug)

    # make net
    net, net_args = configure_net(args.net_type, len(word2id), args.emb_dim,
                                  args.conv_hidden, args.lstm_hidden,
                                  args.lstm_layer, args.bi)
    if args.w2v:
        # NOTE: the pretrained embedding having the same dimension
        #       as args.emb_dim should already be trained
        embedding, _ = make_embedding({i: w
                                       for w, i in word2id.items()}, args.w2v)
        net.set_embedding(embedding)

    # configure training setting
    criterion, train_params = configure_training(args.net_type, 'adam',
                                                 args.lr, args.clip,
                                                 args.decay, args.batch)

    # save experiment setting
    if not exists(args.path):
        os.makedirs(args.path)
    with open(join(args.path, 'vocab.pkl'), 'wb') as f:
        pkl.dump(word2id, f, pkl.HIGHEST_PROTOCOL)
    meta = {}
    meta['net'] = 'ml_{}_extractor'.format(args.net_type)
    meta['net_args'] = net_args
    meta['traing_params'] = train_params
    with open(join(args.path, 'meta.json'), 'w') as f:
        json.dump(meta, f, indent=4)

    # prepare trainer
    val_fn = basic_validate(net, criterion)
    grad_fn = get_basic_grad_fn(net, args.clip)
    optimizer = optim.Adam(net.parameters(), **train_params['optimizer'][1])
    scheduler = ReduceLROnPlateau(optimizer,
                                  'min',
                                  verbose=True,
                                  factor=args.decay,
                                  min_lr=0,
                                  patience=args.lr_p)

    if args.cuda:
        net = net.cuda()
    pipeline = BasicPipeline(meta['net'], net, train_batcher, val_batcher,
                             args.batch, val_fn, criterion, optimizer, grad_fn)
    trainer = BasicTrainer(pipeline, args.path, args.ckpt_freq, args.patience,
                           scheduler)

    print('start training with the following hyper-parameters:')
    print(meta)
    trainer.train()
Example #8
 def __init__(self, applyDict=False, **kwargs):
     """
     :param applyDict: whether to create a corpus with an already made dictionary
     :param kwargs: 'train_path', 'dev_path', 'test_path', 'dict_path', 'output'.
         'dict_path' is only accessed if applyDict is True.
     """
     if applyDict:
         self.dictionary = load_pickle(kwargs['dict_path'])  # a previously saved pickle of a Dictionary
     else:
         self.dictionary = Dictionary()
         if 'train_path' in kwargs.keys():
             self.train = self.tokenize(kwargs['train_path'])
         if 'dev_path' in kwargs.keys():
             self.valid = self.tokenize(kwargs['dev_path'])
         if 'test_path' in kwargs.keys():
             self.test = self.tokenize(kwargs['test_path'])
         # save file when done
         make_vocab(self.dictionary, kwargs['output'])
Example #9
def main(args):
    assert args.net_type in ['ff', 'rnn']
    # create data batcher, vocabulary
    # batcher
    with open(join(DATA_DIR, 'vocab_cnt.pkl'), 'rb') as f:
        wc = pkl.load(f)
    word2id = make_vocab(wc, args.vsize)
    train_batcher, val_batcher = build_batchers(args.net_type, word2id,
                                                args.cuda, args.debug)

    # make net
    net, net_args = configure_net(args.net_type,
                                  len(word2id), args.emb_dim, args.conv_hidden,
                                  args.lstm_hidden, args.lstm_layer, args.bi)
    if args.w2v:
        # NOTE: the pretrained embedding having the same dimension
        #       as args.emb_dim should already be trained
        embedding, _ = make_embedding(
            {i: w for w, i in word2id.items()}, args.w2v)
        net.set_embedding(embedding)

    # configure training setting
    criterion, train_params = configure_training(
        args.net_type, 'adam', args.lr, args.clip, args.decay, args.batch
    )

    # save experiment setting
    if not exists(args.path):
        os.makedirs(args.path)
    with open(join(args.path, 'vocab.pkl'), 'wb') as f:
        pkl.dump(word2id, f, pkl.HIGHEST_PROTOCOL)
    meta = {}
    meta['net']           = 'ml_{}_extractor'.format(args.net_type)
    meta['net_args']      = net_args
    meta['traing_params'] = train_params
    with open(join(args.path, 'meta.json'), 'w') as f:
        json.dump(meta, f, indent=4)

    # prepare trainer
    val_fn = basic_validate(net, criterion)
    grad_fn = get_basic_grad_fn(net, args.clip)
    optimizer = optim.Adam(net.parameters(), **train_params['optimizer'][1])
    scheduler = ReduceLROnPlateau(optimizer, 'min', verbose=True,
                                  factor=args.decay, min_lr=0,
                                  patience=args.lr_p)

    if args.cuda:
        net = net.cuda()
    pipeline = BasicPipeline(meta['net'], net,
                             train_batcher, val_batcher, args.batch, val_fn,
                             criterion, optimizer, grad_fn)
    trainer = BasicTrainer(pipeline, args.path,
                           args.ckpt_freq, args.patience, scheduler)

    print('start training with the following hyper-parameters:')
    print(meta)
    trainer.train()
Example #10
 def __init__(self, applyDict=False, **kwargs):
     """
     :param applyDict: whether to create a corpus with an already made dictionary
     :param kwargs: 'train_path' 'dev_path' 'test_path', 'dict_path', 'output'. For most uses
     you need all types of path, though you could make a Corpus without a train-dev-test split.
     dict_path is only accessed if applyDict is true.
     """
     if applyDict:
         self.dictionary = load_pickle(
             kwargs['dict_path']
         )  # a previously saved pickle of a Dictionary
     else:
         self.dictionary = Dictionary()
         if 'train_path' in kwargs.keys():
             self.train = self.tokenize(kwargs['train_path'])
         if 'dev_path' in kwargs.keys():
             self.valid = self.tokenize(kwargs['dev_path'])
         if 'test_path' in kwargs.keys():
             self.test = self.tokenize(kwargs['test_path'])
         # save file when done
         make_vocab(self.dictionary, kwargs['output'])
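A hedged usage sketch for this constructor; the enclosing class name Corpus and the file paths are illustrative assumptions, while the keyword names come from the docstring above:

# Build a corpus from raw text files and write the resulting vocabulary when done.
corpus = Corpus(train_path='data/train.txt', dev_path='data/dev.txt',
                test_path='data/test.txt', output='data/vocab.txt')

# Reuse a previously pickled Dictionary instead of rebuilding it.
corpus_from_dict = Corpus(applyDict=True, dict_path='data/dictionary.pkl')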
Example #11
 def __init__(self, options, session):
     self._options = options
     self._session = session
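     # Build the vocabulary from the training corpus: word frequencies, word-to-id / id-to-word maps, and phrase-id sequences.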
     word_freq, word_id, id_word, phrase_ids = utils.make_vocab(
         vocabfile=self._options.vocab,
         corpus=self._options.train_data,
         phrase_ids_file=self._options.phrase_data,
         phrase_reverse=self._options.reverse)
     self._word_freq = word_freq
     self._word_id = word_id
     self._id_word = id_word
     self._phrase_ids = phrase_ids
     self.save_setting()
     self.freq_table = self.make_freq_table(self._id_word, self._word_freq)
     phrase_max_size = max([len(word_seq) for word_seq in phrase_ids.values()] + [0])
     self.build_graph(phrase_max_size, self._options.composition_function, self._options.dim, self._options.batch_size,
                      self._options.neg, self._options.learning_rate, self._id_word, self.freq_table, self._options.init_word_data, 
                      self._options.init_context_data, self._options.epoch_num, not self._options.not_embedding_train)
Example #12
def main(args):
  with open(join(DATA_DIR, 'vocab_cnt.pkl'), 'rb') as f:
      wc = pkl.load(f)
      word2id = make_vocab(wc, args.vsize)

  abs_args = SimpleNamespace(
    **vars(args),
    path='./uni_pretrained_abstractor',
    w2v='./word_vectors/word2vec.128d.226k.bin',
    n_layer=1,
    n_hidden=256,
    max_art=100,
    max_abs=30,
  )
  abs_trainer, abs_net = abs_prep_trainer(abs_args, word2id=word2id)

  exs_args = SimpleNamespace(
    **vars(args),
    path='./uni_pretrained_extractor',
    w2v=None,  # no embedding since we reuse the abstractor's encoder
    net_type='rnn',
    lstm_layer=1,
    lstm_hidden=256,
    max_word=100,
    max_sent=60
  )

  exs_trainer, _ = exs_prep_trainer(exs_args, word2id=word2id, encoder=abs_net.encoder)

  # training generator
  exs_train_gen = exs_trainer.train_gen('extractor')
  abs_train_gen = abs_trainer.train_gen('abstractor')

  for exs_end, abs_end in zip(exs_train_gen, abs_train_gen):
    if exs_end and abs_end:
      print('Uni Training End')
      break
Example #13
        [en, cn] = line.strip('\n').split('\t')

        outputs.append(cn[:-1])  # strip the sentence-final punctuation from the Chinese target
        # A space already follows each comma; add one before it so the comma
        # becomes its own token, strip the final punctuation, and lowercase.
        inputs.append(en.replace(',', ' ,')[:-1].lower())

    #print('before segmentation:', inputs[:10])
    #print('before segmentation:', outputs[:10])
    inputs = cn_segment(inputs)
    outputs = en_segment(outputs)
    #print('after segmentation:', inputs[:10])
    #print('after segmentation:', outputs[:10])
    # print(outputs)

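# Build the encoder and decoder vocabularies from the segmented sentence lists.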
encoder_vocab, decoder_vocab = make_vocab(inputs, outputs)
print('\n-----------vocab has been built-----------')

encoder_inputs, decoder_inputs, decoder_targets = data_format(
    inputs, outputs, encoder_vocab, decoder_vocab)

arg = create_hparams()
arg.input_vocab_size = len(encoder_vocab)
arg.label_vocab_size = len(decoder_vocab)
arg.epochs = epoch
arg.batch_size = batch_size

g = Graph(arg)

saver = tf.train.Saver()
with tf.Session() as sess:
Example #14
def main(args):
    # create data batcher, vocabulary
    # batcher
    with open(join(args.data_path, 'vocab_cnt.pkl'), 'rb') as f:
        wc = pkl.load(f)
    word2id = make_vocab(wc, args.vsize, args.max_target_sent)  # a word-level vocabulary (word -> id)
    train_batcher, val_batcher = build_batchers(word2id, args.cuda, args.debug)

    # make net

    if args.w2v:
        # NOTE: the pretrained embedding having the same dimension
        #       as args.emb_dim should already be trained
        embedding, _ = make_embedding({i: w
                                       for w, i in word2id.items()},
                                      args.w2v)  # provides a pretrained embedding matrix

        net, net_args = configure_net(len(word2id), args.emb_dim,
                                      args.n_hidden, args.bi, args.n_layer,
                                      args.sampling_teaching_force,
                                      args.self_attn, args.hi_encoder,
                                      embedding)
    else:
        print("please provide pretrain_w2v")
        return

    # configure training setting
    criterion, train_params = configure_training('adam', args.lr, args.clip,
                                                 args.decay, args.batch)

    # save experiment setting

    if not exists(args.path):
        os.makedirs(args.path)
    with open(join(args.path, 'vocab.pkl'), 'wb') as f:
        pkl.dump(word2id, f, pkl.HIGHEST_PROTOCOL)

    net_args_backup = net_args.copy()
    del net_args_backup["embedding"]

    meta = {}
    meta['net'] = 'base_abstractor'
    meta['net_args'] = net_args_backup
    meta['traing_params'] = train_params

    with open(join(args.path, 'meta.json'), 'w') as f:
        json.dump(meta, f, indent=4)

    # prepare trainer
    val_fn = basic_validate(net, criterion)
    grad_fn = get_basic_grad_fn(net, args.clip)
    optimizer = optim.Adam(net.parameters(), **train_params['optimizer'][1])
    scheduler = ReduceLROnPlateau(optimizer,
                                  'min',
                                  verbose=True,
                                  factor=args.decay,
                                  min_lr=0,
                                  patience=args.lr_p)

    if args.cuda:
        net = net.cuda()
    pipeline = BasicPipeline(meta['net'], net, train_batcher, val_batcher,
                             args.batch, val_fn, criterion, optimizer, grad_fn)
    trainer = BasicTrainer(pipeline, args.path, args.ckpt_freq, args.patience,
                           scheduler)

    print('start training with the following hyper-parameters:')
    print(meta)
    trainer.train()
Example #15
from sklearn.utils import shuffle
from utils import make_vocab, load_data_cnn
from wordCNN import *

init()  # colorama init

# embedding_dim = 200
# golveFileName = os.path.join("data", "twitter_hate_off_word_vectors.txt")
# saveFileName = os.path.join("data", "twitter_hate_off_word_vectors" + str(embedding_dim) + ".npy")
embedding_dim = 100
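# The lines below presumably build the vocabulary from the GloVe file, save a
# filtered embedding matrix to saveFileName, and load it back as wordVecs.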
golveFileName = os.path.join(
    "data", "glove.twitter.27B." + str(embedding_dim) + "d.txt")
saveFileName = os.path.join("data",
                            "filteredGlove" + str(embedding_dim) + ".npy")
vocab_size = make_vocab(file=golveFileName,
                        save_name=saveFileName,
                        embedding_dim=embedding_dim)
wordVecs = np.load(saveFileName).astype(np.float32)

train_x, train_y, dev_x, dev_y, test_x, test_y, doc_emb_train, doc_emb_test, doc_emb_dev = load_data_cnn(
)

n_epochs = 20
train_instances = len(train_x)
batch_size = 128
train_batches = train_instances // batch_size

use_gcn = True
path1 = "./saved/use_gcn_cnn"
path2 = "./saved/no_use_gcn_cnn"
if use_gcn:
Example #16
def mlp(tr_data, te_data, eng_para, col_name, grid_size, \
        optimizer, batch_size, hidden_size, mlp_feature, \
        nb_epoch, prediction, model_name, is_train):
    # Load the dataset
    print 'Loading dataset ...'
    tr_feature, tr_label, tr_ids = mlp_feature(tr_data, eng_para, True, col_name)
    te_feature, te_label, te_ids = mlp_feature(te_data, eng_para, True, col_name)
    rg = RoadGrid(np.vstack((tr_label, te_label)), grid_size)
    tr_label = rg.transform(tr_label)
    # te_label = rg.transform(te_label)

    ## !!! maybe here need to ensure train data are the same shape as test data
    train_size, n_con = tr_feature.shape
    test_size, n_con = te_feature.shape
    n_dis = len(tr_ids)

    # Create neural network model
    print 'Preprocessing data ...'
    # Standardize continous input
    # tr_feature, te_feature = preprocess(tr_feature, te_feature)
    tr_feature, te_feature = preprocess(tr_feature, te_feature)
    # te_feature = preprocess(te_feature)
    tr_input = {'con_input' : tr_feature, 'output' : tr_label}
    te_input = {'con_input' : te_feature}
    # Prepare embedding input
    dis_dims, vocab_sizes = [], []
    for ii, tr_ids_, te_ids_ in zip(range(n_dis), tr_ids, te_ids): # make sure tr_ids contain several different discrete features
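        # Map every discrete id value seen in train/test to a contiguous index for the embedding inputs.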
        vocab_size, vocab_dict = make_vocab(tr_ids_, te_ids_)
        tr_id_idx_, te_id_idx_ = [], []
        dis_dim = len(tr_ids_)
        for i in range(dis_dim):
            tr_id_idx_ += map(lambda x: vocab_dict[x], tr_ids_[i])
            te_id_idx_ += map(lambda x: vocab_dict[x], te_ids_[i])
        tr_ids = np.array(tr_id_idx_, dtype=np.int32).reshape(dis_dim, train_size).transpose()
        te_ids = np.array(te_id_idx_, dtype=np.int32).reshape(dis_dim, test_size).transpose()

        ## Add discrete feature to dict
        tr_input['emb_input%d' % ii] = tr_ids
        te_input['emb_input%d' % ii] = te_ids

        dis_dims.append(dis_dim)
        vocab_sizes.append(vocab_size)

    print 'Building model and compiling functions ...'
    # Define network structure
    grid_info = rg.grid_center
    network = build_mlp(n_con, n_dis, dis_dims, vocab_sizes, len(grid_info), hidden_size)

#network.compile(loss={'output': 'categorical_crossentropy'}, optimizer=SGD(lr=1e-2, momentum=0.9, nesterov=True))
    network.compile(loss={'output': 'categorical_crossentropy'}, optimizer=optimizer)

    # Build network
    # pickle_name = 'MLP-softmax-0.4.pickle'
    pickle_name = model_name

    if is_train:
        history = network.fit(tr_input, nb_epoch=nb_epoch, batch_size=batch_size, verbose=1)
        # Dump Network
        with open('model/'+pickle_name, 'wb') as f:
           pickle.dump(network, f, -1)
    else:
        # Load Network
        f = open('model/'+pickle_name)
        network = pickle.load(f)

    # Make prediction
    ## 1. weighted
    if prediction == 'weighted':
        te_pred = np.asarray(network.predict(te_input)['output'])
        te_pred = te_pred.dot(grid_info)
    # Generate report
    # gen_report(te_label, te_pred, pickle_name, [type(optimizer), batch_size, hidden_size, 'Weighted'])
    elif prediction == 'argmax':
    ## 2. argmax
        te_pred = np.asarray(network.predict(te_input)['output'])
        te_pred = np.argmax(te_pred, axis=1)
        te_pred = [grid_info[idx] for idx in te_pred]
    # Generate report
    # gen_report(te_label, te_pred, pickle_name, [type(optimizer), batch_size, hidden_size, 'Argmax'])
    else:
        te_pred = None
    return te_pred
Example #17
import numpy as np
import os
from sklearn import metrics
from sklearn.utils import shuffle
from utils import make_vocab, load_data_cnn
from wordCNN import *

init()  # colorama init

# embedding_dim = 200
# golveFileName = os.path.join("data", "twitter_hate_off_word_vectors.txt")
# saveFileName = os.path.join("data", "twitter_hate_off_word_vectors" + str(embedding_dim) + ".npy")
embedding_dim = 100
golveFileName = os.path.join("data", "glove.twitter.27B." + str(embedding_dim) + "d.txt")
saveFileName = os.path.join("data", "filteredGlove" + str(embedding_dim) + ".npy")
vocab_size = make_vocab(data = "twitter_hate_off", file = golveFileName, save_name = saveFileName, embedding_dim = embedding_dim)
print(vocab_size)
wordVecs = np.load(saveFileName).astype(np.float32)

train_x, train_y, dev_x, dev_y, test_x, test_y, doc_emb_train, doc_emb_test, doc_emb_dev = load_data_cnn(data = "twitter_hate_off")

n_epochs = 5
train_instances = len(train_x)
batch_size = 128
train_batches = train_instances // batch_size

use_gcn = False
path1 = "./saved/use_gcn_cnn"
path2 = "./saved/no_use_gcn_cnn" 
if use_gcn:
  path = path1
Example #18
                        default=1,
                        help="number of epochs for training")
    parser.add_argument('--batch_size',
                        type=int,
                        default=64,
                        help="batch size for training")
    parser.add_argument('--site_path',
                        type=str,
                        default='nodejs_brows/static',
                        help='path to your site for storing model')
    args = parser.parse_args()

    train_data = load_data(os.path.join(args.data, 'train.txt'))
    valid_data = load_data(os.path.join(args.data, 'valid.txt'))

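    # Build word and tag vocabularies from the training split.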
    words_vocab = make_vocab(train_data['words'])
    tags_vocab = make_vocab(train_data['tags'])

    train_data['words_sequences'] = make_sequences(train_data['words'],
                                                   words_vocab)
    valid_data['words_sequences'] = make_sequences(valid_data['words'],
                                                   words_vocab)

    train_data['tags_sequences'] = make_sequences(train_data['tags'],
                                                  tags_vocab)
    valid_data['tags_sequences'] = make_sequences(valid_data['tags'],
                                                  tags_vocab)

    train_X = pad_sequences(train_data['words_sequences'],
                            maxlen=MAX_SEQUENCE_LENGTH,
                            value=PAD_ID,
Example #19
def main(num_epochs=500):
    # Load the dataset
    print 'Loading dataset ...'
    eng_para = pd.read_csv('data/2g_gongcan.csv')
#eng_para = eng_para.loc[:, ['LAC', 'CI', 'Angle', 'Longitude', 'Latitude', 'Power', 'GSM Neighbor Count', 'TD Neighbor Count']]
    tr_feature, tr_label, tr_ids = load_dataset('data/forward_recovered.csv', eng_para, True) 
    te_feature, te_label, te_ids = load_dataset('data/backward_recovered.csv', eng_para, False)
    ## !!! maybe here need to ensure train data are the same shape as test data
    train_size, n_con = tr_feature.shape
    test_size, n_con = te_feature.shape
    n_dis = len(tr_ids) 

    # Create neural network model
    print 'Preprocessing data ...'
    # Standardize continous input
    tr_feature, te_feature = preprocess(tr_feature, te_feature)
    tr_input = {'con_input' : tr_feature}
    te_input = {'con_input' : te_feature}
    # Prepare embedding input
    dis_dims, vocab_sizes = [], []
    for ii, tr_ids_, te_ids_ in zip(range(n_dis), tr_ids, te_ids): # make sure tr_ids contain several different discrete features
        vocab_size, vocab_dict = make_vocab(tr_ids_, te_ids_) 
        tr_id_idx_, te_id_idx_ = [], []
        dis_dim = len(tr_ids_)
        for i in range(dis_dim):
            tr_id_idx_ += map(lambda x: vocab_dict[x], tr_ids_[i])
            te_id_idx_ += map(lambda x: vocab_dict[x], te_ids_[i])
        tr_ids = np.array(tr_id_idx_, dtype=np.int32).reshape(dis_dim, train_size).transpose()
        te_ids = np.array(te_id_idx_, dtype=np.int32).reshape(dis_dim, test_size).transpose()

        ## Add discrete feature to dict
        tr_input['emb_input%d' % ii] = tr_ids
        te_input['emb_input%d' % ii] = te_ids

        dis_dims.append(dis_dim)
        vocab_sizes.append(vocab_size)

    print 'Building model and compiling functions ...'
    # Define network structure
    l_output = build_mlp(n_con, n_dis, dis_dims, vocab_sizes)
    
    # Set batch size
    bi = BatchIterator(batch_size=10)

    # Build network
    network = NeuralNet(l_output,
                 regression=True,
                 update_learning_rate=1e-5,
                 update=nesterov_momentum,
                 update_momentum=0.9,
                 train_split=TrainSplit(eval_size=0.05),
                 verbose=1,
                 batch_iterator_train=bi,
                 objective_loss_function=lasagne.objectives.squared_error,
                 max_epochs=5000)

    pickle_name = 'MLP-0.10.pickle'

    mul_val = 10000.
    lon_offset = np.mean(tr_label[:, 0])
    lon_std = np.mean(tr_label[:, 0])
    lat_offset = np.mean(tr_label[:, 1])
    lat_std = np.mean(tr_label[:, 1])
    ######## Change Target
    tr_label[:, 0] = (tr_label[:, 0] - lon_offset) * mul_val 
    tr_label[:, 1] = (tr_label[:, 1] - lat_offset) * mul_val 
    tr_label = tr_label.astype(np.float32)
    print tr_label

    is_train = True
    if is_train:
        network.fit(tr_input, tr_label)
        # Dump Network
        with open('model/'+pickle_name, 'wb') as f:
           pickle.dump(network, f, -1)
    else:
        # Load Network
        f = open('model/'+pickle_name)
        network = pickle.load(f) 

    # Make prediction
    te_pred = network.predict(te_input)

    te_pred[:, 0] = te_pred[:, 0] / mul_val + lon_offset
    te_pred[:, 1] = te_pred[:, 1] / mul_val + lat_offset
    f_out = open('pred.csv', 'w')
    for pred_pt, true_pt in zip(te_pred, te_label):
        f_out.write('%f,%f,%f,%f\n' % (pred_pt[0], pred_pt[1], true_pt[0], true_pt[1]))

    # Generate report
    gen_report(te_label, te_pred, pickle_name)
Example #20
def train(args):

    assert args.encoder  in ['BiLSTM', 'DeepLSTM', 'Transformer']
    assert args.decoder  in ['SL', 'PN']
    assert args.emb_type in ['W2V', 'BERT']

    # create data batcher, vocabulary
    # batcher
    with open(join(DATA_DIR, 'vocab_cnt.pkl'), 'rb') as f:
        wc = pkl.load(f)
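    # word2id: word -> index mapping; presumably the args.vsize most frequent words from vocab_cnt.pkl plus special tokens.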
    word2id = make_vocab(wc, args.vsize)
    train_batcher, val_batcher = build_batchers(args.decoder, args.emb_type, 
                                                word2id, args.cuda, args.debug)

    # make model
    model, model_args = configure_net(args.encoder, args.decoder, args.emb_type, len(word2id), 
                                      args.emb_dim, args.conv_hidden, args.encoder_hidden, 
                                      args.encoder_layer)
    
    if args.emb_type == 'W2V':
        # NOTE: the pretrained embedding having the same dimension
        #       as args.emb_dim should already be trained
        w2v_path='./CNNDM/word2vec/word2vec.128d.226k.bin'
        embedding, _ = make_embedding(
            {i: w for w, i in word2id.items()}, w2v_path)
        model.set_embedding(embedding)

    # configure training setting
    criterion, train_params = configure_training(
        args.decoder, 'adam', args.lr, args.clip, args.decay, args.batch
    )

    # save experiment setting
    if not exists(args.path):
        os.makedirs(args.path)
    with open(join(args.path, 'vocab.pkl'), 'wb') as f:
        pkl.dump(word2id, f, pkl.HIGHEST_PROTOCOL)
    meta = {}
    meta['model_args']    = model_args
    meta['traing_params'] = train_params
    with open(join(args.path, 'meta.json'), 'w') as f:
        json.dump(meta, f, indent=4)

    # prepare trainer
    val_fn = basic_validate(model, criterion, args.decoder)
    grad_fn = get_basic_grad_fn(model, args.clip)
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), **train_params['optimizer'][1])
    scheduler = ReduceLROnPlateau(optimizer, 'min', verbose=True,
                                  factor=args.decay, min_lr=2e-5,
                                  patience=args.lr_p)

    if args.cuda:
        model = model.cuda()
    pipeline = BasicPipeline(model, args.decoder, 
                             train_batcher, val_batcher, args.batch, val_fn,
                             criterion, optimizer, grad_fn)
    trainer = BasicTrainer(pipeline, args.path,
                           args.ckpt_freq, args.patience, scheduler)
    
    # for name, para in net.named_parameters():
    #     if para.requires_grad:
    #         print(name)

    print('Start training with the following hyper-parameters:')
    print(meta)
    trainer.train()
Example #21
def main(args):
    # create data batcher, vocabulary
    # batcher
    if args.bert:
        import logging
        logging.basicConfig(level=logging.ERROR)

    if not args.bert:
        with open(join(DATA_DIR, 'vocab_cnt.pkl'), 'rb') as f:
            wc = pkl.load(f)
        word2id = make_vocab(wc, args.vsize)
    if not args.gat:
        if args.bert:
            train_batcher, val_batcher, word2id = build_batchers_bert(
                args.cuda, args.debug, args.bertmodel)
        else:
            train_batcher, val_batcher = build_batchers(
                word2id, args.cuda, args.debug)
    else:
        if args.bert:
            train_batcher, val_batcher, word2id = build_batchers_gat_bert(
                args.cuda,
                args.debug,
                args.gold_key,
                args.adj_type,
                args.mask_type,
                args.topic_flow_model,
                num_worker=args.num_worker,
                bert_model=args.bertmodel)
        else:
            train_batcher, val_batcher = build_batchers_gat(
                word2id,
                args.cuda,
                args.debug,
                args.gold_key,
                args.adj_type,
                args.mask_type,
                args.topic_flow_model,
                num_worker=args.num_worker)

    # make net
    if args.gat:
        _args = {}
        _args['rtoks'] = 1
        _args['graph_hsz'] = args.n_hidden
        _args['blockdrop'] = 0.1
        _args['sparse'] = False
        _args['graph_model'] = 'transformer'
        _args['adj_type'] = args.adj_type

        net, net_args = configure_net_gat(
            len(word2id),
            args.emb_dim,
            args.n_hidden,
            args.bi,
            args.n_layer,
            args.load_from,
            gat_args=_args,
            adj_type=args.adj_type,
            mask_type=args.mask_type,
            feed_gold=False,
            graph_layer_num=args.graph_layer,
            feature=args.feat,
            subgraph=args.topic_flow_model,
            hierarchical_attn=args.topic_flow_model,
            bert=args.bert,
            bert_length=args.max_art)
    else:
        net, net_args = configure_net(len(word2id), args.emb_dim,
                                      args.n_hidden, args.bi, args.n_layer,
                                      args.load_from, args.bert, args.max_art)

    if args.w2v:
        assert not args.bert
        # NOTE: the pretrained embedding having the same dimension
        #       as args.emb_dim should already be trained
        embedding, _ = make_embedding({i: w
                                       for w, i in word2id.items()}, args.w2v)
        net.set_embedding(embedding)

    # configure training setting
    if 'soft' in args.mask_type and args.gat:
        criterion, train_params = configure_training_multitask(
            'adam', args.lr, args.clip, args.decay, args.batch, args.mask_type,
            args.bert)
    else:
        criterion, train_params = configure_training('adam', args.lr,
                                                     args.clip, args.decay,
                                                     args.batch, args.bert)

    # save experiment setting
    if not exists(args.path):
        os.makedirs(args.path)
    with open(join(args.path, 'vocab.pkl'), 'wb') as f:
        pkl.dump(word2id, f, pkl.HIGHEST_PROTOCOL)
    meta = {}
    meta['net'] = 'base_abstractor'
    meta['net_args'] = net_args
    meta['traing_params'] = train_params
    with open(join(args.path, 'meta.json'), 'w') as f:
        json.dump(meta, f, indent=4)

    # prepare trainer
    if args.cuda:
        net = net.cuda()

    if 'soft' in args.mask_type and args.gat:
        val_fn = multitask_validate(net, criterion)
    else:
        val_fn = basic_validate(net, criterion)
    grad_fn = get_basic_grad_fn(net, args.clip)
    print(net._embedding.weight.requires_grad)

    optimizer = optim.AdamW(net.parameters(), **train_params['optimizer'][1])

    #optimizer = optim.Adagrad(net.parameters(), **train_params['optimizer'][1])
    scheduler = ReduceLROnPlateau(optimizer,
                                  'min',
                                  verbose=True,
                                  factor=args.decay,
                                  min_lr=0,
                                  patience=args.lr_p)

    # pipeline = BasicPipeline(meta['net'], net,
    #                          train_batcher, val_batcher, args.batch, val_fn,
    #                          criterion, optimizer, grad_fn)
    # trainer = BasicTrainer(pipeline, args.path,
    #                        args.ckpt_freq, args.patience, scheduler)
    if 'soft' in args.mask_type and args.gat:
        pipeline = MultiTaskPipeline(meta['net'], net, train_batcher,
                                     val_batcher, args.batch, val_fn,
                                     criterion, optimizer, grad_fn)
        trainer = MultiTaskTrainer(pipeline, args.path, args.ckpt_freq,
                                   args.patience, scheduler)
    else:
        pipeline = BasicPipeline(meta['net'], net, train_batcher, val_batcher,
                                 args.batch, val_fn, criterion, optimizer,
                                 grad_fn)
        trainer = BasicTrainer(pipeline, args.path, args.ckpt_freq,
                               args.patience, scheduler)

    print('start training with the following hyper-parameters:')
    print(meta)
    trainer.train()
Example #22
                        type=str)
parser.add_argument('vocab_file', help='a newline-delimited list of vocabulary words for which ' \
    'to generate embeddings', type=str)
parser.add_argument(
    'output_file',
    help='path and filename where embeddings should be written',
    type=str)
parser.add_argument('--count_file', help='optional path and filename for a file where counts of the number ' \
    'of context sentences per vocabulary word should be written', default=None, type=str)
args = parser.parse_args()

tokenizer = RobertaTokenizer.from_pretrained(args.model_path)
model = RobertaForMaskedLM.from_pretrained(args.model_path)
model.eval()

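# Read the newline-delimited vocabulary list (one word per line, as described by the vocab_file argument above).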
vocab = utils.make_vocab(args.vocab_file)
FEATURE_COUNT = 768  # Change this value to 1024 for the large RoBERTa model.
MAX_LINES = 2000  # Maximum number of context lines to average per vocabulary embedding.

if __name__ == "__main__":
    # Process vocabulary words in the outer loop.
    for v in vocab:
        with open(args.context_file, 'r') as lines:
            v_sum = torch.zeros([1, FEATURE_COUNT])
            v_tokens = utils.tokenize_text(v, tokenizer)
            utils.print_tokenized_text(v_tokens, tokenizer)
            count_sentence = 0
            count_tensor = 0

            # Process all lines in the context file in the inner loop.
            for line in lines:
Example #23
def main():
    # Loads vocab.
    vocab = make_vocab("data/ptb.train.txt")
    print("#vocab:", len(vocab))  # maybe 10000
    eos_id = vocab["<s>"]

    # Loads all corpus.
    train_corpus = load_corpus("data/ptb.train.txt", vocab)
    valid_corpus = load_corpus("data/ptb.valid.txt", vocab)
    num_train_sents = len(train_corpus)
    num_valid_sents = len(valid_corpus)
    num_train_labels = count_labels(train_corpus)
    num_valid_labels = count_labels(valid_corpus)
    print("train:", num_train_sents, "sentences,", num_train_labels, "labels")
    print("valid:", num_valid_sents, "sentences,", num_valid_labels, "labels")

    # Device and computation graph.
    dev = D.CUDA(0)
    Device.set_default(dev)
    g = Graph()
    Graph.set_default(g)

    # Our LM.
    lm = RNNLM(len(vocab), eos_id)

    # Optimizer.
    optimizer = O.SGD(1)
    #optimizer.set_weight_decay(1e-6)
    optimizer.set_gradient_clipping(5)
    optimizer.add(lm)

    # Sentence IDs.
    train_ids = list(range(num_train_sents))
    valid_ids = list(range(num_valid_sents))

    best_valid_ppl = 1e10

    # Train/valid loop.
    for epoch in range(MAX_EPOCH):
        print("epoch", epoch + 1, "/", MAX_EPOCH, ":")
        # Shuffles train sentence IDs.
        random.shuffle(train_ids)

        # Training.
        train_loss = 0
        for ofs in range(0, num_train_sents, BATCH_SIZE):
            batch_ids = train_ids[ofs:min(ofs + BATCH_SIZE, num_train_sents)]
            batch = make_batch(train_corpus, batch_ids, eos_id)

            g.clear()

            outputs = lm.forward(batch, True)
            loss = lm.loss(outputs, batch)
            train_loss += loss.to_float() * len(batch_ids)

            optimizer.reset_gradients()
            loss.backward()
            optimizer.update()

            print("%d" % ofs, end="\r")
            sys.stdout.flush()

        train_ppl = math.exp(train_loss / num_train_labels)
        print("  train ppl =", train_ppl)

        # Validation.
        valid_loss = 0
        for ofs in range(0, num_valid_sents, BATCH_SIZE):
            batch_ids = valid_ids[ofs:min(ofs + BATCH_SIZE, num_valid_sents)]
            batch = make_batch(valid_corpus, batch_ids, eos_id)

            g.clear()

            outputs = lm.forward(batch, False)
            loss = lm.loss(outputs, batch)
            valid_loss += loss.to_float() * len(batch_ids)
            print("%d" % ofs, end="\r")
            sys.stdout.flush()

        valid_ppl = math.exp(valid_loss / num_valid_labels)
        print("  valid ppl =", valid_ppl)

        if valid_ppl < best_valid_ppl:
            best_valid_ppl = valid_ppl
            print("  BEST")
        else:
            old_lr = optimizer.get_learning_rate_scaling()
            new_lr = 0.5 * old_lr
            optimizer.set_learning_rate_scaling(new_lr)
            print("  learning rate scaled:", old_lr, "->", new_lr)
Example #24
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--make-vocab",
        action="store_true",
        help="Set this flag if you want to make vocab from train data.")
    parser.add_argument("--do-train",
                        action="store_true",
                        help="Whether to run training.")
    parser.add_argument("--do-predict",
                        action="store_true",
                        help="Whether to run prediction.")
    parser.add_argument("--epoch-idx",
                        type=int,
                        default=EPOCHS,
                        help="Choose which model to predict.")
    args = parser.parse_args()

    logger = config_log()

    if args.make_vocab:
        make_vocab(train_file=TRAIN_FILE,
                   do_lower_case=DO_LOWER_CASE,
                   result_dir=RESULT_DIR,
                   text_col_name=TEXT_COL_NAME)
    if args.do_train:
        train(logger=logger)
    if args.do_predict:
        predict(args.epoch_idx, logger=logger)
Example #25
def main(task_config, n=21, k=2, device=0, d=100, epochs=100):
    # Global parameters
    debug_mode = True
    verbose = True
    save = True
    freeze_word_embeddings = True
    over_population_threshold = 100
    relative_over_population = True
    data_augmentation = True
    if debug_mode:
        data_augmentation = False
        over_population_threshold = None

    logging.info("Task name: {}".format(task_config['name']))
    logging.info("Debug mode: {}".format(debug_mode))
    logging.info("Verbose: {}".format(verbose))
    logging.info("Freeze word embeddings: {}".format(freeze_word_embeddings))
    logging.info(
        "Over population threshold: {}".format(over_population_threshold))
    logging.info(
        "Relative over population: {}".format(relative_over_population))
    logging.info("Data augmentation: {}".format(data_augmentation))

    use_gpu = torch.cuda.is_available()
    # use_gpu = False
    if use_gpu:
        cuda_device = device
        torch.cuda.set_device(cuda_device)
        logging.info('Using GPU')

    # Load dataset
    dataset = task_config['dataset'](debug_mode, relative_path='./data/')

    all_sentences = dataset.get_train_sentences + dataset.get_valid_sentences + dataset.get_test_sentences

    word_embeddings = load_embeddings(
        './data/glove_embeddings/glove.6B.{}d.txt'.format(d))
    chars_embeddings = load_embeddings(
        './predicted_char_embeddings/char_mimick_glove_d100_c20')

    # Prepare vectorizer
    word_to_idx, char_to_idx = make_vocab(all_sentences)
    vectorizer = WordsInContextVectorizer(word_to_idx, char_to_idx)

    # Initialize training parameters
    model_name = '{}_n{}_k{}_d{}_e{}'.format(task_config['name'], n, k, d,
                                             epochs)
    lr = 0.001
    if debug_mode:
        model_name = 'testing_' + model_name
        save = False
        epochs = 3

    # Create the model
    net = LRComick(
        characters_vocabulary=char_to_idx,
        words_vocabulary=word_to_idx,
        characters_embedding_dimension=20,
        # characters_embeddings=chars_embeddings,
        word_embeddings_dimension=d,
        words_embeddings=word_embeddings,
        # context_dropout_p=0.5,
        # fc_dropout_p=0.5,
        freeze_word_embeddings=freeze_word_embeddings)
    model_name = "{}_{}_v{}".format(model_name, net.__class__.__name__.lower(),
                                    net.version)
    handler = logging.FileHandler('{}.log'.format(model_name))
    logger.addHandler(handler)

    model = Model(
        model=net,
        optimizer=Adam(net.parameters(), lr=lr),
        loss_function=square_distance,
        metrics=[cosine_sim],
    )
    if use_gpu:
        model.cuda()

    # Prepare examples
    train_loader, valid_loader, test_loader, oov_loader = prepare_data(
        dataset=dataset,
        embeddings=word_embeddings,
        vectorizer=vectorizer,
        n=n,
        use_gpu=use_gpu,
        k=k,
        over_population_threshold=over_population_threshold,
        relative_over_population=relative_over_population,
        data_augmentation=data_augmentation,
        debug_mode=debug_mode,
        verbose=verbose,
    )

    # Set up the callbacks and train
    train(
        model,
        model_name,
        train_loader=train_loader,
        valid_loader=valid_loader,
        epochs=epochs,
    )

    test_embeddings = evaluate(model,
                               test_loader=test_loader,
                               test_embeddings=word_embeddings,
                               save=save,
                               model_name=model_name + '.txt')

    predicted_oov_embeddings = predict_mean_embeddings(model, oov_loader)

    # Override embeddings with the training ones
    # Make sure we only have embeddings from the corpus data
    logging.info("Evaluating embeddings...")
    predicted_oov_embeddings.update(word_embeddings)

    for task in task_config['tasks']:
        logging.info("Using predicted embeddings on {} task...".format(
            task['name']))
        task['script'](predicted_oov_embeddings,
                       task['name'] + "_" + model_name, device, debug_mode)
    logger.removeHandler(handler)
Example #26
def train(encdec, optimizer, prefix, best_valid_ppl):
    # Registers all parameters to the optimizer.
    optimizer.add_model(encdec)

    # Loads vocab.
    src_vocab = make_vocab(SRC_TRAIN_FILE, SRC_VOCAB_SIZE)
    trg_vocab = make_vocab(TRG_TRAIN_FILE, TRG_VOCAB_SIZE)
    inv_trg_vocab = make_inv_vocab(trg_vocab)
    print("#src_vocab:", len(src_vocab))
    print("#trg_vocab:", len(trg_vocab))

    # Loads all corpus
    train_src_corpus = load_corpus(SRC_TRAIN_FILE, src_vocab)
    train_trg_corpus = load_corpus(TRG_TRAIN_FILE, trg_vocab)
    valid_src_corpus = load_corpus(SRC_VALID_FILE, src_vocab)
    valid_trg_corpus = load_corpus(TRG_VALID_FILE, trg_vocab)
    test_src_corpus = load_corpus(SRC_TEST_FILE, src_vocab)
    test_ref_corpus = load_corpus_ref(REF_TEST_FILE, trg_vocab)
    num_train_sents = len(train_trg_corpus)
    num_valid_sents = len(valid_trg_corpus)
    num_test_sents = len(test_ref_corpus)
    num_train_labels = count_labels(train_trg_corpus)
    num_valid_labels = count_labels(valid_trg_corpus)
    print("train:", num_train_sents, "sentences,", num_train_labels, "labels")
    print("valid:", num_valid_sents, "sentences,", num_valid_labels, "labels")

    # Sentence IDs
    train_ids = list(range(num_train_sents))
    valid_ids = list(range(num_valid_sents))

    # Train/valid loop.
    for epoch in range(MAX_EPOCH):
        # Computation graph.
        g = Graph()
        Graph.set_default(g)

        print("epoch %d/%d:" % (epoch + 1, MAX_EPOCH))
        print("  learning rate scale = %.4e" %
              optimizer.get_learning_rate_scaling())

        # Shuffles train sentence IDs.
        random.shuffle(train_ids)

        # Training.
        train_loss = 0.
        for ofs in range(0, num_train_sents, BATCH_SIZE):
            print("%d" % ofs, end="\r")
            sys.stdout.flush()

            batch_ids = train_ids[ofs:min(ofs + BATCH_SIZE, num_train_sents)]
            src_batch = make_batch(train_src_corpus, batch_ids, src_vocab)
            trg_batch = make_batch(train_trg_corpus, batch_ids, trg_vocab)

            g.clear()
            encdec.encode(src_batch, True)
            loss = encdec.loss(trg_batch, True)
            train_loss += loss.to_float() * len(batch_ids)

            optimizer.reset_gradients()
            loss.backward()
            optimizer.update()

        train_ppl = math.exp(train_loss / num_train_labels)
        print("  train PPL = %.4f" % train_ppl)

        # Validation.
        valid_loss = 0.
        for ofs in range(0, num_valid_sents, BATCH_SIZE):
            print("%d" % ofs, end="\r")
            sys.stdout.flush()

            batch_ids = valid_ids[ofs:min(ofs + BATCH_SIZE, num_valid_sents)]
            src_batch = make_batch(valid_src_corpus, batch_ids, src_vocab)
            trg_batch = make_batch(valid_trg_corpus, batch_ids, trg_vocab)

            g.clear()
            encdec.encode(src_batch, False)
            loss = encdec.loss(trg_batch, False)
            valid_loss += loss.to_float() * len(batch_ids)

        valid_ppl = math.exp(valid_loss / num_valid_labels)
        print("  valid PPL = %.4f" % valid_ppl)

        # Calculates test BLEU.
        stats = defaultdict(int)
        for ofs in range(0, num_test_sents, BATCH_SIZE):
            print("%d" % ofs, end="\r")
            sys.stdout.flush()

            src_batch = test_src_corpus[ofs:min(ofs +
                                                BATCH_SIZE, num_test_sents)]
            ref_batch = test_ref_corpus[ofs:min(ofs +
                                                BATCH_SIZE, num_test_sents)]

            hyp_ids = test_batch(encdec, src_vocab, trg_vocab, src_batch)
            for hyp_line, ref_line in zip(hyp_ids, ref_batch):
                for k, v in get_bleu_stats(ref_line[1:-1], hyp_line).items():
                    stats[k] += v

        bleu = calculate_bleu(stats)
        print("  test BLEU = %.2f" % (100 * bleu))

        # Saves best model/optimizer.
        if valid_ppl < best_valid_ppl:
            best_valid_ppl = valid_ppl
            print("  saving model/optimizer ... ", end="")
            sys.stdout.flush()
            encdec.save(prefix + ".model")
            optimizer.save(prefix + ".optimizer")
            save_ppl(prefix + ".valid_ppl", best_valid_ppl)
            print("done.")
        else:
            # Learning rate decay by 1/sqrt(2)
            new_scale = .7071 * optimizer.get_learning_rate_scaling()
            optimizer.set_learning_rate_scaling(new_scale)
Example #27
def main(args):
    # create data batcher, vocabulary
    # batcher
    with open(join(DATA_DIR, 'vocab_cnt.pkl'), 'rb') as f:
        wc = pkl.load(f)
    word2id = make_vocab(wc, args.vsize)
    train_batcher, val_batcher = build_batchers(word2id, args.cuda, args.debug)

    # make net
    net, net_args = configure_net(len(word2id), args.emb_dim, args.n_hidden,
                                  args.bi, args.n_layer)
    if args.w2v:
        # NOTE: the pretrained embedding having the same dimension
        #       as args.emb_dim should already be trained
        embedding, oov = make_embedding({i: w
                                         for w, i in word2id.items()},
                                        args.w2v)
        net.set_embedding(embedding)

    # configure training setting
    criterion, train_params = configure_training('adam', args.lr, args.clip,
                                                 args.decay, args.batch)

    # save experiment setting
    if not exists(args.path):
        os.makedirs(args.path)
    with open(join(args.path, 'vocab.pkl'), 'wb') as f:
        pkl.dump(word2id, f, pkl.HIGHEST_PROTOCOL)
    meta = {}
    meta['net'] = 'base_abstractor'
    meta['net_args'] = net_args
    meta['traing_params'] = train_params
    with open(join(args.path, 'meta.json'), 'w') as f:
        json.dump(meta, f, indent=4)

    # prepare trainer
    val_fn = basic_validate(net, criterion)
    grad_fn = get_basic_grad_fn(net, args.clip)
    optimizer = optim.Adam(net.parameters(), **train_params['optimizer'][1])
    scheduler = ReduceLROnPlateau(optimizer,
                                  'min',
                                  verbose=True,
                                  factor=args.decay,
                                  min_lr=0,
                                  patience=args.lr_p)

    if args.cuda:
        net = net.cuda()
    pipeline = BasicPipeline(meta['net'], net, train_batcher, val_batcher,
                             args.batch, val_fn, criterion, optimizer, grad_fn)
    trainer = BasicTrainer(pipeline, args.path, args.ckpt_freq, args.patience,
                           scheduler)

    print('start training with the following hyper-parameters:')
    print(meta)

    # # Print model's state_dict
    # print("Model's state_dict:")
    # for param_tensor in net.state_dict():
    #     print(param_tensor, "\t", net.state_dict()[param_tensor].size())
    #
    # # Print optimizer's state_dict
    # print("Optimizer's state_dict:")
    # for var_name in optimizer.state_dict():
    #     print(var_name, "\t", optimizer.state_dict()[var_name])

    # # IMPORT PRETRAINED MODEL PARAMETERS
    # net.load_state_dict(torch.load(
    #     'pretrained_eng_model/abstractor/ckpt/ckpt-0-0')['state_dict'])
    # net.eval()  # do I need that or not?

    # copy net
    # from copy import deepcopy
    # net_copy = deepcopy(net)
    # net_copy.load_state_dict(torch.load('pretrained_eng_model/abstractor/ckpt/ckpt-0-0', map_location='cpu')['state_dict'])

    # for key in net_copy.state_dict():
    #     print('key: ', key)
    #     param = net_copy.state_dict()[key]
    #     print('param.shape: ', param.shape)
    #     print('param.requires_grad: ', param.requires_grad)
    #     print('param.shape, param.requires_grad: ', param.shape, param.requires_grad)
    #     print('isinstance(param, nn.Module) ', isinstance(param, nn.Module))
    #     print('isinstance(param, nn.Parameter) ', isinstance(param, nn.Parameter))
    #     print('isinstance(param, torch.Tensor): ', isinstance(param, torch.Tensor))
    #     print('=====')

    # save current state dict
    model_dict = net.state_dict()

    # save some parameters for testing purposes if the dict was loaded successfully
    p1 = net._embedding.weight[0][0].detach().cpu().numpy()
    p2 = net._enc_lstm.weight_hh_l0[0][0].detach().cpu().numpy()
    p3 = net._attn_wm.data[0][0].detach().cpu().numpy()

    # print(p1)
    # print(p2)
    # print(p3)

    # load dict from pretrained net
    ABS_DIR = os.environ['ABS']
    print(ABS_DIR)

    # uncomment for gpu
    # pretrained_dict = torch.load(ABS_DIR)['state_dict']
    pretrained_dict = torch.load(ABS_DIR)['state_dict']

    # skip embedding weights
    pretrained_dict = {
        k: v
        for k, v in pretrained_dict.items() if k != '_embedding.weight'
    }

    # overwrite entries in the existing state dict
    model_dict.update(pretrained_dict)

    print('Model will be trained on device:')
    print(model_dict['_embedding.weight'].device)

    # load the new state dict
    net.load_state_dict(model_dict)

    # check if the update was correct
    pn1 = net._embedding.weight[0][0].detach().cpu().numpy()
    pn2 = net._enc_lstm.weight_hh_l0[0][0].detach().cpu().numpy()
    pn3 = net._attn_wm.data[0][0].detach().cpu().numpy()

    # print(pn1)
    # print(pn2)
    # print(pn3)

    assert p1 == pn1  # embedding layer has to be the same
    assert p2 != pn2
    assert p3 != pn3

    print('Embedding layer has not been overwritten')

    # set updating of the parameters
    for name, param in net.named_parameters():
        #param.requires_grad = True
        print(name, param.requires_grad)

    trainer.train()