Example #1
0
    def __init__(self,
                 w2v_ph,
                 model_root,
                 strees=None,
                 n_iters=1000,
                 n_step2save=250,
                 window=5,
                 tolerance=0.03):
        """Load the word vectors, build the autoencoder and init the frame.

        Args:
            w2v_ph: path passed to ``Word2Vec.model_fromfile``.
            model_root: forwarded to ``ExecFrame.__init__`` as ``model_root``.
            strees: forwarded to ``_ParseTreeAutoencoder`` as ``strees``.
                When omitted (``None``) a fresh empty list is used.
            n_iters: forwarded to ``ExecFrame.__init__``.
            n_step2save: forwarded to ``ExecFrame.__init__``.
            window: forwarded to ``ExecFrame.__init__``.
            tolerance: forwarded to ``ExecFrame.__init__``.
        """
        # A list literal as the default value is evaluated once and then
        # shared by every call that omits ``strees``; create a new list per
        # call so instances can never alias (and mutate) the same object.
        if strees is None:
            strees = []

        _word2vec = Word2Vec()
        _word2vec.model_fromfile(w2v_ph)

        model = _ParseTreeAutoencoder(
            word2vec=_word2vec,
            strees=strees,
        )
        ExecFrame.__init__(self,
                           model=model,
                           model_root=model_root,
                           n_iters=n_iters,
                           n_step2save=n_step2save,
                           window=window,
                           tolerance=tolerance)
        # Both rate trackers start at zero.
        self.c_rate = 0.0
        self.cur_c_rate = 0.0
# Script section: load the validation split and a previously saved word
# embedding, then encode the validation sentences with `sent2idx`.
# VALID_FILE_PATH, EMBEDDING_SAVE_PATH and INPUT_LEN are not defined in this
# section and must be provided earlier in the file.
TARGET_LEN = 40

# Checkpoint path; it is only assigned here, not loaded in this section.
pretrained_ckpt = 'attention/model_best_rouge1.ckpt'

device = 'cuda'

# read data
print('reading data...')
from _utils import read_jsonl
valid_X, valid_Y = read_jsonl(VALID_FILE_PATH)
print('done')

# load pretrained word embedding
print('loading word embedding...')
from _word2vec import Word2Vec
# presumably 300 is the embedding dimension and raw=False selects loading a
# previously saved object instead of a raw vector file -- TODO confirm
# against _word2vec.Word2Vec.
word2vec = Word2Vec(EMBEDDING_SAVE_PATH, 300, raw=False)
embedding = word2vec.embedding

# Vocabulary entries of the special tokens, looked up in word2idx.
SOS_token = word2vec.word2idx['<SOS>']
EOS_token = word2vec.word2idx['<EOS>']
PAD_token = word2vec.word2idx['<PAD>']
UNK_token = word2vec.word2idx['<UNK>']
print('done')

# encode the validation sentences with word2vec.sent2idx
print('valid_X')
valid_X = word2vec.sent2idx(valid_X, INPUT_LEN)
# NOTE(review): valid_Y is encoded with INPUT_LEN although TARGET_LEN is
# defined above and is unused in the visible part of this script -- confirm
# which length is intended for the targets.
valid_Y = word2vec.sent2idx(valid_Y, INPUT_LEN)

# convert them to dataset and dataloader
import torch
Example #3
0
# Script section: read the train/valid/test splits, build the embedding
# from them and prepare the Word2Vec state for dumping.
# TRAIN_FILE_PATH, VALID_FILE_PATH, TEST_FILE_PATH, EMBEDDING_FILE_PATH,
# EMBEDDING_DIM, MIN_DISCARD_LEN and EMBEDDING_SAVE_PATH are not defined in
# this section and must be provided earlier in the file.
CKPT_NAME = 'seq2seq/model.ckpt'

device = 'cuda'

# read data
print('reading data...')
from _utils import read_jsonl
train_X, train_Y = read_jsonl(TRAIN_FILE_PATH)
valid_X, valid_Y = read_jsonl(VALID_FILE_PATH)
# The second return value is discarded for the test split; presumably the
# False flag tells read_jsonl that no targets are present -- TODO confirm.
test_X, _ = read_jsonl(TEST_FILE_PATH, False)
print('done')

# load pretrained word embedding
print('loading word embedding...')
from _word2vec import Word2Vec
word2vec = Word2Vec(EMBEDDING_FILE_PATH, EMBEDDING_DIM)
# The embedding is built from all five sentence collections.
# presumably MIN_DISCARD_LEN is a threshold for dropping words -- TODO
# confirm against Word2Vec.make_embedding.
embedding = word2vec.make_embedding(
    [train_X, train_Y, valid_X, valid_Y, test_X], MIN_DISCARD_LEN)

# Vocabulary entries of the special tokens, looked up in word2idx.
SOS_token = word2vec.word2idx['<SOS>']
EOS_token = word2vec.word2idx['<EOS>']
PAD_token = word2vec.word2idx['<PAD>']
UNK_token = word2vec.word2idx['<UNK>']
print('done')

# dump word2vec object
import pickle
with open(EMBEDDING_SAVE_PATH, 'wb') as f:
    # Collect the two attributes to persist: the embedding and the
    # word-to-index mapping.
    tmp = {}
    tmp['embedding'] = word2vec.embedding
    tmp['word2idx'] = word2vec.word2idx
Example #4
0
    'len_X': 40,
    'len_Y': 30,
    'BATCH_SIZE': 64,
    'model_name': sys.argv[3],
}
# Wrap the settings dict in a Namespace so entries are read as attributes
# (e.g. args.dir below).
args = argparse.Namespace(**args)

# set random seed
# The same fixed seed is applied to Python's random module, NumPy and
# PyTorch (CPU and all CUDA devices).
random.seed(1003)
np.random.seed(1003)
torch.manual_seed(1003)
torch.cuda.manual_seed_all(1003)
# Ask cuDNN to use deterministic algorithms.
torch.backends.cudnn.deterministic = True

# word2vec
# One vocabulary object per language ("en" and "cn").
# NOTE(review): the '{}' placeholder in the file names is not formatted
# here; presumably Word2Vec fills it in itself -- confirm.
en, cn = Word2Vec(os.path.join(args.dir, '{}_en.json')), Word2Vec(
    os.path.join(args.dir, '{}_cn.json'))

# Special-token entries of the "en" vocabulary.
en_BOS_token = en.word2idx['<BOS>']
en_EOS_token = en.word2idx['<EOS>']
en_PAD_token = en.word2idx['<PAD>']
en_UNK_token = en.word2idx['<UNK>']

# Special-token entries of the "cn" vocabulary.
cn_BOS_token = cn.word2idx['<BOS>']
cn_EOS_token = cn.word2idx['<EOS>']
cn_PAD_token = cn.word2idx['<PAD>']
cn_UNK_token = cn.word2idx['<UNK>']

# train, valid, test data
#train_X, train_Y = read_data(os.path.join(args.dir, 'training.txt'))
#valid_X, valid_Y = read_data(os.path.join(args.dir, 'validation.txt'))
Example #5
0
            return
        bt.forward_update_vec()
        sentence_vec = bt.root.vector
        return sentence_vec


if __name__ == "__main__":
    args = sys.argv[1:]
    if len(args) == 0:
        print 'cat stree_paths | ./cmd.py w2v_ph bae_ph'
        sys.exit(-1)
    stree_paths = sys.stdin.read().split()
    # args
    w2v_ph, bae_ph = args
    # load word2vec
    _word2vec = Word2Vec()
    _word2vec.model_fromfile(w2v_ph)
    # load bae
    bae = obj_from_file(bae_ph)
    tree2vec = Tree2Vec(_word2vec, bae)
    for path in stree_paths:
        output = []
        with open(path) as f:
            strees = f.readlines()
            # used to recover the original valid sentences
            valid_line_nos = []
            for no, stree in enumerate(strees):
                stree = stree.strip()
                #print 'parsing', stree
                sentence_vec = tree2vec.get_vec_from_stree(stree)
                if sentence_vec is not None: