Example #1
def infer():
    tar_id2vocab, BOS_ID, EOS_ID = get_vocab(args.dataset, args.batch_size)
    vocab_size = len(tar_id2vocab)

    print(args)
    net = VAESeq2SeqInferModel(args.embed_dim, args.hidden_size,
                               args.latent_size, vocab_size)

    model = paddle.Model(net)
    model.prepare()
    model.load(args.init_from_ckpt)

    infer_output = paddle.ones((args.batch_size, 1), dtype='int64') * BOS_ID

    space_token = ' '
    line_token = '\n'
    with io.open(args.infer_output_file, 'w', encoding='utf-8') as out_file:
        predict_lines = model.predict_batch(infer_output)[0]
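        # cut each predicted sequence at the first EOS token and map ids back to words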
        for line in predict_lines:
            end_id = -1
            if EOS_ID in line:
                end_id = np.where(line == EOS_ID)[0][0]
            new_line = [tar_id2vocab[e[0]] for e in line[:end_id]]
            out_file.write(space_token.join(new_line))
            out_file.write(line_token)
Example #2
def build_text():
    iw, vocab, _ = get_vocab()
    with open(text_path, 'w', encoding='utf-8') as f:
        data = load_data()
        for post, resp in data:
            post = " ".join(tokenize(post[0], vocab=vocab))
            resp = " ".join(tokenize(resp[0], vocab=vocab))
            f.write(post + "\t" + resp + "\n")
Example #3
def get_embedding():
    emb_path = "datasets/temp/embedding.np"
    if os.path.exists(emb_path):
        return np.load(open(emb_path, 'rb'))
    else:
        # Build the embedding matrix from the pretrained word2vec model.
        model = KeyedVectors.load_word2vec_format(model_path, binary=True)
        iw, vocab, _ = get_vocab()
        size = len(vocab)
        emb = np.zeros(shape=[size, emb_dim])
        for word, index in vocab.items():
            # Leave rows 0 and 1 as zeros and skip words the model does not cover.
            if index in [0, 1] or word not in model.vocab:
                continue
            emb[index] = model[word]
        np.save(open(emb_path, "wb"), emb)
        return emb
Example #4
    def __init__(self, args):
        super(DCTTS, self).__init__()
        self.args = args
        self.embed = nn.Embedding(len(data.get_vocab(args.lang)), args.Ce, padding_idx=0)
        self.TextEnc = TextEncoder(d_in=args.Ce, d_out=args.Cx*2, d_hidden=args.Cx*2)
        self.AudioEnc = AudioEncoder(d_in=args.n_mels, d_out=args.Cx, d_hidden=args.Cx)
        self.Attention = DotProductAttention(d_hidden=args.Cx)
        self.AudioDec = AudioDecoder(d_in=args.Cx*2, d_out=args.n_mels, d_hidden=args.Cy)
        self.PostNet = PostNet(d_in=args.n_mels, d_out=args.n_mels, d_hidden=args.Cx)
        # F0Enc maps a scalar F0 value up to the text-encoder output width (Cx*2).
        self.F0Enc = nn.Sequential(
            nn.Linear(1, 32),
            nn.ReLU(),
            nn.Linear(32, args.Cx*2),
            nn.Tanh(),
        )
Example #5
import tensorflow as tf
import numpy as np
from data import get_vocab, load_train_data_pipe, data_iter_combine
from model_combine import COMBINE

sen_len = 40
label_num = 5
sparse_len = 140
crf_num = 10720
learning_rate = 0.02
batch_size = 10
num_epoch = 10
dropout = True

print('read vocab ...')
vocab_size, embedding_size, embedding, vocab_w2i = get_vocab()
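# the hidden size is tied to the embedding dimensionality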
num_hidden = embedding_size

# Mapping between NER tag strings and integer class ids, plus the inverse lookup.
label_onehot = {'b-person': 1,
                'i-person': 2,
                'b-organization': 3,
                'i-organization': 4,
                'o': 0}
label_hotone = {v: k for k, v in label_onehot.items()}
Example #6
#coding:utf-8
import tensorflow as tf
from data import get_train_data, get_vocab, split_data, response_len, post_len, padding
import random
import os

from pprint import pprint
import numpy as np
import time

id2w, w2id, freq = get_vocab()

from emo_cls.classification import Classification
from seq2seq_attention_9emo import Seq2SeqAttentionMinDis, Seq2SeqAttentionMaxDis, Seq2SeqAttentionEmoContent
from seq2seq_attention_9emo import Seq2SeqAttentionHappy, Seq2SeqAttentionSad, Seq2SeqAttentionAnger, Seq2SeqAttentionDisgust
from seq2seq_attention_9emo import Seq2SeqAttentionLike  #,Seq2SeqAttentionSurprise,Seq2SeqAttentionFear

train_datas, val_datas, test_datas = split_data()

keys = ['posts', 'postLen', 'resps', 'respLen', 'resp_tfidf']
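# keep only these fields, in this order, for each split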
train_datas = [train_datas[k] for k in keys]
val_datas = [val_datas[k] for k in keys]
print("train num:%s" % len(train_datas[0]))

seq_len = 20
batch_size = 128
D_step = 5
G_step = 1
is_debug = True

# Emotion Classifier
Example #7
import random
import numpy as np
import data

train_data_file = 'D:\\nlp\\我的实验\\句子相似度\\sick\\SICK_train.txt'
trial_data_file = 'D:\\nlp\\我的实验\\句子相似度\\sick\\SICK_trial.txt'
test_data_file = 'D:\\nlp\\我的实验\\句子相似度\\sick\\SICK_test_annotated.txt'

train_data = data.load_data(train_data_file)
trial_data = data.load_data(trial_data_file)
test_data = data.load_data(test_data_file)

train_data = train_data + trial_data
print('train data size: ', len(train_data))
print('test data size: ', len(test_data))

vocab, vocab_size, word_to_id, id_to_word, word_to_count = data.get_vocab(
    train_data + test_data)
print('vocab size: ', vocab_size)

word_to_senses_path = 'D:\\nlp\\我的实验\\多义词\\trained_40w\\word_to_sense.txt'
#word_to_vector_path='D:\\nlp\\我的实验\\多义词\\trained_40w\\word_vectors.txt'
sense_to_vector_path = 'D:\\nlp\\我的实验\\多义词\\trained_40w\\sense_vectors.txt'

ass_vector = data.get_sense_ass(word_to_senses_path, sense_to_vector_path)
init_emb = data.fill_with_gloves(word_to_id, ass_vector)
#print(init_emb)
print('Embedding Size: %d' % init_emb.shape[1])

train_ndata = data.convert_to_numeric(train_data, word_to_id)
test_ndata = data.convert_to_numeric(test_data, word_to_id)

#model
Example #8
import jieba

## Tokenizer: thulac
import thulac
thu1 = thulac.thulac(seg_only=True)  # segmentation only, no POS tagging
text = thu1.cut("我爱北京天安门", text=True)  # segment a single sentence
print(text)

# Tokenizer: pynlpir
import pynlpir
pynlpir.open()

s = '欢迎科研人员、技术工程师、企事业单位与个人参与NLPIR平台的建设工作。'
pynlpir.segment(s, pos_tagging=False)

_, vocab, _ = get_vocab()


def sent2ids(sent):
    #    sent=" ".join(jieba.lcut(sent))
    #    sent=" ".join(thu1.cut(sent,text=True))
    sent = " ".join(pynlpir.segment(sent, pos_tagging=False))
    words = tokenize(sent, vocab)
    print(words)
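    # map words to ids; out-of-vocabulary words fall back to id 1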
    ids = [vocab.get(w, 1) for w in words]
    print(ids)
    l = len(ids)
    return padding([ids], max_len=20), np.array([l]), np.array([20])


if __name__ == "__main__":
Example #9
        loss = train(model=model,
                     optimizer=optimizer,
                     input_variable=input_variable,
                     target_variable=target_variable,
                     criterion=criterion)
        print_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' %
                  (timeSince(start, iter / n_epochs), iter,
                   iter / n_epochs * 100, print_loss_avg))


source_vocab, target_vocab = data.get_vocab()

# Actually training the model
model = model_transformer.Transformer(queries_dim=64,
                                      keys_dim=64,
                                      values_dim=64,
                                      model_dim=512,
                                      num_encoder_layers=6,
                                      num_decoder_layers=6,
                                      n_source_vocab=source_vocab,
                                      n_target_vocab=target_vocab,
                                      num_encoder_heads=8,
                                      num_decoder_heads=8)

if use_cuda:
    model = model.cuda()
Example #10
from pprint import pprint
from matplotlib import cm
import matplotlib.pyplot as plt

import data
from data import get_vocab, get_padded_train_data, get_predicates, get_questions
from word2vec import get_embedding
import re
import random
import numpy as np
from keras import backend as K  # assumption: K below refers to the Keras backend

base_weight_path = "./weights/"
base_encoded_path = "./datasets/predict/encoded_data"
question_len = data.question_len
predicate_len = data.predicate_len

id2w, vocab = get_vocab()
size = len(vocab)
embedding = get_embedding()
embedding = embedding / np.sqrt(
    (np.sum(np.square(embedding), axis=-1, keepdims=True) + 1e-8))  # this normalization works better


# cosine function
def cosine(x1, x2):
    return K.sum(x1 * x2, axis=-1) / (
        K.sqrt(K.sum(x1 * x1, axis=-1) * K.sum(x2 * x2, axis=-1) + 1e-12)
    )  # cosine


def neg_log_loss(x):
    cos1 = x[0]
    group.add_argument("--meta_dev",
                       type=str,
                       default="metaphor_data/validation.csv")
    group.add_argument("--meta_test",
                       type=str,
                       default="metaphor_data/test.csv")
    group.add_argument("--output", type=str, default="output.tsv")
    args = vars(parser.parse_args())
    logging.info(args)

    (meta_train, meta_dev, meta_test) = \
        (args["meta_train"], args["meta_dev"], args["meta_test"])
    # Set seed to combat random effects
    set_seed(args["seed"])

    vocab, sentences = get_vocab(meta_train, meta_dev, meta_test)
    bert = args["model"] == "bert"

    # Metaphor data filenames
    if args["dev"]:
        (meta_test, meta_dev) = (meta_dev, meta_test)

    meta_train = get_metaphor_data(meta_train,
                                   args["batch_size"],
                                   args["k"],
                                   bert,
                                   train=True)
    meta_dev = get_metaphor_data(meta_dev, 8, args["k"], bert)
    meta_test = get_metaphor_data(meta_test, 8, args["k"], bert)

    # Initialise an empty model and train it.
Example #12
if not os.path.exists(model_path): os.makedirs(model_path)
ckpt_prefix = os.path.join(model_path, "model")
paths['model_path'] = ckpt_prefix
result_path = os.path.join(output_path, "results")
paths['result_path'] = result_path
if not os.path.exists(result_path): os.makedirs(result_path)
log_path = os.path.join(result_path, "log.txt")
paths['log_path'] = log_path
get_logger(log_path).info(str(args))

if __name__ == '__main__':

    build_word_index(args.word_embedding_file, args.src_vocab_file,
                     args.tgt_file, args.tgt_vocab_file)

    src_vocab = get_vocab(args.src_vocab_file)
    src_vocab_size = len(src_vocab)
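    # reserve two extra ids beyond the vocabulary: one for unknown tokens, one for padding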
    src_unknown = src_vocab_size
    src_padding = src_vocab_size + 1
    #print(len(src_vocab))
    #print(vocab_size)

    tgt_vocab = get_vocab(args.tgt_vocab_file)
    tgt_vocab_size = len(tgt_vocab)
    tgt_unknown = tgt_vocab_size
    tgt_padding = tgt_vocab_size + 1
    #print(tgt_vocab)

    embedding = load_word2vec_embedding(args.word_embedding_file,
                                        args.embedding_dim, src_vocab_size)