Code example #1
def read_train_data(file):
    print('read training set')
    pairs_li = []
    lines = 0  # total number of lines in the training file

    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            lines += 1
            try:
                source, target = line.split('==')
            except ValueError:
                print('format mismatch in dataset: ', line.split('=='))
                continue

            if len(source.split(' - ')) == 4:  # poem_1031k_theme: repeat the first keyword
                source = source + ' - ' + source.split(' - ')[0]
            source_words = ('START1 ' + source + ' END1').split(' ')
            target = target.replace('\n', '')
            # Train on five sentences: the four target lines joined by ' / ',
            # then the first line repeated at the end.
            target_words = target.replace('\t', ' / ').split(' ') + [
                '/'
            ] + target.split('\t')[0].split(' ')

            # Unknown words map to the last vocabulary id (vocab_size - 1).
            source_ids = [
                word2id.get(word, vocab_size - 1) for word in source_words
            ]
            target_ids = [
                word2id.get(word, vocab_size - 1) for word in target_words
            ]
            target_ids.append(EOS_token)  # no SOS_token is prepended

            if len(target_ids) == 40:  # keep only seven-character quatrains (5*7 words + 4 separators + EOS)
                pairs_li.append([source_ids, target_ids])

    print('training set size:', len(pairs_li))
    return pairs_li
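
A minimal usage sketch, assuming the vocabulary globals (word2id, vocab_size, EOS_token) are initialized elsewhere in the project; the dataset path is hypothetical:

# Each dataset line is expected to look like (keywords, then tab-separated lines):
#   kw1 - kw2 - kw3 - kw4==c c c c c c c\tc c c c c c c\tc c c c c c c\tc c c c c c c
pairs = read_train_data('resource/dataset/poem_1031k_theme.txt')  # hypothetical path
source_ids, target_ids = pairs[0]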
Code example #2
    def predict(self, data, cangtou, predict_param):
        hard_rhyme = predict_param['hard_rhyme']
        with torch.no_grad():
            encoder_hidden = self.encoder.initHidden(1)

            input_sentence, input_length = data
            encoder_outputs, encoder_hidden = self.encoder(
                input_sentence, [input_length], encoder_hidden)

            # Pad encoder_outputs to INPUT_MAX_LENGTH, since the attention
            # module fixes this dimension to INPUT_MAX_LENGTH.
            encoder_outputs_padded = torch.zeros(1,
                                                 self.input_max_len,
                                                 self.encoder.hidden_size,
                                                 device=device)
            for b in range(1):  # batch size is 1 at prediction time
                for ei in range(input_length):
                    encoder_outputs_padded[b, ei] = encoder_outputs[b, ei]

            decoder_input = torch.tensor([[SOS_token]],
                                         device=device)  # the first input is the START token
            decoder_hidden = encoder_hidden  # Use last hidden state from encoder to start decoder

            sen_len = 7  # fixed for now: seven characters per line
            sen_num = 4  # four lines per poem
            decoded_words = []

            for i in range(sen_num):
                for j in range(sen_len):
                    position = torch.tensor([[i, j]],
                                            dtype=torch.float,
                                            device=device)
                    decoder_output, decoder_hidden, decoder_attention = self.decoder(
                        decoder_input, decoder_hidden, encoder_outputs_padded,
                        position)
                    # Acrostic (cangtou): force the line's first character.
                    if j == 0 and cangtou and i < len(cangtou):
                        top_word = cangtou[i]
                        top_id = torch.LongTensor(
                            [word2id.get(top_word, vocab_size - 1)])
                    else:
                        top_id, top_word = get_next_word(decoder_output.data,
                                                         decoded_words,
                                                         hard_rhyme=hard_rhyme)
                        if top_word == 'N':
                            print('cannot meet requirements')
                            break
                    decoded_words.append(top_word)
                    decoder_input = top_id.reshape(
                        (1, 1)).detach()  # detach from history as input

                # One extra decoder step at the end-of-line position (j == 7)
                # carries the hidden state across the line boundary.
                position = torch.tensor([[i, 7]],
                                        dtype=torch.float,
                                        device=device)
                tmp_decoder_output, tmp_decoder_hidden, tmp_decoder_attention = self.decoder(
                    decoder_input, decoder_hidden, encoder_outputs_padded,
                    position)
                decoder_hidden = tmp_decoder_hidden
                decoder_input = torch.tensor([[2]], device=device)  # feed '/' (id 2) as the next input

        return decoded_words
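
A small design note: the double loop that copies encoder outputs into the zero-padded buffer can be written as a single slice assignment. A sketch with the same tensor shapes as above (batch size 1):

encoder_outputs_padded = torch.zeros(1, self.input_max_len,
                                     self.encoder.hidden_size, device=device)
encoder_outputs_padded[:, :input_length] = encoder_outputs[:, :input_length]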
Code example #3
File: Seq2seq_6.py  Project: CSLT-THU/Vivi_3.0
    def predict(self, data, cangtou, predict_param):
        with torch.no_grad():
            encoder_hidden = self.encoder.initHidden(1)

            sen_len = 7  # fixed for now: seven characters per line
            sen_num = 4  # four lines per poem
            
            input_sentence, input_length = data
            decoded_words = [id2word[str(id.item())] for id in input_sentence[0]]
            decoder_input = torch.tensor([[2]], device=device)  # the first input is '/'
           
            # The first line is given as input; generate the remaining lines.
            for i in range(sen_num - 1):
                encoder_outputs, encoder_hidden = self.encoder(input_sentence, [input_length], encoder_hidden)

                # Pad encoder_outputs to INPUT_MAX_LENGTH, since the attention
                # module fixes this dimension to INPUT_MAX_LENGTH.
                encoder_outputs_padded = torch.zeros(1, self.input_max_len, self.encoder.hidden_size,
                                                     device=device)
                for b in range(1):
                    for ei in range(input_length):
                        encoder_outputs_padded[b, ei] = encoder_outputs[b, ei]

                decoder_hidden = encoder_hidden  # Use last hidden state from encoder to start decoder
                
                for j in range(sen_len):
                    position = torch.tensor([[i, j]], dtype=torch.float, device=device)
                    decoder_output, decoder_hidden, decoder_attention = self.decoder(
                        decoder_input, decoder_hidden, encoder_outputs_padded, position)
                    # Acrostic (cangtou): force the line's first character.
                    if j == 0 and cangtou and i < len(cangtou):
                        top_word = cangtou[i]
                        top_id = torch.LongTensor([word2id.get(top_word, vocab_size - 1)])
                    else:
                        top_id, top_word = get_next_word(decoder_output.data, decoded_words)
                        if top_word == 'N':
                            print('cannot meet requirements')
                            break
                    decoded_words.append(top_word)
                    decoder_input = top_id.reshape((1, 1)).detach()  # detach from history as input

                # Re-encode everything decoded so far as the next encoder input,
                # zero-padded up to input_max_len.
                li = [word2id.get(word, vocab_size - 1) for word in decoded_words]
                for k in range(self.input_max_len - len(li)):
                    li.append(0)
                input_sentence = torch.tensor([li], dtype=torch.long, device=device)
                input_length = torch.tensor([len(decoded_words)], dtype=torch.long, device=device)
                decoder_input = torch.tensor([[2]], device=device)  # feed '/' (id 2) as the next input
                
        return decoded_words
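
In contrast to code example #2, this nL21L variant re-encodes everything generated so far before decoding each new line: at the end of each iteration, input_sentence is rebuilt from decoded_words, so every new line is conditioned on all previous lines rather than only on the original keyword encoding.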
        
Code example #4
def read_nL21L_train_data(file):
    # nL21L: build (previous lines -> next line) pairs from each quatrain.
    pairs = []
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            source, target = line.split('==')
            target = target.replace('\n', '')
            sentences = target.split('\t')
            idss = []
            for sentence in sentences:
                words = sentence.split(' ')
                ids = [word2id.get(word, vocab_size - 1) for word in words]
                ids.append(2)  # every line, including the target line, ends with '/' (id 2)
                idss.append(ids)
            p1 = [idss[0], idss[1]]                        # line 1    -> line 2
            p2 = [idss[0] + idss[1], idss[2]]              # lines 1-2 -> line 3
            p3 = [idss[0] + idss[1] + idss[2], idss[3]]    # lines 1-3 -> line 4
            pairs.append(p1)
            pairs.append(p2)
            pairs.append(p3)
    return pairs
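
A toy run-through of the pair expansion (hypothetical two-word vocabulary; ids are for illustration only):

# Assume word2id = {'a': 0, 'b': 1}; the hard-coded 2 is the id of '/'.
# For the dataset line 'kw==a b\ta a\tb b\tb a', the quatrain expands into:
#   [[0, 1, 2], [0, 0, 2]]                       # line 1    -> line 2
#   [[0, 1, 2, 0, 0, 2], [1, 1, 2]]              # lines 1-2 -> line 3
#   [[0, 1, 2, 0, 0, 2, 1, 1, 2], [1, 0, 2]]     # lines 1-3 -> line 4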
Code example #5
def read_nL21L_eval_data(file):
    print('read eval set (nL21L)')
    input_li = []
    lines = []
    targets = []
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.replace('\n', '')
            line, target = line.split('==')
            lines.append('kw')  # keyword field is not used here; keep a placeholder

            # Only the first line of the quatrain is used as the model input.
            l1, l2, l3, l4 = target.split('\t')
            words = l1.split(' ')
            ids = [word2id.get(word, vocab_size - 1) for word in words]
            input_li.append(ids)

            target = target.replace(' ', '').replace('\t', '/')
            targets.append(target)

    print('read eval set done')
    return input_li, lines, targets
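
A toy illustration of what the three returned lists hold (hypothetical eval line):

# For an eval line 'kw==a b\tc d\te f\tg h':
#   input_li collects the ids of 'a b' (the first line only),
#   lines collects the placeholder string 'kw',
#   targets collects 'ab/cd/ef/gh'.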
Code example #6
import json


def read_BERT_train_data(dataset):
    dataset_path = 'resource/dataset/' + dataset + '.txt'
    encodes_path = 'BERT_as_service/encodes/enc_li_' + dataset + '.json'
    target_ids_li = []
    pairs = []
    with open(dataset_path, 'r', encoding='utf-8') as f:
        for line in f:
            source, target = line.split('==')
            target = target.replace('\n', '')
            # Train on five sentences: the four target lines joined by ' / ',
            # then the first line repeated at the end.
            target_words = target.replace('\t', ' / ').split(' ') + [
                '/'
            ] + target.split('\t')[0].split(' ')
            target_ids = [
                word2id.get(word, vocab_size - 1) for word in target_words
            ]
            target_ids.append(EOS_token)  # no SOS_token is prepended
            target_ids_li.append(target_ids)
    # Pair each precomputed BERT sentence encoding with its target id sequence.
    with open(encodes_path, 'r', encoding='utf-8') as f:
        source_enc_li = json.load(f)
    for i in range(len(target_ids_li)):
        pairs.append([source_enc_li[i], target_ids_li[i]])
    print('training set size:', len(pairs))
    return pairs
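
The encodes JSON presumably stores one precomputed BERT vector per dataset line. A minimal sketch of producing such a file with the bert-as-service client (an assumption based on the BERT_as_service path; build_encodes is a hypothetical helper and requires a running BERT server):

import json
from bert_serving.client import BertClient

def build_encodes(dataset):  # hypothetical helper, not part of the project
    sources = []
    with open('resource/dataset/' + dataset + '.txt', 'r', encoding='utf-8') as f:
        for line in f:
            source, _ = line.split('==')
            sources.append(source)
    bc = BertClient()             # assumes a bert-as-service server is running
    enc_li = bc.encode(sources)   # one fixed-size vector per source string
    with open('BERT_as_service/encodes/enc_li_' + dataset + '.json', 'w',
              encoding='utf-8') as f:
        json.dump(enc_li.tolist(), f)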
Code example #7
def line2ids(line):  # for the test set
    input_words = line.split(' ')
    # Unknown words map to the last vocabulary id (vocab_size - 1).
    input_ids = [word2id.get(word, vocab_size - 1)
                 for word in input_words]
    input_ids.append(EOS_token)
    return input_ids
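
A quick usage sketch with toy globals (in the project, word2id, vocab_size, and EOS_token come from the shared vocabulary setup):

word2id = {'春': 0, '风': 1}  # toy vocabulary for illustration only
vocab_size = 3
EOS_token = 4

print(line2ids('春 风 x'))  # -> [0, 1, 2, 4]; unknown 'x' falls back to vocab_size - 1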