Example #1
def main() -> None:
    tokenizer = Tokenizer(args.vocab_file)
    vocabulary_size = len(tokenizer)

    searcher = BeamSearch(tokenizer.eos_index, beam_size=args.search_width)

    model = VAE(
        num_embeddings=len(tokenizer),
        dim_embedding=args.dim_embedding,
        dim_hidden=args.dim_hidden,
        dim_latent=args.dim_latent,
        num_layers=args.num_layers,
        bidirectional=args.bidirectional,
        dropout=0.,
        word_dropout=0.,
        dropped_index=tokenizer.unk_index,
    ).to(device)
    model.load_state_dict(torch.load(args.checkpoint_file,
                                     map_location=device))
    model.eval()

    sentence1 = input('Please input sentence1: ')
    sentence2 = input('Please input sentence2: ')

    s1 = [tokenizer.bos_index] + tokenizer.encode(sentence1) + [tokenizer.eos_index]
    s2 = [tokenizer.bos_index] + tokenizer.encode(sentence2) + [tokenizer.eos_index]

    z1, _ = model.encode(
        torch.tensor([s1]).to(device),
        torch.tensor([len(s1)]).to(device))
    z2, _ = model.encode(
        torch.tensor([s2]).to(device),
        torch.tensor([len(s2)]).to(device))

    print("\nGenerate intermediate sentences")
    print("      %s" % sentence1)
    for r in range(1, 10):
        z = (1 - 0.1 * r) * z1 + 0.1 * r * z2
        hidden = model.fc_hidden(z)
        hidden = hidden.view(1, -1,
                             model.dim_hidden).transpose(0, 1).contiguous()

        start_predictions = torch.zeros(1, device=device).fill_(
            tokenizer.bos_index).long()
        start_state = {'hidden': hidden.permute(1, 0, 2)}
        predictions, log_probabilities = searcher.search(
            start_predictions, start_state, model.step)

        tokens = predictions[0, 0]
        tokens = tokens[tokens != tokenizer.eos_index].tolist()
        print("[%d:%d] %s" % (10 - r, r, tokenizer.decode(tokens)))
    print("      %s" % sentence2)
Example #2
class IntentClassifier(tf.keras.Model):
    def __init__(self,
                 n_intents=None,
                 dropout=0.2,
                 model_name="bert-base-uncased"):
        super().__init__(name="intent_classifier")

        self.tokenizer = Tokenizer()
        self.bert = TFBertForSequenceClassification.from_pretrained(model_name)
        self.dropout = Dropout(dropout)
        self.intent_classifier = Dense(n_intents, activation='softmax')

    def call(self, inputs, **kwargs):
        # The second output of the main BERT layer corresponds to the [CLS] token
        # and gives a pooled representation for the full sequence

        _, pooled_output = self.bert(inputs, **kwargs)
        pooled_output = self.dropout(pooled_output)
        intent = self.intent_classifier(pooled_output)
        return intent

    def get_embedding(self, plain_text, **kwargs):
        encoded = self.tokenizer.encode(plain_text)
        print(">> encoded", encoded)

        _, pooled_output = self.bert(encoded, **kwargs)
        return pooled_output.numpy()
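A brief usage sketch for the classifier above; the intent count, optimizer settings, and sample sentence are illustrative assumptions, and the commented-out calls depend on what the project's Tokenizer actually returns:

# Hypothetical setup; n_intents=5 and the sample text are made up for illustration.
clf = IntentClassifier(n_intents=5)
clf.compile(optimizer='adam', loss='categorical_crossentropy',
            metrics=['categorical_accuracy'])
# encoded = clf.tokenizer.encode(["book a flight to paris"])
# probs = clf(encoded)                      # (batch, n_intents) softmax scores
# embedding = clf.get_embedding("book a flight to paris")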
Example #3
class DreamDataset(Dataset):
    """
    Custom Dataset.
    Defines how to fetch examples from the Zhou Gong dream-interpretation dataset.
    """
    def __init__(self):
        # __init__ usually loads all of the data
        super(DreamDataset, self).__init__()
        # Read the raw data
        self.sents_src, self.sents_tgt = read_corpus(
            Config.dream_train_corpus_path)
        self.word2idx = load_bert_vocab()
        self.idx2word = {k: v for v, k in self.word2idx.items()}
        self.tokenizer = Tokenizer(self.word2idx)

    def __getitem__(self, i):
        # Fetch a single example
        src = self.sents_src[i]
        tgt = self.sents_tgt[i]

        token_ids, token_type_ids = self.tokenizer.encode(src, tgt)
        output = {
            "token_ids": token_ids,
            "token_type_ids": token_type_ids,
        }
        return output

    def __len__(self):
        return len(self.sents_src)
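A usage sketch for DreamDataset, assuming a hypothetical pad_collate helper that zero-pads token_ids and token_type_ids to the longest sequence in each batch:

import torch
from torch.utils.data import DataLoader

def pad_collate(batch):
    # Hypothetical helper: zero-pad both fields to the batch maximum length.
    max_len = max(len(item["token_ids"]) for item in batch)
    token_ids = torch.zeros(len(batch), max_len, dtype=torch.long)
    token_type_ids = torch.zeros(len(batch), max_len, dtype=torch.long)
    for i, item in enumerate(batch):
        n = len(item["token_ids"])
        token_ids[i, :n] = torch.tensor(item["token_ids"], dtype=torch.long)
        token_type_ids[i, :n] = torch.tensor(item["token_type_ids"], dtype=torch.long)
    return token_ids, token_type_ids

# dataset = DreamDataset()
# loader = DataLoader(dataset, batch_size=8, shuffle=True, collate_fn=pad_collate)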
Example #4
class BertDataset(Dataset):
    """
    Defines how to fetch examples for a given dataset.
    """
    def __init__(self, sents_src, sents_tgt, vocab_path):
        # __init__ usually loads all of the data
        super(BertDataset, self).__init__()
        # Read the raw data
        # self.sents_src, self.sents_tgt = read_corpus(poem_corpus_dir)
        self.sents_src = sents_src
        self.sents_tgt = sents_tgt
        self.word2idx = load_chinese_base_vocab(vocab_path)
        self.idx2word = {k: v for v, k in self.word2idx.items()}
        self.tokenizer = Tokenizer(self.word2idx)

    def __getitem__(self, i):
        # Fetch a single example
        src = self.sents_src[i]
        tgt = self.sents_tgt[i]
        token_ids, token_type_ids = self.tokenizer.encode(src,
                                                          tgt,
                                                          max_length=256)
        output = {
            "token_ids": token_ids,
            "token_type_ids": token_type_ids,
        }
        return output

    def __len__(self):
        return len(self.sents_src)
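A short construction sketch for BertDataset; the sentence pairs and vocabulary path below are placeholders only:

# Placeholder data and path for illustration only.
sents_src = ["今天天气很好", "明天去哪里玩"]
sents_tgt = ["天气晴朗,适合出门", "可以去公园走走"]
# dataset = BertDataset(sents_src, sents_tgt, vocab_path="./vocab.txt")
# sample = dataset[0]
# print(len(dataset), sample["token_ids"][:10])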
Example #5
def main(input_path, output_path, sp_model_path, n_val, n_test, seed):
    tokenizer = Tokenizer(sp_model_path, bos_eos=True)

    train_dir = os.path.join(output_path, 'train')
    val_dir = os.path.join(output_path, 'val')
    test_dir = os.path.join(output_path, 'test')
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(val_dir, exist_ok=True)
    os.makedirs(test_dir, exist_ok=True)

    input_file_paths = sorted(glob.glob(os.path.join(input_path, '*.txt')))
    random.seed(seed)
    random.shuffle(input_file_paths)

    for i, input_file_path in enumerate(input_file_paths):
        print(f'\r{i + 1} / {len(input_file_paths)}', end='')

        file_name = os.path.basename(input_file_path)
        with open(input_file_path) as f:
            tids = tokenizer.encode(f.read())
        tids = torch.tensor(tids, dtype=torch.long)

        if i < n_val:
            torch.save(tids, os.path.join(val_dir, file_name.replace('.txt', '.pt')))
        elif n_val <= i < n_val + n_test:
            torch.save(tids, os.path.join(test_dir, file_name.replace('.txt', '.pt')))
        else:
            torch.save(tids, os.path.join(train_dir, file_name.replace('.txt', '.pt')))

    print()
    print('done.')
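After the split has been written out, the per-file token tensors can be loaded back with torch.load; load_split below is a small illustrative helper, not part of the original script:

import glob
import os
import torch

def load_split(split_dir):
    # Load every saved token tensor from one split directory.
    return [torch.load(p) for p in sorted(glob.glob(os.path.join(split_dir, '*.pt')))]

# train_tensors = load_split(os.path.join(output_path, 'train'))
# print(len(train_tensors), train_tensors[0].dtype)  # 1-D LongTensor of token ids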
Example #6
def generate(
    x: str,
    beam_width: int,
    device: torch.device,
    max_seq_len: int,
    model: Transformer,
    tokenizer: Tokenizer
) -> str:
    model.eval()
    # Start from a (1, 1) tensor holding the BOS token so beams can be concatenated along dim 0.
    seq = torch.LongTensor([[tokenizer.bos_id]]).to(device)
    x = torch.LongTensor([tokenizer.encode(x, max_len=-1)]).to(device)

    accum_prob = torch.zeros(beam_width).to(device)

    for _ in range(max_seq_len):
        pred_y = model.predict(x, seq)

        top_k_in_all_beams = []
        for out_beams in range(seq.size(0)):
            top_k_prob_in_beam, top_k_index_in_beam = \
                pred_y[out_beams, -1].topk(
                    k=beam_width,
                    dim=-1
                )
            for in_beam in range(beam_width):

                prob = accum_prob[out_beams] -\
                    top_k_prob_in_beam[in_beam].log()
                prob = prob.unsqueeze(0)

                temp_seq = torch.cat([
                    seq[out_beams],
                    top_k_index_in_beam[in_beam].unsqueeze(0)
                ], dim=-1).unsqueeze(0)

                top_k_in_all_beams.append({
                    'prob': prob,
                    'seq': temp_seq
                })

        _, top_k_index_in_all_beams = torch.cat([
            beam['prob'] for beam in top_k_in_all_beams
        ]).topk(k=beam_width, dim=0)

        seq = torch.cat([
            top_k_in_all_beams[index]['seq']
            for index in top_k_index_in_all_beams
        ], dim=0)

        accum_prob = torch.cat([
            top_k_in_all_beams[index]['prob']
            for index in top_k_index_in_all_beams
        ], dim=0)

        if x.size(0) != seq.size(0):
            x = x.repeat(seq.size(0) // x.size(0), 1)

    decoded = tokenizer.batch_decode(seq.tolist())
    for sentence in decoded:
        print(sentence)
    # Return the first beam so the function matches its declared return type.
    return decoded[0]
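A hedged call sketch for the beam-search generator above; the model, tokenizer, and source sentence are placeholders for whatever the surrounding project provides:

# Illustrative call only; model and tokenizer come from the surrounding project.
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# best = generate("guten morgen", beam_width=4, device=device,
#                 max_seq_len=64, model=model, tokenizer=tokenizer)
# print(best)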
Example #7
class TextDataset(Dataset):
    def __init__(self,
                 tokenizer: Tokenizer,
                 file_path: str,
                 block_size: int,
                 overwrite_cache=False):
        super(TextDataset, self).__init__()
        self.path = file_path
        assert os.path.isfile(file_path)

        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(
            directory,
            "cached_lm_{}_{}_{}".format(
                tokenizer.__class__.__name__,
                str(block_size),
                filename,
            ),
        )

        # Make sure only the first process in distributed training processes the dataset,
        # and the others will use the cache.
        lock_path = cached_features_file + ".lock"
        with FileLock(lock_path):
            if os.path.exists(cached_features_file) and not overwrite_cache:
                start = time.time()
                with open(cached_features_file, "rb") as handle:
                    self.data = pickle.load(handle)
                logger.info(
                    "Loading features from cached file %s [took %.3f s]",
                    cached_features_file,
                    time.time() - start)
            else:
                logger.info(
                    f"Creating features from dataset file at {directory}")

                self.data = []
                with open(file_path, encoding="utf-8") as f:
                    text = f.read()

                tokenized_text = tokenizer.encode(text)

                for i in range(0,
                               len(tokenized_text.ids) - block_size + 1,
                               block_size):  # Truncate in block of block_size
                    self.data.append(tokenized_text.ids[i:i + block_size])
                # Note that we are losing the last truncated example here for the sake of simplicity (no padding)
                # If your dataset is small, first you should look for a bigger one :-) and second you
                # can change this behavior by adding (model specific) padding.

                start = time.time()
                with open(cached_features_file, "wb") as handle:
                    pickle.dump(self.data,
                                handle,
                                protocol=pickle.HIGHEST_PROTOCOL)
                logger.info(
                    "Saving features into cached file %s [took %.3f s]",
                    cached_features_file,
                    time.time() - start)
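The snippet shows only __init__; a minimal sketch of the accessor methods such a Dataset would typically also define, assuming each entry of self.data is one block of token ids:

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        # Each cached entry is a list of token ids of length block_size.
        return torch.tensor(self.data[i], dtype=torch.long)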
Example #8
def collate(data: List[str], tokenizer: Tokenizer, block_size: int) -> Batch:
    ids = tokenizer.encode(data, block_size)
    mask = tokenizer.mask(ids)
    return Batch(ids=ids, attention_mask=mask)
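A usage sketch for the collate function, assuming texts is a list of raw strings and that Batch exposes the ids and attention_mask fields shown in the return statement above; functools.partial binds the extra arguments so DataLoader can call it:

from functools import partial
from torch.utils.data import DataLoader

# texts, tokenizer, and block_size=128 are placeholders for the project's real objects.
# loader = DataLoader(
#     texts,
#     batch_size=16,
#     collate_fn=partial(collate, tokenizer=tokenizer, block_size=128),
# )
# for batch in loader:
#     print(batch.ids.shape, batch.attention_mask.shape)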
Example #9
class Seq2SeqModel(nn.Module):
    """
    Seq2seq model.
    """
    def __init__(self, config: BertConfig):
        super(Seq2SeqModel, self).__init__()
        # Read settings from the config
        self.hidden_dim = config.hidden_size
        self.vocab_size = config.vocab_size

        # encoder and decoder
        self.bert = BertModel(config)
        self.decoder = BertLMPredictionHead(
            config, self.bert.embeddings.word_embeddings.weight)

        # Load the vocabulary and tokenizer
        self.word2ix = load_bert_vocab()
        self.tokenizer = Tokenizer(self.word2ix)

    def compute_loss(self, predictions, labels, target_mask):
        """
        target_mask: 0 for sentence-A tokens and padding, 1 for sentence-B tokens
        """
        predictions = predictions.view(-1, self.vocab_size)
        labels = labels.view(-1)
        target_mask = target_mask.view(-1).float()
        loss = nn.CrossEntropyLoss(ignore_index=0, reduction="none")
        return (loss(predictions, labels) * target_mask
                ).sum() / target_mask.sum()  # mask out predictions on padding and sentence-A positions

    def forward(self,
                input_tensor,
                token_type_id,
                position_enc=None,
                labels=None,
                device="cpu"):
        '''
        :param input_tensor: input token ids
        :param token_type_id: segment ids marking sentence A vs. sentence B
        :param position_enc: position encoding
        :param labels: the target sentence to decode
        :param device:
        :return:
        '''
        input_shape = input_tensor.size()

        seq_len = input_shape[1]
        # Build the special seq2seq attention mask
        ones = torch.ones((1, 1, seq_len, seq_len),
                          dtype=torch.float32,
                          device=device)
        a_mask = ones.tril()  # lower-triangular matrix
        s_ex12 = token_type_id.unsqueeze(1).unsqueeze(2).float()
        s_ex13 = token_type_id.unsqueeze(1).unsqueeze(3).float()
        a_mask = (1.0 - s_ex12) * (1.0 - s_ex13) + s_ex13 * a_mask
        # print(a_mask.size())   # torch.Size([2, 1, 44, 44])

        enc_layers, _ = self.bert(input_tensor,
                                  position_ids=position_enc,
                                  token_type_ids=token_type_id,
                                  attention_mask=a_mask,
                                  output_all_encoded_layers=True)
        # print(_.size())  # torch.Size([2, 768])   (batch_size, hidden_size)

        sequence_out = enc_layers[-1]  # take the output of the last encoder layer
        # print(sequence_out.size())    # torch.Size([2, 31, 768])

        predictions = self.decoder(sequence_out)
        # print(labels.size())   # torch.Size([2, 30])
        # print(predictions.size())   # torch.Size([2, 31, 21128])

        if labels is not None:
            # Compute the loss
            # A special output mask is needed to compute the loss correctly
            # Predictions for the final [SEP] token are not needed, hence the slice up to -1
            predictions = predictions[:, :-1].contiguous()
            # print(predictions.size())  # torch.Size([2, 30, 21128])

            target_mask = token_type_id[:, 1:].contiguous()
            # print(target_mask)
            loss = self.compute_loss(predictions, labels, target_mask)
            return predictions, loss
        else:
            return predictions

    def generate(self, text, out_max_length=50, beam_size=1, device="cpu"):
        # Generate an output for a single input sentence
        # The input max length is derived from the output max length; inputs longer than that are truncated
        self.out_max_length = out_max_length
        input_max_length = Config.max_length - out_max_length
        # print(text)
        token_ids, token_type_ids = self.tokenizer.encode(
            text, max_length=input_max_length)
        token_ids = torch.tensor(token_ids, device=device).view(1, -1)
        token_type_ids = torch.tensor(token_type_ids,
                                      device=device).view(1, -1)
        out_puts_ids = self.beam_search(token_ids,
                                        token_type_ids,
                                        self.word2ix,
                                        beam_size=beam_size,
                                        device=device)
        # Decode into the output text
        return self.tokenizer.decode(out_puts_ids)

    def beam_search(self,
                    token_ids,
                    token_type_ids,
                    word2ix,
                    beam_size=1,
                    device="cpu"):
        """
        Beam search.
        """
        sep_id = word2ix["[SEP]"]
        # Holds the output sequences
        output_ids = [[]]
        # Holds the accumulated scores
        output_scores = torch.zeros(token_ids.shape[0], device=device)
        for step in range(self.out_max_length):

            scores = self.forward(token_ids, token_type_ids, device=device)
            # print(scores.shape)
            if step == 0:
                # Repeat the input ids beam_size times
                token_ids = token_ids.view(1, -1).repeat(beam_size, 1)
                token_type_ids = token_type_ids.view(1,
                                                     -1).repeat(beam_size, 1)
            # Compute log scores, shape (beam_size, vocab_size)
            logit_score = torch.log_softmax(scores, dim=-1)[:, -1]
            logit_score = output_scores.view(-1, 1) + logit_score  # accumulated scores
            # Flatten the scores before calling topk
            logit_score = logit_score.view(-1)
            hype_score, hype_pos = torch.topk(logit_score, beam_size)
            indice1 = hype_pos // scores.shape[-1]  # row (beam) index
            indice2 = hype_pos % scores.shape[-1]  # column (vocab) index

            # Update the outputs
            new_hype_scores = []
            new_hype_ids = []
            # These lists exist so that finished sequences can be filtered out
            next_chars = []  # newly predicted tokens, to be appended to the inputs for the next step
            for i_1, i_2, score in zip(indice1, indice2, hype_score):
                i_1 = i_1.item()
                i_2 = i_2.item()
                score = score.item()

                hype_id = output_ids[i_1] + [i_2]  # keep the whole output sequence, not just the new token

                if i_2 == sep_id:
                    # Decoding for this beam has finished
                    if score == torch.max(hype_score).item():
                        # This finished sequence has the highest score, so return it directly
                        return hype_id[:-1]
                    else:
                        # A sequence finished, but its score is not the highest, so drop this beam
                        beam_size -= 1
                else:
                    new_hype_ids.append(hype_id)
                    new_hype_scores.append(score)
                    next_chars.append(i_2)  # collect the new token; it will be appended to the current input

            output_ids = new_hype_ids

            output_scores = torch.tensor(new_hype_scores,
                                         dtype=torch.float32,
                                         device=device)
            # Rebuild the inputs: append the newly predicted tokens to the previous inputs and feed them to BERT again
            token_ids = token_ids[:len(output_ids)].contiguous()  # keep only the beams that have not finished
            token_type_ids = token_type_ids[:len(output_ids)].contiguous()

            next_chars = torch.tensor(next_chars,
                                      dtype=torch.long,
                                      device=device).view(-1, 1)
            next_token_type_ids = torch.ones_like(next_chars, device=device)
            # Concatenate the new tokens onto the inputs
            token_ids = torch.cat((token_ids, next_chars), dim=1)
            token_type_ids = torch.cat((token_type_ids, next_token_type_ids),
                                       dim=1)
            if beam_size < 1:
                break

        # If the maximum length is reached, return the highest-scoring output sequence
        return output_ids[output_scores.argmax().item()]
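A usage sketch for the generation path above; the config values, checkpoint path, and input sentence are placeholders, and the real loading code depends on the surrounding project:

# Placeholder config and checkpoint; adjust to the project this snippet belongs to.
# config = BertConfig(vocab_size=21128)
# model = Seq2SeqModel(config)
# model.load_state_dict(torch.load("seq2seq_model.bin", map_location="cpu"))
# model.eval()
# with torch.no_grad():
#     print(model.generate("昨晚梦见自己在飞", out_max_length=30, beam_size=3))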
Example #10
class Engine:
    def __init__(self, data_file_path="intents_db.pkl"):
        self.data = []
        self.known_intents = []
        self.n_intents = 0
        self.intents_labels = {}
        self.intents_embeddings = {}
        self.data_file_path = data_file_path
        # self.model_file_path = "intent_classifier.h5"
        self.encoder = Tokenizer()
        self.model = None

    def initialize(self):
        RELOAD_DATA = os.path.exists(self.data_file_path)
        if RELOAD_DATA:
            logger.info("Reloading data from file {}".format(
                self.data_file_path))
            with open(self.data_file_path, 'rb') as f:
                self.data = pickle.load(f)
                self.update_intents()
        else:
            self.data = []
            with open("raw_intents.csv", 'r') as f:
                rows = f.readlines()
                for row in rows:
                    if not row.strip():
                        continue
                    if row.startswith("#"):
                        continue
                    query, intent = row.split(",")
                    self.data.append({
                        'raw': {
                            'query': query.lower().strip(),
                            'intent': intent.lower().strip()
                        },
                        'nlu': {}
                    })
            self.update_intents()
            self.tokenize_data()
        print("Known intents:")
        print(self.known_intents)

        self.model = IntentClassifier(n_intents=self.n_intents)
        self.model.compile(optimizer='adam',
                           loss='categorical_crossentropy',
                           metrics=['categorical_accuracy'])
        X, y = self.make_training_dataset(self.data)
        print(X["input_ids"].shape, X['attention_masks'].shape, y.shape)
        self.model.train_on_batch(X, y)
        self.model.summary()
        self.make_intents_embeddings()

    def update_intents(self, new_intent=None):
        if new_intent is None:
            self.known_intents = list(
                set([
                    x['raw']['intent'] for x in self.data
                    if 'intent' in x['raw']
                ]))
        else:
            if new_intent not in self.known_intents:
                self.known_intents.append(new_intent)
        self.known_intents.sort()
        self.n_intents = len(self.known_intents)
        self.intents_labels = {k: i for i, k in enumerate(self.known_intents)}

    def make_intents_embeddings(self):
        embeddings = self.model.get_embedding(self.known_intents)
        self.intents_embeddings = {
            k: emb
            for k, emb in zip(self.known_intents, embeddings)
        }
        print(self.known_intents)

    def tokenize_data(self):
        query = [x['raw']['query'] for x in self.data]
        encoded = self.encoder.encode(query)
        for i in range(len(self.data)):
            x = self.data[i]
            x['nlu'] = {
                'input_ids': encoded['input_ids'][i],
                'attention_masks': encoded['attention_masks'][i],
                'label': self.intents_labels[x['raw']['intent']]
            }

    @staticmethod
    def make_training_dataset(batch):
        X = {
            "input_ids":
            np.array([x['nlu']['input_ids'] for x in batch]),
            'attention_masks':
            np.array([x['nlu']['attention_masks'] for x in batch])
        }
        y = np.array([x['nlu']['label'] for x in batch], dtype=np.int64)
        y = to_categorical(y)
        return X, y

    def write_out(self):
        # Write out to file
        print("Saving data to file {}".format(self.data_file_path))
        with open(self.data_file_path, 'wb') as f:
            pickle.dump(self.data, f)
        #model.save(model_file_path)

    def predict_intent(self, txt):
        this_embedding = self.model.get_embedding([txt])
        all_embeddings = [
            self.intents_embeddings[i] for i in self.known_intents
        ]
        scores = cosine_similarity(this_embedding, all_embeddings)
        k = np.argmax(scores[0])
        confidence = scores[0][k]
        closest_intent = self.known_intents[k]
        return closest_intent, confidence

    def loop(self):
        while True:
            print("Tell me what you would like to do")
            txt = input()
            txt = txt.lower()
            if txt in ['q', 'quit', 'stop']:
                return

            intent, confidence = self.predict_intent(txt)
            print("Is this your purpose? {} (confidence={:.3f})".format(
                intent, confidence))
            reply = input().lower()
            if reply in ['n', 'no', 'nope']:
                print("What is the purpose?")
                intent = input().lower()
                if intent not in self.known_intents:
                    closest_intent, confidence = self.predict_intent(intent)
                    print("Is this the same as {} (confidence={:.3f})? [y, n]".
                          format(closest_intent, confidence))
                    reply = input().lower()
                    if reply in ['y', 'yes']:
                        intent = closest_intent
                    else:
                        print("This is a new intent to me")
                        self.update_intents(intent)
                        self.make_intents_embeddings()
            print("Ok, so you are asking for: {}".format(intent))

            encoded = self.encoder.encode(txt)
            entry = {
                'raw': {
                    'query': txt,
                    'intent': intent
                },
                'nlu': {
                    'input_ids': encoded['input_ids'][0],
                    'attention_masks': encoded['attention_masks'][0],
                    'label': self.intents_labels[intent]
                }
            }
            self.data.append(entry)

            X, y = self.make_training_dataset([entry])
            self.model.train_on_batch(X, y)
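A minimal launch sketch for the engine; it assumes raw_intents.csv (or a previously pickled intents_db.pkl) is present in the working directory, as the initialize method above expects:

# Illustrative entry point; file names follow the defaults used in the class above.
# engine = Engine(data_file_path="intents_db.pkl")
# engine.initialize()
# engine.loop()
# engine.write_out()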
Example #11
    parser.add_argument('--length',
                        type=int,
                        default=20,
                        help='length of the generated text')
    parser.add_argument('--num_beams', type=int)
    parser.add_argument('--num_return_sequences', default=1, type=int)
    parser.add_argument('--no_repeat_ngram_size', type=int)
    parser.add_argument('--temperature', type=float)
    parser.add_argument('--top_k', type=int)
    parser.add_argument('--top_p', type=float)
    args = parser.parse_args()
    model = GPT2LMHeadModel.from_pretrained(args.output_dir
                                            or train_args.output_dir)
    tokenizer = Tokenizer(train_args.tokenizer_path)
    model.eval()

    input_ids = tokenizer.encode([args.start])
    if args.top_k:
        if args.top_p:
            outputs = model.generate(
                input_ids,
                do_sample=True,
                max_length=args.length,
                top_k=args.top_k,
                top_p=args.top_p,
                no_repeat_ngram_size=args.no_repeat_ngram_size,
                num_return_sequences=args.num_return_sequences)
        else:
            outputs = model.generate(
                input_ids,
                do_sample=True,
                max_length=args.length,