Code example #1
File: bpe_demo.py  Project: WyAzx/ml_final_project
def test():
    bpemb_en = BPEmb(lang="en", dim=100)
    s = "Stratford"
    res1 = bpemb_en.encode(s)
    res2 = bpemb_en.encode_ids(s)
    print(res1)
    print(res2)

    bpemb_en_100k = BPEmb(lang="en", vs=100000, dim=100)  # 40 M; the larger the vocabulary, the fewer subword splits
    s = "hello world !"
    res1 = bpemb_en_100k.encode(s)
    res2 = bpemb_en_100k.encode_ids(s)
    print(res1)
    print(res2)
Code example #2
File: bpe_textcnn.py  Project: WyAzx/ml_final_project
class DataLoader():
    def __init__(self, vocab_size, embedding_dim, max_sequence_len):
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.max_sequence_len = max_sequence_len
        self.bpemb_en_100k = BPEmb(lang="en",
                                   vs=self.vocab_size,
                                   dim=self.embedding_dim)  # 40 M; the larger the vocabulary, the fewer subword splits

    def get_x_data(self, sentences):
        sentences_ids = self.bpemb_en_100k.encode_ids(sentences)  # encode with BPE
        x = pad_sequences(sentences_ids, maxlen=self.max_sequence_len)
        return x

    def get_train_data(self):
        nrows = 100
        train_df = pd.read_csv("../data/train_preprocessed.csv", nrows=nrows)
        X_train = self.get_x_data(train_df["comment_text"])
        Y_train = train_df['target'].values
        x, y = np.asarray(X_train), np.asarray(Y_train)
        print(x.shape, y.shape)
        return x, y

    def get_test_data(self):
        nrows = 100
        test_df = pd.read_csv("../data/test_preprocessed.csv", nrows=nrows)
        X_test = self.get_x_data(test_df["comment_text"])
        return np.asarray(X_test), test_df
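
A minimal usage sketch for the DataLoader above, assuming the project's preprocessed CSVs are in place; the constructor values are illustrative:

loader = DataLoader(vocab_size=100000, embedding_dim=100, max_sequence_len=200)
x_train, y_train = loader.get_train_data()
x_test, test_df = loader.get_test_data()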
Code example #3
class BPEmbEmbedding(AbstractEmbedding):

    def __init__(self, lang: str, dim: int = 300, vs: int = 100000, add_pad_emb: bool = True):
        super().__init__()
        try:
            from bpemb import BPEmb
            self.embedder = BPEmb(lang=lang, dim=dim, vs=vs, add_pad_emb=add_pad_emb, cache_dir=Path(aikido.cache_root) / "embeddings")
            self.embeddings_ = nn.Embedding.from_pretrained(tensor(self.embedder.vectors, dtype=torch.float),
                                                            padding_idx=vs)
            self.dim_ = dim
            self.vs_ = vs
        except ImportError:
            logging.error("-" * 100)
            logging.error("no bpemb installation found. see https://github.com/bheinzerling/bpemb")
            logging.error("-" * 100)
            pass

    @property
    def embedding_length(self) -> int:
        return self.dim_

    @property
    def vocabulary_length(self) -> int:
        return self.vs_

    def encode_ids(self, word):
        return self.embedder.encode_ids(word)

    def embed(self, x):
        return self.embeddings_(x)

    def raw_embedding(self) -> nn.Embedding:
        return self.embeddings_
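
A hypothetical usage sketch for BPEmbEmbedding, assuming bpemb is installed and the aikido cache root is available; the language, sizes, and input word are illustrative:

import torch

emb = BPEmbEmbedding(lang="en", dim=100, vs=50000)
ids = torch.tensor(emb.encode_ids("stratford"))  # subword ids for one word
vecs = emb.embed(ids)  # shape: (num_subwords, 100); index vs holds the pad vector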
Code example #4
class SubWordVocab(object):
    def __init__(self, size):
        self.encoder = BPEmb(lang='en', vs=size)

        assert self.sos_id == 1
        assert self.eos_id == 2

    def __len__(self):
        return self.encoder.vs

    @property
    def sos_id(self):
        return 1

    @property
    def eos_id(self):
        return self.encoder.EOS

    def encode(self, syms):
        return self.encoder.encode_ids(syms)

    def decode(self, ids):
        syms = self.encoder.decode_ids(ids)
        if isinstance(syms, list):
            return ''
        return syms
Code example #5
File: model.py  Project: benjamin-croker/TopicC
class TopicCEncSimpleBPemb(_TopicCBase):
    def __init__(self, embed_size, output_size, enc_hidden_size):
        super(TopicCEncSimpleBPemb, self).__init__()
        print("init: TopicCEncSimpleBPemb model")

        self.embedding_model = BPEmb(dim=embed_size, lang="en", vs=100000)
        self.encoder = nn.GRU(input_size=embed_size,
                              hidden_size=enc_hidden_size,
                              num_layers=1,
                              bidirectional=True)
        self.seq_to_output_map = nn.Linear(2 * enc_hidden_size,
                                           output_size,
                                           bias=False)

    def embed_sequence(self, sequence: str) -> torch.Tensor:
        v_ids = self.embedding_model.encode_ids(sequence)
        return torch.tensor(self.embedding_model.vectors[v_ids]).to(
            self._device)

    def create_seq_vecs(
        self, sequences: List[str]
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        # returns the padded seq vector, lengths and original order index
        # sequences are sorted by length, and can be reverted to their original
        # order with the unsorting index vector

        # start by embedding the sequence vectors
        seq_vecs = [self.embed_sequence(s) for s in sequences]

        # sort lengths and set the device
        lengths = torch.tensor([seq_v.shape[0] for seq_v in seq_vecs])
        lengths, sort_i = lengths.sort(descending=True)
        _, orig_i = sort_i.sort()

        # pad the seq vecs and sort by length (dim 1 is the batch dimension)
        seq_vec_pad = rnn.pad_sequence(seq_vecs).to(self._device)
        seq_vec_pad = seq_vec_pad[:, sort_i, :]

        return seq_vec_pad, lengths, orig_i

    def forward(self, sequences: List[str]) -> torch.Tensor:
        # Make the word embeddings for each sequence
        pad_seq_vecs, lengths, orig_i = self.create_seq_vecs(sequences)

        # pack the sequence for the GRU
        # packed_seq_vecs.shape = max_seq_len, batch_size, embedding_dim
        packed_seq_vecs = rnn.pack_padded_sequence(pad_seq_vecs, lengths)

        # run through the GRU
        _, h_n = self.encoder(packed_seq_vecs)
        seq_output = torch.cat((h_n[0], h_n[1]), dim=1)
        output = self.seq_to_output_map(seq_output)
        # sort the output to match the original order
        output = output[orig_i, :]

        return nn.functional.log_softmax(output, dim=1)
Code example #6
class TextEncoder():
    def __init__(self, vocab_path):
        with open(vocab_path, 'rb') as file:
            self.vocab = pickle.load(file)

        self.bpemb_en = BPEmb(lang="en", dim=300, vs=1000)

    def encode(self, text):
        token_ids = self.bpemb_en.encode_ids(text)
        ids = np.array([self.vocab[t] for t in token_ids])
        return ids
Code example #7
File: TestBPE.py  Project: erelcan/keras-transformer
def test_encoding():
    text = ["This is Stratford", "<pad>"]

    bpemb_en = BPEmb(lang="en", add_pad_emb=True)

    # BPEmb can add start/end tokens automatically while encoding. However, the
    # encoder cannot handle <pad> directly: pad outside with the pad index
    # (the index of the last vocabulary entry when add_pad_emb=True).
    print(bpemb_en.encode(text))
    print(bpemb_en.encode_with_eos(text))
    print(bpemb_en.encode_with_bos_eos(text))
    print(bpemb_en.encode_ids(text))
    print(bpemb_en.encode_ids_with_eos(text))
    print(bpemb_en.encode_ids_with_bos_eos(text))
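
Since the encoder cannot consume <pad> directly, here is a minimal manual-padding sketch; per the comment above, add_pad_emb=True appends the pad vector as the last row of bpemb_en.vectors:

bpemb_en = BPEmb(lang="en", add_pad_emb=True)
pad_id = len(bpemb_en.vectors) - 1  # index of the appended <pad> embedding
ids = bpemb_en.encode_ids("This is Stratford")
max_len = 16
padded = ids[:max_len] + [pad_id] * (max_len - len(ids))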
Code example #8
    def build_index(self, meta_parquet_path, index_path, vocab_size=50000, trees=100):
        '''
        Build an Annoy index of metadata titles. Uses BPEmb, so a small vocab size is fine.

        index_path: path with filename, where filename ends with '.ann'
        trees: number of trees to use for the Annoy index. More is better, but slower to
            build.
        '''
        from bpemb import BPEmb
        from compare_tools.hathimeta import clean_title

        metadf = pd.read_parquet(meta_parquet_path, columns=['htid', 'title'])
        bpemb_en = BPEmb(lang="en", dim=self.dims, vs=vocab_size)

        # Insert vectors for documents into Annoy index, using the integer from
        # the metadf index as the id
        for i, row in metadf.reset_index().fillna('').astype(str).iterrows():
            bpe_ids = bpemb_en.encode_ids(row.title)
            # Sum of full title. Imperfect, would work better if BPEs for each word were averaged first.
            vec = bpemb_en.vectors[bpe_ids].sum(0)

            trimmed_bpe_ids = bpemb_en.encode_ids(clean_title(row.title))
            trimmed_vec = bpemb_en.vectors[trimmed_bpe_ids].sum(0)

            # Average, with more weight on the cleaned title.
            weighted = np.average([vec, trimmed_vec], axis=0, weights=[.3, .7])

            self.u.add_item(i, weighted)
            if i % 100000 == 0:
                print(i, end=',')
        print()

        # will take about 30 min for 100 dims and ~8 million titles
        self.u.build(trees)
        self.u.save(index_path)
        metadf.reset_index()['htid'].to_csv(self.index_reference_path,
                                            compression='gzip')
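
build_index relies on Annoy state created elsewhere in the class; a hedged sketch of what that setup presumably looks like (self.u, self.dims, and self.index_reference_path are taken from the method body; the metric is an assumption):

from annoy import AnnoyIndex

# hypothetical constructor for the surrounding class
def __init__(self, dims=100, index_reference_path="titles.csv.gz"):
    self.dims = dims
    self.index_reference_path = index_reference_path
    self.u = AnnoyIndex(self.dims, "angular")  # metric assumed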
Code example #9
class TweetTokenizer():
    def __init__(self, dim=50, vocab_size=10000, mode='get_id'):
        self.dim = dim
        self.vocab_size = vocab_size
        self.bpemb_en = BPEmb(lang="en", dim=dim, vs=vocab_size)
        self.embedding_weight = self.bpemb_en.vectors
        self.mode = mode
    
    def __call__(self, tweet, mode=None):
        # fall back to the mode chosen at construction time
        mode = mode if mode is not None else self.mode
        if mode == 'get_id':
            return torch.tensor(self.bpemb_en.encode_ids(tweet), dtype=torch.long)
        elif mode == 'raw':
            return self.bpemb_en.encode(tweet)
        else:
            raise ValueError('Invalid mode')
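
A short usage sketch for TweetTokenizer; the tweet text is illustrative:

tok = TweetTokenizer(dim=50, vocab_size=10000)
ids = tok("just landed in stratford")  # LongTensor of subword ids
pieces = tok("just landed in stratford", mode='raw')  # list of subword strings
weights = tok.embedding_weight  # (10000, 50) array, usable as embedding-layer weights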
Code example #10
class BPembTokenizer(Tokenizer):
    def __init__(self, vocab_size=50000, emb_dim=300, lang='en'):
        super(BPembTokenizer, self).__init__()
        from bpemb import BPEmb
        self.bpemb_en = BPEmb(lang=lang, vs=vocab_size, dim=emb_dim)

    def get_embeddings(self):
        return self.bpemb_en.vectors

    def encode_ids(self, text):
        return self.bpemb_en.encode_ids(text)

    def decode_ids(self, ids):
        return self.bpemb_en.decode_ids(ids)

    def tokenize(self, text):
        return self.bpemb_en.encode(text)
Code example #11
class LoadedBPEEmbeddingTextVectorizer(LoadedTextVectorizer):
    def __init__(self, predictor_config):
        predictor_config = predictor_config['vectorizer']
        self.bpemb = BPEmb(lang='en',
                           dim=predictor_config['embedding_dim'],
                           vs=predictor_config['max_vocab_size'],
                           add_pad_emb=True)
        self.max_seq_len = predictor_config['max_seq_len']

    def get_cutoff_ratios(self, texts: List[str]) -> List[float]:
        sequences = self.bpemb.encode_ids(texts)
        return [len(sequence) / self.max_seq_len for sequence in sequences]

    def vectorize(self, texts: List[str]):
        vectorized = _vectorize_padded(bpemb=self.bpemb,
                                       max_seq_len=self.max_seq_len,
                                       texts=texts)
        cut_off_ratios = self.get_cutoff_ratios(texts)
        return vectorized, cut_off_ratios
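
_vectorize_padded is defined elsewhere in the project. A minimal sketch of what it presumably does (a hypothetical reimplementation, not the project's helper): pad or truncate each id sequence to max_seq_len, using the extra pad embedding that add_pad_emb=True provides.

import numpy as np

def _vectorize_padded(bpemb, max_seq_len, texts):
    pad_id = len(bpemb.vectors) - 1  # <pad> is the last row when add_pad_emb=True
    rows = []
    for ids in bpemb.encode_ids(texts):
        ids = ids[:max_seq_len]
        rows.append(ids + [pad_id] * (max_seq_len - len(ids)))
    return np.array(rows)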
Code example #12
File: dataset.py  Project: Danil328/EasyNetNLP
class StatusDataset(Dataset):
    def __init__(self, path=config.path_to_data, mode='train'):
        self.path_to_data = path
        self.mode = mode
        print(f"Loading {self.mode} data...")
        self.data = self.read_data()
        self.preprocess_data()
        self.bpemb_ru = BPEmb(lang="ru", dim=300, vs=50000)
        self.placeholder = torch.zeros(config.max_seq_length, dtype=torch.long)

    def read_data(self):

        if self.mode == 'train':
            data = pd.read_csv(self.path_to_data)
            data = data[data.isTest == 0][['text_orig', 'label']]
        elif self.mode == 'val':
            data = pd.read_csv(self.path_to_data)
            data = data[data.isTest == 1][['text_orig', 'label']]
        elif self.mode == 'test':
            data = pd.read_parquet(self.path_to_data)
        return data

    def preprocess_data(self):
        self.data['text_orig'] = self.data['text_orig'].map(self.remove_urls)
        self.data = shuffle(self.data)
        self.data.reset_index(drop=True, inplace=True)

    def remove_urls(self, v_text):
        v_text = re.sub(r"(/[\w\-?=$&:;#@/]+)", '', v_text, flags=re.MULTILINE)
        v_text = re.sub(r'(https?:[/.]*)', '', v_text, flags=re.MULTILINE)
        return v_text

    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, idx):
        text = self.data['text_orig'][idx]
        label = self.data['label'][idx]
        # truncate before writing into the fixed-size placeholder so the
        # left- and right-hand sides always have matching lengths
        ids_tokens = self.bpemb_ru.encode_ids(text)[:config.max_seq_length]
        placeholder = self.placeholder.clone()
        placeholder[:len(ids_tokens)] = torch.tensor(ids_tokens)
        return placeholder, torch.tensor(label)
Code example #13
    def dump(self, dump_path, bpe: BPEmb, max_sents=1, shuffle=0):
        with ExitStack() as stack:
            article_content_file = stack.enter_context(open(dump_path + '.content', "w", encoding='utf-8'))
            article_label_file = stack.enter_context(open(dump_path + '.labels', "w", encoding='utf-8'))

            if shuffle:
                indices = (ind for ind in np.random.RandomState(seed=shuffle).permutation(len(self.articles)))
                print("Write dataset to file (shuffled)")
            else:
                indices = range(len(self.articles))
                print("Write dataset to file (no shuffling)")

            for ind in indices:
                article = self.articles[ind]
                sent_cnt = min(max_sents, len(article.sentences))

                flat_sents = list(itertools.chain.from_iterable(article.sentences[:sent_cnt]))
                sent_bpe_ids = [str(bpe_id) for bpe_id in bpe.encode_ids(" ".join(flat_sents))]
                label_ids = [str(self.label_dict.null_index)] * len(sent_bpe_ids)

                article_content_file.write(" ".join(sent_bpe_ids) + '\n')
                article_label_file.write(" ".join(label_ids) + '\n')
Code example #14
    def dump(self, dump_path, bpe: BPEmb, shuffle=0):
        with ExitStack() as stack:
            boxes_content_file = stack.enter_context(open(dump_path + '.content', "w", encoding='utf-8'))
            boxes_label_file = stack.enter_context(open(dump_path + '.labels', "w", encoding='utf-8'))
            boxes_positions_file = stack.enter_context(open(dump_path + '.pos', "w", encoding='utf-8'))

            if shuffle:
                indices = (ind for ind in np.random.RandomState(seed=shuffle).permutation(len(self.infoboxes)))
                print("Write dataset to file (shuffled)")
            else:
                indices = range(len(self.infoboxes))
                print("Write dataset to file (no shuffling)")

            for ind in indices:
                infobox = self.infoboxes[ind]
                box_content = []
                box_labels = []
                box_positions = []
                for record in infobox.records:
                    rec_content = " ".join(record.content)
                    rec_bpe_ids = [str(bpe_id) for bpe_id in bpe.encode_ids(rec_content)]
                    num_tokens = len(rec_bpe_ids)
                    if record.field_label in self.label_dict.word2id:
                        rec_label_id = self.label_dict.word2id[record.field_label]
                    else:
                        print("Unknown field label %s" % record.field_label)
                        rec_label_id = self.label_dict.unk_index

                    label_ids = [str(rec_label_id)] * num_tokens
                    positions = [str(num + 1) for num in range(num_tokens)]

                    box_content.extend(rec_bpe_ids)
                    box_labels.extend(label_ids)
                    box_positions.extend(positions)

                boxes_content_file.write(" ".join(box_content) + '\n')
                boxes_label_file.write(" ".join(box_labels) + '\n')
                boxes_positions_file.write(" ".join(box_positions) + '\n')
Code example #15
class DatasetBase(ABC):
    is_multilingual = False

    def __init__(self, conf, lang, bert=None):
        self.conf = conf
        self.lang = lang
        self.bert = bert
        self.device = torch.device(f"cuda:{conf.gpu_id}")
        self.name = conf.dataset
        self.tag = conf.tag
        self.batch_size = conf.batch_size
        self.eval_batch_size = conf.eval_batch_size
        self.examples_to_print = conf.n_examples

        if self.conf.tag_scheme:
            self.convert_tags = iob_to[self.conf.tag_scheme]
        self.load_data_raw()
        self.NO_TAG = "NO_TAG"
        tags = self.get_tags()
        print(Counter(tags).most_common())
        shapes = self.get_shapes()
        char_enc = None
        if conf.char_enc_file:
            assert Path(conf.char_enc_file).exists()
            char_enc = joblib.load(conf.char_enc_file)
        if self.name.endswith("multi_finetune"):
            assert char_enc
        if char_enc:
            self.char_enc = char_enc
        else:
            chars = self.get_chars()
            self.char_enc = LabelEncoder(
                to_torch=True, device=self.device).fit(chars)
        tag_enc = None
        if conf.tag_enc_file:
            assert Path(conf.tag_enc_file).exists()
            tag_enc = joblib.load(conf.tag_enc_file)
        if tag_enc:
            self.tag_enc = tag_enc
        else:
            self.tag_enc = LabelEncoder(
                to_torch=True, device=self.device).fit(tags)
        self.shape_enc = LabelEncoder(
            to_torch=True, device=self.device).fit(shapes)

        self.bpemb = BPEmb(
            lang=conf.bpemb_lang,
            vs=conf.vocab_size,
            dim=conf.bpemb_dim,
            add_pad_emb=True)
        if conf.use_fasttext:
            f = conf.fasttext_emb_file.format(dataset=self.name, lang=lang)
            self.fasttext_emb = load_word2vec_file(f, add_unk=True)
        self.pad_idx = self.bpemb.emb.key_to_index["<pad>"]
        if not conf.no_dataset_tensorize:
            self.tensorize()

    @abstractmethod
    def load_data_raw(self):
        pass

    @abstractmethod
    def get_chars(self):
        pass

    @abstractmethod
    def get_tags(self):
        pass

    @abstractmethod
    def get_shapes(self):
        pass

    @abstractmethod
    def tensorize(self):
        pass

    def tensorize_sent(self, sent):
        tags_str = [token[self.tag] or self.NO_TAG for token in sent]
        tags = self.tag_enc.transform(tags_str)
        tokens = [token["form"] for token in sent]
        token_shape = self.shape_enc.transform(token_shapes(tokens))

        bpe_ids = [
            self.bpemb.encode_ids([token["form"]])[0] for token in sent]
        bpe_token_start_mask = self.start_mask(bpe_ids)
        bpe_token_end_mask = self.end_mask(bpe_ids)
        bpe_ids = tensor(list(flatten(bpe_ids))).to(device=self.device)
        assert bpe_token_start_mask.shape == bpe_ids.shape
        assert bpe_token_start_mask.sum().item() == len(tags)
        assert bpe_token_end_mask.shape == bpe_ids.shape
        assert bpe_token_end_mask.sum().item() == len(tags)

        try:
            chars = self.char_enc.transform([
                [char for char in token["form"]] for token in sent])
        except ValueError as e:
            print(e)
            return None
        char_token_start_mask = self.start_mask(chars)
        char_token_end_mask = self.end_mask(chars)
        chars = tensor(list(flatten(chars))).to(device=self.device)

        char_token, char_token_len = self.sub_token_and_len(
            chars, char_token_start_mask)
        bpe_token, bpe_token_len = self.sub_token_and_len(
            bpe_ids, bpe_token_start_mask)

        tensorized = {
            "token": tokens,
            "tag": tags,
            "token_shape": token_shape,
            "bpe": bpe_ids,
            "bpe_token": bpe_token,
            "bpe_token_len": bpe_token_len,
            "bpe_token_start_mask": bpe_token_start_mask,
            "bpe_token_end_mask": bpe_token_end_mask,
            "char": chars,
            "char_token_start_mask": char_token_start_mask,
            "char_token_end_mask": char_token_end_mask,
            "char_token": char_token,
            "char_token_len": char_token_len,
            }
        if hasattr(self, "fasttext_emb"):
            tensorized["fasttext"] = tensor(
                to_word_indexes(
                    [token["form"].lower() for token in sent],
                    self.fasttext_emb,
                    unk="<unk>")).to(device=self.device)
        if self.bert is not None:
            try:
                tensorized["bert_ids"], \
                    tensorized["bert_mask"], \
                    tensorized["bert_token_starts"] = \
                    self.bert.subword_tokenize_to_ids(tokens)
                assert len(tensorized["bert_ids"]) <= self.conf.bert_max_seq_len
                if self.examples_to_print > 0:
                    print(tokens)
                    print(
                        self.bert.model_name,
                        self.bert.subword_tokenize(tokens))
                    self.examples_to_print -= 1
            except AssertionError as e:
                print(e)
                return None
            # TODO: ta (Tamil) WikiAnn has weird whitespace characters that
            # are treated differently by the BERT tokenizer, leading to
            # mismatches in tag and token counts
            if len(tags) != tensorized["bert_token_starts"].sum():
                print("Skipping instance with inconsistent tokenization:")
                print(" ## ".join(tags_str))
                print(" ## ".join(tokens))
                return None

        return tensorized

    @staticmethod
    def start_mask(subsegments):
        mask = list(flatten(
            [[1] + [0] * (len(ids) - 1) for ids in subsegments]))
        return tensor(mask).cuda().byte()

    @staticmethod
    def end_mask(subsegments):
        mask = list(flatten(
            [([0] * (len(ids) - 1)) + [1] for ids in subsegments]))
        return tensor(mask).cuda().byte()

    @staticmethod
    def sub_token_and_len(sub, sub_token_mask):
        char_token_start = sub_token_mask.nonzero().squeeze(1)
        char_token_end = cat([
            char_token_start[1:],
            tensor([sub_token_mask.size(0)]).to(char_token_start)])
        char_token = [
            sub[s:e] for s, e in zip(char_token_start, char_token_end)]
        char_token_len = char_token_end - char_token_start
        return char_token, char_token_len
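
    # Worked micro-example of the two mask helpers above (values illustrative;
    # both return CUDA byte tensors, so a GPU is assumed). For
    # bpe_ids = [[11, 12], [13]], i.e. two tokens where the first splits into
    # two subwords:
    #   start_mask(bpe_ids) -> [1, 0, 1]  (first subword of each token)
    #   end_mask(bpe_ids)   -> [0, 1, 1]  (last subword of each token)
    # Both masks sum to the number of tokens, which is what the asserts in
    # tensorize_sent check.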

    def token_texts(self, split_name):
        split = getattr(self, split_name)
        return [instance["token"] for instance in split]

    def assert_batch_size(self):
        if not hasattr(self, "batch_size"):
            raise ValueError(
                "Need to set batch_size before calling train_loader")

    def assert_eval_batch_size(self):
        if not hasattr(self, "eval_batch_size"):
            raise ValueError(
                "Need to set eval_batch_size before calling"
                "dev_loader or test_loader")

    def loader(self, dataset, **kwargs):
        return DataLoader(dataset, collate_fn=collate_fn, **kwargs)
Code example #16
class Predictor(PredictorBase):
    def __init__(self, config):
        super(Predictor, self).__init__(config)
        self.config = config
        self.model = None
        self.sess = None
        # self.builder = tf.saved_model.builder.SavedModelBuilder("savedModel")

        if self.config["use_bpe"]:
            self.bpe_zh = BPEmb(lang="zh", vs=config["vocab_size"])
        else:
            # load the vocabulary
            self.word_to_idx = self.load_vocab()
            self.idx_to_label = {value: key for key, value in self.word_to_idx.items()}

        # initialize the model
        self.create_model()
        print("load model finished")
        # load the computation graph
        self.load_graph()
        print("load graph finished")

    def load_vocab(self):
        # load the word-to-index mapping table
        with open(os.path.join(self.output_path, "word_to_index.pkl"), "rb") as f:
            word_to_index = pickle.load(f)

        return word_to_index

    def sentence_to_encode(self, sentence):
        """
        Encode a sentence into model input.
        :return:
        """
        if not sentence:
            return None

        if len(sentence) > 20:
            return None

        if self.config["use_bpe"]:
            word_idx = self.bpe_zh.encode_ids(sentence)
            word_idx = list(map(lambda x: x + 1, word_idx))
        else:
            word_idx = [self.word_to_idx.get(token, self.word_to_idx["UNK"]) for token in sentence]

        new_word_idx = self.process_data(word_idx)
        return new_word_idx

    @staticmethod
    def process_data(sentence):
        """
        对数据做预处理
        :param sentence:
        :return:
        """
        encoder_inputs = [sentence]
        return dict(encoder_inputs=encoder_inputs)

    def response(self, tokens_list):
        sents = []
        for i in range(self.config["beam_size"]):
            sent_token = tokens_list[:, i]
            if self.config["use_bpe"]:
                sent = self.bpe_zh.decode_ids(list(map(lambda x: x - 1, sent_token)))
            else:
                sent = "".join([self.idx_to_label[token] for token in sent_token])
            sents.append(sent)

        return sents

    def create_model(self):
        """
        Select the model specified in the config file and initialize it.
        :return:
        """
        # NOTE: assumed fix. With use_bpe the word vocabulary is never loaded,
        # so derive the size from the BPE config (+1 for the shifted <pad> id).
        vocab_size = (self.config["vocab_size"] + 1 if self.config["use_bpe"]
                      else len(self.word_to_idx))
        if self.config["model_name"] == "seq2seq_lstm":
            self.model = Seq2SeqTransformer(config=self.config, vocab_size=vocab_size,
                                            word_vectors=None)

        if self.config["model_name"] == "seq2seq_bilstm":
            self.model = Seq2SeqBiLstmModel(config=self.config, vocab_size=vocab_size,
                                            word_vectors=None)

    def load_graph(self):
        """
        Load the computation graph.
        :return:
        """
        self.sess = tf.Session()
        ckpt = tf.train.get_checkpoint_state(os.path.join(os.path.abspath(os.path.dirname(os.getcwd())),
                                                          self.config["ckpt_model_path"]))
        if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
            print('Reloading model parameters..')
            self.model.saver.restore(self.sess, ckpt.model_checkpoint_path)
        else:
            raise ValueError('No such file:[{}]'.format(self.config["ckpt_model_path"]))
        # inputs = {"inputs": tf.saved_model.utils.build_tensor_info(self.model.encoder_inputs),
        #           "inputs_length": tf.saved_model.utils.build_tensor_info(self.model.encoder_inputs_length),
        #           "keep_prob": tf.saved_model.utils.build_tensor_info(self.model.keep_prob)}
        #
        # outputs = {"predictions": tf.saved_model.utils.build_tensor_info(self.model.predictions)}
        #
        # prediction_signature = tf.saved_model.signature_def_utils.build_signature_def(inputs=inputs,
        #                                                                               outputs=outputs,
        #                                                                               method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME)
        # legacy_init_op = tf.group(tf.tables_initializer(), name="legacy_init_op")
        # self.builder.add_meta_graph_and_variables(self.sess, [tf.saved_model.tag_constants.SERVING],
        #                                           signature_def_map={"dialogue": prediction_signature},
        #                                           legacy_init_op=legacy_init_op)

        # self.builder.save()

    def predict(self, sentence):
        """
        Predict the result for a given sentence.
        :return:
        """
        sentence_ids = self.sentence_to_encode(sentence)
        prediction_ = self.model.infer(sentence_ids["encoder_inputs"])
        prediction = self.sess.run(prediction_)
        print(prediction.shape)
        response = self.response(prediction)
        return response
Code example #17
class seq2seqDataset(data.Dataset):
    def __init__(self, root_path, seq_length, embed_dim, embed_vec_space):
        """Initialize the dataset"""

        self.root_path = root_path
        self.seq_length = seq_length
        self.tokenized_data = self.tokenize(root_path)
        self.embed = BPEmb(lang="en", vs=embed_vec_space, add_pad_emb=True)
        self.pad = 1002
        self.sos = 1001
        self.eos = 1000

        self.augmentator = nac.KeyboardAug(aug_char_min=0,
                                           aug_char_p=0.4,
                                           aug_word_p=0.5,
                                           aug_word_min=0,
                                           aug_word_max=self.seq_length // 5,
                                           special_char=False)

    def tokenize(self, root_path):

        with open(root_path, 'r') as f:
            text = f.read()
        lt = len(text)
        print(lt)

        splitted = []
        start = 0

        while True:

            flag = (1 if self.seq_length >= lt - start else 0)
            if lt <= start:
                break
            cur_text = text[start:start + min(self.seq_length, lt - start)]
            last_chunk_len = None
            try:
                last_chunk_len = (0 if cur_text[-1].isspace() else len(
                    cur_text.split()[-1]))
            except IndexError:
                # cur_text contained only whitespace, so split() returned nothing
                print(cur_text)
                start += self.seq_length
                continue
            start += self.seq_length - last_chunk_len

            if last_chunk_len == len(cur_text.strip()):
                start += self.seq_length
                continue
            st = cur_text[:-last_chunk_len].strip()
            if st == st.swapcase():
                start += self.seq_length
                continue
            splitted.append(st)

            if flag:
                break

        return splitted

    # def one_hot_encode(self, arr, n_labels):
    #     one_hot = np.zeros((arr.size, n_labels), dtype=np.float32)
    #
    #     # Fill the appropriate elements with ones
    #     one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1.
    #
    #     # Finally reshape it to get back to the original array
    #     one_hot = one_hot.reshape((*arr.shape, n_labels))
    #
    #     return one_hot
    def augment(self, seq):

        augmented_data = self.augmentator.augment(seq)
        return augmented_data

    def get_encodes(self, seq, use_aug=False):
        if use_aug:
            seq = self.augment(seq)

        def padded_encode(x):

            res = np.full((self.seq_length, self.embed.dim),
                          self.embed['<pad>'])
            res1hot = np.full((self.seq_length), self.pad, dtype=np.int32)
            enc = self.embed.encode_ids(x)

            res[:len(enc)] = self.embed.vectors[enc]
            res1hot[:len(enc)] = np.array(enc)
            length = len(enc)

            res = np.insert(res, 0, np.full((self.embed.dim), self.sos), 0)

            length += 1
            if use_aug:
                res = np.insert(res, length, np.full((self.embed.dim),
                                                     self.eos), 0)
                length += 1

                return length, res
            res1hot = np.insert(res1hot, length, self.eos, 0)
            return length, res, res1hot

        return padded_encode(seq)

    def get_params(self):
        return None

    def __len__(self):
        return len(self.tokenized_data)

    def __getitem__(self, index):
        sequence = self.tokenized_data[index]

        lengths_x, X = self.get_encodes(sequence, use_aug=True)
        lengths_y, Y, y1hot = self.get_encodes(sequence)

        return lengths_x, torch.from_numpy(X), lengths_y, torch.from_numpy(
            Y), torch.from_numpy(y1hot)
Code example #18
File: nlu_dense.py  Project: zoovu/rasa
class BytePairFeaturizer(DenseFeaturizer, GraphComponent):
    @classmethod
    def required_components(cls) -> List[Type]:
        """Components that should be included in the pipeline before this component."""
        return [Tokenizer]

    @staticmethod
    def required_packages() -> List[Text]:
        """Any extra python dependencies required for this component to run."""
        return ["bpemb"]

    @staticmethod
    def get_default_config() -> Dict[Text, Any]:
        """Returns the component's default config."""
        return {
            **DenseFeaturizer.get_default_config(),
            # specifies the language of the subword segmentation model
            "lang": None,
            # specifies the dimension of the subword embeddings
            "dim": None,
            # specifies the vocabulary size of the segmentation model
            "vs": None,
            # if set to True and the given vocabulary size can't be loaded for the given
            # model, the closest size is chosen
            "vs_fallback": True,
        }

    def __init__(
        self,
        config: Dict[Text, Any],
        name: Text,
    ) -> None:
        """Constructs a new byte pair vectorizer."""
        super().__init__(name, config)
        # The configuration dictionary is saved in `self._config` for reference.
        self.model = BPEmb(
            lang=self._config["lang"],
            dim=self._config["dim"],
            vs=self._config["vs"],
            vs_fallback=self._config["vs_fallback"],
        )

    @classmethod
    def create(
        cls,
        config: Dict[Text, Any],
        model_storage: ModelStorage,
        resource: Resource,
        execution_context: ExecutionContext,
    ) -> GraphComponent:
        """Creates a new component (see parent class for full docstring)."""
        return cls(config, execution_context.node_name)

    def process(self, messages: List[Message]) -> List[Message]:
        """Processes incoming messages and computes and sets features."""
        for message in messages:
            for attribute in DENSE_FEATURIZABLE_ATTRIBUTES:
                self._set_features(message, attribute)
        return messages

    def process_training_data(self, training_data: TrainingData) -> TrainingData:
        """Processes the training examples in the given training data in-place."""
        self.process(training_data.training_examples)
        return training_data

    def _create_word_vector(self, document: Text) -> np.ndarray:
        """Creates a word vector from a text. Utility method."""
        encoded_ids = self.model.encode_ids(document)
        if encoded_ids:
            return self.model.vectors[encoded_ids[0]]

        return np.zeros((self._config["dim"],), dtype=np.float32)

    def _set_features(self, message: Message, attribute: Text = TEXT) -> None:
        """Sets the features on a single message. Utility method."""
        tokens = message.get(TEXT_TOKENS)

        # If the message doesn't have tokens, we can't create features.
        if not tokens:
            return None

        # We need to reshape here such that the shape is equivalent to that of sparsely
        # generated features. Without it, it'd be a 1D tensor. We need 2D (n_utterance, n_dim).
        text_vector = self._create_word_vector(document=message.get(TEXT)).reshape(
            1, -1
        )
        word_vectors = np.array(
            [self._create_word_vector(document=t.text) for t in tokens]
        )

        final_sequence_features = Features(
            word_vectors,
            FEATURE_TYPE_SEQUENCE,
            attribute,
            self._config[FEATURIZER_CLASS_ALIAS],
        )
        message.add_features(final_sequence_features)
        final_sentence_features = Features(
            text_vector,
            FEATURE_TYPE_SENTENCE,
            attribute,
            self._config[FEATURIZER_CLASS_ALIAS],
        )
        message.add_features(final_sentence_features)

    @classmethod
    def validate_config(cls, config: Dict[Text, Any]) -> None:
        """Validates that the component is configured properly."""
        if not config["lang"]:
            raise ValueError("BytePairFeaturizer needs language setting via `lang`.")
        if not config["dim"]:
            raise ValueError(
                "BytePairFeaturizer needs dimensionality setting via `dim`."
            )
        if not config["vs"]:
            raise ValueError("BytePairFeaturizer needs a vector size setting via `vs`.")
Code example #19
class BytePairFeaturizer(DenseFeaturizer):
    """This component adds BPEmb features."""
    @classmethod
    def required_components(cls) -> List[Type[Component]]:
        return [Tokenizer]

    @classmethod
    def required_packages(cls) -> List[Text]:
        return ["bpemb"]

    defaults = {
        # specifies the language of the subword segmentation model
        "lang": None,
        # specifies the dimension of the subword embeddings
        "dim": None,
        # specifies the vocabulary size of the segmentation model
        "vs": None,
        # if set to True and the given vocabulary size can't be loaded for the given
        # model, the closest size is chosen
        "vs_fallback": True,
        # specifies the folder in which downloaded BPEmb files will be cached
        "cache_dir": str(Path.home() / Path(".cache/bpemb")),
        # specifies the path to a custom SentencePiece model file
        "model_file": None,
        # specifies the path to a custom embedding file
        "emb_file": None,
    }

    language_list = [
        "mt",
        "sd",
        "cr",
        "ba",
        "ht",
        "scn",
        "bi",
        "stq",
        "sm",
        "diq",
        "no",
        "yi",
        "vec",
        "bug",
        "am",
        "tl",
        "mn",
        "atj",
        "ko",
        "mai",
        "lij",
        "tcy",
        "sl",
        "bn",
        "dv",
        "rm",
        "ng",
        "ml",
        "kg",
        "koi",
        "war",
        "et",
        "mhr",
        "als",
        "bar",
        "ii",
        "sco",
        "got",
        "pnb",
        "ss",
        "bpy",
        "tum",
        "ru",
        "qu",
        "hy",
        "tw",
        "bm",
        "vep",
        "dty",
        "udm",
        "gd",
        "lbe",
        "rmy",
        "azb",
        "kw",
        "ja",
        "wuu",
        "pag",
        "ro",
        "tet",
        "ee",
        "min",
        "su",
        "ha",
        "glk",
        "pcd",
        "tk",
        "nrm",
        "ku",
        "gn",
        "ty",
        "bh",
        "pap",
        "fr",
        "ia",
        "cs",
        "ky",
        "ff",
        "kab",
        "rn",
        "csb",
        "tt",
        "cy",
        "ilo",
        "kaa",
        "hif",
        "ak",
        "pa",
        "crh",
        "ti",
        "myv",
        "ur",
        "se",
        "uz",
        "cdo",
        "lez",
        "srn",
        "kk",
        "pih",
        "de",
        "an",
        "tyv",
        "ext",
        "gan",
        "wo",
        "si",
        "lmo",
        "hak",
        "az",
        "ka",
        "ik",
        "frr",
        "hsb",
        "ho",
        "af",
        "nds",
        "pam",
        "el",
        "fur",
        "cu",
        "hr",
        "my",
        "nl",
        "da",
        "ch",
        "vls",
        "es",
        "as",
        "lt",
        "ny",
        "so",
        "oc",
        "lad",
        "pnt",
        "ms",
        "bcl",
        "os",
        "co",
        "ks",
        "or",
        "ay",
        "wa",
        "nah",
        "fa",
        "pl",
        "mzn",
        "za",
        "th",
        "fj",
        "kbp",
        "be",
        "zh",
        "ce",
        "sh",
        "sr",
        "id",
        "chy",
        "ps",
        "lo",
        "tr",
        "st",
        "he",
        "ang",
        "sah",
        "io",
        "gom",
        "ki",
        "sn",
        "kbd",
        "jam",
        "bo",
        "pms",
        "sk",
        "kv",
        "ckb",
        "nv",
        "dsb",
        "zea",
        "xmf",
        "fi",
        "ltg",
        "ksh",
        "ve",
        "new",
        "na",
        "jv",
        "tn",
        "sw",
        "rw",
        "ln",
        "bs",
        "gag",
        "ab",
        "olo",
        "is",
        "bjn",
        "ceb",
        "om",
        "vi",
        "ast",
        "uk",
        "mg",
        "mwl",
        "arz",
        "li",
        "mrj",
        "yo",
        "frp",
        "gl",
        "la",
        "km",
        "sv",
        "nap",
        "jbo",
        "bxr",
        "gv",
        "br",
        "fo",
        "ug",
        "pi",
        "bg",
        "ie",
        "din",
        "sa",
        "pdc",
        "cho",
        "lb",
        "ig",
        "aa",
        "sc",
        "fy",
        "kj",
        "eo",
        "eu",
        "kl",
        "sq",
        "to",
        "mi",
        "tpi",
        "kr",
        "hi",
        "arc",
        "ga",
        "nov",
        "mdf",
        "vo",
        "pfl",
        "rue",
        "haw",
        "kn",
        "mh",
        "mr",
        "te",
        "ca",
        "ace",
        "cv",
        "zu",
        "it",
        "iu",
        "av",
        "sg",
        "hz",
        "lv",
        "ts",
        "lrc",
        "ar",
        "hu",
        "nn",
        "nso",
        "krc",
        "mk",
        "tg",
        "ne",
        "dz",
        "ta",
        "mus",
        "ady",
        "en",
        "lg",
        "xal",
        "gu",
        "pt",
        "xh",
        "szl",
        "chr",
    ]

    def __init__(self,
                 component_config: Optional[Dict[Text, Any]] = None) -> None:
        super().__init__(component_config)

        model_file, emb_file = (self.component_config[k]
                                for k in ["model_file", "emb_file"])
        if model_file:
            if not os.path.exists(model_file):
                raise FileNotFoundError(
                    f"BytePair model {model_file} not found. Please check config."
                )
        if emb_file:
            if not os.path.exists(emb_file):
                raise FileNotFoundError(
                    f"BytePair embedding file {emb_file} not found. Please check config."
                )

        if not self.component_config["lang"]:
            raise ValueError(
                "You must specify the `lang` parameter for BytePairEmbedding in `config.yml`."
            )

        if not self.component_config["vs"]:
            raise ValueError(
                "You must specify the `vs` parameter for BytePairEmbedding in `config.yml`."
            )

        if not self.component_config["dim"]:
            raise ValueError(
                "You must specify the `dim` parameter for BytePairEmbedding in `config.yml`."
            )

        self.model = BPEmb(
            lang=self.component_config["lang"],
            dim=self.component_config["dim"],
            vs=self.component_config["vs"],
            vs_fallback=self.component_config["vs_fallback"],
            cache_dir=self.component_config["cache_dir"],
            model_file=self.component_config["model_file"],
            emb_file=self.component_config["emb_file"],
        )

    def train(
        self,
        training_data: TrainingData,
        config: Optional[RasaNLUModelConfig] = None,
        **kwargs: Any,
    ) -> None:
        for example in training_data.intent_examples:
            for attribute in DENSE_FEATURIZABLE_ATTRIBUTES:
                self.set_bpemb_features(example, attribute)

    def create_word_vector(self, document: Text) -> np.ndarray:
        encoded_ids = self.model.encode_ids(document)
        if encoded_ids:
            return self.model.vectors[encoded_ids[0]]

        return np.zeros((self.component_config["dim"], ), dtype=np.float32)

    def set_bpemb_features(self,
                           message: Message,
                           attribute: Text = TEXT) -> None:
        tokens = message.get(TOKENS_NAMES[attribute])

        if not tokens:
            return None

        # We need to reshape here such that the shape is equivalent to that of sparsely
        # generated features. Without it, it'd be a 1D tensor. We need 2D (n_utterance, n_dim).
        text_vector = self.create_word_vector(
            document=message.get(TEXT)).reshape(1, -1)
        word_vectors = np.array(
            [self.create_word_vector(document=t.text) for t in tokens])

        final_sequence_features = Features(
            word_vectors,
            FEATURE_TYPE_SEQUENCE,
            attribute,
            self.component_config[FEATURIZER_CLASS_ALIAS],
        )
        message.add_features(final_sequence_features)
        final_sentence_features = Features(
            text_vector,
            FEATURE_TYPE_SENTENCE,
            attribute,
            self.component_config[FEATURIZER_CLASS_ALIAS],
        )
        message.add_features(final_sentence_features)

    def process(self, message: Message, **kwargs: Any) -> None:
        self.set_bpemb_features(message)

    def persist(self, file_name: Text,
                model_dir: Text) -> Optional[Dict[Text, Any]]:
        pass

    @classmethod
    def load(
        cls,
        meta: Dict[Text, Any],
        model_dir: Optional[Text] = None,
        model_metadata: Optional["Metadata"] = None,
        cached_component: Optional["Component"] = None,
        **kwargs: Any,
    ) -> "Component":
        if cached_component:
            return cached_component

        return cls(meta)
Code example #20
class BpeTrainData(object):
    def __init__(self, config):
        self._train_data_path = os.path.join(os.path.abspath(os.path.dirname(os.getcwd())), config["train_data"])
        self._output_path = os.path.join(os.path.abspath(os.path.dirname(os.getcwd())),
                                         config["output_path"])
        if not os.path.exists(self._output_path):
            os.makedirs(self._output_path)

        self._embedding_size = config["embedding_size"]  # length of the word vectors
        self.bpe_zh = BPEmb(lang="zh", vs=config["vocab_size"])

        self.vocab_size = None
        self.word_vectors = None

        self.pad_token = 0
        self.go_token = 2
        self.eos_token = 3

    def read_data(self):
        """
        Read the data.
        :return: the parsed dialogue pairs, questions, responses = [[]]
        """
        with open(self._train_data_path, "r", encoding="utf8") as f:
            requests = []
            responses = []
            for line in f.readlines():
                request, response = line.strip().split("<SEP>")
                requests.append(request.strip())
                responses.append(response.strip())
        return requests, responses

    def get_word_vectors(self):
        """
        Load the word vectors straight from the pretrained BPE model and build the embedding matrix.
        :return:
        """
        vocab = self.bpe_zh.words
        # the BPE model has no <pad>, so insert a <pad> token at position 0
        # (matching the pad vector stacked first below)
        vocab.insert(0, "<pad>")
        vectors = self.bpe_zh.vectors
        print(vectors.shape)
        pad_vector = np.random.randn(self._embedding_size)
        word_vectors = np.vstack((pad_vector, vectors))
        return vocab, word_vectors

    def trans_to_index(self, data):
        """
        Convert the input to index representation.
        :param data: the questions and responses
        :return:
        """
        data_ids = []
        for sentence in data:
            token_ids = self.bpe_zh.encode_ids(sentence)
            # <pad> occupies position 0 of the vocab, so BPE ids start from 1: shift every id by +1
            token_ids = list(map(lambda x: x + 1, token_ids))
            data_ids.append(token_ids)

        return data_ids

    def padding(self, batch):
        """
        Pad every sample in a batch to the longest sentence in the batch.
        :param batch:
        :return:
        """
        question_length = [len(sample[0]) for sample in batch]
        max_question_length = max(question_length)
        questions = [sample[0] + [self.pad_token] * (max_question_length - len(sample[0]))
                     for sample in batch]

        # first append the end-of-sequence token <eos> to each response
        responses = [sample[1] + [self.eos_token] for sample in batch]
        response_length = [len(response) for response in responses]
        max_response_length = max(response_length)

        # pad each response to the max length
        pad_responses = [response + [self.pad_token] * (max_response_length - len(response)) for response in responses]

        return dict(questions=questions, responses=pad_responses,
                    question_length=question_length, response_length=response_length)

    def gen_data(self):
        """
        Generate data that can be fed into the model.
        :return:
        """
        # if the data has already been preprocessed, just load it
        if os.path.exists(os.path.join(self._output_path, "train_data.pkl")):
            print("load existed train data")
            with open(os.path.join(self._output_path, "train_data.pkl"), "rb") as f:
                train_data = pickle.load(f)
            return train_data

        # 1. read the raw data
        questions, responses = self.read_data()

        # 2. build the vocabulary and word-vector matrix
        vocab, word_vectors = self.get_word_vectors()
        self.vocab_size = len(vocab)
        self.word_vectors = word_vectors

        # 3. convert inputs to indices
        questions_idx = self.trans_to_index(questions)
        responses_idx = self.trans_to_index(responses)

        # 4. build the training data and save it
        train_data = [[questions_idx[i], responses_idx[i]] for i in range(len(questions_idx))]
        with open(os.path.join(self._output_path, "train_data.pkl"), "wb") as fw:
            pickle.dump(train_data, fw)
        return train_data

    def next_batch(self, data, batch_size):
        """
        Yield batches of data.
        :param data: the input data
        :param batch_size: the batch size
        :return:
        """
        random.shuffle(data)
        batch_num = len(data) // batch_size

        for i in range(batch_num):
            batch_data = data[batch_size * i: batch_size * (i + 1)]
            new_batch = self.padding(batch_data)
            yield new_batch
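
A tiny worked example of the +1 id shift that reserves index 0 for <pad>; the Predictor in code example #16 applies the inverse -1 shift before decoding:

bpe_zh = BPEmb(lang="zh", vs=50000)
ids = bpe_zh.encode_ids("你好")
shifted = [i + 1 for i in ids]       # index 0 is now free for <pad>
restored = [i - 1 for i in shifted]  # undo the shift before decode_ids
assert bpe_zh.decode_ids(restored) == bpe_zh.decode_ids(ids)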
Code example #21
File: bot.py  Project: ivankholin/final_project
class Model():
    '''
    Model class: loads the saved model and restores punctuation in a sentence.
    '''
    def __init__(self,
                 export_dir,
                 vocab_size=5000,
                 emb_dim=200,
                 dict_punct=None):
        self.vocab_size = vocab_size
        self.emb_dim = emb_dim

        self.bpemb_ru = BPEmb(lang='ru', vs=vocab_size, dim=emb_dim)

        self.export_dir = export_dir
        self.predict_fn = predictor.from_saved_model(export_dir)
        if dict_punct is None:
            self.d = {
                1: 4922,
                2: 4921,
                3: 4978,
                4: 4985,
                5: 4947,
                6: 4963,
                7: 4936
            }
        else:
            self.d = dict_punct

    def parse_fn(self, line):
        '''
        Encode a line:
        line - the input string
        '''
        feature = np.array([self.bpemb_ru.encode_ids(line)]).astype(np.int32)
        return feature, np.array([len(feature[0])])

    def to_capital_latter(self, sentence):
        '''Capitalize the first letter after sentence-ending punctuation.'''
        tmp = ''
        flag = True
        for c in sentence:
            if flag and c != ' ':
                tmp += c.upper()
                flag = False
            else:
                tmp += c
            if c in '.?!':
                flag = True
        return tmp

    def predict(self, line):
        x, x_len = self.parse_fn(line)
        predict = self.predict_fn({'x': x, 'len': x_len})
        a = []
        for i in range(predict['lengths'][0]):
            a.append(predict['sequences'][0][i])
            if predict['prediction'][0][i] != 0:
                a.append(self.d[predict['prediction'][0][i]])
        return self.to_capital_latter(self.bpemb_ru.decode_ids(np.array(a)))
Code example #22
class BpeEvalData(object):
    def __init__(self, config):

        self._eval_data_path = os.path.join(
            os.path.abspath(os.path.dirname(os.getcwd())), config["eval_data"])
        self._output_path = os.path.join(
            os.path.abspath(os.path.dirname(os.getcwd())),
            config["output_path"])

        self.bpe_zh = BPEmb(lang="zh", vs=config["vocab_size"])
        self.pad_token = 0
        self.eos_token = 3

    def read_data(self):
        """
        Read the data.
        :return: the parsed request/response pairs, requests, responses = [[]]
        """
        with open(self._eval_data_path, "r", encoding="utf8") as f:
            requests = []
            responses = []
            for line in f.readlines():
                request, response = line.strip().split("<SEP>")
                requests.append(request.strip())
                responses.append(response.strip())
        return requests, responses

    def trans_to_index(self, data):
        """
        Convert the input to index representation.
        :param data: the questions and responses
        :return:
        """
        data_ids = []
        for sentence in data:
            token_ids = self.bpe_zh.encode_ids(sentence)
            # <pad> occupies position 0 of the vocab, so BPE ids start from 1: shift every id by +1
            token_ids = list(map(lambda x: x + 1, token_ids))
            data_ids.append(token_ids)
        return data_ids

    def padding(self, batch):
        """
        Pad every sample in a batch to the longest sentence in the batch.
        :param batch:
        :return:
        """
        question_length = [len(sample[0]) for sample in batch]
        max_question_length = max(question_length)
        questions = [
            sample[0] + [self.pad_token] *
            (max_question_length - len(sample[0])) for sample in batch
        ]

        # first append the end-of-sequence token <eos> to each response
        responses = [sample[1] + [self.eos_token] for sample in batch]
        response_length = [len(response) for response in responses]
        max_response_length = max(response_length)

        # pad each response to the max length
        pad_responses = [
            response + [self.pad_token] * (max_response_length - len(response))
            for response in responses
        ]

        return dict(questions=questions,
                    responses=pad_responses,
                    question_length=question_length,
                    response_length=response_length)

    def gen_data(self):
        """
        Generate data that can be fed into the model.
        :return:
        """
        # if the data has already been preprocessed, just load it
        if os.path.exists(os.path.join(self._output_path, "eval_data.pkl")):
            print("load existed eval data")
            with open(os.path.join(self._output_path, "eval_data.pkl"),
                      "rb") as f:
                eval_data = pickle.load(f)
            return eval_data

        # 1. read the raw data
        questions, responses = self.read_data()

        # 2. convert inputs to indices
        questions_idx = self.trans_to_index(questions)
        responses_idx = self.trans_to_index(responses)

        # 3. build the eval data and save it
        eval_data = [[questions_idx[i], responses_idx[i]]
                     for i in range(len(questions_idx))]
        with open(os.path.join(self._output_path, "eval_data.pkl"),
                  "wb") as fw:
            pickle.dump(eval_data, fw)
        return eval_data

    def next_batch(self, data, batch_size):
        """
        Yield batches of data.
        :param data: the input data
        :param batch_size: the batch size
        :return:
        """
        random.shuffle(data)
        batch_num = len(data) // batch_size

        for i in range(batch_num):
            batch_data = data[batch_size * i:batch_size * (i + 1)]
            new_batch = self.padding(batch_data)
            yield new_batch
Code example #23
File: model.py  Project: benjamin-croker/TopicC
class TopicKeyEncBPemb(_TopicKeyBase):
    def __init__(self, embed_size, enc_hidden_size, dense_size):
        super(TopicKeyEncBPemb, self).__init__()
        print("init: TopicKeyEncBPemb model")

        self.embedding_model = BPEmb(dim=embed_size, lang="en", vs=100000)
        self.encoder = nn.LSTM(input_size=embed_size,
                               hidden_size=enc_hidden_size,
                               num_layers=2,
                               bidirectional=True)

        self._lstm_layers = 2
        self._lstm_directions = 2

        self.enc_to_dense_map = nn.Linear(2 * enc_hidden_size, dense_size)
        self.dense_to_output_map = nn.Linear(dense_size, 1)

    def embed_sequence(self, sequence: List[str]) -> torch.Tensor:
        # sequence is a list of words (strings)
        # average the per-word encoding vector
        def _enc(word):
            v_ids = self.embedding_model.encode_ids(word)
            return self.embedding_model.vectors[v_ids]

        return torch.tensor([_enc(word).mean(axis=0)
                             for word in sequence]).to(self._device)

    def create_seq_vecs(
        self, sequences: List[List[str]]
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        # returns the padded seq vector, lengths and original order index
        # sequences are sorted by length, and can be reverted to their original
        # order with the unsorting index vector

        # start by embedding the sequence vectors
        seq_vecs = [self.embed_sequence(s) for s in sequences]

        # sort lengths and set the device
        lengths = torch.tensor([seq_v.shape[0] for seq_v in seq_vecs])
        lengths, sort_i = lengths.sort(descending=True)
        _, orig_i = sort_i.sort()

        # pad the seq vecs and sort by length (dim 1 is the batch dimension)
        seq_vec_pad = rnn.pad_sequence(seq_vecs).to(self._device)
        seq_vec_pad = seq_vec_pad[:, sort_i, :]

        return seq_vec_pad, lengths, orig_i

    def forward(
            self,
            sequences: List[List[str]]) -> Tuple[torch.Tensor, torch.Tensor]:
        # Make the word embeddings for each sequence
        pad_seq_vecs, lengths, orig_i = self.create_seq_vecs(sequences)

        # masks to indicate which parts of the sequences are padding
        # lengths are sorted largest to smallest, so lengths[0] is max_len
        pad_mask = torch.zeros(len(sequences),
                               lengths[0],
                               device=self._device,
                               dtype=torch.bool)
        for i, length in enumerate(lengths):
            pad_mask[i, length:] = True

        # pack the sequence for the LSTM
        # packed_seq_vecs.shape = max_seq_len, batch_size, embedding_dim
        packed_seq_vecs = rnn.pack_padded_sequence(pad_seq_vecs, lengths)

        # run through the LSTM
        enc_outputs, _ = self.encoder(packed_seq_vecs)
        # enc_outputs.shape: max_seq_len, batch_size, 2*enc_hidden_size
        enc_outputs, _ = nn.utils.rnn.pad_packed_sequence(enc_outputs)
        # re-order so batch dim is first
        # enc_outputs.shape: batch_size, max_seq_len, 2*enc_hidden_size
        enc_outputs = enc_outputs.permute(1, 0, 2)

        # dense.shape: batch_size, max_seq_len, 2*enc_hidden_size
        dense = self.enc_to_dense_map(enc_outputs)
        dense = torch.tanh(dense)

        # final output layer, and remove the last dimension
        # output.shape = batch_size, max_seq_len
        output = self.dense_to_output_map(dense).squeeze(dim=2)

        return output[orig_i, :], pad_mask[orig_i, :]
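
# A minimal usage sketch for the model above. The hyperparameter values and
# input tokens are illustrative assumptions, and `_device` is assumed to be
# provided by the `_TopicKeyBase` parent class.
model = TopicKeyEncBPemb(embed_size=100, enc_hidden_size=128, dense_size=64)
batch = [["byte", "pair", "embeddings"], ["topic", "keywords"]]
scores, pad_mask = model(batch)
# scores.shape = (batch_size, max_seq_len): one keyword logit per word.
# Mask out padded positions before applying a softmax or loss:
scores = scores.masked_fill(pad_mask, -float("inf"))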
コード例 #24
0
from bpemb import BPEmb
from torch import nn, tensor

# (assumed) load the French subword embeddings used below; the exact
# dim/vs settings are not shown in the original notebook
bpemb_fr = BPEmb(lang="fr", dim=100)


# In[69]:

emb_layer = nn.Embedding.from_pretrained(tensor(bpemb_fr.vectors))


# In[70]:

emb_layer


# In[71]:

ids = bpemb_fr.encode_ids("Ceci est une phrase française")


# In[72]:

ids


# In[73]:

bpemb_fr.vectors[ids].shape


# In[74]:

emb_layer(tensor(ids)).shape
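
# A related option worth noting: BPEmb can append an all-zero padding vector
# via add_pad_emb=True, which pairs with padding_idx when building the
# embedding layer. A small sketch, assuming dim=100 and the default
# vocabulary size of 10000; the extra pad vector then sits at index vs.
bpemb_fr_pad = BPEmb(lang="fr", dim=100, vs=10000, add_pad_emb=True)
emb_layer_pad = nn.Embedding.from_pretrained(tensor(bpemb_fr_pad.vectors),
                                             padding_idx=10000)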
コード例 #25
0
ファイル: model.py プロジェクト: benjamin-croker/TopicC
from typing import List, Tuple

import torch
from torch import nn
from torch.nn.utils import rnn
from bpemb import BPEmb


# _TopicCBase is defined elsewhere in the project's model.py
class TopicCEncBPemb(_TopicCBase):
    def __init__(self, embed_size, output_size, enc_hidden_size,
                 attention_size, dense_size):
        super(TopicCEncBPemb, self).__init__()
        print("init: TopicCEncBPemb model")

        self.embedding_model = BPEmb(dim=embed_size, lang="en", vs=100000)
        self.encoder = nn.LSTM(input_size=embed_size,
                               hidden_size=enc_hidden_size,
                               num_layers=2,
                               bidirectional=True)

        self._lstm_layers = 2
        self._lstm_directions = 2

        self.enc_to_att_map = nn.Linear(self._lstm_directions *
                                        enc_hidden_size,
                                        attention_size,
                                        bias=False)
        # The attention vector is used like the decoder states in the attention
        # component in a seq-to-seq model, however it's a single, learnable
        # vector in this case
        self.att_vec = nn.Linear(attention_size, 1, bias=False)
        self.seq_to_dense_map = nn.Linear(4 * enc_hidden_size, dense_size)
        self.dense_to_output_map = nn.Linear(dense_size, output_size)

    def embed_sequence(self, sequence: str) -> torch.Tensor:
        v_ids = self.embedding_model.encode_ids(sequence)
        return torch.tensor(self.embedding_model.vectors[v_ids]).to(
            self._device)

    def create_seq_vecs(
        self, sequences: List[str]
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        # returns the padded seq vector, lengths and original order index
        # sequences are sorted by length, and can be reverted to their original
        # order with the unsorting index vector

        # start by embedding the sequence vectors
        seq_vecs = [self.embed_sequence(s) for s in sequences]

        # sort lengths and set the device
        lengths = torch.tensor([seq_v.shape[0] for seq_v in seq_vecs])
        lengths, sort_i = lengths.sort(descending=True)
        _, orig_i = sort_i.sort()

        # pad the seq vecs and sort by length (dim 1 is the batch dimension)
        seq_vec_pad = rnn.pad_sequence(seq_vecs).to(self._device)
        seq_vec_pad = seq_vec_pad[:, sort_i, :]

        return seq_vec_pad, lengths, orig_i

    def forward(self, sequences: List[str]) -> torch.Tensor:
        # Make the word embeddings for each sequence
        pad_seq_vecs, lengths, orig_i = self.create_seq_vecs(sequences)

        # pack the sequence for the LSTM
        # packed_seq_vecs.shape = max_seq_len, batch_size, embedding_dim
        packed_seq_vecs = rnn.pack_padded_sequence(pad_seq_vecs, lengths)

        # run through the LSTM
        # h_n.shape = num_layers*num_directions, batch, hidden_size
        # Note from docs:
        #   the layers can be separated using h_n.view(num_layers, num_directions, batch, hidden_size)
        enc_outputs, (h_n, _) = self.encoder(packed_seq_vecs)
        h_n = h_n.view(self._lstm_layers, self._lstm_directions, h_n.shape[1],
                       h_n.shape[2])

        # unpack the sequence
        # enc_outputs.shape = max_seq_len, batch_size, 2*enc_hidden_size
        enc_outputs, _ = nn.utils.rnn.pad_packed_sequence(enc_outputs)

        # encoder masks to indicate which parts of the sequence should be considered
        enc_masks = torch.zeros(enc_outputs.shape[0],
                                enc_outputs.shape[1],
                                1,
                                dtype=torch.bool,
                                device=self._device)
        for i, length in enumerate(lengths):
            enc_masks[length:, i, 0] = True

        # encoder outputs projected to the dimension of the attention vector
        # att_proj.shape = max_seq_len, batch_size, attention_size
        att_proj = self.enc_to_att_map(enc_outputs)
        # weights to the where attention is given to each index of the sequence
        # att_w.shape = max_seq_len, batch_size, 1
        att_w = self.att_vec(att_proj)
        # mask out sections which are not part of the sequence
        att_w = att_w.masked_fill(enc_masks, -float('inf'))
        # att_w.shape = max_seq_len, batch_size, 1
        # turn into a normalised probability with softmax
        att_w = nn.functional.softmax(att_w, dim=0)

        # permute so the batch dimension is first instead of second
        # pointer_w.shape = batch_size, max_seq_len, 1
        att_w = att_w.permute(1, 0, 2)
        # permute so the batch dimension is first, and seq_len dim is summed
        # enc_outputs.shape = batch_size, 2*enc_hidden_size, max_seq_len
        enc_outputs = enc_outputs.permute(1, 2, 0)
        # weighted sum of states
        # att_output.shape = batch_size, 2*enc_hidden_size, 1
        att_output = torch.bmm(enc_outputs, att_w)
        # remove the last dimension
        # att_output.shape = batch_size, 2*enc_hidden_size
        att_output = att_output.squeeze(dim=2)

        # combine with the hidden states
        # get the last layer index0 = -1 for both directions index1 = (0, 1)
        seq_output = torch.cat((att_output, h_n[-1][0], h_n[-1][1]), dim=1)

        # have a dense non-linear layer
        # dense.shape = batch_size, dense_size
        dense = self.seq_to_dense_map(seq_output)
        dense = torch.tanh(dense)

        # final output layer
        # output.shape = batch_size, n_categories
        output = self.dense_to_output_map(dense)

        # sort the output to match the original order
        output = output[orig_i, :]

        return nn.functional.log_softmax(output, dim=1)
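
# A minimal usage sketch for the classifier above. The hyperparameters are
# illustrative assumptions, and `_device` again comes from the base class.
model = TopicCEncBPemb(embed_size=100, output_size=30, enc_hidden_size=128,
                       attention_size=64, dense_size=64)
log_probs = model(["an article about machine learning",
                   "an article about football"])
# log_probs.shape = (batch_size, output_size); pick the best category
predictions = log_probs.argmax(dim=1)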
コード例 #26
0
class BytePairFeaturizer(DenseFeaturizer):
    """This component adds BPEmb features."""
    @classmethod
    def required_components(cls) -> List[Type[Component]]:
        return [Tokenizer]

    @classmethod
    def required_packages(cls) -> List[Text]:
        return ["bpemb"]

    defaults = {
        # specifies the language of the subword segmentation model
        "lang": "en",
        # specifies the dimension of the subword embeddings
        "dim": 25,
        # specifies the vocabulary size of the segmentation model
        "vs": 1000,
        # if set to True and the given vocabulary size can't be loaded for the given
        # model, the closest size is chosen
        "vs_fallback": True,
        # specifies the folder in which downloaded BPEmb files will be cached
        "cache_dir": Path.home() / Path(".cache/bpemb"),
        # specifies the path to a custom SentencePiece model file
        "model_file": None,
        # specifies the path to a custom embedding file. Supported formats are Word2Vec
        # plain text and GenSim binary.
        "emb_file": None,
    }

    language_list = [
        "mt",
        "sd",
        "cr",
        "ba",
        "ht",
        "scn",
        "bi",
        "stq",
        "sm",
        "diq",
        "no",
        "yi",
        "vec",
        "bug",
        "am",
        "tl",
        "mn",
        "atj",
        "ko",
        "mai",
        "lij",
        "tcy",
        "sl",
        "bn",
        "dv",
        "rm",
        "ng",
        "ml",
        "kg",
        "koi",
        "war",
        "et",
        "mhr",
        "als",
        "bar",
        "ii",
        "sco",
        "got",
        "pnb",
        "ss",
        "bpy",
        "tum",
        "ru",
        "qu",
        "hy",
        "tw",
        "bm",
        "vep",
        "dty",
        "udm",
        "gd",
        "lbe",
        "rmy",
        "azb",
        "kw",
        "ja",
        "wuu",
        "pag",
        "ro",
        "tet",
        "ee",
        "min",
        "su",
        "ha",
        "glk",
        "pcd",
        "tk",
        "nrm",
        "ku",
        "gn",
        "ty",
        "bh",
        "pap",
        "fr",
        "ia",
        "cs",
        "ky",
        "ff",
        "kab",
        "rn",
        "csb",
        "tt",
        "cy",
        "ilo",
        "kaa",
        "hif",
        "ak",
        "pa",
        "crh",
        "ti",
        "myv",
        "ur",
        "se",
        "uz",
        "cdo",
        "lez",
        "srn",
        "kk",
        "pih",
        "de",
        "an",
        "tyv",
        "ext",
        "gan",
        "wo",
        "si",
        "lmo",
        "hak",
        "az",
        "ka",
        "ik",
        "frr",
        "hsb",
        "ho",
        "af",
        "nds",
        "pam",
        "el",
        "fur",
        "cu",
        "hr",
        "my",
        "nl",
        "da",
        "ch",
        "vls",
        "es",
        "as",
        "lt",
        "ny",
        "so",
        "oc",
        "lad",
        "pnt",
        "ms",
        "bcl",
        "os",
        "co",
        "ks",
        "or",
        "ay",
        "wa",
        "nah",
        "fa",
        "pl",
        "mzn",
        "za",
        "th",
        "fj",
        "kbp",
        "be",
        "zh",
        "ce",
        "sh",
        "sr",
        "id",
        "chy",
        "ps",
        "lo",
        "tr",
        "st",
        "he",
        "ang",
        "sah",
        "io",
        "gom",
        "ki",
        "sn",
        "kbd",
        "jam",
        "bo",
        "pms",
        "sk",
        "kv",
        "ckb",
        "nv",
        "dsb",
        "zea",
        "xmf",
        "fi",
        "ltg",
        "ksh",
        "ve",
        "new",
        "na",
        "jv",
        "tn",
        "sw",
        "rw",
        "ln",
        "bs",
        "gag",
        "ab",
        "olo",
        "is",
        "bjn",
        "ceb",
        "om",
        "vi",
        "ast",
        "uk",
        "mg",
        "mwl",
        "arz",
        "li",
        "mrj",
        "yo",
        "frp",
        "gl",
        "la",
        "km",
        "sv",
        "nap",
        "jbo",
        "bxr",
        "gv",
        "br",
        "fo",
        "ug",
        "pi",
        "bg",
        "ie",
        "din",
        "sa",
        "pdc",
        "cho",
        "lb",
        "ig",
        "aa",
        "sc",
        "fy",
        "kj",
        "eo",
        "eu",
        "kl",
        "sq",
        "to",
        "mi",
        "tpi",
        "kr",
        "hi",
        "arc",
        "ga",
        "nov",
        "mdf",
        "vo",
        "pfl",
        "rue",
        "haw",
        "kn",
        "mh",
        "mr",
        "te",
        "ca",
        "ace",
        "cv",
        "zu",
        "it",
        "iu",
        "av",
        "sg",
        "hz",
        "lv",
        "ts",
        "lrc",
        "ar",
        "hu",
        "nn",
        "nso",
        "krc",
        "mk",
        "tg",
        "ne",
        "dz",
        "ta",
        "mus",
        "ady",
        "en",
        "lg",
        "xal",
        "gu",
        "pt",
        "xh",
        "szl",
        "chr",
    ]

    def __init__(self,
                 component_config: Optional[Dict[Text, Any]] = None) -> None:
        super().__init__(component_config)

        self.model = BPEmb(
            lang=self.component_config["lang"],
            dim=self.component_config["dim"],
            vs=self.component_config["vs"],
            vs_fallback=self.component_config["vs_fallback"],
            cache_dir=self.component_config["cache_dir"],
        )

    def train(
        self,
        training_data: TrainingData,
        config: Optional[RasaNLUModelConfig] = None,
        **kwargs: Any,
    ) -> None:
        for example in training_data.intent_examples:
            for attribute in DENSE_FEATURIZABLE_ATTRIBUTES:
                self.set_bpemb_features(example, attribute)

    def create_word_vector(self, document: Text) -> np.ndarray:
        encoded_ids = self.model.encode_ids(document)
        if encoded_ids:
            return self.model.vectors[encoded_ids[0]]

        return np.zeros((self.component_config["dim"], ), dtype=np.float32)

    def set_bpemb_features(self,
                           message: Message,
                           attribute: Text = TEXT) -> None:
        tokens = message.get(TOKENS_NAMES[attribute])

        if not tokens:
            return None

        text_vector = self.create_word_vector(document=message.text)
        word_vectors = [
            self.create_word_vector(document=t.text)
            for t in train_utils.tokens_without_cls(message, attribute)
        ]
        X = np.array(word_vectors + [text_vector])

        features = self._combine_with_existing_dense_features(
            message,
            additional_features=X,
            feature_name=DENSE_FEATURE_NAMES[attribute])
        message.set(DENSE_FEATURE_NAMES[attribute], features)

    def process(self, message: Message, **kwargs: Any) -> None:
        self.set_bpemb_features(message)

    def persist(self, file_name: Text,
                model_dir: Text) -> Optional[Dict[Text, Any]]:
        pass

    @classmethod
    def load(
        cls,
        meta: Dict[Text, Any],
        model_dir: Optional[Text] = None,
        model_metadata: Optional["Metadata"] = None,
        cached_component: Optional["Component"] = None,
        **kwargs: Any,
    ) -> "Component":
        if cached_component:
            return cached_component

        return cls(meta)
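
# Note that create_word_vector above keeps only the first subword's embedding
# for each document. Averaging over all subwords is a common alternative; a
# minimal standalone sketch, not part of the original component:
import numpy as np
from bpemb import BPEmb

def mean_word_vector(model: BPEmb, document: str, dim: int) -> np.ndarray:
    """Average the embeddings of all subwords instead of keeping the first."""
    encoded_ids = model.encode_ids(document)
    if encoded_ids:
        return model.vectors[encoded_ids].mean(axis=0)
    return np.zeros((dim,), dtype=np.float32)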
コード例 #27
0
# Build our DALL-E model. (Excerpt: the hyperparameter constants, the trained
# DiscreteVAE instance `vae`, and the numpy/tensorflow/bpemb imports are
# defined earlier in the original script.)
dalle = DALLE(
    dim=CODEBOOK_DIM,  # Codebook Dimension
    vae=vae,  # DiscreteVAE instance: image sequence length and number of image tokens inferred
    num_text_tokens=VOCAB_SIZE + 1,  # Vocab size for text. Add 1 for <PAD>
    text_sequence_len=TEXT_SEQ_LEN,  # Text sequence length
    depth=DEPTH,  # Transformer depth: should aim to be 64
    heads=HEADS,  # Attention heads
    dim_head=DIM_HEAD,  # Attention head dimension
    reversible=REVERSIBLE,  # Whether to use ReversibleSequence or SequentialSequence
    attn_dropout=ATTN_DROPOUT,  # Attention dropout
    ff_dropout=FF_DROPOUT  # Feedforward dropout
)

dalle.load_weights("./dalle_tensorflow/model_weights/dalle/dalle_weights")

text = "A running horse."

bpe_encoder = BPEmb(lang="en", vs=VOCAB_SIZE, add_pad_emb=True)
text = bpe_encoder.encode_ids(text)
text = np.array(text)
text = np.pad(array=text, pad_width=[0, TEXT_SEQ_LEN - len(text)])
text = tf.expand_dims(text, axis=0)
mask = tf.cast(tf.where(text != 0, 1, 0), dtype=tf.bool)  # True for real tokens, False for <PAD>

output_images = dalle.generate_images(text, mask=mask)
output_images = tf.reshape(tensor=output_images, shape=[IMG_SIZE, IMG_SIZE, 3])
save_img(path="dalle_out.jpg", x=output_images)  # save_img writes the file and returns None
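
# As a quick sanity check on the prompt encoding, the ids can be decoded back
# to (lowercased, normalized) text with the same encoder:
ids = bpe_encoder.encode_ids("A running horse.")
print(bpe_encoder.decode_ids(ids))  # prints the normalized prompt back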