Example #1
 def __init__(self, vocab_size, embedding_dim, max_sequence_len):
     self.vocab_size = vocab_size
     self.embedding_dim = embedding_dim
     self.max_sequence_len = max_sequence_len
     self.bpemb_en_100k = BPEmb(lang="en",
                                vs=self.vocab_size,
                                dim=self.embedding_dim)  # 40 M; the larger the vocabulary, the fewer the subword splits
Example #2
class SubWordVocab(object):
    def __init__(self, size):
        self.encoder = BPEmb(lang='en', vs=size)

        assert self.sos_id == 1
        assert self.eos_id == 2

    def __len__(self):
        return self.encoder.vs

    @property
    def sos_id(self):
        return 1

    @property
    def eos_id(self):
        return self.encoder.EOS

    def encode(self, syms):
        return self.encoder.encode_ids(syms)

    def decode(self, ids):
        syms = self.encoder.decode_ids(ids)
        if isinstance(syms, list):
            return ''
        return syms
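A minimal usage sketch for the SubWordVocab class above (assuming bpemb is installed and BPEmb has been imported; the vocabulary size of 10000 is an arbitrary choice):

vocab = SubWordVocab(10000)
ids = vocab.encode("hello world")   # list of subword ids
print(len(vocab))                   # vocabulary size, i.e. 10000
print(vocab.decode(ids))            # decoded text, e.g. "hello world"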
Example #3
class BPEmbVaeSampler(VAESampler):
    def __init__(self, lang, vs, dim, decode_from, params, cuda=False):
        self.bp = BPEmb(lang=lang, vs=vs, dim=dim)
        super().__init__(decode_from, params, cuda)

    def to_s(self, decoded):
        out = []
        for item in decoded:
            s = self.bp.decode(item).replace('▁', ' ').strip()
            s = s[0].upper() + s[1:]
            s = re.sub(r'\bi\b', 'I', s)
            s = re.sub(r'[.!?]\s+(\w)',
                       lambda m: m.group()[:-1] + m.group()[-1].upper(), s)
            out.append(s)
        return out

    def str2ids(self, s):
        """
        Encode string s with BPEmb. BPEmb has a fixed vocabulary size, but
        the model only has outputs for vocab items that are used in the
        training data, so this function replaces any BPEmb ids *not* in the
        training vocabulary with the model's "unknown" id.
        """
        encoded = self.bp.encode(s)
        ids = [self.vocab.word2id.get(item, self.vocab.unk_id)
               for item in encoded]
        return ids
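To illustrate the id remapping described in the docstring, here is a standalone sketch with a made-up training vocabulary; word2id and unk_id are hypothetical stand-ins for the model's vocab object:

from bpemb import BPEmb

bp = BPEmb(lang="en", vs=10000, dim=100)
word2id = {"▁hello": 0, "▁world": 1}   # hypothetical mapping: subword piece -> model vocab id
unk_id = 2                             # hypothetical "unknown" id
pieces = bp.encode("hello world!")     # e.g. ['▁hello', '▁world', '!']
ids = [word2id.get(piece, unk_id) for piece in pieces]  # unseen pieces map to unk_id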
Example #4
def test_multi_language():
    text = ["This is Stratford", "Kitap okuyordu."]
    bpemb_multi = BPEmb(lang="multi", add_pad_emb=True)
    print(bpemb_multi.encode_ids_with_bos_eos(text))
    print(
        bpemb_multi.decode_ids([[1, 5496, 200, 23866, 3927, 2],
                                [1, 45350, 44934, 67191, 94777, 2]]))
Example #5
 def __init__(self,
              output_dim,
              vocab_size=10000,
              embed_dim=50,
              lang='en',
              embedding_preload=True,
              gpu_id=-1,
              dropout=0):
     super(LanguagePeripheral, self).__init__()
     self.gpu_id = gpu_id
     self.pad_char = vocab_size
     self.bpe_encoder = BPEmb(lang=lang,
                              vs=vocab_size,
                              dim=embed_dim,
                              add_pad_emb=True)
     # Add an extra padding character
     self.embed_layer = nn.Embedding(vocab_size + 1,
                                     embed_dim,
                                     padding_idx=self.pad_char)
     if embedding_preload:
         self.embed_layer.load_state_dict(
             {'weight': torch.tensor(self.bpe_encoder.emb.vectors)})
         print("Loading pretrained word embeddings.")
     self.enc_dropout = nn.Dropout(dropout)
     self.output = nn.Linear(embed_dim, output_dim)
Example #6
def test_punctuation():
    text = [
        "Leonidas: This's Sparta!!", "Leonidas : This ' s Sparta ! !",
        "Leonidas This s Sparta"
    ]
    bpemb_multi = BPEmb(lang="multi", add_pad_emb=True)
    print(bpemb_multi.encode(text))
Example #7
class Hparams:
    parser = argparse.ArgumentParser()

    bpemb_en = BPEmb(lang="en", dim=50)
    bpemb_de = BPEmb(lang='de', dim=50)

    # preprocess
    parser.add_argument('--BUFFER_SIZE', default=10000)
    parser.add_argument('--batch_size', default=64)
    parser.add_argument('--maxlen', default=40, help='max length of sentences')
    parser.add_argument('--tokenizer_de',
                        default=bpemb_de,
                        help='encoding method')
    parser.add_argument('--tokenizer_en',
                        default=bpemb_en,
                        help='decoding method')

    # train
    parser.add_argument('--num_layers',
                        default=4,
                        help='blocks number of encoder and decoder')
    parser.add_argument('--d_model', default=128)
    parser.add_argument('--dff', default=512)
    parser.add_argument('--num_heads', default=8)
    parser.add_argument('--dropout_rate', default=0.1)
    parser.add_argument('--checkpoint_dir', default='./checkpoints/train')
    parser.add_argument('--checkpoint_dir_de', default='./checkpoints/de_en')
    parser.add_argument('--epochs', default=10)
Example #8
 def __init__(self, predictor_config):
     predictor_config = predictor_config['vectorizer']
     self.bpemb = BPEmb(lang='en',
                        dim=predictor_config['embedding_dim'],
                        vs=predictor_config['max_vocab_size'],
                        add_pad_emb=True)
     self.max_seq_len = predictor_config['max_seq_len']
Example #9
 def get_embedding_vec(self, word):
     if self.model is None:
         self.model = BPEmb(lang="en", dim=self.dim, vs=self.bp_vocab_size)
     if not self.case_sensitive:
         word = word.lower()
     vecs = self.model.embed(word)
     return np.reshape(np.sum(vecs, axis=0), (self.dim, ))
Example #10
 def __init__(self, lang='ru', pretrained=True, vocab_size=100000, dim=300):
     self.lang = lang
     self.pretrained = pretrained
     self.bpe = BPEmb(lang=self.lang,
                      vs=vocab_size,
                      dim=dim,
                      vs_fallback=True)
Example #11
def get_transformer(ff_dim: int, n_layers: int, n_heads: int,
                    dropout_prob: float):
    """
    Creates a new transformer and tokenizer using the given parameters
    :param ff_dim:
    :param n_layers:
    :param n_heads:
    :param dropout_prob:
    :return:
    """
    # Load English model with 25k word-pieces
    tokenizer = BPEmb(lang='en', dim=300, vs=25000)
    # Extract the embeddings and add a zero-initialized embedding for our extra [PAD] token
    pretrained_embeddings = np.concatenate(
        [tokenizer.emb.vectors,
         np.zeros(shape=(1, 300))], axis=0)
    # Extract the vocab and add an extra [PAD] token
    vocabulary = tokenizer.emb.index2word + ['[PAD]']
    tokenizer.pad_token_id = len(vocabulary) - 1

    model = TransformerClassifier(torch.tensor(pretrained_embeddings).type(
        torch.FloatTensor),
                                  ff_dim=ff_dim,
                                  d_model=300,
                                  n_heads=n_heads,
                                  n_layers=n_layers,
                                  dropout_prob=dropout_prob).to(device)

    return model, tokenizer
Example #12
def get_cnn(in_channels, out_channels, kernel_heights, stride, padding, dropout_prob):
    """
    Creates a new CNN and tokenizer using the given parameters
    :return:
    """
    # Load English model with 25k word-pieces
    tokenizer = BPEmb(lang='en', dim=300, vs=25000)
    # Extract the embeddings and add a zero-initialized embedding for our extra [PAD] token
    pretrained_embeddings = np.concatenate([tokenizer.emb.vectors, np.zeros(shape=(1, 300))], axis=0)
    # Extract the vocab and add an extra [PAD] token
    vocabulary = tokenizer.emb.index2word + ['[PAD]']
    tokenizer.pad_token_id = len(vocabulary) - 1

    model = CNN(
        torch.tensor(pretrained_embeddings).type(torch.FloatTensor),
        n_labels=2,
        in_channels=in_channels,
        out_channels=out_channels,
        kernel_heights=kernel_heights,
        stride=stride,
        padding=padding,
        dropout=dropout_prob
    ).to(device)

    return model, tokenizer
Example #13
 def __init__(self,
              lang="en",
              dim=200,
              vs=200000,
              distance_metric="cosine"):
     from bpemb import BPEmb
     self.bpemb = BPEmb(lang=lang, dim=dim, vs=vs)
     self.distance_metric = distance_metric
Example #14
 def __init__(self, path=config.path_to_data, mode='train'):
     self.path_to_data = path
     self.mode = mode
     print(f"Loading {self.mode} data...")
     self.data = self.read_data()
     self.preprocess_data()
     self.bpemb_ru = BPEmb(lang="ru", dim=300, vs=50000)
     self.placeholder = torch.zeros(config.max_seq_length, dtype=torch.long)
Example #15
 def __init__(
     self, lang, vs=10000, dim=100, cache_dir=Path.home() / Path(".cache/bpemb")
 ):
     self.lang = lang
     self.vs = vs
     self.dim = dim
     self.cache_dir = cache_dir
     self.module = BPEmb(lang=lang, vs=vs, dim=dim, cache_dir=cache_dir)
Example #16
def load_bpe(vocab_size):
    """ Load pre-trained byte pair embedding models.

    Return src, trg
    """
    bpemb_tr = BPEmb(lang="tr", vs=vocab_size)
    bpemb_en = BPEmb(lang="en", vs=vocab_size)
    return bpemb_tr, bpemb_en
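A brief usage sketch for load_bpe above; vocab_size=25000 is an arbitrary choice, and both models are downloaded on first use:

bpemb_tr, bpemb_en = load_bpe(25000)
src_ids = bpemb_tr.encode_ids("Kitap okuyordu.")        # Turkish source sentence as subword ids
trg_ids = bpemb_en.encode_ids("She was reading a book.")  # English target sentence as subword ids
print(src_ids, trg_ids)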
Example #17
    def __init__(self, **kwargs):
        lang = kwargs.get("lang", "en")
        vs = kwargs.get("limit", 200000)

        self.bpemb = BPEmb(lang=lang, vs=vs)
        self.tokenizer = SpacyTokenizer(model="en",
                                        annotators=["lemma", "pos", "ner"])
        self.annotators = self.tokenizer.annotators
Example #18
def test_decoding():
    # Although a <pad> embedding is added, decode_ids cannot handle the pad id, so remove padding before decoding.
    # Decoding removes start/end tokens.
    bpemb_en = BPEmb(lang="en", add_pad_emb=True)
    # ids = [1, 215, 80, 8526, 1221, 2]
    ids = [[1, 215, 80, 8526, 1221, 2], [1, 215, 80, 8526, 1221, 2]]
    # ids = [1, 215, 80, 8526, 1221, 2, 10000, 10000]
    # print(bpemb_en.vectors[10000])
    print(bpemb_en.decode_ids(ids))
Example #19
def clean_sub_word_sentence(word_ids: np.array, bpemb: BPEmb):
    # Shift ids down by 1 to undo the offset introduced by the extra padding token
    word_ids = word_ids - 1
    try:
        index = list(word_ids).index(bpemb.EOS)
        words = bpemb.decode_ids(word_ids[:index])
    except ValueError:  # No EOS found in sequence
        words = bpemb.decode_ids(word_ids)

    return words
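A hedged round-trip sketch for clean_sub_word_sentence above; it assumes the model's outputs were shifted by +1 so that id 0 could serve as padding (which is why the function subtracts 1), and that decode_ids accepts the NumPy slice exactly as the function passes it:

import numpy as np
from bpemb import BPEmb

bpemb_en = BPEmb(lang="en", vs=10000, add_pad_emb=True)
# simulate a model output: BPEmb ids with EOS appended, shifted by +1
shifted = np.array(bpemb_en.encode_ids_with_eos("hello world")) + 1
print(clean_sub_word_sentence(shifted, bpemb_en))  # decodes everything before EOS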
Example #20
    def __init__(self, config):

        self._eval_data_path = os.path.join(
            os.path.abspath(os.path.dirname(os.getcwd())), config["eval_data"])
        self._output_path = os.path.join(
            os.path.abspath(os.path.dirname(os.getcwd())),
            config["output_path"])

        self.bpe_zh = BPEmb(lang="zh", vs=config["vocab_size"])
        self.pad_token = 0
        self.eos_token = 3
Example #21
    def __init__(self,
                 component_config: Optional[Dict[Text, Any]] = None) -> None:
        super().__init__(component_config)

        self.model = BPEmb(
            lang=self.component_config["lang"],
            dim=self.component_config["dim"],
            vs=self.component_config["vs"],
            vs_fallback=self.component_config["vs_fallback"],
            cache_dir=self.component_config["cache_dir"],
        )
Example #22
    def __init__(self, embed_size, output_size, enc_hidden_size):
        super(TopicCEncSimpleBPemb, self).__init__()
        print("init: TopicCEncSimpleBPemb model")

        self.embedding_model = BPEmb(dim=embed_size, lang="en", vs=100000)
        self.encoder = nn.GRU(input_size=embed_size,
                              hidden_size=enc_hidden_size,
                              num_layers=1,
                              bidirectional=True)
        self.seq_to_output_map = nn.Linear(2 * enc_hidden_size,
                                           output_size,
                                           bias=False)
Example #23
def process(texts, vocab_size=25000, dim=300):
    emb = BPEmb(lang='de', vs=vocab_size, dim=dim)

    texts = [emb.encode(t) for t in texts]

    unique_words = set([w for t in texts for w in t])
    vecs = [
        wv for (i, wv) in enumerate(zip(emb.words, emb.vectors))
        if i < 3 or wv[0] in unique_words
    ]  # keep the first three special tokens plus the subwords that occur in the texts

    return texts, vecs
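A short usage sketch for process above; the sentences are made up, and the German 25k/300d model is downloaded on first use:

texts, vecs = process(["Das ist ein Beispiel.", "Noch ein Satz."])
print(texts[0])   # subword pieces of the first sentence
print(len(vecs))  # special tokens plus (word, vector) pairs for subwords that occur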
Example #24
    def __init__(self, conf, lang, bert=None):
        self.conf = conf
        self.lang = lang
        self.bert = bert
        self.device = torch.device(f"cuda:{conf.gpu_id}")
        self.name = conf.dataset
        self.tag = conf.tag
        self.batch_size = conf.batch_size
        self.eval_batch_size = conf.eval_batch_size
        self.examples_to_print = conf.n_examples

        if self.conf.tag_scheme:
            self.convert_tags = iob_to[self.tag_scheme]
        self.load_data_raw()
        self.NO_TAG = "NO_TAG"
        tags = self.get_tags()
        print(Counter(tags).most_common())
        shapes = self.get_shapes()
        char_enc = None
        if conf.char_enc_file:
            assert Path(conf.char_enc_file).exists()
            char_enc = joblib.load(conf.char_enc_file)
        if self.name.endswith("multi_finetune"):
            assert char_enc
        if char_enc:
            self.char_enc = char_enc
        else:
            chars = self.get_chars()
            self.char_enc = LabelEncoder(
                to_torch=True, device=self.device).fit(chars)
        tag_enc = None
        if conf.tag_enc_file:
            assert Path(conf.tag_enc_file).exists()
            tag_enc = joblib.load(conf.tag_enc_file)
        if tag_enc:
            self.tag_enc = tag_enc
        else:
            self.tag_enc = LabelEncoder(
                to_torch=True, device=self.device).fit(tags)
        self.shape_enc = LabelEncoder(
            to_torch=True, device=self.device).fit(shapes)

        self.bpemb = BPEmb(
            lang=conf.bpemb_lang,
            vs=conf.vocab_size,
            dim=conf.bpemb_dim,
            add_pad_emb=True)
        if conf.use_fasttext:
            f = conf.fasttext_emb_file.format(dataset=self.name, lang=lang)
            self.fasttext_emb = load_word2vec_file(f, add_unk=True)
        self.pad_idx = self.bpemb.emb.key_to_index["<pad>"]
        if not conf.no_dataset_tensorize:
            self.tensorize()
Example #25
 def __init__(
     self,
     config: Dict[Text, Any],
     name: Text,
 ) -> None:
     """Constructs a new byte pair vectorizer."""
     super().__init__(name, config)
     # The configuration dictionary is saved in `self._config` for reference.
     self.model = BPEmb(
         lang=self._config["lang"],
         dim=self._config["dim"],
         vs=self._config["vs"],
         vs_fallback=self._config["vs_fallback"],
     )
Example #26
def get_multibpe_embeddings(x: List[str],
                            multibpemb=None,
                            vs=1000000,
                            dim=300):
    if multibpemb is None:
        multibpemb = BPEmb(lang="multi", vs=vs, dim=dim)

    embeddings = []
    for sentence in x:
        features = multibpemb.embed(sentence)
        embeddings.append(features)

    embeddings = pad(embeddings, [0 for _ in range(dim)], 32)
    return embeddings
Example #27
 def __init__(self, lang: str, dim: int = 300, vs: int = 100000, add_pad_emb: bool = True):
     super().__init__()
     try:
         from bpemb import BPEmb
         self.embedder = BPEmb(lang=lang, dim=dim, vs=vs, add_pad_emb=add_pad_emb, cache_dir=Path(aikido.cache_root) / "embeddings")
         self.embeddings_ = nn.Embedding.from_pretrained(tensor(self.embedder.vectors, dtype=torch.float),
                                                         padding_idx=vs)
         self.dim_ = dim
         self.vs_ = vs
     except ImportError:
         logging.error("-" * 100)
         logging.error("no bpemb installation found. see https://github.com/bheinzerling/bpemb")
         logging.error("-" * 100)
         pass
Example #28
def test():
    bpemb_en = BPEmb(lang="en", dim=100)
    s = "Stratford"
    res1 = bpemb_en.encode(s)
    res2 = bpemb_en.encode_ids(s)
    print(res1)
    print(res2)

    bpemb_en_100k = BPEmb(lang="en", vs=100000, dim=100)  # 40 M; the larger the vocabulary, the fewer the subword splits
    s = "hello world !"
    bpemb_en_100k.encode_ids(s)
    res1 = bpemb_en_100k.encode(s)
    res2 = bpemb_en_100k.encode_ids(s)
    print(res1)
    print(res2)
Example #29
def make_byte_pair(corpus):
    '''Encode the corpus with pretrained byte-pair embeddings (BPEmb).'''

    # the bpe model
    bpemb_en = BPEmb(lang="en")

    # remove stopwords with this helper so that memory usage stays low
    tokenized_corpus = tokenize_preprocess_corpus(corpus)
    documents = []

    for word_tokens in tokenized_corpus:
        sentence = ' '.join(word_tokens)
        documents.append(bpemb_en.encode(sentence))

    return documents
Example #30
class TweetTokenizer():
    def __init__(self, dim=50, vocab_size=10000, mode='get_id'):
        self.dim = dim
        self.vocab_size = vocab_size
        self.bpemb_en = BPEmb(lang="en", dim=dim, vs=vocab_size)
        self.embedding_weight = self.bpemb_en.vectors
        self.mode = mode
    
    def __call__(self, tweet, mode='get_id'):
        if mode == 'get_id':
            return torch.tensor(self.bpemb_en.encode_ids(tweet), dtype=torch.long)
        elif mode == 'raw':
            return self.bpemb_en.encode(tweet)
        else:
            raise ValueError('Invalid mode')
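A usage sketch for the TweetTokenizer class above; the tweet text is made up, and torch plus bpemb must be installed and imported:

tokenizer = TweetTokenizer(dim=50, vocab_size=10000)
id_tensor = tokenizer("just landed in stratford", mode='get_id')  # torch.LongTensor of subword ids
pieces = tokenizer("just landed in stratford", mode='raw')        # list of subword strings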