import argparse

from bpemb import BPEmb


class Hparams:
    parser = argparse.ArgumentParser()

    bpemb_en = BPEmb(lang="en", dim=50)
    bpemb_de = BPEmb(lang='de', dim=50)

    # preprocess
    parser.add_argument('--BUFFER_SIZE', default=10000)
    parser.add_argument('--batch_size', default=64)
    parser.add_argument('--maxlen', default=40, help='max length of sentences')
    parser.add_argument('--tokenizer_de',
                        default=bpemb_de,
                        help='encoding method')
    parser.add_argument('--tokenizer_en',
                        default=bpemb_en,
                        help='decoding method')

    # train
    parser.add_argument('--num_layers',
                        default=4,
                        help='blocks number of encoder and decoder')
    parser.add_argument('--d_model', default=128)
    parser.add_argument('--dff', default=512)
    parser.add_argument('--num_heads', default=8)
    parser.add_argument('--dropout_rate', default=0.1)
    parser.add_argument('--checkpoint_dir', default='./checkpoints/train')
    parser.add_argument('--checkpoint_dir_de', default='./checkpoints/de_en')
    parser.add_argument('--epochs', default=10)
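A minimal usage sketch, assuming the Hparams class and imports above (the empty argv list simply picks up the defaults; the prints are illustrative):

hparams, _ = Hparams.parser.parse_known_args([])
print(hparams.num_layers, hparams.d_model, hparams.maxlen)
print(hparams.tokenizer_en.encode("hello world"))  # BPE pieces from the default English tokenizer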
Example #2
def load_bpe(vocab_size):
    """ Load pre-trained byte pair embedding models.

    Return src, trg
    """
    bpemb_tr = BPEmb(lang="tr", vs=vocab_size)
    bpemb_en = BPEmb(lang="en", vs=vocab_size)
    return bpemb_tr, bpemb_en
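A minimal usage sketch for load_bpe, assuming BPEmb is imported as above; the vocabulary size 25000 is illustrative:

src, trg = load_bpe(25000)
print(src.encode_ids("Kitap okuyordu."))     # Turkish source ids
print(trg.encode_ids("This is Stratford"))   # English target ids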
    def __init__(self, bpe_info, padding_info):
        super().__init__()
        self._bpe_info = bpe_info
        self._padding_info = padding_info

        self._shared_bpe = None
        self._encoder_bpe = None
        self._decoder_bpe = None
        if "shared_bpe" in self._bpe_info:
            self._shared_bpe = BPEmb(**self._bpe_info["shared_bpe"])
            self._encoder_bpe = self._shared_bpe
            self._decoder_bpe = self._shared_bpe
        else:
            self._encoder_bpe = BPEmb(**self._bpe_info["encoder_bpe"])
            self._decoder_bpe = BPEmb(**self._bpe_info["decoder_bpe"])
Example #4
def test():
    bpemb_en = BPEmb(lang="en", dim=100)
    s = "Stratford"
    res1 = bpemb_en.encode(s)
    res2 = bpemb_en.encode_ids(s)
    print(res1)
    print(res2)

    bpemb_en_100k = BPEmb(lang="en", vs=100000, dim=100)  # 40 M; the larger the vocabulary, the fewer subword splits
    s = "hello world !"
    res1 = bpemb_en_100k.encode(s)
    res2 = bpemb_en_100k.encode_ids(s)
    print(res1)
    print(res2)
Example #5
 def __init__(self, lang='ru', pretrained=True, vocab_size=100000, dim=300):
     self.lang = lang
     self.pretrained = pretrained
     self.bpe = BPEmb(lang=self.lang,
                      vs=vocab_size,
                      dim=dim,
                      vs_fallback=True)
Example #6
 def __init__(self,
              output_dim,
              vocab_size=10000,
              embed_dim=50,
              lang='en',
              embedding_preload=True,
              gpu_id=-1,
              dropout=0):
     super(LanguagePeripheral, self).__init__()
     self.gpu_id = gpu_id
     self.pad_char = vocab_size
     self.bpe_encoder = BPEmb(lang=lang,
                              vs=vocab_size,
                              dim=embed_dim,
                              add_pad_emb=True)
     # Add an extra padding character
     self.embed_layer = nn.Embedding(vocab_size + 1,
                                     embed_dim,
                                     padding_idx=self.pad_char)
     if embedding_preload:
         self.embed_layer.load_state_dict(
             {'weight': torch.tensor(self.bpe_encoder.emb.vectors)})
         print("Loading pretrained word embeddings.")
     self.enc_dropout = nn.Dropout(dropout)
     self.output = nn.Linear(embed_dim, output_dim)
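The add_pad_emb=True flag is what makes nn.Embedding(vocab_size + 1, ...) line up with the pre-trained vectors: BPEmb appends one extra <pad> row after the vs subword vectors. A small standalone check (vocabulary size and dimension are illustrative):

from bpemb import BPEmb

bpe = BPEmb(lang="en", vs=10000, dim=50, add_pad_emb=True)
print(bpe.vectors.shape)  # (10001, 50): vs + 1 rows, the last one being the <pad> embedding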
Example #7
class EmbVectorizer(BaseEstimator, TransformerMixin):
    """
    Adds embedding features for passed text
    """
    bpemb = BPEmb(lang="uk")

    def __init__(self):
        ...

    def fit(self, documents, y=None):
        return self

    def calc_emb(self, text):
        res = np.zeros(EmbVectorizer.bpemb.vectors.shape[1], dtype=np.float32)
        # tokens = word_tokenize(text)
        # for t in tokens:
        embs = EmbVectorizer.bpemb.embed(text)
        for e in embs:
            res += e
        n = len(embs)
        if n:
            res /= n
        return res

    def transform(self, X):
        res = []
        for row in range(0, len(X)):
            res.append(self.calc_emb(X[row]))
        np_res = np.array(res, dtype=float)
        return np_res
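A usage sketch for the vectorizer above (the Ukrainian sample sentences are illustrative):

docs = ["Добрий день", "Дякую за допомогу"]
vectorizer = EmbVectorizer()
X = vectorizer.fit(docs).transform(docs)
print(X.shape)  # (n_documents, embedding_dim)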
 def __init__(self, vocab_size, embedding_dim, max_sequence_len):
     self.vocab_size = vocab_size
     self.embedding_dim = embedding_dim
     self.max_sequence_len = max_sequence_len
     self.bpemb_en_100k = BPEmb(lang="en",
                                vs=self.vocab_size,
                                dim=self.embedding_dim)  # 40 M; the larger the vocabulary, the fewer subword splits
 def load(cls, conf, lang, bert=None):
     mkdir(conf.cache_dir)
     fasttext_emb = conf.fasttext_emb_file if conf.use_fasttext else None
     fname = (
         f"{conf.dataset}.{lang}." +
         (f"max{conf.max_ninst}." if conf.max_ninst else "") +
         (f"maxeval{conf.max_eval_ninst}." if conf.max_eval_ninst else "") +
         (f"cv{conf.crossval_idx}." if conf.crossval_idx is not None else "") +
         (f"bert{conf.bert_max_seq_len}." if bert is not None else "") +
         (f"fasttext." if fasttext_emb is not None else "") +
         f"vs{conf.vocab_size}.{conf.tag}." +
         (f"{conf.tag_scheme}." if conf.tag_scheme else "") +
         "pt"
         )
     cache_file = conf.cache_dir / fname
     ds = None
     try:
         print("loading", cache_file)
         ds = torch.load(cache_file)
         print("loaded", cache_file)
         ds.bpemb = BPEmb(
             lang=conf.bpemb_lang,
             vs=conf.vocab_size,
             dim=conf.bpemb_dim,
             add_pad_emb=True)
     except FileNotFoundError:
         pass
     if ds is None:
         print(f"Loading dataset {conf.dataset} {lang}")
         ds = cls(conf, lang, bert=bert)
         bpemb = ds.bpemb
         ds.bpemb = None  # cannot pickle SwigPyObject
         torch.save(ds, cache_file)
         ds.bpemb = bpemb
     return ds
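The ds.bpemb = None / reattach dance above exists because the SentencePiece processor wrapped by BPEmb (a SwigPyObject) may not be picklable. A standalone sketch of the same caching pattern; the class name, path, and sizes are illustrative:

import torch
from bpemb import BPEmb

class TinyDataset:
    """Stand-in for the real dataset class; only the caching pattern matters here."""
    def __init__(self):
        self.examples = [[1, 2, 3]]
        self.bpemb = BPEmb(lang="en", vs=10000, dim=100, add_pad_emb=True)

ds = TinyDataset()
bpemb = ds.bpemb
ds.bpemb = None                    # drop the (possibly unpicklable) BPEmb before saving
torch.save(ds, "tiny_dataset.pt")
ds.bpemb = bpemb                   # reattach for the current process

ds2 = torch.load("tiny_dataset.pt")  # may need weights_only=False on newer PyTorch
ds2.bpemb = BPEmb(lang="en", vs=10000, dim=100, add_pad_emb=True)  # reattach after loading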
def get_transformer(ff_dim: int, n_layers: int, n_heads: int,
                    dropout_prob: float):
    """
    Creates a new transformer and tokenizer using the given parameters
    :param ff_dim:
    :param n_layers:
    :param n_heads:
    :param dropout_prob:
    :return:
    """
    # Load english model with 25k word-pieces
    tokenizer = BPEmb(lang='en', dim=300, vs=25000)
    # Extract the embeddings and add a randomly initialized embedding for our extra [PAD] token
    pretrained_embeddings = np.concatenate(
        [tokenizer.emb.vectors,
         np.zeros(shape=(1, 300))], axis=0)
    # Extract the vocab and add an extra [PAD] token
    vocabulary = tokenizer.emb.index2word + ['[PAD]']
    tokenizer.pad_token_id = len(vocabulary) - 1

    model = TransformerClassifier(torch.tensor(pretrained_embeddings).type(
        torch.FloatTensor),
                                  ff_dim=ff_dim,
                                  d_model=300,
                                  n_heads=n_heads,
                                  n_layers=n_layers,
                                  dropout_prob=dropout_prob).to(device)

    return model, tokenizer
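The model itself depends on TransformerClassifier and device defined elsewhere, but the [PAD] bookkeeping can be shown standalone. A sketch of padding a small batch with the extra id (the sentences and max length are illustrative):

import numpy as np
from bpemb import BPEmb

tokenizer = BPEmb(lang='en', dim=300, vs=25000)
pad_token_id = tokenizer.vectors.shape[0]  # the [PAD] row is appended after the vs pre-trained vectors
sentences = ["This is Stratford", "Hello world"]
max_len = 8
batch = [tokenizer.encode_ids(s)[:max_len] for s in sentences]
batch = [ids + [pad_token_id] * (max_len - len(ids)) for ids in batch]
print(np.array(batch))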
Example #11
def main(args=None) -> None:
    """
    CLI function to manually download all the dependencies for a pre-trained model.

    Example of usage:

    .. code-block:: sh

        download_model fasttext
    """
    if args is None:
        args = sys.argv[1:]

    parsed_args = get_args(args)

    model_type = parsed_args.model_type

    if "fasttext" in model_type and "fasttext-light" not in model_type:
        download_fasttext_embeddings(saving_dir=CACHE_PATH)
    elif model_type == "fasttext-light":
        download_fasttext_magnitude_embeddings(saving_dir=CACHE_PATH)
    elif "bpemb" in model_type:
        BPEmb(
            lang="multi", vs=100000, dim=300
        )  # The class manages the download of the pre-trained word embeddings

    model_path = os.path.join(CACHE_PATH, f"{model_type}.ckpt")
    version_path = os.path.join(CACHE_PATH, f"{model_type}.version")
    if not os.path.isfile(model_path) or not os.path.isfile(version_path):
        download_weights(model_type, CACHE_PATH)
    elif not latest_version(model_type, cache_path=CACHE_PATH):
        print(
            "A new version of the pre-trained model is available. The newest model will be downloaded."
        )
        download_weights(model_type, CACHE_PATH)
 def __init__(self, predictor_config):
     predictor_config = predictor_config['vectorizer']
     self.bpemb = BPEmb(lang='en',
                        dim=predictor_config['embedding_dim'],
                        vs=predictor_config['max_vocab_size'],
                        add_pad_emb=True)
     self.max_seq_len = predictor_config['max_seq_len']
Example #13
def get_vocab(main_config, args, logger):
    main_cfg = MainConfig(main_config, args)
    vocab_loaded = False
    # load the vocab and bpe files if they exist; otherwise create everything from scratch
    if check_file_exists(main_cfg.data_dir, main_cfg.vocab_file) and \
            check_file_exists(main_cfg.data_dir, main_cfg.train_bpe_file) and \
            check_file_exists(main_cfg.data_dir, main_cfg.dev_bpe_file) and \
            check_file_exists(main_cfg.data_dir, main_cfg.test_bpe_file):
        logger.info('"{}" and bpe files found in data folder, loading stats.'.format(main_cfg.vocab_file))
        vocab = load_vocab(main_cfg)
        vocab_loaded = True
    else:
        logger.info('No "{}" or bpe files found in data folder, creating new vocab.'.format(main_cfg.vocab_file))
        bpemb = BPEmb(lang="en", dim=main_cfg.embedding_size, vs=main_cfg.emb_vocab_size)
        logger.info('BPEmb loaded')
        vocab = create_vocab(main_cfg, bpemb)
        write_vocab(main_cfg, vocab)

    logger.info('Max sequence length of {} covers {}% of sentences'.format(vocab.max_seq_length, SEQ_LEN_THRESHOLD))

    # if the vocab was created anew, we should also re-create the embeddings
    if not vocab_loaded or not check_file_exists("", main_cfg.embeddings):
        logger.info('No embedding matrix found, loading embeddings.')
        create_embeddings(main_cfg, vocab)
        logger.info('Saved embedding matrix to "{}" in data folder.'.format(main_cfg.embeddings))
    else:
        logger.info('Embedding matrix found in data folder.')

    return int(vocab.max_seq_length), int(vocab.size)
def preprocess(emb_dim, word_vocab_size):
    make_dirs()

    box_file_path = config.ORG_TRAIN_DATA_PATH + "/train.box"
    field_vocab_path = config.PRC_TRAIN_DATA_PATH + "/field.vocab"
    field_dict_path = config.PRC_TRAIN_DATA_PATH + "/field.dict"
    word_dict_path = config.PRC_TRAIN_DATA_PATH + "/word.dict"

    bpemb_en = BPEmb(lang="en", dim=emb_dim, vs=word_vocab_size)

    metadata = PreprocessMetadata(emb_dim, word_vocab_size, word_dict_path,
                                  field_dict_path)
    metadata.init_bpe_module()

    field_vocab = create_field_label_vocab(box_file_path, field_vocab_path)
    field_dict = LabelDict.get(vocab=list(field_vocab),
                               dict_binpath=field_dict_path)
    bpe_dict = BpeWordDict.get(vocab=bpemb_en.words,
                               dict_binpath=word_dict_path)

    print("Saving metadata")
    torch.save(metadata, config.PRC_TRAIN_DATA_PATH + '/metadata.bin')

    skipped_boxes = prepare_infobox_datasets(field_dict, bpemb_en)
    prepare_articles_dataset(field_dict, bpemb_en, skipped_boxes)

    create_mono_datasets(field_dict, bpemb_en)

    print("Preprocessing done")

    return bpemb_en, bpe_dict, field_dict
Example #15
def test_punctuation():
    text = [
        "Leonidas: This's Sparta!!", "Leonidas : This ' s Sparta ! !",
        "Leonidas This s Sparta"
    ]
    bpemb_multi = BPEmb(lang="multi", add_pad_emb=True)
    print(bpemb_multi.encode(text))
Example #16
def test_multi_language():
    text = ["This is Stratford", "Kitap okuyordu."]
    bpemb_multi = BPEmb(lang="multi", add_pad_emb=True)
    print(bpemb_multi.encode_ids_with_bos_eos(text))
    print(
        bpemb_multi.decode_ids([[1, 5496, 200, 23866, 3927, 2],
                                [1, 45350, 44934, 67191, 94777, 2]]))
Example #17
    def __init__(self, vocab, args, audio_conf, manifest_filepath_list, normalize=False, augment=False, input_type="char", is_train=False):
        """
        Dataset that loads tensors via a csv containing file paths to audio files and transcripts separated by
        a comma. Each new line is a different sample. Example below:
        /path/to/audio.wav,/path/to/audio.txt
        ...
        :param audio_conf: Dictionary containing the sample rate, window and the window length/stride in seconds
        :param manifest_filepath: Path to manifest csv as described above
        :param labels: String containing all the possible characters to map to
        :param normalize: Apply standard mean and deviation normalization to audio tensor
        :param augment: Apply random tempo and gain perturbations (default: False)
        """
        self.max_size = 0
        self.ids_list = []
        for i in range(len(manifest_filepath_list)):
            manifest_filepath = manifest_filepath_list[i]
            ids = pd.read_csv(manifest_filepath, header=None).values.tolist()
            self.ids_list.append(ids)
            self.max_size = max(len(ids), self.max_size)

        self.max_size = self.max_size * len(manifest_filepath_list)
        print("max_size:", self.max_size)

        print("input_type:", input_type)
        self.input_type = input_type
        self.manifest_filepath_list = manifest_filepath_list
        self.normalize = normalize
        self.vocab = vocab

        if self.input_type == "bpe":
            self.bpeemb_list = []
            for i in range(len(self.lang_list)):
                lang = self.lang_list[i].replace("<","").replace(">","").lower()
                self.bpeemb_list.append(BPEmb(lang=lang, vs=1000))
        super(LogFBankDataset, self).__init__()
 def get_embedding_vec(self, word):
     if self.model is None:
         self.model = BPEmb(lang="en", dim=self.dim, vs=self.bp_vocab_size)
     if not self.case_sensitive:
         word = word.lower()
     vecs = self.model.embed(word)
     return np.reshape(np.sum(vecs, axis=0), (self.dim, ))
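A standalone sketch of the subword pooling used in get_embedding_vec (dimension and vocabulary size are illustrative):

import numpy as np
from bpemb import BPEmb

bpe = BPEmb(lang="en", dim=100, vs=25000)
vecs = bpe.embed("stratford")    # one 100-dim vector per subword piece
word_vec = np.sum(vecs, axis=0)  # sum-pool into a single 100-dim word vector
print(vecs.shape, word_vec.shape)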
Example #19
def get_cnn(in_channels, out_channels, kernel_heights, stride, padding, dropout_prob):
    """
    Creates a new CNN and tokenizer using the given parameters
    :return:
    """
    # Load english model with 25k word-pieces
    tokenizer = BPEmb(lang='en', dim=300, vs=25000)
    # Extract the embeddings and add a randomly initialized embedding for our extra [PAD] token
    pretrained_embeddings = np.concatenate([tokenizer.emb.vectors, np.zeros(shape=(1, 300))], axis=0)
    # Extract the vocab and add an extra [PAD] token
    vocabulary = tokenizer.emb.index2word + ['[PAD]']
    tokenizer.pad_token_id = len(vocabulary) - 1

    model = CNN(
        torch.tensor(pretrained_embeddings).type(torch.FloatTensor),
        n_labels=2,
        in_channels=in_channels,
        out_channels=out_channels,
        kernel_heights=kernel_heights,
        stride=stride,
        padding=padding,
        dropout=dropout_prob
    ).to(device)

    return model, tokenizer
 def __init__(self, verbose: bool = True, **kwargs) -> None:
     super().__init__(verbose=verbose)
     with warnings.catch_warnings():
         # suppress annoying scipy.sparsetools private-module warnings
         # and annoying boto warnings
         warnings.filterwarnings("ignore")
         model = BPEmb(**kwargs)
     self.model = model
    def __init__(self, **kwargs):
        lang = kwargs.get("lang", "en")
        vs = kwargs.get("limit", 200000)

        self.bpemb = BPEmb(lang=lang, vs=vs)
        self.tokenizer = SpacyTokenizer(model="en",
                                        annotators=["lemma", "pos", "ner"])
        self.annotators = self.tokenizer.annotators
Example #22
 def __init__(self,
              lang="en",
              dim=200,
              vs=200000,
              distance_metric="cosine"):
     from bpemb import BPEmb
     self.bpemb = BPEmb(lang=lang, dim=dim, vs=vs)
     self.distance_metric = distance_metric
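A sketch of how such a wrapper might compare two strings with the chosen metric; the distance helper below is illustrative, not the original class's method:

from scipy.spatial.distance import cosine
from bpemb import BPEmb

bpemb = BPEmb(lang="en", dim=200, vs=200000)

def distance(a: str, b: str) -> float:
    # average the subword embeddings of each string, then take the cosine distance
    va = bpemb.embed(a).mean(axis=0)
    vb = bpemb.embed(b).mean(axis=0)
    return cosine(va, vb)

print(distance("Stratford", "London"))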
Example #23
 def __init__(self, path=config.path_to_data, mode='train'):
     self.path_to_data = path
     self.mode = mode
     print(f"Loading {self.mode} data...")
     self.data = self.read_data()
     self.preprocess_data()
     self.bpemb_ru = BPEmb(lang="ru", dim=300, vs=50000)
     self.placeholder = torch.zeros(config.max_seq_length, dtype=torch.long)
Example #24
 def __init__(
     self, lang, vs=10000, dim=100, cache_dir=Path.home() / Path(".cache/bpemb")
 ):
     self.lang = lang
     self.vs = vs
     self.dim = dim
     self.cache_dir = cache_dir
     self.module = BPEmb(lang=lang, vs=vs, dim=dim, cache_dir=cache_dir)
def loadBPE(lang, vector_size):
    """
    It automatically downloads the embedding file and loads it as a gensim keyed vector
    :param lang: langauge is enough, no need for embedding file
    :return:
    """
    model = BPEmb(lang=lang, dim=vector_size)
    return model
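Usage sketch for loadBPE (language and vector size are illustrative); the returned BPEmb object exposes the subword encoder directly and the gensim keyed vectors under .emb:

model = loadBPE("en", 100)
print(model.encode("stratford"))  # subword pieces
print(model.vectors.shape)        # (10000, 100) with the default vocabulary size
print(model.emb.most_similar(model.encode("cat")[0], topn=3))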
Example #26
def test_decoding():
    # Although a <pad> token is added, decode_ids cannot handle it, so remove padding before decoding.
    # Decoding removes start/end tokens.
    bpemb_en = BPEmb(lang="en", add_pad_emb=True)
    # ids = [1, 215, 80, 8526, 1221, 2]
    ids = [[1, 215, 80, 8526, 1221, 2], [1, 215, 80, 8526, 1221, 2]]
    # ids = [1, 215, 80, 8526, 1221, 2, 10000, 10000]
    # print(bpemb_en.vectors[10000])
    print(bpemb_en.decode_ids(ids))
Example #27
 def __init__(self, verbose: bool = True) -> None:
     super().__init__(verbose=verbose)
     with warnings.catch_warnings():
         # suppress annoying scipy.sparsetools private-module warnings
         # and annoying boto warnings
         warnings.filterwarnings("ignore")
         model = BPEmb(lang="multi", vs=100000,
                       dim=300)  # default parameters
     self.model = model
Example #28
def load_pretrained_embedding_bpe(embedding_matrix):
    """ load bpe embedding; add <pad> as id=0 """

    bpemb = BPEmb(lang="en", vs=25000, dim=200)
    embedding_matrix[1:] = bpemb.vectors
    print('loaded bpe pre-trained embedding')
    print('embedding vectors count:', embedding_matrix.shape[0])

    return embedding_matrix
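A usage sketch, assuming the function above: the matrix needs one extra leading row because <pad> occupies id 0, so its shape is (vs + 1, dim):

import numpy as np

embedding_matrix = np.zeros((25001, 200), dtype=np.float32)  # row 0 is reserved for <pad>
embedding_matrix = load_pretrained_embedding_bpe(embedding_matrix)
print(embedding_matrix.shape)  # (25001, 200)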
def convert_dataset(dataset: list, char2i: dict, labels2i: dict) -> List[Any]:
    bpemb_en = BPEmb(lang='en', vs=200000, dim=300)

    d = {'O': 1, 'B': 2, 'I': 3}
    for i, (a, l) in enumerate(dataset):
        a, l = data2words((a, l), bpemb_en, char2i)
        dataset[i] = (np.array(a), np.array([d[x] for x in l]))
    return dataset
def prepare_bpe_weights(lang, vs, dim, weight_path=None):
    # The embedding weights and the preprocessor weights should be the same.
    # Be careful if the weight computation is not deterministic.
    # The requirement may be relaxed depending on the case, e.g. the weight
    # values may differ between the embedder and the processor, but the ids
    # must still map to the same words in the same order.
    if weight_path is None:
        bpe = BPEmb(lang=lang, add_pad_emb=True, vs=vs, dim=dim)
        weights = bpe.vectors
    else:
        if check_file_exists(weight_path):
            weights = load_from_pickle(weight_path)
        else:
            bpe = BPEmb(lang=lang, add_pad_emb=True, vs=vs, dim=dim)
            weights = bpe.vectors
            save_to_pickle(weights, weight_path)

    return weights
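Usage sketch with no cache file, assuming the function above (sizes are illustrative); because add_pad_emb=True, the returned matrix has vs + 1 rows:

weights = prepare_bpe_weights("en", vs=10000, dim=100)
print(weights.shape)  # (10001, 100)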