class Hparams:
    parser = argparse.ArgumentParser()
    bpemb_en = BPEmb(lang="en", dim=50)
    bpemb_de = BPEmb(lang="de", dim=50)
    # preprocess
    parser.add_argument('--BUFFER_SIZE', default=10000)
    parser.add_argument('--batch_size', default=64)
    parser.add_argument('--maxlen', default=40, help='max length of sentences')
    parser.add_argument('--tokenizer_de', default=bpemb_de, help='encoding method')
    parser.add_argument('--tokenizer_en', default=bpemb_en, help='decoding method')
    # train
    parser.add_argument('--num_layers', default=4, help='number of encoder and decoder blocks')
    parser.add_argument('--d_model', default=128)
    parser.add_argument('--dff', default=512)
    parser.add_argument('--num_heads', default=8)
    parser.add_argument('--dropout_rate', default=0.1)
    parser.add_argument('--checkpoint_dir', default='./checkpoints/train')
    parser.add_argument('--checkpoint_dir_de', default='./checkpoints/de_en')
    parser.add_argument('--epochs', default=10)
def load_bpe(vocab_size):
    """Load pre-trained byte pair embedding models. Return src, trg."""
    bpemb_tr = BPEmb(lang="tr", vs=vocab_size)
    bpemb_en = BPEmb(lang="en", vs=vocab_size)
    return bpemb_tr, bpemb_en
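# A minimal usage sketch of load_bpe (not from the original code); the vocab size
# and sample sentences below are illustrative assumptions.
bpemb_tr, bpemb_en = load_bpe(10000)
print(bpemb_tr.encode_ids("Kitap okuyordu."))    # source-side subword ids
print(bpemb_en.encode_ids("This is Stratford"))  # target-side subword ids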
def __init__(self, bpe_info, padding_info):
    super().__init__()
    self._bpe_info = bpe_info
    self._padding_info = padding_info
    self._shared_bpe = None
    self._encoder_bpe = None
    self._decoder_bpe = None
    if "shared_bpe" in self._bpe_info:
        self._shared_bpe = BPEmb(**self._bpe_info["shared_bpe"])
        self._encoder_bpe = self._shared_bpe
        self._decoder_bpe = self._shared_bpe
    else:
        self._encoder_bpe = BPEmb(**self._bpe_info["encoder_bpe"])
        self._decoder_bpe = BPEmb(**self._bpe_info["decoder_bpe"])
def test():
    bpemb_en = BPEmb(lang="en", dim=100)
    s = "Stratford"
    res1 = bpemb_en.encode(s)
    res2 = bpemb_en.encode_ids(s)
    print(res1)
    print(res2)
    # 40M; the larger the vocabulary, the fewer subword splits
    bpemb_en_100k = BPEmb(lang="en", vs=100000, dim=100)
    s = "hello world !"
    res1 = bpemb_en_100k.encode(s)
    res2 = bpemb_en_100k.encode_ids(s)
    print(res1)
    print(res2)
def __init__(self, lang='ru', pretrained=True, vocab_size=100000, dim=300):
    self.lang = lang
    self.pretrained = pretrained
    self.bpe = BPEmb(lang=self.lang, vs=vocab_size, dim=dim, vs_fallback=True)
def __init__(self, output_dim, vocab_size=10000, embed_dim=50, lang='en',
             embedding_preload=True, gpu_id=-1, dropout=0):
    super(LanguagePeripheral, self).__init__()
    self.gpu_id = gpu_id
    self.pad_char = vocab_size
    # Add an extra padding character
    self.bpe_encoder = BPEmb(lang=lang, vs=vocab_size, dim=embed_dim, add_pad_emb=True)
    self.embed_layer = nn.Embedding(vocab_size + 1, embed_dim, padding_idx=self.pad_char)
    if embedding_preload:
        self.embed_layer.load_state_dict(
            {'weight': torch.tensor(self.bpe_encoder.emb.vectors)})
        print("Loading pretrained word embeddings.")
    self.enc_dropout = nn.Dropout(dropout)
    self.output = nn.Linear(embed_dim, output_dim)
class EmbVectorizer(BaseEstimator, TransformerMixin):
    """Adds embedding features for passed text."""

    bpemb = BPEmb(lang="uk")

    def __init__(self):
        ...

    def fit(self, documents, y=None):
        return self

    def calc_emb(self, text):
        res = np.zeros(EmbVectorizer.bpemb.vectors.shape[1], dtype=np.float32)
        # tokens = word_tokenize(text)
        # for t in tokens:
        embs = EmbVectorizer.bpemb.embed(text)
        for e in embs:
            res += e
        n = len(embs)
        if n:
            res /= n
        return res

    def transform(self, X):
        res = []
        for row in range(len(X)):
            res.append(self.calc_emb(X[row]))
        np_res = np.array(res, dtype=float)
        return np_res
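# A hedged usage sketch of the vectorizer (not from the original code); the Ukrainian
# sample strings are illustrative assumptions.
vec = EmbVectorizer()
texts = ["Доброго дня", "Дякую за допомогу"]
features = vec.fit_transform(texts)  # shape: (len(texts), embedding dim)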
def __init__(self, vocab_size, embedding_dim, max_sequence_len):
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.max_sequence_len = max_sequence_len
    # 40M; the larger the vocabulary, the fewer subword splits
    self.bpemb_en_100k = BPEmb(lang="en", vs=self.vocab_size, dim=self.embedding_dim)
def load(cls, conf, lang, bert=None):
    mkdir(conf.cache_dir)
    fasttext_emb = conf.fasttext_emb_file if conf.use_fasttext else None
    fname = (
        f"{conf.dataset}.{lang}."
        + (f"max{conf.max_ninst}." if conf.max_ninst else "")
        + (f"maxeval{conf.max_eval_ninst}." if conf.max_eval_ninst else "")
        + (f"cv{conf.crossval_idx}." if conf.crossval_idx is not None else "")
        + (f"bert{conf.bert_max_seq_len}." if bert is not None else "")
        + ("fasttext." if fasttext_emb is not None else "")
        + f"vs{conf.vocab_size}.{conf.tag}."
        + (f"{conf.tag_scheme}." if conf.tag_scheme else "")
        + "pt"
    )
    cache_file = conf.cache_dir / fname
    ds = None
    try:
        print("loading", cache_file)
        ds = torch.load(cache_file)
        print("loaded", cache_file)
        ds.bpemb = BPEmb(
            lang=conf.bpemb_lang,
            vs=conf.vocab_size,
            dim=conf.bpemb_dim,
            add_pad_emb=True)
    except FileNotFoundError:
        pass
    if ds is None:
        print(f"Loading dataset {conf.dataset} {lang}")
        ds = cls(conf, lang, bert=bert)
        bpemb = ds.bpemb
        ds.bpemb = None  # cannot pickle SwigPyObject
        torch.save(ds, cache_file)
        ds.bpemb = bpemb
    return ds
def get_transformer(ff_dim: int, n_layers: int, n_heads: int, dropout_prob: float):
    """
    Creates a new transformer and tokenizer using the given parameters.

    :param ff_dim: dimensionality of the feed-forward layers
    :param n_layers: number of transformer layers
    :param n_heads: number of attention heads
    :param dropout_prob: dropout probability
    :return: the model and its tokenizer
    """
    # Load English model with 25k word-pieces
    tokenizer = BPEmb(lang='en', dim=300, vs=25000)
    # Extract the embeddings and add a zero-initialized embedding for our extra [PAD] token
    pretrained_embeddings = np.concatenate(
        [tokenizer.emb.vectors, np.zeros(shape=(1, 300))], axis=0)
    # Extract the vocab and add an extra [PAD] token
    vocabulary = tokenizer.emb.index2word + ['[PAD]']
    tokenizer.pad_token_id = len(vocabulary) - 1
    model = TransformerClassifier(
        torch.tensor(pretrained_embeddings).type(torch.FloatTensor),
        ff_dim=ff_dim,
        d_model=300,
        n_heads=n_heads,
        n_layers=n_layers,
        dropout_prob=dropout_prob).to(device)
    return model, tokenizer
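import torch

# A hedged sketch (not from the original repo) of how the tokenizer returned above
# might build fixed-length batches; make_padded_batch and max_len are illustrative
# assumptions.
def make_padded_batch(tokenizer, sentences, max_len=64):
    batch = []
    for s in sentences:
        ids = tokenizer.encode_ids(s)[:max_len]                      # BPE subword ids, truncated
        ids = ids + [tokenizer.pad_token_id] * (max_len - len(ids))  # right-pad with the [PAD] id
        batch.append(ids)
    return torch.tensor(batch, dtype=torch.long)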
def main(args=None) -> None:
    """
    CLI function to manually download all the dependencies for a pre-trained model.

    Example of usage:

    .. code-block:: sh

        download_model fasttext
    """
    if args is None:
        args = sys.argv[1:]
    parsed_args = get_args(args)

    model_type = parsed_args.model_type

    if "fasttext" in model_type and "fasttext-light" not in model_type:
        download_fasttext_embeddings(saving_dir=CACHE_PATH)
    elif model_type == "fasttext-light":
        download_fasttext_magnitude_embeddings(saving_dir=CACHE_PATH)
    elif "bpemb" in model_type:
        # The BPEmb class manages the download of the pre-trained word embeddings
        BPEmb(lang="multi", vs=100000, dim=300)

    model_path = os.path.join(CACHE_PATH, f"{model_type}.ckpt")
    version_path = os.path.join(CACHE_PATH, f"{model_type}.version")
    if not os.path.isfile(model_path) or not os.path.isfile(version_path):
        download_weights(model_type, CACHE_PATH)
    elif not latest_version(model_type, cache_path=CACHE_PATH):
        print(
            "A new version of the pre-trained model is available. The newest model will be downloaded."
        )
        download_weights(model_type, CACHE_PATH)
def __init__(self, predictor_config):
    predictor_config = predictor_config['vectorizer']
    self.bpemb = BPEmb(lang='en',
                       dim=predictor_config['embedding_dim'],
                       vs=predictor_config['max_vocab_size'],
                       add_pad_emb=True)
    self.max_seq_len = predictor_config['max_seq_len']
def get_vocab(main_config, args, logger):
    main_cfg = MainConfig(main_config, args)
    vocab_loaded = False
    # If the vocab and BPE files are missing, create everything from scratch; otherwise load them.
    if check_file_exists(main_cfg.data_dir, main_cfg.vocab_file) and \
            check_file_exists(main_cfg.data_dir, main_cfg.train_bpe_file) and \
            check_file_exists(main_cfg.data_dir, main_cfg.dev_bpe_file) and \
            check_file_exists(main_cfg.data_dir, main_cfg.test_bpe_file):
        logger.info('"{}" and bpe files found in data folder, loading stats.'.format(main_cfg.vocab_file))
        vocab = load_vocab(main_cfg)
        vocab_loaded = True
    else:
        logger.info('No "{}" or bpe files found in data folder, creating new vocab.'.format(main_cfg.vocab_file))
        bpemb = BPEmb(lang="en", dim=main_cfg.embedding_size, vs=main_cfg.emb_vocab_size)
        logger.info('BPEmb loaded')
        vocab = create_vocab(main_cfg, bpemb)
        write_vocab(main_cfg, vocab)
        logger.info('Max sequence length of {} covers {}% of sentences'.format(vocab.max_seq_length, SEQ_LEN_THRESHOLD))
    # If the vocab was created anew, the embeddings should also be recreated.
    if not vocab_loaded or not check_file_exists("", main_cfg.embeddings):
        logger.info('No embedding matrix found, loading embeddings.')
        create_embeddings(main_cfg, vocab)
        logger.info('Saved embedding matrix to "{}" in data folder.'.format(main_cfg.embeddings))
    else:
        logger.info('Embedding matrix found in data folder.')
    return int(vocab.max_seq_length), int(vocab.size)
def preprocess(emb_dim, word_vocab_size):
    make_dirs()
    box_file_path = config.ORG_TRAIN_DATA_PATH + "/train.box"
    field_vocab_path = config.PRC_TRAIN_DATA_PATH + "/field.vocab"
    field_dict_path = config.PRC_TRAIN_DATA_PATH + "/field.dict"
    word_dict_path = config.PRC_TRAIN_DATA_PATH + "/word.dict"
    bpemb_en = BPEmb(lang="en", dim=emb_dim, vs=word_vocab_size)
    metadata = PreprocessMetadata(emb_dim, word_vocab_size, word_dict_path, field_dict_path)
    metadata.init_bpe_module()
    field_vocab = create_field_label_vocab(box_file_path, field_vocab_path)
    field_dict = LabelDict.get(vocab=list(field_vocab), dict_binpath=field_dict_path)
    bpe_dict = BpeWordDict.get(vocab=bpemb_en.words, dict_binpath=word_dict_path)
    print("Saving metadata")
    torch.save(metadata, config.PRC_TRAIN_DATA_PATH + '/metadata.bin')
    skipped_boxes = prepare_infobox_datasets(field_dict, bpemb_en)
    prepare_articles_dataset(field_dict, bpemb_en, skipped_boxes)
    create_mono_datasets(field_dict, bpemb_en)
    print("Preprocessing done")
    return bpemb_en, bpe_dict, field_dict
def test_punctuation():
    text = [
        "Leonidas: This's Sparta!!",
        "Leonidas : This ' s Sparta ! !",
        "Leonidas This s Sparta"
    ]
    bpemb_multi = BPEmb(lang="multi", add_pad_emb=True)
    print(bpemb_multi.encode(text))
def test_multi_language():
    text = ["This is Stratford", "Kitap okuyordu."]
    bpemb_multi = BPEmb(lang="multi", add_pad_emb=True)
    print(bpemb_multi.encode_ids_with_bos_eos(text))
    print(bpemb_multi.decode_ids([[1, 5496, 200, 23866, 3927, 2],
                                  [1, 45350, 44934, 67191, 94777, 2]]))
def __init__(self, vocab, args, audio_conf, manifest_filepath_list, normalize=False,
             augment=False, input_type="char", is_train=False):
    """
    Dataset that loads tensors via a csv containing file paths to audio files and
    transcripts separated by a comma. Each new line is a different sample.
    Example below:

    /path/to/audio.wav,/path/to/audio.txt
    ...

    :param audio_conf: Dictionary containing the sample rate, window and the window length/stride in seconds
    :param manifest_filepath: Path to manifest csv as described above
    :param labels: String containing all the possible characters to map to
    :param normalize: Apply standard mean and deviation normalization to audio tensor
    :param augment (default False): Apply random tempo and gain perturbations
    """
    self.max_size = 0
    self.ids_list = []
    for i in range(len(manifest_filepath_list)):
        manifest_filepath = manifest_filepath_list[i]
        ids = pd.read_csv(manifest_filepath, header=None).values.tolist()
        self.ids_list.append(ids)
        self.max_size = max(len(ids), self.max_size)
    self.max_size = self.max_size * len(manifest_filepath_list)
    print("max_size:", self.max_size)
    print("input_type:", input_type)
    self.input_type = input_type
    self.manifest_filepath_list = manifest_filepath_list
    self.normalize = normalize
    self.vocab = vocab
    if self.input_type == "bpe":
        self.bpeemb_list = []
        for i in range(len(self.lang_list)):
            lang = self.lang_list[i].replace("<", "").replace(">", "").lower()
            self.bpeemb_list.append(BPEmb(lang=lang, vs=1000))
    super(LogFBankDataset, self).__init__()
def get_embedding_vec(self, word):
    if self.model is None:
        self.model = BPEmb(lang="en", dim=self.dim, vs=self.bp_vocab_size)
    if not self.case_sensitive:
        word = word.lower()
    vecs = self.model.embed(word)
    return np.reshape(np.sum(vecs, axis=0), (self.dim,))
def get_cnn(in_channels, out_channels, kernel_heights, stride, padding, dropout_prob):
    """
    Creates a new CNN and tokenizer using the given parameters.

    :return: the model and its tokenizer
    """
    # Load English model with 25k word-pieces
    tokenizer = BPEmb(lang='en', dim=300, vs=25000)
    # Extract the embeddings and add a zero-initialized embedding for our extra [PAD] token
    pretrained_embeddings = np.concatenate([tokenizer.emb.vectors, np.zeros(shape=(1, 300))], axis=0)
    # Extract the vocab and add an extra [PAD] token
    vocabulary = tokenizer.emb.index2word + ['[PAD]']
    tokenizer.pad_token_id = len(vocabulary) - 1
    model = CNN(
        torch.tensor(pretrained_embeddings).type(torch.FloatTensor),
        n_labels=2,
        in_channels=in_channels,
        out_channels=out_channels,
        kernel_heights=kernel_heights,
        stride=stride,
        padding=padding,
        dropout=dropout_prob
    ).to(device)
    return model, tokenizer
def __init__(self, verbose: bool = True, **kwargs) -> None:
    super().__init__(verbose=verbose)
    with warnings.catch_warnings():
        # Silence the annoying scipy sparsetools private-module warnings and boto warnings
        warnings.filterwarnings("ignore")
        model = BPEmb(**kwargs)
    self.model = model
def __init__(self, **kwargs):
    lang = kwargs.get("lang", "en")
    vs = kwargs.get("limit", 200000)
    self.bpemb = BPEmb(lang=lang, vs=vs)
    self.tokenizer = SpacyTokenizer(model="en", annotators=["lemma", "pos", "ner"])
    self.annotators = self.tokenizer.annotators
def __init__(self, lang="en", dim=200, vs=200000, distance_metric="cosine"): from bpemb import BPEmb self.bpemb = BPEmb(lang=lang, dim=dim, vs=vs) self.distance_metric = distance_metric
def __init__(self, path=config.path_to_data, mode='train'):
    self.path_to_data = path
    self.mode = mode
    print(f"Loading {self.mode} data...")
    self.data = self.read_data()
    self.preprocess_data()
    self.bpemb_ru = BPEmb(lang="ru", dim=300, vs=50000)
    self.placeholder = torch.zeros(config.max_seq_length, dtype=torch.long)
def __init__(
    self, lang, vs=10000, dim=100, cache_dir=Path.home() / Path(".cache/bpemb")
):
    self.lang = lang
    self.vs = vs
    self.dim = dim
    self.cache_dir = cache_dir
    self.module = BPEmb(lang=lang, vs=vs, dim=dim, cache_dir=cache_dir)
def loadBPE(lang, vector_size):
    """
    Automatically downloads the embedding file and loads it as a gensim keyed vector.

    :param lang: the language code is enough, no need for an embedding file
    :param vector_size: embedding dimensionality
    :return: the loaded BPEmb model
    """
    model = BPEmb(lang=lang, dim=vector_size)
    return model
def test_decoding():
    # Although a <pad> token is added, decoding cannot handle it, so remove padding before decoding.
    # Decoding removes start/end tokens.
    bpemb_en = BPEmb(lang="en", add_pad_emb=True)
    # ids = [1, 215, 80, 8526, 1221, 2]
    ids = [[1, 215, 80, 8526, 1221, 2], [1, 215, 80, 8526, 1221, 2]]
    # ids = [1, 215, 80, 8526, 1221, 2, 10000, 10000]
    # print(bpemb_en.vectors[10000])
    print(bpemb_en.decode_ids(ids))
def __init__(self, verbose: bool = True) -> None:
    super().__init__(verbose=verbose)
    with warnings.catch_warnings():
        # Silence the annoying scipy sparsetools private-module warnings and boto warnings
        warnings.filterwarnings("ignore")
        model = BPEmb(lang="multi", vs=100000, dim=300)  # default parameters
    self.model = model
def load_pretrained_embedding_bpe(embedding_matrix):
    """Load the BPE embedding; <pad> is added as id=0."""
    bpemb = BPEmb(lang="en", vs=25000, dim=200)
    embedding_matrix[1:] = bpemb.vectors
    print('loaded bpe pre-trained embedding')
    print('embedding vectors count:', embedding_matrix.shape[0])
    return embedding_matrix
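import numpy as np

# A hedged usage sketch (not from the original code); the matrix shape is an assumption
# matching the vs=25000, dim=200 model above, with row 0 left as the zero <pad> vector.
embedding_matrix = np.zeros((25001, 200), dtype=np.float32)
embedding_matrix = load_pretrained_embedding_bpe(embedding_matrix)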
def convert_dataset(dataset: list, char2i: dict, labels2i: dict) -> List[Any]:
    bpemb_en = BPEmb(lang='en', vs=200000, dim=300)
    d = {'O': 1, 'B': 2, 'I': 3}
    for i, (a, l) in enumerate(dataset):
        a, l = data2words((a, l), bpemb_en, char2i)
        dataset[i] = (np.array(a), np.array([d[x] for x in l]))
    return dataset
def prepare_bpe_weights(lang, vs, dim, weight_path=None):
    # Note that embedding weights and preprocessor weights should be the same.
    # If the weight computation is not deterministic, be careful.
    # The requirement may be relaxed depending on the case:
    # e.g. weight values may differ between the embedder and the processor,
    # but the ids must still match the words in the same order.
    if weight_path is None:
        bpe = BPEmb(lang=lang, add_pad_emb=True, vs=vs, dim=dim)
        weights = bpe.vectors
    else:
        if check_file_exists(weight_path):
            weights = load_from_pickle(weight_path)
        else:
            bpe = BPEmb(lang=lang, add_pad_emb=True, vs=vs, dim=dim)
            weights = bpe.vectors
            save_to_pickle(weights, weight_path)
    return weights
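import torch
from torch import nn

# A hedged usage sketch (not from the original code) showing how the returned weights
# might initialize a PyTorch embedding layer; the cache path and layer setup are
# illustrative assumptions.
weights = prepare_bpe_weights("en", vs=10000, dim=100, weight_path="bpe_weights.pkl")
embedding = nn.Embedding.from_pretrained(torch.tensor(weights, dtype=torch.float))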