# Imports assumed by the snippets below. The Encoder is taken to be the one from
# the python-bpe package, whose API (pct_bpe, ngram_min/ngram_max, required_tokens,
# word_tokenizer, vocabs_to_dict, transform) matches the calls made here.
import json
import pickle
from typing import List, Union

import nltk
import numpy as np
from bpe import Encoder
from nltk.tokenize import WhitespaceTokenizer
from keras.preprocessing.sequence import pad_sequences  # assumed Keras helper used by prepare_data


def create_vocab_bpe(data_path='data/mt_corpus_ts.txt', vocab_path='data/vocab.txt',
                     vocab_size=25000, simple_subword=False):
    # `required_tokens` and `special_terms` are module-level lists assumed to be
    # defined elsewhere in this file.
    print('start create vocab from:', data_path)
    line_num = 0
    encoder = Encoder(vocab_size, pct_bpe=0.75, ngram_min=1, ngram_max=4,
                      required_tokens=required_tokens,
                      word_tokenizer=WhitespaceTokenizer().tokenize)
    texts = []
    with open(data_path, encoding='utf-8') as fin:
        for line in fin:
            line_num += 1
            if line_num == 1:  # skip the header line
                continue
            if line_num % 100000 == 0:
                print(line_num)
            # tuples = line.strip().split('\t')
            # zh = tuples[1]
            texts.append(line)
    encoder.fit(texts)

    # Save the raw BPE encoder state next to the vocabulary file.
    bpe_dict = encoder.vocabs_to_dict()
    with open(vocab_path + '.dict', 'w', encoding='utf-8') as fout:
        fout.write(json.dumps(bpe_dict))

    # Merge subword and word vocabularies, prepend the special terms, and assign ids.
    terms = list(encoder.bpe_vocab.keys())
    terms += list(encoder.word_vocab.keys())
    terms = list(set(terms))
    vocabs = special_terms + terms
    vocabs_dict = dict()
    for i, term in enumerate(vocabs):
        vocabs_dict[term] = i
    if not simple_subword:
        # Also add "@@"-prefixed variants for every non-special, non-required term.
        for i, term in enumerate(vocabs):
            if term not in special_terms and term not in required_tokens:
                vocabs_dict["@@" + term] = i + len(vocabs)
    with open(vocab_path, 'w', encoding='utf-8') as fout:
        fout.write(json.dumps(vocabs_dict, indent=0))
    print('create vocab done. save to: ', vocab_path)
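# Example usage (a minimal sketch): runs the vocabulary build with its default
# paths and reloads the resulting JSON file. The corpus file and the
# `required_tokens` / `special_terms` globals are assumed to exist.
create_vocab_bpe()
with open('data/vocab.txt', encoding='utf-8') as fin:
    vocab = json.load(fin)
print('loaded vocab entries:', len(vocab))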
def prepare_data(data_path, freq_dist_path, embedding_path, vocabulary_size=10000,
                 embedding_size=200, predict=False, max_length=None, use_bpe=False):
    max_length_provided = max_length is not None
    separator = ","
    if data_path.endswith("tsv"):
        separator = "\t"

    # construct vocabulary
    vocabulary = None
    if not use_bpe:
        with open(freq_dist_path, "rb") as freq_dist_file:
            freq_dist = pickle.load(freq_dist_file)
        vocabulary = {"<pad>": 0, "<unk>": 1, "<user>": 2, "<url>": 3}
        # Offset new ids past all special tokens so <user> and <url> are not overwritten.
        offset = len(vocabulary)
        most_common = freq_dist.most_common(vocabulary_size - offset)
        vocabulary.update({w[0]: i + offset for i, w in enumerate(most_common)})
        print("Constructed vocabulary of size {}.".format(vocabulary_size))

    # load data and convert it to indices
    data = []
    labels = []
    if not max_length_provided:
        max_length = 0
    with open(data_path, "r") as data_file:
        lines = data_file.readlines()
        for i, line in enumerate(lines):
            if not predict:
                tweet_id, sentiment, tweet = line.split(separator)
            else:
                tweet_id, tweet = line.split(separator)
            data.append(tweet.strip())
            if not predict:
                labels.append(int(sentiment))
    print("Loaded data ({} tweets).".format(len(data)))

    if not use_bpe:
        # Map each tweet to word indices, tracking the longest sequence if no
        # max_length was provided.
        new_data = []
        for tweet in data:
            words = tweet.split()
            indices = []
            for w_idx, w in enumerate(words):
                if max_length_provided and w_idx == max_length:
                    break
                index = vocabulary.get(w)
                if index is not None:
                    indices.append(index)
                else:
                    indices.append(vocabulary.get("<unk>"))
            if not max_length_provided and len(indices) > max_length:
                max_length = len(indices)
            new_data.append(indices)
        data = new_data
        pad_value = vocabulary.get("<pad>")
    else:
        print("Training BPE encoder...")
        encoder = Encoder(vocab_size=vocabulary_size, required_tokens=["<user>", "<url>"],
                          UNK="<unk>", PAD="<pad>")
        encoder.fit(data)
        vocabulary = encoder.vocabs_to_dict()
        print("Constructed BPE vocabulary of size {}.".format(vocabulary_size))
        new_data = []
        for tweet in data:
            indices = list(next(encoder.transform([tweet])))
            if not max_length_provided and len(indices) > max_length:
                max_length = len(indices)
            new_data.append(indices)
        data = new_data
        pad_value = encoder.word_vocab[encoder.PAD]

    # load embedding vectors
    embedding_vectors = {}
    if not use_bpe:
        with open(embedding_path, "r") as glove_file:
            for i, line in enumerate(glove_file):
                tokens = line.split()
                word = tokens[0]
                if vocabulary.get(word) is not None:
                    vector = [float(e) for e in tokens[1:]]
                    embedding_vectors[word] = np.array(vector)
        print("Found {} GloVe vectors for vocabulary of size {}.".format(
            len(embedding_vectors), len(vocabulary)))
        print("Loaded embedding vectors ({} dimensions).".format(embedding_size))

    # construct embedding matrix; rows without a pretrained vector keep small random values
    embedding_matrix = np.random.randn(vocabulary_size, embedding_size) * 0.01
    if not use_bpe:
        for word, i in list(vocabulary.items()):
            embedding_vector = embedding_vectors.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
    print("Constructed embedding matrix.")

    # pad data (might want to change max_length to be CLI argument)
    data = pad_sequences(data, maxlen=max_length, padding="post", value=pad_value)
    if not predict:
        labels = np.array(labels)
    print("Padded sequences to length {}.".format(max_length))

    if not predict:
        return vocabulary, data, labels, embedding_matrix
    return vocabulary, data, embedding_matrix
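# Example call (a sketch): the file paths below are hypothetical placeholders for
# a labelled tweet file, a pickled nltk.FreqDist, and a GloVe vectors text file.
vocabulary, train_x, train_y, embedding_matrix = prepare_data(
    data_path="data/train.tsv",
    freq_dist_path="data/freq_dist.pkl",
    embedding_path="data/glove.twitter.27B.200d.txt",
    vocabulary_size=10000,
    embedding_size=200,
    max_length=40)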
def construct_vocabulary(data: Union[str, List[Union[List[str], str]]],
                         vocabulary_size: int = 10000,
                         use_bpe: bool = False,
                         bpe_percentage: float = 0.2,
                         vocabulary_save_file: str = None) -> dict:
    # `TOKENIZER` and `logger` are module-level objects assumed to be configured elsewhere.
    counts = None
    if type(data) == str and ".pkl" in data:
        with open(data, "rb") as f:
            counts = pickle.load(f)
        if type(counts) != nltk.FreqDist:
            # The pickle already holds a finished vocabulary dict.
            logger.info("Loaded vocabulary from file.")
            return counts
        elif use_bpe:
            logger.error("Cannot construct BPE vocabulary from frequency distribution file.")
            raise ValueError("Cannot construct BPE vocabulary from frequency distribution file.")
        else:
            logger.info("Constructing vocabulary from frequency distribution file.")
    elif not use_bpe:
        logger.info("Constructing vocabulary from data.")
        if type(data) == str:
            separator = ","
            if data.endswith("tsv"):
                separator = "\t"
            # load data from file
            new_data = []
            with open(data, "r") as data_file:
                lines = data_file.readlines()
                for i, line in enumerate(lines):
                    _, _, tweet = line.split(separator)
                    new_data.append(TOKENIZER.tokenize(tweet))
            data = new_data
        elif type(data[0]) != list:
            data = [TOKENIZER.tokenize(t) for t in data]
        all_words = []
        for tweet in data:
            all_words.extend(tweet)
        counts = nltk.FreqDist(all_words)

    if use_bpe:
        logger.info("Training BPE encoder...")
        encoder = Encoder(vocab_size=vocabulary_size, pct_bpe=bpe_percentage,
                          word_tokenizer=lambda x: TOKENIZER.tokenize(x),
                          required_tokens=["<start>", "<extract>", "<user>", "<url>"],
                          UNK="<unk>", PAD="<pad>")
        encoder.fit(data)
        vocabulary = encoder.vocabs_to_dict()
        logger.info("Constructed BPE vocabulary of size {}.".format(vocabulary_size))
    else:
        vocabulary = {"<pad>": 0, "<unk>": 1, "<start>": 2, "<extract>": 3}
        initial_vocab_length = len(vocabulary)
        most_common = counts.most_common(vocabulary_size - initial_vocab_length)
        vocabulary.update({w[0]: i + initial_vocab_length for i, w in enumerate(most_common)})
        logger.info("Constructed embedding vocabulary of size {}.".format(len(vocabulary)))

    if vocabulary_save_file:
        if not vocabulary_save_file.endswith(".pkl"):
            vocabulary_save_file += ".pkl"
        with open(vocabulary_save_file, "wb") as f:
            pickle.dump(vocabulary, f)
        logger.info("Saved vocabulary to \"{}\".".format(vocabulary_save_file))

    return vocabulary
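# Example usage (a minimal sketch): builds a word-level and a BPE vocabulary from
# an in-memory list of made-up tweets; TOKENIZER and logger must already be set up.
sample_tweets = ["<user> loving the new update <url>",
                 "<user> this is the worst service ever"]
word_vocab = construct_vocabulary(sample_tweets, vocabulary_size=50, use_bpe=False)
bpe_vocab = construct_vocabulary(sample_tweets, vocabulary_size=50, use_bpe=True,
                                 bpe_percentage=0.2)
print(len(word_vocab), list(bpe_vocab.keys()))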
# Sanity-check script: fit a BPE encoder on the Multi30k English training set and
# verify that the word vocabulary and the byte-pair vocabulary share no keys.
sequences = []
with open("../../datasets/multi30k/train.en") as f:
    for line in f:
        sequences.append(line.strip())

ref = [x.split() for x in sequences]
ref_len = [len(x) for x in ref]
print("REF:", max(ref_len))  # longest reference sentence, in whitespace tokens


def parse(x):
    return x.split()


enc = Encoder(4096, ngram_min=1, ngram_max=2, pct_bpe=0.8, silent=True, word_tokenizer=parse)
enc.fit(sequences)
base = enc.vocabs_to_dict()

duplicate_keys = []
for key in base['byte_pairs']:
    if key in base['words']:
        duplicate_keys.append(key)

if len(duplicate_keys) > 0:
    print("got duplicates:")
    print(duplicate_keys)
else:
    print("NO DUPLICATES! :)")

# Merge both vocabularies and build an id -> token lookup table.
keybase = {**base['words'], **base['byte_pairs']}
inv_map = {v: k for k, v in keybase.items()}
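# Quick round-trip check (a sketch building only on the objects above): encode the
# first training sentence and map the ids back through inv_map to inspect the
# subword segmentation.
example = sequences[0]
ids = next(enc.transform([example]))
print(example)
print([inv_map[i] for i in ids])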