Example #1
import json

from bpe import Encoder  # Encoder from the `bpe` package (pip install bpe)
from nltk.tokenize import WhitespaceTokenizer

# `special_terms` and `required_tokens` are assumed to be module-level lists of
# reserved tokens defined elsewhere in the original project; they are not
# shown in this snippet.


def create_vocab_bpe(data_path='data/mt_corpus_ts.txt',
                     vocab_path='data/vocab.txt',
                     vocab_size=25000,
                     simple_subword=False):
    print('start create vocab from:', data_path)
    line_num = 0
    encoder = Encoder(vocab_size,
                      pct_bpe=0.75,
                      ngram_min=1,
                      ngram_max=4,
                      required_tokens=required_tokens,
                      word_tokenizer=WhitespaceTokenizer().tokenize)
    texts = []
    with open(data_path, encoding='utf-8') as fin:
        for line in fin:
            line_num += 1
            if line_num == 1:
                # skip the first line of the corpus file
                continue
            if line_num % 100000 == 0:
                print(line_num)
            #tuples = line.strip().split('\t')
            #zh = tuples[1]
            texts.append(line)
    encoder.fit(texts)
    bpe_dict = encoder.vocabs_to_dict()
    with open(vocab_path + '.dict', 'w', encoding='utf-8') as fout:
        fout.write(json.dumps(bpe_dict))

    terms = list(encoder.bpe_vocab.keys())
    terms += list(encoder.word_vocab.keys())
    terms = set(terms)
    terms = list(terms)
    vocabs = special_terms + terms
    vocabs_dict = dict()
    for i, term in enumerate(vocabs):
        vocabs_dict[term] = i
    if not simple_subword:
        # additionally register a "@@"-prefixed variant of every non-special
        # term; its id is offset by the size of the base vocabulary
        for i, term in enumerate(vocabs):
            if term not in special_terms and term not in required_tokens:
                vocabs_dict["@@" + term] = i + len(vocabs)
    with open(vocab_path, 'w', encoding='utf-8') as fout:
        fout.write(json.dumps(vocabs_dict, indent=0))
    print('create vocab done. save to: ', vocab_path)
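
A minimal invocation sketch (not part of the original snippet); it simply calls the function with the default paths shown above:

# Hypothetical usage: writes the token-to-id mapping to data/vocab.txt and the
# raw BPE encoder dictionary to data/vocab.txt.dict.
create_vocab_bpe(data_path='data/mt_corpus_ts.txt',
                 vocab_path='data/vocab.txt',
                 vocab_size=25000)
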
Example #2

import pickle

import numpy as np
from bpe import Encoder
# pad_sequences as in Keras 2.x; newer TensorFlow/Keras versions expose it
# under tensorflow.keras.preprocessing.sequence or keras.utils
from keras.preprocessing.sequence import pad_sequences


def prepare_data(data_path,
                 freq_dist_path,
                 embedding_path,
                 vocabulary_size=10000,
                 embedding_size=200,
                 predict=False,
                 max_length=None,
                 use_bpe=False):
    max_length_provided = max_length is not None

    separator = ","
    if data_path.endswith("tsv"):
        separator = "\t"

    # construct vocabulary
    vocabulary = None
    if not use_bpe:
        with open(freq_dist_path, "rb") as freq_dist_file:
            freq_dist = pickle.load(freq_dist_file)
        vocabulary = {"<pad>": 0, "<unk>": 1, "<user>": 2, "<url>": 3}
        initial_vocab_length = len(vocabulary)
        most_common = freq_dist.most_common(vocabulary_size - initial_vocab_length)
        # start the word indices after the special tokens so that <user> and
        # <url> are not overwritten
        vocabulary.update({w[0]: i + initial_vocab_length for i, w in enumerate(most_common)})
        print("Constructed vocabulary of size {}.".format(len(vocabulary)))

    # load data and convert it to indices
    data = []
    labels = []
    if not max_length_provided:
        max_length = 0
    with open(data_path, "r") as data_file:
        lines = data_file.readlines()
        for i, line in enumerate(lines):
            # limit the number of splits so separators occurring inside the
            # tweet text do not break the unpacking
            if not predict:
                tweet_id, sentiment, tweet = line.split(separator, 2)
            else:
                tweet_id, tweet = line.split(separator, 1)
            data.append(tweet.strip())

            if not predict:
                labels.append(int(sentiment))
    print("Loaded data ({} tweets).".format(len(data)))

    if not use_bpe:
        new_data = []
        for tweet in data:
            words = tweet.split()
            indices = []
            for w_idx, w in enumerate(words):
                if max_length_provided and w_idx == max_length:
                    break

                index = vocabulary.get(w)
                if index is not None:
                    indices.append(index)
                else:
                    indices.append(vocabulary.get("<unk>"))

            if not max_length_provided and len(indices) > max_length:
                max_length = len(indices)

            new_data.append(indices)
        data = new_data

        pad_value = vocabulary.get("<pad>")
    else:
        print("Training BPE encoder...")
        encoder = Encoder(vocab_size=vocabulary_size,
                          required_tokens=["<user>", "<url>"],
                          UNK="<unk>",
                          PAD="<pad>")
        encoder.fit(data)
        vocabulary = encoder.vocabs_to_dict()
        print("Constructed BPE vocabulary of size {}.".format(vocabulary_size))

        new_data = []
        for tweet in data:
            indices = list(next(encoder.transform([tweet])))
            if not max_length_provided and len(indices) > max_length:
                max_length = len(indices)
            new_data.append(indices)
        data = new_data

        pad_value = encoder.word_vocab[encoder.PAD]

    # load embedding vectors
    embedding_vectors = {}
    if not use_bpe:
        with open(embedding_path, "r") as glove_file:
            for i, line in enumerate(glove_file):
                tokens = line.split()
                word = tokens[0]
                if word in vocabulary:
                    vector = [float(e) for e in tokens[1:]]
                    embedding_vectors[word] = np.array(vector)
        print("Found {} GLOVE vectors for vocabulary of size {}.".format(
            len(embedding_vectors), len(vocabulary)))
        print(
            "Loaded embedding vectors ({} dimensions).".format(embedding_size))

    # construct embedding matrix
    embedding_matrix = np.random.randn(vocabulary_size, embedding_size) * 0.01
    if not use_bpe:
        for word, i in list(vocabulary.items()):
            embedding_vector = embedding_vectors.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
    print("Constructed embedding matrix.")

    # pad data (might want to change max_length to be CLI argument)
    data = pad_sequences(data,
                         maxlen=max_length,
                         padding="post",
                         value=pad_value)
    if not predict:
        labels = np.array(labels)
    print("Padded sequences to length {}.".format(max_length))

    if not predict:
        return vocabulary, data, labels, embedding_matrix
    return vocabulary, data, embedding_matrix
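
A hedged usage sketch (not from the original): the file names, the GloVe file, and the 140-token cap are illustrative placeholders, and the data file is assumed to contain `id<sep>label<sep>text` lines.

# Hypothetical training call with a plain word vocabulary and GloVe vectors.
vocab, x_train, y_train, emb = prepare_data(data_path="train.csv",
                                            freq_dist_path="freq_dist.pkl",
                                            embedding_path="glove.twitter.27B.200d.txt",
                                            vocabulary_size=10000,
                                            embedding_size=200,
                                            max_length=140)

# Hypothetical inference call using a BPE vocabulary instead; the frequency
# distribution and embedding files are not read in this branch.
vocab, x_test, emb = prepare_data(data_path="test.csv",
                                  freq_dist_path=None,
                                  embedding_path=None,
                                  predict=True,
                                  use_bpe=True)
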
Example #3
import logging
import pickle
from typing import List, Union

import nltk
from bpe import Encoder

logger = logging.getLogger(__name__)  # module-level logger assumed by this snippet

# TOKENIZER is assumed to be a module-level tokenizer object (e.g. an nltk
# tweet tokenizer) defined elsewhere in the original project.


def construct_vocabulary(data: Union[str, List[Union[List[str], str]]],
                         vocabulary_size: int = 10000,
                         use_bpe: bool = False,
                         bpe_percentage: float = 0.2,
                         vocabulary_save_file: str = None) -> dict:
    counts = None
    if type(data) == str and ".pkl" in data:
        with open(data, "rb") as f:
            counts = pickle.load(f)
        if type(counts) != nltk.FreqDist:
            logger.info("Loaded vocabulary from file.")
            return counts
        elif use_bpe:
            logger.error("Cannot construct BPE vocabulary from frequency distribution file.")
            raise ValueError("Cannot construct BPE vocabulary from frequency distribution file.")
        else:
            logger.info("Constructing vocabulary from frequency distribution file.")
    elif not use_bpe:
        logger.info("Constructing vocabulary from data.")

        if type(data) == str:
            separator = ","
            if data.endswith("tsv"):
                separator = "\t"

            # load data from file
            new_data = []
            with open(data, "r") as data_file:
                lines = data_file.readlines()
                for i, line in enumerate(lines):
                    _, _, tweet = line.split(separator)
                    new_data.append(TOKENIZER.tokenize(tweet))
            data = new_data
        elif type(data[0]) != list:
            data = [TOKENIZER.tokenize(t) for t in data]

        all_words = []
        for tweet in data:
            all_words.extend(tweet)

        counts = nltk.FreqDist(all_words)

    if use_bpe:
        logger.info("Training BPE encoder...")
        encoder = Encoder(vocab_size=vocabulary_size,
                          pct_bpe=bpe_percentage,
                          word_tokenizer=lambda x: TOKENIZER.tokenize(x),
                          required_tokens=["<start>", "<extract>", "<user>", "<url>"],
                          UNK="<unk>", PAD="<pad>")
        encoder.fit(data)
        vocabulary = encoder.vocabs_to_dict()
        logger.info("Constructed BPE vocabulary of size {}.".format(vocabulary_size))
    else:
        vocabulary = {"<pad>": 0, "<unk>": 1, "<start>": 2, "<extract>": 3}
        initial_vocab_length = len(vocabulary)
        most_common = counts.most_common(vocabulary_size - initial_vocab_length)
        vocabulary.update({w[0]: i + initial_vocab_length for i, w in enumerate(most_common)})
        logger.info("Constructed embedding vocabulary of size {}.".format(len(vocabulary)))

    if vocabulary_save_file:
        if not vocabulary_save_file.endswith(".pkl"):
            vocabulary_save_file += ".pkl"
        with open(vocabulary_save_file, "wb") as f:
            pickle.dump(vocabulary, f)
        logger.info("Saved vocabulary to \"{}\".".format(vocabulary_save_file))

    return vocabulary
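
A short sketch (not from the original) of the two ways this helper can be driven; the tweet strings below are invented for illustration:

tweets = ["<user> this looks great <url>", "so happy today"]

# Frequency-based vocabulary built directly from in-memory strings.
word_vocab = construct_vocabulary(tweets, vocabulary_size=5000)

# BPE vocabulary, persisted for later runs.
bpe_vocab = construct_vocabulary(tweets,
                                 vocabulary_size=5000,
                                 use_bpe=True,
                                 bpe_percentage=0.2,
                                 vocabulary_save_file="vocab_bpe.pkl")
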
Example #4
from bpe import Encoder

# read the raw English training sentences of the Multi30k corpus
sequences = []
with open("../../datasets/multi30k/train.en") as f:
    for line in f:
        sequences.append(line.strip())

# whitespace-tokenised reference sentences, used to report the longest one
ref = [x.split() for x in sequences]
ref_len = [len(x) for x in ref]
print("REF:", max(ref_len))

def parse(x):
    return x.split()

enc = Encoder(4096, ngram_min=1, ngram_max=2, pct_bpe=0.8, silent=True, word_tokenizer=parse)
enc.fit(sequences)

base = enc.vocabs_to_dict()
duplicate_keys = []
for key in base['byte_pairs']:
    if key in base['words']:
        duplicate_keys.append(key)
if len(duplicate_keys) > 0:
    print("got duplicates:")
    print(duplicate_keys)
else:
    print("NO DUPLICATES! :)")

# merge the word and byte-pair vocabularies into a single token -> id map,
# then invert it to obtain id -> token
keybase = {**base['words'], **base['byte_pairs']}
inv_map = {v: k for k, v in keybase.items()}
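
A small round-trip sketch (not from the original): encode one sentence and map the ids back through inv_map. It assumes that word ids and byte-pair ids occupy disjoint ranges, which is what the duplicate-key check above is probing on the key side.

ids = list(next(enc.transform([sequences[0]])))
print(ids)
print([inv_map[i] for i in ids])
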