Example no. 1
from typing import List

import numpy as np
from mosestokenizer import MosesTokenizer  # callable tokenizer object with a close() method


class WordTokenizer(BaseTokenizer):
    """Thin Moses-based word tokenizer; BaseTokenizer is defined elsewhere in the project."""

    def __init__(self):
        self.tokenizer = MosesTokenizer()

    def tokenize(self, text: str) -> List[str]:
        return self.tokenizer(text.strip())

    def detokenize(self, tokens: List[str]) -> str:
        return " ".join(tokens).strip()

    def close(self):
        # Shut down the underlying Moses tokenizer process.
        self.tokenizer.close()
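
A quick round-trip sketch for the class above. It assumes the mosestokenizer package (whose MosesTokenizer instances are callable and expose close()) and is illustrative only:

tokenizer = WordTokenizer()
tokens = tokenizer.tokenize("Hello, world!")
print(tokens)                        # e.g. ['Hello', ',', 'world', '!']
print(tokenizer.detokenize(tokens))  # 'Hello , world !' (plain space join)
tokenizer.close()                    # stop the underlying Moses process
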
def process_corpus(embeddings_dictionary, corpus, vectors, language):
    """
    Cleans corpus using the dictionary of embeddings.
    Any word without an associated embedding in the dictionary is ignored.
    Adds '__target-language' and '__source-language' at the end
    of the words according to their language.
    """
    clean_corpus, clean_vectors, keys = [], {}, []
    words_we_want = set(embeddings_dictionary)
    tokenize = MosesTokenizer(language)
    for key, doc in enumerate(corpus):
        clean_doc = []
        words = tokenize(doc)
        for word in words:
            if word in words_we_want:
                tagged = "%s__%s" % (word, language)
                clean_doc.append(tagged)
                # parse the whitespace-separated embedding string into a float vector
                clean_vectors[tagged] = np.array(
                    vectors[word].split()).astype(float)
        # keep only documents that retain between 4 and 24 known words
        if 3 < len(clean_doc) < 25:
            keys.append(key)
        clean_corpus.append(" ".join(clean_doc))
    tokenize.close()
    return np.array(clean_corpus), clean_vectors, keys
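
A hedged usage sketch for process_corpus with toy, made-up inputs. It assumes the mosestokenizer package is installed and that vectors maps each word to a whitespace-separated embedding string, as the call vectors[word].split() implies:

corpus = ["the cat sat on the mat", "dogs bark loudly at night"]
vectors = {w: "0.1 0.2 0.3" for w in "the cat sat on mat dogs bark loudly at night".split()}
embeddings_dictionary = set(vectors)

clean_corpus, clean_vectors, keys = process_corpus(
    embeddings_dictionary, corpus, vectors, "en")
print(clean_corpus)  # each kept word is suffixed with '__en'
print(keys)          # indices of documents that keep 4-24 known words
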
Example no. 3
    ]
    special_tokens.extend(args["special_symbols"].split(","))

    # slice with vocab size
    vocab = counter.most_common(args["vocab_size"] - len(special_tokens))

    # print out-of-vocabulary
    total_freq = sum(counter.values())
    oov_freq = total_freq - sum([v[1] for v in vocab])
    print(
        f"oov: {oov_freq}/{total_freq} ({oov_freq * 100.0 / total_freq:.2f}%)")

    # save word vocab
    output_vocab_path = os.path.join(output_dir, "tok.vocab")
    with open(output_vocab_path, "w", encoding="utf-8") as f:
        for token in special_tokens:
            f.write(f"{token}\t-1\n")
        for token, freq in vocab:
            f.write(f"{token}\t{freq}\n")

    # save fairseq vocab
    with open(os.path.join(output_dir, "fairseq.vocab"), "w") as fout:
        with open(output_vocab_path, "r", encoding="utf-8") as fin:
            # skip the leading entries: pad, unk, bos, eos + special_symbols
            start_idx = 4 + len(args["special_symbols"].split(","))
            for line in fin.readlines()[start_idx:]:
                # rewrite tab-separated "token\tfreq" lines as "token freq"
                fout.write(" ".join(line.split("\t")))
    tokenize.close()
    print("done.")