Example 1
def load_bin_embeddings(args, source):
    """
    Reload pretrained embeddings from a fastText binary file.
    """
    # load fastText binary file
    lang = args.src_lang if source else args.tgt_lang
    # maximum number of words to keep in the vocabulary
    max_vocab = args.vocab_size

    model = load_fasttext_model(args.src_emb_path if source else args.tgt_emb_path)
    words, freqs = model.get_labels(include_freq=True)
    assert model.get_dimension() == args.emb_dim
    print("Loaded binary model. Generating embeddings ...")
    embeddings = np.concatenate([model.get_word_vector(w)[None] for w in words], 0)
    print("Generated embeddings for %i words." % len(words))
    assert embeddings.shape == (len(words), args.emb_dim)

    # select a subset of word embeddings (to deal with casing)
    # stop words might have been removed from freqs and train_indexes
    word2id, indexes, freqs = select_subset(words, max_vocab, freqs)
    embeddings = embeddings[indexes]

    id2word = {i: w for w, i in word2id.items()}
    dico = Dictionary(id2word, word2id, lang)

    assert embeddings.shape == (len(dico), args.emb_dim)
    print("Number of words in {} = {}".format(lang, len(dico)))

    return embeddings, dico
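
select_subset and Dictionary come from the surrounding project and are not shown in this example. As a rough sketch of the contract the call above relies on (an assumption, not the project's actual implementation), select_subset returns a word2id mapping, the row indexes to keep in the embedding matrix, and the matching frequencies:

import numpy as np

def select_subset(word_list, max_vocab, freqs, lang=None):
    """Hypothetical sketch: keep the first occurrence of each word (fastText
    labels are typically already sorted by frequency) until max_vocab entries
    have been collected (0 = no limit). lang is unused in this sketch."""
    word2id, indexes, kept_freqs = {}, [], []
    for i, (word, freq) in enumerate(zip(word_list, freqs)):
        if word in word2id:
            continue
        word2id[word] = len(word2id)
        indexes.append(i)
        kept_freqs.append(freq)
        if 0 < max_vocab <= len(word2id):
            break
    return word2id, np.array(indexes), kept_freqs
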
Example 2
def count_vocab(data_file, max_vcb_size):

    vocab = Dictionary()
    with open(data_file, 'r') as f:
        for sent in f.readlines():
            sent = sent.strip()
            for word in sent.split():
                vocab.add(word)

    # vocab.write_into_file('all.vocab')

    words_cnt = sum(vocab.freq.values())
    new_vocab, new_words_cnt = vocab.keep_vocab_size(max_vcb_size)
    wlog('|Final vocabulary| / |Original vocabulary| = {} / {} = {:4.2f}%'
         .format(new_words_cnt, words_cnt, (new_words_cnt/words_cnt) * 100))

    return new_vocab
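
Note that the Dictionary used here (and in Examples 5 and 6) is a different class from the one in Examples 1 and 3: it counts word frequencies instead of wrapping an existing word2id mapping. A minimal sketch of the interface count_vocab assumes (add, freq and keep_vocab_size; write_into_file and load_from_file omitted), offered as an illustration rather than the project's actual class:

from collections import Counter

class Dictionary(object):
    """Hypothetical counting dictionary matching the calls in count_vocab."""

    def __init__(self):
        self.freq = Counter()  # word -> raw count

    def add(self, word):
        self.freq[word] += 1

    def keep_vocab_size(self, max_vcb_size):
        # keep only the max_vcb_size most frequent words and report how many
        # token occurrences the trimmed vocabulary still covers
        kept = self.freq.most_common(max_vcb_size)
        new_vocab = Dictionary()
        new_vocab.freq = Counter(dict(kept))
        return new_vocab, sum(count for _, count in kept)
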
Example 3
def read_txt_embeddings(args, source, full_vocab):
    """
    Reload pretrained embeddings from a text file.
    """
    word2id = {}
    vectors = []

    # load pretrained embeddings
    lang = args.src_lang if source else args.tgt_lang
    emb_path = args.src_emb_path if source else args.tgt_emb_path
    _emb_dim_file = args.emb_dim
    with io.open(emb_path,
                 'r',
                 encoding='utf-8',
                 newline='\n',
                 errors='ignore') as f:
        for i, line in enumerate(f):
            if i == 0:
                split = line.split()
                assert len(split) == 2
                assert _emb_dim_file == int(split[1])
            else:
                word, vect = line.rstrip().split(' ', 1)
                if not full_vocab:
                    word = word.lower()
                vect = np.fromstring(vect, sep=' ')
                if np.linalg.norm(vect) == 0:  # avoid all-zero embeddings
                    vect[0] = 0.01
                if word in word2id:
                    if full_vocab:
                        print("Word '%s' found twice in %s embedding file" %
                              (word, 'source' if source else 'target'))
                else:
                    if vect.shape != (_emb_dim_file, ):
                        print(
                            "Invalid dimension (%i) for %s word '%s' in line %i."
                            % (vect.shape[0], 'source' if source else 'target',
                               word, i))
                        continue
                    assert vect.shape == (_emb_dim_file, ), i
                    word2id[word] = len(word2id)
                    vectors.append(vect[None])
            if args.max_vocab > 0 and len(word2id) >= args.max_vocab and not full_vocab:
                break

    assert len(word2id) == len(vectors)
    print("Loaded %i pre-trained word embeddings." % len(vectors))

    # compute new vocabulary / embeddings
    id2word = {v: k for k, v in word2id.items()}
    dico = Dictionary(id2word, word2id, lang)
    embeddings = np.concatenate(vectors, 0)
    # embeddings = torch.from_numpy(embeddings).float()

    assert embeddings.shape == (len(dico), args.emb_dim)
    return dico, embeddings
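
The file is expected to be in the standard fastText/word2vec text format: a "vocab_size dim" header line followed by one "word v1 ... vd" line per entry. A quick smoke test with a toy file (args is a stand-in namespace and toy.vec an illustrative path; the project's Dictionary class still has to be importable):

import io
from types import SimpleNamespace

# write a tiny 2-word, 3-dimensional embedding file
with io.open('toy.vec', 'w', encoding='utf-8') as f:
    f.write(u'2 3\n')
    f.write(u'hello 0.1 0.2 0.3\n')
    f.write(u'world 0.4 0.5 0.6\n')

args = SimpleNamespace(src_lang='en', tgt_lang='es',
                       src_emb_path='toy.vec', tgt_emb_path='toy.vec',
                       emb_dim=3, max_vocab=0)
dico, embeddings = read_txt_embeddings(args, source=True, full_vocab=True)
print(embeddings.shape)  # expected: (2, 3)
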
Example 4
def load_embeddings(args, source: bool):
    """
    Reload pretrained embeddings from a fastText binary file or word2vec file.
    """
    lang = args.src_lang if source else args.tgt_lang
    max_vocab = args.max_vocab

    emb_path = args.src_emb_path if source else args.tgt_emb_path
    if emb_path.endswith('.bin'):
        model = load_fasttext_model(emb_path)
        words, freqs = model.get_labels(include_freq=True)
        assert model.get_dimension() == args.emb_dim

        print("Loaded binary model. Generating embeddings ...")
        embeddings = np.concatenate(
            [model.get_word_vector(w)[None] for w in words], 0)

    elif emb_path.endswith(('.vec', '.vec.gz')):
        words, embeddings = read_txt_embeddings(args, source, True)
        freqs = load_txt_counts(args, source, words)

    else:
        raise Exception('Unknown embeddings file format: "%s"' % emb_path)

    print("Generated embeddings for %i words." % len(words))
    assert embeddings.shape == (len(words), args.emb_dim)

    # select a subset of word embeddings (to deal with casing)
    # stop words might have been removed from freqs and train_indexes
    word2id, indexes, freqs = select_subset(words, max_vocab, freqs, lang=lang)

    word_dist = None
    if 'smooth_c' in args:
        # smooth the frequency
        word_dist = cal_empiral_freqs(np.array(freqs), args.smooth_c)

        # remove stop words out of these top words
        if 'src_train_most_frequent' in args and 'tgt_train_most_frequent' in args:
            mf = args.src_train_most_frequent if source else args.tgt_train_most_frequent
            if mf > 0:
                word_dist = word_dist[:mf] / word_dist[:mf].sum()

    embeddings = embeddings[indexes]
    id2word = {i: w for w, i in word2id.items()}

    # create the dictionary
    dico = Dictionary(id2word, word2id, lang, word_dist)

    assert embeddings.shape == (len(dico), args.emb_dim)
    print(f"Number of words in {lang} = {len(dico)}")
    if 'smooth_c' in args:
        print("Max frequency = %.7f, min frequency = %.7f" %
              (max(word_dist), min(word_dist)))

    return dico, embeddings, word_dist
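
cal_empiral_freqs is not shown either. One plausible reading, consistent with word_dist being used as a probability distribution below, is frequency smoothing by exponentiation and renormalization (compare the 0.75 exponent used for word2vec negative sampling); treat this as an assumption about the helper, not its actual code:

import numpy as np

def cal_empiral_freqs(freqs, smooth_c):
    # hypothetical sketch: raise raw counts to the power smooth_c and
    # renormalize so the smoothed values sum to 1
    probs = np.asarray(freqs, dtype=np.float64) ** smooth_c
    return probs / probs.sum()
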
Example 5
def count_vocab(data_file, max_vcb_size, max_seq_len=50, char=False):

    assert data_file and os.path.exists(data_file), 'need file to extract vocabulary ...'

    vocab = Dictionary()
    #with open(data_file, 'r') as f:
    with io.open(data_file, encoding='utf-8') as f:
        for sent in f.readlines():
            sent = sent.strip()
            words = zh_to_chars(sent) if char else sent.split()
            if len(words) > max_seq_len:
                continue
            for word in words:
                vocab.add(word)

    words_cnt = sum(vocab.freq.values())
    new_vocab, new_words_cnt = vocab.keep_vocab_size(max_vcb_size)
    wlog('|Final vocabulary| / |Original vocabulary| = {} / {} = {:4.2f}%'
         .format(new_words_cnt, words_cnt, (new_words_cnt/words_cnt) * 100))

    return new_vocab
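
zh_to_chars is assumed to perform character-level segmentation of Chinese sentences; a minimal sketch under that assumption:

def zh_to_chars(sent):
    # hypothetical sketch: split a sentence into individual non-space
    # characters, the usual character-level tokenization for Chinese
    return [ch for ch in sent if not ch.isspace()]
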
Example 6
def extract_vocab(data_file, vocab_file, max_vcb_size=30000, max_seq_len=50):

    if os.path.exists(vocab_file):
        # the vocab file already exists, so just reload the word dictionary
        wlog('Load dictionary from file {}'.format(vocab_file))
        vocab = Dictionary()
        vocab.load_from_file(vocab_file)
    else:
        vocab = count_vocab(data_file, max_vcb_size, max_seq_len)
        vocab.write_into_file(vocab_file)
        wlog('Save dictionary file into {}'.format(vocab_file))

    return vocab
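
Typical usage (paths are illustrative): the first call builds the vocabulary from the corpus and caches it on disk, later calls simply reload the cached file.

# builds train.zh.vocab on the first run, reloads it on subsequent runs
vocab = extract_vocab('train.zh', 'train.zh.vocab',
                      max_vcb_size=30000, max_seq_len=50)
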
Example 7
def load_bin_embeddings(args, source: bool):
    """
    Reload pretrained embeddings from a fastText binary file.
    """
    # reload fastText binary file
    lang = args.src_lang if source else args.tgt_lang
    # remove stop words out of these top words
    mf = args.src_train_most_frequent if source else args.tgt_train_most_frequent
    max_vocab = args.max_vocab

    model = load_fasttext_model(
        args.src_emb_path if source else args.tgt_emb_path)
    words, freqs = model.get_labels(include_freq=True)
    assert model.get_dimension() == args.emb_dim
    print("Loaded binary model. Generating embeddings ...")
    embeddings = np.concatenate(
        [model.get_word_vector(w)[None] for w in words], 0)
    print("Generated embeddings for %i words." % len(words))
    assert embeddings.shape == (len(words), args.emb_dim)

    # select a subset of word embeddings (to deal with casing)
    # stop words might have been removed from freqs and train_indexes
    word2id, indexes, freqs = select_subset(words, max_vocab, freqs, lang=lang)
    # smooth the frequency
    word_dist = cal_empiral_freqs(np.array(freqs), args.smooth_c)
    embeddings = embeddings[indexes]

    id2word = {i: w for w, i in word2id.items()}

    if mf > 0:
        word_dist = word_dist[:mf] / word_dist[:mf].sum()

    dico = Dictionary(id2word, word2id, lang, word_dist)

    assert embeddings.shape == (len(dico), args.emb_dim)
    print(f"Number of words in {lang} = {len(dico)}", len(word_dist))
    print("Max frequency = %.7f, min frequency = %.7f" %
          (max(word_dist), min(word_dist)))

    return dico, embeddings, word_dist
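
The mf cutoff above keeps only the probability mass of the mf most frequent words and renormalizes it into a new distribution; a quick numeric check of that step:

import numpy as np

word_dist = np.array([0.5, 0.3, 0.1, 0.1])
mf = 2
truncated = word_dist[:mf] / word_dist[:mf].sum()
print(truncated)        # [0.625 0.375]
print(truncated.sum())  # 1.0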