Code example #1
from web.embeddings import (fetch_GloVe, fetch_FastText, fetch_SG_GoogleNews,
                            fetch_LexVec, fetch_HPCA, fetch_HDC)


def choose_pretrain_embeddings(model_name, norm=False):
    """Fetch the pre-trained embedding selected by ``model_name``."""
    if model_name == "GLOVE":
        embeddings = fetch_GloVe(corpus="wiki-6B", dim=300)
    elif model_name == "FASTTEXT":
        embeddings = fetch_FastText(normalize=norm)
    elif model_name == "SKIPGRAM":
        embeddings = fetch_SG_GoogleNews(normalize=norm)
    elif model_name == "LEXVEC":
        embeddings = fetch_LexVec(normalize=norm)
    elif model_name == "HPCA":
        embeddings = fetch_HPCA(normalize=norm)
    elif model_name == "HDC":
        embeddings = fetch_HDC(normalize=norm)
    else:
        raise ValueError("Unknown embedding model: {}".format(model_name))
    return embeddings
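
# Usage sketch (not part of the original snippet): fetch one of the models by
# name and look up a single word vector. Assumes the `web`
# (word-embeddings-benchmarks) package is installed and that its Embedding
# object supports word indexing; the word "king" is only an illustration.
w = choose_pretrain_embeddings("GLOVE")
print(w["king"].shape)  # e.g. (300,) for the 300-dimensional GloVe vectors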
Code example #2
parser.add_option(
    "-c",
    "--clean_words",
    dest="clean_words",
    help="Clean_words argument passed to the load_embedding function. If set to True, "
    "it will remove most of the non-alphanumeric characters, which should speed up "
    "evaluation.",
    default=False)

if __name__ == "__main__":
    (options, args) = parser.parse_args()

    # Load embeddings
    fname = options.filename
    if not fname:
        w = fetch_GloVe(corpus="wiki-6B", dim=300)
    else:
        if not os.path.isabs(fname):
            fname = os.path.join(_get_dataset_dir(), fname)

        format = options.format

        if not format:
            _, ext = os.path.splitext(fname)
            if ext == ".bin":
                format = "word2vec_bin"
            elif ext == ".txt":
                format = "word2vec"
            elif ext == ".pkl":
                format = "dict"
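
        # Sketch of the step this snippet is truncated before (an assumption,
        # not copied from the original script): load the embedding file with
        # the detected format. Assumes `from web.embeddings import
        # load_embedding`; normalize=True is an illustrative default.
        w = load_embedding(fname, format=format, normalize=True,
                           clean_words=options.clean_words)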
Code example #3
    logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s',
                        level=logging.DEBUG,
                        datefmt='%I:%M:%S')

    if args.incremental:

        model_names = ["SKIPGRAM", "GLOVE", "FASTTEXT", "LEXVEC", "HPCA", "HDC"]
        word_embeddings = dict.fromkeys(model_names)
        unique_words = get_unique_words(tasks)

        for model_name in word_embeddings.keys():
            embeddings = choose_pretrain_embeddings(model_name)
            word_embeddings = get_wordvec_dictionary(unique_words, model_name,
                                                     word_embeddings,
                                                     embeddings)
            # Free the fetched embedding before loading the next model
            del embeddings

        if args.save_vecs is not None:
            save_embeddings(word_embeddings, 'analogy_word_embeddings')

    else:
        # Fetch the pre-trained embeddings (warning: it might take a few minutes)
        embeddings = {
            "GLOVE": fetch_GloVe(corpus="wiki-6B", dim=300),
            "FASTTEXT": fetch_FastText(normalize=True),
            "SKIPGRAM": fetch_SG_GoogleNews(normalize=True),
            "LEXVEC": fetch_LexVec(),
        }
        if "MEN" in tasks or "MSR" in tasks:
            get_wordvec_dictionary(tasks, embeddings, save=args.ttype)
        else:
            get_downstream_dictionary(tasks, embeddings, save=args.ttype)
Code example #4
# -*- coding: utf-8 -*-

"""
 Simple example showing how to evaluate an embedding on similarity datasets
"""
import logging
from six import iteritems
from web.datasets.similarity import fetch_MEN, fetch_WS353, fetch_SimLex999
from web.embeddings import fetch_GloVe
from web.similarity import evaluate_similarity

# Configure logging
logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.DEBUG, datefmt='%I:%M:%S')

# Fetch GloVe embedding (warning: it might take a few minutes)
w_glove = fetch_GloVe(corpus="wiki-6B", dim=300)

# Define tasks
tasks = {
    "MEN": fetch_MEN(),
    "WS353": fetch_WS353(),
    "SIMLEX999": fetch_SimLex999()
}

# Print sample data
for name, data in iteritems(tasks):
    print("Sample data from {}: pair \"{}\" and \"{}\" is assigned score {}".format(name, data.X[0][0], data.X[0][1], data.y[0]))

# Calculate results using helper function
for name, data in iteritems(tasks):
    print("Spearman correlation of scores on {}: {}".format(name, evaluate_similarity(w_glove, data.X, data.y)))