def fetch_conceptnet_numberbatch(clean_words=False):
    """
    Fetches ConceptNet Numberbatch embeddings. Embeddings are normalized to unit length,
    and the vocabulary terms are lowercase.

    Parameters
    ----------
    clean_words: bool, default: False
      If true will only keep alphanumeric characters and "_", "-"
      Warning: shouldn't be applied to embeddings with non-ascii characters

    Returns
    -------
    w: Embedding
      Instance of Embedding class

    References
    ----------
    Published at https://github.com/commonsense/conceptnet-numberbatch

    Reference paper: Robert Speer, Joshua Chin, and Catherine Havasi (2017).
    "ConceptNet 5.5: An Open Multilingual Graph of General Knowledge."
    In proceedings of AAAI 2017.
    """
    path = _fetch_file(
        url='https://conceptnet.s3.amazonaws.com/downloads/2017/numberbatch/numberbatch-en-17.06.txt.gz',
        data_dir='embeddings',
        uncompress=False,
        verbose=1)

    # Vectors are published already normalized, so normalize is left off here.
    return load_embedding(path,
                          format='word2vec',
                          normalize=False,
                          clean_words=clean_words)
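# Usage sketch: the gzipped text file is downloaded and cached under the
# "embeddings" data dir on first call. The dict-style lookup below assumes the
# Embedding class exposes __getitem__ (an assumption about the surrounding
# package, not something this function guarantees).
#
# >>> w = fetch_conceptnet_numberbatch()
# >>> w["dog"]          # vocabulary is lowercase; vectors are unit length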
def fetch_HDC(dim=300, normalize=True, lower=False, clean_words=False):
    """
    Fetches HDC embeddings trained on wiki by Fei Sun

    Parameters
    ----------
    dim: int, default: 300
      Dimensionality of embedding. Available dimensionalities: 50, 100, 300

    normalize: bool, default: True
      If true will normalize all vectors to unit length

    lower: bool, default: False
      If true, will convert string to lowercase

    clean_words: bool, default: False
      If true will only keep alphanumeric characters and "_", "-"
      Warning: shouldn't be applied to embeddings with non-ascii characters

    Returns
    -------
    w: Embedding
      Embedding instance

    References
    ----------
    Embeddings were published on http://ofey.me/projects/wordrep/.

    Reference paper: Fei Sun, Jiafeng Guo, Yanyan Lan, Jun Xu, and Xueqi Cheng.
    "Learning word representations by jointly modeling syntagmatic and
    paradigmatic relations"
    """
    url = {
        50: "https://www.dropbox.com/s/q22ssy8055loknz/wikicorp.201004-hdc-"
            "iter-20-alpha-0.025-window-10-dim-50-neg-10-subsample-0.0001.txt.bz2?dl=1",
        100: "https://www.dropbox.com/s/13226et55fi6g50/wikicorp.201004-hdc-"
             "iter-20-alpha-0.025-window-10-dim-100-neg-10-subsample-0.0001.txt.bz2?dl=1",
        300: "https://www.dropbox.com/s/jrfwel32yd8w0lu/wikicorp.201004-hdc-"
             "iter-20-alpha-0.025-window-10-dim-300-neg-10-subsample-0.0001.txt.bz2?dl=1"
    }
    assert dim in url, "Unavailable dimensionality"

    path = _fetch_file(url=url[dim],
                       data_dir="embeddings",
                       uncompress=False,
                       move="hdc/hdc{}.txt.bz2".format(dim),
                       verbose=1)

    return load_embedding(path,
                          format="word2vec",
                          normalize=normalize,
                          lower=lower,
                          clean_words=clean_words)
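# Usage sketch: only the dimensionalities listed in the url dict are
# published, so anything else trips the assert before any download starts.
#
# >>> w = fetch_HDC(dim=100)    # fetches the 100-d HDC vectors
# >>> fetch_HDC(dim=200)        # AssertionError: Unavailable dimensionality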
def fetch_LexVec(which="commoncrawl-W+C", normalize=True, lower=False, clean_words=False):
    """
    Fetches LexVec embeddings

    Parameters
    ----------
    which: str, default: "commoncrawl-W+C"
      Can choose between "commoncrawl-W", "commoncrawl-W+C", "wikipedia+newscrawl-W",
      "wikipedia+newscrawl-W+C", "commoncrawl-ngramsubwords-W"

    normalize: bool, default: True
      If true will normalize all vectors to unit length

    lower: bool, default: False
      If true, will convert string to lowercase

    clean_words: bool, default: False
      If true will only keep alphanumeric characters and "_", "-"
      Warning: shouldn't be applied to embeddings with non-ascii characters

    Returns
    -------
    w: Embedding
      Instance of Embedding class

    References
    ----------
    Published at https://github.com/alexandres/lexvec

    Reference paper: Salle, Alexandre, Marco Idiart, and Aline Villavicencio.
    "Matrix Factorization using Window Sampling and Negative Sampling for
    Improved Word Representations." The 54th Annual Meeting of the
    Association for Computational Linguistics. 2016.
    """
    download_file = {
        "commoncrawl-W":
            "https://www.dropbox.com/s/flh1fjynqvdsj4p/lexvec.commoncrawl.300d.W.pos.vectors.gz?dl=1",
        "commoncrawl-W+C":
            "https://www.dropbox.com/s/zkiajh6fj0hm0m7/lexvec.commoncrawl.300d.W%2BC.pos.vectors.gz?dl=1",
        "wikipedia+newscrawl-W":
            "https://www.dropbox.com/s/kguufyc2xcdi8yk/lexvec.enwiki%2Bnewscrawl.300d.W.pos.vectors.gz?dl=1",
        "wikipedia+newscrawl-W+C":
            "https://www.dropbox.com/s/u320t9bw6tzlwma/lexvec.enwiki%2Bnewscrawl.300d.W%2BC.pos.vectors.gz?dl=1",
        "commoncrawl-ngramsubwords-W":
            "https://www.dropbox.com/s/mrxn933chn5u37z/lexvec.commoncrawl.ngramsubwords.300d.W.pos.vectors.gz?dl=1"
    }
    assert which in download_file, "Unrecognized which parameter"

    path = _fetch_file(url=download_file[which], data_dir="embeddings", verbose=1)

    return load_embedding(path,
                          format="word2vec",
                          normalize=normalize,
                          lower=lower,
                          clean_words=clean_words)
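# Usage sketch: the variant is picked by key; following the LexVec release
# naming, "W" files hold word vectors only while "W+C" files hold word plus
# context vectors.
#
# >>> w = fetch_LexVec(which="wikipedia+newscrawl-W")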
def fetch_NMT(which="DE", normalize=True, lower=False, clean_words=False):
    """
    Fetches word embeddings induced by Neural Machine Translation

    Parameters
    ----------
    which: str, default: "DE"
      Can choose between "DE" and "FR", which fetches word embeddings induced
      by EN -> DE or EN -> FR translation, accordingly

    normalize: bool, default: True
      If true will normalize all vectors to unit length

    lower: bool, default: False
      If true, will convert string to lowercase

    clean_words: bool, default: False
      If true will only keep alphanumeric characters and "_", "-"
      Warning: shouldn't be applied to embeddings with non-ascii characters

    Returns
    -------
    w: Embedding
      Instance of Embedding class

    References
    ----------
    Published at https://www.cl.cam.ac.uk/~fh295/.

    Reference paper: Hill, Cho et al., "Embedding Word Similarity With Neural
    Machine Translation", 2014
    """
    assert which in ["DE", "FR"], "Unrecognized which parameter"

    dirname = _fetch_file(url="https://www.cl.cam.ac.uk/~fh295/TEmbz.tar.gz",
                          data_dir="embeddings",
                          uncompress=True,
                          verbose=1)

    fname = {
        "FR": "Trans_embds/D_RNN_500k_144h.pkl",
        "DE": "Trans_embds/D_german_50k_500k_168h.pkl"
    }

    return load_embedding(path.join(dirname, fname[which]),
                          format="dict",
                          normalize=normalize,
                          lower=lower,
                          clean_words=clean_words)
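# Usage sketch: both variants live in the same TEmbz.tar.gz archive, so the
# download happens once and `which` only selects the pickle file to load.
#
# >>> w = fetch_NMT(which="FR")    # EN -> FR translation-induced embeddings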
def fetch_morphoRNNLM(which, normalize=True, lower=False, clean_words=False):
    """
    Fetches recursive morphological neural network embeddings

    Parameters
    ----------
    which: str
      Can choose between "CW" and "HSMN" (required, no default)

    normalize: bool, default: True
      If true will normalize all vectors to unit length

    lower: bool, default: False
      If true, will convert string to lowercase

    clean_words: bool, default: False
      If true will only keep alphanumeric characters and "_", "-"
      Warning: shouldn't be applied to embeddings with non-ascii characters

    Returns
    -------
    w: Embedding
      Instance of Embedding class

    References
    ----------
    Published at http://stanford.edu/~lmthang/morphoNLM/

    Reference paper: Luong, Socher et al., "Better Word Representations with
    Recursive Neural Networks for Morphology", 2013
    """
    download_file = {
        "CW": "https://www.dropbox.com/s/7fdj2666iqv4xbu/cwCsmRNN.bin.gz?dl=1",
        "HSMN": "https://www.dropbox.com/s/okw1i6kc6e2jd1q/hsmnCsmRNN.bin.gz?dl=1"
    }
    assert which in download_file, "Unrecognized which parameter"

    path = _fetch_file(url=download_file[which],
                       data_dir="embeddings",
                       uncompress=False,
                       verbose=1)

    return load_embedding(path,
                          format="word2vec_bin",
                          normalize=normalize,
                          lower=lower,
                          clean_words=clean_words)
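# Usage sketch: `which` has no default, so it must be passed explicitly; the
# two keys match the cwCsmRNN / hsmnCsmRNN files published on the morphoNLM
# page.
#
# >>> w = fetch_morphoRNNLM("CW")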
def fetch_HPCA(which, normalize=True, lower=False, clean_words=False):
    """
    Fetches Hellinger PCA based embeddings

    Parameters
    ----------
    which: str
      Can choose between "hpca" and "autoencoder_phrase_hpca" (the latter from
      "The Sum of Its Parts"); required, no default

    normalize: bool, default: True
      If true will normalize all vectors to unit length

    lower: bool, default: False
      If true, will convert string to lowercase

    clean_words: bool, default: False
      If true will only keep alphanumeric characters and "_", "-"
      Warning: shouldn't be applied to embeddings with non-ascii characters

    Returns
    -------
    w: Embedding
      Instance of Embedding class

    References
    ----------
    Published at http://lebret.ch/words/

    Reference paper: Lebret, Collobert et al., "'The Sum of Its Parts': Joint
    Learning of Word and Phrase Representations with Autoencoders", 2015
    """
    download_file = {
        "autoencoder_phrase_hpca": "https://www.dropbox.com/s/6dyf48crdmjbw1a/AHPCA.bin.gz?dl=1",
        "hpca": "https://www.dropbox.com/s/5y5l6vyn8yn11dv/HPCA.bin.gz?dl=1"
    }
    assert which in download_file, "Unrecognized which parameter"

    path = _fetch_file(url=download_file[which],
                       data_dir="embeddings",
                       uncompress=False,
                       verbose=1)

    return load_embedding(path,
                          format="word2vec_bin",
                          normalize=normalize,
                          lower=lower,
                          clean_words=clean_words)
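# Usage sketch: as with fetch_morphoRNNLM, `which` is required; "hpca" is the
# plain Hellinger PCA model and "autoencoder_phrase_hpca" the autoencoder
# variant from the referenced paper.
#
# >>> w = fetch_HPCA("hpca")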
def fetch_FastText(lang="en", normalize=True, lower=False, clean_words=False):
    """
    Fetches fastText embeddings

    Parameters
    ----------
    lang: str, default: "en"
      Can be any language code listed on the page:
      https://fasttext.cc/docs/en/pretrained-vectors.html#content

    normalize: bool, default: True
      If true will normalize all vectors to unit length

    lower: bool, default: False
      If true, will convert string to lowercase

    clean_words: bool, default: False
      If true will only keep alphanumeric characters and "_", "-"
      Warning: shouldn't be applied to embeddings with non-ascii characters

    Returns
    -------
    w: Embedding
      Instance of Embedding class

    References
    ----------
    Published at https://fasttext.cc/
    """
    url_vec = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.{}.vec'.format(lang)

    path = _fetch_file(url=url_vec,
                       data_dir='embeddings',
                       uncompress=False,
                       verbose=1)

    return load_embedding(path,
                          format='word2vec',
                          normalize=normalize,
                          lower=lower,
                          clean_words=clean_words)
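# Usage sketch: `lang` is interpolated straight into the wiki.{lang}.vec URL,
# so it must match a code from the fastText pretrained-vectors page; an
# unknown code only fails at download time, not up front.
#
# >>> w = fetch_FastText(lang="fr")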
def fetch_SG_GoogleNews(normalize=True, lower=False, clean_words=False):
    """
    Fetches SG (skip-gram with negative sampling) embeddings trained on
    GoogleNews dataset published on word2vec website

    Parameters
    ----------
    normalize: bool, default: True
      If true will normalize all vectors to unit length

    lower: bool, default: False
      If true, will convert string to lowercase

    clean_words: bool, default: False
      If true will only keep alphanumeric characters and "_", "-"
      Warning: shouldn't be applied to embeddings with non-ascii characters

    Returns
    -------
    w: Embedding
      Instance of Embedding class

    References
    ----------
    Original source: https://code.google.com/p/word2vec/
    """
    path = _fetch_file(
        url="https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz",
        data_dir="embeddings",
        verbose=1)

    return load_embedding(path,
                          format="word2vec_bin",
                          normalize=normalize,
                          lower=lower,
                          clean_words=clean_words)
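# Usage sketch: this is a large binary download (roughly 1.5 GB, about 3M
# vocabulary entries), so the first call and each load can take a while and
# need several GB of RAM.
#
# >>> w = fetch_SG_GoogleNews(lower=True)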
def fetch_GloVe(dim=300, corpus="wiki-6B", normalize=True, lower=False, clean_words=False):
    """
    Fetches GloVe embeddings.

    Parameters
    ----------
    dim: int, default: 300
      Dimensionality of embedding (usually performance increases with dimensionality).
      Available dimensionalities:
        * wiki-6B: 50, 100, 200, 300
        * common-crawl-42B: 300
        * common-crawl-840B: 300
        * twitter-27B: 25, 50, 100, 200

    corpus: str, default: "wiki-6B"
      Corpus that GloVe vectors were trained on.
      Available corpora: "wiki-6B", "common-crawl-42B", "common-crawl-840B", "twitter-27B"

    normalize: bool, default: True
      If true will normalize all vectors to unit length

    lower: bool, default: False
      If true, will convert string to lowercase

    clean_words: bool, default: False
      If true will only keep alphanumeric characters and "_", "-"
      Warning: shouldn't be applied to embeddings with non-ascii characters

    Returns
    -------
    w: Embedding
      Embedding instance

    References
    ----------
    Project website: http://nlp.stanford.edu/projects/glove/

    Notes
    -----
    Loading GloVe format can take a while
    """
    download_file = {
        "wiki-6B": "http://nlp.stanford.edu/data/glove.6B.zip",
        "common-crawl-42B": "http://nlp.stanford.edu/data/glove.42B.300d.zip",
        "common-crawl-840B": "http://nlp.stanford.edu/data/glove.840B.300d.zip",
        "twitter-27B": "http://nlp.stanford.edu/data/glove.twitter.27B.zip"
    }

    embedding_file = {
        "wiki-6B": {
            50: "glove.6B/glove.6B.50d.txt",
            100: "glove.6B/glove.6B.100d.txt",
            200: "glove.6B/glove.6B.200d.txt",
            300: "glove.6B/glove.6B.300d.txt"
        },
        "common-crawl-42B": {
            300: "glove.42B.300d/glove.42B.300d.txt"
        },
        "common-crawl-840B": {
            300: "glove.840B.300d/glove.840B.300d.txt"
        },
        "twitter-27B": {
            25: "glove.twitter.27B/glove.twitter.27B.25d.txt",
            50: "glove.twitter.27B/glove.twitter.27B.50d.txt",
            100: "glove.twitter.27B/glove.twitter.27B.100d.txt",
            200: "glove.twitter.27B/glove.twitter.27B.200d.txt",
        }
    }

    vocab_size = {
        "wiki-6B": 400000,
        "common-crawl-42B": 1917494,
        "common-crawl-840B": 2196017,
        "twitter-27B": 1193514
    }

    assert corpus in download_file, "Unrecognized corpus"
    assert dim in embedding_file[corpus], "Unavailable dimensionality"

    _ = _fetch_file(url=download_file[corpus],
                    data_dir="embeddings",
                    uncompress=True,
                    verbose=1)

    # glove text files carry no header line, so the loader is given vocab_size
    # and dim explicitly via load_kwargs.
    return load_embedding(path.join(_get_dataset_dir("embeddings"), embedding_file[corpus][dim]),
                          format="glove",
                          normalize=normalize,
                          lower=lower,
                          clean_words=clean_words,
                          load_kwargs={"vocab_size": vocab_size[corpus], "dim": dim})
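# Usage sketch: dim and corpus must form a combination listed in the
# docstring, e.g. twitter-27B vectors only come in 25/50/100/200 dims.
#
# >>> w = fetch_GloVe(dim=50, corpus="twitter-27B")
# >>> fetch_GloVe(dim=300, corpus="twitter-27B")   # AssertionError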