Example #1
def fetch_conceptnet_numberbatch(clean_words=False):
    """
    Fetches ConceptNet Numberbatch embeddings. The published embeddings are already
    normalized to unit length and the vocabulary terms are lowercase, so no further
    normalization is applied on load.

    Parameters
    ----------
    clean_words: bool, default: False
      If True, will only keep alphanumeric characters and "_", "-"
      Warning: shouldn't be applied to embeddings with non-ASCII characters

    Returns
    -------
    w: Embedding
      Instance of Embedding class

    References
    ----------
    Published at https://github.com/commonsense/conceptnet-numberbatch
    Reference paper: Robert Speer, Joshua Chin, and Catherine Havasi (2017). "ConceptNet 5.5: An Open Multilingual Graph of General Knowledge." In Proceedings of AAAI 2017.
    """
    path = _fetch_file(
        url=
        'https://conceptnet.s3.amazonaws.com/downloads/2017/numberbatch/numberbatch-en-17.06.txt.gz',
        data_dir='embeddings',
        uncompress=False,
        verbose=1)
    return load_embedding(path,
                          format='word2vec',
                          normalize=False,
                          clean_words=clean_words)
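
A minimal usage sketch, assuming these fetchers are exposed in the web.embeddings module of the word-embeddings-benchmarks package (where this code appears to originate) and that the returned Embedding object supports dict-style vector lookup:

from web.embeddings import fetch_conceptnet_numberbatch  # assumed module path

# clean_words is left at False: Numberbatch's vocabulary contains non-ASCII
# terms, which the docstring warns against cleaning
w = fetch_conceptnet_numberbatch()
print(w["cat"][:5])  # first five dimensions of the vector for "cat"
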
Example #2
def fetch_HDC(dim=300, normalize=True, lower=False, clean_words=False):
    """
    Fetches HDC embeddings trained on Wikipedia by Fei Sun

    Parameters
    ----------
    dim: int, default: 300
      Dimensionality of embedding

    normalize: bool, default: True
      If True, will normalize all vectors to unit length

    lower: bool, default: False
      If True, will convert strings to lowercase

    clean_words: bool, default: False
      If True, will only keep alphanumeric characters and "_", "-"
      Warning: shouldn't be applied to embeddings with non-ASCII characters

    Returns
    -------
    w: Embedding
      Embedding instance

    References
    ----------
    Embeddings were published on http://ofey.me/projects/wordrep/.
    Reference paper: Fei Sun, Jiafeng Guo, Yanyan Lan, Jun Xu, and Xueqi Cheng.
    "Learning word representations by jointly modeling syntagmatic and paradigmatic relations." In Proceedings of ACL 2015.
    """

    url = {
        50:
        "https://www.dropbox.com/s/q22ssy8055loknz/wikicorp.201004-hdc-"
        "iter-20-alpha-0.025-window-10-dim-50-neg-10-subsample-0.0001.txt.bz2?dl=1",
        100:
        "https://www.dropbox.com/s/13226et55fi6g50/wikicorp.201004-hdc-"
        "iter-20-alpha-0.025-window-10-dim-100-neg-10-subsample-0.0001.txt.bz2?dl=1",
        300:
        "https://www.dropbox.com/s/jrfwel32yd8w0lu/wikicorp.201004-hdc-"
        "iter-20-alpha-0.025-window-10-dim-300-neg-10-subsample-0.0001.txt.bz2?dl=1"
    }
    assert dim in url, "Unavailable dimensionality"

    path = _fetch_file(url=url[dim],
                       data_dir="embeddings",
                       uncompress=False,
                       move="hdc/hdc{}.txt.bz2".format(dim),
                       verbose=1)

    return load_embedding(path,
                          format="word2vec",
                          normalize=normalize,
                          lower=lower,
                          clean_words=clean_words)
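
Usage sketch (same module-path assumption as in Example #1); only the dimensionalities listed in the url dict pass the assert:

from web.embeddings import fetch_HDC  # assumed module path

w = fetch_HDC(dim=100)  # 50, 100 or 300
# fetch_HDC(dim=200)    # would trip the "Unavailable dimensionality" assert
print(w.vectors.shape)  # assuming Embedding exposes a .vectors array: (vocab, 100)
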
Example #3
def fetch_LexVec(which="commoncrawl-W+C",
                 normalize=True,
                 lower=False,
                 clean_words=False):
    """
    Fetches LexVec embeddings

    Parameters
    ----------
    which: str, default: "commoncrawl-W+C"
      Can choose between "commoncrawl-W", "commoncrawl-W+C", "wikipedia+newscrawl-W", "wikipedia+newscrawl-W+C", "commoncrawl-ngramsubwords-W"

    normalize: bool, default: True
      If True, will normalize all vectors to unit length

    lower: bool, default: False
      If True, will convert strings to lowercase

    clean_words: bool, default: False
      If True, will only keep alphanumeric characters and "_", "-"
      Warning: shouldn't be applied to embeddings with non-ASCII characters

    Returns
    -------
    w: Embedding
      Instance of Embedding class

    References
    ----------
    Published at https://github.com/alexandres/lexvec
    Reference paper: Salle, Alexandre, Marco Idiart, and Aline Villavicencio. "Matrix Factorization using Window Sampling and Negative Sampling for Improved Word Representations." In Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL), 2016.
    """
    download_file = {
        "commoncrawl-W":
        "https://www.dropbox.com/s/flh1fjynqvdsj4p/lexvec.commoncrawl.300d.W.pos.vectors.gz?dl=1",
        "commoncrawl-W+C":
        "https://www.dropbox.com/s/zkiajh6fj0hm0m7/lexvec.commoncrawl.300d.W%2BC.pos.vectors.gz?dl=1",
        "wikipedia+newscrawl-W":
        "https://www.dropbox.com/s/kguufyc2xcdi8yk/lexvec.enwiki%2Bnewscrawl.300d.W.pos.vectors.gz?dl=1",
        "wikipedia+newscrawl-W+C":
        "https://www.dropbox.com/s/u320t9bw6tzlwma/lexvec.enwiki%2Bnewscrawl.300d.W%2BC.pos.vectors.gz?dl=1",
        "commoncrawl-ngramsubwords-W":
        "https://www.dropbox.com/s/mrxn933chn5u37z/lexvec.commoncrawl.ngramsubwords.300d.W.pos.vectors.gz?dl=1"
    }

    path = _fetch_file(url=download_file[which],
                       data_dir="embeddings",
                       verbose=1)

    return load_embedding(path,
                          format="word2vec",
                          normalize=normalize,
                          lower=lower,
                          clean_words=clean_words)
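
Usage sketch (same assumptions); the valid values of which are exactly the keys of download_file:

from web.embeddings import fetch_LexVec  # assumed module path

w = fetch_LexVec(which="wikipedia+newscrawl-W+C")
print(w.vectors.shape)  # all published LexVec variants above are 300-dimensional
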
Example #4
def fetch_NMT(which="DE", normalize=True, lower=False, clean_words=False):
    """
    Fetches word embeddings induced by a neural machine translation (NMT) model

    Parameters
    ----------
    which: str, default: "DE"
      Can choose between DE and FR, which fetches accordingly EN -> DE or EN -> FR translation
      induced word embeddings

    normalize: bool, default: True
      If true will normalize all vector to unit length

    clean_words: bool, default: True
      If true will only keep alphanumeric characters and "_", "-"
      Warning: shouldn't be applied to embeddings with non-ascii characters

    load_kwargs:
      Additional parameters passed to load function. Mostly useful for 'glove' format where you
      should pass vocab_size and dim.

    Returns
    -------
    w: Embedding
      Instance of Embedding class

    References
    ----------
    Published at https://www.cl.cam.ac.uk/~fh295/.
    Reference paper: Hill, Cho et al., "Embedding Word Similarity With Neural Machine Translation", 2014
    """
    # validate the requested variant before downloading the archive
    assert which in ["DE", "FR"], "Unrecognized which parameter"

    dirname = _fetch_file(url="https://www.cl.cam.ac.uk/~fh295/TEmbz.tar.gz",
                          data_dir="embeddings",
                          uncompress=True,
                          verbose=1)

    fname = {
        "FR": "Trans_embds/D_RNN_500k_144h.pkl",
        "DE": "Trans_embds/D_german_50k_500k_168h.pkl"
    }

    return load_embedding(path.join(dirname, fname[which]),
                          format="dict",
                          normalize=normalize,
                          lower=lower,
                          clean_words=clean_words)
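
Usage sketch (same assumptions); since normalize=True by default, returned vectors should be unit-length:

import numpy as np
from web.embeddings import fetch_NMT  # assumed module path

w = fetch_NMT(which="DE")   # EN -> DE induced embeddings
vec = w["house"]            # assumes "house" is in the model's vocabulary
print(np.linalg.norm(vec))  # ~1.0 after unit-length normalization
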
Example #5
def fetch_morphoRNNLM(which, normalize=True, lower=False, clean_words=False):
    """
    Fetches recursive morphological neural network embeddings

    Parameters
    ----------
    which: str, default: "CW"
      Can choose between CW and HSMN

    normalize: bool, default: True
      If true will normalize all vector to unit length

    clean_words: bool, default: True
      If true will only keep alphanumeric characters and "_", "-"
      Warning: shouldn't be applied to embeddings with non-ascii characters

    load_kwargs:
      Additional parameters passed to load function. Mostly useful for 'glove' format where you
      should pass vocab_size and dim.

    Returns
    -------
    w: Embedding
      Instance of Embedding class

    References
    ----------
    Published at http://stanford.edu/~lmthang/morphoNLM/
    Reference paper: Luong, Socher et al., "Better Word Representations with Recursive Neural Networks for Morphology", 2013
    """
    download_file = {
        "CW": "https://www.dropbox.com/s/7fdj2666iqv4xbu/cwCsmRNN.bin.gz?dl=1",
        "HSMN":
        "https://www.dropbox.com/s/okw1i6kc6e2jd1q/hsmnCsmRNN.bin.gz?dl=1"
    }

    assert which in download_file, "Unrecognized which parameter"

    path = _fetch_file(url=download_file[which],
                       data_dir="embeddings",
                       uncompress=False,
                       verbose=1)

    return load_embedding(path,
                          format="word2vec_bin",
                          normalize=normalize,
                          lower=lower,
                          clean_words=clean_words)
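
Usage sketch (same assumptions); note that which has no default and must be passed explicitly:

from web.embeddings import fetch_morphoRNNLM  # assumed module path

w_cw = fetch_morphoRNNLM(which="CW")    # cwCsmRNN vectors
w_hs = fetch_morphoRNNLM(which="HSMN")  # hsmnCsmRNN vectors
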
Example #6
def fetch_HPCA(which, normalize=True, lower=False, clean_words=False):
    """
    Fetches Hellinger PCA based embeddings

    Parameters
    ----------
    which: str, default: "autoencoder_phrase_hpca"
      Can choose between "hpca" and "autoencoder_phrase_hpca" (from "The Sum of Its Parts")

    normalize: bool, default: True
      If true will normalize all vector to unit length

    clean_words: bool, default: True
      If true will only keep alphanumeric characters and "_", "-"
      Warning: shouldn't be applied to embeddings with non-ascii characters

    load_kwargs:
      Additional parameters passed to load function. Mostly useful for 'glove' format where you
      should pass vocab_size and dim.

    Returns
    -------
    w: Embedding
      Instance of Embedding class

    References
    ----------
    Published at http://lebret.ch/words/
    Reference paper: Lebret and Collobert, "'The Sum of Its Parts': Joint Learning of Word and Phrase Representations with Autoencoders", 2015
    """
    download_file = {
        "autoencoder_phrase_hpca":
        "https://www.dropbox.com/s/6dyf48crdmjbw1a/AHPCA.bin.gz?dl=1",
        "hpca": "https://www.dropbox.com/s/5y5l6vyn8yn11dv/HPCA.bin.gz?dl=1"
    }

    assert which in download_file, "Unrecognized which parameter"

    path = _fetch_file(url=download_file[which],
                       data_dir="embeddings",
                       uncompress=False,
                       verbose=1)

    return load_embedding(path,
                          format="word2vec_bin",
                          normalize=normalize,
                          lower=lower,
                          clean_words=clean_words)
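
Usage sketch (same assumptions); which again has no default:

from web.embeddings import fetch_HPCA  # assumed module path

w = fetch_HPCA(which="hpca")                        # plain Hellinger PCA vectors
w_ae = fetch_HPCA(which="autoencoder_phrase_hpca")  # autoencoder variant from the paper
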
Example #7
def fetch_FastText(lang="en", normalize=True, lower=False, clean_words=False):
    """
       Fetches fastText embeddings

       Parameters
       ----------
       lang: str, default: "en"
         Can choose between all accessible language on page:
         https://fasttext.cc/docs/en/pretrained-vectors.html#content

       normalize: bool, default: True
         If true will normalize all vector to unit length

       lower: bool, default: False
         If true, will convert string to lowercase

       clean_words: bool, default: False
         If true will only keep alphanumeric characters and "_", "-"
         Warning: shouldn't be applied to embeddings with non-ascii characters

       Returns
       -------
       w: Embedding
         Instance of Embedding class

       References
       ----------
       Published at https://fasttext.cc/
       """

    url_vec = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.{}.vec'.format(
        lang)

    path = _fetch_file(url=url_vec,
                       data_dir='embeddings',
                       uncompress=False,
                       verbose=1)

    return load_embedding(path,
                          format='word2vec',
                          normalize=normalize,
                          lower=lower,
                          clean_words=clean_words)
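
Usage sketch (same assumptions); lang takes any code from the fastText pretrained-vectors page:

from web.embeddings import fetch_FastText  # assumed module path

w_en = fetch_FastText(lang="en")
w_de = fetch_FastText(lang="de")  # non-English: keep clean_words=False, per the warning
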
Example #8
def fetch_SG_GoogleNews(normalize=True, lower=False, clean_words=False):
    """
    Fetches SG (skip-gram with negative sampling) embeddings trained on the
    Google News dataset, as published on the word2vec website

    Parameters
    ----------
    normalize: bool, default: True
      If True, will normalize all vectors to unit length

    lower: bool, default: False
      If True, will convert strings to lowercase

    clean_words: bool, default: False
      If True, will only keep alphanumeric characters and "_", "-"
      Warning: shouldn't be applied to embeddings with non-ASCII characters

    Returns
    -------
    w: Embedding
      Instance of Embedding class

    References
    ----------
    Original source: https://code.google.com/p/word2vec/
    """
    path = _fetch_file(
        url=
        "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz",
        data_dir="embeddings",
        verbose=1)
    return load_embedding(path,
                          format="word2vec_bin",
                          normalize=normalize,
                          lower=lower,
                          clean_words=clean_words)
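
Usage sketch (same assumptions); the archive is roughly 1.5 GB, so the first call takes a while:

import numpy as np
from web.embeddings import fetch_SG_GoogleNews  # assumed module path

w = fetch_SG_GoogleNews()
print(np.linalg.norm(w["dog"]))  # ~1.0: vectors are normalized by default
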
Example #9
def fetch_GloVe(dim=300,
                corpus="wiki-6B",
                normalize=True,
                lower=False,
                clean_words=False):
    """
    Fetches GloVe embeddings.

    Parameters
    ----------
    dim: int, default: 300
      Dimensionality of embedding (usually performance increases with dimensionality).
      Available dimensionalities:
        * wiki-6B: 50, 100, 200, 300
        * common-crawl-42B: 300
        * common-crawl-840B: 300
        * twitter-27B: 25, 50, 100, 200

    corpus: string, default: "wiki-6B"
      Corpus that GloVe vector were trained on.
      Available corpuses: "wiki-6B", "common-crawl-42B", "common-crawl-840B", "twitter-27B"

    normalize: bool, default: True
      If true will normalize all vector to unit length

    clean_words: bool, default: True
      If true will only keep alphanumeric characters and "_", "-"
      Warning: shouldn't be applied to embeddings with non-ascii characters

    load_kwargs:
      Additional parameters passed to load function. Mostly useful for 'glove' format where you
      should pass vocab_size and dim.

    Returns
    -------
    w: Embedding
      Embedding instance

    References
    ----------
    Project website: http://nlp.stanford.edu/projects/glove/

    Notes
    -----
    Loading the GloVe text format can take a while
    """
    download_file = {
        "wiki-6B": "http://nlp.stanford.edu/data/glove.6B.zip",
        "common-crawl-42B": "http://nlp.stanford.edu/data/glove.42B.300d.zip",
        "common-crawl-840B":
        "http://nlp.stanford.edu/data/glove.840B.300d.zip",
        "twitter-27B": "http://nlp.stanford.edu/data/glove.twitter.27B.zip"
    }

    embedding_file = {
        "wiki-6B": {
            50: "glove.6B/glove.6B.50d.txt",
            100: "glove.6B/glove.6B.100d.txt",
            200: "glove.6B/glove.6B.200d.txt",
            300: "glove.6B/glove.6B.300d.txt"
        },
        "common-crawl-42B": {
            300: "glove.42B.300d/glove.42B.300d.txt"
        },
        "common-crawl-840B": {
            300: "glove.840B.300d/glove.840B.300d.txt"
        },
        "twitter-27B": {
            25: "glove.twitter.27B/glove.twitter.27B.25d.txt",
            50: "glove.twitter.27B/glove.twitter.27B.50d.txt",
            100: "glove.twitter.27B/glove.twitter.27B.100d.txt",
            200: "glove.twitter.27B/glove.twitter.27B.200d.txt",
        }
    }

    vocab_size = {
        "wiki-6B": 400000,
        "common-crawl-42B": 1917494,
        "common-crawl-840B": 2196017,
        "twitter-27B": 1193514
    }

    assert corpus in download_file, "Unrecognized corpus"
    assert dim in embedding_file[corpus], "Unavailable dimensionality"

    _ = _fetch_file(url=download_file[corpus],
                    data_dir="embeddings",
                    uncompress=True,
                    verbose=1)

    return load_embedding(path.join(_get_dataset_dir("embeddings"),
                                    embedding_file[corpus][dim]),
                          format="glove",
                          normalize=normalize,
                          lower=lower,
                          clean_words=clean_words,
                          # glove files have no header row, so the loader needs the
                          # vocabulary size and dimensionality passed explicitly
                          load_kwargs={"vocab_size": vocab_size[corpus],
                                       "dim": dim})
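
Usage sketch (same assumptions); corpus and dim must form one of the combinations listed in the docstring:

from web.embeddings import fetch_GloVe  # assumed module path

w = fetch_GloVe(corpus="wiki-6B", dim=100)        # wiki-6B: 50/100/200/300
w_tw = fetch_GloVe(corpus="twitter-27B", dim=25)  # twitter-27B: 25/50/100/200
# fetch_GloVe(corpus="common-crawl-42B", dim=100) # fails: only 300d was published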