# Example 1
def download_languages_bin(directory: AnyStr, language_codes: List[AnyStr]):
    """Download the required fastText word-embedding binaries.

    Takes a list of language codes as defined on
    https://fasttext.cc/docs/en/crawl-vectors.html and downloads the
    ``cc.<lang>.300.bin`` model for each one, in order, into *directory*
    (a sub-directory of the current working directory), deleting the
    leftover ``.gz`` archive after each download.

    Args:
        directory: Sub-directory of the current working directory in
            which to store the downloaded models.
        language_codes: fastText language codes to download, in order.
    """
    import os
    from contextlib import suppress

    root = os.getcwd()
    os.chdir(os.path.join(root, directory))
    try:
        for language in language_codes:
            download_model(language, if_exists='ignore')
            # Delete the compressed archive left behind by download_model.
            # os.remove is portable and cheaper than shelling out to `rm`;
            # suppress() mirrors `rm -f`'s tolerance of a missing file.
            with suppress(FileNotFoundError):
                os.remove(f"cc.{language}.300.bin.gz")
    finally:
        # Restore the original working directory even if a download fails;
        # the original left the process stranded inside `directory` on error.
        os.chdir(root)
# Example 2
def load_embeddings(filename, language="", reduced_dim=None):
    """Load pre-trained fastText word embeddings from *filename*.

    Args:
        filename: Path to a fastText ``.bin`` model file.
        language: Optional fastText language code; when non-empty, the
            official model for that language is downloaded first
            (skipped if already present).
        reduced_dim: Optional target embedding dimension; when truthy,
            the model is reduced in place to this dimension
            (must satisfy 0 < reduced_dim < 300).

    Returns:
        The loaded (and possibly dimension-reduced) fastText model.

    Raises:
        FileNotFoundError: If *filename* does not exist.
        AssertionError: If *reduced_dim* is out of range.
    """

    # Download the official fastText embedding distribution:
    if language:
        util.download_model(language, if_exists='ignore')

    # Load model. The original printed a message and fell through with
    # `model` unbound, causing a NameError below; re-raise with a clear
    # message instead. The f-string now actually interpolates *filename*.
    try:
        model = fasttext.load_model(filename)
    except FileNotFoundError:
        raise FileNotFoundError(f"File with name {filename} does not exist.")

    # Reduce embedding dimension if needed:
    if reduced_dim:
        assert reduced_dim < 300, f"The new embedding dimension {reduced_dim} is too big"
        assert reduced_dim > 0, f"The new embedding dimension {reduced_dim} must be strictly positive"
        util.reduce_model(model, reduced_dim)

    return model
# Example 3
 def __init__(
     self,
     model_dir: str = "models",
     fasttext_model_name:
     str = "ft_native_300_ru_wiki_lenta_nltk_word_tokenize.bin",
     fasttext_en_model_name: str = "cc.en.300.bin",
 ):
     """Load a fastText model and build a mean-pooling EmbeddingBag from it.

     Args:
         model_dir: Directory holding the fastText ``.bin`` model files.
         fasttext_model_name: File name of the default model inside
             *model_dir*, or the literal string ``"en"`` to select the
             English model instead.
         fasttext_en_model_name: File name of the English model (used
             only when ``fasttext_model_name == "en"``).
     """
     self.model_dir = model_dir
     self.fasttext_model_name = fasttext_model_name
     self.fasttext_en_model_name = fasttext_en_model_name
     # Full path of the default model inside model_dir.
     self.model_path = os.path.join(self.model_dir,
                                    self.fasttext_model_name)
     if self.fasttext_model_name == "en":
         # English case: fetch the official model if absent, then load it.
         # NOTE(review): this loads fasttext_en_model_name relative to the
         # current working directory, not from model_dir — confirm intended.
         util.download_model('en', if_exists='ignore')
         self.model = load_model(self.fasttext_en_model_name)
     else:
         self.model = load_model(self.model_path)
     # Copy the model's input (subword) embedding matrix into a torch tensor.
     self.input_matrix = torch.FloatTensor(self.model.get_input_matrix())
     self.matrix_shape = self.input_matrix.shape
     # Mean-pooling EmbeddingBag initialized from the fastText matrix and
     # moved to the GPU (requires a CUDA device to be available).
     self.embedding_bag = EmbeddingBag(
         self.matrix_shape[0],
         self.matrix_shape[1]).from_pretrained(self.input_matrix,
                                               mode="mean").cuda()
# Example 4
def load_embeddings(language):
    """Return the official fastText word vectors for *language*.

    Downloads the ``cc.<language>.300.bin`` model if it is not already
    present in the current directory, then loads and returns it.
    """
    from fasttext import load_model
    from fasttext.util import download_model

    download_model(language, if_exists='ignore')
    model_file = 'cc.' + language + '.300.bin'
    return load_model(model_file)
import gdown
from fasttext.util import download_model

if __name__ == '__main__':

    # Google Drive artefacts to fetch: (status message, source URL, local path).
    drive_files = [
        ('Download job offers data',
         'https://drive.google.com/uc?export=download&confirm=A6wL&id=1tI4SctLNkZU6vJuBw1Hf1lVqephc35cG',
         'data/all_offers.csv'),
        ('Download FastText representations of job offers',
         'https://drive.google.com/uc?export=download&confirm=-GH4&id=1m_ckxOk4Ga884ai9mopnSj7gmvb1t5tG',
         'data/offers_fasttext.npy'),
    ]
    for message, url, destination in drive_files:
        print(message)
        gdown.download(url, destination, quiet=False)

    # Fetch the French fastText model (no-op if already present).
    print('Download FastText French model')
    download_model('fr', if_exists='ignore')