Beispiel #1
0
    def __init__(self, emdim):
        """Load GloVe and fastText word vectors and expose them as one lookup.

        Args:
            emdim: total embedding dimension. The fastText part is fixed at
                300, so the GloVe part is ``emdim - 300`` and must be one of
                50/100/200/300 (i.e. emdim in {350, 400, 500, 600}).
        """
        base_dir = os.path.join(os.path.dirname(__file__), os.pardir, 'data')

        self.fasttext_dim = 300
        self.glove_dim = emdim - 300

        # Message lists the valid *emdim* values (glove_dim + 300).
        assert self.glove_dim in [50, 100, 200,
                                  300], "Embedding dimension must be one of the following: 350, 400, 500, 600"

        print("Will download magnitude files from the server if they aren't available locally.. So, grab a cup of coffee while the downloading is under progress..")

        # Both models are cached under the same magnitude directory.
        magnitude_dir = os.path.join(base_dir, 'magnitude')
        glove = Magnitude(MagnitudeUtils.download_model('glove/medium/glove.6B.{}d'.format(self.glove_dim),
                                                        download_dir=magnitude_dir), case_insensitive=True)
        fasttext = Magnitude(MagnitudeUtils.download_model('fasttext/medium/wiki-news-300d-1M-subword',
                                                           download_dir=magnitude_dir), case_insensitive=True)
        # Passing multiple Magnitude objects concatenates their vectors,
        # yielding emdim-dimensional embeddings per token.
        self.vectors = Magnitude(glove, fasttext)
Beispiel #2
0
def test_embedtext_creation():
    """Build an EmbedText extractor on the dummy benchmark/collection and
    verify that its vocabulary and embedding matrix match the source
    GloVe model.

    Fix: the closeness checks previously used ``(a - b).sum() < 1e-5``,
    which passes whenever positive and negative differences cancel (or the
    difference is uniformly negative) — sum the *absolute* differences
    instead so the assertion actually detects mismatched vectors.
    """
    extractor_cfg = {
        "_name": "embedtext",
        "index": "anserini",
        "tokenizer": "anserini",
        "embeddings": "glove6b",
        "zerounk": True,
        "calcidf": True,
        "maxqlen": MAXQLEN,
        "maxdoclen": MAXDOCLEN,
    }
    extractor = EmbedText(extractor_cfg)

    benchmark = DummyBenchmark({"_fold": "s1", "rundocsonly": False})
    collection = DummyCollection({"_name": "dummy"})

    index_cfg = {"_name": "anserini", "indexstops": False, "stemmer": "porter"}
    index = AnseriniIndex(index_cfg)
    index.modules["collection"] = collection

    tok_cfg = {"_name": "anserini", "keepstops": True, "stemmer": "none"}
    tokenizer = AnseriniTokenizer(tok_cfg)

    extractor.modules["index"] = index
    extractor.modules["tokenizer"] = tokenizer

    qids = list(benchmark.qrels.keys())  # ["301"]
    qid = qids[0]
    docids = list(benchmark.qrels[qid].keys())

    extractor.create(qids, docids, benchmark.topics[benchmark.query_type])

    expected_vocabs = [
        "lessdummy", "dummy", "doc", "hello", "greetings", "world", "from",
        "outer", "space", "<pad>"
    ]
    expected_stoi = {s: i for i, s in enumerate(expected_vocabs)}

    assert set(extractor.stoi.keys()) == set(expected_stoi.keys())

    emb_path = "glove/light/glove.6B.300d"
    fullemb = Magnitude(MagnitudeUtils.download_model(emb_path))
    assert extractor.embeddings.shape == (len(expected_vocabs), fullemb.dim)

    for i in range(extractor.embeddings.shape[0]):
        if i == extractor.pad:
            # Pad row must be (near-)zero in every component, not just sum to ~0.
            assert abs(extractor.embeddings[i]).sum() < 1e-5
            continue
        s = extractor.itos[i]
        # Sum of absolute differences: zero only when the vectors truly match.
        assert abs(extractor.embeddings[i] - fullemb.query(s)).sum() < 1e-5
    return extractor
Beispiel #3
0
 def __init__(self, embedding_name):
     """Download (if needed) and open the Magnitude embedding for *embedding_name*.

     NOTE(review): an earlier docstring claimed this runs only when the
     ``_is_initialized`` class property is unset, but no such guard exists
     here — the embedding is downloaded and loaded unconditionally. Confirm
     whether the guard lives in a caller or was lost.

     Args:
         embedding_name: key into ``self.SUPPORTED_EMBEDDINGS`` selecting
             which pretrained model to fetch.
     """
     self.embedding_name = embedding_name
     # Cache directory comes from $CAPREOLUS_CACHE when set, else the default cache.
     # lazy_loading=-1 with blocking=True: pre-load all vectors into memory,
     # blocking until the load completes (expensive, done once here).
     self.embedding = Magnitude(
         MagnitudeUtils.download_model(
             self.SUPPORTED_EMBEDDINGS[embedding_name], download_dir=os.environ.get("CAPREOLUS_CACHE", get_default_cache_dir())
         ),
         lazy_loading=-1,
         blocking=True,
     )
     self.stoi = {self.PAD: 0}  # string to integer. Associates an integer value with every token
     self.itos = {0: self.PAD}  # integer to string: inverse mapping of stoi
 def get_magnitude_embeddings(self, embedding_name):
     """Return a Magnitude model for *embedding_name*, downloading it into the cache first if absent."""
     remote_path = self.embedding_lookup[embedding_name]
     local_path = MagnitudeUtils.download_model(remote_path, download_dir=self.cache_path)
     return Magnitude(local_path)
Beispiel #5
0
 def _get_pretrained_emb(self):
     """Open the pretrained embedding selected by cfg["embeddings"], caching downloads under the magnitude cache dir."""
     cache_dir = CACHE_BASE_PATH / "magnitude/"
     embedding_path = self.embed_paths[self.cfg["embeddings"]]
     local_model = MagnitudeUtils.download_model(embedding_path, download_dir=cache_dir)
     return Magnitude(local_model)