Example #1
0
 def test_batchify(self):
     """batchify yields fixed-size (X, y) chunks and wraps around past the end."""
     X = [0, 1, 2, 3, 4, 5]  # noqa: N806
     y = [0, 0, 1, 1, 0, 1]
     batch_gen = MagnitudeUtils.batchify(X, y, 2)
     # Four draws: three batches cover the data, the fourth wraps to the start.
     expected_batches = [
         ([0, 1], [0, 0]),
         ([2, 3], [1, 1]),
         ([4, 5], [0, 1]),
         ([0, 1], [0, 0]),
     ]
     for want_X, want_y in expected_batches:
         got_X, got_y = next(batch_gen)  # noqa: N806
         self.assertEqual(got_X, want_X)
         self.assertEqual(got_y, want_y)
     # Odd-length input: the last batch is the short remainder.
     X = [0, 1, 2]  # noqa: N806
     y = [0, 0, 1]
     batch_gen = MagnitudeUtils.batchify(X, y, 2)
     for want_X, want_y in [([0, 1], [0, 0]), ([2], [1])]:
         got_X, got_y = next(batch_gen)  # noqa: N806
         self.assertEqual(got_X, want_X)
         self.assertEqual(got_y, want_y)
Example #2
0
    def __init__(self, emdim):
        """Download (if needed) and load combined GloVe + fastText magnitude embeddings.

        Args:
            emdim: Total embedding dimension. The fastText part is fixed at
                300, so the GloVe part is ``emdim - 300`` and must be one of
                50, 100, 200 or 300 — i.e. emdim must be 350, 400, 500 or 600.
        """
        base_dir = os.path.join(os.path.dirname(__file__), os.pardir, 'data')

        self.fasttext_dim = 300
        self.glove_dim = emdim - 300

        # The message lists valid emdim values (glove_dim + 300).
        assert self.glove_dim in [50, 100, 200,
                                  300], "Embedding dimension must be one of the following: 350, 400, 500, 600"

        # Fixed typo: "avaialble" -> "available" (user-facing message).
        print("Will download magnitude files from the server if they aren't available locally.. So, grab a cup of coffee while the downloading is under progress..")
        glove = Magnitude(MagnitudeUtils.download_model('glove/medium/glove.6B.{}d'.format(self.glove_dim),
                                                        download_dir=os.path.join(base_dir, 'magnitude')), case_insensitive=True)
        fasttext = Magnitude(MagnitudeUtils.download_model('fasttext/medium/wiki-news-300d-1M-subword',
                                                           download_dir=os.path.join(base_dir, 'magnitude')), case_insensitive=True)
        self.vectors = Magnitude(glove, fasttext)
Example #3
0
 def test_from_categorical(self):
     """from_categorical maps one-hot rows back to float class indices."""
     one_hot_rows = [[0., 1., 0., 0., 0., 0.],
                     [0., 0., 0., 0., 0., 1.]]
     decoded = MagnitudeUtils.from_categorical(one_hot_rows)
     self.assertTrue(isclose(decoded, [1., 5.]).all())
Example #4
0
    def read_train_test_data(train_ff: str, test_ff: str):
        """Read "<sentence> <label>" lines from a train file and a test file.

        Each line's last whitespace-separated token is taken as the label
        (lower-cased); everything before it is the sentence. Labels are
        encoded to integers via MagnitudeUtils.class_encoding().

        Args:
            train_ff: Path to the training file (utf-8 text).
            test_ff: Path to the test file (utf-8 text).

        Returns:
            Tuple of (X_train, y_train, X_test, y_test, label_to_int,
            int_to_label).
        """
        # Removed: a dead, commented-out sklearn Pipeline snippet that was
        # parked in a no-op string literal here.
        with open(train_ff, 'rb') as ff:
            trains = [line.decode('utf-8') for line in ff.readlines()]
        with open(test_ff, 'rb') as ff:
            tests = [line.decode('utf-8') for line in ff.readlines()]

        add_label, label_to_int, int_to_label = MagnitudeUtils.class_encoding()

        def get_sentence(line: str):
            # Everything before the last space is the sentence text.
            return line.rsplit(' ', 1)[0]

        def get_target(line: str):
            # The last token is the label; normalise case and whitespace.
            return line.rsplit(' ', 1)[-1].strip().lower()

        X_train = [get_sentence(line) for line in trains]
        y_train = [add_label(get_target(line)) for line in trains]

        X_test = [get_sentence(line) for line in tests]
        y_test = [add_label(get_target(line)) for line in tests]

        return X_train, y_train, X_test, y_test, label_to_int, int_to_label
Example #5
0
 def test_class_encoding(self):
     """class_encoding assigns stable first-seen integer ids to labels."""
     add_class, class_to_int, int_to_class = MagnitudeUtils.class_encoding()
     # Repeated labels must keep the id assigned on first sight.
     for label, expected_id in [('cat', 0), ('dog', 1), ('dog', 1),
                                ('dog', 1), ('cat', 0)]:
         self.assertEqual(add_class(label), expected_id)
     # Both lookup directions agree with the assignments above.
     self.assertEqual(class_to_int('dog'), 1)
     self.assertEqual(class_to_int('cat'), 0)
     self.assertEqual(int_to_class(1), 'dog')
     self.assertEqual(int_to_class(0), 'cat')
Example #6
0
 def test_to_categorical(self):
     """to_categorical one-hot encodes integer labels over 6 classes (0-5)."""
     labels = [1, 5, 1, 1, 2, 4, 1, 3, 1, 3, 5, 4]

     def one_hot(idx):
         # Build a single 6-wide one-hot row for class `idx`.
         row = [0.] * 6
         row[idx] = 1.
         return row

     expected = [one_hot(label) for label in labels]
     self.assertTrue(isclose(MagnitudeUtils.to_categorical(labels), expected).all())
Example #7
0
def test_embedtext_creation():
    """End-to-end check that EmbedText builds a vocabulary and an embedding
    matrix for a dummy benchmark/collection, and that each embedding row
    matches the source GloVe vectors (with a zero row for padding)."""
    extractor_cfg = {
        "_name": "embedtext",
        "index": "anserini",
        "tokenizer": "anserini",
        "embeddings": "glove6b",
        "zerounk": True,
        "calcidf": True,
        "maxqlen": MAXQLEN,
        "maxdoclen": MAXDOCLEN,
    }
    extractor = EmbedText(extractor_cfg)

    # Dummy data sources that drive the extractor.
    benchmark = DummyBenchmark({"_fold": "s1", "rundocsonly": False})
    collection = DummyCollection({"_name": "dummy"})

    index_cfg = {"_name": "anserini", "indexstops": False, "stemmer": "porter"}
    index = AnseriniIndex(index_cfg)
    index.modules["collection"] = collection

    tok_cfg = {"_name": "anserini", "keepstops": True, "stemmer": "none"}
    tokenizer = AnseriniTokenizer(tok_cfg)

    # Wire the dependencies before create() — the extractor reads them
    # from its .modules mapping.
    extractor.modules["index"] = index
    extractor.modules["tokenizer"] = tokenizer

    qids = list(benchmark.qrels.keys())  # ["301"]
    qid = qids[0]
    docids = list(benchmark.qrels[qid].keys())

    extractor.create(qids, docids, benchmark.topics[benchmark.query_type])

    # Vocabulary expected from the dummy docs/queries plus the pad token.
    expected_vocabs = [
        "lessdummy", "dummy", "doc", "hello", "greetings", "world", "from",
        "outer", "space", "<pad>"
    ]
    expected_stoi = {s: i for i, s in enumerate(expected_vocabs)}

    # Only set membership is asserted — ids may be assigned in any order.
    assert set(extractor.stoi.keys()) == set(expected_stoi.keys())

    emb_path = "glove/light/glove.6B.300d"
    fullemb = Magnitude(MagnitudeUtils.download_model(emb_path))
    assert extractor.embeddings.shape == (len(expected_vocabs), fullemb.dim)

    # Each non-pad row must match the source embedding for its token;
    # the pad row must be (approximately) all zeros.
    for i in range(extractor.embeddings.shape[0]):
        if i == extractor.pad:
            assert extractor.embeddings[i].sum() < 1e-5
            continue
        s = extractor.itos[i]
        assert (extractor.embeddings[i] - fullemb.query(s)).sum() < 1e-5
    return extractor
Example #8
0
 def __init__(self, embedding_name):
     """
     Download (if not cached) and load the magnitude embedding registered
     under *embedding_name* in SUPPORTED_EMBEDDINGS, then initialise the
     token<->id vocabulary containing only the padding token.

     NOTE(review): the previous docstring described an `_is_initialized`
     class-property guard, but no such check exists in this method —
     every call performs the (expensive) download/load.
     """
     self.embedding_name = embedding_name
     # Cache dir comes from $CAPREOLUS_CACHE, else the project default.
     # NOTE(review): lazy_loading=-1 with blocking=True presumably makes
     # Magnitude load all vectors eagerly — confirm against pymagnitude docs.
     self.embedding = Magnitude(
         MagnitudeUtils.download_model(
             self.SUPPORTED_EMBEDDINGS[embedding_name], download_dir=os.environ.get("CAPREOLUS_CACHE", get_default_cache_dir())
         ),
         lazy_loading=-1,
         blocking=True,
     )
     self.stoi = {self.PAD: 0}  # string to integer. Associates an integer value with every token
     self.itos = {0: self.PAD}  # integer to string (inverse of stoi)
 def get_magnitude_embeddings(self, embedding_name):
     """Download (if needed) and open the magnitude file for *embedding_name*."""
     remote_name = self.embedding_lookup[embedding_name]
     local_path = MagnitudeUtils.download_model(remote_name, download_dir=self.cache_path)
     return Magnitude(local_path)
Example #10
0
 def _get_pretrained_emb(self):
     """Return a Magnitude reader for the configured pretrained embeddings."""
     cache_dir = CACHE_BASE_PATH / "magnitude/"
     remote_name = self.embed_paths[self.cfg["embeddings"]]
     return Magnitude(MagnitudeUtils.download_model(remote_name, download_dir=cache_dir))