Example #1
def test_extract_segment_short_text(tmpdir):
    # The text is too short for TextTilingTokenizer. Test if the fallback works
    ttt = TextTilingTokenizer(k=6)
    pipeline_config = {"passagelen": 30, "slicelen": 20, "tfchannel": True}
    extractor = DeepTileExtractor(tmpdir, tmpdir, pipeline_config)
    s = "But we in it shall be rememberèd We few, we happy few, we band of brothers"
    doc_toks = s.split(" ")
    segments = extractor.extract_segment(doc_toks, ttt)
    assert len(segments) == 1
    # N.B. - the fallback keeps the original casing and punctuation of the input tokens
    assert segments == [
        "But we in it shall be rememberèd We few, we happy few, we band of brothers"
    ]

    s = (
        "But we in it shall be rememberèd We few, we happy few, we band of brothers. For he to-day that sheds his "
        "blood with me Shall be my brother")
    doc_toks = s.split(" ")

    segments = extractor.extract_segment(doc_toks, ttt)
    assert len(segments) == 2
    assert segments == [
        "But we in it shall be rememberèd We few, we happy few, we band of brothers. For he to-day that",
        "sheds his blood with me Shall be my brother",
    ]
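
The expected output above suggests that, when TextTilingTokenizer cannot tile a short text, extract_segment falls back to cutting the token list into windows of slicelen tokens (20 in this config), which is why the second input splits right after "that". A minimal sketch of such a fallback, assuming TextTilingTokenizer raises ValueError on text it cannot tile; the helper name slice_or_tile is hypothetical and not part of the library:

def slice_or_tile(doc_toks, ttt, slicelen=20):
    """Try TextTiling first; fall back to fixed-size token slices."""
    text = " ".join(doc_toks)
    try:
        return ttt.tokenize(text)
    except ValueError:
        # Too short for topic tiling: cut into slicelen-token windows,
        # matching the 20-token first segment asserted above.
        return [
            " ".join(doc_toks[i:i + slicelen])
            for i in range(0, len(doc_toks), slicelen)
        ]
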
Example #2
def test_build_from_benchmark(monkeypatch, tmpdir, trec_index,
                              dummy_collection_config):
    # Mostly a smoke test - not asserting much here, but it makes sure the code at least runs
    collection = Collection(dummy_collection_config)
    pipeline_config = {
        "indexstops": True,
        "maxthreads": 1,
        "stemmer": "anserini",
        "bmax": 0.2,
        "k1max": 0.2,
        "maxqlen": 5,
        "maxdoclen": 10,
        "keepstops": True,
        "rundocsonly": False,
        "datamode": "unigram",
        "passagelen": 3,
        "slicelen": 20,
        "tfchannel": True,
    }
    bm25_run = BM25Grid(trec_index, collection,
                        os.path.join(tmpdir, "searcher"), pipeline_config)
    bm25_run.create()
    folds = {
        "s1": {
            "train_qids": ["301"],
            "predict": {
                "dev": ["301"],
                "test": ["301"]
            }
        }
    }
    benchmark = Robust04Benchmark(bm25_run, collection, pipeline_config)
    benchmark.create_and_store_train_and_pred_pairs(folds)

    extractor = DeepTileExtractor(tmpdir,
                                  tmpdir,
                                  pipeline_config,
                                  index=trec_index,
                                  collection=collection,
                                  benchmark=benchmark)

    def fake_magnitude_embedding(*args, **kwargs):
        return Magnitude(None)

    monkeypatch.setattr(extractor, "get_magnitude_embeddings",
                        fake_magnitude_embedding)
    extractor.build_from_benchmark(True)
    assert extractor.stoi == {
        "<pad>": 0,
        "dummy": 1,
        "doc": 2,
        "hello": 3,
        "world": 4,
        "greetings": 5,
        "from": 6,
        "outer": 7,
        "space": 8,
    }

    assert extractor.itos == {v: k for k, v in extractor.stoi.items()}
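
A note on the fake_magnitude_embedding stub used above: pymagnitude's Magnitude(None) gives a lightweight object that these tests use as a stand-in, so nothing downloads real embedding files. The pattern generalises to any loader that would otherwise fetch large resources; a minimal, self-contained sketch of the idea (TinyExtractor and its load_embeddings method are hypothetical, purely for illustration):

from pymagnitude import Magnitude


class TinyExtractor:
    def load_embeddings(self):
        # stands in for an expensive download in a real extractor
        raise RuntimeError("would download embeddings")


def test_embeddings_are_stubbed(monkeypatch):
    # swap the loader for a stub, exactly like fake_magnitude_embedding above
    monkeypatch.setattr(TinyExtractor, "load_embeddings",
                        lambda self: Magnitude(None))
    assert isinstance(TinyExtractor().load_embeddings(), Magnitude)
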
Example #3
def test_extract_segment_long_text(tmpdir):
    # nltk.TextTilingTokenizer only works with large blobs of text
    ttt = TextTilingTokenizer(k=6)
    pipeline_config = {"passagelen": 30, "slicelen": 20, "tfchannel": True}
    extractor = DeepTileExtractor(tmpdir, tmpdir, pipeline_config)

    # blob of text with Shakespeare and Shangri La. Should split into two topics
    s = (
        "O that we now had here but one ten thousand of those men in England That do no work to-day. Whats he that "
        "wishes so? My cousin, Westmorland? No, my fair cousin. If we are marked to die, we are enough To do our "
        "country loss; and if to live, The fewer men, the greater share of honour. Gods will! I pray thee, wish"
        " not one man more. Shangri-La is a fictional place described in the 1933 novel Lost Horizon "
        "by British author James Hilton. Hilton describes Shangri-La as a mystical, harmonious valley, gently guided "
        "from a lamasery, enclosed in the western end of the Kunlun Mountains. Shangri-La has become synonymous with "
        "any earthly paradise, particularly a mythical Himalayan utopia – a permanently happy land, isolated from "
        "the world")
    doc_toks = s.split(" ")
    segments = extractor.extract_segment(doc_toks, ttt)
    assert len(segments) == 2

    # The split was determined by nltk.TextTilingTokenizer. Far from perfect
    assert segments == [
        "O that we now had here but one ten thousand of those men in England That do no work to-day. Whats he that wishes so? My cousin, Westmorland? No, my fair cousin. If we are marked to die, we are",
        " enough To do our country loss; and if to live, The fewer men, the greater share of honour. Gods will! I pray thee, wish not one man more. Shangri-La is a fictional place described in the 1933 novel Lost Horizon by British author James Hilton. Hilton describes Shangri-La as a mystical, harmonious valley, gently guided from a lamasery, enclosed in the western end of the Kunlun Mountains. Shangri-La has become synonymous with any earthly paradise, particularly a mythical Himalayan utopia – a permanently happy land, isolated from the world",
    ]
Example #4
def test_clean_segments(tmpdir):
    extractor = DeepTileExtractor(tmpdir, tmpdir, {"tfchannel": True})
    assert extractor.clean_segments(["hello world", "foo bar"], p_len=4) == [
        "hello world",
        "foo bar",
        extractor.pad_tok,
        extractor.pad_tok,
    ]
    assert extractor.clean_segments(["hello world", "foo bar", "alice", "bob"],
                                    p_len=3) == [
                                        "hello world", "foo bar", "alicebob"
                                    ]
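
These two assertions pin down the contract of clean_segments: with fewer than p_len segments it right-pads with pad_tok, and with more it folds the overflow into the last kept slot ("alice" and "bob" become "alicebob", with no separator). A minimal standalone sketch of that behaviour under those assumptions, not the library's actual implementation (the default pad token mirrors the "<pad>" entry seen in stoi elsewhere):

def clean_segments(segments, p_len, pad_tok="<pad>"):
    """Pad or merge a segment list so that exactly p_len entries remain."""
    if len(segments) < p_len:
        # too few: right-pad with the padding token
        return segments + [pad_tok] * (p_len - len(segments))
    # too many (or exactly enough): fold any overflow into the last slot
    return segments[:p_len - 1] + ["".join(segments[p_len - 1:])]
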
Example #5
def test_deeptiles_create(monkeypatch, tmpdir, dummy_index):
    def fake_magnitude_embedding(*args, **kwargs):
        return Magnitude(None)

    monkeypatch.setattr(DeepTileExtractor, "_get_pretrained_emb",
                        fake_magnitude_embedding)
    benchmark = DummyBenchmark({})
    extractor_config = {
        "name": "deeptiles",
        "tilechannels": 3,
        "maxqlen": 5,
        "passagelen": 3,
        "slicelen": 20,
        "tfchannel": True,
        "embeddings": "glove6b",
        "usecache": False,
    }
    extractor = DeepTileExtractor(extractor_config,
                                  provide={
                                      "index": dummy_index,
                                      "benchmark": benchmark
                                  })

    print("BT:", benchmark.topics["title"])
    extractor.preprocess(["301"], ["LA010189-0001", "LA010189-0002"],
                         benchmark.topics["title"])
    assert extractor.stoi == {
        "<pad>": 0,
        "dummy": 1,
        "doc": 2,
        "hello": 3,
        "world": 4,
        "greetings": 5,
        "from": 6,
        "outer": 7,
        "space": 8,
        "lessdummy": 9,
    }

    assert extractor.itos == {v: k for k, v in extractor.stoi.items()}
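
The asserted stoi dictionary reserves index 0 for the padding token and then numbers terms in order of first appearance in the collection and query text, with itos as its inverse. A minimal sketch of how such a mapping can be built (illustrative only; the extractor's own bookkeeping may differ):

def build_vocab(token_streams, pad_tok="<pad>"):
    """Assign contiguous ids to tokens, reserving 0 for padding."""
    stoi = {pad_tok: 0}
    for toks in token_streams:
        for tok in toks:
            if tok not in stoi:
                stoi[tok] = len(stoi)
    itos = {idx: tok for tok, idx in stoi.items()}
    return stoi, itos

For example, build_vocab([["dummy", "doc", "hello", "world"], ["greetings", "from", "outer", "space"]]) reproduces the first nine entries asserted above.
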
Example #6
def test_deeptiles_extract_segment_short_text(tmpdir, monkeypatch,
                                              dummy_index):
    def fake_magnitude_embedding(*args, **kwargs):
        return Magnitude(None)

    monkeypatch.setattr(DeepTileExtractor, "_get_pretrained_emb",
                        fake_magnitude_embedding)
    benchmark = DummyBenchmark()
    # The text is too short for TextTilingTokenizer. Test if the fallback works
    ttt = TextTilingTokenizer(k=6)
    pipeline_config = {
        "name": "deeptiles",
        "passagelen": 30,
        "slicelen": 20,
        "tfchannel": True,
        "tilechannels": 3,
        "index": {
            "collection": {
                "name": "dummy"
            }
        },
    }
    extractor = DeepTileExtractor(pipeline_config,
                                  provide={
                                      "index": dummy_index,
                                      "benchmark": benchmark
                                  })
    s = "But we in it shall be rememberèd We few, we happy few, we band of brothers"
    doc_toks = s.split(" ")
    segments = extractor.extract_segment(doc_toks, ttt)
    assert len(segments) == 1
    # N.B. - the fallback keeps the original casing and punctuation of the input tokens
    assert segments == [
        "But we in it shall be rememberèd We few, we happy few, we band of brothers"
    ]

    s = (
        "But we in it shall be rememberèd We few, we happy few, we band of brothers. For he to-day that sheds his "
        "blood with me Shall be my brother")
    doc_toks = s.split(" ")

    segments = extractor.extract_segment(doc_toks, ttt)
    assert len(segments) == 2
    assert segments == [
        "But we in it shall be rememberèd We few, we happy few, we band of brothers. For he to-day that",
        "sheds his blood with me Shall be my brother",
    ]
Example #7
def test_deeptiles_extract_segment_long_text(tmpdir, monkeypatch, dummy_index):
    def fake_magnitude_embedding(*args, **kwargs):
        return Magnitude(None)

    monkeypatch.setattr(DeepTileExtractor, "_get_pretrained_emb",
                        fake_magnitude_embedding)
    benchmark = DummyBenchmark()
    # nltk.TextTilingTokenizer only works with large blobs of text
    ttt = TextTilingTokenizer(k=6)
    extractor_config = {
        "name": "deeptiles",
        "embeddings": "glove6b",
        "tilechannels": 3,
        "passagelen": 30,
        "slicelen": 20,
        "tfchannel": True,
    }
    extractor = DeepTileExtractor(extractor_config,
                                  provide={
                                      "index": dummy_index,
                                      "benchmark": benchmark
                                  })

    # blob of text with Shakespeare and Shangri La. Should split into two topics
    s = (
        "O that we now had here but one ten thousand of those men in England That do no work to-day. Whats he that "
        "wishes so? My cousin, Westmorland? No, my fair cousin. If we are marked to die, we are enough To do our "
        "country loss; and if to live, The fewer men, the greater share of honour. Gods will! I pray thee, wish"
        " not one man more. Shangri-La is a fictional place described in the 1933 novel Lost Horizon "
        "by British author James Hilton. Hilton describes Shangri-La as a mystical, harmonious valley, gently guided "
        "from a lamasery, enclosed in the western end of the Kunlun Mountains. Shangri-La has become synonymous with "
        "any earthly paradise, particularly a mythical Himalayan utopia – a permanently happy land, isolated from "
        "the world")
    doc_toks = s.split(" ")
    segments = extractor.extract_segment(doc_toks, ttt)
    assert len(segments) == 2

    # The split was determined by nltk.TextTilingTokenizer. Far from perfect
    assert segments == [
        "O that we now had here but one ten thousand of those men in England That do no work to-day. Whats he that wishes so? My cousin, Westmorland? No, my fair cousin. If we are marked to die, we are",
        " enough To do our country loss; and if to live, The fewer men, the greater share of honour. Gods will! I pray thee, wish not one man more. Shangri-La is a fictional place described in the 1933 novel Lost Horizon by British author James Hilton. Hilton describes Shangri-La as a mystical, harmonious valley, gently guided from a lamasery, enclosed in the western end of the Kunlun Mountains. Shangri-La has become synonymous with any earthly paradise, particularly a mythical Himalayan utopia – a permanently happy land, isolated from the world",
    ]
Example #8
def test_deeptiles_clean_segments(tmpdir, dummy_index):
    pipeline_config = {
        "name": "deeptiles",
        "passagelen": 30,
        "slicelen": 20,
        "tfchannel": True,
        "tilechannels": 3
    }
    extractor = DeepTileExtractor(pipeline_config,
                                  provide={"index": dummy_index})
    assert extractor.clean_segments(["hello world", "foo bar"], p_len=4) == [
        "hello world",
        "foo bar",
        extractor.pad_tok,
        extractor.pad_tok,
    ]
    assert extractor.clean_segments(["hello world", "foo bar", "alice", "bob"],
                                    p_len=3) == [
                                        "hello world", "foo bar", "alicebob"
                                    ]
Example #9
def test_create_visualization_matrix(monkeypatch, tmpdir):
    pipeline_config = {
        "maxqlen": 5,
        "passagelen": 3,
        "slicelen": 20,
        "tfchannel": True
    }
    extractor = DeepTileExtractor(tmpdir, tmpdir, pipeline_config)
    extractor.stoi = {
        "<pad>": 0,
        "hello": 1,
        "world": 2,
        "foo": 3,
        "bar": 4,
        "alice": 5,
        "bob": 6
    }
    extractor.idf = defaultdict(lambda: 0)

    def fake_magnitude_embedding(*args, **kwargs):
        return Magnitude(None)

    monkeypatch.setattr(extractor, "get_magnitude_embeddings",
                        fake_magnitude_embedding)
    embeddings_matrix = extractor.create_embedding_matrix("glove6b.50d")
    level = extractor.create_visualization_matrix(
        [
            "hello", extractor.pad_tok, extractor.pad_tok, extractor.pad_tok,
            extractor.pad_tok
        ],
        ["hello world", "foo bar", "alice bob"],
        embeddings_matrix,
    )
    assert level.shape == (1, 5, 3, 3)
Example #10
def test_deeptiles_create_visualization_matrix(monkeypatch, tmpdir,
                                               dummy_index):
    def fake_magnitude_embedding(*args, **kwargs):
        return Magnitude(None)

    monkeypatch.setattr(DeepTileExtractor, "_get_pretrained_emb",
                        fake_magnitude_embedding)
    benchmark = DummyBenchmark()
    pipeline_config = {
        "name": "deeptiles",
        "tilechannels": 3,
        "maxqlen": 5,
        "passagelen": 3,
        "slicelen": 20,
        "tfchannel": True
    }
    extractor = DeepTileExtractor(pipeline_config,
                                  provide={
                                      "index": dummy_index,
                                      "benchmark": benchmark
                                  })
    extractor.stoi = {
        "<pad>": 0,
        "hello": 1,
        "world": 2,
        "foo": 3,
        "bar": 4,
        "alice": 5,
        "bob": 6
    }
    extractor.idf = defaultdict(lambda: 0)

    extractor._build_embedding_matrix()
    embeddings_matrix = extractor.embeddings
    level = extractor.create_visualization_matrix(
        [
            "hello", extractor.pad_tok, extractor.pad_tok, extractor.pad_tok,
            extractor.pad_tok
        ],
        ["hello world", "foo bar", "alice bob"],
        embeddings_matrix,
    )
    assert level.shape == (1, 5, 3, 3)
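
The asserted shape lines up with the configuration above: 5 is maxqlen and the two trailing 3s match passagelen and tilechannels, with the leading 1 presumably a slice/batch axis. That reading is an interpretation, not something stated by the library; a tiny self-contained sanity check under that assumption:

import numpy as np

# hypothetical layout: (slice, query term, passage, channel)
maxqlen, passagelen, tilechannels = 5, 3, 3
level = np.zeros((1, maxqlen, passagelen, tilechannels))
assert level.shape == (1, 5, 3, 3)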