import os

import streamcorpus

# The import path below is an assumption about the streamcorpus_pipeline
# package layout: upgrade_streamcorpus_v0_3_0 is the transform under test.
# The test_data_dir fixture and the get_test_chunk_path helper come from the
# surrounding test suite (see the conftest sketch at the end).
from streamcorpus_pipeline._upgrade_streamcorpus_v0_3_0 import \
    upgrade_streamcorpus_v0_3_0


def test_upgrade_streamcorpus_v0_3_0(test_data_dir):
    # Instantiate the upgrade transform with an empty config.
    up = upgrade_streamcorpus_v0_3_0(config={})
    count = 0

    # Upgrade the first several v0.2.0 StreamItems and confirm each comes
    # back tagged with the v0_3_0 version enum.
    for si in streamcorpus.Chunk(get_test_chunk_path(test_data_dir),
                                 message=streamcorpus.StreamItem_v0_2_0):
        count += 1
        si3 = up(si)
        assert si3.version == streamcorpus.Versions._NAMES_TO_VALUES['v0_3_0']
        if count > 10:
            break
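Outside of the test, the same transform can drive a one-shot file conversion. Below is a minimal sketch, assuming the streamcorpus Chunk API's write mode ('wb') and add() method; upgrade_chunk_file is a hypothetical helper, not part of the pipeline:

def upgrade_chunk_file(in_path, out_path):
    # Hypothetical helper: stream every v0.2.0 StreamItem through the
    # upgrade transform and append the upgraded items to a new chunk file.
    up = upgrade_streamcorpus_v0_3_0(config={})
    o_chunk = streamcorpus.Chunk(path=out_path, mode='wb')
    for si in streamcorpus.Chunk(in_path,
                                 message=streamcorpus.StreamItem_v0_2_0):
        o_chunk.add(up(si))
    o_chunk.close()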
# The second test checks that the upgrade assigns mention ids that are unique
# across sentences within a document.
def test_upgrade_streamcorpus_v0_3_0_check_mention_ids(test_data_dir):
    up = upgrade_streamcorpus_v0_3_0(config={})
    all_mention_ids = set()
    chunk_path = os.path.join(
        test_data_dir,
        'test/MAINSTREAM_NEWS-15-9d6218f0aa7c9585cda12a10d642a8b3-41600ffca7703f7914102da5256233ce.sc.xz')
    for si in streamcorpus.Chunk(chunk_path,
                                 message=streamcorpus.StreamItem_v0_2_0):
        si3 = up(si)
        assert si3.version == streamcorpus.Versions._NAMES_TO_VALUES['v0_3_0']
        mention_ids = set()
        for sentence in si3.body.sentences['lingpipe']:
            # Collect the mention ids seen in this sentence, ignoring the
            # "no mention" sentinels None and -1.
            sentence_mention_ids = set()
            for token in sentence.tokens:
                if token.mention_id not in (None, -1):
                    sentence_mention_ids.add(token.mention_id)

            # No mention id should appear in more than one sentence.
            assert mention_ids.intersection(sentence_mention_ids) == set()
            mention_ids.update(sentence_mention_ids)
            all_mention_ids.update(sentence_mention_ids)
    # The chunk must contain at least one mention, or the test proves nothing.
    assert len(all_mention_ids) > 0
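Both tests depend on a test_data_dir pytest fixture and a get_test_chunk_path helper defined elsewhere in the suite. A minimal conftest.py sketch, with purely illustrative paths, might look like:

import os

import pytest


@pytest.fixture
def test_data_dir():
    # Assumed layout: the sample .sc.xz chunks sit under a 'data' directory
    # next to this conftest.py; adjust to the real location.
    return os.path.join(os.path.dirname(__file__), 'data')


def get_test_chunk_path(test_data_dir):
    # Illustrative file name only; point this at an actual v0.2.0 chunk.
    return os.path.join(test_data_dir, 'test', 'example-v0_2_0.sc.xz')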