def test_upgrade_streamcorpus_v0_3_0_check_mention_ids():
    """Upgraded items must not share a mention_id between sentences.

    Each item of the MAINSTREAM_NEWS test chunk is read as a v0_2_0
    StreamItem and passed through the ``upgrade_streamcorpus_v0_3_0``
    stage.  Within one upgraded item, the mention_ids collected from a
    "lingpipe" sentence (ignoring ``None`` and ``-1``) must be disjoint
    from those of every earlier sentence.  The test also requires that at
    least one mention_id was collected over the whole chunk.
    """
    upgrade = _init_stage("upgrade_streamcorpus_v0_3_0", {})

    seen_anywhere = set()

    chunk_path = os.path.join(
        os.path.dirname(__file__),
        _TEST_DATA_ROOT,
        "test/MAINSTREAM_NEWS-15-9d6218f0aa7c9585cda12a10d642a8b3-41600ffca7703f7914102da5256233ce.sc.xz",
    )
    old_items = streamcorpus.Chunk(
        chunk_path, message=streamcorpus.StreamItem_v0_2_0
    )

    for old_item in old_items:
        upgraded = upgrade(old_item)
        assert upgraded.version == streamcorpus.Versions._NAMES_TO_VALUES["v0_3_0"]

        # mention_ids accumulated from the sentences visited so far in
        # this item
        seen_in_item = set()
        for sentence in upgraded.body.sentences["lingpipe"]:
            in_sentence = {
                tok.mention_id
                for tok in sentence.tokens
                if tok.mention_id not in (None, -1)
            }

            # a mention_id may not reappear in a later sentence
            assert seen_in_item.isdisjoint(in_sentence)

            seen_in_item |= in_sentence
            seen_anywhere |= in_sentence

    # guard against the loops above passing vacuously
    assert seen_anywhere
def make_hyperlink_labeled_test_chunk():
    '''
    Build a temporary chunk of hyperlink-labeled items and return its path.

    Every StreamItem of the WEBLOG test chunk has its existing body
    labels and sentences cleared, is run through the hyperlink_labels
    stage (configured with require_abs_url=True, all_domains=True and
    BYTES offsets) and then through the clean_visible stage, and is
    finally appended to a new chunk file with a uuid-based name in the
    system temporary directory.

    The caller owns the returned file; nothing here deletes it.

    :returns: path to the newly written ``.sc`` chunk file
    '''
    # Function-scope import so this helper does not depend on the
    # module's import block.
    import tempfile

    # gettempdir() honours TMPDIR and also works on platforms where
    # '/tmp' does not exist.
    tpath = os.path.join(tempfile.gettempdir(), str(uuid.uuid1()) + '.sc')

    dpath = os.path.dirname(__file__)
    ipath = os.path.join( dpath, _TEST_DATA_ROOT, 'test/WEBLOG-100-fd5f05c8a680faa2bf8c55413e949bbf.sc' )

    cv = _init_stage('clean_visible', {})
    hl = hyperlink_labels(
        {'require_abs_url': True, 
         'all_domains': True,
         'offset_types': ['BYTES']}
        )

    # Open the output only once the stages exist, and always close it,
    # so a failure while transforming does not leak the open chunk.
    o_chunk = Chunk(tpath, mode='wb')
    try:
        for si in Chunk(path=ipath):
            ## clear out existing labels and tokens
            si.body.labels = {}
            si.body.sentences = {}
            context = {}
            hl(si, context)
            cv(si, context)
            o_chunk.add(si)
    finally:
        o_chunk.close()

    return tpath
def test_target_parsing():
    '''
    Check that 'logo' and 'target' text never leaks into clean_visible.

    The target-test.html fixture is converted with make_clean_html, where
    both strings are expected to survive, and then with
    make_clean_visible, where neither may remain.  The same visible-text
    check is repeated after the hyperlink_labels stage (LINES offsets)
    has processed a StreamItem carrying that clean_html.
    '''
    path = os.path.dirname(__file__)
    path = os.path.join( path, _TEST_DATA_ROOT, 'test' )
    # read via a context manager so the fixture's file handle is closed
    # deterministically instead of being left to garbage collection
    with open(os.path.join(path, 'target-test.html')) as html_file:
        test_html = html_file.read()

    html = make_clean_html( test_html )

    assert 'logo' in html
    assert 'target' in html

    visible = make_clean_visible( html )

    assert 'logo' not in visible
    assert 'target' not in visible

    # local name chosen so it does not shadow the module-level
    # hyperlink_labels used elsewhere in this file
    hl_stage = _init_stage(
        'hyperlink_labels', 
        dict(offset_types=['LINES'],
             require_abs_url=True,
             all_domains=True,
             ))
    si = StreamItem(body=ContentItem(clean_html=html))
    context = {}
    hl_stage( si, context )
    html2 = si.body.clean_html

    visible2 = make_clean_visible( html2 )

    assert 'target' not in visible2
    assert 'logo' not in visible2
def test_langauge():
    """Run the ``language`` stage on a raw fixture and expect Japanese.

    The content of raw-unicode-issues.html is stored as the ``raw`` body
    of a fresh StreamItem; after the stage runs, the body's language
    must be named "Japanese" with code "ja".
    """
    path = os.path.dirname(__file__)
    path = os.path.join(path, _TEST_DATA_ROOT, "test/raw-unicode-issues.html")
    si = make_stream_item(None, "test")
    # read via a context manager so the fixture's file handle is closed
    # deterministically instead of being left to garbage collection
    with open(path) as raw_file:
        si.body = ContentItem(raw=raw_file.read())

    lang = _init_stage("language", {})
    context = {}
    lang(si, context)

    assert si.body.language.name == "Japanese"
    assert si.body.language.code == "ja"
def make_hyperlink_labeled_test_stream_item():
    '''
    returns a test StreamItem that has been run through hyperlink_labels
    and then clean_visible, sharing one context dict between both stages

    asserts that clean_html is longer than 200 before the stages run and
    that clean_visible is longer than 200 afterwards
    '''
    context = {}
    stream_item = make_test_stream_item()
    assert len(stream_item.body.clean_html) > 200

    hl_config = {
        'require_abs_url': True,
        'all_domains': True,
        'offset_types': ['BYTES'],
    }
    label_stage = hyperlink_labels(hl_config)
    label_stage(stream_item, context)

    ## clean_visible is built only after labeling has run, as before
    visible_stage = _init_stage('clean_visible', {})
    visible_stage(stream_item, context)
    assert len(stream_item.body.clean_visible) > 200

    return stream_item
def test_upgrade_streamcorpus_v0_3_0():
    """Upgrade the first items of the WEBLOG chunk and check their version.

    Items are read as v0_2_0 StreamItems and passed through the
    ``upgrade_streamcorpus_v0_3_0`` stage; each result must report the
    v0_3_0 version value.  The loop stops after the eleventh item.
    """
    upgrade = _init_stage("upgrade_streamcorpus_v0_3_0", {})

    chunk_path = os.path.join(
        os.path.dirname(__file__),
        _TEST_DATA_ROOT,
        "test/WEBLOG-100-fd5f05c8a680faa2bf8c55413e949bbf.sc",
    )
    old_items = streamcorpus.Chunk(
        chunk_path, message=streamcorpus.StreamItem_v0_2_0
    )

    # 1-based position, so the break below fires right after item 11
    for position, old_item in enumerate(old_items, 1):
        upgraded = upgrade(old_item)

        assert upgraded.version == streamcorpus.Versions._NAMES_TO_VALUES["v0_3_0"]

        if position > 10:
            break