def test_upgrade_streamcorpus_v0_3_0_check_mention_ids():
    '''
    upgrade every item of a v0_2_0 test chunk and verify that, within
    a single upgraded item, no mention_id is shared between two
    "lingpipe" sentences; also require that at least one mention_id
    was observed over the whole chunk
    '''
    upgrade = _init_stage("upgrade_streamcorpus_v0_3_0", {})
    seen_anywhere = set()
    chunk_path = os.path.join(
        os.path.dirname(__file__),
        _TEST_DATA_ROOT,
        "test/MAINSTREAM_NEWS-15-9d6218f0aa7c9585cda12a10d642a8b3-41600ffca7703f7914102da5256233ce.sc.xz",
    )
    old_items = streamcorpus.Chunk(
        chunk_path,
        message=streamcorpus.StreamItem_v0_2_0,
    )
    for old_item in old_items:
        upgraded = upgrade(old_item)
        assert upgraded.version == streamcorpus.Versions._NAMES_TO_VALUES["v0_3_0"]

        ## ids collected from the sentences of this item visited so far
        seen_in_item = set()
        for sentence in upgraded.body.sentences["lingpipe"]:
            ## None and -1 are skipped rather than treated as real ids
            in_sentence = set(
                tok.mention_id
                for tok in sentence.tokens
                if tok.mention_id not in (None, -1)
            )
            ## an id seen in an earlier sentence must not show up again
            assert seen_in_item.isdisjoint(in_sentence)
            seen_in_item |= in_sentence
            seen_anywhere |= in_sentence

    ## guard against the checks above passing without any mention ids
    assert seen_anywhere
def make_hyperlink_labeled_test_chunk():
    '''
    returns a path to a temporary chunk that has been hyperlink labeled

    Every item read from the WEBLOG-100 test chunk has its existing
    body.labels and body.sentences cleared, is run through
    hyperlink_labels and then clean_visible, and is added to a newly
    created chunk file in the system temporary directory.  The file
    is not removed here; cleanup is left to the caller.
    '''
    ## local import keeps this helper self-contained
    import tempfile

    ## gettempdir() instead of a literal '/tmp' so this also works on
    ## platforms or configurations where '/tmp' is not the temp dir
    tpath = os.path.join(tempfile.gettempdir(), str(uuid.uuid1()) + '.sc')

    dpath = os.path.dirname(__file__)
    ipath = os.path.join(
        dpath,
        _TEST_DATA_ROOT,
        'test/WEBLOG-100-fd5f05c8a680faa2bf8c55413e949bbf.sc',
    )

    cv = _init_stage('clean_visible', {})
    hl = hyperlink_labels(
        {'require_abs_url': True,
         'all_domains': True,
         'offset_types': ['BYTES']}
    )

    ## open the output only once the stages exist, and always close it
    ## so a failure inside the loop does not leak the open chunk
    o_chunk = Chunk(tpath, mode='wb')
    try:
        for si in Chunk(path=ipath):
            ## clear out existing labels and tokens
            si.body.labels = {}
            si.body.sentences = {}
            context = {}
            hl(si, context)
            cv(si, context)
            o_chunk.add(si)
    finally:
        o_chunk.close()

    return tpath
def test_target_parsing():
    '''
    check that the strings 'logo' and 'target' found in the clean_html
    of target-test.html do not leak into clean_visible, both before
    and after the hyperlink_labels stage (LINES offsets) has processed
    the stream item
    '''
    path = os.path.dirname(__file__)
    path = os.path.join(path, _TEST_DATA_ROOT, 'test')
    ## read via a context manager so the file handle is closed promptly
    with open(os.path.join(path, 'target-test.html')) as html_file:
        test_html = html_file.read()

    html = make_clean_html(test_html)
    assert 'logo' in html
    assert 'target' in html

    visible = make_clean_visible(html)
    assert 'logo' not in visible
    assert 'target' not in visible

    ## named hl_stage so the module-level hyperlink_labels, which other
    ## helpers in this file call directly, is not shadowed here
    hl_stage = _init_stage(
        'hyperlink_labels',
        dict(offset_types=['LINES'],
             require_abs_url=True,
             all_domains=True,
             ))
    si = StreamItem(body=ContentItem(clean_html=html))
    context = {}
    hl_stage(si, context)

    ## rebuild clean_visible from whatever clean_html the stage left on
    ## the item and repeat the visibility checks
    html2 = si.body.clean_html
    visible2 = make_clean_visible(html2)
    assert 'target' not in visible2
    assert 'logo' not in visible2
def test_langauge():
    '''
    run the "language" stage on the raw content of
    raw-unicode-issues.html and check that the item is tagged as
    Japanese with code "ja"

    NOTE: the function name is misspelled ("langauge"); it is kept
    as-is so the test id does not change.
    '''
    path = os.path.dirname(__file__)
    path = os.path.join(path, _TEST_DATA_ROOT, "test/raw-unicode-issues.html")
    si = make_stream_item(None, "test")
    ## read via a context manager so the file handle is closed promptly
    with open(path) as raw_file:
        si.body = ContentItem(raw=raw_file.read())

    lang = _init_stage("language", {})
    context = {}
    lang(si, context)

    assert si.body.language.name == "Japanese"
    assert si.body.language.code == "ja"
def make_hyperlink_labeled_test_stream_item():
    '''
    returns the stream item from make_test_stream_item() after it has
    been passed through hyperlink_labels (BYTES offsets) and then
    clean_visible; asserts that clean_html before, and clean_visible
    after, are each longer than 200
    '''
    stream_item = make_test_stream_item()
    shared_context = {}
    assert len(stream_item.body.clean_html) > 200

    labeler_config = {
        'require_abs_url': True,
        'all_domains': True,
        'offset_types': ['BYTES'],
    }
    labeler = hyperlink_labels(labeler_config)
    labeler(stream_item, shared_context)

    ## clean_visible runs after labeling, with the same context dict
    clean_visible_stage = _init_stage('clean_visible', {})
    clean_visible_stage(stream_item, shared_context)
    assert len(stream_item.body.clean_visible) > 200

    return stream_item
def test_upgrade_streamcorpus_v0_3_0():
    '''
    upgrade the leading items of the WEBLOG-100 v0_2_0 test chunk and
    check that each upgraded item reports version v0_3_0; at most 11
    items are examined, and at least one item must have been read
    '''
    up = _init_stage("upgrade_streamcorpus_v0_3_0", {})
    chunk_path = os.path.join(
        os.path.dirname(__file__),
        _TEST_DATA_ROOT,
        "test/WEBLOG-100-fd5f05c8a680faa2bf8c55413e949bbf.sc",
    )

    ## stays 0 when the chunk yields nothing, which the final assert catches
    count = 0
    old_items = streamcorpus.Chunk(
        chunk_path,
        message=streamcorpus.StreamItem_v0_2_0,
    )
    for count, si in enumerate(old_items, 1):
        si3 = up(si)
        assert si3.version == streamcorpus.Versions._NAMES_TO_VALUES["v0_3_0"]
        ## a sample is enough: stop once the 11th item has been checked
        if count > 10:
            break

    ## without this the test would pass vacuously on an empty chunk
    assert count > 0, "test chunk yielded no stream items"