def generate_stream_items_from_kba_json(json_file_path):
    ## iterate over gzip'ed file of JSON lines
    data = gzip.GzipFile(fileobj=open(json_file_path, 'rb'), mode='rb').read()
    for line in data.splitlines():
        try:
            doc = json.loads(line)
        except Exception, exc:
            print('trapped: %s' % traceback.format_exc(exc))
            print('continuing')
            continue

        assert doc['source'] == 'social', doc['source']

        ## make a StreamItem with valid StreamTime computed from
        ## zulu_timestamp.  This will fix the four-hour offsets in
        ## some of the KBA 2012 files.
        stream_item = make_stream_item(
            doc['stream_time']['zulu_timestamp'],
            bytes(doc['abs_url'].encode('utf-8')))

        ## capture schost and source
        stream_item.schost = doc.pop('schost')
        stream_item.source = doc.pop('source')

        ## assemble source_metadata
        stream_item.source_metadata['kba-2012'] = json.dumps(doc.pop('source_metadata'))

        ## might have a funky original URL
        stream_item.original_url = doc['original_url'] and \
            bytes(doc['original_url'].encode('utf-8')) or b''

        ## get the three possible ContentItems
        body = doc.pop('body', {}).pop('raw', '').decode('string-escape')
        title = doc.pop('title', {}).pop('raw', '').decode('string-escape')
        anchor = doc.pop('anchor', {}).pop('raw', '').decode('string-escape')

        stream_item.body = ContentItem(
            raw=b''.join(['<p>', anchor, '</p>', '<p>', title, '</p>', body]),
            media_type='text/html',
            encoding='UTF-8',
        )

        if title:
            stream_item.other_content['title'] = ContentItem(
                raw=title,
                media_type='text/html',
                encoding='UTF-8',
            )

        if anchor:
            stream_item.other_content['anchor'] = ContentItem(
                raw=anchor,
                media_type='text/html',
                encoding='UTF-8',
            )

        yield stream_item
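## A minimal usage sketch, not part of the original module: drain the generator
## and report each StreamItem's stream_id and abs_url.  The path
## 'kba-2012-sample.json.gz' is a hypothetical gzip'ed JSON-lines file in the
## KBA 2012 layout described above.
if __name__ == '__main__':
    for stream_item in generate_stream_items_from_kba_json('kba-2012-sample.json.gz'):
        print('%s %s' % (stream_item.stream_id, stream_item.abs_url))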
def test_speed(parser_type, test_data_dir):
    '''crude throughput check: run hyperlink_labels over ten copies of the
    same clean_html document and log the per-second rate'''
    stream_items = []
    for i in xrange(10):
        stream_item = StreamItem()
        stream_item.body = ContentItem()
        path = os.path.join(test_data_dir, 'test')
        stream_item.body.clean_html = open(
            os.path.join(path, 'nytimes-index-clean.html')).read()
        stream_items.append(stream_item)

    context = {}
    start = time.time()
    hl = hyperlink_labels(config={
        'require_abs_url': True,
        'all_domains': False,
        'domain_substrings': ['nytimes.com'],
        'offset_types': [parser_type],
    })
    for si in stream_items:
        si = hl(si, context)
    elapsed = time.time() - start
    rate = len(stream_items) / elapsed

    logger.debug('OffsetType: {}'.format(OffsetType))
    logger.info('{:.1f} per second for {}'.format(rate, parser_type))
def test_target_parsing(test_data_dir):
    path = os.path.join(test_data_dir, 'test')
    test_html = open(os.path.join(path, 'target-test.html')).read()

    html = make_clean_html(test_html)

    assert 'logo' in html
    assert 'target' in html

    visible = make_clean_visible(html)

    assert 'logo' not in visible
    assert 'target' not in visible

    stage = hyperlink_labels(config={
        'offset_types': ['LINES'],
        'require_abs_url': True,
        'all_domains': True,
    })
    si = StreamItem(body=ContentItem(clean_html=html))
    context = {}
    stage(si, context)
    html2 = si.body.clean_html

    visible2 = make_clean_visible(html2)
    #print visible2

    assert 'target' not in visible2
    assert 'logo' not in visible2
def test_language(test_data_dir):
    path = os.path.join(test_data_dir, 'test/raw-unicode-issues.html')
    si = make_stream_item(None, 'test')
    si.body = ContentItem(raw=open(path).read())
    context = {}
    lang = language(config={})
    lang(si, context)

    assert si.body.language.name == 'Japanese'
    assert si.body.language.code == 'ja'
def test_stage(test_data_dir):
    stage = clean_html({})  # NB: not even defaults
    path = os.path.join(test_data_dir, 'test')
    with open(os.path.join(path, 'nytimes-index.html'), 'r') as f:
        raw = f.read().decode('utf8')
    si = StreamItem(body=ContentItem(raw=raw, media_type='text/html'))
    si = stage(si, {})
    with open(os.path.join(path, 'nytimes-index-clean-stable.html'), 'r') as f:
        stable = f.read()
    assert si.body.clean_html == stable
def test_long_doc(parser_type, test_data_dir):
    stream_item = StreamItem()
    stream_item.body = ContentItem()
    path = os.path.join(test_data_dir, 'test')
    stream_item.body.clean_html = open(
        os.path.join(path, 'company-test.html')).read()

    context = {}
    hl = hyperlink_labels(config={
        'require_abs_url': True,
        'all_domains': True,
        'offset_types': [parser_type],
    })
    hl(stream_item, context)
def __call__(self, s1, context):
    s2 = make_stream_item(s1.stream_time.zulu_timestamp, s1.abs_url)
    s2.schost = s1.schost
    s2.source = s1.source
    s2.source_metadata['kba-2012'] = s1.source_metadata

    logger.debug('len(original .body.raw) = %d' % len(s1.body.raw))
    #logger.critical(repr(s2))

    s2.body = ContentItem(
        raw=s1.body.raw,
        encoding=s1.body.encoding,
        ## default, might get overwritten below
        media_type='text/html',
        taggings={
            'stanford': Tagging(
                tagger_id='stanford',
                raw_tagging=s1.body.ner,
                generation_time=make_stream_time('2012-06-01T00:00:00.0Z'),
                tagger_config='annotators: {tokenize, cleanxml, ssplit, pos, lemma, ner}, '
                              'properties: pos.maxlen=100',
                tagger_version='Stanford CoreNLP ver 1.2.0',
            )
        })

    if self.config['keep_old_cleansed_as_clean_visible']:
        s2.body.clean_visible = s1.body.cleansed

    if s1.source == 'social':
        s2.body.media_type = 'text/plain'
        ## the separation of content items in the social stream
        ## was artificial and annoying, so smoosh them together
        s2.body.clean_visible = '\n\n'.join(
            [s1.title.cleansed, s1.anchor.cleansed, s1.body.cleansed])

    changed_body_raw = False
    if s1.title and s1.title.raw:
        s2.body.raw = s1.title.raw
        s2.body.raw += r'\n\n'
        changed_body_raw = True

    if s1.anchor and s1.anchor.raw:
        s2.body.raw += s1.anchor.raw
        s2.body.raw += r'\n\n'
        changed_body_raw = True

    if changed_body_raw:
        s2.body.raw += s1.body.raw

    if s1.title:
        ci = ContentItem(
            raw=s1.title.raw,
            encoding=s1.title.encoding,
            clean_visible=s1.title.cleansed,
        )
        s2.other_content['title'] = ci

    if s1.anchor:
        ci = ContentItem(
            raw=s1.anchor.raw,
            encoding=s1.anchor.encoding,
            clean_visible=s1.anchor.cleansed,
        )
        s2.other_content['anchor'] = ci

    return s2
def make_test_stream_item(test_data_dir):
    stream_item = make_stream_item(None, 'http://nytimes.com/')
    stream_item.body = ContentItem()
    path = os.path.join(test_data_dir, 'test',
                        'nytimes-index-clean-stable.html')
    stream_item.body.clean_html = open(str(path)).read()
    return stream_item
def add_content_item(stream_item, title_m):
    title = whitespace_re.sub(' ', title_m.group('title')).strip()
    if len(title) > 60:
        title = title[:60] + '...'
    stream_item.other_content['title'] = ContentItem(clean_visible=title)
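## Hedged illustration, not part of the original module, of how add_content_item
## might be driven.  It relies on the module-level whitespace_re used above;
## 'title_re' and '_example_add_title' are hypothetical names introduced here
## only to show that title_m must be a match object with a named 'title' group.
def _example_add_title(stream_item, html):
    import re
    title_re = re.compile(r'<title[^>]*>(?P<title>.*?)</title>', re.I | re.S)
    title_m = title_re.search(html)
    if title_m:
        add_content_item(stream_item, title_m)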