def test_title():
    stage = title({})
    cv = clean_visible({})

    si = make_stream_item(0, '')
    si.body.clean_html = '''Then there was a <tag> ... <title>TITLE HERE </title> '''
    si = cv(si, {})
    si = stage(si)
    assert si.other_content['title'].clean_visible == 'TITLE HERE'

    si = make_stream_item(0, '')
    si.body.clean_html = '''Then there was a that went <tag> ... <title>TITLE HERE%s </title> ''' % ('*' * 80)
    si = cv(si, {})
    si = stage(si)
    assert si.other_content['title'].clean_visible == 'TITLE HERE' + '*' * 50 + '...'
def test_matcher():
    config = dict(
        ## command to run
        fpat_path="cat"
    )
    fm = fpat_matcher(config)

    si1 = make_stream_item(None, "http://example.com")
    si1.body = ContentItem(clean_visible="hello! This is a test of matching Bob.")
    si2 = make_stream_item(None, "http://example.com")
    si2.body = ContentItem(clean_visible="hello! This is a test of matching Sally.")

    chunk_path = "/tmp/%s" % uuid.uuid1()
    ch = Chunk(chunk_path, mode="wb")
    ch.add(si1)
    ch.add(si1)
    ch.add(si2)
    ch.close()

    fm(chunk_path)

    ch = Chunk(chunk_path, mode="rb")
    SIs = list(ch)

    ## verify the si has expected things
    for si in SIs:
        assert len(si.body.labels) == 1

    for i in range(2):
        print SIs[i].ratings
def test_filter_domains(tmpdir):
    domains_path = tmpdir.join('domains_path.txt')
    domains_path.write('cats.com\nhttp://birds.com/')
    stage = filter_domains(dict(
        include_domains = ['dogs.com'],
        include_domains_path = str(domains_path),
    ))
    assert stage.domains == set(['dogs.com', 'cats.com', 'birds.com'])

    si = make_stream_item(0, 'http://dogs.com/')
    assert stage(si) is not None

    si = make_stream_item(0, 'http://cats.com/')
    assert stage(si) is not None

    si = make_stream_item(0, 'http://birds.com/')
    assert stage(si) is not None

    si = make_stream_item(0, 'http://things.com/')
    assert stage(si) is None

    si = make_stream_item(0, 'http://things.com/')
    si.schost = 'https://birds.com'
    assert domain_name_cleanse(si.schost) == 'birds.com'
    assert stage(si) is not None
def test_ft_with_stream_item():
    si = streamcorpus.make_stream_item('2005-01-01T05:06:07.0Z', 'abc')
    tokens = [
        streamcorpus.Token(offsets={
            streamcorpus.OffsetType.XPATH_CHARS: streamcorpus.Offset(
                type=streamcorpus.OffsetType.XPATH_CHARS,
                xpath='/html[1]/body[1]/p[1]/b[1]/text()[1]',
                first=0,
                xpath_end='/html[1]/body[1]/p[1]/text()[1]',
                xpath_end_offset=2),
        }),
        streamcorpus.Token(offsets={
            streamcorpus.OffsetType.XPATH_CHARS: streamcorpus.Offset(
                type=streamcorpus.OffsetType.XPATH_CHARS,
                xpath='/html[1]/body[1]/p[1]/b[2]/text()[1]',
                first=0,
                xpath_end='/html[1]/body[1]/p[1]/text()[2]',
                xpath_end_offset=4),
        }),
    ]
    si.body.sentences = {'test': [streamcorpus.Sentence(tokens=tokens)]}
    si.body.clean_html = '<html><body><p><b>T</b>om ' \
                         '<b>B</b>rady</p></body></html>'
    ft = FeatureTokens()
    ft['tom brady'].append([('test', 0, 0), ('test', 0, 1)])
    assert next(ft.xpath_slices(si, 'tom brady')) == 'Tom Brady'
def test_fix_text(test_data_dir):
    fpath = path.join(test_data_dir, 'test/microsoft-quotes.txt')
    si = make_stream_item(None, 'test')
    si.body = ContentItem(raw=open(fpath).read())
    fixer = fix_text(config={'read_from': 'raw', 'write_to': 'clean_visible'})
    fixer(si, {})
    assert(si.body.clean_visible.strip() == 'Do not "quote me."')
def _make_stream_item(cls, path, metadata, abs_url, entities):
    '''
    '''
    ## Every StreamItem has a stream_time property.  It usually comes
    ## from the document creation time.
    creation_time = os.path.getctime(path)

    ## make stream item
    stream_item = streamcorpus.make_stream_item(
        creation_time, abs_url)

    stream_item.source = metadata.get('source')

    ## build a ContentItem for the body
    body = streamcorpus.ContentItem()
    body.media_type = magic.from_file(path, mime=True)

    logger.info('opening %r', path)
    with open(path) as f:
        body.raw = f.read()

    ## attach the content_item to the stream_item
    stream_item.body = body

    ## annotations
    anno = streamcorpus.Annotator()
    anno.annotator_id = metadata['annotator_id']
    anno.annotation_time = stream_item.stream_time

    num_ratings = 0
    for entity, is_profile in entities:
        num_ratings += 1

        ## pull out target id and mention tokens
        target_id = str(entity['target_id'])

        ## build a Label for the doc-level label:
        rating = streamcorpus.Rating()
        rating.annotator = anno
        rating.target = streamcorpus.Target(target_id = target_id)
        rating.contains_mention = True

        if is_profile:
            rating.flags = [streamcorpus.FlagType.PROFILE]

        ## parse slots in yaml file
        slots = cls._parse_slots(entity['slots'])

        ## heuristically split the slots string on white space and
        ## use each token as a separate mention.
        rating.mentions = [cleanse(unicode(slot[1], 'utf-8')) for slot in slots]

        ## put this one label in the array of labels
        streamcorpus.add_annotation(stream_item, rating)

    ## provide this stream_item to the pipeline
    logger.info('created StreamItem(num ratings=%d, abs_url=%r)',
                num_ratings, stream_item.abs_url)
    return stream_item
def process_response(self, resp):
    logger.info('retrieved %d bytes for %r', len(resp.content), resp.url)
    last_modified = resp.headers.get('last-modified')
    if last_modified:
        try:
            last_modified = dateutil.parser.parse(last_modified)
            last_modified = int(last_modified.strftime('%s'))
        except Exception:
            last_modified = None
    if not last_modified:
        last_modified = int(time.time())
    si = streamcorpus.make_stream_item(last_modified, resp.url)
    # don't try to convert it... e.g. if we got a PDF
    si.original_url = resp.url
    si.body.raw = resp.content
    media_type = resp.headers.get('content-type')
    try:
        media_type = (media_type.decode('utf8', 'ignore')
                      .encode('utf8', 'ignore'))
    except Exception:
        media_type = repr(media_type)
    si.body.media_type = media_type
    si.body.encoding = resp.apparent_encoding
    return si
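## NOTE: the following helper is NOT part of the original code above; it is a
## minimal, hypothetical sketch of an alternative to int(dt.strftime('%s')),
## which relies on a platform-specific strftime extension.  It assumes
## python-dateutil is available, as in the snippet it accompanies.
import calendar
import time

import dateutil.parser


def parse_last_modified(header_value):
    '''Return epoch seconds for an HTTP Last-Modified header, else now().'''
    if header_value:
        try:
            dt = dateutil.parser.parse(header_value)
            ## timegm interprets the struct_time as UTC, avoiding the
            ## local-timezone behavior of strftime('%s')
            return calendar.timegm(dt.utctimetuple())
        except (ValueError, TypeError):
            pass
    return int(time.time())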
def generate_stream_items_from_kba_json(json_file_path):
    ## iterate over gzip'ed file of JSON lines
    data = gzip.GzipFile(fileobj=open(json_file_path, 'rb'), mode='rb').read()
    for line in data.splitlines():
        try:
            doc = json.loads(line)
        except Exception, exc:
            print('trapped: %s' % traceback.format_exc(exc))
            print('continuing')
            continue

        assert doc['source'] == 'social', doc['source']

        ## make a StreamItem with valid StreamTime computed from
        ## zulu_timestamp.  This will fix the four-hour offsets in
        ## some of the KBA 2012 files.
        stream_item = make_stream_item(
            doc['stream_time']['zulu_timestamp'],
            bytes(doc['abs_url'].encode('utf-8'))
        )

        ## capture schost and source
        stream_item.schost = doc.pop('schost')
        stream_item.source = doc.pop('source')

        ## assemble source_metadata
        stream_item.source_metadata['kba-2012'] = json.dumps(doc.pop('source_metadata'))

        ## might have a funky original URL
        stream_item.original_url = doc['original_url'] and \
            bytes(doc['original_url'].encode('utf-8')) or b''

        ## get the three possible ContentItems
        body = doc.pop('body', {}).pop('raw', '').decode('string-escape')
        title = doc.pop('title', {}).pop('raw', '').decode('string-escape')
        anchor = doc.pop('anchor', {}).pop('raw', '').decode('string-escape')

        stream_item.body = ContentItem(
            raw = b''.join(['<p>', anchor, '</p>', '<p>', title, '</p>', body]),
            media_type = 'text/html',
            encoding = 'UTF-8',
        )

        if title:
            stream_item.other_content['title'] = ContentItem(
                raw = title,
                media_type = 'text/html',
                encoding = 'UTF-8',
            )
        if anchor:
            stream_item.other_content['anchor'] = ContentItem(
                raw = anchor,
                media_type = 'text/html',
                encoding = 'UTF-8',
            )

        yield stream_item
def test_multi_token_match():
    si = make_stream_item(0, '')
    tagger_id = 'test_tagger'
    annotator_id = 'test_anno'
    target_id = 'test_target'
    si.body.sentences[tagger_id] = [
        Sentence(tokens=[
            Token(token='This'),
            Token(token='-LRB-big-RRB- dog'),
            Token(token='Jake'),
            Token(token='has'),
            Token(token='no'),
            Token(token=u'\u1F601'.encode('utf8')),
            Token(token='...'),
            Token(token='Teeth'),
        ])]
    rating = Rating(annotator=Annotator(annotator_id=annotator_id),
                    target=Target(target_id=target_id),
                    mentions=['Big dog! Jake... ', u'\u1F601 Teeth'.encode('utf8')],
                    )
    add_annotation(si, rating)
    aligner_data = dict(
        tagger_id = tagger_id,
        annotator_id = annotator_id,
    )

    multi_token_match(si, aligner_data)

    assert si.body.sentences[tagger_id][0].tokens[1].labels
    assert si.body.sentences[tagger_id][0].tokens[2].labels
    assert si.body.sentences[tagger_id][0].tokens[-3].labels
    assert si.body.sentences[tagger_id][0].tokens[-2].labels
    assert si.body.sentences[tagger_id][0].tokens[-1].labels
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('out_path')
    args = parser.parse_args()

    si = make_stream_item(1, 'http://crazydog.com')
    si.body.raw = ''' Flying dogs are amazing. The flight of the super dog Sam Vroomvroom is often cited as the first such flying dog. '''

    topic_name = 'The flight of the super dog Sam Vroomvroom'
    sel = Selector(
        selector_type=SelectorType.TOPIC.value,
        raw_selector=topic_name,
        # this is the key for making it appear for a profile of this title
        canonical_selector=topic_name.lower(),
        offsets={
            OffsetType.CHARS: Offset(
                type=OffsetType.CHARS,
                first=si.body.raw.find('The'),
                length=len(topic_name),
            )
        },
    )
    si.body.selectors['other'] = [sel]

    chunk = Chunk(args.out_path, mode='wb')
    chunk.add(si)
    chunk.close()
def test_kvlayer_simple(configurator, tmpdir):
    si = streamcorpus.make_stream_item('2000-01-01T12:34:00.000123Z',
                                       'test://test.stream.item/')
    chunkfile = str(tmpdir.join('chunk.sc.xz'))
    with streamcorpus.Chunk(path=chunkfile, mode='wb') as chunk:
        chunk.add(si)

    with configurator():
        writer = to_kvlayer(yakonfig.get_global_config(
            'streamcorpus_pipeline', 'to_kvlayer'))
        writer(chunkfile, {}, '')

        kvlclient = kvlayer.client()
        kvlclient.setup_namespace({'stream_items': 2})
        print repr(list(kvlclient.scan_keys('stream_items')))
        for (k, v) in kvlclient.get(
                'stream_items',
                (uuid.UUID(int=946730040),
                 uuid.UUID(hex='985c1e3ed73256cd9a399919fe93cf76'))):
            assert v is not None

        reader = from_kvlayer(yakonfig.get_global_config(
            'streamcorpus_pipeline', 'from_kvlayer'))
        sis = list(reader(''))
        assert len(sis) == 1
        assert sis[0].stream_time.epoch_ticks == si.stream_time.epoch_ticks
        assert sis[0].abs_url == si.abs_url
def process_response(self, resp): logger.info("retrieved %d bytes for %r", len(resp.content), resp.url) last_modified = resp.headers.get("last-modified") if last_modified: try: last_modified = dateutil.parser.parse(last_modified) last_modified = int(last_modified.strftime("%s")) except Exception: last_modified = None if not last_modified: last_modified = int(time.time()) si = streamcorpus.make_stream_item(last_modified, resp.url) # don't try to convert it... e.g. if we got a PDF si.original_url = resp.url si.body.raw = resp.content media_type = resp.headers.get("content-type") try: media_type = media_type.decode("utf8", "ignore").encode("utf8", "ignore") except Exception: media_type = repr(media_type) si.body.media_type = media_type si.body.encoding = resp.apparent_encoding return si
def fetch_all(urls, out_dir):
    session = requests.Session()
    roller = streamcorpus.ChunkRoller(out_dir)
    start = time.time()
    try:
        for idx, url in enumerate(urls):
            print('# starting fetch of %r' % url)
            sys.stdout.flush()
            resp = requests.get(url, headers=headers)
            last_modified = resp.headers.get('last-modified')
            if last_modified:
                try:
                    last_modified = int(last_modified)
                except:
                    dt = dateutil.parser.parse(last_modified)
                    last_modified = int(dt.strftime('%s'))
            si = streamcorpus.make_stream_item(last_modified or time.time(), url)
            si.body.raw = resp.content
            si.body.media_type = resp.headers.get('content-type')
            si.body.encoding = resp.encoding
            roller.add(si)
            print('fetched %d bytes for %s with last_modified=%r'
                  % (len(si.body.raw), url, last_modified))
            if idx % 10 == 0:
                elapsed = time.time() - start
                rate = (idx + 1) / elapsed
                remaining = (len(urls) - 1 - idx) / rate / 3600
                print('%d of %d done in %.3f seconds --> %.3f per second --> %.3f hours remaining'
                      % ((idx + 1), len(urls), elapsed, rate, remaining))
                sys.stdout.flush()
    except:
        roller.close()
        raise
    roller.close()
def make_test_stream_item():
    stream_item = make_stream_item(None, 'http://nytimes.com/')
    stream_item.body = ContentItem()
    path = os.path.dirname(__file__)
    path = os.path.join(
        path, _TEST_DATA_ROOT,
        'test', 'nytimes-index-clean-stable.html')
    stream_item.body.clean_html = open(path).read()
    return stream_item
def test_get_name_info(tmpdir):
    path = str(tmpdir.join('test_path'))
    c = Chunk(path, mode='wb')
    c.add(make_stream_item(28491, 'abs_url'))
    name_info = get_name_info(path, i_str='foo')
    assert name_info['date_now'] == name_info['date_time_now'][:10]
    assert name_info['date_now'] + '-' + name_info['time_now'] == name_info['date_time_now']
def test_langauge(test_data_dir):
    path = os.path.join(test_data_dir, 'test/raw-unicode-issues.html')
    si = make_stream_item(None, 'test')
    si.body = ContentItem(raw=open(path).read())
    context = {}
    lang = language(config={})
    lang(si, context)
    assert si.body.language.name == 'Japanese'
    assert si.body.language.code == 'ja'
def setup_nltk(text, run_extractor=True):
    si = make_stream_item(0, '')
    si.body.clean_visible = text
    nt = nltk_tokenizer({})
    nt(si, {})
    regex_extractor = structured_features(structured_features_config)
    if run_extractor:
        regex_extractor(si)
    return si
def _make_stream_item(entry):
    """Given a single spinn3r feed entry, produce a single StreamItem.

    Returns 'None' if a complete item can't be constructed.

    """
    # get standard metadata, assuming it's present...
    if not hasattr(entry, 'permalink_entry'):
        return None
    pe = entry.permalink_entry

    # ...and create a streamitem...
    si = streamcorpus.make_stream_item(
        pe.date_found[:-1] + '.0Z',
        pe.canonical_link.href.encode('utf8'))
    if not si.stream_time:
        logger.debug('failed to generate stream_time from {0!r}'
                     .format(pe.date_found))
        return None
    if not si.abs_url:
        logger.debug('failed to generate abs_url from {0!r}'
                     .format(pe.canonical_link.href))
        return None

    # ...filling in the actual data
    si.body = _make_content_item(
        pe.content,
        alternate_data=entry.feed_entry.content.data)
    if not si.body:
        return None
    if not si.body.raw:
        return None

    if pe.content_extract.data:
        si.other_content['extract'] = _make_content_item(pe.content_extract)

    si.other_content['title'] = streamcorpus.ContentItem(
        raw=pe.title.encode('utf8'),
        media_type=pe.content_extract.mime_type,
        encoding='UTF-8')
    si.other_content['feed_entry_title'] = streamcorpus.ContentItem(
        raw=entry.feed_entry.title.encode('utf8'),
        media_type=entry.feed_entry.content.mime_type,
        encoding='UTF-8')
    if entry.feed_entry.content.data:
        si.other_content['feed_entry'] = _make_content_item(
            entry.feed_entry.content)
    si.source_metadata['lang'] = pe.lang[0].code
    si.source_metadata['author'] = json.dumps(
        dict(
            name = pe.author[0].name,
            email = pe.author[0].email,
            link = pe.author[0].link[0].href,
        )
    )
    si.source = entry.source.publisher_type
    return si
def _get_stream_item(self, item):
    stream_item = streamcorpus.make_stream_item(time.time(), item['url'])
    stream_item.body.raw = self._encode(item.get('body', u''))
    stream_item.body.media_type = self._get_media_type(item)
    stream_item.body.encoding = self.encoding
    stream_item.original_url = item.get('source_url')
    meta = self._get_metadata(item)
    stream_item.body.language = streamcorpus.Language(
        code=meta.get('language_code', '?'),
        name=meta.get('language_name', '?'))
    stream_item.source_metadata = meta
    return stream_item
def _generate_stream_items(protobuf_data):
    '''
    converts all of the protobuf_data spinn3r protoStream format
    into StreamItems, which it yields as a generator
    '''
    ## iterate over entry objects and bytes from which they came
    for num, (entry, delimited_bytes) in enumerate(delimited_messages(protobuf_data)):
        #print num
        if entry is None:
            ## hit end of data
            continue

        if not hasattr(entry, 'permalink_entry'):
            #print 'missing permalink_entry'
            continue

        pe = entry.permalink_entry

        ## verify our understanding of the kludgy link data
        #assert pe.link[0].href[:len(pe.link[0].resource)] == pe.link[0].resource, \
        #    (pe.link[0].href, pe.link[0].resource)
        #assert pe.link[0].href == pe.canonical_link.href
        #assert pe.canonical_link.href.startswith(pe.canonical_link.resource), \
        #    (pe.canonical_link.href, pe.canonical_link.resource)

        ## create a StreamItem for this date_found, canonical_link
        si = make_stream_item(
            pe.date_found[:-1] + '.0Z',
            pe.canonical_link.href.encode('utf8'))

        if not si.stream_time:
            print 'failed to generate stream_time from: %r' % pe.date_found
            continue

        if not si.abs_url:
            print 'failed to generate abs_url from: %r' % pe.canonical_link.href
            continue

        if not pe.content.data:
            continue

        try:
            raw = zlib.decompress(pe.content.data)
            assert raw
        except Exception, exc:
            #print('failed to get anything from decompressing pe.content.data')
            try:
                raw = zlib.decompress(entry.feed_entry.content.data)
                assert raw
            except Exception, exc:
                #print('failed to get anything from decompressing entry.feed_entry.content.data')
                #print('empty entry? %s' % entry)
                continue
def test_langauge():
    path = os.path.dirname(__file__)
    path = os.path.join(path, _TEST_DATA_ROOT, "test/raw-unicode-issues.html")
    si = make_stream_item(None, "test")
    si.body = ContentItem(raw=open(path).read())
    lang = _init_stage("language", {})
    context = {}
    lang(si, context)
    assert si.body.language.name == "Japanese"
    assert si.body.language.code == "ja"
def test_chunk_roller(tmpdir):
    cr = ChunkRoller(str(tmpdir), chunk_max=10)
    for i in range(25):
        si = make_stream_item(i, str(i))
        cr.add(si)
    cr.close()
    files = []
    for fname in os.listdir(str(tmpdir)):
        assert 'tmp' not in fname
        count = int(fname.split('-')[0])
        files.append(count)
    assert sorted(files) == [5, 10, 10]
def test_kvlayer_negative(configurator, tmpdir):
    si = streamcorpus.make_stream_item('1969-07-20T20:18:00.000000Z',
                                       'test://test.stream.item/')
    chunkfile = str(tmpdir.join('chunk.sc.xz'))
    with streamcorpus.Chunk(path=chunkfile, mode='wb') as chunk:
        chunk.add(si)

    with configurator():
        writer = to_kvlayer(yakonfig.get_global_config(
            'streamcorpus_pipeline', 'to_kvlayer'))
        writer(chunkfile, {}, '')

        reader = from_kvlayer(yakonfig.get_global_config(
            'streamcorpus_pipeline', 'from_kvlayer'))
        sis = list(reader(''))
        assert len(sis) == 1
        assert sis[0].stream_time.epoch_ticks == si.stream_time.epoch_ticks
        assert sis[0].abs_url == si.abs_url
def extract_user_names(clean_visible):
    '''also renamed to usernames2'''
    if isinstance(clean_visible, unicode):
        clean_visible = clean_visible.encode('utf8')
    si = make_stream_item(0, '')
    si.body.clean_visible = clean_visible
    xform = nltk_tokenizer({})
    xform.process_item(si)
    classifier = Classifier('naivebayes')
    sc = classifier.build_feature(si)
    logger.info('found usernames: %r', sc)
    open('/tmp/found.txt', 'wb').write(
        '\n'.join(map(lambda x: x.encode('utf8'), sc.keys())))
    return sc
def __call__(self, s1, context):
    s2 = make_stream_item(s1.stream_time.zulu_timestamp, s1.abs_url)
    s2.schost = s1.schost
    s2.source = s1.source
    s2.source_metadata['kba-2012'] = s1.source_metadata

    logger.debug('len(original .body.raw) = %d' % len(s1.body.raw))
    #logger.critical(repr(s2))

    s2.body = ContentItem(
        raw=s1.body.raw,
        encoding=s1.body.encoding,
        ## default, might get overwritten below
        media_type='text/html',
        taggings={
            'stanford': Tagging(
                tagger_id='stanford',
                raw_tagging=s1.body.ner,
                generation_time=make_stream_time('2012-06-01T00:00:00.0Z'),
                tagger_config='annotators: {tokenize, cleanxml, ssplit, pos, lemma, ner}, properties: pos.maxlen=100',
                tagger_version='Stanford CoreNLP ver 1.2.0',
            )
        })

    if self.config['keep_old_cleansed_as_clean_visible']:
        s2.body.clean_visible = s1.body.cleansed

    if s1.source == 'social':
        s2.body.media_type = 'text/plain'
        ## the separation of content items in the social stream
        ## was artificial and annoying, so smoosh them together
        s2.body.clean_visible = '\n\n'.join(
            [s1.title.cleansed, s1.anchor.cleansed, s1.body.cleansed])

    changed_body_raw = False
    if s1.title and s1.title.raw:
        s2.body.raw = s1.title.raw
        s2.body.raw += r'\n\n'
        changed_body_raw = True
    if s1.anchor and s1.anchor.raw:
        s2.body.raw += s1.anchor.raw
        s2.body.raw += r'\n\n'
        changed_body_raw = True
    if changed_body_raw:
        s2.body.raw += s1.body.raw

    if s1.title:
        ci = ContentItem(
            raw=s1.title.raw,
            encoding=s1.title.encoding,
            clean_visible=s1.title.cleansed,
        )
        s2.other_content['title'] = ci
    if s1.anchor:
        ci = ContentItem(
            raw=s1.anchor.raw,
            encoding=s1.anchor.encoding,
            clean_visible=s1.anchor.cleansed)
        s2.other_content['anchor'] = ci

    return s2
def generate_john_smith_chunk(path_to_original):
    '''
    This _looks_ like a Chunk only in that it generates StreamItem
    instances when iterated upon.
    '''
    ## Every StreamItem has a stream_time property.  It usually comes
    ## from the document creation time.  Here, we assume the JS corpus
    ## was created at one moment at the end of 1998:
    creation_time = '1998-12-31T23:59:59.999999Z'
    correct_time = 915148799

    if not path_to_original.startswith('/'):
        path_to_original = os.path.join(os.getcwd(), path_to_original)

    ## iterate over the files in the 35 input directories
    for label_id in range(35):
        dir_path = os.path.join(path_to_original, str(label_id))
        fnames = os.listdir(dir_path)
        fnames.sort()
        for fname in fnames:

            stream_item = streamcorpus.make_stream_item(
                creation_time,
                ## make up an abs_url
                os.path.join(
                    'john-smith-corpus', str(label_id), fname))

            if int(stream_item.stream_time.epoch_ticks) != correct_time:
                raise PipelineBaseException('wrong stream_time construction: %r-->%r != %r'
                                            % (creation_time,
                                               stream_item.stream_time.epoch_ticks,
                                               correct_time))

            ## These docs came from the authors of the paper cited above.
            stream_item.source = 'bagga-and-baldwin'

            ## build a ContentItem for the body
            body = streamcorpus.ContentItem()
            raw_string = open(os.path.join(dir_path, fname)).read()
            ## We know that this is already clean and has nothing
            ## tricky in it, because we manually cleansed it.  To
            ## illustrate how we stick all strings into thrift, we
            ## convert this to unicode (which introduces no changes)
            ## and then encode it as utf-8, which also introduces no
            ## changes.  Thrift stores strings as 8-bit character
            ## strings.
            # http://www.mail-archive.com/[email protected]/msg00210.html
            body.clean_visible = unicode(raw_string).encode('utf8')

            ## attach the content_item to the stream_item
            stream_item.body = body

            stream_item.body.language = streamcorpus.Language(code='en', name='ENGLISH')

            ## The authors also annotated the corpus
            anno = streamcorpus.Annotator()
            anno.annotator_id = 'bagga-and-baldwin'
            anno.annotation_time = stream_item.stream_time

            ## build a Label for the doc-level label:
            rating = streamcorpus.Rating()
            rating.annotator = anno
            rating.target = streamcorpus.Target(target_id = str(label_id))  # must be string
            rating.contains_mention = True
            rating.mentions = ['john', 'smith']

            ## put this one label in the array of labels
            streamcorpus.add_annotation(stream_item, rating)

            ## provide this stream_item to the pipeline
            yield stream_item
def make_test_stream_item(test_data_dir): stream_item = make_stream_item(None, "http://nytimes.com/") stream_item.body = ContentItem() path = os.path.join(test_data_dir, "test", "nytimes-index-clean-stable.html") stream_item.body.clean_html = open(str(path)).read() return stream_item
def _make_stream_item(self, dir_path, fname):
    ## could use dirpath as the label.  Instead, we illustrate
    ## using a TSV file to lookup the ground truth using the fname.
    assert fname in self.ground_truth, (dir_path, fname)

    ## "mention" is the name string from the text
    ## "target_id" is the label
    mention, target_id = self.ground_truth[fname]

    ## Every StreamItem has a stream_time property.  It usually comes
    ## from the document creation time.  Here, we assume the JS corpus
    ## was created at one moment at the end of 1998:
    creation_time = "1998-12-31T23:59:59.999999Z"

    stream_item = streamcorpus.make_stream_item(
        creation_time,
        ## make up an abs_url
        os.path.join("john-smith-corpus", target_id, fname),
    )

    ## These docs came from the authors of the paper cited above.
    stream_item.source = "bagga-and-baldwin"

    ## build a ContentItem for the body
    body = streamcorpus.ContentItem()
    raw_string = open(os.path.join(dir_path, fname)).read()
    ## We know that this is already clean and has nothing
    ## tricky in it, because we manually cleansed it.  To
    ## illustrate how we stick all strings into thrift, we
    ## convert this to unicode (which introduces no changes)
    ## and then encode it as utf-8, which also introduces no
    ## changes.  Thrift stores strings as 8-bit character
    ## strings.
    # http://www.mail-archive.com/[email protected]/msg00210.html
    body.clean_visible = unicode(raw_string).encode("utf8")

    ## attach the content_item to the stream_item
    stream_item.body = body

    stream_item.body.language = streamcorpus.Language(code="en", name="ENGLISH")

    ## The authors also annotated the corpus
    anno = streamcorpus.Annotator()
    anno.annotator_id = "bagga-and-baldwin"
    anno.annotation_time = stream_item.stream_time

    ## build a Label for the doc-level label:
    rating = streamcorpus.Rating()
    rating.annotator = anno
    rating.target = streamcorpus.Target(target_id=target_id)
    rating.contains_mention = True

    ## heuristically split the mentions string on white space and
    ## use each token as a separate mention.  For other corpora,
    ## this might need to be more sophisticated.
    rating.mentions = map(cleanse, mention.decode("utf8").split())

    ## put this one label in the array of labels
    streamcorpus.add_annotation(stream_item, rating)

    ## provide this stream_item to the pipeline
    return stream_item
def generate_john_smith_chunk(path_to_original):
    '''
    This _looks_ like a Chunk only in that it generates StreamItem
    instances when iterated upon.
    '''
    ## Every StreamItem has a stream_time property.  It usually comes
    ## from the document creation time.  Here, we assume the JS corpus
    ## was created at one moment at the end of 1998:
    creation_time = '1998-12-31T23:59:59.999999Z'
    correct_time = 915148799

    if not os.path.isabs(path_to_original):
        path_to_original = os.path.join(os.getcwd(), path_to_original)

    ## iterate over the files in the 35 input directories
    for label_id in range(35):
        dir_path = os.path.join(path_to_original, str(label_id))
        fnames = os.listdir(dir_path)
        fnames.sort()
        for fname in fnames:

            stream_item = streamcorpus.make_stream_item(
                creation_time,
                ## make up an abs_url
                os.path.join('john-smith-corpus', str(label_id), fname))

            if int(stream_item.stream_time.epoch_ticks) != correct_time:
                raise PipelineBaseException('wrong stream_time construction: %r-->%r != %r'
                                            % (creation_time,
                                               stream_item.stream_time.epoch_ticks,
                                               correct_time))

            ## These docs came from the authors of the paper cited above.
            stream_item.source = 'bagga-and-baldwin'

            ## build a ContentItem for the body
            body = streamcorpus.ContentItem()
            raw_string = open(os.path.join(dir_path, fname)).read()
            ## We know that this is already clean and has nothing
            ## tricky in it, because we manually cleansed it.  To
            ## illustrate how we stick all strings into thrift, we
            ## convert this to unicode (which introduces no changes)
            ## and then encode it as utf-8, which also introduces no
            ## changes.  Thrift stores strings as 8-bit character
            ## strings.
            # http://www.mail-archive.com/[email protected]/msg00210.html
            body.clean_visible = unicode(raw_string).encode('utf8')

            ## attach the content_item to the stream_item
            stream_item.body = body

            stream_item.body.language = streamcorpus.Language(code='en', name='ENGLISH')

            ## The authors also annotated the corpus
            anno = streamcorpus.Annotator()
            anno.annotator_id = 'bagga-and-baldwin'
            anno.annotation_time = stream_item.stream_time

            ## build a Label for the doc-level label:
            rating = streamcorpus.Rating()
            rating.annotator = anno
            rating.target = streamcorpus.Target(
                target_id=str(label_id))  # must be string
            rating.contains_mention = True
            rating.mentions = ['john', 'smith']

            ## put this one label in the array of labels
            streamcorpus.add_annotation(stream_item, rating)

            ## provide this stream_item to the pipeline
            yield stream_item
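## NOTE: the following is NOT part of the corpus reader above; it is a minimal
## usage sketch showing how the generator's StreamItems could be drained into
## an on-disk chunk file with the Chunk API used in the other examples.  The
## function name and output path are made up for illustration.
from streamcorpus import Chunk


def write_john_smith_chunk(path_to_original, out_path='john-smith.sc.xz'):
    ch = Chunk(out_path, mode='wb')
    for stream_item in generate_john_smith_chunk(path_to_original):
        ch.add(stream_item)
    ch.close()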
ch = Chunk(output_path, mode='wb')

for file_path in input_files:
    ## get the text
    text = open(file_path).read()

    ## every StreamItem has a timestamp, which ideally is the creation
    ## time of the text
    zulu_timestamp = '2013-04-18T18:18:20.000000Z'

    ## every StreamItem has an absolute URL, which ideally points to
    ## the real text on the Web
    abs_url = 'http://nytimes.com/index.html'

    si = make_stream_item(zulu_timestamp, abs_url)
    assert si.version == Versions.v0_3_0, \
        'new streamcorpus collections should be built using the latest version'

    ## StreamItem.source must be a string without spaces that
    ## identifies the origin of the content.  Existing source names
    ## are 'social', 'news', 'linking', 'arxiv', 'FORUMS', and a few
    ## others.  Make up an appropriate source name for this content,
    ## it should be human readable and make sense as the name of the
    ## corpus.  Typically, when naming chunk files, we use
    ## "<date-hour>/<source>-<md5>.sc.xz"
    si.source = 'news'

    ## all of the StreamItems in a chunk file **must** have the same
    ## source string

    ## if the text is raw from the web and might contain control
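## NOTE: the following helper is NOT part of the snippet above; it is a
## hypothetical sketch of the "<date-hour>/<source>-<md5>.sc.xz" chunk-naming
## convention mentioned in the comments.  The exact layout used by any given
## pipeline may differ; the function and parameter names are made up.
import hashlib


def chunk_file_name(zulu_timestamp, source, chunk_bytes):
    ## take the date and hour from the zulu timestamp, e.g. '2013-04-18-18'
    date_hour = zulu_timestamp[:13].replace('T', '-')
    md5 = hashlib.md5(chunk_bytes).hexdigest()
    return '%s/%s-%s.sc.xz' % (date_hour, source, md5)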
def __call__(self, si, context=None):
    if si.version == streamcorpus.Versions.v0_3_0:
        return si

    if not hasattr(si, 'version'):
        raise NotImplementedError('upgrade_streamcorpus_v0_3_0 does not support upgrading from v0_1_0; see "_upgrade_streamcorpus.py"')

    si3 = streamcorpus.make_stream_item(
        zulu_timestamp=si.stream_time.zulu_timestamp,
        abs_url=si.abs_url)

    if si3.stream_id != si.stream_id:
        si3.external_ids['kba-2013'] = {si3.stream_id: si.stream_id}

    ## copy everything
    for attr in ['original_url', 'ratings', 'schost', 'source',
                 'source_metadata', 'ratings',
                 ]:
        setattr(si3, attr, copy.deepcopy(getattr(si, attr)))

    si3.body = streamcorpus.ContentItem()

    for name, ci in si.other_content.items():
        ci3 = streamcorpus.ContentItem()
        si3.other_content[name] = ci3
        for attr in content_item_attrs:
            setattr(ci3, attr, copy.deepcopy(getattr(ci, attr)))
        upgrade_labels(ci, ci3)

    for attr in content_item_attrs:
        setattr(si3.body, attr, copy.deepcopy(getattr(si.body, attr)))

    upgrade_labels(si.body, si3.body)

    ## fix the body.sentences['lingpipe'] mention_id ranges
    next_global_mention_id = 0
    ## mapping from (sentence_id, mention_id) --> global_mention_id
    mention_ids = {}
    si3.body.sentences['lingpipe'] = []
    for sentence_id, sentence in enumerate(si.body.sentences.get('lingpipe', [])):
        new_sent = streamcorpus.Sentence()
        si3.body.sentences['lingpipe'].append(new_sent)

        for token_id, token in enumerate(sentence.tokens):
            new_token = streamcorpus.Token()
            new_sent.tokens.append(new_token)

            for attr in ['token_num', 'token', 'offsets', 'sentence_pos',
                         'lemma', 'pos', 'entity_type', 'mention_id',
                         'equiv_id', 'parent_id', 'dependency_path']:
                setattr(new_token, attr, copy.deepcopy(getattr(token, attr)))

            upgrade_labels(token, new_token)

            if token.mention_id not in [-1, None]:
                key = (sentence_id, token.mention_id)
                if key in mention_ids:
                    new_mention_id = mention_ids[key]
                else:
                    new_mention_id = next_global_mention_id
                    next_global_mention_id += 1
                    ## save it for later
                    mention_ids[key] = new_mention_id

                new_token.mention_id = new_mention_id
                logger.debug('new_mention_id = %d' % new_mention_id)

                if token.entity_type in [3, 4]:
                    ## convert FEMALE/MALE_PRONOUN
                    new_token.mention_type = streamcorpus.MentionType.PRO
                    new_token.entity_type = streamcorpus.EntityType.PER
                    if token.entity_type == 3:
                        gender_value = 1
                    else:
                        gender_value = 0
                    attr = streamcorpus.Attribute(
                        attribute_type = streamcorpus.AttributeType.PER_AGE,
                        evidence = token.token,
                        value = str(gender_value),
                        sentence_id = sentence_id,
                        mention_id = token.mention_id)
                    if 'lingpipe' not in si3.body.attributes:
                        si3.body.attributes['lingpipe'] = []
                    si3.body.attributes['lingpipe'].append(attr)
                else:
                    new_token.mention_type = streamcorpus.MentionType.NAME

    ## return our newly manufactured v0_3_0 StreamItem
    return si3