def __call__(self, s1, context):
    s2 = make_stream_item(s1.stream_time.zulu_timestamp, s1.abs_url)
    s2.schost = s1.schost
    s2.source = s1.source
    s2.source_metadata['kba-2012'] = s1.source_metadata

    logger.debug('len(original .body.raw) = %d' % len(s1.body.raw))
    #logger.critical(repr(s2))

    s2.body = ContentItem(
        raw=s1.body.raw,
        encoding=s1.body.encoding,
        ## default, might get overwritten below
        media_type='text/html',
        taggings={
            'stanford': Tagging(
                tagger_id='stanford',
                raw_tagging=s1.body.ner,
                generation_time=make_stream_time('2012-06-01T00:00:00.0Z'),
                tagger_config='annotators: {tokenize, cleanxml, ssplit, pos, lemma, ner}, '
                              'properties: pos.maxlen=100',
                tagger_version='Stanford CoreNLP ver 1.2.0',
            )
        })

    if self.config['keep_old_cleansed_as_clean_visible']:
        s2.body.clean_visible = s1.body.cleansed

    if s1.source == 'social':
        s2.body.media_type = 'text/plain'
        ## the separation of content items in the social stream
        ## was artificial and annoying, so smoosh them together
        s2.body.clean_visible = '\n\n'.join(
            [s1.title.cleansed, s1.anchor.cleansed, s1.body.cleansed])

    changed_body_raw = False
    if s1.title and s1.title.raw:
        s2.body.raw = s1.title.raw
        s2.body.raw += r'\n\n'
        changed_body_raw = True

    if s1.anchor and s1.anchor.raw:
        s2.body.raw += s1.anchor.raw
        s2.body.raw += r'\n\n'
        changed_body_raw = True

    if changed_body_raw:
        s2.body.raw += s1.body.raw

    if s1.title:
        ci = ContentItem(
            raw=s1.title.raw,
            encoding=s1.title.encoding,
            clean_visible=s1.title.cleansed,
        )
        s2.other_content['title'] = ci

    if s1.anchor:
        ci = ContentItem(
            raw=s1.anchor.raw,
            encoding=s1.anchor.encoding,
            clean_visible=s1.anchor.cleansed,
        )
        s2.other_content['anchor'] = ci

    return s2
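
## Hedged usage sketch (not part of the pipeline source): driving the
## kba-2012 upgrade transform above by hand over a whole chunk.  The
## `upgrader` argument stands in for a configured instance of the transform
## class that owns the __call__ above; the paths and the empty context dict
## are placeholders, and reading an old-format chunk may require passing the
## matching old message class via streamcorpus.Chunk's message= argument.
import streamcorpus

def upgrade_chunk(old_path, new_path, upgrader):
    '''copy every item from an old-format chunk through the upgrader'''
    o_chunk = streamcorpus.Chunk(path=new_path, mode='wb')
    for s1 in streamcorpus.Chunk(path=old_path):
        s2 = upgrader(s1, context={})
        o_chunk.add(s2)
    o_chunk.close()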
class from_serifxml(Configured):
    '''Read a Serif XML intermediate file as the input to the pipeline.

    This is a specialized reader for unusual circumstances; you will
    still need to run :class:`~streamcorpus_pipeline._serif.serif`
    with special settings to complete the tagging.

    This expects to find serifxml flat files in a directory and
    creates a :class:`~streamcorpus.Tagging` with
    :attr:`~streamcorpus.Tagging.raw_tagging` holding the serifxml
    string.  This :class:`~streamcorpus.Tagging` is stored in
    :attr:`~streamcorpus.StreamItem.body.taggings`.  This also fills
    in the :attr:`~streamcorpus.ContentItem.raw` field.

    This has one configuration option, which can usually be left at
    its default value:

    .. code-block:: yaml

        streamcorpus_pipeline:
          from_serifxml:
            tagger_id: serif

    `tagger_id` is the tagger name in the generated
    :class:`~streamcorpus.StreamItem`.

    To obtain :attr:`~streamcorpus.StreamItem.body.sentences`, one
    must run Serif in the special `read_serifxml` mode:

    .. code-block:: yaml

        streamcorpus_pipeline:
          third_dir_path: /third
          tmp_dir_path: tmp
          reader: from_serifxml
          incremental_transforms:
          - language
          - guess_media_type
          - clean_html
          - clean_visible
          - title
          batch_transforms:
          - serif
          language:
            force:
              name: English
              code: en
          guess_media_type:
            fallback_media_type: text/plain
          serif:
            path_in_third: serif/serif-latest
            serif_exe: bin/x86_64/Serif
            par: streamcorpus_read_serifxml
            par_additions:
              streamcorpus_read_serifxml:
              - "# example additional line"
          writer: to_local_chunks
          to_local_chunks:
            output_type: otherdir
            output_path: test_output
            output_name: "%(input_fname)s"

    '''
    config_name = 'from_serifxml'
    default_config = {
        'tagger_id': 'serif',
    }

    def __call__(self, i_str):
        # Read in the entire contents as text; we will need to
        # save it away later
        with open(i_str, 'r') as f:
            serifxml = f.read()

        fname = os.path.basename(i_str)
        stream_time = None
        date_m = date_in_file_name_re.match(fname)
        if date_m:
            year = int(date_m.group('year'))
            month = int(date_m.group('month'))
            day = int(date_m.group('day'))
            try:
                stream_time = streamcorpus.make_stream_time(
                    zulu_timestamp='%d-%02d-%02dT00:00:01.000000Z'
                    % (year, month, day))
            except Exception, exc:
                logger.info(
                    'trapped failed parsing of file name to make stream_time',
                    exc_info=True)
                stream_time = None

        if not stream_time:
            ## fall back to using the present moment on this system
            epoch_ticks = time.time()  ### NOT IN THE SERIFXML FILE
            stream_time = streamcorpus.make_stream_time(
                epoch_ticks=epoch_ticks)

        # Parse the XML
        root = etree.fromstring(serifxml)

        # Get some key parts
        doc_id = root.xpath('string(/SerifXML/Document/@docid)')
        source = root.xpath('string(/SerifXML/Document/@source_type)')
        raw = root.xpath('string(/SerifXML/Document/OriginalText/Contents)')

        # Build the StreamItem
        tagging = streamcorpus.Tagging(
            tagger_id=self.config['tagger_id'],
            raw_tagging=serifxml,
        )
        body = streamcorpus.ContentItem(
            raw=raw,
            taggings={
                self.config['tagger_id']: tagging,
            },
        )
        si = streamcorpus.StreamItem(
            version=streamcorpus.Versions.v0_3_0,
            doc_id=doc_id,
            abs_url=fname,
            source=source,
            body=body,
            stream_id='%d-%s' % (stream_time.epoch_ticks, doc_id),
            stream_time=stream_time,
        )
        yield si
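
## Hedged usage sketch (assumption: the Configured base class accepts a
## plain config dict at construction time).  The serifxml path below is a
## placeholder chosen to match the date-in-filename pattern that
## date_in_file_name_re is expected to recognize; the reader yields
## StreamItems whose raw tagging carries the original serifxml text.
reader = from_serifxml(config={'tagger_id': 'serif'})
for si in reader('/tmp/example-2014-01-15.serifxml'):
    print si.stream_id, si.doc_id
    serifxml_text = si.body.taggings['serif'].raw_tagging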
def align_chunk_with_ner(tmp_ner_path, i_chunk, tmp_done_path):
    '''
    iterate through the i_chunk and tmp_ner_path to generate a new
    Chunk with body.ner
    '''
    o_chunk = Chunk()
    input_iter = i_chunk.__iter__()
    ner = ''
    stream_id = None

    all_ner = xml.dom.minidom.parse(open(tmp_ner_path))

    for raw_ner in all_ner.getElementsByTagName('FILENAME'):
        stream_item = input_iter.next()
        ## get stream_id out of the XML
        stream_id = raw_ner.attributes.get('docid').value
        assert stream_id and stream_id == stream_item.stream_id, \
            '%s != %s\nner=%r' % (stream_id, stream_item.stream_id, ner)

        tagger_id = 'lingpipe'
        tagging = Tagging()
        tagging.tagger_id = tagger_id

        ## get this one file out of its FILENAME tags
        tagged_doc = list(lingpipe.files(raw_ner.toxml()))[0][1]
        tagging.raw_tagging = tagged_doc
        tagging.generation_time = streamcorpus.make_stream_time()
        stream_item.body.taggings[tagger_id] = tagging

        sentences = list(lingpipe.sentences(tagged_doc))

        ## make JS labels on individual tokens
        assert stream_item.ratings[0].mentions, stream_item.stream_id
        john_smith_label = Label()
        john_smith_label.annotator = stream_item.ratings[0].annotator
        john_smith_label.target_id = stream_item.ratings[0].target_id

        # first map all corefchains to their words
        equiv_ids = collections.defaultdict(lambda: set())
        for sent in sentences:
            for tok in sent.tokens:
                if tok.entity_type is not None:
                    equiv_ids[tok.equiv_id].add(cleanse(tok.token))

        ## find all the chains that are John Smith
        johnsmiths = set()
        for equiv_id, names in equiv_ids.items():
            ## detect 'smith' in 'smithye'
            _names = cleanse(' '.join(names))
            if 'john' in _names and 'smith' in _names:
                johnsmiths.add(equiv_id)

        print len(johnsmiths)

        ## now apply the label
        for sent in sentences:
            for tok in sent.tokens:
                if tok.equiv_id in johnsmiths:
                    tok.labels = [john_smith_label]

        stream_item.body.sentences[tagger_id] = sentences
        o_chunk.add(stream_item)

    ## put the o_chunk bytes into the specified file
    open(tmp_done_path, 'wb').write(str(o_chunk))
    ## replace this with log.info()
    print 'created %s' % tmp_done_path
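
## Small illustration (not from the source) of the chain test above: a coref
## chain counts as "John Smith" when both substrings appear anywhere in the
## cleansed, concatenated chain names (which is why 'smithye' also matches).
## cleanse_stub is a stand-in; the real cleanse() lives in the pipeline and
## its exact normalization is assumed here.
def cleanse_stub(s):
    return s.lower().strip()

equiv_ids = {7: set(['John', 'Mr. Smith']), 9: set(['Jane Doe'])}
johnsmiths = set()
for equiv_id, names in equiv_ids.items():
    _names = cleanse_stub(' '.join(names))
    if 'john' in _names and 'smith' in _names:
        johnsmiths.add(equiv_id)
assert johnsmiths == set([7])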
def align_chunk_with_ner(self, ner_xml_path, i_chunk, o_chunk):
    '''
    iterate through ner_xml_path to fuse with i_chunk into o_chunk
    '''
    ## prepare to iterate over the input chunk
    input_iter = i_chunk.__iter__()

    all_ner = xml.dom.minidom.parse(open(ner_xml_path))

    ## this converts our UTF-8 data into unicode strings, so when
    ## we want to compute byte offsets or construct tokens, we
    ## must .encode('utf8')
    for ner_dom in all_ner.getElementsByTagName('FILENAME'):
        #for stream_id, raw_ner in files(open(ner_xml_path).read().decode('utf8')):
        stream_item = input_iter.next()

        ## get stream_id out of the XML
        stream_id = ner_dom.attributes.get('stream_id').value
        if stream_item.stream_id is None:
            assert not stream_id, 'out of sync: None != %r' % stream_id
            logger.critical('si.stream_id is None... ignoring')
            continue
        assert stream_id and stream_id == stream_item.stream_id, \
            '%s != %s' % (stream_id, stream_item.stream_id)

        if not stream_item.body:
            ## the XML better have had an empty clean_visible too...
            #assert not ner_dom....something
            continue

        tagging = Tagging()
        tagging.tagger_id = self.tagger_id  # pylint: disable=E1101
        '''
        ## get this one file out of its FILENAME tags
        tagged_doc_parts = list(files(ner_dom.toxml()))
        if not tagged_doc_parts:
            continue
        tagged_doc = tagged_doc_parts[0][1]

        ## hack
        hope_original = make_clean_visible(tagged_doc, '')
        open(ner_xml_path + '-clean', 'wb').write(hope_original.encode('utf-8'))
        print ner_xml_path + '-clean'
        '''
        #tagging.raw_tagging = tagged_doc
        tagging.generation_time = streamcorpus.make_stream_time()
        stream_item.body.taggings[self.tagger_id] = tagging  # pylint: disable=E1101

        ## could consume lots of memory here by instantiating everything
        sentences, relations, attributes = self.get_sentences(ner_dom)

        stream_item.body.sentences[self.tagger_id] = sentences  # pylint: disable=E1101
        stream_item.body.relations[self.tagger_id] = relations  # pylint: disable=E1101
        stream_item.body.attributes[self.tagger_id] = attributes  # pylint: disable=E1101

        logger.debug('finished aligning tokens %s' % stream_item.stream_id)
        '''
        for num, sent in enumerate(sentences):
            for tok in sent.tokens:
                print '%d\t%d\t%s' % (num, tok.offsets[OffsetType.LINES].first, repr(tok.token))
        '''

        if 'align_labels_by' in self.config and self.config['align_labels_by']:
            assert 'aligner_data' in self.config, 'config missing "aligner_data"'
            aligner = AlignmentStrategies[self.config['align_labels_by']]
            aligner(stream_item, self.config['aligner_data'])

        ## forcibly collect dereferenced objects
        gc.collect()

        try:
            o_chunk.add(stream_item)
        except MemoryError, exc:
            msg = traceback.format_exc(exc)
            msg += make_memory_info_msg()
            logger.critical(msg)
            raise PipelineOutOfMemory(msg)
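
## Hypothetical sketch of the AlignmentStrategies contract used above: the
## registry maps each `align_labels_by` config value to a callable taking
## (stream_item, aligner_data).  The strategy name, the aligner_data key,
## and the list-valued tok.labels below mirror the older usage earlier in
## this file and are illustrative assumptions, not the pipeline's real
## strategies.
from streamcorpus import Label

def label_everything(stream_item, aligner_data):
    '''apply one Label from the first rating to every token'''
    label = Label()
    label.annotator = stream_item.ratings[0].annotator
    label.target_id = stream_item.ratings[0].target_id
    for sentences in stream_item.body.sentences.values():
        for sent in sentences:
            for tok in sent.tokens:
                tok.labels = [label]

AlignmentStrategies['label_everything'] = label_everything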
count = 0
for i_str in args.purge:
    if not args.quiet:
        print("purging: %r" % i_str.strip())
    tq.purge(i_str)
    count += 1
print("Done purging %d strs" % count)

if args.counts:
    if args.detailed:
        counts = tq.counts_detailed
    else:
        counts = tq.counts
    print("\ncounts for %s at %s" % (namespace, tq.addresses))
    print(repr(make_stream_time()))
    print("\n".join(["\t%s:\t%s" % (k, v) for k, v in counts.items()]))
    available_pending_completed = sum(
        [counts[k] for k in ["available", "pending", "completed"]])
    print("%d len(available+pending+completed)" % available_pending_completed)
    print("%d len(tasks)" % counts["tasks"])
    print("%d missing" % (counts["tasks"] - available_pending_completed))

if args.list_details:
    data = tq.details(args.list_details)
    # print '\n'.join(data['results'])
    print data

if args.list_completed:
    for completed in tq.completed:
        print "#%s\n%s" % (completed["i_str"], "\n".join(completed["results"]))