def process_path(self, chunk_path):
    ## make temporary file paths based on chunk_path
    clean_visible_path = chunk_path + '-clean_visible.xml'
    ner_xml_path = chunk_path + '-ner.xml'

    ## process the chunk's clean_visible data into xml
    i_chunk = Chunk(path=chunk_path, mode='rb')
    make_clean_visible_file(i_chunk, clean_visible_path)

    ## make sure we are holding nothing that consumes memory
    i_chunk = None

    ## generate an output file from the tagger
    self.make_ner_file(clean_visible_path, ner_xml_path)

    ## make a new output chunk at a temporary path
    tmp_chunk_path = chunk_path + '_'
    o_chunk = Chunk(path=tmp_chunk_path, mode='wb')

    ## re-open i_chunk
    i_chunk = Chunk(path=chunk_path, mode='rb')

    ## fuse the output file with i_chunk to make o_chunk
    self.align_chunk_with_ner(ner_xml_path, i_chunk, o_chunk)

    ## clean up temp files
    if self.config['cleanup_tmp_files']:
        os.remove(clean_visible_path)
        os.remove(ner_xml_path)

    ## atomically rename the new chunk file into place
    os.rename(tmp_chunk_path, chunk_path)
def make_hyperlink_labeled_test_chunk():
    '''
    returns a path to a temporary chunk that has been hyperlink labeled
    '''
    tpath = os.path.join('/tmp', str(uuid.uuid1()) + '.sc')
    o_chunk = Chunk(tpath, mode='wb')
    dpath = os.path.dirname(__file__)
    ipath = os.path.join(
        dpath, _TEST_DATA_ROOT,
        'test/WEBLOG-100-fd5f05c8a680faa2bf8c55413e949bbf.sc')
    cv = _init_stage('clean_visible', {})
    hl = hyperlink_labels(
        {'require_abs_url': True,
         'all_domains': True,
         'offset_types': ['BYTES']}
    )
    for si in Chunk(path=ipath):
        ## clear out existing labels and tokens
        si.body.labels = {}
        si.body.sentences = {}
        context = {}
        hl(si, context)
        cv(si, context)
        o_chunk.add(si)

    o_chunk.close()
    return tpath
def test_get_name_info(tmpdir):
    path = str(tmpdir.join('test_path'))
    c = Chunk(path, mode='wb')
    c.add(make_stream_item(28491, 'abs_url'))
    name_info = get_name_info(path, i_str='foo')
    assert name_info['date_now'] == name_info['date_time_now'][:10]
    assert (name_info['date_now'] + '-' + name_info['time_now']
            == name_info['date_time_now'])
def test_matcher():
    config = dict(
        ## command to run
        fpat_path="cat",
    )
    fm = fpat_matcher(config)

    si1 = make_stream_item(None, "http://example.com")
    si1.body = ContentItem(clean_visible="hello! This is a test of matching Bob.")

    si2 = make_stream_item(None, "http://example.com")
    si2.body = ContentItem(clean_visible="hello! This is a test of matching Sally.")

    chunk_path = "/tmp/%s" % uuid.uuid1()
    ch = Chunk(chunk_path, mode="wb")
    ch.add(si1)
    ch.add(si1)
    ch.add(si2)
    ch.close()

    fm(chunk_path)

    ch = Chunk(chunk_path, mode="rb")
    SIs = list(ch)

    ## verify each si has the expected labels
    for si in SIs:
        assert len(si.body.labels) == 1

    for i in range(2):
        print SIs[i].ratings
def run_pipeline(tagger_id, input_dir, output_dir, tmp_dir='/tmp', pipeline_root='./'):
    '''
    run the tagger identified by tagger_id over every streamcorpus.Chunk
    file in input_dir and write the tagged chunks into output_dir
    '''
    ## make tmp_dir unique to this process
    tmp_dir = os.path.join(tmp_dir, '%s-%s' % (uuid.uuid1(), os.getpid()))
    assert not os.path.exists(tmp_dir), tmp_dir
    os.makedirs(tmp_dir)

    for fname in os.listdir(input_dir):
        if not fname.endswith('.sc'):
            ## ignore any non-streamcorpus.Chunk files
            continue
        fpath = os.path.join(input_dir, fname)

        ## just need one chunk for this tiny corpus
        i_chunk = Chunk(file_obj=open(fpath))

        ## prepare to make intermediate files in tmp_dir
        tmp_cleansed_path = os.path.join(tmp_dir, fname + '.cleansed.xml')
        tmp_ner_path = os.path.join(tmp_dir, fname + '.ner.xml')
        tmp_done_path = os.path.join(output_dir, fname + '.done.partial')
        final_done_path = os.path.join(output_dir, fname + '.done.sc')

        make_cleansed_file(i_chunk, tmp_cleansed_path)
        make_ner_file(tagger_id, tmp_cleansed_path, tmp_ner_path, pipeline_root)
        align_chunk_with_ner(tmp_ner_path, i_chunk, tmp_done_path)

        ## atomic rename when done
        os.rename(tmp_done_path, final_done_path)

        ## replace with log.info()
        print 'done with %s' % final_done_path
def test_tokenizer(test_data_dir):
    path = os.path.join(test_data_dir, 'test', 'wlc-chunk-with-labels.sc')
    num = 0
    for si in Chunk(path):
        num += 1
    ## there is only one StreamItem in this chunk
    sentences = si.body.sentences.pop('nltk_tokenizer')
    t = nltk_tokenizer(config={'annotator_id': 'author'})
    t.process_item(si)
    assert num > 0

    ## if something changes, then need to save new test data
    #open(path, 'wb').write(serialize(si))
    #return

    if 1:
        #assert si.body.sentences['nltk_tokenizer'] == sentences
        num = 0
        for i in range(len(si.body.sentences['nltk_tokenizer'])):
            for j in range(len(si.body.sentences['nltk_tokenizer'][i].tokens)):
                tok_t = si.body.sentences['nltk_tokenizer'][i].tokens[j]
                for attr in dir(tok_t):
                    if attr.startswith('__'):
                        continue
                    #type(attr) == type(test_tokenizer): continue
                    ## printing for diagnostics when things change
                    #print 'checking ', attr
                    assert getattr(tok_t, attr) == getattr(sentences[i].tokens[j], attr)
                    num += 1
    assert num > 0
def cids_and_fcs():
    count = 0
    seen = set()
    for si in Chunk(t_path):
        clean_html = getattr(si.body, 'clean_html', '')
        if clean_html is None or len(clean_html.strip()) == 0:
            logger.warn('dropping SI lacking clean_html: %r', si.abs_url)
            continue
        if 'other_features' in si.other_content:
            other_features = json.loads(si.other_content['other_features'].raw)
        else:
            other_features = None
        fc = html_to_fc(
            clean_html=si.body.clean_html.decode('utf-8'),
            clean_visible=si.body.clean_visible.decode('utf-8'),
            encoding='utf-8',
            url=si.abs_url,
            timestamp=si.stream_time.epoch_ticks,
            other_features=other_features,
        )
        add_sip_to_fc(fc, self.tfidf)
        content_id = mk_content_id(str(fc.get(u'meta_url')))
        if content_id in seen:
            logger.warn('dropping duplicate content_id=%r', content_id)
        else:
            seen.add(content_id)
            yield content_id, fc
            count += 1
    logger.info('saved %d FCs from %d SIs', count, len(seen))
def make_hyperlink_labeled_test_chunk(tmpdir):
    '''
    returns a path to a temporary chunk that has been hyperlink labeled
    '''
    tpath = tmpdir.join(str(uuid.uuid1()) + '.sc')
    o_chunk = Chunk(tpath, mode='wb')
    ipath = get_test_chunk_path()
    hl = hyperlink_labels(config={
        'require_abs_url': True,
        'all_domains': True,
        'offset_types': [BYTES],
    })
    cv = make_clean_visible(config={})
    for si in Chunk(path=ipath, message=streamcorpus.StreamItem_v0_2_0):
        ## clear out existing labels and tokens
        si.body.labels = {}
        si.body.sentences = {}
        context = {}
        hl(si, context)
        cv(si, context)
        o_chunk.add(si)
    o_chunk.close()
    return tpath
def _aligner_core(t_path1, aligner, aligner_data):
    t_chunk1 = Chunk(t_path1, mode='rb')
    t_path2 = t_path1 + '-tmp-aligning'
    t_chunk2 = Chunk(t_path2, mode='wb')
    for si in t_chunk1:
        aligner(si, aligner_data)
        t_chunk2.add(si)
    t_chunk1.close()
    t_chunk2.close()

    if aligner_data.get('cleanup_tmp_files', True):
        logger.info('atomic rename: %r --> %r', t_path2, t_path1)
        os.rename(t_path2, t_path1)
        logger.debug('done renaming')
    else:
        # for development, leave the intermediate tmp file in place
        shutil.copy(t_path2, t_path1)
        logger.info('copied %r -> %r', t_path2, t_path1)
def cca_items(args):
    '''This generator takes an s3_paths_fname file, fetches the data,
    constructs a CCA record, and yields it.
    '''
    for path in lzma.open(args.s3_paths_fname):
        if args.date_hour is not None:
            if not path.startswith(args.date_hour):
                continue
        s3_path = args.s3_path_prefix + path.strip()
        url = args.s3_http_host + s3_path
        logger.info(url)
        retries = 0
        max_retries = 10
        while retries < max_retries:
            retries += 1
            sys.stderr.flush()
            try:
                resp = requests.get(url)
                errors, data = decrypt_and_uncompress(resp.content, gpg_private='trec-kba-rsa')
                logger.info('\n'.join(errors))
                for si in Chunk(file_obj=StringIO(data)):

                    item = {
                        'key': si.stream_id,
                        'url': si.abs_url,
                        'timestamp': si.stream_time.epoch_ticks,
                        'request': None,  ## not part of this data set
                        'response': {
                            'headers': [
                                ['Content-Type', 'text/html'],
                            ],
                            'body': si.body.clean_html,
                            ## alternatively, could use si.body.raw and
                            ## si.body.media_type for the Content-Type
                            ## header, but that would cause the Serif NER
                            ## to be useless to teams...
                        },
                        'imported': None,
                    }
                    yield item

                    #print cbor.dumps(rec)

                    ## do something with the data
                    logger.info(
                        '%d bytes of html, or %d bytes of tag-stripped clean_visible, '
                        'and %d sentences with %d tokens' % (
                            len(si.body.clean_html),
                            len(si.body.clean_visible),
                            len(si.body.sentences['serif']),
                            len(list(chain(*map(attrgetter('tokens'),
                                                si.body.sentences['serif'])))),
                        ))

                break  # break out of retry loop
            except Exception, exc:
                logger.critical(traceback.format_exc(exc))
                logger.critical('retrying %d of %d times to fetch and access: %s'
                                % (retries, max_retries, url))
                time.sleep(1)
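## The comment inside cca_items above mentions an alternative: serve the
## original bytes rather than clean_html.  A minimal sketch of that variant
## (the helper name is hypothetical; media_type and raw are fields of
## StreamItem.body in the streamcorpus interfaces):
def raw_response(si):
    '''alternative response dict using the original raw bytes'''
    return {
        'headers': [['Content-Type', si.body.media_type]],
        'body': si.body.raw,
    }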
def get_john_smith_tagged_by_lingpipe_without_labels_data():
    fh = StringIO()
    o_chunk = Chunk(file_obj=fh, mode='wb')
    path = get_john_smith_tagged_by_lingpipe_path()
    for si in Chunk(path):
        for sentence in si.body.sentences['lingpipe']:
            for token in sentence.tokens:
                for labels in token.labels.values():
                    for label in labels:
                        label.offsets.update(token.offsets)
                        for offset in label.offsets.values():
                            offset.value = token.token
                        add_annotation(si.body, label)
                token.labels = dict()
        o_chunk.add(si)
    o_chunk.flush()
    return fh.getvalue()
def __call__(self, chunk_path):
    '''
    batch-type transform stage: reads a chunk from chunk_path, and
    replaces it with a new chunk at the same path
    '''
    ## make a new output chunk at a temporary path
    tmp_chunk_path = chunk_path + '_'
    t_chunk = Chunk(path=tmp_chunk_path, mode='wb')

    for num, si in enumerate(Chunk(path=chunk_path)):
        if num < self.config['max_items']:
            t_chunk.add(si)
        else:
            break

    ## flush to disk
    t_chunk.close()

    ## atomic rename new chunk file into place
    os.rename(tmp_chunk_path, chunk_path)
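## A hedged usage sketch (not from the original source) of the batch-type
## transform contract documented above: the stage object is called with a
## chunk path and atomically replaces the file at that path, so callers
## simply re-open the same path afterwards.
def apply_batch_stage(stage, chunk_path):
    stage(chunk_path)                               # rewrites chunk_path in place
    return list(Chunk(path=chunk_path, mode='rb'))  # re-read the replaced chunk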
def main():
    parser = argparse.ArgumentParser(
        'process streamcorpus.Chunk files to generate CBOR files'
        ' to load into memex_dossier.akagraph.')
    parser.add_argument('input_paths', nargs='+',
                        help='paths to streamcorpus.Chunk files')
    parser.add_argument('--output-path',
                        help='cbor file (or cbor.gz) to create')
    parser.add_argument('--xform', action='store_true', default=False,
                        help='run structured_features transform before page_extractors')
    parser.add_argument('--total', type=int,
                        help='anticipated number of StreamItems')
    parser.add_argument('--limit', type=int,
                        help='stop processing after this many StreamItems')
    args = parser.parse_args()

    xform = structured_features(structured_features.default_config)

    fopen = open
    if args.output_path.endswith('.gz'):
        fopen = gzip.open
    fh = fopen(args.output_path, 'wb')

    count = 0
    start = time.time()
    for path in args.input_paths:
        for si in Chunk(path):
            count += 1
            if count % 100 == 0:
                elapsed = time.time() - start
                rate = count / elapsed
                msg = '%d done in %.1f secs --> %.1f per sec' % (count, elapsed, rate)
                if args.total:
                    remaining = (args.total - count) / rate
                    msg += ' --> %.1f sec remaining' % remaining
                print(msg)
                sys.stdout.flush()
            if args.limit and count > args.limit:
                break
            #url_parts = urlparse(si.abs_url)
            if args.xform:
                si = xform(si)
            slots = profile_page(si)
            if slots:
                slots = cbor.loads(slots)
                better_slots = {}
                for key, values in slots['slots'].iteritems():
                    assert isinstance(values, list), values
                    better_slots[key.lower()] = [
                        unicodedata.normalize('NFKC', v).lower() for v in values]
                better_slots['url'] = si.abs_url
                cbor.dump(better_slots, fh)

    fh.close()
    print('done')
def start(self):
    self.toFactoriePipeName = os.tmpnam()
    self.fromFactoriePipeName = os.tmpnam()
    os.mkfifo(self.toFactoriePipeName)
    os.mkfifo(self.fromFactoriePipeName)
    logger.debug('made fifos %r %r', self.toFactoriePipeName, self.fromFactoriePipeName)

    self.call_factorie(self.toFactoriePipeName, self.fromFactoriePipeName)

    self.pipeToFactorie = Chunk(path=self.toFactoriePipeName, mode='ab')
    self.pipeFromFactorie = Chunk(path=self.fromFactoriePipeName, mode='rb')
    self.taggedChunkIter = iter(self.pipeFromFactorie)
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('out_path')
    args = parser.parse_args()

    si = make_stream_item(1, 'http://crazydog.com')
    si.body.raw = '''
Flying dogs are amazing.
The flight of the super dog Sam Vroomvroom is often cited as the first
such flying dog.
'''
    topic_name = 'The flight of the super dog Sam Vroomvroom'
    sel = Selector(
        selector_type=SelectorType.TOPIC.value,
        raw_selector=topic_name,
        # the lower-cased canonical form is the key for making this selector
        # appear on a profile of this title
        canonical_selector=topic_name.lower(),
        offsets={
            OffsetType.CHARS: Offset(
                type=OffsetType.CHARS,
                first=si.body.raw.find('The'),
                length=len(topic_name),
            )
        },
    )
    si.body.selectors['other'] = [sel]

    chunk = Chunk(args.out_path, mode='wb')
    chunk.add(si)
    chunk.close()
def test_spinn3r_pipeline(filename, urls, pipeline_config, output_file):
    """minimal end-to-end test, with a fixed pipeline"""
    with yakonfig.defaulted_config([streamcorpus_pipeline], config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))
        work_unit = SimpleWorkUnit(filename)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)

        with Chunk(path=output_file, mode='rb') as chunk:
            assert [si.abs_url for si in chunk] == urls
class factorie(FactorieBase):
    '''
    incremental transform
    '''
    def __init__(self, config):
        super(factorie, self).__init__(config)

        self.toFactoriePipeName = None
        self.fromFactoriePipeName = None
        self.pipeToFactorie = None
        self.pipeFromFactorie = None
        self.taggedChunkIter = None

    def start(self):
        self.toFactoriePipeName = os.tmpnam()
        self.fromFactoriePipeName = os.tmpnam()
        os.mkfifo(self.toFactoriePipeName)
        os.mkfifo(self.fromFactoriePipeName)
        logger.debug('made fifos %r %r', self.toFactoriePipeName, self.fromFactoriePipeName)

        self.call_factorie(self.toFactoriePipeName, self.fromFactoriePipeName)

        self.pipeToFactorie = Chunk(path=self.toFactoriePipeName, mode='ab')
        self.pipeFromFactorie = Chunk(path=self.fromFactoriePipeName, mode='rb')
        self.taggedChunkIter = iter(self.pipeFromFactorie)

    def close(self):
        self.pipeToFactorie.close()
        self.taggedChunkIter = None
        self.pipeFromFactorie = None
        os.unlink(self.toFactoriePipeName)
        os.unlink(self.fromFactoriePipeName)
        if self.process:
            self.process.terminate()
            self.process = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
        return False

    def __call__(self, stream_item, context):
        if not self.process:
            self.start()
        logger.debug('pushing stream item to factorie')
        self.pipeToFactorie.add(stream_item)
        self.pipeToFactorie.flush()
        nc = self.taggedChunkIter.next()
        logger.debug('got item from factorie')
        return nc
def test_spinn3r_pipeline_filter_matches(filename, urls, pipeline_config, output_file):
    """set a publisher_type filter that matches everything in the feed"""
    pipeline_config['streamcorpus_pipeline']['from_spinn3r_feed'] = {
        'publisher_type': 'WEBLOG'
    }
    with yakonfig.defaulted_config([streamcorpus_pipeline], config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))
        work_unit = SimpleWorkUnit(filename)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)

        with Chunk(path=output_file, mode='rb') as chunk:
            assert [si.abs_url for si in chunk] == urls
def verify(self, o_path, md5):
    url = 'http://s3.amazonaws.com/%(bucket)s/%(o_path)s' % dict(
        bucket=self.config['bucket'], o_path=o_path)
    logger.info('fetching %r' % url)
    req = requests.get(url)
    errors, data = decrypt_and_uncompress(
        req.content,  # pylint: disable=E1103
        self.config.get('gpg_decryption_key_path'),
        tmp_dir=self.config['tmp_dir_path'],
    )
    logger.info('got back SIs: %d' % len(list(Chunk(data=data))))
    rec_md5 = hashlib.md5(data).hexdigest()  # pylint: disable=E1101
    if md5 == rec_md5:
        return
    else:
        logger.critical('\n'.join(errors))
        raise Exception('original md5 = %r != %r = received md5' % (md5, rec_md5))
def test_spinn3r_pipeline_ignore_prefetched(filename, urls, pipeline_config, output_file):
    """configuration explicitly ignores bad prefetched data"""
    pipeline_config['streamcorpus_pipeline']['from_spinn3r_feed'] = {
        'use_prefetched': False
    }
    with yakonfig.defaulted_config([streamcorpus_pipeline], config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))
        key = filename
        from_spinn3r_feed._prefetched[key] = 'bogus data, dude!'
        work_unit = SimpleWorkUnit(key)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)
        del from_spinn3r_feed._prefetched[key]

        with Chunk(path=output_file, mode='rb') as chunk:
            assert [si.abs_url for si in chunk] == urls
def get_john_smith_tagged_by_lingpipe_without_labels_data(test_data_dir):
    fh = StringIO()
    o_chunk = Chunk(file_obj=fh, mode='wb')
    path = get_john_smith_tagged_by_lingpipe_path(test_data_dir)
    for si in Chunk(path):
        for sentence in si.body.sentences['lingpipe']:
            for token in sentence.tokens:
                for labels in token.labels.values():
                    for label in labels:
                        label.offsets.update(token.offsets)
                        for offset in label.offsets.values():
                            offset.value = token.token
                        add_annotation(si.body, label)
                token.labels = dict()
        o_chunk.add(si)
    o_chunk.flush()
    return fh.getvalue()
def test_spinn3r_pipeline_prefetched(filename, urls, pipeline_config, output_file):
    """minimal end-to-end test, preloading data in the loader"""
    pipeline_config['streamcorpus_pipeline']['from_spinn3r_feed'] = {
        'use_prefetched': True
    }
    with yakonfig.defaulted_config([streamcorpus_pipeline], config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))
        key = 'test_file.bin'
        with open(filename, 'rb') as f:
            from_spinn3r_feed._prefetched[key] = f.read()
        work_unit = SimpleWorkUnit(key)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)
        del from_spinn3r_feed._prefetched[key]

        with Chunk(path=output_file, mode='rb') as chunk:
            assert [si.abs_url for si in chunk] == urls
def attempt_fetch(work_unit, fpath):
    '''attempt a fetch and iteration over a work_unit.key path in s3
    '''
    url = 'http://s3.amazonaws.com/aws-publicdatasets/' + work_unit.key.strip()

    ## cheapest way to iterate over the corpus is a few stages of
    ## streamed child processes.  Note that stderr needs to go
    ## separately to a file so that reading the stdin doesn't get
    ## blocked:
    cmd = '(wget -O - %s | gpg --no-permission-warning --trust-model always --output - --decrypt - | xz --decompress) 2> %s-err' % (url, fpath)

    print cmd
    child = Popen(cmd, stdout=PIPE, shell=True)
    print 'child launched'
    sys.stdout.flush()

    si_count = 0
    serif_count = 0
    exc = ''
    stream_ids = list()
    clean_visible_bytes = 0
    clean_visible_count = 0
    try:
        for si in Chunk(file_obj=child.stdout):
            print si.stream_id, si.abs_url
            if si.body.language:
                lang = si.body.language.code
            else:
                lang = ''
            stream_ids.append((lang, si.stream_id))
            if si.body.clean_visible:
                clean_visible_count += 1
                clean_visible_bytes += len(si.body.clean_visible)
            si_count += 1
            if 'serif' in si.body.sentences:
                serif_count += 1
    except Exception, exc:
        exc = re.sub('\s+', ' ', str(exc)).strip()
def test_tagger_transform(tagger, chain_selector, stages, tmpdir, test_data_dir):
    transform = stages.init_stage(
        tagger, {
            tagger: {
                'tagger_id': 'lingpipe',
                'annotator_id': 'bagga-and-baldwin',
                'chain_selector': chain_selector,
            }
        })
    data = get_john_smith_tagged_by_lingpipe_without_labels_data(test_data_dir)
    with tmpdir.join('{}.{}.sc'.format(tagger, chain_selector)).open('wb') as tf:
        tf.write(data)
        tf.flush()
        transform.process_path(tf.name)
    found_one = False
    for si in Chunk(tf.name):
        for sentence in si.body.sentences['lingpipe']:
            for token in sentence.tokens:
                if token.labels:
                    found_one = True
    assert found_one
def get_test_chunk(test_data_dir):
    return Chunk(path=get_test_chunk_path(test_data_dir),
                 message=StreamItem_v0_2_0)
def get_name_info(chunk_path, assert_one_date_hour=False, i_str=None):
    '''
    takes a chunk blob and obtains the date_hour, md5, num
    '''
    assert i_str is not None, 'must provide i_str as keyword arg'

    name_info = dict()

    if i_str:
        name_info['i_str'] = i_str
    else:
        name_info['i_str'] = ''

    i_fname = i_str.split('/')[-1]
    i_fname = i_fname.split('.')[0]  ## strip off .sc[.xz[.gpg]]
    name_info['input_fname'] = i_fname
    name_info['input_md5'] = i_fname.split('-')[-1]

    # TODO: return a dict-like object that does the expensive
    # calculation lazily; the name format might not even need that
    # value.
    ch = Chunk(path=chunk_path, mode='rb')
    date_hours = set()
    target_names = set()
    doc_ids = set()
    epoch_ticks = None
    count = 0
    for si in ch:
        if epoch_ticks is None:
            epoch_ticks = si.stream_time.epoch_ticks
        date_hours.add(si.stream_time.zulu_timestamp[:13])
        doc_ids.add(si.doc_id)
        for annotator_id, ratings in si.ratings.items():
            for rating in ratings:
                target_name = rating.target.target_id.split('/')[-1]
                target_names.add(target_name)
        count += 1

    ## create the md5 property, so we can use it in the filename
    name_info['md5'] = ch.md5_hexdigest
    name_info['num'] = count
    name_info['epoch_ticks'] = epoch_ticks
    name_info['target_names'] = '-'.join(target_names)
    name_info['doc_ids_8'] = '-'.join([di[:8] for di in doc_ids])

    if assert_one_date_hour:
        assert len(date_hours) == 1, \
            'got a chunk with other than one date_hour! ' + \
            repr(date_hours)

    if len(date_hours) > 0:
        date_hour = list(date_hours)[0]
        date_hour = date_hour.replace('T', '-')
    else:
        assert count == 0, (date_hours, count)
        date_hour = None
    name_info['date_hour'] = date_hour

    # TODO: in a future lazy-evaluation world, rand8 should return a
    # different value every time it is accessed so that a format could
    # be 'foo-{rand8}{rand8}'
    name_info['rand8'] = '%08x' % (random.randint(0, 0x7fffffff),)
    name_info['date_now'] = datetime.datetime.utcnow().strftime('%Y-%m-%d')
    name_info['time_now'] = datetime.datetime.utcnow().strftime('%H-%M-%S')
    name_info['date_time_now'] = datetime.datetime.utcnow().strftime('%Y-%m-%d-%H-%M-%S')
    return name_info
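## A minimal usage sketch (not from the original source): get_name_info
## returns a flat dict, so its keys can feed a str.format() template when
## naming output chunks.  The template below is hypothetical; the keys it
## uses (date_hour, input_fname, md5, num) are all set above.
def format_output_name(chunk_path, i_str,
                       template='{date_hour}/{input_fname}-{md5}-{num}.sc'):
    name_info = get_name_info(chunk_path, i_str=i_str)
    return template.format(**name_info)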
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('input')
    args = parser.parse_args()

    logger = logging.getLogger('streamcorpus_pipeline')
    ch = logging.StreamHandler()
    logger.addHandler(ch)
    streamcorpus_logger = logging.getLogger('streamcorpus')
    streamcorpus_logger.addHandler(ch)

    t = nltk_tokenizer(dict(annotator_id='author'))
    for si in Chunk(args.input):
        t.process_item(si)
        if si.body.clean_visible:
            assert len(si.body.sentences['nltk_tokenizer']) > 0
            logger.critical(
                'num_sentences=%d, has wikipedia %r and %d labels',
                len(si.body.sentences['nltk_tokenizer']),
                'wikipedia.org' in si.body.clean_html,
                len(getattr(si.body, 'labels', {}).get('author', {})))
            #if len(getattr(si.body, 'labels', {}).get('author', {})) > 3:
            #    c = Chunk('foo.sc', mode='wb')
            #    c.add(si)
            #    c.close()
            #    sys.exit()
def profiles_from_runfile(runfile_path,
                          offset_c_prepended=False,
                          offset_inclusive=True,
                          decode_utf=False,
                          streamitems_dir=None,
                          max_lines=None):
    '''
    Returns a dictionary mapping from entity-name to ComparableProfile,
    where the ComparableProfiles are constructed from a runfile.
    '''
    runfile = gzip.open(runfile_path, 'r')
    filter_run = runfile.readline()
    assert filter_run.startswith('#')
    filter_run = json.loads(filter_run[1:])

    if filter_run['task_id'] != 'kba-ssf-2014':
        # do nothing
        return

    runfile_profiles = dict()

    runfile_csv = csv.reader(runfile, delimiter='\t')

    count = 1
    for row in runfile_csv:
        # skip comments
        if row[0].startswith('#'):
            continue

        if max_lines is not None and count > max_lines:
            break
        count += 1

        # parse the row
        stream_item = row[2]
        profile_name = row[3]
        slot_name = row[8]
        slot_value = row[9]
        offset_str = row[10]

        # initialize profile
        if profile_name not in runfile_profiles:
            runfile_profiles[profile_name] = ComparableProfile(
                profile_name, truncate_counts=True)

        # do all the offsets have a 'c' prepended?
        if offset_c_prepended:
            # remove the 'c's
            offsets = [offset[1:] for offset in offset_str.split(',')]
        else:
            # if there is no 'c' prepended, we also know that there is one offset.
            offsets = [offset_str]

        log('{}: fetching stream item: {}'.format(runfile_path, stream_item))

        ## The chunk files are located two levels deep in a directory
        ## hierarchy, where each level is a 2-character prefix of the
        ## stream-id.  We extract the first 4 characters of the
        ## stream-id in chunks of 2, so we can find the corresponding
        ## StreamItem on the filesystem.  For example, the StreamItem
        ## with id 1234567890-abcdef0123456789abcdef0123456789 would be
        ## stored in a single-item chunk file called
        ## ./ab/cd/1234567890-abcdef0123456789abcdef0123456789.sc and
        ## the further file extensions of .xz or .xz.gpg are optional,
        ## because the streamcorpus python package handles that for us.
        match = re.match('.*-(..)(..).*', stream_item)
        if match is None:
            raise Exception(
                "Cannot read StreamItem for {}".format(stream_item))
        stream_item_path = '{}/{}/{}.sc.xz.gpg'.format(
            match.group(1), match.group(2), stream_item)
        stream_item_file_path = os.path.join(streamitems_dir, stream_item_path)

        if not os.path.isfile(stream_item_file_path):
            log('Could not find stream item {}'.format(stream_item))
            continue

        c = Chunk(stream_item_file_path)
        si = [si for si in c][0]  # collect the single si in this chunk

        # are the offsets indexes into the decoded string or the undecoded string?
        if decode_utf:
            clean_visible = si.body.clean_visible.decode('utf-8')
        else:
            clean_visible = si.body.clean_visible

        # parse each offset in offsets
        if len(offsets) > 1:
            begin = int(offsets[0].split('-')[0])
            end = int(offsets[-1].split('-')[1])
        else:
            offset = offsets[0]
            begin, end = [int(loc) for loc in offset.split('-')]

        # account for inclusive offsets.
        if offset_inclusive:
            end += 1

        # build the slot value from clean_visible.
        slot_value_processed = clean_visible[int(begin):int(end)].lower(
            ).replace('_', ' ').strip()

        if not decode_utf:
            # we now must decode, because clean_visible wasn't decoded from the start.
            try:
                slot_value_processed = slot_value_processed.decode('utf-8')
            except UnicodeDecodeError:
                log('Warning: Could not decode slot_value: {}. Will skip slot-fill.'
                    .format(slot_value_processed))
                continue

        log('## %s %s: %s' % (profile_name, slot_name,
                              slot_value_processed.encode('utf-8')))

        # we want the bag-of-words associated with this slot_value
        for value in slot_value_processed.split():
            runfile_profiles[profile_name].add_value_for_slot(slot_name, value)

    return runfile_profiles
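## A small illustrative helper (not part of the runfile code above) for the
## two-level directory layout described in the comment: the first four
## characters after the '-' in a stream-id select the two directory levels.
def chunk_path_for_stream_id(streamitems_dir, stream_id):
    match = re.match('.*-(..)(..).*', stream_id)
    if match is None:
        raise ValueError('cannot derive a chunk path for %r' % stream_id)
    return os.path.join(streamitems_dir, match.group(1), match.group(2),
                        stream_id + '.sc.xz.gpg')

## e.g. chunk_path_for_stream_id('/corpus', '1234567890-abcdef0123456789abcdef0123456789')
## --> '/corpus/ab/cd/1234567890-abcdef0123456789abcdef0123456789.sc.xz.gpg'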
'''

## "Chunk" is a convenience wrapper in the python tools built around
## the streamcorpus thrift interfaces.  It is essentially just a
## wrapper around open(<file_handle>) and can take a path to a flat
## file on disk, or a file_obj that has already been opened in memory,
## such as a pipe from stdin or a network socket.
from streamcorpus import Chunk

## These classes are available in any language that can compile the
## streamcorpus thrift interfaces
from streamcorpus import Tagging, Versions, Relation, Attribute, Sentence, Token

## read StreamItems from stdin.  We will assume that these
## StreamItems have already been constructed and have
## StreamItem.body.clean_visible
i_chunk = Chunk(file_obj=sys.stdin, mode='rb')

## write StreamItems via stdout.  We will add more data to them
o_chunk = Chunk(file_obj=sys.stdout, mode='wb')

## iterate over input chunks, generate data, and write to output
for si in i_chunk:
    assert si.version == Versions.v0_3_0, \
        'new streamcorpus collections should be built using the latest version'

    ## clean_visible is byte identical to clean_html, except all the
    ## tags are converted to whitespace, so offsets in the two match
    #input_html = si.body.clean_html = text.encode('utf8')
    clean_visible = si.body.clean_visible.decode('utf8')

    ## run the text through a tagger
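    ## --- a hedged sketch (not in the original example) of what might follow:
    ## attach a tagger's output to the StreamItem and write it out.  The
    ## run_my_tagger call and its output are assumptions; Tagging and the
    ## body.taggings / body.sentences fields come from the imports above.
    tagging = Tagging()
    tagging.tagger_id = 'my_tagger'                     # hypothetical tagger name
    tagging.raw_tagging = run_my_tagger(clean_visible)  # hypothetical tagger call
    si.body.taggings['my_tagger'] = tagging
    si.body.sentences['my_tagger'] = []                 # would hold Sentence objects with Tokens
    o_chunk.add(si)

o_chunk.close()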
        )
        si.source_metadata['lang'] = pe.lang[0].code
        si.source_metadata['author'] = json.dumps(
            dict(
                name=pe.author[0].name,
                email=pe.author[0].email,
                link=pe.author[0].link[0].href,
            )
        )
        si.source = entry.source.publisher_type
        yield si

if __name__ == '__main__':
    #import sys
    #from _handle_unconvertible_spinn3r import handle_unconvertible_spinn3r as hus
    #map(hus, _generate_stream_items(sys.stdin.read()))
    o_chunk = Chunk('/tmp/foo.sc', mode='wb')
    for si in _generate_stream_items(sys.stdin.read()):
        print '---post smoosh raw: %s --' % si.stream_id
        print si.body.raw
        print si.stream_id
        if si.stream_id == '1345928297-da71cfa833ce8218684b6dab152dd69b':
            o_chunk.add(si)
    o_chunk.close()
from streamcorpus import make_stream_item, make_stream_time, get_date_hour

## get the useful file wrapper class
from streamcorpus import Chunk

## get a couple of the classes compiled from the thrift interface definitions
from streamcorpus import Tagging, Versions

## somehow get a list of input text files
# fake example with just one input, downloaded via wget (see __doc__
# string above)
input_files = ['index.html']

## open a chunk file to write StreamItems
output_path = 'first-output-chunk.sc'
ch = Chunk(output_path, mode='wb')

for file_path in input_files:
    ## get the text
    text = open(file_path).read()

    ## every StreamItem has a timestamp, which ideally is the creation
    ## time of the text
    zulu_timestamp = '2013-04-18T18:18:20.000000Z'

    ## every StreamItem has an absolute URL, which ideally points to
    ## the real text on the Web
    abs_url = 'http://nytimes.com/index.html'

    si = make_stream_item(zulu_timestamp, abs_url)
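    ## --- a hedged continuation sketch (not in the original snippet): fill in
    ## the body and write the StreamItem to the chunk, following the pattern
    ## used elsewhere in these examples.  ContentItem is assumed to be
    ## imported from streamcorpus; the media_type value is an assumption.
    si.body = ContentItem(raw=text, media_type='text/html')
    ch.add(si)

ch.close()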
        os.rename(tmp_done_path, final_done_path)

        ## replace with log.info()
        print 'done with %s' % final_done_path

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('tagger_id', metavar='stanford|lingpipe',
                        help='name of NLP pipeline to run')
    parser.add_argument('input_dir',
                        help='directory of Chunk files')
    parser.add_argument('output_dir',
                        help='directory to put new streamcorpus.Chunk files')
    parser.add_argument('--convert-kba', action='store_true', default=False,
                        help='Expect input_dir to have old-style KBA chunks')
    parser.add_argument('--pipeline-root', metavar='PIPELINE_ROOT', dest='pipeline_root',
                        help='file path to the root dir for the particular NLP pipeline')
    parser.add_argument('--align-only', metavar='OUTPUT_PATH', dest='align_only', default=None,
                        help='produce a chunk file at OUTPUT_PATH by aligning a single'
                             ' input chunk file with the intermediate')
    ## could add options to delete input after verifying the output on disk?
    args = parser.parse_args()

    if args.align_only is not None:
        i_chunk = Chunk(file_obj=open(args.input_dir))
        tmp_ner_path = args.output_dir
        tmp_done_path = args.align_only
        align_chunk_with_ner(tmp_ner_path, i_chunk, tmp_done_path)
    else:
        run_pipeline(args.tagger_id, args.input_dir, args.output_dir,
                     pipeline_root=args.pipeline_root)
for path in lzma.open(s3_paths_fname):
    s3_path = s3_path_prefix + path.strip()
    url = s3_http_host + s3_path
    logger.info(url)
    retries = 0
    max_retries = 10
    while retries < max_retries:
        retries += 1
        sys.stderr.flush()
        try:
            resp = requests.get(url)
            errors, data = decrypt_and_uncompress(resp.content, gpg_private='trec-kba-rsa')
            logger.info('\n'.join(errors))
            for si in Chunk(file_obj=StringIO(data)):

                rec = {
                    'url': si.abs_url,
                    'timestamp': si.stream_time.epoch_ticks,
                    'request': None,  ## not part of this data set
                    'response': {
                        'headers': [
                            ['Content-Type', 'text/html'],
                        ],
                        'body': si.body.clean_html,
                        ## alternatively, could use si.body.raw and
                        ## si.body.media_type for the Content-Type
                        ## header, but that would cause the Serif NER
                        ## to be useless to teams...
                    },
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('action', metavar='postproc|align',
                        help='postproc')
    parser.add_argument('input_file',
                        help='XML file from LingPipe')
    parser.add_argument('output_file',
                        help='XML file to generate with OWPL data')
    parser.add_argument(
        '--source_chunk',
        help='source chunk file that was input to the pipeline data')
    args = parser.parse_args()

    if args.action == 'postproc':
        text = open(args.input_file).read()
        print 'read %d bytes from %s' % (len(text), args.input_file)
        raise NotImplementedError(
            'need to instantiate a LingPipeParser object here')
        for stream_id, tagged_doc in files(text):
            for sent in sentences(tagged_doc):  # pylint: disable=E0602
                for tok in sent.tokens:
                    if tok.entity_type is not None:
                        print tok, EntityType._VALUES_TO_NAMES[tok.entity_type]

    elif args.action == 'align':
        i_chunk = Chunk(path=args.source_chunk, mode='rb')
        o_chunk = Chunk(path=args.output_file, mode='wb')
        align_chunk_with_ner(args.input_file, i_chunk, o_chunk)  # pylint: disable=E0602
def align_chunk_with_ner(tmp_ner_path, i_chunk, tmp_done_path):
    '''
    iterate through the i_chunk and tmp_ner_path to generate a new
    Chunk with body.ner
    '''
    o_chunk = Chunk()
    input_iter = i_chunk.__iter__()
    ner = ''
    stream_id = None

    all_ner = xml.dom.minidom.parse(open(tmp_ner_path))

    for raw_ner in all_ner.getElementsByTagName('FILENAME'):
        stream_item = input_iter.next()
        ## get stream_id out of the XML
        stream_id = raw_ner.attributes.get('docid').value
        assert stream_id and stream_id == stream_item.stream_id, \
            '%s != %s\nner=%r' % (stream_id, stream_item.stream_id, ner)

        tagger_id = 'lingpipe'
        tagging = Tagging()
        tagging.tagger_id = tagger_id

        ## get this one file out of its FILENAME tags
        tagged_doc = list(lingpipe.files(raw_ner.toxml()))[0][1]
        tagging.raw_tagging = tagged_doc
        tagging.generation_time = streamcorpus.make_stream_time()
        stream_item.body.taggings[tagger_id] = tagging

        sentences = list(lingpipe.sentences(tagged_doc))

        ## make JS labels on individual tokens
        assert stream_item.ratings[0].mentions, stream_item.stream_id
        john_smith_label = Label()
        john_smith_label.annotator = stream_item.ratings[0].annotator
        john_smith_label.target_id = stream_item.ratings[0].target_id

        # first map all corefchains to their words
        equiv_ids = collections.defaultdict(lambda: set())
        for sent in sentences:
            for tok in sent.tokens:
                if tok.entity_type is not None:
                    equiv_ids[tok.equiv_id].add(cleanse(tok.token))

        ## find all the chains that are John Smith
        johnsmiths = set()
        for equiv_id, names in equiv_ids.items():
            ## detect 'smith' in 'smithye'
            _names = cleanse(' '.join(names))
            if 'john' in _names and 'smith' in _names:
                johnsmiths.add(equiv_id)

        print len(johnsmiths)

        ## now apply the label
        for sent in sentences:
            for tok in sent.tokens:
                if tok.equiv_id in johnsmiths:
                    tok.labels = [john_smith_label]

        stream_item.body.sentences[tagger_id] = sentences

        o_chunk.add(stream_item)

    ## put the o_chunk bytes into the specified file
    open(tmp_done_path, 'wb').write(str(o_chunk))
    ## replace this with log.info()
    print 'created %s' % tmp_done_path