def cca_items(args):
    '''This generator takes an s3_paths_fname file, fetches the data,
    constructs a CCA record, and yields it.

    '''
    for path in lzma.open(args.s3_paths_fname):
        if args.date_hour is not None:
            if not path.startswith(args.date_hour):
                continue
        s3_path = args.s3_path_prefix + path.strip()
        url = args.s3_http_host + s3_path
        logger.info(url)
        retries = 0
        max_retries = 10
        while retries < max_retries:
            retries += 1
            sys.stderr.flush()
            try:
                resp = requests.get(url)
                errors, data = decrypt_and_uncompress(resp.content, gpg_private='trec-kba-rsa')
                logger.info('\n'.join(errors))
                for si in Chunk(file_obj=StringIO(data)):

                    item = {
                        'key': si.stream_id,
                        'url': si.abs_url,
                        'timestamp': si.stream_time.epoch_ticks,
                        'request': None,  ## not part of this data set
                        'response': {
                            'headers': [
                                ['Content-Type', 'text/html'],
                            ],
                            'body': si.body.clean_html,
                            ## alternatively, could use si.body.raw and
                            ## si.body.media_type for the Content-Type
                            ## header, but that would cause the Serif NER
                            ## to be useless to teams...
                        },
                        'imported': None,
                    }
                    yield item

                    #print cbor.dumps(rec)

                    ## do something with the data
                    logger.info(
                        '%d bytes of html, or %d bytes of tag-stripped clean_visible, '
                        'and %d sentences with %d tokens' % (
                            len(si.body.clean_html),
                            len(si.body.clean_visible),
                            len(si.body.sentences['serif']),
                            len(list(chain(*map(attrgetter('tokens'),
                                                si.body.sentences['serif'])))),
                        ))

                break  # break out of retry loop
            except Exception:
                logger.critical(traceback.format_exc())
                logger.critical('retrying %d of %d times to fetch and access: %s'
                                % (retries, max_retries, url))
                time.sleep(1)

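## A minimal usage sketch for cca_items(). Any object exposing the four
## attributes read above will do; argparse.Namespace is used here for brevity.
## The attribute values are hypothetical placeholders, not taken from the
## original source.
import argparse

args = argparse.Namespace(
    s3_paths_fname='local-politics-streamcorpus-v0_3_0-s3-paths.txt.xz',
    date_hour=None,             # or e.g. '2012-02-15-21' to keep a single date-hour
    s3_path_prefix='trec/dd/',  # hypothetical prefix prepended to each listed path
    s3_http_host='http://s3.amazonaws.com/',  # hypothetical host
)

for item in cca_items(args):
    logger.info('%s %s', item['key'], item['url'])
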
def redownload_verify(self, o_path, md5):
    key = Key(get_bucket(self.config), o_path)
    contents = key.get_contents_as_string()
    errors, data = decrypt_and_uncompress(
        contents,  # pylint: disable=E1103
        self.config.get('gpg_decryption_key_path'),
        tmp_dir=self.config['tmp_dir_path'],
    )
    logger.info('got back SIs: %d' % len(list(Chunk(data=data))))
    return verify_md5(md5, data, other_errors=errors)

def get_chunk(self, key):
    tries = 0
    while 1:
        fh = StringIO()
        key.get_contents_to_file(fh)
        data = fh.getvalue()
        _errors, data = decrypt_and_uncompress(
            data,
            self.config.get('gpg_decryption_key_path'),  ## how should this get into the config...?
            tmp_dir=self.config['tmp_dir_path'],
        )
        logger.info('\n'.join(_errors))
        if self.config['input_format'] == 'streamitem' and \
                self.config['streamcorpus_version'] == 'v0_1_0':
            i_content_md5 = key.key.split('.')[-3]
        else:
            ## go past {sc,protostream}.xz.gpg
            parts = key.key.split('.')
            if parts[-1] == 'gpg':  # split('.') strips the dots
                parts.pop()
            try:
                i_content_md5 = parts[-4][-32:]
            except IndexError:
                # The regex hammer.
                m = re.search(r'([a-z0-9]{32})\.sc', key.key)
                i_content_md5 = m.group(1)

        ## verify the data matches expected md5
        f_content_md5 = hashlib.md5(data).hexdigest()  # pylint: disable=E1101
        if i_content_md5 != f_content_md5:
            msg = 'FAIL(%d): %s --> %s != %s' % (tries, key.key,
                                                 i_content_md5, f_content_md5)
            logger.critical(msg)
            tries += 1
            if tries > self.config['tries']:
                ## indicate complete failure to pipeline so it
                ## gets recorded in task_queue
                raise FailedExtraction(msg)
            else:
                continue

        if self.config['input_format'] == 'spinn3r':
            ## convert the data from spinn3r's protostream format
            return _generate_stream_items(data)
        elif self.config['input_format'] == 'streamitem':
            message = _message_versions[self.config['streamcorpus_version']]
            return streamcorpus.Chunk(data=data, message=message)
        else:
            raise ConfigurationError('from_s3_chunks input_format = %r'
                                     % self.config['input_format'])

def get_chunk(self, key):
    tries = 0
    while 1:
        fh = StringIO()
        key.get_contents_to_file(fh)
        data = fh.getvalue()
        _errors, data = decrypt_and_uncompress(
            data,
            self.config.get('gpg_decryption_key_path'),  ## how should this get into the config...?
            tmp_dir=self.config['tmp_dir_path'],
        )
        logger.info('\n'.join(_errors))
        if self.config['input_format'] == 'streamitem' and \
                self.config['streamcorpus_version'] == 'v0_1_0':
            i_content_md5 = key.key.split('.')[-3]
        else:
            ## go past {sc,protostream}.xz.gpg
            parts = key.key.split('.')
            if parts[-1] == 'gpg':  # split('.') strips the dots
                parts.pop()
            i_content_md5 = parts[-3][-32:]

        ## verify the data matches expected md5
        f_content_md5 = hashlib.md5(data).hexdigest()  # pylint: disable=E1101
        if i_content_md5 != f_content_md5:
            msg = 'FAIL(%d): %s --> %s != %s' % (
                tries, key.key, i_content_md5, f_content_md5)
            logger.critical(msg)
            tries += 1
            if tries > self.config['tries']:
                ## indicate complete failure to pipeline so it
                ## gets recorded in task_queue
                raise FailedExtraction(msg)
            else:
                continue

        if self.config['input_format'] == 'spinn3r':
            ## convert the data from spinn3r's protostream format
            return _generate_stream_items(data)
        elif self.config['input_format'] == 'streamitem':
            message = _message_versions[
                self.config['streamcorpus_version']]
            return streamcorpus.Chunk(data=data, message=message)
        else:
            sys.exit('Invalid config: input_format = %r'
                     % self.config['input_format'])

def verify(self, o_path, md5):
    if self.config.get('is_private', False):
        return self.redownload_verify(o_path, md5)
    url = 'http://s3.amazonaws.com/%(bucket)s/%(o_path)s' % dict(
        bucket=self.config['bucket'],
        o_path=o_path)
    logger.info('fetching %r' % url)
    req = requests.get(url)
    errors, data = decrypt_and_uncompress(
        req.content,  # pylint: disable=E1103
        self.config.get('gpg_decryption_key_path'),
        tmp_dir=self.config['tmp_dir_path'],
    )
    logger.info('got back SIs: %d' % len(list(Chunk(data=data))))
    return verify_md5(md5, data, other_errors=errors)

def get_chunk(self, key):
    tries = 0
    while 1:
        fh = StringIO()
        key.get_contents_to_file(fh)
        data = fh.getvalue()
        _errors, data = decrypt_and_uncompress(
            data,
            self.config['gpg_decryption_key_path'],
            tmp_dir=self.config['tmp_dir_path'],
        )
        logger.info('\n'.join(_errors))
        if self.config['input_format'] == 'streamitem' and \
                self.config['streamcorpus_version'] == 'v0_1_0':
            i_content_md5 = key.key.split('.')[-3]
        else:
            ## go past {sc,protostream}.xz.gpg
            i_content_md5 = key.key.split('.')[-4][-32:]

        ## verify the data matches expected md5
        f_content_md5 = hashlib.md5(data).hexdigest()  # pylint: disable=E1101
        if i_content_md5 != f_content_md5:
            msg = 'FAIL(%d): %s --> %s != %s' % (tries, key.key,
                                                 i_content_md5, f_content_md5)
            logger.critical(msg)
            tries += 1
            if tries > self.config['tries']:
                ## indicate complete failure to pipeline so it
                ## gets recorded in task_queue
                raise FailedExtraction(msg)
            else:
                continue

        if self.config['input_format'] == 'spinn3r':
            ## convert the data from spinn3r's protostream format
            return _extract_spinn3r._generate_stream_items(data)
        elif self.config['input_format'] == 'streamitem':
            message = _message_versions[self.config['streamcorpus_version']]
            return streamcorpus.Chunk(data=data, message=message)
        else:
            sys.exit('Invalid config: input_format = %r'
                     % self.config['input_format'])

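## A minimal configuration sketch for the get_chunk()/verify() variants above.
## The key names are the ones the code reads; the values shown are hypothetical
## placeholders, not part of the original source.
example_config = {
    'bucket': 'example-bucket',                     # hypothetical S3 bucket name
    'gpg_decryption_key_path': '/path/to/key.gpg',  # hypothetical GPG key location
    'tmp_dir_path': '/tmp',                         # scratch space for decrypt_and_uncompress
    'input_format': 'streamitem',                   # or 'spinn3r'
    'streamcorpus_version': 'v0_3_0',               # selects the entry in _message_versions
    'tries': 10,                                    # md5 mismatches tolerated before FailedExtraction
    'is_private': False,                            # verify() falls back to redownload_verify() when True
}
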
def test_kvlayer_index_with_source(configurator, test_data_dir):
    overlay = {
        'streamcorpus_pipeline': {
            'to_kvlayer': {
                'indexes': ['with_source'],
            },
        },
    }
    with chunks(configurator, test_data_dir, overlay) as (path, client):
        # We should not have written the doc_id_epoch_ticks index at all
        for k, v in client.scan('stream_items_doc_id_epoch_ticks'):
            assert False, 'epoch_ticks present! k={!r}'.format(k)
        # Every item in the ...with_source index should match a real item
        for k, v in client.scan('stream_items_with_source'):
            assert v == 'WEBLOG'  # by inspection
            for kk, sixz in client.get('stream_items', k):
                errs, sibytes = streamcorpus.decrypt_and_uncompress(sixz)
                assert errs == []
                for si in streamcorpus.Chunk(data=sibytes):
                    assert si.source == v

def verify(self, o_path, md5):
    url = 'http://s3.amazonaws.com/%(bucket)s/%(o_path)s' % dict(
        bucket=self.config['bucket'],
        o_path=o_path)
    logger.info('fetching %r' % url)
    req = requests.get(url)
    errors, data = decrypt_and_uncompress(
        req.content,  # pylint: disable=E1103
        self.config.get('gpg_decryption_key_path'),
        tmp_dir=self.config['tmp_dir_path'],
    )
    logger.info('got back SIs: %d' % len(list(Chunk(data=data))))
    rec_md5 = hashlib.md5(data).hexdigest()  # pylint: disable=E1101
    if md5 == rec_md5:
        return
    else:
        logger.critical('\n'.join(errors))
        raise Exception('original md5 = %r != %r = received md5'
                        % (md5, rec_md5))

def __call__(self, i_str):
    if i_str:
        epoch_ticks_1, doc_id_1, epoch_ticks_2, doc_id_2 = i_str.split(',')
        epoch_ticks_1 = uuid.UUID(int=int(epoch_ticks_1))
        epoch_ticks_2 = uuid.UUID(int=int(epoch_ticks_2))
        if doc_id_1:
            assert doc_id_2, (doc_id_1, doc_id_2)
            doc_id_1 = uuid.UUID(hex=doc_id_1)
            doc_id_2 = uuid.UUID(hex=doc_id_2)
            key1 = (epoch_ticks_1, doc_id_1)
            key2 = (epoch_ticks_2, doc_id_2)
        else:
            key1 = (epoch_ticks_1, )
            key2 = (epoch_ticks_2, )
        key_ranges = [(key1, key2)]
    else:
        key_ranges = []

    for key, data in self.client.scan('stream_items', *key_ranges):
        errors, data = streamcorpus.decrypt_and_uncompress(data)
        yield streamcorpus.deserialize(data)

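## A minimal sketch of building the i_str range string parsed above: four
## comma-separated fields giving epoch_ticks and doc_id for the start and end
## of the scan. `reader` stands in for an instance of the class that owns this
## __call__; the concrete values are hypothetical placeholders.
start_ticks, end_ticks = 1327100000, 1327200000   # decimal epoch_ticks
start_doc_id, end_doc_id = '0' * 32, 'f' * 32     # 32 hex digits each
i_str = '%d,%s,%d,%s' % (start_ticks, start_doc_id, end_ticks, end_doc_id)

for si in reader(i_str):   # an empty i_str scans all of stream_items
    logger.info(si.stream_id)
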
def get_kvlayer_stream_item(client, stream_id):
    '''Retrieve a :class:`streamcorpus.StreamItem` from :mod:`kvlayer`.

    This function requires that `client` already be set up properly::

        client = kvlayer.client()
        client.setup_namespace({'stream_items': 2})
        si = get_kvlayer_stream_item(client, stream_id)

    `stream_id` is in the form of
    :data:`streamcorpus.StreamItem.stream_id` and contains the
    ``epoch_ticks``, a hyphen, and the ``doc_id``.

    :param client: kvlayer client object
    :type client: :class:`kvlayer.AbstractStorage`
    :param str stream_id: stream Id to retrieve
    :return: corresponding :class:`streamcorpus.StreamItem`
    :raise exceptions.KeyError: if `stream_id` is malformed or does
      not correspond to anything in the database

    '''
    # Reminder: stream_id is 1234567890-123456789abcdef...0
    # where the first part is the (decimal) epoch_ticks and the second
    # part is the (hex) doc_id
    parts = stream_id.split('-')
    if len(parts) != 2:
        raise KeyError('invalid stream_id ' + stream_id)
    timestr = parts[0]
    dochex = parts[1]
    if not timestr.isdigit():
        raise KeyError('invalid stream_id ' + stream_id)
    if dochex.lstrip(string.hexdigits) != '':
        raise KeyError('invalid stream_id ' + stream_id)
    key = (uuid.UUID(int=int(timestr)), uuid.UUID(hex=dochex))
    for k, v in client.get('stream_items', key):
        if v is not None:
            errors, bytestr = streamcorpus.decrypt_and_uncompress(v)
            return streamcorpus.deserialize(bytestr)
    raise KeyError(stream_id)

s3_paths_fname = 'local-politics-streamcorpus-v0_3_0-s3-paths.txt.xz'
if not os.path.exists(s3_paths_fname):
    sys.exit('please download %strec/dd/%s' % (s3_http_host, s3_paths_fname))

for path in lzma.open(s3_paths_fname):
    s3_path = s3_path_prefix + path.strip()
    url = s3_http_host + s3_path
    logger.info(url)
    retries = 0
    max_retries = 10
    while retries < max_retries:
        retries += 1
        sys.stderr.flush()
        try:
            resp = requests.get(url)
            errors, data = decrypt_and_uncompress(resp.content, gpg_private='trec-kba-rsa')
            logger.info('\n'.join(errors))
            for si in Chunk(file_obj=StringIO(data)):

                rec = {
                    'url': si.abs_url,
                    'timestamp': si.stream_time.epoch_ticks,
                    'request': None,  ## not part of this data set
                    'response': {
                        'headers': [
                            ['Content-Type', 'text/html'],
                        ],
                        'body': si.body.clean_html,
                        ## alternatively, could use si.body.raw and
                        ## si.body.media_type for the Content-Type
                        ## header, but that would cause the Serif NER