def cca_items(args):
    '''This generator takes an s3_paths_fname file, fetches the data,
    constructs a CCA record, and yields it.

    '''
    for path in lzma.open(args.s3_paths_fname):
        if args.date_hour is not None:
            if not path.startswith(args.date_hour):
                continue                
        s3_path = args.s3_path_prefix + path.strip()
        url = args.s3_http_host + s3_path
        logger.info( url )
        retries = 0
        max_retries = 10
        while retries < max_retries:
            retries += 1
            sys.stderr.flush()
            try:
                resp = requests.get(url)
                errors, data = decrypt_and_uncompress(resp.content, gpg_private='trec-kba-rsa')
                logger.info( '\n'.join(errors) )
                for si in Chunk(file_obj=StringIO(data)):

                    item = {
                        'key': si.stream_id,
                        'url': si.abs_url,
                        'timestamp': si.stream_time.epoch_ticks,
                        'request': None,  ## not part of this data set
                        'response': {
                            'headers': [
                                ['Content-Type', 'text/html'],
                            ],
                            'body': si.body.clean_html,
                            ## alternatively, could use si.body.raw and
                            ## si.body.media_type for the Content-Type
                            ## header, but that would cause the Serif NER
                            ## to be useless to teams...
                        },
                        'imported': None,
                    }
                    yield item

                    ## print cbor.dumps(item)

                    ## do something with the data
                    logger.info(
                        '%d bytes of html, or %d bytes of tag-stripped clean_visible, and %d sentences with %d tokens' % (
                        len(si.body.clean_html), len(si.body.clean_visible), 
                        len(si.body.sentences['serif']),
                        len(list(chain(*map(attrgetter('tokens'), si.body.sentences['serif'])))),
                        ))
                break # break out of retry loop
            except Exception:
                logger.critical(traceback.format_exc())
                logger.critical( 'retrying %d of %d times to fetch and access: %s' % (retries, max_retries, url) )
                time.sleep(1)
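## A minimal driver sketch, not part of the original snippet: it builds the
## argparse-style namespace that cca_items() reads (s3_paths_fname, date_hour,
## s3_path_prefix, s3_http_host) and simply drains the generator, counting the
## yielded CCA records. The argument names mirror the attributes used above;
## nothing else is assumed.
import argparse

def _demo_cca_items():
    parser = argparse.ArgumentParser()
    parser.add_argument('s3_paths_fname')
    parser.add_argument('s3_http_host')
    parser.add_argument('--s3-path-prefix', default='')
    parser.add_argument('--date-hour', default=None)
    args = parser.parse_args()
    count = 0
    for _item in cca_items(args):
        count += 1
    logger.info('yielded %d CCA records' % count)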
    def redownload_verify(self, o_path, md5):
        key = Key(get_bucket(self.config), o_path)
        contents = key.get_contents_as_string()
        errors, data = decrypt_and_uncompress(
            contents, # pylint: disable=E1103
            self.config.get('gpg_decryption_key_path'),
            tmp_dir=self.config['tmp_dir_path'],
            )

        logger.info('got back SIs: %d' % len(list(Chunk(data=data))))
        return verify_md5(md5, data, other_errors=errors)
    def get_chunk(self, key):
        tries = 0
        while 1:
            fh = StringIO()
            key.get_contents_to_file(fh)
            data = fh.getvalue()
            _errors, data = decrypt_and_uncompress(
                data, 
                self.config.get('gpg_decryption_key_path'),
                ## how should this get into the config...?
                tmp_dir=self.config['tmp_dir_path'],
                )
            logger.info( '\n'.join(_errors) )
            if self.config['input_format'] == 'streamitem' and \
                    self.config['streamcorpus_version'] == 'v0_1_0':
                i_content_md5 = key.key.split('.')[-3]
            else:
                ## go past {sc,protostream}.xz.gpg
                parts = key.key.split('.')
                if parts[-1] == 'gpg':
                    parts.pop()
                try:
                    ## last 32 hex chars of the chunk's base name
                    i_content_md5 = parts[-3][-32:]
                except IndexError:
                    # The regex hammer.
                    m = re.search(r'([a-z0-9]{32})\.sc', key.key)
                    i_content_md5 = m.group(1)

            ## verify the data matches expected md5
            f_content_md5 = hashlib.md5(data).hexdigest() # pylint: disable=E1101
            if i_content_md5 != f_content_md5:
                msg = 'FAIL(%d): %s --> %s != %s' % (tries, key.key, i_content_md5, f_content_md5)
                logger.critical(msg)
                tries += 1
                if tries > self.config['tries']:
                    ## indicate complete failure to pipeline so it
                    ## gets recorded in task_queue
                    raise FailedExtraction(msg)
                else:
                    continue

            if self.config['input_format'] == 'spinn3r':
                ## convert the data from spinn3r's protostream format
                return _generate_stream_items( data )

            elif self.config['input_format'] == 'streamitem':
                message = _message_versions[ self.config['streamcorpus_version'] ]

                return streamcorpus.Chunk(data=data, message=message)

            else:
                raise ConfigurationError('from_s3_chunks input_format = %r' %
                                         self.config['input_format'])
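## An illustration, not from the original source, of how the md5 is recovered
## from an S3 key name of the usual <prefix>-<md5>.sc.xz.gpg shape; the key
## name below is hypothetical.
def _demo_md5_from_key_name():
    key_name = 'social-100-d41d8cd98f00b204e9800998ecf8427e.sc.xz.gpg'
    parts = key_name.split('.')
    if parts[-1] == 'gpg':
        parts.pop()                   # drop the encryption suffix
    i_content_md5 = parts[-3][-32:]   # last 32 hex chars of the base name
    assert i_content_md5 == 'd41d8cd98f00b204e9800998ecf8427e'
    return i_content_md5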
    def get_chunk(self, key):
        tries = 0
        while 1:
            fh = StringIO()
            key.get_contents_to_file(fh)
            data = fh.getvalue()
            _errors, data = decrypt_and_uncompress(
                data,
                self.config.get('gpg_decryption_key_path'),
                ## how should this get into the config...?
                tmp_dir=self.config['tmp_dir_path'],
            )
            logger.info('\n'.join(_errors))
            if self.config['input_format'] == 'streamitem' and \
                    self.config['streamcorpus_version'] == 'v0_1_0':
                i_content_md5 = key.key.split('.')[-3]
            else:
                ## go past {sc,protostream}.xz.gpg
                parts = key.key.split('.')
                if parts[-1] == 'gpg':
                    parts.pop()
                i_content_md5 = parts[-3][-32:]

            ## verify the data matches expected md5
            f_content_md5 = hashlib.md5(data).hexdigest()  # pylint: disable=E1101
            if i_content_md5 != f_content_md5:
                msg = 'FAIL(%d): %s --> %s != %s' % (
                    tries, key.key, i_content_md5, f_content_md5)
                logger.critical(msg)
                tries += 1
                if tries > self.config['tries']:
                    ## indicate complete failure to pipeline so it
                    ## gets recorded in task_queue
                    raise FailedExtraction(msg)
                else:
                    continue

            if self.config['input_format'] == 'spinn3r':
                ## convert the data from spinn3r's protostream format
                return _generate_stream_items(data)

            elif self.config['input_format'] == 'streamitem':
                message = _message_versions[
                    self.config['streamcorpus_version']]

                return streamcorpus.Chunk(data=data, message=message)

            else:
                sys.exit('Invalid config: input_format = %r' %
                         self.config['input_format'])
    def verify(self, o_path, md5):
        if self.config.get('is_private', False):
            return self.redownload_verify(o_path, md5)

        url = 'http://s3.amazonaws.com/%(bucket)s/%(o_path)s' % dict(
            bucket = self.config['bucket'],
            o_path = o_path)
        logger.info('fetching %r' % url)
        req = requests.get(url)
        errors, data = decrypt_and_uncompress(
            req.content, # pylint: disable=E1103
            self.config.get('gpg_decryption_key_path'),
            tmp_dir=self.config['tmp_dir_path'],
            )

        logger.info('got back SIs: %d' % len(list(Chunk(data=data))))
        return verify_md5(md5, data, other_errors=errors)
    def get_chunk(self, key):
        tries = 0
        while 1:
            fh = StringIO()
            key.get_contents_to_file(fh)
            data = fh.getvalue()
            _errors, data = decrypt_and_uncompress(
                data, 
                self.config['gpg_decryption_key_path'],
                tmp_dir=self.config['tmp_dir_path'],
                )
            logger.info( '\n'.join(_errors) )
            if self.config['input_format'] == 'streamitem' and \
                    self.config['streamcorpus_version'] == 'v0_1_0':
                i_content_md5 = key.key.split('.')[-3]
            else:
                ## go past {sc,protostream}.xz.gpg
                i_content_md5 = key.key.split('.')[-4][-32:]

            ## verify the data matches expected md5
            f_content_md5 = hashlib.md5(data).hexdigest() # pylint: disable=E1101
            if i_content_md5 != f_content_md5:
                msg = 'FAIL(%d): %s --> %s != %s' % (tries, key.key, i_content_md5, f_content_md5)
                logger.critical(msg)
                tries += 1
                if tries > self.config['tries']:
                    ## indicate complete failure to pipeline so it
                    ## gets recorded in task_queue
                    raise FailedExtraction(msg)
                else:
                    continue

            if self.config['input_format'] == 'spinn3r':
                ## convert the data from spinn3r's protostream format
                return _extract_spinn3r._generate_stream_items( data )

            elif self.config['input_format'] == 'streamitem':
                message = _message_versions[ self.config['streamcorpus_version'] ]

                return streamcorpus.Chunk(data=data, message=message)

            else:
                sys.exit('Invalid config: input_format = %r' % self.config['input_format'])
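## A hedged sketch, not from the original source, of the config dict shape the
## from_s3_chunks readers above index into; every value is a placeholder and
## only keys referenced by the code above are listed.
_example_from_s3_chunks_config = {
    'bucket': 'my-bucket',                       # placeholder bucket name
    'gpg_decryption_key_path': '/path/to/key',   # placeholder key path
    'tmp_dir_path': '/tmp',
    'input_format': 'streamitem',                # or 'spinn3r'
    'streamcorpus_version': 'v0_3_0',
    'tries': 10,
    'is_private': False,
}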
def test_kvlayer_index_with_source(configurator, test_data_dir):
    overlay = {
        'streamcorpus_pipeline': {
            'to_kvlayer': {
                'indexes': [ 'with_source' ],
            },
        },
    }
    with chunks(configurator, test_data_dir, overlay) as (path, client):
        # We should not have written the doc_id_epoch_ticks index at all
        for k,v in client.scan('stream_items_doc_id_epoch_ticks'):
            assert False, 'epoch_ticks present! k={!r}'.format(k)
        # Every item in the ...with_source index should match a real item
        for k,v in client.scan('stream_items_with_source'):
            assert v == 'WEBLOG' # by inspection
            for kk,sixz in client.get('stream_items', k):
                errs,sibytes = streamcorpus.decrypt_and_uncompress(sixz)
                assert errs == []
                for si in streamcorpus.Chunk(data=sibytes):
                    assert si.source == v
    def verify(self, o_path, md5):
        url = 'http://s3.amazonaws.com/%(bucket)s/%(o_path)s' % dict(
            bucket=self.config['bucket'], o_path=o_path)
        logger.info('fetching %r' % url)
        req = requests.get(url)
        errors, data = decrypt_and_uncompress(
            req.content,  # pylint: disable=E1103
            self.config.get('gpg_decryption_key_path'),
            tmp_dir=self.config['tmp_dir_path'],
        )

        logger.info('got back SIs: %d' % len(list(Chunk(data=data))))

        rec_md5 = hashlib.md5(data).hexdigest()  # pylint: disable=E1101
        if md5 == rec_md5:
            return
        else:
            logger.critical('\n'.join(errors))
            raise Exception('original md5 = %r != %r = received md5' %
                            (md5, rec_md5))
    def __call__(self, i_str):
        if i_str:
            epoch_ticks_1, doc_id_1, epoch_ticks_2, doc_id_2 = i_str.split(',')
            epoch_ticks_1 = uuid.UUID(int=int(epoch_ticks_1))
            epoch_ticks_2 = uuid.UUID(int=int(epoch_ticks_2))
            if doc_id_1:
                assert doc_id_2, (doc_id_1, doc_id_2)
                doc_id_1 = uuid.UUID(hex=doc_id_1)
                doc_id_2 = uuid.UUID(hex=doc_id_2)
                key1 = (epoch_ticks_1, doc_id_1)
                key2 = (epoch_ticks_2, doc_id_2)
            else:
                key1 = (epoch_ticks_1, )
                key2 = (epoch_ticks_2, )
            key_ranges = [(key1, key2)]
        else:
            key_ranges = []

        for key, data in self.client.scan( 'stream_items', *key_ranges ):
            errors, data = streamcorpus.decrypt_and_uncompress(data)
            yield streamcorpus.deserialize(data)
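## A hypothetical illustration of the i_str format consumed above: two
## (epoch_ticks, doc_id) endpoints joined by commas, with 32-hex-digit
## placeholder doc_ids; an empty i_str scans the whole stream_items table.
import uuid

def _demo_i_str_key_range():
    i_str = '1326393600,d41d8cd98f00b204e9800998ecf8427e,' \
            '1326480000,c4ca4238a0b923820dcc509a6f75849b'
    t1, d1, t2, d2 = i_str.split(',')
    key1 = (uuid.UUID(int=int(t1)), uuid.UUID(hex=d1))
    key2 = (uuid.UUID(int=int(t2)), uuid.UUID(hex=d2))
    return [(key1, key2)]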
def get_kvlayer_stream_item(client, stream_id):
    '''Retrieve a :class:`streamcorpus.StreamItem` from :mod:`kvlayer`.

    This function requires that `client` already be set up properly::

        client = kvlayer.client()
        client.setup_namespace({'stream_items': 2})
        si = get_kvlayer_stream_item(client, stream_id)

    `stream_id` is in the form of
    :data:`streamcorpus.StreamItem.stream_id` and contains the
    ``epoch_ticks``, a hyphen, and the ``doc_id``.

    :param client: kvlayer client object
    :type client: :class:`kvlayer.AbstractStorage`
    :param str stream_id: stream Id to retrieve
    :return: corresponding :class:`streamcorpus.StreamItem`
    :raise exceptions.KeyError: if `stream_id` is malformed or does
      not correspond to anything in the database

    '''
    # Reminder: stream_id is 1234567890-123456789abcdef...0
    # where the first part is the (decimal) epoch_ticks and the second
    # part is the (hex) doc_id
    parts = stream_id.split('-')
    if len(parts) != 2:
        raise KeyError('invalid stream_id ' + stream_id)
    timestr = parts[0]
    dochex = parts[1]
    if not timestr.isdigit():
        raise KeyError('invalid stream_id ' + stream_id)
    if dochex.lstrip(string.hexdigits) != '':
        raise KeyError('invalid stream_id ' + stream_id)

    key = (uuid.UUID(int=int(timestr)), uuid.UUID(hex=dochex))
    for k,v in client.get('stream_items', key):
        if v is not None:
            errors, bytestr = streamcorpus.decrypt_and_uncompress(v)
            return streamcorpus.deserialize(bytestr)
    raise KeyError(stream_id)
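## A small, self-contained illustration (not from the original source) of how
## a stream_id maps onto the two-part kvlayer key built above; the stream_id
## value is hypothetical.
import uuid

def _demo_stream_id_to_key():
    stream_id = '1326393600-d41d8cd98f00b204e9800998ecf8427e'
    timestr, dochex = stream_id.split('-')
    key = (uuid.UUID(int=int(timestr)), uuid.UUID(hex=dochex))
    assert key[0].int == 1326393600
    assert key[1].hex == dochex
    return key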
s3_paths_fname = 'local-politics-streamcorpus-v0_3_0-s3-paths.txt.xz'
if not os.path.exists(s3_paths_fname):
    sys.exit('please download %strec/dd/%s' % (s3_http_host, s3_paths_fname))

for path in lzma.open(s3_paths_fname):
    s3_path = s3_path_prefix + path.strip()
    url = s3_http_host + s3_path
    logger.info(url)
    retries = 0
    max_retries = 10
    while retries < max_retries:
        retries += 1
        sys.stderr.flush()
        try:
            resp = requests.get(url)
            errors, data = decrypt_and_uncompress(resp.content,
                                                  gpg_private='trec-kba-rsa')
            logger.info('\n'.join(errors))
            for si in Chunk(file_obj=StringIO(data)):

                rec = {
                    'url': si.abs_url,
                    'timestamp': si.stream_time.epoch_ticks,
                    'request': None,  ## not part of this data set
                    'response': {
                        'headers': [
                            ['Content-Type', 'text/html'],
                        ],
                        'body': si.body.clean_html,
                        ## alternatively, could use si.body.raw and
                        ## si.body.media_type for the Content-Type
                        ## header, but that would cause the Serif NER