def process_path(self, chunk_path):
        ## make temporary file paths based on chunk_path
        clean_visible_path = chunk_path + '-clean_visible.xml'
        ner_xml_path = chunk_path + '-ner.xml'

        ## process the chunk's clean_visible data into xml
        i_chunk = Chunk(path=chunk_path, mode='rb')
        make_clean_visible_file(i_chunk, clean_visible_path)

        ## make sure we are not holding anything that consumes memory
        i_chunk = None

        ## generate an output file from the tagger
        self.make_ner_file(clean_visible_path, ner_xml_path)

        ## make a new output chunk at a temporary path
        tmp_chunk_path = chunk_path + '_'
        o_chunk = Chunk(path=tmp_chunk_path, mode='wb')

        ## re-open i_chunk
        i_chunk = Chunk(path=chunk_path, mode='rb')

        ## fuse the output file with i_chunk to make o_chunk
        self.align_chunk_with_ner(ner_xml_path, i_chunk, o_chunk)

        ## clean up temp files
        if self.config['cleanup_tmp_files']:
            os.remove(clean_visible_path)
            os.remove(ner_xml_path)

        ## atomic rename new chunk file into place
        os.rename(tmp_chunk_path, chunk_path)
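
## A minimal driver sketch (not from the original source): assuming `stage`
## is an already-configured instance exposing process_path, apply the batch
## transform to every streamcorpus.Chunk file in a directory, in place.
import os

def process_directory(stage, input_dir):
    for fname in os.listdir(input_dir):
        if not fname.endswith('.sc'):
            ## ignore non-chunk files, as run_pipeline does further below
            continue
        stage.process_path(os.path.join(input_dir, fname))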
def make_hyperlink_labeled_test_chunk():
    '''
    returns a path to a temporary chunk that has been hyperlink labeled
    '''
    tpath = os.path.join('/tmp', str(uuid.uuid1()) + '.sc')
    o_chunk = Chunk(tpath, mode='wb')

    dpath = os.path.dirname(__file__)
    ipath = os.path.join( dpath, _TEST_DATA_ROOT, 'test/WEBLOG-100-fd5f05c8a680faa2bf8c55413e949bbf.sc' )

    cv = _init_stage('clean_visible', {})
    hl = hyperlink_labels(
        {'require_abs_url': True, 
         'all_domains': True,
         'offset_types': ['BYTES']}
        )
    for si in Chunk(path=ipath):
        ## clear out existing labels and tokens
        si.body.labels = {}
        si.body.sentences = {}
        context = {}
        hl(si, context)
        cv(si, context)
        o_chunk.add(si)

    o_chunk.close()
    return tpath
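
## A hedged read-back sketch (not part of the original helper): count how
## many StreamItems in the labeled chunk received body labels, assuming the
## hyperlink_labels stage records its output under si.body.labels.
def count_labeled_items(tpath):
    num_labeled = 0
    for si in Chunk(path=tpath, mode='rb'):
        if getattr(si.body, 'labels', None):
            num_labeled += 1
    return num_labeled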
def test_get_name_info(tmpdir):

    path = str(tmpdir.join('test_path'))
    c = Chunk(path, mode='wb')
    c.add(make_stream_item(28491, 'abs_url'))
    c.close()

    name_info = get_name_info(path, i_str='foo')
    assert name_info['date_now'] == name_info['date_time_now'][:10]
    assert name_info['date_now'] + '-' + name_info['time_now'] == name_info['date_time_now']
def test_matcher():

    config = dict(
        ## command to run
        fpat_path="cat"
    )

    fm = fpat_matcher(config)

    si1 = make_stream_item(None, "http://example.com")
    si1.body = ContentItem(clean_visible="hello! This is a test of matching Bob.")

    si2 = make_stream_item(None, "http://example.com")
    si2.body = ContentItem(clean_visible="hello! This is a test of matching Sally.")

    chunk_path = "/tmp/%s" % uuid.uuid1()

    ch = Chunk(chunk_path, mode="wb")
    ch.add(si1)
    ch.add(si1)
    ch.add(si2)
    ch.close()

    fm(chunk_path)

    ch = Chunk(chunk_path, mode="rb")

    SIs = list(ch)

    ## verify the si has expected things
    for si in SIs:
        assert len(si.body.labels) == 1

    for i in range(2):
        print SIs[i].ratings
def run_pipeline(tagger_id, input_dir, output_dir, tmp_dir='/tmp', pipeline_root='./'):
    '''
    run the tagger_id pipeline over every chunk file in input_dir and
    write the resulting chunk files into output_dir
    '''
    ## make tmp_dir unique to this run
    tmp_dir = os.path.join(tmp_dir, '%s-%s' % (uuid.uuid1(), os.getpid()))
    assert not os.path.exists(tmp_dir), tmp_dir
    os.makedirs(tmp_dir)
    
    for fname in os.listdir(input_dir):
        if not fname.endswith('.sc'):
            ## ignore any non streamcorpus.Chunk files
            continue

        fpath = os.path.join(input_dir, fname)

        ## just need one chunk for this tiny corpus
        i_chunk = Chunk(file_obj=open(fpath))

        ## prepare to make intermediate files in tmp_dir
        tmp_cleansed_path = os.path.join(tmp_dir, fname + '.cleansed.xml')
        tmp_ner_path      = os.path.join(tmp_dir, fname + '.ner.xml')
        tmp_done_path     = os.path.join(output_dir, fname + '.done.partial')
        final_done_path   = os.path.join(output_dir, fname + '.done.sc')

        make_cleansed_file(i_chunk, tmp_cleansed_path)

        make_ner_file(tagger_id, tmp_cleansed_path, tmp_ner_path, pipeline_root)

        align_chunk_with_ner(tmp_ner_path, i_chunk, tmp_done_path)

        ## atomic rename when done
        os.rename(tmp_done_path, final_done_path)

        ## replace with log.info()
        print 'done with %s' % final_done_path
def test_tokenizer(test_data_dir):
    path = os.path.join(test_data_dir, 'test', 'wlc-chunk-with-labels.sc')
    num = 0
    for si in Chunk(path):
        num += 1
        ## there is only one StreamItem in this chunk
        sentences = si.body.sentences.pop('nltk_tokenizer')
        t = nltk_tokenizer(config={'annotator_id': 'author'})
        t.process_item(si)

        assert num > 0

        ## if something changes, then need to save new test data
        #open(path, 'wb').write(serialize(si))
        #return
        if 1:

            #assert si.body.sentences['nltk_tokenizer'] == sentences
            num = 0
            for i in range(len(si.body.sentences['nltk_tokenizer'])):
                for j in range(
                        len(si.body.sentences['nltk_tokenizer'][i].tokens)):
                    tok_t = si.body.sentences['nltk_tokenizer'][i].tokens[j]
                    for attr in dir(tok_t):
                        if attr.startswith('__'):
                            continue
                        ## printing for diagnostics when things change
                        #print 'checking ', attr
                        assert getattr(tok_t, attr) == getattr(
                            sentences[i].tokens[j], attr)
                        num += 1

            assert num > 0
 def cids_and_fcs():
     count = 0
     seen = set()
     for si in Chunk(t_path):
         clean_html = getattr(si.body, 'clean_html', '')
         if clean_html is None or len(clean_html.strip()) == 0:
             logger.warn('dropping SI lacking clean_html: %r',
                         si.abs_url)
             continue
         if 'other_features' in si.other_content:
             other_features = json.loads(
                 si.other_content['other_features'].raw)
         else:
             other_features = None
         fc = html_to_fc(
             clean_html=si.body.clean_html.decode('utf-8'),
             clean_visible=si.body.clean_visible.decode('utf-8'),
             encoding='utf-8',
             url=si.abs_url,
             timestamp=si.stream_time.epoch_ticks,
             other_features=other_features,
         )
         add_sip_to_fc(fc, self.tfidf)
         content_id = mk_content_id(str(fc.get(u'meta_url')))
         if content_id in seen:
             logger.warn('dropping duplicate content_id=%r', content_id)
         else:
             seen.add(content_id)
             yield content_id, fc
             count += 1
     logger.info('saved %d FCs from %d SIs', count, len(seen))
def make_hyperlink_labeled_test_chunk(tmpdir):
    '''
    returns a path to a temporary chunk that has been hyperlink labeled
    '''
    tpath = tmpdir.join(str(uuid.uuid1()) + '.sc')
    o_chunk = Chunk(tpath, mode='wb')

    ipath = get_test_chunk_path()

    hl = hyperlink_labels(config={
        'require_abs_url': True,
        'all_domains': True,
        'offset_types': [BYTES],
    })
    cv = make_clean_visible(config={})
    for si in Chunk(path=ipath, message=streamcorpus.StreamItem_v0_2_0):
        ## clear out existing labels and tokens
        si.body.labels = {}
        si.body.sentences = {}
        context = {}
        hl(si, context)
        cv(si, context)
        o_chunk.add(si)

    o_chunk.close()
    return tpath
def _aligner_core(t_path1, aligner, aligner_data):
    t_chunk1 = Chunk(t_path1, mode='rb')
    t_path2 = t_path1 + '-tmp-aligning'
    t_chunk2 = Chunk(t_path2, mode='wb')
    for si in t_chunk1:
        aligner(si, aligner_data)
        t_chunk2.add(si)
    t_chunk1.close()
    t_chunk2.close()

    if aligner_data.get('cleanup_tmp_files', True):
        logger.info('atomic rename: %r --> %r', t_path2, t_path1)
        os.rename(t_path2, t_path1)
        logger.debug('done renaming')
    else:
        # for development, leave intermediate tmp file
        shutil.copy(t_path2, t_path1)
        logger.info('copied %r -> %r', t_path2, t_path1)
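
## A minimal sketch of an aligner callable matching the signature used
## above; the callable is illustrative, not from the original source.  An
## aligner receives each StreamItem plus the aligner_data dict and mutates
## the StreamItem in place; this one only logs and leaves items unchanged.
def noop_aligner(si, aligner_data):
    logger.debug('aligning %s using keys %r',
                 si.stream_id, sorted(aligner_data.keys()))

## e.g. rewrite a chunk in place, keeping the atomic-rename behaviour:
#_aligner_core('/tmp/example.sc', noop_aligner, {'cleanup_tmp_files': True})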
def cca_items(args):
    '''This generator takes an s3_paths_fname file, fetches the data,
    constructs a CCA record, and yields it.

    '''
    for path in lzma.open(args.s3_paths_fname):
        if args.date_hour is not None:
            if not path.startswith(args.date_hour):
                continue                
        s3_path = args.s3_path_prefix + path.strip()
        url = args.s3_http_host + s3_path
        logger.info( url )
        retries = 0
        max_retries = 10
        while retries < max_retries:
            retries += 1
            sys.stderr.flush()
            try:
                resp = requests.get(url)
                errors, data = decrypt_and_uncompress(resp.content, gpg_private='trec-kba-rsa')
                logger.info( '\n'.join(errors) )
                for si in Chunk(file_obj=StringIO(data)):

                    item = {
                        'key': si.stream_id,
                        'url': si.abs_url,
                        'timestamp': si.stream_time.epoch_ticks,
                        'request': None,  ## not part of this data set
                        'response': {
                            'headers': [
                                ['Content-Type', 'text/html'],
                            ],
                            'body': si.body.clean_html,
                            ## alternatively, could use si.body.raw and
                            ## si.body.media_type for the Content-Type
                            ## header, but that would cause the Serif NER
                            ## to be useless to teams...
                        },
                        'imported': None,
                    }
                    yield item

                    #print cbor.dumps(rec)

                    ## do something with the data
                    logger.info(
                        '%d bytes of html, or %d bytes of tag-stripped clean_visible, and %d sentences with %d tokens' % (
                        len(si.body.clean_html), len(si.body.clean_visible), 
                        len(si.body.sentences['serif']),
                        len(list(chain(*map(attrgetter('tokens'), si.body.sentences['serif'])))),
                        ))
                break # break out of retry loop
            except Exception, exc:
                logger.critical( traceback.format_exc(exc) )
                logger.critical( 'retrying %d of %d times to fetch and access: %s' % (retries, max_retries, url) )
                time.sleep(1)
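
## A hedged driver sketch for cca_items (not from the original source): the
## attribute names on `args` mirror what the generator reads above, and the
## output path is a placeholder chosen by the caller.
import cbor

def write_cca_records(args, out_path):
    with open(out_path, 'wb') as fh:
        for item in cca_items(args):
            ## one CBOR object per CCA record, mirroring the cbor.dump
            ## usage elsewhere in this collection
            cbor.dump(item, fh)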
def get_john_smith_tagged_by_lingpipe_without_labels_data():
    fh = StringIO()
    o_chunk = Chunk(file_obj=fh, mode='wb')

    path = get_john_smith_tagged_by_lingpipe_path()
    for si in Chunk(path):
        for sentence in si.body.sentences['lingpipe']:
            for token in sentence.tokens:
                for labels in token.labels.values():
                    for label in labels:
                        label.offsets.update(token.offsets)
                        for offset in label.offsets.values():
                            offset.value = token.token
                        add_annotation(si.body, label)
                token.labels = dict()
        o_chunk.add(si)

    o_chunk.flush()
    return fh.getvalue()
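
## A hedged round-trip sketch (not part of the original helper): parse the
## serialized bytes returned above back into StreamItems, using the same
## Chunk(file_obj=StringIO(...)) pattern seen elsewhere in this collection.
def load_stream_items(data):
    return list(Chunk(file_obj=StringIO(data)))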
    def __call__(self, chunk_path):
        '''
        batch-type transform stage: reads a chunk from chunk_path, and
        replaces it with a new chunk at the same path
        '''
        ## make a new output chunk at a temporary path
        tmp_chunk_path = chunk_path + '_'
        t_chunk = Chunk(path=tmp_chunk_path, mode='wb')

        for num, si in enumerate(Chunk(path=chunk_path)):
            if num < self.config['max_items']:
                t_chunk.add(si)
            else:
                break

        ## flush to disk
        t_chunk.close()

        ## atomic rename new chunk file into place
        os.rename(tmp_chunk_path, chunk_path)
def main():
    parser = argparse.ArgumentParser(
        'process streamcorpus.Chunk files to generate CBOR files'
        ' to load into memex_dossier.akagraph.'                                     
    )
    parser.add_argument('input_paths', nargs='+', 
                        help='paths to streamcorpus.Chunk files')
    parser.add_argument('--output-path', help='cbor file (or cbor.gz) to create')
    parser.add_argument('--xform', action='store_true', default=False,
                        help='run structured_features transform before page_extractors')
    parser.add_argument('--total', type=int, help='anticipated number of StreamItems')
    parser.add_argument('--limit', type=int, 
                        help='stop processing after this many StreamItems')
    args = parser.parse_args()

    xform = structured_features(structured_features.default_config)

    fopen = open
    if args.output_path.endswith('.gz'):
        fopen = gzip.open
    fh = fopen(args.output_path, 'wb')

    count = 0
    start = time.time()
    for path in args.input_paths:
        for si in Chunk(path):
            count += 1
            if count % 100 == 0:
                elapsed = time.time() - start
                rate = count / elapsed
                msg = '%d done in %.1f secs --> %.1f per sec' % (count, elapsed, rate)
                if args.total:
                    remaining = (args.total - count) / rate
                    msg += ' --> %.1f sec remaining' % remaining
                print(msg)
                sys.stdout.flush()
            if args.limit and count > args.limit:
                break
            #url_parts = urlparse(si.abs_url)
            if args.xform:
                si = xform(si)
            slots = profile_page(si)
            if slots:
                slots = cbor.loads(slots)
                better_slots = {}
                for key, values in slots['slots'].iteritems():
                    assert isinstance(values, list), values
                    better_slots[key.lower()] = [unicodedata.normalize('NFKC', v).lower()
                                                 for v in values]
                better_slots['url'] = si.abs_url
                cbor.dump(better_slots, fh)
    fh.close()
    print('done')
    def start(self):
        self.toFactoriePipeName = os.tmpnam()
        self.fromFactoriePipeName = os.tmpnam()
        os.mkfifo(self.toFactoriePipeName)
        os.mkfifo(self.fromFactoriePipeName)
        logger.debug('made fifos %r %r', self.toFactoriePipeName, self.fromFactoriePipeName)

        self.call_factorie(self.toFactoriePipeName, self.fromFactoriePipeName)

        self.pipeToFactorie = Chunk(path=self.toFactoriePipeName, mode='ab')
        self.pipeFromFactorie = Chunk(path=self.fromFactoriePipeName, mode='rb')
        self.taggedChunkIter = iter(self.pipeFromFactorie)
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('out_path')
    args = parser.parse_args()

    si = make_stream_item(1, 'http://crazydog.com')
    si.body.raw = '''
Flying dogs are amazing.
The flight of the super dog Sam Vroomvroom is often cited as the first such flying dog.
'''

    topic_name = 'The flight of the super dog Sam Vroomvroom'
    sel = Selector(
        selector_type=SelectorType.TOPIC.value,
        raw_selector=topic_name,
        canonical_selector=topic_name.lower(
        ),  # this is the key for making it appear for a profile of this title
        offsets={
            OffsetType.CHARS:
            Offset(
                type=OffsetType.CHARS,
                first=si.body.raw.find('The'),
                length=len(topic_name),
            )
        },
    )
    si.body.selectors['other'] = [sel]

    chunk = Chunk(args.out_path, mode='wb')
    chunk.add(si)
    chunk.close()
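
## A hedged read-back sketch (not from the original example): print the text
## span each selector covers, using only the fields set above
## (raw_selector and the CHARS offset).
def print_selector_spans(out_path):
    for si in Chunk(out_path, mode='rb'):
        for sel in (si.body.selectors or {}).get('other', []):
            off = sel.offsets[OffsetType.CHARS]
            print('%r covers %r' % (
                sel.raw_selector,
                si.body.raw[off.first:off.first + off.length]))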
def test_spinn3r_pipeline(filename, urls, pipeline_config, output_file):
    """minimal end-to-end test, with a fixed pipeline"""
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))
        work_unit = SimpleWorkUnit(filename)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)

        with Chunk(path=output_file, mode='rb') as chunk:
            assert [si.abs_url for si in chunk] == urls
class factorie(FactorieBase):
    '''
    incremental transform
    '''
    def __init__(self, config):
        super(factorie, self).__init__(config)

        self.toFactoriePipeName = None
        self.fromFactoriePipeName = None
        self.pipeToFactorie = None
        self.pipeFromFactorie = None
        self.taggedChunkIter = None

    def start(self):
        self.toFactoriePipeName = os.tmpnam()
        self.fromFactoriePipeName = os.tmpnam()
        os.mkfifo(self.toFactoriePipeName)
        os.mkfifo(self.fromFactoriePipeName)
        logger.debug('made fifos %r %r', self.toFactoriePipeName, self.fromFactoriePipeName)

        self.call_factorie(self.toFactoriePipeName, self.fromFactoriePipeName)

        self.pipeToFactorie = Chunk(path=self.toFactoriePipeName, mode='ab')
        self.pipeFromFactorie = Chunk(path=self.fromFactoriePipeName, mode='rb')
        self.taggedChunkIter = iter(self.pipeFromFactorie)

    def close(self):
        self.pipeToFactorie.close()
        self.taggedChunkIter = None
        self.pipeFromFactorie = None
        os.unlink(self.toFactoriePipeName)
        os.unlink(self.fromFactoriePipeName)
        if self.process:
            self.process.terminate()
            self.process = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
        return False

    def __call__(self, stream_item, context):
        if not self.process:
            self.start()
        logger.debug('pushing stream item to factorie')
        self.pipeToFactorie.add(stream_item)
        self.pipeToFactorie.flush()
        nc = self.taggedChunkIter.next()
        logger.debug('got item from factorie')
        return nc
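
## A hedged usage sketch for the incremental factorie transform above; the
## config contents and chunk paths are placeholders, since the keys that
## FactorieBase expects are not shown here.
def tag_chunk_with_factorie(config, i_path, o_path):
    o_chunk = Chunk(path=o_path, mode='wb')
    with factorie(config) as tagger:
        for si in Chunk(path=i_path, mode='rb'):
            ## each call pushes one item through the fifo pair and gets
            ## the tagged item back
            o_chunk.add(tagger(si, {}))
    o_chunk.close()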
def test_spinn3r_pipeline_filter_matches(filename, urls, pipeline_config,
                                         output_file):
    """set a publisher_type filter that matches everything in the feed"""
    pipeline_config['streamcorpus_pipeline']['from_spinn3r_feed'] = {
        'publisher_type': 'WEBLOG'
    }
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))
        work_unit = SimpleWorkUnit(filename)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)

        with Chunk(path=output_file, mode='rb') as chunk:
            assert [si.abs_url for si in chunk] == urls
    def verify(self, o_path, md5):
        url = 'http://s3.amazonaws.com/%(bucket)s/%(o_path)s' % dict(
            bucket=self.config['bucket'], o_path=o_path)
        logger.info('fetching %r' % url)
        req = requests.get(url)
        errors, data = decrypt_and_uncompress(
            req.content,  # pylint: disable=E1103
            self.config.get('gpg_decryption_key_path'),
            tmp_dir=self.config['tmp_dir_path'],
        )

        logger.info('got back SIs: %d' % len(list(Chunk(data=data))))

        rec_md5 = hashlib.md5(data).hexdigest()  # pylint: disable=E1101
        if md5 == rec_md5:
            return
        else:
            logger.critical('\n'.join(errors))
            raise Exception('original md5 = %r != %r = received md5' %
                            (md5, rec_md5))
def test_spinn3r_pipeline_ignore_prefetched(filename, urls, pipeline_config,
                                            output_file):
    """configuration explicitly ignores bad prefetched data"""
    pipeline_config['streamcorpus_pipeline']['from_spinn3r_feed'] = {
        'use_prefetched': False
    }
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))
        key = filename
        from_spinn3r_feed._prefetched[key] = 'bogus data, dude!'
        work_unit = SimpleWorkUnit(key)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)
        del from_spinn3r_feed._prefetched[key]

        with Chunk(path=output_file, mode='rb') as chunk:
            assert [si.abs_url for si in chunk] == urls
def get_john_smith_tagged_by_lingpipe_without_labels_data(test_data_dir):
    fh = StringIO()
    o_chunk = Chunk(file_obj=fh, mode='wb')

    path = get_john_smith_tagged_by_lingpipe_path(test_data_dir)
    for si in Chunk(path):
        for sentence in si.body.sentences['lingpipe']:
            for token in sentence.tokens:
                for labels in token.labels.values():
                    for label in labels:
                        label.offsets.update(token.offsets)
                        for offset in label.offsets.values():
                            offset.value = token.token
                        add_annotation(si.body, label)
                token.labels = dict()
        o_chunk.add(si)

    o_chunk.flush()
    return fh.getvalue()
def test_spinn3r_pipeline_prefetched(filename, urls, pipeline_config,
                                     output_file):
    """minimal end-to-end test, preloading data in the loader"""
    pipeline_config['streamcorpus_pipeline']['from_spinn3r_feed'] = {
        'use_prefetched': True
    }
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))
        key = 'test_file.bin'
        with open(filename, 'rb') as f:
            from_spinn3r_feed._prefetched[key] = f.read()
        work_unit = SimpleWorkUnit(key)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)
        del from_spinn3r_feed._prefetched[key]

        with Chunk(path=output_file, mode='rb') as chunk:
            assert [si.abs_url for si in chunk] == urls
def attempt_fetch(work_unit, fpath):
    '''attempt a fetch and iteration over a work_unit.key path in s3
     '''
    url = 'http://s3.amazonaws.com/aws-publicdatasets/' + work_unit.key.strip()

    ## cheapest way to iterate over the corpus is a few stages of
    ## streamed child processes.  Note that stderr needs to go
    ## separately to a file so that reading the stdin doesn't get
    ## blocked:
    cmd = '(wget -O - %s | gpg --no-permission-warning --trust-model always --output - --decrypt - | xz --decompress) 2> %s-err' % (
        url, fpath)
    print cmd
    child = Popen(cmd, stdout=PIPE, shell=True)
    print 'child launched'
    sys.stdout.flush()

    si_count = 0
    serif_count = 0
    exc = ''
    stream_ids = list()
    clean_visible_bytes = 0
    clean_visible_count = 0
    try:
        for si in Chunk(file_obj=child.stdout):
            print si.stream_id, si.abs_url
            if si.body.language:
                lang = si.body.language.code
            else:
                lang = ''
            stream_ids.append((lang, si.stream_id))
            if si.body.clean_visible:
                clean_visible_count += 1
                clean_visible_bytes += len(si.body.clean_visible)
            si_count += 1
            if 'serif' in si.body.sentences:
                serif_count += 1
    except Exception, exc:
        exc = re.sub('\s+', ' ', str(exc)).strip()
def test_tagger_transform(tagger, chain_selector, stages, tmpdir,
                          test_data_dir):
    transform = stages.init_stage(
        tagger, {
            tagger: {
                'tagger_id': 'lingpipe',
                'annotator_id': 'bagga-and-baldwin',
                'chain_selector': chain_selector
            }
        })
    data = get_john_smith_tagged_by_lingpipe_without_labels_data(test_data_dir)
    with tmpdir.join('{}.{}.sc'.format(tagger,
                                       chain_selector)).open('wb') as tf:
        tf.write(data)
        tf.flush()
        transform.process_path(tf.name)
        found_one = False
        for si in Chunk(tf.name):
            for sentence in si.body.sentences['lingpipe']:
                for token in sentence.tokens:
                    if token.labels:
                        found_one = True
        assert found_one
def get_test_chunk(test_data_dir):
    return Chunk(path=get_test_chunk_path(test_data_dir),
                 message=StreamItem_v0_2_0)
def get_name_info(chunk_path, assert_one_date_hour=False, i_str=None):
    '''
    takes a chunk blob and obtains the date_hour, md5, num
    '''
    assert i_str is not None, 'must provide i_str as keyword arg'

    name_info = dict()
    if i_str:
        name_info['i_str'] = i_str
    else:
        name_info['i_str'] = ''

    i_fname = i_str.split('/')[-1]
    i_fname = i_fname.split('.')[0]  ## strip off .sc[.xz[.gpg]]
    name_info['input_fname'] = i_fname 

    name_info['input_md5'] = i_fname.split('-')[-1]

    # TODO: return a dict-like object that does the expensive
    # calculation lazily, the name format might not even need that
    # value.
    ch = Chunk(path=chunk_path, mode='rb')
    date_hours = set()
    target_names = set()
    doc_ids = set()
    epoch_ticks = None
    count = 0
    for si in ch:
        if epoch_ticks is None:
            epoch_ticks = si.stream_time.epoch_ticks
        date_hours.add( si.stream_time.zulu_timestamp[:13] )
        doc_ids.add( si.doc_id )
        for annotator_id, ratings in si.ratings.items():
            for rating in ratings:
                target_name = rating.target.target_id.split('/')[-1]
                target_names.add( target_name )
        count += 1

    ## create the md5 property, so we can use it in the filename
    name_info['md5'] = ch.md5_hexdigest
    name_info['num'] = count
    name_info['epoch_ticks'] = epoch_ticks

    name_info['target_names'] = '-'.join( target_names )
    name_info['doc_ids_8'] = '-'.join( [di[:8] for di in doc_ids] )

    if assert_one_date_hour:
        assert len(date_hours) == 1, \
            'got a chunk with other than one date_hour! ' + \
            repr(date_hours)

    if len(date_hours) > 0:
        date_hour = list(date_hours)[0]
        date_hour = date_hour.replace('T', '-')
    else:
        assert count == 0, (date_hours, count)
        date_hour = None
    name_info['date_hour'] = date_hour

    # TODO: in future lazy evaluation world, rand8 should return a
    # different value every time it is accessed so that a format could
    # be 'foo-{rand8}{rand8}'
    name_info['rand8'] = '%08x' % (random.randint(0, 0x7fffffff),)

    name_info['date_now'] = datetime.datetime.utcnow().strftime('%Y-%m-%d')
    name_info['time_now'] = datetime.datetime.utcnow().strftime('%H-%M-%S')
    name_info['date_time_now'] = datetime.datetime.utcnow().strftime('%Y-%m-%d-%H-%M-%S')

    return name_info
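
## An illustrative (not canonical) use of the keys computed above to build
## an output file name; the naming pattern itself is an assumption.
def example_output_name(name_info):
    return '%(date_hour)s-%(input_fname)s-%(md5)s-%(num)d.sc' % name_info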

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('input')
    args = parser.parse_args()

    logger = logging.getLogger('streamcorpus_pipeline')
    ch = logging.StreamHandler()
    logger.addHandler(ch)
    streamcorpus_logger = logging.getLogger('streamcorpus')
    streamcorpus_logger.addHandler(ch)

    t = nltk_tokenizer(dict(annotator_id='author'))
    for si in Chunk(args.input):
        t.process_item(si)
        if si.body.clean_visible:
            assert len(si.body.sentences['nltk_tokenizer']) > 0
            logger.critical(
                'num_sentences=%d, has wikipedia %r and %d labels',
                len(si.body.sentences['nltk_tokenizer']), 'wikipedia.org'
                in si.body.clean_html,
                len(getattr(si.body, 'labels', {}).get('author', {})))

            #if len(getattr(si.body, 'labels', {}).get('author', {})) > 3:
            #    c = Chunk('foo.sc', mode='wb')
            #    c.add(si)
            #    c.close()
            #    sys.exit()
def profiles_from_runfile(
    runfile_path,
    offset_c_prepended=False,
    offset_inclusive=True,
    decode_utf=False,
    streamitems_dir=None,
    max_lines=None,
):
    '''
    Returns a dictionary mapping from entity-name to ComparableProfile, where the
    ComparableProfiles are constructed from a runfile.
    '''
    runfile = gzip.open(runfile_path, 'r')
    filter_run = runfile.readline()
    assert filter_run.startswith('#')
    filter_run = json.loads(filter_run[1:])
    if filter_run['task_id'] != 'kba-ssf-2014':
        # do nothing
        return

    runfile_profiles = dict()
    runfile_csv = csv.reader(runfile, delimiter='\t')

    count = 1
    for row in runfile_csv:

        #skip comments
        if row[0].startswith('#'):
            continue

        if max_lines is not None and count > max_lines:
            break

        count += 1
        #parse the row
        stream_item = row[2]
        profile_name = row[3]
        slot_name = row[8]
        slot_value = row[9]
        offset_str = row[10]

        #initialize profile
        if profile_name not in runfile_profiles:
            runfile_profiles[profile_name] = ComparableProfile(
                profile_name, truncate_counts=True)

        #do all the offsets have a 'c' prepended?
        if offset_c_prepended:
            #remove the 'c's
            offsets = [offset[1:] for offset in offset_str.split(',')]
        else:
            #if there is no 'c' prepended, we also know that there is one offset.
            offsets = [offset_str]

        log('{}: fetching stream item: {}'.format(runfile_path, stream_item))

        ## The chunk files are located two levels deep in a directory
        ## hierarchy, where each level is 2 character prefix of the
        ## stream-id.  We are going to extract the first 4 characters
        ## of the stream-id in chunks of 2, so we can find the
        ## corresponding StreamItem on the filesystem.  For example,
        ## the StreamItem with id
        ## 1234567890-abcdef0123456789abcdef0123456789 would be stored
        ## in a single-item chunk file called
        ## ./ab/cd/1234567890-abcdef0123456789abcdef0123456789.sc and
        ## the further file extensions of .xz or .xz.gpg are optional,
        ## because the streamcorpus python package handles that for
        ## us.
        match = re.match('.*-(..)(..).*', stream_item)

        if match is None:
            raise Exception(
                "Cannot read StreamItem for {}".format(stream_item))

        stream_item_path = '{}/{}/{}.sc.xz.gpg'.format(match.group(1),
                                                       match.group(2),
                                                       stream_item)

        stream_item_file_path = os.path.join(streamitems_dir, stream_item_path)

        if not os.path.isfile(stream_item_file_path):
            log('Could not find stream item {}'.format(stream_item))
            continue

        c = Chunk(stream_item_file_path)

        si = [si for si in c][0]  #collect the single si in this chunk

        #are the offsets indexes in the decoded string or the undecoded string?
        if decode_utf:
            clean_visible = si.body.clean_visible.decode('utf-8')
        else:
            clean_visible = si.body.clean_visible

        #parse each offset in offsets
        if len(offsets) > 1:
            begin = int(offsets[0].split('-')[0])
            end = int(offsets[-1].split('-')[1])
        else:
            offset = offsets[0]
            begin, end = [int(loc) for loc in offset.split('-')]

        #account for inclusive offsets.
        if offset_inclusive:
            end += 1

        #build the slot value from clean_visible.
        slot_value_processed = clean_visible[int(begin):int(end)].lower(
        ).replace('_', ' ').strip()

        if not decode_utf:
            #we now must decode, because clean_visible wasn't decoded from the start.
            try:
                slot_value_processed = slot_value_processed.decode('utf-8')
            except UnicodeDecodeError:
                log('Warning: Could not decode slot_value: {}. Will skip slot-fill.'
                    .format(slot_value_processed))
                continue

        log('## %s %s: %s' %
            (profile_name, slot_name, slot_value_processed.encode('utf-8')))

        #we want the bag-of-words associated with this slot_value
        for value in slot_value_processed.split():
            runfile_profiles[profile_name].add_value_for_slot(slot_name, value)

    return runfile_profiles
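
## A minimal usage sketch (not from the original source); only the returned
## mapping's keys are inspected, and the `or {}` guard handles the None
## returned for runfiles that are not kba-ssf-2014.
def print_runfile_profile_names(runfile_path, streamitems_dir):
    profiles = profiles_from_runfile(
        runfile_path, streamitems_dir=streamitems_dir, max_lines=1000)
    for name in sorted(profiles or {}):
        log('built profile for %s' % name)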
'''
## "Chunk" is a convenience wrapper in the python tools built around
## the streamcorpus thirft interfaces.  It is essentially just a
## wrapper around open(<file_handle>) and can take a path to a flat
## file on disk, or a file_obj that has already been opened in memory,
## such as a pipe from stdin or a network socket.
from streamcorpus import Chunk

## These classes are available in any language that can compile the
## streamcorpus thrift interfaces
from streamcorpus import Tagging, Versions, Relation, Attribute, Sentence, Token

## read StreamItems from over stdin.  We will assume that these
## StreamItems have already been constructed and have
## StreamItem.body.clean_visible
i_chunk = Chunk(file_obj=sys.stdin, mode='rb')

## write StreamItems via stdout.  We will add more data to them
o_chunk = Chunk(file_obj=sys.stdout, mode='wb')

## iterate over input chunks, generate data, and write to output
for si in i_chunk:

    assert si.version == Versions.v0_3_0, 'new streamcorpus collections should be built using the latest version'

    ## clean_visible is byte identical to clean_html, except all the
    ## tags are converted to whitespace, so byte offsets in one match the other
    #input_html = si.body.clean_html = text.encode('utf8')
    clean_visible = si.body.clean_visible.decode('utf8')

    ## run the text through a tagger
                )

        si.source_metadata['lang'] = pe.lang[0].code
        si.source_metadata['author'] = json.dumps( 
            dict(
                name = pe.author[0].name,
                email = pe.author[0].email,
                link = pe.author[0].link[0].href,
                )
            )
        si.source = entry.source.publisher_type

        yield si


if __name__ == '__main__':
    #import sys
    #from _handle_unconvertible_spinn3r import handle_unconvertible_spinn3r as hus
    #map(hus, _generate_stream_items(sys.stdin.read()))

    o_chunk = Chunk('/tmp/foo.sc', mode='wb')
    for si in _generate_stream_items(sys.stdin.read()):
        print '---post smoosh raw: %s --' % si.stream_id
        print si.body.raw
        print si.stream_id

        if si.stream_id == '1345928297-da71cfa833ce8218684b6dab152dd69b':
            o_chunk.add( si )

    o_chunk.close()
from streamcorpus import make_stream_item, make_stream_time, get_date_hour

## get useful file wrapper class
from streamcorpus import Chunk

## get a couple of the classes compiled from the thrift interface definitions
from streamcorpus import Tagging, Versions

## somehow get a list of input text files
# fake example with just one input, downloaded via wget (see __doc__
# string above)
input_files = ['index.html']

## open a chunk file to write StreamItems
output_path = 'first-output-chunk.sc'
ch = Chunk(output_path, mode='wb')

for file_path in input_files:

    ## get the text
    text = open(file_path).read()

    ## every StreamItem has a timestamp, which ideally is the creation
    ## time of the text
    zulu_timestamp = '2013-04-18T18:18:20.000000Z'

    ## every StreamItem has an absolute URL, which ideally points to
    ## the real text on the Web
    abs_url = 'http://nytimes.com/index.html'

    si = make_stream_item(zulu_timestamp, abs_url)
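
    ## NOTE: the original example is cut off here.  A hedged sketch of one
    ## possible continuation, reusing the ContentItem and chunk add/close
    ## pattern shown in the other examples in this collection:
    from streamcorpus import ContentItem
    si.body = ContentItem(raw=text, media_type='text/html')
    ch.add(si)

## flush everything to first-output-chunk.sc
ch.close()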
        os.rename(tmp_done_path, final_done_path)

        ## replace with log.info()
        print 'done with %s' % final_done_path

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('tagger_id', 
                        metavar='stanford|lingpipe', 
                        help='name of NLP pipeline to run')
    parser.add_argument('input_dir', help='directory of Chunk files')
    parser.add_argument('output_dir', help='directory to put new streamcorpus.Chunk files')
    parser.add_argument('--convert-kba', action='store_true', default=False,
                        help='Expect input_dir to have old-style KBA chunks')
    parser.add_argument('--pipeline-root', metavar='PIPELINE_ROOT', dest='pipeline_root',
                        help='file path to root dir for the particular NLP pipeline')
    parser.add_argument('--align-only', metavar='OUTPUT_PATH', dest='align_only', default=None,
                        help='produce a chunk file at OUTPUT_PATH by aligning a single input chunk file with the intermediate')
    ## could add options to delete input after verifying the output on disk?
    args = parser.parse_args()

    if args.align_only is not None:
        i_chunk = Chunk(file_obj=open(args.input_dir))
        tmp_ner_path = args.output_dir
        tmp_done_path = args.align_only
        align_chunk_with_ner(tmp_ner_path, i_chunk, tmp_done_path)

    else:
        run_pipeline(args.tagger_id, args.input_dir, args.output_dir, pipeline_root=args.pipeline_root)
for path in lzma.open(s3_paths_fname):
    s3_path = s3_path_prefix + path.strip()
    url = s3_http_host + s3_path
    logger.info(url)
    retries = 0
    max_retries = 10
    while retries < max_retries:
        retries += 1
        sys.stderr.flush()
        try:
            resp = requests.get(url)
            errors, data = decrypt_and_uncompress(resp.content,
                                                  gpg_private='trec-kba-rsa')
            logger.info('\n'.join(errors))
            for si in Chunk(file_obj=StringIO(data)):

                rec = {
                    'url': si.abs_url,
                    'timestamp': si.stream_time.epoch_ticks,
                    'request': None,  ## not part of this data set
                    'response': {
                        'headers': [
                            ['Content-Type', 'text/html'],
                        ],
                        'body': si.body.clean_html,
                        ## alternatively, could use si.body.raw and
                        ## si.body.media_type for the Content-Type
                        ## header, but that would cause the Serif NER
                        ## to be useless to teams...
                    },
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('action', metavar='postproc|align', help='action to perform: postproc or align')
    parser.add_argument('input_file', help='XML file from LingPipe')
    parser.add_argument('output_file',
                        help='XML file to generate with OWPL data')
    parser.add_argument(
        '--source_chunk',
        help='source chunk file that was input to the pipeline data')
    args = parser.parse_args()

    if args.action == 'postproc':
        text = open(args.input_file).read()
        print 'read %d bytes from %s' % (len(text), args.input_file)

        raise NotImplementedError(
            'need to instantiate a LingPipeParser object here')

        for stream_id, tagged_doc in files(text):
            for sent in sentences(tagged_doc):  # pylint: disable=E0602
                for tok in sent.tokens:
                    if tok.entity_type is not None:
                        print tok, EntityType._VALUES_TO_NAMES[tok.entity_type]

    elif args.action == 'align':
        i_chunk = Chunk(path=args.source_chunk, mode='rb')
        o_chunk = Chunk(path=args.output_file, mode='wb')
        align_chunk_with_ner(args.input_file, i_chunk, o_chunk)  # pylint: disable=E0602
def align_chunk_with_ner(tmp_ner_path, i_chunk, tmp_done_path):
    '''
    iterate through the i_chunk and tmp_ner_path to generate a new
    Chunk with body.ner
    '''
    o_chunk = Chunk()
    input_iter = i_chunk.__iter__()
    ner = ''
    stream_id = None

    all_ner = xml.dom.minidom.parse(open(tmp_ner_path))

    for raw_ner in all_ner.getElementsByTagName('FILENAME'):
        
        stream_item = input_iter.next()
        ## get stream_id out of the XML
        stream_id = raw_ner.attributes.get('docid').value
        assert stream_id and stream_id == stream_item.stream_id, \
            '%s != %s\nner=%r' % (stream_id, stream_item.stream_id, ner)

        tagger_id = 'lingpipe'
        tagging = Tagging()
        tagging.tagger_id = tagger_id
        ## get this one file out of its FILENAME tags
        tagged_doc = list(lingpipe.files(raw_ner.toxml()))[0][1]
        tagging.raw_tagging = tagged_doc
        tagging.generation_time = streamcorpus.make_stream_time()
        stream_item.body.taggings[tagger_id] = tagging

        sentences = list(lingpipe.sentences(tagged_doc))

        ## make JS labels on individual tokens
        assert stream_item.ratings[0].mentions, stream_item.stream_id
        john_smith_label = Label()
        john_smith_label.annotator = stream_item.ratings[0].annotator
        john_smith_label.target_id = stream_item.ratings[0].target_id

        # first map all corefchains to their words
        equiv_ids = collections.defaultdict(lambda: set())
        for sent in sentences:
            for tok in sent.tokens:
                if tok.entity_type is not None:
                    equiv_ids[tok.equiv_id].add(cleanse(tok.token))

        ## find all the chains that are John Smith
        johnsmiths = set()
        for equiv_id, names in equiv_ids.items():
            ## detect 'smith' in 'smithye'
            _names = cleanse(' '.join(names))
            if 'john' in _names and 'smith' in _names:
                johnsmiths.add(equiv_id)

        print len(johnsmiths)
        ## now apply the label
        for sent in sentences:
            for tok in sent.tokens:
                if tok.equiv_id in johnsmiths:
                    tok.labels = [john_smith_label]                

        stream_item.body.sentences[tagger_id] = sentences
        
        o_chunk.add(stream_item)

    ## put the o_chunk bytes into the specified file
    open(tmp_done_path, 'wb').write(str(o_chunk))
    ## replace this with log.info()
    print 'created %s' % tmp_done_path