Example No. 1
def feature_pipeline(chunk_in, FC_chunk_out):
    '''Run a basic pipeline to generate feature collections from
    StreamItems. If the output file already exists, the existing FCs
    are loaded instead of being regenerated. Returns a list of either
    the generated FCs or the FCs read from the existing file.

    `chunk_in` is the path to a directory of SC chunk files.

    `FC_chunk_out` is the path where the FC chunk file will be written.

    '''
    if isfile(FC_chunk_out):
        print FC_chunk_out, 'already exists...',
        fcs = [fc for fc in FC_Chunk(FC_chunk_out, mode='rb')]
        print 'loaded.'
    else:
        chunk_out = FC_Chunk(FC_chunk_out, mode='wb')
        fcs = []
        for cfile in glob.glob(join(chunk_in, '*.sc.xz')):
            print 'processing', cfile
            for i, si in enumerate(SC_Chunk(cfile)):
                if i % 10 == 0:
                    print i, 'FCs processed'
                fc = html_to_fc(
                    html=si.body.raw,
                    encoding=si.body.encoding,
                    url=si.abs_url)
                chunk_out.add(fc)
                fcs.append(fc)

        # flush the chunk so the FCs are actually written to disk
        chunk_out.flush()
        print 'done creating', FC_chunk_out
    return fcs
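
A hypothetical invocation of this pipeline might look like the following sketch; both paths are illustrative placeholders, not values taken from the example.

# Sketch only: the input directory and output path are placeholders.
fcs = feature_pipeline('corpus/stream_items', 'corpus/features.fc')
print len(fcs), 'feature collections available'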
Example No. 2
def get_output_accumulator(self, output_path=None):
    # Returns a callable that accumulates (content_id, FC) pairs, writing
    # them to a chunk file when an output path is given and to the store
    # otherwise.
    if output_path is not None:
        self._chunk = FeatureCollectionChunk(path=output_path, mode='wb')
    def add(cids_and_fcs):
        if self.tfidf is not None:
            for _, fc in cids_and_fcs:
                add_sip_to_fc(fc, self.tfidf)
        if output_path is not None:
            for _, fc in cids_and_fcs:
                self._chunk.add(fc)
        else:
            self.store.put(cids_and_fcs)
    return add
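
A minimal usage sketch for the returned accumulator, assuming `app` is an instance of the class this method belongs to (see Example No. 5); the content id and FeatureCollection below are placeholders:

# Sketch only: `app`, the content id, and the FC are illustrative assumptions.
add = app.get_output_accumulator(output_path='out.fc')
add([('content-id-1', FeatureCollection({'NAME': {'alice': 1}}))])
app.done()  # flushes the underlying FeatureCollectionChunk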
Example No. 3
def do_load(self, args):
    get_content_id = partial(self.get_content_id, args.id_feature_prefix,
                             args.id_feature)
    for chunkfile in args.chunk_files:
        if not chunkfile.endswith('.fc'):
            fc_chunker = FeatureCollectionChunk(path=chunkfile)
            for i, fcs in enumerate(chunks(args.batch_size, fc_chunker)):
                fcs = list(fcs)
                content_ids = map(get_content_id, fcs)
                self.store.put(zip(content_ids, fcs))
                print('batch %d (%d FCs)' % (i, len(fcs)))
        else:
            # This currently seg faults.
            fh = open(chunkfile, 'rb')
            fc_chunker = cbor_iter(fh)
            for i, fcs in enumerate(chunks(args.batch_size, fc_chunker)):
                fcs = list(fcs)
                content_ids = map(get_content_id, fcs)
                self.store.put(zip(content_ids, fcs))
                print('batch %d (%d FCs)' % (i, len(fcs)))
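
A hypothetical call to this loader, assuming `app` is an instance of the surrounding class; the argument values are placeholders that mirror the attributes read above:

# Sketch only: all values below are illustrative placeholders.
import argparse

args = argparse.Namespace(
    id_feature_prefix='', id_feature='NAME',
    chunk_files=['corpus/features.chunk'], batch_size=100)
app.do_load(args)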
Example No. 4
def test_fc_chunk():
    fc1 = FeatureCollection({'NAME': {'foo': 2, 'baz': 1}})
    fc2 = FeatureCollection({'NAME': {'foo': 4, 'baz': 2}})

    fh = StringIO()
    chunk = FeatureCollectionChunk(file_obj=fh, mode='wb')
    chunk.add(fc1)
    chunk.add(fc2)
    chunk.flush()

    blob = fh.getvalue()
    assert blob
    fh = StringIO(blob)
    chunk = FeatureCollectionChunk(file_obj=fh, mode='rb')
    rfc1, rfc2 = list(chunk)
    assert fc1 == rfc1
    assert fc2 == rfc2
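
The same round trip can presumably be done against a file on disk using the `path` argument seen in the other examples; this sketch assumes that, and assumes `flush()` finalizes the file before it is reopened. The path is a placeholder.

# Sketch only: path-based write and read-back, under the assumptions above.
chunk = FeatureCollectionChunk(path='/tmp/example.fc', mode='wb')
chunk.add(fc1)
chunk.add(fc2)
chunk.flush()

assert list(FeatureCollectionChunk(path='/tmp/example.fc', mode='rb')) == [fc1, fc2]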
Example No. 5
class App(yakonfig.cmd.ArgParseCmd):
    def __init__(self, *args, **kwargs):
        yakonfig.cmd.ArgParseCmd.__init__(self, *args, **kwargs)
        self._store = None
        self._chunk = None
        self.tfidf = None

    @property
    def store(self):
        if self._store is None:
            feature_indexes = None
            try:
                conf = yakonfig.get_global_config('dossier.store')
                feature_indexes = conf['feature_indexes']
            except KeyError:
                pass
            self._store = Store(kvlayer.client(),
                                feature_indexes=feature_indexes)
        return self._store

    def done(self):
        if self._chunk is not None:
            self._chunk.flush()

    def get_output_accumulator(self, output_path=None):
        if output_path is not None:
            self._chunk = FeatureCollectionChunk(path=output_path, mode='wb')
        def add(cids_and_fcs):
            if self.tfidf is not None:
                for _, fc in cids_and_fcs:
                    add_sip_to_fc(fc, self.tfidf)
            if output_path is not None:
                for _, fc in cids_and_fcs:
                    self._chunk.add(fc)
            else:
                self.store.put(cids_and_fcs)
        return add

    def get_mapper(self, args):
        cpus = getattr(args, 'processes', 1)
        if cpus == 1:
            return imap
        else:
            pool = multiprocessing.Pool(processes=cpus)
            return pool.imap

    def args_etl_ads(self, p):
        p.add_argument('--host', default='localhost')
        p.add_argument('--port', default=9090, type=int)
        p.add_argument('--table-prefix', default='')
        p.add_argument('--limit', default=None, type=int)
        p.add_argument('--batch-size', default=1000, type=int)
        p.add_argument('--start', default=None, type=str)
        p.add_argument('--stop', default=None, type=str)
        p.add_argument('-p', '--processes',
                       default=multiprocessing.cpu_count(), type=int)
        p.add_argument('-o', '--output', default=None)
        p.add_argument('--tfidf', default=None, type=str,
                       help='Path to TF-IDF background model. Can be '
                            'generated with the `dossier.etl tfidf` script.')

    def do_etl_ads(self, args):
        if args.tfidf is not None:
            self.tfidf = models.TfidfModel.load(args.tfidf)
        etl = Ads(args.host, args.port, table_prefix=args.table_prefix)
        gen = etl.cids_and_fcs(self.get_mapper(args), args.start, args.stop,
                               limit=args.limit)
        self.etl(args, etl, gen)

    def args_etl_scrapy(self, p):
        p.add_argument('-p', '--processes',
                       default=multiprocessing.cpu_count(), type=int)
        p.add_argument('--batch-size', default=1000, type=int)
        p.add_argument('--limit', default=None, type=int)
        p.add_argument('-o', '--output', default=None)
        p.add_argument('--url-prefix', default=None,
                       help='Override the URL prefix to use when fixing '
                            'relative URLs. When omitted, detect '
                            'automatically.')
        p.add_argument('--tfidf', default=None, type=str,
                       help='Path to TF-IDF background model. Can be '
                            'generated with the `dossier.etl tfidf` script.')
        p.add_argument('input',
                       help='Scrapy data. Only supports CSV format currently.')

    def do_etl_scrapy(self, args):
        if args.tfidf is not None:
            self.tfidf = models.TfidfModel.load(args.tfidf)

        url_prefix = args.url_prefix
        if url_prefix is None:
            url_prefix = Scrapy.detect_url_prefix(open(args.input))
            if url_prefix is not None:
                print('Auto-detected URL prefix:', url_prefix)
        etl = Scrapy(open(args.input), url_prefix=url_prefix)
        gen = etl.cids_and_fcs(self.get_mapper(args), limit=args.limit)
        self.etl(args, etl, gen)

    def etl(self, args, etl, gen):
        add = self.get_output_accumulator(args.output)
        try:
            batch_progress(gen, add, limit=args.limit,
                           batch_size=args.batch_size)
        finally:
            self.done()