from __future__ import print_function
import glob
from os.path import isfile, join

# Assumed imports (the aliases suggest these bindings, but they are not
# shown in the original source):
from dossier.fc import FeatureCollectionChunk as FC_Chunk
from streamcorpus import Chunk as SC_Chunk
from dossier.models.etl import html_to_fc


def feature_pipeline(chunk_in, FC_chunk_out):
    '''Run a basic pipeline to generate feature collections from
    StreamItems. If the output file already exists, it is simply
    loaded. Returns a list of either the generated FCs or the FCs
    stored in the existing file.

    `chunk_in` path to a directory of SC chunk files
    `FC_chunk_out` path where the FC chunk file will be written
    '''
    if isfile(FC_chunk_out):
        print(FC_chunk_out, 'already exists...', end=' ')
        fcs = [fc for fc in FC_Chunk(FC_chunk_out, mode='rb')]
        print('loaded.')
    else:
        chunk_out = FC_Chunk(FC_chunk_out, mode='wb')
        fcs = []
        for cfile in glob.glob(join(chunk_in, '*.sc.xz')):
            print('processing', cfile)
            for i, si in enumerate(SC_Chunk(cfile)):
                if i % 10 == 0:
                    print(i, 'fcs processed')
                fc = html_to_fc(html=si.body.raw,
                                encoding=si.body.encoding,
                                url=si.abs_url)
                chunk_out.add(fc)
                fcs.append(fc)
        # Flush so the chunk is actually written before reporting success.
        chunk_out.flush()
        print('done creating', FC_chunk_out)
    return fcs
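# Hypothetical usage sketch (the paths are illustrative, not from the
# original source): point the pipeline at a directory of .sc.xz chunks
# and name a destination .fc file. Re-running is cheap because an
# existing output file is simply loaded instead of regenerated.
fcs = feature_pipeline('/data/corpus/sc-chunks', '/data/corpus/features.fc')
print(len(fcs), 'feature collections available')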
def do_load(self, args):
    get_content_id = partial(self.get_content_id,
                             args.id_feature_prefix, args.id_feature)
    for chunkfile in args.chunk_files:
        if not chunkfile.endswith('.fc'):
            fc_chunker = FeatureCollectionChunk(path=chunkfile)
            for i, fcs in enumerate(chunks(args.batch_size, fc_chunker)):
                fcs = list(fcs)
                content_ids = map(get_content_id, fcs)
                self.store.put(zip(content_ids, fcs))
                print('batch %d (%d FCs)' % (i, len(fcs)))
        else:
            # This currently seg faults.
            fh = open(chunkfile, 'rb')
            fc_chunker = cbor_iter(fh)
            for i, fcs in enumerate(chunks(args.batch_size, fc_chunker)):
                fcs = list(fcs)
                content_ids = map(get_content_id, fcs)
                self.store.put(zip(content_ids, fcs))
                print('batch %d (%d FCs)' % (i, len(fcs)))
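# `chunks` is used above but not defined in this section. A minimal
# sketch, assuming it only needs to split an iterable into batches of at
# most `n` items (the name and argument order are taken from the call
# site above):
from itertools import islice

def chunks(n, iterable):
    it = iter(iterable)
    while True:
        batch = list(islice(it, n))
        if not batch:
            return
        yield batch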
from StringIO import StringIO  # Python 2; on Python 3 use io.BytesIO

from dossier.fc import FeatureCollection, FeatureCollectionChunk


def test_fc_chunk():
    # Write two feature collections into an in-memory chunk...
    fc1 = FeatureCollection({'NAME': {'foo': 2, 'baz': 1}})
    fc2 = FeatureCollection({'NAME': {'foo': 4, 'baz': 2}})
    fh = StringIO()
    chunk = FeatureCollectionChunk(file_obj=fh, mode='wb')
    chunk.add(fc1)
    chunk.add(fc2)
    chunk.flush()
    blob = fh.getvalue()
    assert blob

    # ...then read them back and check that they round-trip unchanged.
    fh = StringIO(blob)
    chunk = FeatureCollectionChunk(file_obj=fh, mode='rb')
    rfc1, rfc2 = list(chunk)
    assert fc1 == rfc1
    assert fc2 == rfc2
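# The same round trip works against a path instead of a file object, as
# the loaders above do. A small sketch (the path is illustrative):
def example_fc_chunk_on_disk(path='/tmp/example.fc'):
    chunk = FeatureCollectionChunk(path=path, mode='wb')
    chunk.add(FeatureCollection({'NAME': {'foo': 1}}))
    chunk.flush()
    return [fc for fc in FeatureCollectionChunk(path=path, mode='rb')]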
from __future__ import print_function
import multiprocessing
from itertools import imap  # Python 2

import kvlayer
import yakonfig
import yakonfig.cmd
from dossier.fc import FeatureCollectionChunk
from dossier.store import Store

# Assumed locations for the remaining helpers, which are not shown in
# this section: the Ads and Scrapy ETL classes, add_sip_to_fc,
# batch_progress, and the gensim-style `models` module providing
# TfidfModel.


class App(yakonfig.cmd.ArgParseCmd):
    def __init__(self, *args, **kwargs):
        yakonfig.cmd.ArgParseCmd.__init__(self, *args, **kwargs)
        self._store = None
        self._chunk = None
        self.tfidf = None

    @property
    def store(self):
        # Lazily construct the kvlayer-backed store, picking up feature
        # indexes from the global `dossier.store` config when present.
        if self._store is None:
            feature_indexes = None
            try:
                conf = yakonfig.get_global_config('dossier.store')
                feature_indexes = conf['feature_indexes']
            except KeyError:
                pass
            self._store = Store(kvlayer.client(),
                                feature_indexes=feature_indexes)
        return self._store

    def done(self):
        if self._chunk is not None:
            self._chunk.flush()

    def get_output_accumulator(self, output_path=None):
        # Return a callback that sinks (content_id, fc) batches either
        # into a chunk file on disk or directly into the store.
        if output_path is not None:
            self._chunk = FeatureCollectionChunk(path=output_path, mode='wb')

        def add(cids_and_fcs):
            if self.tfidf is not None:
                for _, fc in cids_and_fcs:
                    add_sip_to_fc(fc, self.tfidf)
            if output_path is not None:
                for _, fc in cids_and_fcs:
                    self._chunk.add(fc)
            else:
                self.store.put(cids_and_fcs)
        return add

    def get_mapper(self, args):
        # Map serially for one process, otherwise fan out over a pool.
        cpus = getattr(args, 'processes', 1)
        if cpus == 1:
            return imap
        else:
            pool = multiprocessing.Pool(processes=cpus)
            return pool.imap

    def args_etl_ads(self, p):
        p.add_argument('--host', default='localhost')
        p.add_argument('--port', default=9090, type=int)
        p.add_argument('--table-prefix', default='')
        p.add_argument('--limit', default=None, type=int)
        p.add_argument('--batch-size', default=1000, type=int)
        p.add_argument('--start', default=None, type=str)
        p.add_argument('--stop', default=None, type=str)
        p.add_argument('-p', '--processes',
                       default=multiprocessing.cpu_count(), type=int)
        p.add_argument('-o', '--output', default=None)
        p.add_argument('--tfidf', default=None, type=str,
                       help='Path to TF-IDF background model. Can be '
                            'generated with the `dossier.etl tfidf` script.')

    def do_etl_ads(self, args):
        if args.tfidf is not None:
            self.tfidf = models.TfidfModel.load(args.tfidf)
        etl = Ads(args.host, args.port, table_prefix=args.table_prefix)
        gen = etl.cids_and_fcs(self.get_mapper(args), args.start, args.stop,
                               limit=args.limit)
        self.etl(args, etl, gen)

    def args_etl_scrapy(self, p):
        p.add_argument('-p', '--processes',
                       default=multiprocessing.cpu_count(), type=int)
        p.add_argument('--batch-size', default=1000, type=int)
        p.add_argument('--limit', default=None, type=int)
        p.add_argument('-o', '--output', default=None)
        p.add_argument('--url-prefix', default=None,
                       help='Override the URL prefix to use when fixing '
                            'relative URLs. When omitted, detect '
                            'automatically.')
        p.add_argument('--tfidf', default=None, type=str,
                       help='Path to TF-IDF background model. Can be '
                            'generated with the `dossier.etl tfidf` script.')
        p.add_argument('input',
                       help='Scrapy data. Only supports CSV format '
                            'currently.')

    def do_etl_scrapy(self, args):
        if args.tfidf is not None:
            self.tfidf = models.TfidfModel.load(args.tfidf)
        url_prefix = args.url_prefix
        if url_prefix is None:
            url_prefix = Scrapy.detect_url_prefix(open(args.input))
            if url_prefix is not None:
                print('Auto-detected URL prefix:', url_prefix)
        etl = Scrapy(open(args.input), url_prefix=url_prefix)
        gen = etl.cids_and_fcs(self.get_mapper(args), limit=args.limit)
        self.etl(args, etl, gen)

    def etl(self, args, etl, gen):
        add = self.get_output_accumulator(args.output)
        try:
            batch_progress(gen, add, limit=args.limit,
                           batch_size=args.batch_size)
        finally:
            self.done()
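# `batch_progress` is called in App.etl but not defined in this section.
# A minimal sketch, assuming it only needs to drain the (content_id, fc)
# generator in fixed-size batches, hand each batch to the accumulator
# returned by get_output_accumulator, and report progress as it goes
# (the signature is taken from the call site above):
from itertools import islice

def batch_progress(cids_and_fcs, add, limit=None, batch_size=1000):
    processed = 0
    it = iter(cids_and_fcs)
    while limit is None or processed < limit:
        batch = list(islice(it, batch_size))
        if not batch:
            break
        add(batch)
        processed += len(batch)
        print(processed, 'FCs accumulated')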