def make_entities(db_path, outfile):
    db = dataset.connect("sqlite:///%s" % db_path)
    # Stage entity fragments in a throw-away, in-memory followthemoney store.
    store = Dataset("temp", database_uri="sqlite://")
    writer = store.bulk()
    write_edges(writer, db)
    write_addresses(writer, db)
    write_nodes(writer, db["entity"], "Company")
    write_nodes(writer, db["intermediary"])
    write_nodes(writer, db["officer"])
    # Make sure buffered fragments hit the store before reading them back.
    writer.flush()
    for entity in store.iterate():
        write_object(outfile, entity)
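# The helpers used above (write_nodes, write_edges, write_addresses,
# write_object) are not part of this excerpt. Purely as an illustration of the
# pattern, a write_nodes() along these lines would turn the rows of one source
# table into followthemoney entities; the column names and the default schema
# are assumptions, not taken from the original code.
from followthemoney import model


def write_nodes(writer, table, schema="LegalEntity"):
    for row in table:
        entity = model.make_entity(schema)
        entity.make_id(schema, row["node_id"])  # hypothetical ID column
        entity.add("name", row.get("name"))
        writer.put(entity, fragment=str(row["node_id"]))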
def handle(self, task):
    name = task.context.get("ftmstore", task.job.dataset.name)
    entity_ids = task.payload.get("entity_ids")
    dataset = Dataset(name, OP_TRANSLATE)
    try:
        writer = dataset.bulk()
        for entity in dataset.partials(entity_id=entity_ids):
            self.translate(writer, entity)
        writer.flush()
        self.dispatch_next(task, entity_ids)
    finally:
        dataset.close()
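# self.translate() is not shown in this excerpt. A minimal sketch of what such
# a method might do: run the free-text properties of each partial entity
# through a translation backend and write the result back as an additional
# fragment. translate_text() and the "translated" fragment name are
# placeholders, not part of any real API.
def translate_text(value):
    # Placeholder for whatever translation backend the service actually uses.
    return value


def translate(self, writer, entity):
    translated = entity.clone()
    for prop, value in entity.itervalues():
        if prop.type.name == "text":
            translated.add(prop, translate_text(value))
    writer.put(translated, fragment="translated")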
class EntityEmitter(object):

    def __init__(self, context):
        self.fragment = 0
        self.log = context.log
        self.name = context.crawler.name
        self.dataset = Dataset(self.name, origin=ORIGIN)
        self.bulk = self.dataset.bulk()

    def make(self, schema):
        entity = model.make_entity(schema, key_prefix=self.name)
        return entity

    def emit(self, entity, rule='pass'):
        if entity.id is None:
            raise RuntimeError("Entity has no ID: %r" % entity)
        # Each emit becomes its own fragment, so repeated emits of the same
        # entity ID accumulate rather than overwrite each other.
        self.bulk.put(entity, fragment=str(self.fragment))
        self.fragment += 1

    def finalize(self):
        self.bulk.flush()
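# Illustrative use of the emitter from inside a crawler stage. The context
# object and the record fields are invented for the example; only the emitter
# methods themselves come from the class above.
def crawl_record(context, record):
    emitter = EntityEmitter(context)
    person = emitter.make("Person")
    person.make_id(record["registry_id"])  # hypothetical source field
    person.add("name", record.get("name"))
    emitter.emit(person)
    emitter.finalize()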
    except Exception:
        log.exception("Failed to parse: %r", member)


def parse_archive(writer, archive_path):
    log.info("Archive: %s", archive_path)
    tar = tarfile.open(archive_path, "r")
    while True:
        member = tar.next()
        if member is None:
            break
        fh = tar.extractfile(member)
        if fh is None:
            continue
        parse_file(writer, fh, member)
        fh.close()
    tar.close()
    writer.flush()


if __name__ == "__main__":
    prefix = "data/"
    dataset = Dataset("pa_companies", origin="parse")
    writer = dataset.bulk()
    for file_name in sorted(os.listdir(prefix)):
        file_path = os.path.join(prefix, file_name)
        parse_archive(writer, file_path)
    with open("panama.json", "w") as fh:
        for entity in dataset.iterate():
            write_object(fh, entity)
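# write_object() is not defined in this excerpt. As a stand-in, a minimal
# version could emit one JSON document per line; the call to
# EntityProxy.to_dict() is the only assumption about the entity objects here.
import json


def write_object(fh, entity):
    fh.write(json.dumps(entity.to_dict(), sort_keys=True))
    fh.write("\n")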