def load_ba_fixtures(config):
    # This is messy. Would be cool to do it more cleanly, but how?
    if not len(BA_FIXTURES['entities']):
        with open(os.path.join(FIXTURES, 'ba.mapping.yaml'), 'rb') as fh:
            mapping = yaml.load(fh)
        mapper = Mapper(mapping, config.resolver, scope=config.base_uri)
        with open(os.path.join(FIXTURES, 'ba.csv'), 'rb') as csvfh:
            reader = unicodecsv.DictReader(csvfh)
            for row in reader:
                _, data = mapper.apply(row)
                BA_FIXTURES['entities'].append(data)

    source = Source.ensure({
        'slug': BA_SOURCE,
        'title': 'BiH Parliament',
        'url': 'http://foo.ba/'
    })

    permission = Permission()
    permission.role_id = Role.SYSTEM_USER
    permission.read = True
    permission.write = False
    permission.resource_id = source.id
    permission.resource_type = Permission.SOURCE
    session.add(permission)
    session.commit()

    for entity in BA_FIXTURES['entities']:
        config.entities.save(entity['$schema'], entity, source_id=source.id)
    get_loom_indexer().index(source=BA_SOURCE)
def map_row(csvfile, mapfile, columns=None):
    """
    Generator that transforms CSV rows into mapped dictionary objects,
    one row at a time.
    """
    mapping = file_to_json(mapfile)
    resolver = RefResolver.from_schema(mapping)
    mapper = Mapper(mapping, resolver)
    # Visitor used by remap() to drop keys whose values are empty.
    drop_blank = lambda p, k, v: v is not None and v != "" and not_empty(v)
    total_rows = row_count(csvfile)
    # Accept either a file path or an already-open file object.
    csvfp = open(csvfile, 'r', encoding='utf-8-sig') \
        if isinstance(csvfile, str) else csvfile
    for row in tqdm(csv.DictReader(csvfp), total=total_rows):
        if columns:
            row = {key: value for key, value in row.items() if key in columns}
        _, data = mapper.apply(row)
        data = remap(data, visit=drop_blank)
        yield data
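# A minimal usage sketch for map_row(). The file names, the column list and the
# JSON-lines output are hypothetical; they just show one way to consume the generator.
import json

def write_mapped_rows(csvfile='people.csv', mapfile='people.mapping.json',
                      outfile='people.jsonl'):
    with open(outfile, 'w', encoding='utf-8') as out:
        for data in map_row(csvfile, mapfile, columns=['name', 'birth_date']):
            out.write(json.dumps(data) + '\n')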
def records(self, mapping_name):
    mapper = SchemaMapper(self.spec.get_mapping(mapping_name),
                          self.config.resolver, scope=self.config.base_uri)
    schema = mapper.visitor.schema.get('id')
    begin = time.time()
    stmts = 0
    for i, row in enumerate(self.generator.generate(mapping_name)):
        _, data = mapper.apply(row)
        for stmt in self.config.entities.triplify(schema, data):
            stmts += 1
            yield stmt
        # Log throughput every 10,000 source rows.
        if i > 0 and i % 10000 == 0:
            elapsed = time.time() - begin
            per_record = float(elapsed) / float(i)
            speed = per_record * 1000
            log.info("Generating %r: %s records (%s, %.2fms/r)",
                     mapping_name, i, stmts, speed)
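# A rough sketch of how records() might be driven. `loader` stands in for the
# object this method lives on; loader.spec.mappings is an assumed name for the
# list of mapping names, and handle_stmt is a caller-supplied sink (e.g. a
# function that writes each statement to a store).
def load_all(loader, handle_stmt):
    for mapping_name in loader.spec.mappings:
        for stmt in loader.records(mapping_name):
            handle_stmt(stmt)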