Example #1
0
def load_ba_fixtures(config):
    """Load the BiH parliament CSV fixtures, register the fixture source
    with a read-only system permission, and index the mapped entities.

    The mapped rows are cached in the module-level ``BA_FIXTURES`` dict so
    the CSV is only parsed and mapped once per process.
    """
    if not BA_FIXTURES['entities']:
        with open(os.path.join(FIXTURES, 'ba.mapping.yaml'), 'rb') as fh:
            # safe_load: plain yaml.load without an explicit Loader is
            # deprecated and can construct arbitrary Python objects.
            mapping = yaml.safe_load(fh)
        mapper = Mapper(mapping, config.resolver, scope=config.base_uri)
        with open(os.path.join(FIXTURES, 'ba.csv'), 'rb') as csvfh:
            reader = unicodecsv.DictReader(csvfh)
            for row in reader:
                _, data = mapper.apply(row)
                BA_FIXTURES['entities'].append(data)

    source = Source.ensure({
        'slug': BA_SOURCE,
        'title': 'BiH Parliament',
        'url': 'http://foo.ba/'
    })
    # Grant the system role read-only access to the fixture source.
    permission = Permission()
    permission.role_id = Role.SYSTEM_USER
    permission.read = True
    permission.write = False
    permission.resource_id = source.id
    permission.resource_type = Permission.SOURCE
    session.add(permission)
    session.commit()
    for entity in BA_FIXTURES['entities']:
        config.entities.save(entity['$schema'], entity, source_id=source.id)
    get_loom_indexer().index(source=BA_SOURCE)
Example #2
0
def load_ba_fixtures(config):
    """Load the BiH parliament CSV fixtures, register the fixture source
    with a read-only system permission, and index the mapped entities.

    The mapped rows are cached in the module-level ``BA_FIXTURES`` dict so
    the CSV is only parsed and mapped once per process.
    """
    if not BA_FIXTURES['entities']:
        with open(os.path.join(FIXTURES, 'ba.mapping.yaml'), 'rb') as fh:
            # safe_load: plain yaml.load without an explicit Loader is
            # deprecated and can construct arbitrary Python objects.
            mapping = yaml.safe_load(fh)
        mapper = Mapper(mapping, config.resolver, scope=config.base_uri)
        with open(os.path.join(FIXTURES, 'ba.csv'), 'rb') as csvfh:
            reader = unicodecsv.DictReader(csvfh)
            for row in reader:
                _, data = mapper.apply(row)
                BA_FIXTURES['entities'].append(data)

    source = Source.ensure({
        'slug': BA_SOURCE,
        'title': 'BiH Parliament',
        'url': 'http://foo.ba/'
    })
    # Grant the system role read-only access to the fixture source.
    permission = Permission()
    permission.role_id = Role.SYSTEM_USER
    permission.read = True
    permission.write = False
    permission.resource_id = source.id
    permission.resource_type = Permission.SOURCE
    session.add(permission)
    session.commit()
    for entity in BA_FIXTURES['entities']:
        config.entities.save(entity['$schema'], entity, source_id=source.id)
    get_loom_indexer().index(source=BA_SOURCE)
Example #3
0
 def test_sa_term26_flatten(self):
     """Flattened EveryPolitician term-26 rows must keep group_id/email."""
     mapping, uri = fixture_uri('everypol/mapping.json')
     resolver.store[uri] = mapping
     fh = fixture_file('everypol/term-26.csv')
     results = list(csv_mapper(fh, mapping, resolver))
     mapped_objs = [obj for (obj, _err) in results]
     for flat_row in Mapper.flatten_iter(mapped_objs, mapping, resolver):
         assert 'group_id' in flat_row, flat_row
         assert 'email' in flat_row, flat_row
Example #4
0
def csv_mapper(fileobj, mapping, resolver=None, scope=None):
    """Parse ``fileobj`` as a unicode CSV document and yield each row
    mapped to a JSON schema according to the instructions in ``mapping``."""
    from jsonmapping import Mapper
    records = unicodecsv.DictReader(fileobj)
    # Delegate directly to the mapper's streaming interface.
    yield from Mapper.apply_iter(records, mapping,
                                 resolver=resolver, scope=scope)
Example #5
0
def map_row(csvfile, mapfile, columns=None):
    """ Generator function that transforms a CSV row into a mapped
        dictionary object, one row at a time

    :param csvfile: path to a CSV file, or an already-open file object.
    :param mapfile: path to the JSON mapping specification.
    :param columns: optional collection of column names to keep; all other
        columns are dropped before mapping.
    """
    from contextlib import nullcontext

    mapping = file_to_json(mapfile)
    resolver = RefResolver.from_schema(mapping)
    mapper = Mapper(mapping, resolver)
    total_rows = row_count(csvfile)

    def drop_blank(path, key, value):
        # Drop keys whose value is None, empty string, or otherwise empty.
        return value is not None and value != "" and not_empty(value)

    # BUG FIX: the original only bound ``csvfp`` when ``csvfile`` was a str,
    # raising NameError for file objects, and never closed the file it
    # opened.  utf-8-sig tolerates a leading BOM in the CSV.
    if isinstance(csvfile, str):
        ctx = open(csvfile, 'r', encoding='utf-8-sig')
    else:
        ctx = nullcontext(csvfile)
    with ctx as csvfp:
        for row in tqdm(csv.DictReader(csvfp), total=total_rows):
            if columns:
                row = {key: value for key, value in row.items()
                       if key in columns}
            _, data = mapper.apply(row)
            yield remap(data, visit=drop_blank)
Example #6
0
    def records(self, mapping_name):
        """Yield statements produced by mapping every generated row for
        ``mapping_name``, logging throughput every 10,000 rows."""
        spec_mapping = self.spec.get_mapping(mapping_name)
        mapper = SchemaMapper(spec_mapping, self.config.resolver,
                              scope=self.config.base_uri)
        schema = mapper.visitor.schema.get('id')
        started = time.time()
        statements = 0
        for idx, row in enumerate(self.generator.generate(mapping_name)):
            _, data = mapper.apply(row)
            for statement in self.config.entities.triplify(schema, data):
                statements += 1
                yield statement

            if idx and idx % 10000 == 0:
                elapsed = time.time() - started
                per_record = float(elapsed) / float(idx)
                speed = per_record * 1000
                log.info("Generating %r: %s records (%s, %.2fms/r)",
                         mapping_name, idx, statements, speed)
Example #7
0
def csv_mapper(fileobj, mapping, resolver=None, scope=None):
    """Read ``fileobj`` as a unicode CSV document and yield ``(row, err)``
    pairs produced by applying the JSON schema ``mapping`` to each record."""
    records = unicodecsv.DictReader(fileobj)
    # apply_iter already yields (row, err) tuples; pass them straight through.
    yield from Mapper.apply_iter(records, mapping,
                                 resolver=resolver, scope=scope)
Example #8
0
def load_mapped_csv(graph, csv_uri, mapping, context_id=None):
    """Fetch a CSV document from ``csv_uri``, map each row with the JSON
    ``mapping``, and store the resulting entities in a graph context."""
    context = graph.context(identifier=context_id,
                            meta={'source_url': csv_uri})
    rows = unicodecsv.DictReader(read_uri(csv_uri))
    mapped = Mapper.apply_iter(rows, mapping, graph.resolver,
                               scope=graph.base_uri)
    for data, err in mapped:
        if err is None:
            context.add(data['$schema'], data)
        else:
            log.warning("Error loading %r: %r", csv_uri, err)
    context.save()
Example #9
0
def load_mapped_csv(graph, csv_uri, mapping, context_id=None):
    """Load CSV rows from ``csv_uri``, apply the JSON ``mapping`` to each
    one, and add the successfully mapped entities to a graph context."""
    context_meta = {'source_url': csv_uri}
    context = graph.context(identifier=context_id, meta=context_meta)
    reader = unicodecsv.DictReader(read_uri(csv_uri))
    results = Mapper.apply_iter(reader, mapping, graph.resolver,
                                scope=graph.base_uri)
    for data, err in results:
        if err is None:
            context.add(data['$schema'], data)
        else:
            log.warning("Error loading %r: %r", csv_uri, err)
    context.save()