Beispiel #1
0
def load_ba_fixtures(config):
    """Load the BA fixture entities, save them and index their source.

    On the first call the rows of ``ba.csv`` are run through the mapping
    in ``ba.mapping.yaml`` and the results are appended to the
    module-level ``BA_FIXTURES['entities']`` list; later calls reuse that
    list instead of parsing the files again.

    Every call then obtains the ``BA_SOURCE`` source via ``Source.ensure``,
    adds a read-only permission on it for ``Role.SYSTEM_USER``, saves each
    cached entity through ``config.entities`` and finally indexes the
    source.

    :param config: object providing ``resolver``, ``base_uri`` and
        ``entities`` (the latter must offer ``save``).
    """
    # This is messy. Would be cool to do it more cleanly, but how?
    if not BA_FIXTURES['entities']:
        with open(os.path.join(FIXTURES, 'ba.mapping.yaml'), 'rb') as fh:
            # safe_load: the mapping is plain data, and a bare yaml.load()
            # without an explicit Loader is rejected by PyYAML >= 6.
            mapping = yaml.safe_load(fh)
        mapper = Mapper(mapping, config.resolver, scope=config.base_uri)
        with open(os.path.join(FIXTURES, 'ba.csv'), 'rb') as csvfh:
            reader = unicodecsv.DictReader(csvfh)
            for row in reader:
                # apply() returns a pair; only the mapped data is kept.
                _, data = mapper.apply(row)
                BA_FIXTURES['entities'].append(data)

    source = Source.ensure({
        'slug': BA_SOURCE,
        'title': 'BiH Parliament',
        'url': 'http://foo.ba/'
    })
    # Read-only permission on the fixture source for the system user role.
    permission = Permission()
    permission.role_id = Role.SYSTEM_USER
    permission.read = True
    permission.write = False
    permission.resource_id = source.id
    permission.resource_type = Permission.SOURCE
    session.add(permission)
    session.commit()
    for entity in BA_FIXTURES['entities']:
        config.entities.save(entity['$schema'], entity, source_id=source.id)
    get_loom_indexer().index(source=BA_SOURCE)
Beispiel #2
0
def load_ba_fixtures(config):
    """Load the BA fixture entities, save them and index their source.

    On the first call the rows of ``ba.csv`` are run through the mapping
    in ``ba.mapping.yaml`` and the results are appended to the
    module-level ``BA_FIXTURES['entities']`` list; later calls reuse that
    list instead of parsing the files again.

    Every call then obtains the ``BA_SOURCE`` source via ``Source.ensure``,
    adds a read-only permission on it for ``Role.SYSTEM_USER``, saves each
    cached entity through ``config.entities`` and finally indexes the
    source.

    :param config: object providing ``resolver``, ``base_uri`` and
        ``entities`` (the latter must offer ``save``).
    """
    # This is messy. Would be cool to do it more cleanly, but how?
    if not BA_FIXTURES['entities']:
        with open(os.path.join(FIXTURES, 'ba.mapping.yaml'), 'rb') as fh:
            # safe_load: the mapping is plain data, and a bare yaml.load()
            # without an explicit Loader is rejected by PyYAML >= 6.
            mapping = yaml.safe_load(fh)
        mapper = Mapper(mapping, config.resolver, scope=config.base_uri)
        with open(os.path.join(FIXTURES, 'ba.csv'), 'rb') as csvfh:
            reader = unicodecsv.DictReader(csvfh)
            for row in reader:
                # apply() returns a pair; only the mapped data is kept.
                _, data = mapper.apply(row)
                BA_FIXTURES['entities'].append(data)

    source = Source.ensure({
        'slug': BA_SOURCE,
        'title': 'BiH Parliament',
        'url': 'http://foo.ba/'
    })
    # Read-only permission on the fixture source for the system user role.
    permission = Permission()
    permission.role_id = Role.SYSTEM_USER
    permission.read = True
    permission.write = False
    permission.resource_id = source.id
    permission.resource_type = Permission.SOURCE
    session.add(permission)
    session.commit()
    for entity in BA_FIXTURES['entities']:
        config.entities.save(entity['$schema'], entity, source_id=source.id)
    get_loom_indexer().index(source=BA_SOURCE)
Beispiel #3
0
def map_row(csvfile, mapfile, columns=None):
    """Yield one mapped dictionary per row of a CSV file.

    Each row is optionally restricted to ``columns``, passed through the
    mapper built from ``mapfile``, and then stripped of blank values
    (``None``, empty strings, and anything ``not_empty`` rejects).

    :param csvfile: path of the CSV file (a ``str``, read as UTF-8 with an
        optional BOM), or an already open text file object. A file opened
        here is closed when the generator finishes or is closed; a file
        object passed in is left open for the caller.
    :param mapfile: mapping file, loaded via ``file_to_json``.
    :param columns: optional collection of column names to keep; when
        empty or ``None`` every column is passed to the mapper.
    """
    mapping = file_to_json(mapfile)
    resolver = RefResolver.from_schema(mapping)
    mapper = Mapper(mapping, resolver)

    def drop_blank(p, k, v):
        # Visitor for remap(): keep only values that carry content.
        return v is not None and v != "" and not_empty(v)

    total_rows = row_count(csvfile)

    owns_file = isinstance(csvfile, str)
    if owns_file:
        # newline='' lets the csv module do its own newline handling, as
        # its documentation requires for files handed to a reader.
        csvfp = open(csvfile, 'r', encoding='utf-8-sig', newline='')
    else:
        # Previously csvfp was left unbound here (UnboundLocalError).
        csvfp = csvfile

    try:
        for row in tqdm(csv.DictReader(csvfp), total=total_rows):
            if columns:
                row = {key: value for key, value in row.items()
                       if key in columns}
            # apply() returns a pair; only the mapped data is used.
            _, data = mapper.apply(row)
            yield remap(data, visit=drop_blank)
    finally:
        # Only close what this function opened.
        if owns_file:
            csvfp.close()
Beispiel #4
0
    def records(self, mapping_name):
        """Yield the statements generated for every row of a mapping.

        Each row from ``self.generator.generate(mapping_name)`` is run
        through a ``SchemaMapper`` built from the named mapping, and the
        mapped data is handed to ``self.config.entities.triplify``, whose
        statements are yielded one by one. A progress line is logged on
        every 10000th row index.
        """
        spec_mapping = self.spec.get_mapping(mapping_name)
        mapper = SchemaMapper(spec_mapping, self.config.resolver,
                              scope=self.config.base_uri)
        schema = mapper.visitor.schema.get('id')
        started = time.time()
        emitted = 0
        for index, row in enumerate(self.generator.generate(mapping_name)):
            # apply() returns a pair; only the mapped data is used.
            _, mapped = mapper.apply(row)
            for statement in self.config.entities.triplify(schema, mapped):
                emitted += 1
                yield statement

            # Report progress only on non-zero multiples of 10000.
            if index == 0 or index % 10000 != 0:
                continue
            elapsed = time.time() - started
            # Average time per row so far, converted to milliseconds.
            ms_per_record = (float(elapsed) / float(index)) * 1000
            log.info("Generating %r: %s records (%s, %.2fms/r)",
                     mapping_name, index, emitted, ms_per_record)