예제 #1
0
    def generate_table(self, meta, sheet, row_set):
        """Load the rows of one row set into a freshly created table.

        A ``TabularSchema`` is built from the row set's name, the content
        hash of ``meta`` and the ``sheet`` value; one column is added per
        guessed header. The backing ``Tabular`` table is dropped and
        re-created before the rows are streamed into it.

        :param meta: metadata object; only ``meta.content_hash`` is read.
        :param sheet: value stored under the schema's ``'sheet'`` key.
        :param row_set: row set providing ``sample``, ``name``,
            ``register_processor`` and iteration over rows of cells.
        :returns: the ``TabularSchema`` describing the loaded table.
        """
        offset, headers = headers_guess(row_set.sample)
        row_set.register_processor(headers_processor(headers))
        # NOTE(review): offset + 1 presumably skips past the header row
        # itself -- confirm against offset_processor's contract.
        row_set.register_processor(offset_processor(offset + 1))
        schema = TabularSchema({
            'sheet_name': row_set.name,
            'content_hash': meta.content_hash,
            'sheet': sheet
        })
        columns = [schema.add_column(h) for h in headers]
        log.info("Creating internal table: %s columns, table: %r", len(columns),
                 schema.table_name)
        tabular = Tabular(schema)
        tabular.drop()
        tabular.create()

        def generate_rows():
            # Count the records handed to the loader explicitly: relying on
            # the loop variable after the loop fails on an empty row set
            # (the name is never bound) and is off by one otherwise.
            loaded = 0
            for row in row_set:
                record = {
                    column.name: string_value(cell.value)
                    for cell, column in zip(row, columns)
                }
                if not record:
                    # Nothing could be paired with a column; skip the row.
                    continue
                # Rows shorter than the header still get every column key.
                for column in columns:
                    record.setdefault(column.name, None)
                loaded += 1
                yield record
            log.info("Loaded %s rows.", loaded)

        tabular.load_iter(generate_rows())
        return schema
예제 #2
0
    def ingest(self, meta, local_path):
        """Read a DBF file into an internal table and emit a document.

        The file at ``local_path`` is opened in binary mode and wrapped in
        ``DBF``. A ``TabularSchema`` with one column per DBF field is
        created, the backing ``Tabular`` table is dropped and re-created,
        and all records are streamed into it. Afterwards the schema is
        attached to ``meta.tables`` and the document built from ``meta`` is
        emitted.

        :param meta: metadata object; ``meta.content_hash`` is read and
            ``meta.tables`` is overwritten with the new schema.
        :param local_path: filesystem path of the DBF file to read.
        """
        with open(local_path, 'rb') as fh:
            db = DBF(fh)
            schema = TabularSchema({
                'content_hash': meta.content_hash,
                'sheet': 0
            })
            # Map each column's label to the name used as the record key.
            # NOTE(review): lookups below assume the label equals the DBF
            # field key passed to add_column -- confirm in TabularSchema.
            column_names = {
                c.label: c.name
                for c in [schema.add_column(h) for h in db.fields.keys()]
            }
            tabular = Tabular(schema)
            tabular.drop()
            tabular.create()

            def generate_rows():
                if db.numrec == 0:
                    return

                # First pass: gather every string value so that a single
                # encoding can be guessed for the whole file.
                text = []
                for i in xrange(0, db.numrec):
                    for v in db.select(i).values():
                        if isinstance(v, str):
                            text.append(v)
                encoding = guess_encoding(' '.join(text))

                # Second pass: convert and yield the records. The number of
                # records handed to the loader is counted explicitly; the
                # last loop index would under-report it by one.
                loaded = 0
                for i in xrange(0, db.numrec):
                    record = {}
                    for k, value in db.select(i).items():
                        name = column_names.get(k)
                        record[name] = string_value(value, encoding=encoding)
                    if not record:
                        continue
                    # Make sure every column key is present in the record.
                    for name in column_names.values():
                        record.setdefault(name, None)
                    loaded += 1
                    yield record

                log.info("Loaded %s rows.", loaded)

            tabular.load_iter(generate_rows())
            meta.tables = [schema]
            document = self.create_document(meta)
            self.emit(document)