Beispiel #1
0
    def __init__(self,
                 resource=None,
                 fields=None,
                 sheet=0,
                 encoding=None,
                 skip_rows=0,
                 has_header=True):
        """Creates a XLSX spreadsheet data source stream.

        Attributes:

        * resource: file name, URL or file-like object
        * sheet: sheet index number (as int) or sheet name
        * has_header: flag determining whether first line contains header or
          not. ``True`` by default.
        """
        if isinstance(resource, openpyxl.Workbook):
            self.workbook = resource
        else:
            self.workbook = _load_workbook(resource)

        if isinstance(sheet, int):
            self.sheet = self.workbook.worksheets[sheet]
        elif isinstance(sheet, str):
            self.sheet = self.workbook[sheet]
        else:
            raise ArgumentError('sheet has to be a string or an integer')

        if has_header:
            self.first_row = skip_rows + 1
        else:
            self.first_row = skip_rows

        if fields:
            self.fields = fields
        else:
            rows = enumerate(self.sheet.rows)
            first_row = next(dropwhile(lambda x: x[0] < self.first_row,
                                       rows))[1]
            if has_header:
                header_row = next(self.sheet.rows)
                names = [str(c.value) for c in header_row]
            else:
                names = ['col%d' % i for i in range(len(first_row))]

            self.fields = FieldList()
            for name, cell in zip(names, first_row):
                if cell.is_date:
                    storage_type = 'date'
                else:
                    storage_type = CELL_TYPES.get(cell.data_type, 'unknown')
                field = Field(name, storage_type=storage_type)
                self.fields.append(field)
Beispiel #2
0
    def __init__(self, resource=None, fields=None, sheet=0,
                 encoding=None, skip_rows=0, has_header=True,
                 stop_empty_line=False):
        """Creates a XLSX spreadsheet data source stream.

        Attributes:

        * fields: `bubbles.metadata.FieldList` to use instead of auto-detection
        * resource: file name, URL or file-like object
        * sheet: sheet index number (as int) or sheet name
        * has_header: flag determining whether first line contains header or
          not. ``True`` by default.
        * stop_empty_line: flag to stop iteration over rows at the first
          encounter with an empty line. As XLSX files can contain millions
          or rows, this might cause very long iterations, especially if
          all the lines are empty past a certain point
        """
        if isinstance(resource, openpyxl.Workbook):
            self.workbook = resource
        else:
            self.workbook = _load_workbook(resource)

        if isinstance(sheet, int):
            self.sheet = self.workbook.worksheets[sheet]
        elif isinstance(sheet, str):
            self.sheet = self.workbook[sheet]
        else:
            raise ArgumentError('sheet has to be a string or an integer')

        if has_header:
            self.first_row = skip_rows + 1
        else:
            self.first_row = skip_rows

        self.stop_empty_line = stop_empty_line

        if fields:
            self.fields = fields
        else:
            rows = enumerate(self.sheet.rows)
            first_row = next(dropwhile(lambda x: x[0] < self.first_row,
                                       rows))[1]
            if has_header:
                header_rows = enumerate(self.sheet.rows)
                header_row = next(dropwhile(lambda x: x[0] < (self.first_row - 1),
                                            header_rows))[1]
                # fetch names, replace line breaks by spaces in case of
                # manual line wrapping
                names = [' '.join(str(c.value).split()) for c in header_row]
            else:
                names = ['col%d' % i for i in range(len(first_row))]

            self.fields = FieldList()
            for name, cell in zip(names, first_row):
                if cell.is_date:
                    storage_type = 'date'
                else:
                    storage_type = CELL_TYPES.get(cell.data_type, 'unknown')
                field = Field(name, storage_type=storage_type)
                self.fields.append(field)
Beispiel #3
0
    def test_skip(self):
        obj = XLSXObject(data_path("data.xlsx"), FieldList("number", "name"),
                        skip_rows=2)
        self.assertEqual(["number", "name"], obj.fields.names())

        rows = list(obj.rows())
        self.assertEqual(2, len(obj))
        self.assertEqual(2, len(rows))

        self.assertSequenceEqual([3.0, "Cecil"], rows[0])
Beispiel #4
0
    def test_no_header(self):
        with self.assertRaises(ArgumentError):
            obj = CSVSource(data_path("fruits-sk.csv"), read_header=False)

        fields = FieldList("id", "fruit", "type")
        obj = CSVSource(data_path("fruits-sk.csv"),
                        read_header=False,
                        fields=fields)
        self.assertEqual(["id", "fruit", "type"], obj.fields.names())

        rows = list(obj.rows())
        self.assertEqual(17, len(rows))

        self.assertEqual(["id", "fruit", "type"], rows[0])
        obj.release()
Beispiel #5
0
class XLSXObject(DataObject):
    __identifier__ = "xlsx"

    def __init__(self,
                 resource=None,
                 fields=None,
                 sheet=0,
                 encoding=None,
                 skip_rows=0,
                 has_header=True):
        """Creates a XLSX spreadsheet data source stream.

        Attributes:

        * resource: file name, URL or file-like object
        * sheet: sheet index number (as int) or sheet name
        * has_header: flag determining whether first line contains header or
          not. ``True`` by default.
        """
        if isinstance(resource, openpyxl.Workbook):
            self.workbook = resource
        else:
            self.workbook = _load_workbook(resource)

        if isinstance(sheet, int):
            self.sheet = self.workbook.worksheets[sheet]
        elif isinstance(sheet, str):
            self.sheet = self.workbook[sheet]
        else:
            raise ArgumentError('sheet has to be a string or an integer')

        if has_header:
            self.first_row = skip_rows + 1
        else:
            self.first_row = skip_rows

        if fields:
            self.fields = fields
        else:
            rows = enumerate(self.sheet.rows)
            first_row = next(dropwhile(lambda x: x[0] < self.first_row,
                                       rows))[1]
            if has_header:
                header_row = next(self.sheet.rows)
                names = [str(c.value) for c in header_row]
            else:
                names = ['col%d' % i for i in range(len(first_row))]

            self.fields = FieldList()
            for name, cell in zip(names, first_row):
                if cell.is_date:
                    storage_type = 'date'
                else:
                    storage_type = CELL_TYPES.get(cell.data_type, 'unknown')
                field = Field(name, storage_type=storage_type)
                self.fields.append(field)

    def _nrows(self):
        return sum([1 for _ in self.sheet.rows])

    def representations(self):
        return ["rows", "records"]

    def __len__(self):
        return self._nrows() - self.first_row

    def rows(self):
        if not self.fields:
            raise RuntimeError("Fields are not initialized")

        field_count = len(self.fields)
        rows = enumerate(self.sheet.rows)
        rows = dropwhile(lambda x: x[0] < self.first_row, rows)
        for _, row in rows:
            yield tuple([c.value for c in row[:field_count]])

    def records(self):
        fields = self.fields.names()
        for row in self.rows():
            yield dict(zip(fields, row))

    def is_consumable(self):
        return False