コード例 #1
0
ファイル: headers.py プロジェクト: asuffield/messytables
 def apply_headers(row_set, row):
     """Attach a column name to every cell of ``row``.

     Cells are paired positionally with ``headers`` (a name taken from the
     enclosing scope, not visible here). Positions where the row is shorter
     than the headers get a fresh ``Cell(None)``. A cell whose column is
     falsy after assignment is named ``column_<index>`` and flagged with
     ``column_autogenerated``. ``row_set`` is not used.

     Returns the list of labelled cells.
     """
     labelled = []
     for position, pair in enumerate(izip_longest(row, headers)):
         cell, header = pair
         # izip_longest pads the shorter side with None.
         cell = Cell(None) if cell is None else cell
         cell.column = header
         if not cell.column:
             cell.column = "column_%d" % position
             cell.column_autogenerated = True
         labelled.append(cell)
     return labelled
コード例 #2
0
 def apply_headers(row_set, row):
     """Label each cell of ``row`` with its header.

     ``headers`` comes from the surrounding scope (not shown in this
     snippet). Missing cells are replaced by ``Cell(None)``; cells left
     with a falsy column receive a generated ``column_<n>`` name and have
     ``column_autogenerated`` set to True. ``row_set`` is ignored.
     """
     result = []
     index = 0
     for cell, header in izip_longest(row, headers):
         if cell is None:
             # Row is shorter than the header list: synthesize an empty cell.
             cell = Cell(None)
         cell.column = header
         if not cell.column:
             cell.column = "column_%d" % index
             cell.column_autogenerated = True
         result.append(cell)
         index += 1
     return result
コード例 #3
0
ファイル: ods.py プロジェクト: ziggi0703/messytables
    def raw(self, sample=False):
        """Iterate over all rows in this sheet.

        Every match of ``ODS_ROW_MATCH`` in ``self.sheet`` is wrapped in
        ``self.namespace_tags`` (presumably the opening/closing tags that
        declare the XML namespaces -- confirm against the class) and parsed
        on its own. For each ``table-cell`` element that has children, a
        ``Cell`` is built from the first child's text, typed through
        ``ODS_TYPES`` using the cell's ``office:value-type`` attribute
        (``StringType`` when unknown).

        Yields one list of cells per row; rows that produce no cells are
        skipped. ``sample`` is accepted but not used here.
        """
        # Parsers report namespaced names in Clark notation, "{uri}local",
        # for attributes as well as tags. The previous colon-joined key could
        # never match, so every cell silently fell back to StringType.
        cell_tag = '{urn:oasis:names:tc:opendocument:xmlns:table:1.0}table-cell'
        type_attr = '{urn:oasis:names:tc:opendocument:xmlns:office:1.0}value-type'

        rows = ODS_ROW_MATCH.findall(self.sheet)

        for row in rows:
            row_data = []

            block = "{0}{1}{2}".format(self.namespace_tags[0], row,
                                       self.namespace_tags[1])
            partial = cStringIO.StringIO(block)

            for action, elem in etree.iterparse(partial, ('end', )):
                if elem.tag != cell_tag:
                    continue
                # list(elem) is the supported replacement for getchildren().
                children = list(elem)
                if children:
                    cell_type = elem.attrib.get(type_attr)
                    row_data.append(
                        Cell(children[0].text,
                             type=ODS_TYPES.get(cell_type, StringType())))

            # Release the buffer for skipped rows too.
            del partial

            if not row_data:
                # ignore blank lines
                continue

            yield row_data
        del rows
コード例 #4
0
ファイル: html.py プロジェクト: scraperdragon/messytables
    def raw(self, sample=False):
        """Yield one list of cells per ``<tr>`` found under ``self.sheet``.

        Cells with ``rowspan``/``colspan`` occupy several grid positions;
        every position other than the cell's own is recorded in
        ``blank_cells`` and filled in through ``insert_blank_cells`` when
        the affected row is emitted. When ``sample`` is true, iteration
        stops after the row whose index equals ``self.window``.
        """
        # Maps row index -> output columns already taken by a spanning cell,
        # ie row 2, cols 3,4,6: {2: [3,4,6]}
        blank_cells = defaultdict(list)
        for r, row in enumerate(self.sheet.xpath('.//tr')):
            # TODO: handle header nicer - preserve the fact it's a header!
            # TODO: only select those that are not children of subtables?
            html_cells = row.xpath('.//*[name()="td" or name()="th"]')

            # After this loop blank_cells is accurate for the current row.
            output_column = 0
            for html_cell in html_cells:
                assert type(r) == int
                # Step over columns claimed by earlier spans; they have no
                # counterpart in the source table.
                while output_column in blank_cells[r]:
                    output_column += 1
                rowspan = int(html_cell.attrib.get('rowspan', "1"))
                colspan = int(html_cell.attrib.get('colspan', "1"))
                last_column = output_column + colspan
                last_row = r + rowspan
                for x in range(output_column, last_column):
                    for y in range(r, last_row):
                        if x == output_column and y == r:
                            # The cell's own position is not blank.
                            continue
                        blank_cells[y].append(x)
                output_column += 1

            cells = []
            for cell in html_cells:
                markup = {'html': lxml.html.tostring(cell)}
                cells.append(Cell(cell.text_content(), properties=markup))
            yield insert_blank_cells(cells, blank_cells[r])
            if sample and r == self.window:
                return
            del blank_cells[r]
コード例 #5
0
    def raw(self, sample=False):
        """Yield each CSV record as a list of ``Cell`` objects.

        Lines come from ``self._sample`` and, unless ``sample`` is true,
        from ``self.lines`` as well. When ``PY2`` is set each line is
        encoded to UTF-8 before being handed to ``csv.reader``.

        Raises:
            messytables.ReadError: for any ``csv.Error`` other than the two
                tolerated ones (a "newline inside string" error while
                sampling, and a "line contains NULL byte" error), which end
                the iteration silently.
        """
        def rows():
            for line in self._sample:
                yield line.encode('utf-8') if PY2 else line
            if not sample:
                for line in self.lines:
                    yield line.encode('utf-8') if PY2 else line

        # Fix the maximum field size to something a little larger
        csv.field_size_limit(256000)

        try:
            for row in csv.reader(rows(),
                                  dialect=self._dialect, **self._overrides):
                yield [Cell(to_unicode_or_bust(c)) for c in row]
        except csv.Error as err:
            message = unicode_string(err)
            if u'newline inside string' in message and sample:
                pass
            elif u'line contains NULL byte' in message:
                pass
            else:
                # Interpolate the error into the message; passing it as a
                # second argument left the "%r" placeholder unformatted.
                raise messytables.ReadError(
                    'Error reading CSV: %r' % (err,))
コード例 #6
0
ファイル: ods.py プロジェクト: bwica/dpusher
def _read_cell(element):
    """Build a ``Cell`` from an ODS table-cell element.

    The element's value-type attribute selects the handling:
    no value type gives an empty string cell, ``string`` is delegated to
    ``_read_text_cell``, ``currency`` joins the value and the currency
    attribute with a space, and anything else takes the value attribute
    named by ``ODS_VALUE_TOKEN`` typed through ``ODS_TYPES`` (``StringType``
    when unknown).
    """
    attributes = element.attrib
    cell_type = attributes.get(_tag(NS_OPENDOCUMENT_OFFICE, VALUE_TYPE))
    value_token = ODS_VALUE_TOKEN.get(cell_type, 'value')

    if cell_type is None:
        return Cell(EMPTY_CELL_VALUE, type=StringType())
    if cell_type == 'string':
        return _read_text_cell(element)

    value = attributes.get(_tag(NS_OPENDOCUMENT_OFFICE, value_token))
    if cell_type == 'currency':
        currency = attributes.get(_tag(NS_OPENDOCUMENT_OFFICE, 'currency'))
        # NOTE(review): a missing value or currency attribute makes this
        # concatenation raise TypeError -- confirm inputs always carry both.
        return Cell(value + ' ' + currency, type=CurrencyType())
    return Cell(value, type=ODS_TYPES.get(cell_type, StringType()))
コード例 #7
0
 def raw(self, sample=False):
     """
     Yield one row of cells at a time.

     Tables exposing a ``cell_data`` attribute are wrapped cell by cell in
     ``PDFCell``; other tables are iterated directly and wrapped in
     ``Cell``. ``sample`` is not used.
     """
     if not hasattr(self.table, "cell_data"):
         # Old style: the table itself iterates over rows of raw values.
         for plain_row in self.table:
             yield [Cell(item) for item in plain_row]
         return

     # New style of cell data.
     for rich_row in self.table.cell_data:
         yield [PDFCell(item) for item in rich_row]
コード例 #8
0
ファイル: html.py プロジェクト: scraperdragon/messytables
def insert_blank_cells(row, blanks):
    """
    Given a list of values, insert blank cells at the indexes given by blanks.

    ``row`` is modified in place and also returned. The indexes are applied
    in ascending order so that, as long as the row is long enough, each
    blank ends up at the index requested regardless of the order in which
    ``blanks`` lists them (inserting a higher index first would shift it
    when a lower one is inserted afterwards).

    The letters in these examples should really be cells.
    >>> insert_blank_cells(["a","e","f"],[1,2,3])
    ['a', <Cell(String:>, <Cell(String:>, <Cell(String:>, 'e', 'f']
    """
    # DISCUSS: option to repeat top-left of col/rowspan.
    # or to identify that areas are a single cell, originally.
    for i in sorted(blanks):
        row.insert(i, Cell(""))
    return row
コード例 #9
0
ファイル: ods.py プロジェクト: bwica/dpusher
def _read_text_cell(element):
    """Build a string ``Cell`` from the text of an element's children.

    Each direct child contributes its ``text``, or ``EMPTY_CELL_VALUE`` when
    that text is missing or empty; the pieces are joined with newlines. An
    element without children yields ``EMPTY_CELL_VALUE``.
    """
    pieces = [
        child.text if child.text else EMPTY_CELL_VALUE
        for child in element.getchildren()
    ]
    cell_value = '\n'.join(pieces) if pieces else EMPTY_CELL_VALUE
    return Cell(cell_value, type=StringType())
コード例 #10
0
ファイル: excelx.py プロジェクト: domoritz/messytables
 def raw(self, sample=False):
     """ Iterate over all rows in this sheet, yielding a list of cells
     per row. Each cell is tagged with a type: DateType when the
     spreadsheet cell reports itself as a date, IntegerType when its
     data type is 'n', StringType otherwise. When sampling, at most
     self.window rows are read. """
     total_rows = self.sheet.get_highest_row()
     row_limit = min(self.window, total_rows) if sample else total_rows
     for i in xrange(row_limit):
         cells = []
         for sheet_cell in self.sheet.rows[i]:
             value = sheet_cell.value
             if sheet_cell.is_date():
                 cell_type = DateType(None)
             elif sheet_cell.data_type == 'n':
                 cell_type = IntegerType()
             else:
                 cell_type = StringType()
             cells.append(Cell(value, type=cell_type))
         yield cells
コード例 #11
0
    def sample(self):
        """Yield the records of ``self._sample_lines`` as lists of ``Cell``.

        The lines are parsed with ``csv.reader`` using ``self.delimiter``
        and ``self._dialect``. A ``csv.Error`` mentioning "newline inside
        string" or "line contains NULL byte" ends the iteration quietly;
        any other ``csv.Error`` is re-raised.
        """
        def rows():
            for line in self._sample_lines:
                yield line

        try:
            for row in csv.reader(rows(),
                                  delimiter=self.delimiter,
                                  dialect=self._dialect):
                yield [Cell(to_unicode_or_bust(c)) for c in row]
        # "except X as err" parses on Python 2.6+ and 3; the comma form is a
        # SyntaxError on Python 3.
        except csv.Error as err:
            # str() exists on both major versions, unlike unicode().
            message = str(err)
            if 'newline inside string' in message:
                pass
            elif 'line contains NULL byte' in message:
                pass
            else:
                raise
コード例 #12
0
ファイル: commas.py プロジェクト: igenieconsulting/dataproxy
 def raw(self, sample=False):
     """Yield each CSV record as a list of ``Cell`` objects.

     Lines are taken from ``self._sample`` and, unless ``sample`` is true,
     from ``self.lines`` afterwards. They are parsed with ``csv.reader``
     using ``self._dialect`` and ``self._overrides``.

     A ``csv.Error`` mentioning "newline inside string" (only while
     sampling) or "line contains NULL byte" ends the iteration quietly;
     any other ``csv.Error`` is re-raised.
     """
     def rows():
         for line in self._sample:
             yield line
         if not sample:
             for line in self.lines:
                 yield line
     try:
         for row in csv.reader(rows(),
                               dialect=self._dialect, **self._overrides):
             yield [Cell(to_unicode_or_bust(c)) for c in row]
     # "except X as err" parses on Python 2.6+ and 3; the comma form is a
     # SyntaxError on Python 3.
     except csv.Error as err:
         # str() exists on both major versions, unlike unicode().
         message = str(err)
         if 'newline inside string' in message and sample:
             pass
         elif 'line contains NULL byte' in message:
             pass
         else:
             raise
コード例 #13
0
 def raw(self, sample=False):
     """ Iterate over all rows in this sheet, yielding a list of cells
     per row. Each cell's type is looked up in XLS_TYPES from the
     spreadsheet cell's ctype (StringType when unknown). Values of
     date-typed cells are converted to datetime objects using the
     workbook's datemode; a date value of 0 raises ValueError. When
     sampling, at most self.window rows are read. """
     total_rows = self.sheet.nrows
     row_limit = min(self.window, total_rows) if sample else total_rows
     for i in xrange(row_limit):
         cells = []
         for j, sheet_cell in enumerate(self.sheet.row(i)):
             value = sheet_cell.value
             cell_type = XLS_TYPES.get(sheet_cell.ctype, StringType())
             if cell_type == DateType(None):
                 if value == 0:
                     raise ValueError('Invalid date at "%s":%d,%d' %
                                      (self.sheet.name, j + 1, i + 1))
                 # The tuple is (year, month, day, hour, minute, second).
                 date_parts = xlrd.xldate_as_tuple(
                     value, self.sheet.book.datemode)
                 value = datetime(*date_parts)
             cells.append(Cell(value, type=cell_type))
         yield cells
コード例 #14
0
    def raw(self, sample=False):
        """Yield each CSV record as a list of ``Cell`` objects.

        When ``sample`` is true only ``self._sample_lines`` is read;
        otherwise ``self.lines`` is chained after it. Lines are parsed with
        ``csv.reader`` using ``self.delimiter`` and ``self._dialect``.

        A ``csv.Error`` mentioning "newline inside string" (only while
        sampling) or "line contains NULL byte" ends the iteration quietly;
        any other ``csv.Error`` is re-raised.
        """
        def rows():
            if sample:
                generator = self._sample_lines
            else:
                generator = chain(self._sample_lines, self.lines)
            for line in generator:
                yield line

        try:
            for row in csv.reader(rows(),
                                  delimiter=self.delimiter,
                                  dialect=self._dialect):
                yield [Cell(to_unicode_or_bust(c)) for c in row]
        # "except X as err" parses on Python 2.6+ and 3; the comma form is a
        # SyntaxError on Python 3.
        except csv.Error as err:
            # str() exists on both major versions, unlike unicode().
            message = str(err)
            if 'newline inside string' in message and sample:
                pass
            elif 'line contains NULL byte' in message:
                pass
            else:
                raise
コード例 #15
0
    def raw(self, sample=False):
        """Yield each CSV record as a list of ``Cell`` objects.

        Lines are taken from ``self._sample`` and, unless ``sample`` is
        true, from ``self.lines`` afterwards. They are parsed with
        ``csv.reader`` using ``self._dialect`` and ``self._overrides``.
        Calling this also sets the csv module's field size limit to 256000.

        A ``csv.Error`` mentioning "newline inside string" (only while
        sampling) or "line contains NULL byte" ends the iteration quietly;
        any other ``csv.Error`` is re-raised.
        """
        def rows():
            for line in self._sample:
                yield line
            if not sample:
                for line in self.lines:
                    yield line

        # Fix the maximum field size to something a little larger
        csv.field_size_limit(256000)

        try:
            for row in csv.reader(rows(),
                                  dialect=self._dialect,
                                  **self._overrides):
                yield [Cell(to_unicode_or_bust(c)) for c in row]
        # "except X as err" parses on Python 2.6+ and 3; the comma form is a
        # SyntaxError on Python 3.
        except csv.Error as err:
            # str() exists on both major versions, unlike unicode().
            message = str(err)
            if 'newline inside string' in message and sample:
                pass
            elif 'line contains NULL byte' in message:
                pass
            else:
                raise
コード例 #16
0
ファイル: pdf.py プロジェクト: kanitw/messytables
 def raw(self, sample=False):
     """
     Yield one row of cells at a time.

     Every item of every row in ``self.table`` is wrapped in a ``Cell``.
     ``sample`` is not used.
     """
     for table_row in self.table:
         cells = []
         for item in table_row:
             cells.append(Cell(item))
         yield cells