def apply_headers(row_set, row):
    """ Label every cell of ``row`` with a column name and return the result.

    Cells are paired positionally with ``headers``, a name that is not a
    parameter here and is resolved from an outer scope.  ``izip_longest``
    pads the shorter side with ``None``, so the returned list is as long as
    the longer of ``row`` and ``headers``.

    ``row_set`` is accepted but not used by this function.
    """
    labelled = []
    for position, (cell, header) in enumerate(izip_longest(row, headers)):
        # The row ran out before the headers did: stand in an empty cell.
        if cell is None:
            cell = Cell(None)
        cell.column = header
        # No usable header for this position: synthesise one and flag it.
        if not cell.column:
            cell.column = "column_%d" % position
            cell.column_autogenerated = True
        labelled.append(cell)
    return labelled
def raw(self, sample=False):
    """ Iterate over all rows in this sheet.

    Every match of ``ODS_ROW_MATCH`` in ``self.sheet`` is wrapped between
    ``self.namespace_tags[0]`` and ``self.namespace_tags[1]`` and parsed on
    its own with ``etree.iterparse``.  Each ``table-cell`` element that has
    at least one child contributes one ``Cell`` holding the text of its
    first child; cells without children are skipped.  Rows that end up with
    no cells are not yielded.

    The cell type is looked up in ``ODS_TYPES`` from the cell's
    ``office:value-type`` attribute, falling back to ``StringType()``.

    ``sample`` is accepted but not used by this implementation.
    """
    # Namespaced tags *and* attributes are exposed by ElementTree-style
    # parsers in Clark notation, i.e. "{namespace-uri}local-name".  Without
    # the braces the attribute lookup can never match and every cell would
    # silently be typed as a string.
    table_cell_tag = \
        '{urn:oasis:names:tc:opendocument:xmlns:table:1.0}table-cell'
    value_type_attr = \
        '{urn:oasis:names:tc:opendocument:xmlns:office:1.0}value-type'

    rows = ODS_ROW_MATCH.findall(self.sheet)

    for row in rows:
        row_data = []

        block = "{0}{1}{2}".format(self.namespace_tags[0],
                                   row,
                                   self.namespace_tags[1])
        partial = cStringIO.StringIO(block)

        for action, elem in etree.iterparse(partial, ('end', )):
            if elem.tag != table_cell_tag:
                continue
            cell_type = elem.attrib.get(value_type_attr)
            children = elem.getchildren()
            if children:
                c = Cell(children[0].text,
                         type=ODS_TYPES.get(cell_type, StringType()))
                row_data.append(c)

        # Drop the per-row buffer before yielding (or skipping) the row.
        del partial

        if not row_data:
            # ignore blank lines
            continue

        yield row_data
    del rows
def raw(self, sample=False):
    """ Yield each ``tr`` of ``self.sheet`` as a list of cells.

    Cells covered by a ``rowspan``/``colspan`` of another cell are
    represented by blank cells, which ``insert_blank_cells`` adds at the
    recorded column indexes.  Every real cell carries its serialised HTML
    in ``properties['html']``.

    When ``sample`` is true, iteration stops after the row whose index
    equals ``self.window`` has been yielded.
    """
    # Columns to fill with blanks, keyed by output row.
    # ie row 2, cols 3,4,6: {2: [3,4,6]}
    blank_cells = defaultdict(list)

    for row_index, tr in enumerate(self.sheet.xpath('.//tr')):
        # TODO: handle header nicer - preserve the fact it's a header!
        # TODO: only select those that are not children of subtables?
        html_cells = tr.xpath('.//*[name()="td" or name()="th"]')

        # at the end of this chunk, you will have an accurate blank_cells.
        column = 0
        for html_cell in html_cells:
            assert type(row_index) == int
            # pass over columns that don't exist in the source table
            # because an earlier span already covers them
            while column in blank_cells[row_index]:
                column += 1

            attributes = html_cell.attrib
            rowspan = int(attributes.get('rowspan', "1"))
            colspan = int(attributes.get('colspan', "1"))
            for x in range(column, column + colspan):
                for y in range(row_index, row_index + rowspan):
                    if (x, y) == (column, row_index):
                        # don't skip current cell
                        continue
                    blank_cells[y].append(x)
            column += 1

        cells = []
        for html_cell in html_cells:
            cells.append(
                Cell(html_cell.text_content(),
                     properties={'html': lxml.html.tostring(html_cell)}))

        yield insert_blank_cells(cells, blank_cells[row_index])
        if sample and row_index == self.window:
            return
        # This row is finished; its bookkeeping is no longer needed.
        del blank_cells[row_index]
def raw(self, sample=False):
    """ Yield the rows of the CSV data as lists of ``Cell`` objects.

    Lines come from ``self._sample`` and, unless ``sample`` is true, from
    ``self.lines`` afterwards.  When ``PY2`` is truthy each line is encoded
    to UTF-8 before it reaches ``csv.reader``.  Every field is passed
    through ``to_unicode_or_bust`` before being wrapped in a ``Cell``.

    Side effect: raises the process-wide ``csv`` field size limit to
    256000.

    A ``csv.Error`` mentioning "line contains NULL byte" ends iteration
    silently, as does one mentioning "newline inside string" while
    sampling.  Any other ``csv.Error`` is re-raised as
    ``messytables.ReadError``.
    """
    def rows():
        for line in self._sample:
            yield line.encode('utf-8') if PY2 else line
        if not sample:
            for line in self.lines:
                yield line.encode('utf-8') if PY2 else line

    # Fix the maximum field size to something a little larger
    csv.field_size_limit(256000)

    try:
        for row in csv.reader(rows(),
                              dialect=self._dialect, **self._overrides):
            yield [Cell(to_unicode_or_bust(c)) for c in row]
    except csv.Error as err:
        message = unicode_string(err)
        if u'newline inside string' in message and sample:
            pass
        elif u'line contains NULL byte' in message:
            pass
        else:
            # Interpolate the error into the message; passing it as a second
            # constructor argument would leave a literal "%r" in the text.
            raise messytables.ReadError('Error reading CSV: %r' % (err,))
def _read_cell(element):
    """ Build a ``Cell`` from a table-cell ``element``.

    The office ``VALUE_TYPE`` attribute selects the branch:

    * absent: an empty string-typed cell (``EMPTY_CELL_VALUE``);
    * ``'string'``: delegated to ``_read_text_cell``;
    * ``'currency'``: the value attribute followed by a space and the
      ``currency`` attribute, typed ``CurrencyType()``;
    * anything else: the value attribute, typed via ``ODS_TYPES`` with a
      ``StringType()`` fallback.

    The name of the value attribute comes from ``ODS_VALUE_TOKEN`` and
    defaults to ``'value'``.
    """
    cell_type = element.attrib.get(_tag(NS_OPENDOCUMENT_OFFICE, VALUE_TYPE))
    if cell_type is None:
        return Cell(EMPTY_CELL_VALUE, type=StringType())
    if cell_type == 'string':
        return _read_text_cell(element)

    value_token = ODS_VALUE_TOKEN.get(cell_type, 'value')
    value = element.attrib.get(_tag(NS_OPENDOCUMENT_OFFICE, value_token))

    if cell_type == 'currency':
        currency = element.attrib.get(
            _tag(NS_OPENDOCUMENT_OFFICE, 'currency'))
        # Either attribute may be missing, in which case .get() returns
        # None; only append the currency code when both parts are present
        # instead of failing on str + None.
        if value is not None and currency is not None:
            value = value + ' ' + currency
        return Cell(value, type=CurrencyType())

    return Cell(value, type=ODS_TYPES.get(cell_type, StringType()))
def raw(self, sample=False):
    """ Yield one row of cells at a time.

    If ``self.table`` has a ``cell_data`` attribute, its rows are wrapped
    in ``PDFCell``; otherwise ``self.table`` itself is iterated and each
    item is wrapped in a plain ``Cell``.

    ``sample`` is accepted but not used by this implementation.
    """
    table = self.table

    if not hasattr(table, "cell_data"):
        # Old style: the table is directly iterable as rows of values.
        for plain_row in table:
            yield [Cell(item) for item in plain_row]
        return

    # New style of cell data.
    for rich_row in table.cell_data:
        yield [PDFCell(item) for item in rich_row]
def insert_blank_cells(row, blanks):
    """
    Given a list of values, insert blank cells at the indexes given by blanks
    The letters in these examples should really be cells.
    >>> insert_blank_cells(["a","e","f"],[1,2,3])
    ['a', <Cell(String:>, <Cell(String:>, <Cell(String:>, 'e', 'f']

    ``row`` is modified in place and also returned.  ``blanks`` holds
    indexes into the finished row and may be given in any order.
    """
    # DISCUSS: option to repeat top-left of col/rowspan.
    # or to identify that areas are a single cell, originally.

    # Insert in ascending order: each index refers to a position in the
    # final row, and inserting a lower index after a higher one would shift
    # the cells already placed to its right.
    for i in sorted(blanks):
        row.insert(i, Cell(""))
    return row
def _read_text_cell(element):
    """ Build a string-typed ``Cell`` from the children of ``element``.

    Each child contributes its ``text``, or ``EMPTY_CELL_VALUE`` when that
    text is empty or missing; the pieces are joined with newlines.  An
    element without children yields ``EMPTY_CELL_VALUE``.
    """
    pieces = [child.text or EMPTY_CELL_VALUE
              for child in element.getchildren()]
    cell_value = '\n'.join(pieces) if pieces else EMPTY_CELL_VALUE
    return Cell(cell_value, type=StringType())
def raw(self, sample=False):
    """ Iterate over all rows in this sheet.

    Each cell is typed from the spreadsheet's own metadata: cells for
    which ``is_date()`` is true become ``DateType(None)``, cells whose
    ``data_type`` is ``'n'`` become ``IntegerType()``, and everything else
    is ``StringType()``.  The cell value is passed through unchanged.

    When ``sample`` is true, at most ``self.window`` rows are produced.
    """
    total = self.sheet.get_highest_row()
    limit = min(self.window, total) if sample else total

    for index in xrange(limit):
        cells = []
        for sheet_cell in self.sheet.rows[index]:
            value = sheet_cell.value
            if sheet_cell.is_date():
                cell_type = DateType(None)
            elif sheet_cell.data_type == 'n':
                cell_type = IntegerType()
            else:
                cell_type = StringType()
            cells.append(Cell(value, type=cell_type))
        yield cells
def sample(self):
    """ Yield the rows parsed from ``self._sample_lines`` as lists of cells.

    The lines are read with ``csv.reader`` using ``self.delimiter`` and
    ``self._dialect``; each field goes through ``to_unicode_or_bust``
    before being wrapped in a ``Cell``.

    A ``csv.Error`` mentioning "newline inside string" or "line contains
    NULL byte" ends iteration silently; any other ``csv.Error`` propagates.
    """
    def sample_lines():
        for text in self._sample_lines:
            yield text

    try:
        reader = csv.reader(sample_lines(),
                            delimiter=self.delimiter,
                            dialect=self._dialect)
        for fields in reader:
            yield [Cell(to_unicode_or_bust(field)) for field in fields]
    except csv.Error as err:
        message = unicode(err)
        tolerated = ('newline inside string' in message
                     or 'line contains NULL byte' in message)
        if not tolerated:
            raise
def raw(self, sample=False):
    """ Yield the rows of the CSV data as lists of ``Cell`` objects.

    Lines are taken from ``self._sample`` and, unless ``sample`` is true,
    from ``self.lines`` afterwards.  They are parsed by ``csv.reader`` with
    ``self._dialect`` and the keyword overrides in ``self._overrides``;
    each field goes through ``to_unicode_or_bust``.

    A ``csv.Error`` mentioning "line contains NULL byte" ends iteration
    silently, as does one mentioning "newline inside string" while
    sampling.  Any other ``csv.Error`` propagates.
    """
    def line_source():
        for text in self._sample:
            yield text
        if sample:
            return
        for text in self.lines:
            yield text

    try:
        reader = csv.reader(line_source(),
                            dialect=self._dialect,
                            **self._overrides)
        for fields in reader:
            yield [Cell(to_unicode_or_bust(field)) for field in fields]
    except csv.Error as err:
        message = unicode(err)
        if 'newline inside string' in message and sample:
            return
        if 'line contains NULL byte' in message:
            return
        raise
def raw(self, sample=False):
    """ Iterate over all rows in this sheet.

    Cell types are looked up in ``XLS_TYPES`` by the cell's ``ctype``,
    falling back to ``StringType()``.  Values of cells typed as
    ``DateType(None)`` are converted to ``datetime`` objects with
    ``xlrd.xldate_as_tuple`` and the workbook's ``datemode``.

    When ``sample`` is true, at most ``self.window`` rows are produced.

    Raises ``ValueError`` for a date cell whose value equals 0; the
    message carries the sheet name followed by the 1-based column and row.
    """
    total = self.sheet.nrows
    limit = min(self.window, total) if sample else total

    for i in xrange(limit):
        cells = []
        for j, xls_cell in enumerate(self.sheet.row(i)):
            value = xls_cell.value
            cell_type = XLS_TYPES.get(xls_cell.ctype, StringType())
            if cell_type == DateType(None):
                if value == 0:
                    raise ValueError('Invalid date at "%s":%d,%d' % (
                        self.sheet.name, j + 1, i + 1))
                date_parts = xlrd.xldate_as_tuple(
                    value, self.sheet.book.datemode)
                year, month, day, hour, minute, second = date_parts
                value = datetime(year, month, day, hour, minute, second)
            cells.append(Cell(value, type=cell_type))
        yield cells
def raw(self, sample=False):
    """ Yield the rows of the CSV data as lists of ``Cell`` objects.

    With ``sample`` true only ``self._sample_lines`` is read; otherwise
    ``self._sample_lines`` is chained with ``self.lines``.  Lines are
    parsed by ``csv.reader`` using ``self.delimiter`` and
    ``self._dialect``; each field goes through ``to_unicode_or_bust``.

    A ``csv.Error`` mentioning "line contains NULL byte" ends iteration
    silently, as does one mentioning "newline inside string" while
    sampling.  Any other ``csv.Error`` propagates.
    """
    def line_source():
        source = (self._sample_lines if sample
                  else chain(self._sample_lines, self.lines))
        for text in source:
            yield text

    try:
        reader = csv.reader(line_source(),
                            delimiter=self.delimiter,
                            dialect=self._dialect)
        for fields in reader:
            yield [Cell(to_unicode_or_bust(field)) for field in fields]
    except csv.Error as err:
        message = unicode(err)
        if 'newline inside string' in message and sample:
            return
        if 'line contains NULL byte' in message:
            return
        raise
def raw(self, sample=False):
    """ Yield the rows of the CSV data as lists of ``Cell`` objects.

    Lines are taken from ``self._sample`` and, unless ``sample`` is true,
    from ``self.lines`` afterwards.  They are parsed by ``csv.reader`` with
    ``self._dialect`` and the keyword overrides in ``self._overrides``;
    each field goes through ``to_unicode_or_bust``.

    Side effect: raises the process-wide ``csv`` field size limit to
    256000.

    A ``csv.Error`` mentioning "line contains NULL byte" ends iteration
    silently, as does one mentioning "newline inside string" while
    sampling.  Any other ``csv.Error`` propagates.
    """
    def line_source():
        for text in self._sample:
            yield text
        if sample:
            return
        for text in self.lines:
            yield text

    # Fix the maximum field size to something a little larger
    csv.field_size_limit(256000)

    try:
        reader = csv.reader(line_source(),
                            dialect=self._dialect,
                            **self._overrides)
        for fields in reader:
            yield [Cell(to_unicode_or_bust(field)) for field in fields]
    except csv.Error as err:
        message = unicode(err)
        swallow = (('newline inside string' in message and sample)
                   or 'line contains NULL byte' in message)
        if not swallow:
            raise
def raw(self, sample=False):
    """ Yield one row of cells at a time.

    Every item of every row in ``self.table`` is wrapped in a ``Cell``.
    ``sample`` is accepted but not used by this implementation.
    """
    for table_row in self.table:
        cells = []
        for item in table_row:
            cells.append(Cell(item))
        yield cells