Exemple #1
0
def _read_cell(element):
    """Build a ``Cell`` from a single spreadsheet cell element.

    The element's value-type attribute selects how the cell is read:

    * no value-type: an empty string-typed cell (``EMPTY_CELL_VALUE``);
    * ``'string'``: delegated to ``_read_text_cell``;
    * ``'currency'``: the amount attribute followed by the currency
      attribute, separated by one space, typed as ``CurrencyType``;
    * anything else: the raw value attribute, typed through ``ODS_TYPES``
      (falling back to ``StringType``).

    :param element: element exposing an ``attrib`` mapping.
    :returns: the ``Cell`` built from the element.
    """
    cell_type = element.attrib.get(_tag(NS_OPENDOCUMENT_OFFICE, VALUE_TYPE))
    if cell_type is None:
        return Cell(EMPTY_CELL_VALUE, type=StringType())
    if cell_type == 'string':
        return _read_text_cell(element)

    # The attribute that carries the value depends on the value-type.
    value_token = ODS_VALUE_TOKEN.get(cell_type, 'value')
    value = element.attrib.get(_tag(NS_OPENDOCUMENT_OFFICE, value_token))

    if cell_type == 'currency':
        currency = element.attrib.get(_tag(NS_OPENDOCUMENT_OFFICE, 'currency'))
        # Either attribute may be absent; concatenating None would raise
        # TypeError, so degrade gracefully instead of aborting the read.
        if value is None:
            text = EMPTY_CELL_VALUE
        elif currency is None:
            text = value
        else:
            text = value + ' ' + currency
        return Cell(text, type=CurrencyType())

    return Cell(value, type=ODS_TYPES.get(cell_type, StringType()))
Exemple #2
0
    def raw(self, sample=False):
        """Yield every non-empty row of this sheet as a list of cells.

        Each row fragment matched by ``ODS_ROW_MATCH`` is wrapped in the
        sheet's namespace tags and parsed on its own. Only table-cell
        elements that have at least one child produce a ``Cell``; the cell
        value is the text of the first child. Rows that yield no cells are
        skipped.

        ``sample`` is accepted but not used by this implementation.
        """
        matched_rows = ODS_ROW_MATCH.findall(self.sheet)

        for fragment in matched_rows:
            document = "{0}{1}{2}".format(self.namespace_tags[0], fragment,
                                          self.namespace_tags[1])
            stream = cStringIO.StringIO(document)
            cells = []

            for _event, node in etree.iterparse(stream, ('end', )):
                if node.tag != '{urn:oasis:names:tc:opendocument:xmlns:table:1.0}table-cell':
                    continue
                # NOTE(review): this key uses ':' separators, while the tag
                # test above uses '{namespace}local' notation. If attributes
                # are exposed in that same notation, this lookup never
                # matches and every cell falls back to StringType - confirm.
                declared_type = node.attrib.get(
                    'urn:oasis:names:tc:opendocument:xmlns:office:1.0:value-type'
                )
                kids = node.getchildren()
                if not kids:
                    continue
                cells.append(
                    Cell(kids[0].text,
                         type=ODS_TYPES.get(declared_type, StringType())))

            if cells:
                # Drop the per-row buffer before handing the row out.
                del stream
                yield cells
        del matched_rows
Exemple #3
0
 def __init__(self, value, column=None, type=None):
     """Store the cell's value, column and type.

     When ``type`` is omitted the cell is typed as a ``StringType``
     (imported lazily from ``messytables.types``). The
     ``column_autogenerated`` flag always starts out ``False``.
     """
     if type is not None:
         cell_type = type
     else:
         from messytables.types import StringType
         cell_type = StringType()
     self.value, self.column = value, column
     self.column_autogenerated = False
     self.type = cell_type
Exemple #4
0
 def __init__(self, value=None, column=None, type=None, source=None):
     """Create a cell backed by an lxml element.

     ``value`` must be left as ``None`` and ``source`` must be an
     ``lxml.etree._Element``; both are enforced with assertions. The
     element is kept on ``self._lxml``. When ``type`` is omitted the cell
     is typed as a ``StringType`` (imported lazily).
     """
     assert value is None
     assert isinstance(source, lxml.etree._Element)
     self._lxml = source
     if type is not None:
         resolved_type = type
     else:
         from messytables.types import StringType
         resolved_type = StringType()
     self.type = resolved_type
     self.column = column
     self.column_autogenerated = False
Exemple #5
0
def _read_text_cell(element):
    """Build a string-typed ``Cell`` from the children of ``element``.

    The text of each child becomes one line of the cell value; a child
    with no (or empty) text contributes ``EMPTY_CELL_VALUE``. Lines are
    joined with newlines. An element without children produces
    ``EMPTY_CELL_VALUE``.
    """
    lines = [child.text or EMPTY_CELL_VALUE
             for child in element.getchildren()]
    text = '\n'.join(lines) if lines else EMPTY_CELL_VALUE
    return Cell(text, type=StringType())
Exemple #6
0
 def from_xlrdcell(xlrd_cell, sheet, col, row):
     """Wrap an xlrd cell in an ``XLSCell``.

     The cell type is looked up in ``XLS_TYPES`` by ``xlrd_cell.ctype``
     (defaulting to ``StringType``). Date-typed cells are converted to a
     ``datetime`` using the workbook's ``datemode``.

     :param xlrd_cell: object exposing ``value`` and ``ctype``.
     :param sheet: sheet the cell belongs to; ``sheet.book.datemode`` is
         used for date conversion.
     :param col: column index, stored as the second item of ``xlrd_pos``.
     :param row: row index, stored as the first item of ``xlrd_pos``.
     :raises InvalidDateError: if a date-typed cell holds the value 0.
     """
     payload = xlrd_cell.value
     messy_type = XLS_TYPES.get(xlrd_cell.ctype, StringType())
     if messy_type == DateType(None):
         if payload == 0:
             raise InvalidDateError
         date_parts = xlrd.xldate_as_tuple(payload, sheet.book.datemode)
         payload = datetime(*date_parts)
     wrapped = XLSCell(payload, type=messy_type)
     wrapped.sheet = sheet
     wrapped.xlrd_cell = xlrd_cell
     # Needed by the cell's properties; note the order is (row, col), not (x, y).
     wrapped.xlrd_pos = (row, col)
     return wrapped
Exemple #7
0
 def raw(self, sample=False):
     """Yield each row of the sheet as a list of ``Cell`` objects.

     Cells reporting ``is_date()`` are typed ``DateType(None)``, cells
     whose ``data_type`` is ``'n'`` are typed ``IntegerType``, and all
     others ``StringType``. When ``sample`` is true, at most
     ``self.window`` rows are produced.
     """
     total_rows = self.sheet.get_highest_row()
     limit = min(self.window, total_rows) if sample else total_rows
     for index in xrange(limit):
         cells = []
         for source_cell in self.sheet.rows[index]:
             payload = source_cell.value
             if source_cell.is_date():
                 kind = DateType(None)
             else:
                 numeric = source_cell.data_type == 'n'
                 kind = IntegerType() if numeric else StringType()
             cells.append(Cell(payload, type=kind))
         yield cells
Exemple #8
0
    def __init__(self, pdftables_cell):
        """Wrap a pdftables cell.

        A cell flagged ``topleft`` exposes its ``content`` as the value and
        records its ``size`` as ``colspan``/``rowspan`` properties. Any
        other cell gets an empty value and no properties. The cell is
        always typed as ``StringType`` and starts without a column.
        """
        self._cell = pdftables_cell

        if not pdftables_cell.topleft:
            self._properties = {}
            self.value = ""
        else:
            width, height = pdftables_cell.size
            self._properties = {'colspan': width, 'rowspan': height}
            self.value = pdftables_cell.content

        self.column = None
        self.column_autogenerated = False
        self.type = StringType()
Exemple #9
0
 def raw(self, sample=False):
     """Yield each row of the sheet as a list of ``Cell`` objects.

     Cell types come from ``XLS_TYPES`` keyed by the cell's ``ctype``
     (defaulting to ``StringType``). Date-typed values are converted to
     ``datetime`` using the workbook's ``datemode``. When ``sample`` is
     true, at most ``self.window`` rows are produced.

     Raises ``ValueError`` when a date-typed cell holds the value 0.
     """
     total_rows = self.sheet.nrows
     limit = min(self.window, total_rows) if sample else total_rows
     for row_index in xrange(limit):
         cells = []
         for col_index, xl_cell in enumerate(self.sheet.row(row_index)):
             payload = xl_cell.value
             kind = XLS_TYPES.get(xl_cell.ctype, StringType())
             if kind == DateType(None):
                 if payload == 0:
                     # Positions in the message are 1-based: column, then row.
                     location = (self.sheet.name, col_index + 1,
                                 row_index + 1)
                     raise ValueError('Invalid date at "%s":%d,%d' % location)
                 date_parts = xlrd.xldate_as_tuple(
                     payload, self.sheet.book.datemode)
                 payload = datetime(*date_parts)
             cells.append(Cell(payload, type=kind))
         yield cells
Exemple #10
0
from datetime import datetime
import xlrd

from messytables.core import RowSet, TableSet, Cell
from messytables.types import StringType, IntegerType, \
        DateType, FloatType

# Lookup table from integer cell-type codes to messytables type instances.
# NOTE(review): the keys are presumably xlrd's ``ctype`` codes - confirm
# against the code that reads from this table.
XLS_TYPES = {
    1: StringType(),
    # NB: Excel does not distinguish floats from integers, so numeric cells
    # are mapped to floats. Telling floats and ints apart could be added
    # later, either by actual type detection or by using the Excel format
    # string info - see
    # https://groups.google.com/forum/?fromgroups=#!topic/
    #  python-excel/cAQ1ndsCVxk
    2: FloatType(),
    3: DateType(None),
    # This code is actually boolean, but there is no boolean type yet, so
    # it is mapped to an integer.
    4: IntegerType()
}


class XLSTableSet(TableSet):
    """An excel workbook wrapper object.
    """
    def __init__(self,
                 fileobj=None,
                 filename=None,
                 window=None,
                 encoding=None):
        '''Initilize the tableset.
Exemple #11
0
def _map_openpyxl_type_to_messytable_type(type_str):
  """Return the type registered for ``type_str`` in OPENPYXL_TYPE_MAPPING.

  Falls back to a fresh ``StringType`` when the code is not in the
  mapping (or maps to a falsy value).
  """
  return OPENPYXL_TYPE_MAPPING.get(type_str, None) or StringType()
Exemple #12
0
    The return value is a tuple of the offset of the header row
    and the names of the columns.
    """
    rows = list(rows)
    modal = column_count_modal(rows)
    for i, row in enumerate(rows):
        length = len([c for c in row if c.value])
        if length >= modal - tolerance:
            # TODO: use type guessing to check that this row has
            # strings and does not conform to the type schema of
            # the table.
            return i, [u'{}'.format(c.value) for c in row if c.value]
    return 0, []

# Lookup table from cell type codes to messytables type instances.
# Numeric codes map to IntegerType.
# NOTE(review): 'd' is a bare literal while the other keys are named
# constants; presumably it is the date type code - confirm.
OPENPYXL_TYPE_MAPPING = {
  TYPE_STRING: StringType(),
  TYPE_BOOL: BoolType(),
  TYPE_NUMERIC: IntegerType(),
  'd': DateUtilType()
}

def _get_type_weight(type_str):
  """Return the ``guessing_weight`` of the type matching ``type_str``."""
  # String cells use the StringType weight.
  if type_str == TYPE_STRING:
    return StringType.guessing_weight

  if type_str == TYPE_NUMERIC:
    return IntegerType.guessing_weight # The IntegerType weight is used for every numeric code, decimals and floats included

  # Boolean cells use the BoolType weight.
  if type_str == TYPE_BOOL:
    return BoolType.guessing_weight