Ejemplo n.º 1
0
    def check_headers(self, cells, sample):
        """Resolve header cells that have no schema field attached.

        Cells that already carry a 'field' are left alone. For every other
        cell, either a field is inferred from that column of ``sample`` (when
        field inference is enabled on this check) or an 'extra-header' error
        is produced and the cell is removed from ``cells`` in place.

        Args:
            cells: List of header cell dicts; mutated in place.
            sample: Iterable of rows used as inference input.

        Returns:
            List of the 'extra-header' errors created during this call.
        """
        found = []

        # Walk a snapshot: unmatched cells are removed from `cells` below
        for item in copy(cells):

            # Nothing to do when the cell is already bound to a field
            if 'field' in item:
                continue

            # Inference disabled: report the header and drop the column
            if not self.__infer_fields:
                found.append(Error('extra-header', item))
                cells.remove(item)
                continue

            # Build a one-column table from the sample; rows that are too
            # short for this column contribute None
            column_values = [
                [record[item['column-number'] - 1]
                 if len(record) >= item['column-number'] else None]
                for record in sample
            ]
            inferred = Schema()
            inferred.infer(column_values, headers=[item.get('header')])
            item['field'] = inferred.fields[0]

        return found
Ejemplo n.º 2
0
def test_infer():
    """Check the descriptor ``Schema.infer`` produces with default settings.

    The first row holds the column names; the 'age' column contains one
    non-numeric value ('N/A') and is still expected to come out as integer.
    """
    rows = [
        ['id', 'age', 'name'],
        ['1', '39', 'Paul'],
        ['2', '23', 'Jimmy'],
        ['3', '36', 'Jane'],
        ['4', 'N/A', 'Judy'],
    ]
    expected = {
        'fields': [
            {'format': 'default', 'name': 'id', 'type': 'integer'},
            {'format': 'default', 'name': 'age', 'type': 'integer'},
            {'format': 'default', 'name': 'name', 'type': 'string'},
        ],
        'missingValues': [''],
    }

    schema = Schema()
    schema.infer(rows)

    assert schema.descriptor == expected
Ejemplo n.º 3
0
def test_infer():
    """Check ``Schema.infer`` under three configurations.

    1. Default settings: 'age' is expected to be integer despite one 'N/A'.
    2. ``confidence=0.8``: 'age' is expected to fall back to string.
    3. A custom guesser that only ever proposes string: every column is
       expected to be string.
    """

    def build_rows(last_age):
        # Fresh lists for every scenario; only the last row's age varies
        return [
            ['id', 'age', 'name'],
            ['1', '39', 'Paul'],
            ['2', '23', 'Jimmy'],
            ['3', '36', 'Jane'],
            ['4', last_age, 'Judy'],
        ]

    def build_fields(id_type, age_type):
        # The 'name' column is expected to be string in every scenario
        return [
            {'format': 'default', 'name': 'id', 'type': id_type},
            {'format': 'default', 'name': 'age', 'type': age_type},
            {'format': 'default', 'name': 'name', 'type': 'string'},
        ]

    class AllStrings():
        def cast(self, value):
            return [('string', 'default', 0)]

    # Scenario 1: default confidence
    schema = Schema()
    schema.infer(build_rows('N/A'))
    assert schema.descriptor == {
        'fields': build_fields('integer', 'integer'),
        'missingValues': [''],
    }

    # Scenario 2: explicit confidence
    schema = Schema()
    schema.infer(build_rows('N/A'), confidence=0.8)
    assert schema.descriptor == {
        'fields': build_fields('integer', 'string'),
        'missingValues': [''],
    }

    # Scenario 3: custom guesser class
    schema = Schema()
    schema.infer(build_rows('100'), confidence=0.8, guesser_cls=AllStrings)
    assert schema.descriptor['fields'] == build_fields('string', 'string')
    assert schema.descriptor == {
        'fields': build_fields('string', 'string'),
        'missingValues': [''],
    }
Ejemplo n.º 4
0
    def __inspect_table(self, table):
        """Inspect one table and return its warnings and report.

        Opens the table's stream, optionally infers a schema from the stream
        sample, then runs the compiled head, body and table checks. All
        checks receive the same ``errors`` list.

        Args:
            table: Mapping providing the 'source', 'stream', 'schema' and
                'extra' keys, and optionally 'checks' (the inspector-level
                checks are used when that key is absent).

        Returns:
            A ``(warnings, report)`` tuple. ``warnings`` holds the messages
            added when the row limit or the error limit is reached.
            ``report`` is a copy of ``table['extra']`` updated with timing,
            validity, counts, stream metadata and the collected errors.
        """

        # Start timer
        start = datetime.datetime.now()

        # Prepare vars
        errors = []
        warnings = []
        headers = []
        row_number = 0
        fatal_error = False
        source = table['source']
        stream = table['stream']
        schema = table['schema']
        extra = table['extra']

        # Prepare checks
        # Table-specific checks take precedence over the inspector-wide ones
        checks = registry.compile_checks(table.get('checks', self.__checks),
                                         self.__skip_checks,
                                         order_fields=self.__order_fields,
                                         infer_fields=self.__infer_fields)

        # Prepare table
        # Any exception raised here is recorded as an error and marks the
        # inspection as fatal, which skips every later stage
        try:
            stream.open()
            sample = stream.sample
            headers = stream.headers
            if headers is None:
                # No header row: one None placeholder per column of the
                # first sample row (or no columns for an empty sample)
                headers = [None] * len(sample[0]) if sample else []
            if _filter_checks(checks, type='schema'):
                # Only infer when schema checks are requested, no schema
                # was supplied and schema inference is enabled
                if schema is None and self.__infer_schema:
                    schema = Schema()
                    schema.infer(sample, headers=headers)
            if schema is None:
                # Without a schema the schema-type checks are dropped
                checks = _filter_checks(checks, type='schema', inverse=True)
        except Exception as exception:
            fatal_error = True
            error = _compose_error_from_exception(exception)
            errors.append(error)

        # Prepare schema
        if not fatal_error:
            if schema:
                if schema.primary_key:
                    # Flag primary key fields on the descriptor, then commit
                    for field in schema.descriptor.get('fields', []):
                        if field.get('name') in schema.primary_key:
                            field['primaryKey'] = True
                    schema.commit()
                # Every schema error is reported and makes the run fatal
                for error in schema.errors:
                    fatal_error = True
                    error = _compose_error_from_schema_error(error)
                    errors.append(error)

        # Prepare cells
        if not fatal_error:
            cells = []
            fields = [None] * len(headers)
            if schema is not None:
                fields = schema.fields
            # Pair headers with fields by position; _FILLVALUE marks the
            # side that ran out, so that key is left off the cell
            iterator = zip_longest(headers, fields, fillvalue=_FILLVALUE)
            for number, (header, field) in enumerate(iterator, start=1):
                cell = {'number': number}
                if header is not _FILLVALUE:
                    cell['header'] = header
                    cell['value'] = header
                if field is not _FILLVALUE:
                    cell['field'] = field
                cells.append(cell)

        # Head checks
        if not fatal_error:
            # Skipped entirely when any header is None
            if None not in headers:
                head_checks = _filter_checks(checks, context='head')
                for check in head_checks:
                    if not cells:
                        break
                    # Use the `check_headers` attribute when the check has
                    # one, otherwise call the check itself
                    check_func = getattr(check['func'], 'check_headers',
                                         check['func'])
                    check_func(errors, cells, sample)
                # Errors recorded before this point always set fatal_error,
                # so everything in `errors` here came from the head checks
                for error in errors:
                    error['row'] = None

        # Body checks
        if not fatal_error:
            # Head cells by column number, as left by the head checks
            cellmap = {cell['number']: cell for cell in cells}
            body_checks = _filter_checks(checks, context='body')
            with stream:
                extended_rows = stream.iter(extended=True)
                while True:
                    # Iterate manually so that a failure while reading a row
                    # is reported as a fatal error instead of propagating
                    try:
                        row_number, _, row = next(extended_rows)
                    except StopIteration:
                        break
                    except Exception as exception:
                        fatal_error = True
                        error = _compose_error_from_exception(exception)
                        errors.append(error)
                        break
                    # Build this row's cells, reusing header and field from
                    # the matching head cell when there is one
                    cells = []
                    iterator = zip_longest(headers, row, fillvalue=_FILLVALUE)
                    for number, (header, value) in enumerate(iterator,
                                                             start=1):
                        cellref = cellmap.get(number, {})
                        cell = {'number': number}
                        if header is not _FILLVALUE:
                            cell['header'] = cellref.get('header', header)
                        if 'field' in cellref:
                            cell['field'] = cellref['field']
                        if value is not _FILLVALUE:
                            cell['value'] = value
                        cells.append(cell)
                    for check in body_checks:
                        if not cells:
                            break
                        # Use the `check_row` attribute when the check has
                        # one, otherwise call the check itself
                        check_func = getattr(check['func'], 'check_row',
                                             check['func'])
                        check_func(errors, cells, row_number)
                    # Walk back from the newest error and attach the current
                    # row to each error that has no 'row' key yet; stop at
                    # the first one that already has it
                    for error in reversed(errors):
                        if 'row' in error:
                            break
                        error['row'] = row
                    if row_number >= self.__row_limit:
                        warnings.append(
                            'Table "%s" inspection has reached %s row(s) limit'
                            % (source, self.__row_limit))
                        break
                    if len(errors) >= self.__error_limit:
                        warnings.append(
                            'Table "%s" inspection has reached %s error(s) limit'
                            % (source, self.__error_limit))
                        break

        # Table checks
        if not fatal_error:
            # Only checks exposing a `check_table` attribute take part
            for check in checks:
                check_func = getattr(check['func'], 'check_table', None)
                if check_func:
                    check_func(errors)

        # Stop timer
        stop = datetime.datetime.now()

        # Compose report
        # Headers are reported as None as soon as one of them is None
        headers = headers if None not in headers else None
        # Truncate to the error limit first, then sort what remains
        errors = errors[:self.__error_limit]
        errors = _sort_errors(errors)
        report = copy(extra)
        report.update({
            'time': round((stop - start).total_seconds(), 3),
            'valid': not bool(errors),
            'error-count': len(errors),
            'row-count': row_number,
            'source': source,
            'headers': headers,
            'scheme': stream.scheme,
            'format': stream.format,
            'encoding': stream.encoding,
            'schema': 'table-schema' if schema else None,
            'errors': errors,
        })

        return warnings, report
Ejemplo n.º 5
0
 def getSchema(self):
     """Run schema inference over ``self.data``.

     Returns:
         Whatever ``Schema.infer`` returns for ``self.data``.
     """
     return Schema().infer(self.data)
Ejemplo n.º 6
0
class TableExtractor(AbstractExtractor):
    """
    Extracts tables from HTML as structured content and plain text.

    The extractor is driven event by event through ``extract`` and keeps its
    parsing state (current row, current cell text, table section, nesting
    stack, anchor state) on the instance between calls.
    """
    def __init__(self):
        # (tag, text) pairs of the cells collected for the row being parsed
        self.__current_table_row = []
        # Text accumulated for the cell (or row fragment) being parsed
        self.__current_text = ''
        # True while inside a <table> element
        self.__is_table = False
        # Section flags, switched by <thead>/<tbody> and by row contents
        self.__is_table_head = False
        self.__is_table_body = False
        # Dict describing the table being built, None outside of tables
        self.__table_content = None
        # Saved parent state for nested tables
        self.__table_stack = []
        # Index assigned to the next table that starts
        self.__table_index = 1
        # Anchor state: inside an <a> with href, its text and its URL
        self.__is_anchor = False
        self.__anchor_text = ''
        self.__anchor_url = None
        # Schema instance reused for field inference on every table
        self.schema = Schema()

    def extract(self,
                el,
                ev,
                structured_content: List[Dict[str, Any]],
                text_list: List[str],
                nlp=None):
        """Process one parser event and update the extraction state.

        Args:
            el: Element the event refers to; its ``tag``, ``text``, ``tail``
                and ``get('href')`` are read.
            ev: Event name; 'start' and 'end' are the values handled.
            structured_content: Output list; finished tables and links found
                inside tables are appended to it as dicts.
            text_list: Output list; one joined string per non-empty table
                row is appended to it.
            nlp: Not used by this method.
        """
        if el.tag == 'table':
            if ev == 'start':
                if self.__is_table:
                    # Nested table: leave a '{table:N} ' placeholder in the
                    # parent's text, record the reference on the parent and
                    # save the parent's state until the nested table ends
                    ref = 'table:{}'.format(self.__table_index)
                    self.__current_text += f'{{{ref}}} '
                    self.__table_content.setdefault('references',
                                                    []).append(ref)
                    self.__table_stack.append(
                        (self.__current_table_row, self.__current_text,
                         self.__is_table_head, self.__is_table_body,
                         self.__table_content))
                # Start a fresh table state
                self.__current_table_row = []
                self.__current_text = ''
                self.__is_table = True
                self.__is_table_head = False
                self.__is_table_body = False
                self.__table_content = {
                    'type': 'table',
                    'index': self.__table_index,
                    'head': [],
                    'body': []
                }
                self.__table_index += 1

            elif ev == 'end':
                table = self.__table_content
                # Field inference only runs for tables with body rows
                if table['body']:
                    if table['head']:
                        # NOTE(review): table['head'] is a list of header
                        # rows (each a list of values) - confirm `infer`
                        # accepts that shape for `headers`
                        headers = table['head']
                        fields = self.schema.infer(table['body'],
                                                   headers=headers)['fields']
                    else:
                        # No header rows: infer with synthetic column names
                        # 'name1', 'name2', ... sized by the first body row
                        head = table['body'][0]
                        headers = [
                            'name%d' % (i + 1) for i in range(len(head))
                        ]
                        fields = self.schema.infer(table['body'],
                                                   headers=headers)['fields']
                        if len(table['body']) > 1:
                            # If any first-row value has a guessed type that
                            # differs from its column's inferred type, treat
                            # the first row as the header: move it to 'head'
                            # and use its values as the field names
                            dtypes = [field['type'] for field in fields]
                            if any([
                                    typ != guess_type(val)
                                    for typ, val in zip(dtypes, head)
                            ]):
                                table['head'] = [head]
                                table['body'] = table['body'][1:]
                                for field, name in zip(fields, head):
                                    field['name'] = name

                    table['fields'] = fields

                structured_content.append(table)
                if len(self.__table_stack):
                    # Nested table finished: resume the parent's state
                    (self.__current_table_row, self.__current_text,
                     self.__is_table_head, self.__is_table_body,
                     self.__table_content) = self.__table_stack.pop()
                else:
                    # Outermost table finished: reset everything, including
                    # the table index, which restarts at 1
                    self.__is_table_body = False
                    self.__is_table_head = False
                    self.__is_table = False
                    self.__current_text = ''
                    self.__current_table_row = []
                    self.__table_content = None
                    self.__table_index = 1

        elif self.__is_table:
            # noinspection SpellCheckingInspection
            if el.tag == 'thead' and ev == 'start':
                self.__is_table_head = True
                self.__is_table_body = False

            elif el.tag == 'tbody' and ev == 'start':
                self.__is_table_head = False
                self.__is_table_body = True

            elif el.tag == 'tr' and ev == 'end':
                if self.__is_current_table_row_not_empty():
                    values = [v for _, v in self.__current_table_row]
                    # NOTE(review): r'\t' is a raw string, so the separator
                    # is a backslash followed by 't', not a tab character -
                    # confirm this is intended
                    text_list.append(strip_link_markers(r'\t'.join(values)))
                    # A row is a body row unless we are in the head section,
                    # or we are not yet in the body and all cells are <th>
                    if not self.__is_table_head and (
                            self.__is_table_body
                            or not self.__is_header_row()):
                        self.__table_content['body'].append(values)
                        # From the first body row on, stay in body mode
                        self.__is_table_head = False
                        self.__is_table_body = True

                    else:
                        self.__table_content['head'].append(values)

                self.__current_text = ''
                self.__current_table_row = []

            elif el.tag == 'th':
                if ev == 'end':
                    self.__current_table_row.append(
                        ('th', clean_text(self.__current_text)))

                # Cleared on every event for this tag; the element's own
                # text is appended again by the accumulation step below
                self.__current_text = ''

            elif el.tag == 'td':
                if ev == 'end':
                    self.__current_table_row.append(
                        ('td', clean_text(self.__current_text)))

                # Cleared on every event for this tag; the element's own
                # text is appended again by the accumulation step below
                self.__current_text = ''

            elif el.tag == 'a':
                if ev == 'start':
                    # Only anchors with a non-empty href are tracked
                    anchor_url = el.get('href')
                    if anchor_url:
                        self.__is_anchor = True
                        self.__current_text += LINK_OPEN_MARKER
                        self.__anchor_url = el.get('href')

                elif ev == 'end' and self.__is_anchor:
                    self.__is_anchor = False
                    if self.__anchor_text.strip():
                        # Anchor has visible text: close the marker pair
                        # and emit a link entry
                        self.__current_text += LINK_CLOSE_MARKER
                        if self.__anchor_url and self.__anchor_text:
                            structured_content.append({
                                'type':
                                'link',
                                'url':
                                self.__anchor_url,
                                'text':
                                self.__anchor_text
                            })
                    else:
                        # Anchor text is blank: cut the text back to the
                        # last open marker and add a single space
                        n = self.__current_text.rfind(LINK_OPEN_MARKER)
                        self.__current_text = self.__current_text[:n] + ' '

                    self.__anchor_url = None
                    self.__anchor_text = ''

            # Text accumulation, run for every non-table element seen while
            # inside a table: element text on 'start', tail text on 'end'
            if ev == 'start' and el.text:
                self.__current_text += el.text
                if self.__is_anchor:
                    self.__anchor_text += el.text

            elif ev == 'end' and el.tail:
                self.__current_text += el.tail
                if self.__is_anchor:
                    self.__anchor_text += el.tail

    def __is_current_table_row_not_empty(self) -> bool:
        # True when at least one collected cell has non-empty text
        return any(v for _, v in self.__current_table_row)

    def __is_header_row(self) -> bool:
        # True when every collected cell is a <th> (also for an empty row)
        return all(k == 'th' for k, _ in self.__current_table_row)
Ejemplo n.º 7
0
def test_schema_infer_with_non_headers_issues_goodtables_258():
    """Check that a ``None`` header is expected to yield the name 'field1'."""
    single_column_rows = [[1], [2], [3]]
    missing_headers = [None]

    schema = Schema()
    schema.infer(single_column_rows, headers=missing_headers)

    assert schema.field_names == ['field1']