Ejemplo n.º 1
0
def condense(engine, resource_id, table_id, force):
    """Copy the mapped columns of one raw table into its spending table.

    Returns without doing anything when the
    ``raw_<resource_id>_table<table_id>`` table does not exist, or when the
    ``condensed`` table already has an entry for this resource/table pair
    and ``force`` is false.

    Otherwise, inside a single transaction, the matching ``spending_...``
    table is dropped and fetched again, the raw table's normalised headers
    are looked up in ``column_sets``, and every raw row is re-keyed through
    the stored column map.  The transaction is committed only when a valid
    mapping row was found; without one the function returns before
    committing.

    :param engine: database engine used for the lookups and to open the
        working connection.
    :param resource_id: identifier used in the table names and as key in
        ``condensed``.
    :param table_id: index of the table within the resource.
    :param force: when true, condense again even if already recorded.
    """
    suffix = '%s_table%s' % (resource_id, table_id)
    raw_name = 'raw_%s' % suffix
    spending_name = 'spending_%s' % suffix

    if not engine.has_table(raw_name):
        return

    condensed_table = sl.get_table(engine, 'condensed')

    # Unless forced, leave alone anything that was condensed before.
    if not force:
        previous = sl.find_one(engine, condensed_table,
                               resource_id=resource_id, table_id=table_id)
        if previous is not None:
            return

    connection = engine.connect()
    trans = connection.begin()
    started = time.time()

    try:
        raw_table = sl.get_table(connection, raw_name)
        sl.drop_table(connection, spending_name)
        spending_table = sl.get_table(connection, spending_name)
        columns_table = sl.get_table(connection, 'column_sets')

        normalise_map = normalised_columns_map(raw_table)
        header_key = ','.join(sorted(normalise_map.values()))
        mapping_row = sl.find_one(connection, columns_table,
                                  normalised=header_key)

        # An unmapped (or not yet validated) table cannot be condensed.
        if mapping_row is None or not mapping_row.get('valid'):
            return

        column_mapping = json.loads(mapping_row['column_map'])

        # Resolve raw column -> output column, dropping normalised names
        # that have no (or an empty) target in the stored map.
        mapping = {}
        for raw_column, normalised in normalise_map.iteritems():
            if normalised not in column_mapping:
                continue
            target = column_mapping[normalised]
            if target is not None and len(target) > 0:
                mapping[raw_column] = target

        for raw_row in sl.all(connection, raw_table):
            condensed_row = {}
            for column, value in raw_row.items():
                if column not in mapping:
                    continue
                cleaned = value.strip() if value else ''
                if not len(cleaned):
                    continue
                # The first non-blank value wins when several raw columns
                # map onto the same output column.
                condensed_row.setdefault(mapping[column], cleaned)
            sl.add_row(connection, spending_table, condensed_row)

        record = {
            'resource_id': resource_id,
            'table_id': table_id,
            'condense_time': time.time() - started,
        }
        sl.upsert(connection, condensed_table, record,
                  ['resource_id', 'table_id'])
        trans.commit()
    finally:
        connection.close()
Ejemplo n.º 2
0
def extract_resource_core(engine, row, stats):
    """Load every usable sheet of a resource's source file into raw tables.

    The file located by ``source_path(row)`` is read and sniffed by its
    leading bytes: ``COMPDOC_SIGNATURE`` selects ``XLSTableSet``, ``PK``
    selects ``XLSXTableSet`` and anything else is parsed with
    ``CSVTableSet``.  Each sheet with more than one detected header is
    written to a freshly dropped and re-fetched
    ``raw_<resource_id>_sheet<n>`` table, one row per non-blank input row,
    with an added ``row_number`` column.

    Empty files, HTML pages and PDFs are rejected up front.  Any exception
    raised while parsing or loading is logged and the transaction is rolled
    back.  In both cases the problem is recorded through ``issue`` and
    ``stats.add_source`` rather than raised.

    The source file handle and the database connection are always closed
    before returning.

    :param engine: database engine; used to open the working connection and
        passed to ``issue`` for problem reports.
    :param row: mapping providing ``resource_id`` and ``retrieve_hash``; it
        is also handed to ``source_path`` and ``stats.add_source``.
    :param stats: collector whose ``add_source(category, row)`` receives the
        outcome.
    :returns: ``(success, sheets)`` where ``sheets`` is the number of sheets
        loaded and ``success`` is true when that number is positive.
    """
    connection = engine.connect()
    try:
        with open(source_path(row), 'rb') as fh:
            source_data = fh.read()

            if not len(source_data):
                issue(engine, row['resource_id'], row['retrieve_hash'], STAGE,
                      "Empty file")
                stats.add_source('Empty file', row)
                return False, 0
            if html_re.search(source_data[0:1024]) is not None:
                issue(engine, row['resource_id'], row['retrieve_hash'], STAGE,
                      "HTML file detected, not a transaction report")
                stats.add_source('HTML file', row)
                return False, 0
            if source_data.startswith('%PDF'):
                issue(engine, row['resource_id'], row['retrieve_hash'], STAGE,
                      "PDF file detected, not a transaction report")
                stats.add_source('PDF file', row)
                return False, 0

            trans = connection.begin()
            committed = False
            try:
                if source_data.startswith(COMPDOC_SIGNATURE):
                    fh.seek(0)
                    table_set = XLSTableSet(fh)
                elif source_data.startswith('PK'):
                    table_set = XLSXTableSet(filename=source_path(row))
                else:
                    from StringIO import StringIO
                    sio = StringIO(source_data)

                    encoding = None
                    detected = chardet.detect(source_data[:200])
                    log.debug('Encoding detected as: %s', detected.get('encoding'))
                    if detected.get('encoding') == 'ISO-8859-2' and '\xa3' in source_data:
                        # Detected as Latin2 but probably isn't - that is for Eastern
                        # European languages.  Probably because the presence of a GBP
                        # pound sign has foxed chardet. It is pretty certain that it is
                        # a single-byte ASCII-variant, and my money is on Windows-1252
                        encoding = 'windows-1252'
                        log.debug('Probably not ISO-8859-2 because it has GBP symbol, so assuming it is Windows-1252')

                    table_set = CSVTableSet(sio, encoding=encoding)

                sheets = 0
                for sheet_id, row_set in enumerate(table_set.tables):
                    offset, headers = headers_guess(row_set.sample)
                    # A real list is needed because its length is checked below.
                    headers = [convert_(header) for header in headers]
                    log.debug("Headers: %r", headers)
                    # Sheets with at most one header are skipped.
                    if len(headers) <= 1:
                        continue
                    sheets += 1

                    row_set.register_processor(headers_processor(headers))
                    row_set.register_processor(offset_processor(offset + 1))

                    raw_table_name = 'raw_%s_sheet%s' % (row['resource_id'], sheet_id)
                    sl.drop_table(connection, raw_table_name)
                    raw_table = sl.get_table(connection, raw_table_name)

                    # with one header row, offset=0 and we want row_number=1 so that
                    # the first data row is row_number=2, matching the row number as
                    # seen in Excel
                    row_number = offset + 1
                    for row_ in row_set:
                        cells = dict((keyify(c.column), convert_(c.value))
                                     for c in row_ if len(c.column.strip()))
                        # Blank rows still advance the counter so numbering
                        # keeps following the source file.
                        row_number += 1
                        if is_row_blank(cells):
                            continue
                        cells['row_number'] = row_number
                        sl.add_row(connection, raw_table, cells)

                trans.commit()
                committed = True
                log.debug(stats.add_source('Extracted ok', row))
                return sheets > 0, sheets
            except Exception as ex:
                log.exception(ex)
                # Discard the partial load before ``issue`` writes through
                # the engine; skip it if the failure happened after commit.
                if not committed:
                    trans.rollback()
                issue(engine, row['resource_id'], row['retrieve_hash'], STAGE,
                      unicode(ex))
                stats.add_source('Exception: %s' % ex.__class__.__name__, row)
                return False, 0
    finally:
        connection.close()
Ejemplo n.º 3
0
def extract_table(engine, table, row, resource_id, force):
    """Load every usable table of a resource's source file into raw tables.

    Returns silently when the source file does not exist, or when the
    ``extracted`` table already has an entry for ``resource_id`` and
    ``force`` is false.

    Otherwise the file is sniffed by its leading bytes
    (``COMPDOC_SIGNATURE`` -> ``XLSTableSet``, ``PK`` -> ``XLSXTableSet``,
    anything else -> ``CSVTableSet`` opened with the chardet-detected
    encoding).  Each table with more than one detected header is written to
    a freshly dropped and re-fetched ``raw_<resource_id>_table<n>`` table.
    Finally the highest table index seen and the elapsed time are upserted
    into ``extracted`` and the transaction is committed.

    The source file handle and the database connection are closed on every
    exit path once they have been opened.

    :param engine: database engine used to open the working connection and
        for the "already extracted" lookup.
    :param table: not used by this function.
    :param row: value handed to ``source_path`` to locate the source file.
    :param resource_id: identifier used in table names and as the key in
        ``extracted``.
    :param force: when true, extract again even if already recorded.
    :raises AssertionError: for an empty file, HTML or PDF content, a
        single-table file with at most one detected column, or a file in
        which no tables were found.
    """
    # For now, interpret lack of data as not-failure at this stage, on
    # the basis that it was already reported as failure at the
    # retrieve stage and will just clutter up this list.
    if not os.path.exists(source_path(row)):
        return

    connection = engine.connect()
    fh = None
    try:
        extracted_table = sl.get_table(connection, "extracted")

        # Skip over tables we have already extracted
        if not force and sl.find_one(engine, extracted_table, resource_id=resource_id) is not None:
            return

        fh = open(source_path(row), "rb")
        source_data = fh.read()

        assert len(source_data) > 0, "Empty file"
        assert html_re.search(source_data[0:1024]) is None, "Looks like HTML"
        assert not source_data.startswith("%PDF"), "Looks like PDF"

        trans = connection.begin()
        start = time.time()

        if source_data.startswith(COMPDOC_SIGNATURE):
            fh.seek(0)
            table_set = XLSTableSet.from_fileobj(fh)
        elif source_data.startswith("PK"):
            table_set = XLSXTableSet(source_path(row))
        else:
            # Re-open the file as text using the detected encoding.
            cd = chardet.detect(source_data)
            fh.close()
            fh = codecs.open(source_path(row), "r", cd["encoding"])

            table_set = CSVTableSet.from_fileobj(fh)

        # Stays None when the table set is empty, so that case is reported
        # clearly instead of failing on an unbound name further down.
        table_id = None
        for table_id, row_set in enumerate(table_set.tables):
            offset, headers = headers_guess(row_set.sample)
            # A real list is needed because its length is checked below.
            headers = [convert_(header) for header in headers]
            assert (
                len(headers) > 1 or len(table_set.tables) > 1
            ), "Only one column was detected; assuming this is not valid data."

            # We might have multiple table sets where one is blank or ranty text or something. Skip those.
            if len(headers) <= 1:
                continue

            row_set.register_processor(headers_processor(headers))
            row_set.register_processor(offset_processor(offset + 1))

            raw_table_name = "raw_%s_table%s" % (resource_id, table_id)
            sl.drop_table(connection, raw_table_name)
            raw_table = sl.get_table(connection, raw_table_name)

            for row_ in row_set:
                cells = dict((keyify(c.column), convert_(c.value))
                             for c in row_ if len(c.column.strip()))
                sl.add_row(connection, raw_table, cells)

        assert table_id is not None, "No tables were found in the source file."

        sl.upsert(
            connection,
            extracted_table,
            {"resource_id": resource_id, "max_table_id": table_id, "extraction_time": time.time() - start},
            ["resource_id"],
        )

        trans.commit()
    finally:
        # NOTE(review): an uncommitted transaction is presumably discarded
        # when the connection is closed - confirm for the driver in use.
        try:
            if fh is not None:
                fh.close()
        finally:
            connection.close()
Ejemplo n.º 4
0
def extract_resource_core(engine, row, stats):
    """Load every usable sheet of a resource's source file into raw tables.

    The file located by ``source_path(row)`` is read and sniffed by its
    leading bytes: ``COMPDOC_SIGNATURE`` selects ``XLSTableSet``, ``PK``
    selects ``XLSXTableSet`` and anything else is parsed with
    ``CSVTableSet``.  Each sheet with more than one detected header is
    written to a freshly dropped and re-fetched
    ``raw_<resource_id>_sheet<n>`` table, one row per non-blank input row,
    with an added ``row_number`` column.

    Empty files, HTML pages and PDFs are rejected up front.  Any exception
    raised while parsing or loading is logged and the transaction is rolled
    back.  In both cases the problem is recorded through ``issue`` and
    ``stats.add_source`` rather than raised.

    The source file handle and the database connection are always closed
    before returning.

    :param engine: database engine; used to open the working connection and
        passed to ``issue`` for problem reports.
    :param row: mapping providing ``resource_id`` and ``retrieve_hash``; it
        is also handed to ``source_path`` and ``stats.add_source``.
    :param stats: collector whose ``add_source(category, row)`` receives the
        outcome.
    :returns: ``(success, sheets)`` where ``sheets`` is the number of sheets
        loaded and ``success`` is true when that number is positive.
    """
    connection = engine.connect()
    try:
        with open(source_path(row), 'rb') as fh:
            source_data = fh.read()

            if not len(source_data):
                issue(engine, row['resource_id'], row['retrieve_hash'], STAGE,
                      "Empty file")
                stats.add_source('Empty file', row)
                return False, 0
            if html_re.search(source_data[0:1024]) is not None:
                issue(engine, row['resource_id'], row['retrieve_hash'], STAGE,
                      "HTML file detected, not a transaction report")
                stats.add_source('HTML file', row)
                return False, 0
            if source_data.startswith('%PDF'):
                issue(engine, row['resource_id'], row['retrieve_hash'], STAGE,
                      "PDF file detected, not a transaction report")
                stats.add_source('PDF file', row)
                return False, 0

            trans = connection.begin()
            committed = False
            try:
                if source_data.startswith(COMPDOC_SIGNATURE):
                    fh.seek(0)
                    table_set = XLSTableSet(fh)
                elif source_data.startswith('PK'):
                    table_set = XLSXTableSet(filename=source_path(row))
                else:
                    from StringIO import StringIO
                    sio = StringIO(source_data)

                    encoding = None
                    detected = chardet.detect(source_data[:200])
                    log.debug('Encoding detected as: %s', detected.get('encoding'))
                    if detected.get(
                            'encoding') == 'ISO-8859-2' and '\xa3' in source_data:
                        # Detected as Latin2 but probably isn't - that is for Eastern
                        # European languages.  Probably because the presence of a GBP
                        # pound sign has foxed chardet. It is pretty certain that it is
                        # a single-byte ASCII-variant, and my money is on Windows-1252
                        encoding = 'windows-1252'
                        log.debug(
                            'Probably not ISO-8859-2 because it has GBP symbol, so assuming it is Windows-1252'
                        )

                    table_set = CSVTableSet(sio, encoding=encoding)

                sheets = 0
                for sheet_id, row_set in enumerate(table_set.tables):
                    offset, headers = headers_guess(row_set.sample)
                    # A real list is needed because its length is checked below.
                    headers = [convert_(header) for header in headers]
                    log.debug("Headers: %r", headers)
                    # Sheets with at most one header are skipped.
                    if len(headers) <= 1:
                        continue
                    sheets += 1

                    row_set.register_processor(headers_processor(headers))
                    row_set.register_processor(offset_processor(offset + 1))

                    raw_table_name = 'raw_%s_sheet%s' % (row['resource_id'], sheet_id)
                    sl.drop_table(connection, raw_table_name)
                    raw_table = sl.get_table(connection, raw_table_name)

                    # with one header row, offset=0 and we want row_number=1 so that
                    # the first data row is row_number=2, matching the row number as
                    # seen in Excel
                    row_number = offset + 1
                    for row_ in row_set:
                        cells = dict((keyify(c.column), convert_(c.value))
                                     for c in row_ if len(c.column.strip()))
                        # Blank rows still advance the counter so numbering
                        # keeps following the source file.
                        row_number += 1
                        if is_row_blank(cells):
                            continue
                        cells['row_number'] = row_number
                        sl.add_row(connection, raw_table, cells)

                trans.commit()
                committed = True
                log.debug(stats.add_source('Extracted ok', row))
                return sheets > 0, sheets
            except Exception as ex:
                log.exception(ex)
                # Discard the partial load before ``issue`` writes through
                # the engine; skip it if the failure happened after commit.
                if not committed:
                    trans.rollback()
                issue(engine, row['resource_id'], row['retrieve_hash'], STAGE,
                      unicode(ex))
                stats.add_source('Exception: %s' % ex.__class__.__name__, row)
                return False, 0
    finally:
        connection.close()
Ejemplo n.º 5
0
def extract_resource_core(engine, row):
    """Load every usable sheet of a resource's source file into raw tables.

    The file located by ``source_path(row)`` is read and sniffed by its
    leading bytes: ``COMPDOC_SIGNATURE`` selects ``XLSTableSet``, ``PK``
    selects ``XLSXTableSet`` and anything else is parsed with
    ``CSVTableSet``.  Each sheet with more than one detected header is
    written to a freshly dropped and re-fetched
    ``raw_<resource_id>_sheet<n>`` table.

    Empty files, HTML pages and PDFs are rejected up front.  Any exception
    raised while parsing or loading is logged and the transaction is rolled
    back.  In both cases the problem is recorded through ``issue`` rather
    than raised.

    The source file handle and the database connection are always closed
    before returning.

    :param engine: database engine; used to open the working connection and
        passed to ``issue`` for problem reports.
    :param row: mapping providing ``resource_id`` and ``retrieve_hash``; it
        is also handed to ``source_path``.
    :returns: ``(success, sheets)`` where ``sheets`` is the number of sheets
        loaded and ``success`` is true when that number is positive.
    """
    connection = engine.connect()
    try:
        with open(source_path(row), 'rb') as fh:
            source_data = fh.read()

            if not len(source_data):
                issue(engine, row['resource_id'], row['retrieve_hash'],
                      "Empty file")
                return False, 0
            if html_re.search(source_data[0:1024]) is not None:
                issue(engine, row['resource_id'], row['retrieve_hash'],
                      "HTML file detected, not a transaction report")
                return False, 0
            if source_data.startswith('%PDF'):
                issue(engine, row['resource_id'], row['retrieve_hash'],
                      "PDF file detected, not a transaction report")
                return False, 0

            trans = connection.begin()
            committed = False
            try:
                if source_data.startswith(COMPDOC_SIGNATURE):
                    fh.seek(0)
                    table_set = XLSTableSet(fh)
                elif source_data.startswith('PK'):
                    table_set = XLSXTableSet(source_path(row))
                else:
                    # Parse the bytes already read instead of re-opening
                    # the file.
                    from StringIO import StringIO
                    sio = StringIO(source_data)
                    table_set = CSVTableSet(sio)

                sheets = 0
                for sheet_id, row_set in enumerate(table_set.tables):
                    offset, headers = headers_guess(row_set.sample)
                    # A real list is needed because its length is checked below.
                    headers = [convert_(header) for header in headers]
                    log.debug("Headers: %r", headers)
                    # Sheets with at most one header are skipped.
                    if len(headers) <= 1:
                        continue
                    sheets += 1

                    row_set.register_processor(headers_processor(headers))
                    row_set.register_processor(offset_processor(offset + 1))

                    raw_table_name = 'raw_%s_sheet%s' % (row['resource_id'], sheet_id)
                    sl.drop_table(connection, raw_table_name)
                    raw_table = sl.get_table(connection, raw_table_name)

                    for row_ in row_set:
                        cells = dict((keyify(c.column), convert_(c.value))
                                     for c in row_ if len(c.column.strip()))
                        sl.add_row(connection, raw_table, cells)

                trans.commit()
                committed = True
                return sheets > 0, sheets
            except Exception as ex:
                log.exception(ex)
                # Discard the partial load before ``issue`` writes through
                # the engine; skip it if the failure happened after commit.
                if not committed:
                    trans.rollback()
                issue(engine, row['resource_id'], row['retrieve_hash'],
                      unicode(ex))
                return False, 0
    finally:
        connection.close()