Example #1
def _process_upload(context, data):
    """
    When provided with a filename this function will process each row
    within the file and then return a tuple. The tuple will contain
        - a list of error messages (if any)
        - a list of dicts where each dict contains ...
                {
                 'package': 'a_package_id',
                 'action':  'Added' or 'Updated'
                }
    """
    log = inventory_upload.get_logger()

    errors = []
    results = []

    filename = data['file']
    publisher_name = data['publisher']

    import urlparse
    client = CkanClient(
        base_location=urlparse.urljoin(context['site_url'], 'api'),
        api_key=context['apikey'])

    tableset = None
    try:
        _, ext = os.path.splitext(filename)
        tableset = messytables.any_tableset(
            open(filename, 'r'), extension=ext[1:])
    except Exception, e:
        if str(e) == "Unrecognized MIME type: text/plain":
            tableset = messytables.any_tableset(
                open(filename, 'r'), mimetype="text/csv")
        else:
            errors.append("Unable to load file: {0}".format(e))
Example #2
def _process_upload(context, data):
    """
    When provided with a filename this function will process each row
    within the file and then return a tuple. The tuple will contain
        - a list of error messages (if any)
        - a list of dicts where each dict contains ...
                {
                 'package': 'a_package_id',
                 'action':  'Added' or 'Updated'
                }
    """
    log = inventory_upload.get_logger()

    errors = []
    results = []

    filename = data['file']
    publisher_name = data['publisher']

    import urlparse
    client = CkanClient(base_location=urlparse.urljoin(context['site_url'],
                                                       'api'),
                        api_key=context['apikey'])

    tableset = None
    try:
        _, ext = os.path.splitext(filename)
        tableset = messytables.any_tableset(open(filename, 'r'),
                                            extension=ext[1:])
    except Exception, e:
        if str(e) == "Unrecognized MIME type: text/plain":
            tableset = messytables.any_tableset(open(filename, 'r'),
                                                mimetype="text/csv")
        else:
            errors.append("Unable to load file: {0}".format(e))
Example #3
    def create_new_model(self, modelname, app_label):
        """ Use messytables to guess field types and build a new model """

        nocols = False
        cols = self.csvfile[0]
        for col in cols:
            if not col:
                nocols = True
        if nocols:
            cols = ["col_%s" % num for num in range(1, len(cols))]
            print("No column names for %s columns" % len(cols))
        else:
            # strip quotes at ends and replace internal spaces with underscores
            cols = [col.strip("\r") for col in cols]
            cols = [col.strip('"') for col in cols]
            cols = [col.strip("'") for col in cols]
            cols = [cleancol.sub("_", col).lower() for col in cols]
        try:
            from messytables import any_tableset, type_guess
        except:
            self.errors.append(
                "If you want to inspect CSV files to generate model code, you must install https://messytables.readthedocs.org"
            )
            self.modelname = ""
            return
        try:
            table_set = any_tableset(self.filehandle)
            row_set = table_set.tables[0]
            types = type_guess(row_set.sample)
            types = [str(typeobj) for typeobj in types]
            # If the header has more cols than the data has cols - ignore the end ones
            if len(cols) > len(types):
                cols = cols[:len(types)]
        except Exception as err:
            self.errors.append("messytables could not run due to error")
            self.errors.append(str(err))
            self.modelname = ""
            return

        fieldset = []
        maximums = self.get_maxlengths(cols)
        for i, col in enumerate(cols):
            length = maximums[i]
            if types[i] == "String" and length > 255:
                types[i] = "Text"
            integer = length
            decimal = int(length / 2)
            if decimal > 10:
                decimal = 10
            blank = True
            default = True
            column = (col, types[i], length, length, integer, decimal, blank,
                      default)
            fieldset.append(column)
        # Import here so that messytables is not a dependency for just using csvimport cmd
        from csvimport.make_model import MakeModel

        maker = MakeModel()
        return maker.model_from_table("%s_%s" % (app_label, modelname),
                                      fieldset)
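
Example #3 relies on a module-level cleancol regex (and a get_maxlengths helper) that are not shown. Assuming cleancol simply collapses anything that is not a letter or digit into an underscore, a plausible sketch of the header clean-up is:

import re

# Hypothetical stand-in for the regex used above: collapse runs of
# non-alphanumeric characters into a single underscore.
cleancol = re.compile(r'[^0-9a-zA-Z]+')

headers = ['First Name', 'Date of Birth', 'Amount ($)']
cols = [cleancol.sub('_', col).lower() for col in headers]
print(cols)  # ['first_name', 'date_of_birth', 'amount_']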
Example #4
def main(argv=None):
    args = parse_args(argv)

    if args.file is None:
        # slurp the whole input since there seems to be a bug in messytables
        # which should be able to handle streams but doesn't
        args.file = cStringIO.StringIO(sys.stdin.read())

    relation_key = args_to_relation_key(args)

    table_set = any_tableset(args.file)
    if len(table_set.tables) != 1:
        raise ValueError("Can only handle files with a single table, not %s" % len(table_set.tables))

    row_set = table_set.tables[0]

    # guess header names and the offset of the header:
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(strip_processor())
    row_set.register_processor(headers_processor(headers))
    # Temporarily, mark the offset of the header
    row_set.register_processor(offset_processor(offset + 1))

    # guess types and register them
    types = type_guess(replace_empty_string(row_set.sample), strict=True, types=[StringType, DecimalType, IntegerType])
    row_set.register_processor(types_processor(types))

    # Messytables seems to not handle the case where there are no headers.
    # Work around this as follows:
    # 1) offset must be 0
    # 2) if the types of the data match the headers, assume there are
    #    actually no headers
    if offset == 0:
        try:
            [t.cast(v) for (t, v) in zip(types, headers)]
        except:
            pass
        else:
            # We don't need the headers_processor or the offset_processor
            row_set._processors = []
            row_set.register_processor(strip_processor())
            row_set.register_processor(types_processor(types))
            headers = None

    # Construct the Myria schema
    schema = messy_to_schema(types, headers)
    logging.info("Myria schema: {}".format(json.dumps(schema)))

    # Prepare data for writing to Myria
    data, kwargs = write_data(row_set, schema)

    if not args.dry:
        # Connect to Myria and send the data
        connection = myria.MyriaConnection(hostname=args.hostname, port=args.port, ssl=args.ssl)
        ret = connection.upload_file(relation_key, schema, data, args.overwrite, **kwargs)

        sys.stdout.write(pretty_json(ret))
    else:
        sys.stdout.write(data)
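
Example #4 calls a replace_empty_string helper that is not shown, so its exact behaviour is an assumption. A plausible sketch is a generator that blanks out empty-string cells in the sample so they do not skew type_guess towards StringType:

def replace_empty_string(sample):
    # Hypothetical stand-in: yield sample rows with empty string cells
    # set to None so type guessing ignores them.
    for row in sample:
        for cell in row:
            if cell.value == '':
                cell.value = None
        yield row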
Example #5
    def __init__(self, filename):
        """
        When provided with a filename (to a CSV, XLS, or XLSX) the constructor
        will attempt to load the file and ensure that messytables knows how to
        process it.
        """
        self.tableset = None

        try:
            _, ext = os.path.splitext(filename)
            self.tableset = messytables.any_tableset(open(filename, "r"), extension=ext[1:])
        except Exception, e:
            if str(e) == "Unrecognized MIME type: text/plain":
                # Attempt to force the load as a CSV file to work around messytables
                # not recognising text/plain
                self.tableset = messytables.any_tableset(open(filename, "r"), mimetype="text/csv")
            else:
                log.exception(e)
                raise Exception(u"Failed to load the file at {0}".format(filename))
Example #6
    def transform(self):
        handle = self.open_data(self.url)

        if not handle:
            raise ResourceError("Remote resource missing",
                                "Unable to load the remote resource")

        try:
            if self.is_csv():
                table_set = any_tableset(fileobj=handle,
                                         extension=self.type)
            else:
                table_set = any_tableset(fileobj=handle,
                                         extension=self.type,
                                         mimetype=self.mimetype)
        except Exception, e:
            # e.g. ValueError('Unrecognized MIME type: application/vnd.oasis.opendocument.spreadsheet')
            log.warn('Messytables parse error %s %s: %s', self.resource_identifier, self.url, e)
            log.warn('Some data: ext: %s, mime: %s', self.type, self.mimetype)
            raise ResourceError("Resource loading error",
                                "Unable to load the resource")
Example #7
    def transform(self):
        handle = self.open_data(self.url)

        if not handle:
            raise ResourceError("Informacije", "Udaljeni resurs nedostupan")

        try:
            table_set = any_tableset(fileobj=handle,
                                     extension=self.type,
                                     mimetype=self.mimetype)
        except Exception, e:
            raise ResourceError("Informacija", "Resurs nedostupan")
Example #8
    def ingest(self, meta, local_path):
        with open(local_path, 'rb') as fh:
            table_set = any_tableset(fh,
                                     extension=meta.extension,
                                     mimetype=meta.mime_type,
                                     window=20000)
            tables = []
            for sheet, row_set in enumerate(table_set.tables):
                tables.append(self.generate_table(meta, sheet, row_set))

            meta.tables = tables
            document = self.create_document(meta)
            self.emit(document)
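
For multi-sheet spreadsheets, table_set.tables is simply a list of row sets, one per sheet, which is what the enumerate() loop above walks. A minimal sketch of the same idea in isolation (the workbook name is hypothetical):

import messytables
from messytables import headers_guess

with open('workbook.xlsx', 'rb') as fh:  # hypothetical multi-sheet file
    table_set = messytables.any_tableset(fh, extension='xlsx')
    for sheet_index, row_set in enumerate(table_set.tables):
        offset, headers = headers_guess(row_set.sample)
        print(sheet_index, headers)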
Example #9
    def __init__(self, filename):
        """
        When provided with a filename (to a CSV, XLS, or XLSX) the constructor
        will attempt to load the file and ensure that messytables knows how to
        process it.
        """
        self.tableset = None

        try:
            _, ext = os.path.splitext(filename)
            self.tableset = messytables.any_tableset(open(filename, 'r'),
                                                     extension=ext[1:])
        except Exception, e:
            if str(e) == "Unrecognized MIME type: text/plain":
                # Attempt to force the load as a CSV file to work around messytables
                # not recognising text/plain
                self.tableset = messytables.any_tableset(open(filename, 'r'),
                                                         mimetype="text/csv")
            else:
                log.exception(e)
                raise Exception(
                    u"Failed to load the file at {0}".format(filename))
Example #10
def proc(f, database_name, table_name):

    table_set = messytables.any_tableset(f)
    row_set = table_set.tables[0]

    # guess header names and the offset of the header:
    offset, headers = messytables.headers_guess(row_set.sample)
    row_set.register_processor(messytables.headers_processor(headers))
    row_set.register_processor(messytables.offset_processor(offset + 1))
    types = messytables.type_guess(row_set.sample, types=[
        messytables.types.StringType,
        messytables.types.DateType,
    ], strict=True)
    hive_data_file = tempfile.NamedTemporaryFile(mode='w')

    fields_ddl = ','.join([
        '  {0} {1}\n'.format(
            canonicalize_column_name(colName),
            hive_column_type(colType)
        )
        for colName, colType in zip(headers, types)
    ])
    hive_sql = '''
DROP TABLE IF EXISTS {0};

CREATE TABLE {0} (
{1}
)
STORED AS TEXTFILE
TBLPROPERTIES ("comment"="add_messytable on {3}");

LOAD DATA LOCAL INPATH '{2}' OVERWRITE INTO TABLE {0};
'''.format(table_name, fields_ddl, hive_data_file.name,
        datetime.datetime.now().isoformat())

    hive_cmd_file = tempfile.NamedTemporaryFile(mode='w')
    print(hive_sql, file=hive_cmd_file)
    hive_cmd_file.flush()

    row_set.register_processor(messytables.types_processor(types))

    for row in row_set:
        print('\001'.join(map(str, [ c.value for c in row])),
                file=hive_data_file)
    hive_data_file.flush()

    subprocess.call([
        'hive',
        '--database', database_name,
        '-f', hive_cmd_file.name,
    ])
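
Example #10 depends on two helpers that are not shown, canonicalize_column_name and hive_column_type. Their real definitions may differ; a plausible sketch that turns guessed headers and types into Hive-friendly identifiers and column types:

import re

def canonicalize_column_name(name):
    # Hypothetical: lower-case the header and squash anything that is not
    # alphanumeric so it becomes a legal Hive identifier.
    return re.sub(r'[^0-9a-z]+', '_', name.lower()).strip('_')

def hive_column_type(col_type):
    # Hypothetical mapping from messytables type names to Hive types;
    # anything unrecognised (e.g. formatted dates) falls back to STRING.
    mapping = {'String': 'STRING', 'Integer': 'BIGINT', 'Decimal': 'DOUBLE'}
    return mapping.get(str(col_type), 'STRING')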
Example #11
    def create_new_model(self, modelname, app_label):
        """ Use messytables to guess field types and build a new model """

        nocols = False
        cols = self.csvfile[0]
        for col in cols:
            if not col:
                nocols = True
        if nocols:
            cols = ['col_%s' % num for num in range(1, len(cols))]
            print('No column names for %s columns' % len(cols))
        else:
            cols = [cleancol.sub('_', col).lower() for col in cols]
        try:
            from messytables import any_tableset, type_guess
        except:
            self.errors.append(
                'If you want to inspect CSV files to generate model code, you must install https://messytables.readthedocs.org'
            )
            self.modelname = ''
            return
        try:
            table_set = any_tableset(self.filehandle)
            row_set = table_set.tables[0]
            types = type_guess(row_set.sample)
            types = [str(typeobj) for typeobj in types]
        except:
            self.errors.append('messytables could not guess your column types')
            self.modelname = ''
            return

        fieldset = []
        maximums = self.get_maxlengths(cols)
        for i, col in enumerate(cols):
            length = maximums[i]
            if types[i] == 'String' and length > 255:
                types[i] = 'Text'
            integer = length
            decimal = int(length / 2)
            if decimal > 10:
                decimal = 10
            blank = True
            default = True
            column = (col, types[i], length, length, integer, decimal, blank,
                      default)
            fieldset.append(column)
        # Import here so that messytables is not a dependency for just using csvimport cmd
        from csvimport.make_model import MakeModel
        maker = MakeModel()
        return maker.model_from_table('%s_%s' % (app_label, modelname),
                                      fieldset)
Example #12
    def transform(self):
        handle = self.open_data(self.url)

        if not handle:
            raise ResourceError("Remote resource missing",
                                "Unable to load the remote resource")

        try:
            table_set = any_tableset(fileobj=handle,
                                     extension=self.type,
                                     mimetype=self.mimetype)
        except Exception, e:
            raise ResourceError("Resource loading error",
                                "Unable to load the resource")
Example #13
    def transform(self):
        handle = self.open_data(self.url)

        if not handle:
            raise ResourceError("Informacije",
                "Udaljeni resurs nedostupan")

        try:
            table_set = any_tableset(fileobj=handle,
                                     extension=self.type,
                                     mimetype=self.mimetype)
        except Exception, e:
            raise ResourceError("Informacija",
                "Resurs nedostupan")
Example #14
def validate_file(file_tmp, file_name, tmp_filepath):

    log.info("upload: checking file * %s * ", file_name)
    MAX_HEADER_LENGTH = 64
    # not allowed characters ( - ' " ’ ‘) regex
    inappropriate_chars = re.compile(r"[\-|\'|\"|\u2018|\u2019]")
    datastore_ext = config.get('ckan.mimetype_guess', "csv xls xlsx tsv")
    tmp_file_name, tmp_file_ext = os.path.splitext(file_name)

    #check if datastore file (csv xls xlsx tsv)
    if tmp_file_ext[1:].lower() in datastore_ext:
        table_set = any_tableset(file_tmp)
        #check if only one data sheet in the file
        if len(table_set.tables)>1:
            rollback_tmp(file_tmp, tmp_filepath)
            log.error("upload: the file * %s * was not uploaded - There is more then one data sheet in the file", file_name)
            raise logic.ValidationError(
                {'upload': ['There is more then one data sheet in the file']}
            )
        else:
            row_set = table_set.tables[0]
            # guess header names and the offset of the header:
            offset, headers = headers_guess(row_set.sample)
            row_set.register_processor(headers_processor(headers))
            for header in headers:
                # too long header
                if len(header) > MAX_HEADER_LENGTH:
                    rollback_tmp(file_tmp, tmp_filepath)
                    log.error("upload: the file * %s * was not uploaded - too long header - * %s *",
                              file_name, header)
                    raise logic.ValidationError(
                        {'upload': ['too long header (64 max)']}
                    )
                # not allowed characters in header ( - ' " ’ ‘)
                if inappropriate_chars.search(header):
                    rollback_tmp(file_tmp, tmp_filepath)
                    log.error("upload: the file * %s * was not uploaded - there are inappropriate characters in headers * %s *",
                              file_name, header)
                    raise logic.ValidationError(
                        {'upload': ['there are inappropriate characters in headers (apostrophe/apostrophes/dash)']}
                    )
            # Check for duplicate fields
            unique_fields = set(headers)
            if not len(unique_fields) == len(headers):
                rollback_tmp(file_tmp, tmp_filepath)
                log.error("upload: the file * %s * was not uploaded - Duplicate column names are not supported", file_name)
                raise logic.ValidationError({'upload': ['Duplicate column names are not supported']})
        log.info("passed validation succesfully - the file * %s * was uploaded to CKAN (filestore)", file_name)
    else:
        pass
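
The rollback_tmp call used by this validator is project-specific and not shown. Presumably it just discards the partially uploaded temporary file; a hypothetical sketch:

import os

def rollback_tmp(file_tmp, tmp_filepath):
    # Hypothetical clean-up helper: close the temp file handle (if it is
    # a handle) and remove the partially uploaded file from disk.
    try:
        file_tmp.close()
    except AttributeError:
        pass
    if os.path.exists(tmp_filepath):
        os.remove(tmp_filepath)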
Example #15
    def create_new_model(self, modelname, app_label):
        """ Use messytables to guess field types and build a new model """

        nocols = False
        cols = self.csvfile[0]
        for col in cols:
            if not col:
                nocols = True
        if nocols:
            cols = ['col_%s' % num for num in range(1, len(cols))]
            print ('No column names for %s columns' % len(cols))
        else:
            cols = [cleancol.sub('_', col).lower() for col in cols]
        try:
            from messytables import any_tableset, type_guess
        except:
            self.errors.append(
                'If you want to inspect CSV files to generate model code, you must install https://messytables.readthedocs.org')
            self.modelname = ''
            return

        try:
            table_set = any_tableset(self.filehandle)
            row_set = table_set.tables[0]
            types = type_guess(row_set.sample)
            types = [str(typeobj) for typeobj in types]
        except Exception as err:
            self.errors.append('messytables could not run due to error')
            self.errors.append(str(err))
            self.modelname = ''
            return

        fieldset = []
        maximums = self.get_maxlengths(cols)
        for i, col in enumerate(cols):
            length = maximums[i]
            if types[i] == 'String' and length > 255:
                types[i] = 'Text'
            integer = length
            decimal = int(length / 2)
            if decimal > 10:
                decimal = 10
            blank = True
            default = True
            column = (col, types[i], length, length, integer, decimal, blank, default)
            fieldset.append(column)
        # Import here so that messytables is not a dependency for just using csvimport cmd
        from csvimport.make_model import MakeModel
        maker = MakeModel()
        return maker.model_from_table('%s_%s' % (app_label, modelname), fieldset)
Example #16
    def transform(self):
        handle = self.open_data(self.url)

        if not handle:
            raise ResourceError("Remote resource missing",
                "Unable to load the remote resource")

        try:
            table_set = any_tableset(fileobj=handle,
                                     extension=self.type,
                                     mimetype=self.mimetype)
        except Exception, e:
            raise ResourceError("Resource loading error",
                "Unable to load the resource")
Example #17
    def ingest(self, meta, local_path):
        with open(local_path, 'rb') as fh:
            table_set = any_tableset(fh,
                                     extension=meta.extension,
                                     mimetype=meta.mime_type,
                                     window=20000)
            tables = []
            document = self.create_document(meta)
            for sheet, row_set in enumerate(table_set.tables):
                tables.append(
                    self.generate_table(document, meta, sheet, row_set))

            meta.tables = tables
            document.meta = meta
            self.emit(document)
Example #18
    def read_file(self, filename):
        """
        Guess the filetype and read the file into row sets
        """
        #print("Reading file", filename)

        try:
            fh = open(filename, 'rb')
            table_set = any_tableset(fh)  # guess the type...
        except:
            #traceback.print_exc()
            # Cannot find the schema.
            table_set = None

        return table_set
Example #19
    def read_file(self, filename): 
        """
        Guess the filetype and read the file into row sets
        """
        #print("Reading file", filename)

        try:
            fh = open(filename, 'rb')
            table_set = any_tableset(fh) # guess the type...
        except:
            #traceback.print_exc()
            # Cannot find the schema.
            table_set = None
            
        return table_set
Example #20
def resource_row_set(package, resource):
    """ Generate an iterator over all the rows in this resource's
    source data. """
    # Try to gather information about the source file type.
    if not resource.meta.get('extension'):
        resource.meta['extension'] = guess_extension(resource.meta.get('name'))

    # This is a work-around because messytables hangs on boto file
    # handles, so we're doing it via plain old HTTP.
    table_set = any_tableset(resource.fh(),
                             extension=resource.meta.get('extension'),
                             mimetype=resource.meta.get('mime_type'))
    tables = list(table_set.tables)
    if not len(tables):
        log.error("No tables were found in the source file.")
        return
    return tables[0]
Example #21
    def transform(self):
        handle = self.open_data(self.url)

        if not handle:
            raise ResourceError("Remote resource missing",
                                "Unable to load the remote resource")

        try:
            table_set = any_tableset(fileobj=handle,
                                     extension=self.type,
                                     mimetype=self.mimetype)
        except Exception, e:
            # e.g. ValueError('Unrecognized MIME type: application/vnd.oasis.opendocument.spreadsheet')
            log.warn('Messytables parse error %s %s: %s',
                     self.resource_identifier, self.url, e)
            raise ResourceError("Resource loading error",
                                "Unable to load the resource")
Example #22
def parse_table(source):
    # This is a work-around because messytables hangs on boto file
    # handles, so we're doing it via plain old HTTP.
    # We're also passing in an extended window size to give more
    # reliable type detection.
    # Because Python's CSV dialect sniffer isn't the best, this also
    # constrains the field quoting character to a double quote.
    table_set = mt.any_tableset(source.fh(),
                                extension=source.meta.get('extension'),
                                mimetype=source.meta.get('mime_type'),
                                quotechar='"',
                                window=20000)
    tables = list(table_set.tables)
    if not len(tables):
        log.error("No tables were found in the source file.")
        return
    row_set = tables[0]
    headers = [c.value for c in next(row_set.sample)]
    row_set.register_processor(mt.headers_processor(headers))
    row_set.register_processor(mt.offset_processor(1))
    types = mt.type_guess(row_set.sample, strict=True)
    row_set.register_processor(mt.types_processor(types, strict=True))

    fields, i = {}, 0
    row_iter = iter(row_set)

    while True:
        i += 1
        try:
            row = row_iter.next()
            if not len(fields):
                fields = generate_field_spec(row)

            data = convert_row(row, fields, i)
            check_empty = set(data.values())
            if None in check_empty and len(check_empty) == 1:
                continue

            yield None, fields, data
        except StopIteration:
            return
        except Exception, e:
            # log.exception(e)
            yield e, fields, None
Example #23
def parse_data(input):
    fh = open(input, 'rb')

    try:
        table_set = messytables.any_tableset(fh)
    except messytables.ReadError as e:
        print(e)

    get_row_set = lambda table_set: table_set.tables.pop()
    row_set = get_row_set(table_set)
    offset, headers = messytables.headers_guess(row_set.sample)
    # Some headers might have been converted from strings to floats and such.
    headers = [str(header) for header in headers]

    row_set.register_processor(messytables.headers_processor(headers))
    row_set.register_processor(messytables.offset_processor(offset + 1))
    types = messytables.type_guess(row_set.sample, types=TYPES, strict=True)

    row_set.register_processor(messytables.types_processor(types))

    headers = [header.strip() for header in headers if header.strip()]
    headers_set = set(headers)

    def row_iterator():
        for row in row_set:
            data_row = {}
            for index, cell in enumerate(row):
                column_name = cell.column.strip()
                if column_name not in headers_set:
                    continue
                data_row[column_name] = cell.value
            yield data_row

    result = row_iterator()

    headers_dicts = [
        dict(id=field[0], type=TYPE_MAPPING[str(field[1])])
        for field in zip(headers, types)
    ]

    print('Determined headers and types: {headers}'.format(
        headers=headers_dicts))

    return headers_dicts, result
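
Example #23 (like the DataPusher-style code later on) reads module-level TYPES and TYPE_MAPPING constants that are not included above. As an assumption, they pair the messytables cell types with the column types used by CKAN's DataStore, roughly like this:

import messytables

# Hypothetical constants; the real project may use a different selection.
TYPES = [messytables.StringType, messytables.DecimalType,
         messytables.IntegerType, messytables.DateUtilType]

TYPE_MAPPING = {
    'String': 'text',
    'Integer': 'numeric',
    'Decimal': 'numeric',
    'DateUtil': 'timestamp',
}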
Example #24
def parse_table(source):
    # This is a work-around because messytables hangs on boto file
    # handles, so we're doing it via plain old HTTP.
    # We're also passing in an extended window size to give more
    # reliable type detection.
    # Because Python's CSV dialect sniffer isn't the best, this also
    # constrains the field quoting character to a double quote.
    table_set = mt.any_tableset(source.fh(),
                                extension=source.meta.get('extension'),
                                mimetype=source.meta.get('mime_type'),
                                quotechar='"', window=20000)
    tables = list(table_set.tables)
    if not len(tables):
        log.error("No tables were found in the source file.")
        return
    row_set = tables[0]
    headers = [c.value for c in next(row_set.sample)]
    row_set.register_processor(mt.headers_processor(headers))
    row_set.register_processor(mt.offset_processor(1))
    types = mt.type_guess(row_set.sample, strict=True)
    row_set.register_processor(mt.types_processor(types, strict=True))

    fields, i = {}, 0
    row_iter = iter(row_set)

    while True:
        i += 1
        try:
            row = row_iter.next()
            if not len(fields):
                fields = generate_field_spec(row)

            data = convert_row(row, fields, i)
            check_empty = set(data.values())
            if None in check_empty and len(check_empty) == 1:
                continue

            yield None, fields, data
        except StopIteration:
            return
        except Exception, e:
            # log.exception(e)
            yield e, fields, None
Example #25
def resource_row_set(package, resource):
    """ Generate an iterator over all the rows in this resource's
    source data. """
    # This is a work-around because messytables hangs on boto file
    # handles, so we're doing it via plain old HTTP.
    table_set = any_tableset(resource.fh(),
                             extension=resource.meta.get('extension'),
                             mimetype=resource.meta.get('mime_type'))
    tables = list(table_set.tables)
    if not len(tables):
        log.error("No tables were found in the source file.")
        return

    row_set = tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    types = type_guess(row_set.sample, strict=True)
    row_set.register_processor(types_processor(types))
    return row_set
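
Example #25 returns the fully configured row set but never consumes it. Iterating a row set yields lists of Cell objects, each carrying a column name and an already-cast value. A self-contained sketch of the same pipeline on an in-memory CSV (the sample data is made up):

import io

from messytables import (CSVTableSet, headers_guess, headers_processor,
                         offset_processor, type_guess, types_processor)

fh = io.BytesIO(b"name,count\nalpha,1\nbeta,2\n")
row_set = CSVTableSet(fh).tables[0]
offset, headers = headers_guess(row_set.sample)
row_set.register_processor(headers_processor(headers))
row_set.register_processor(offset_processor(offset + 1))
row_set.register_processor(types_processor(type_guess(row_set.sample)))

for row in row_set:
    print({cell.column: cell.value for cell in row})
    # e.g. {'name': 'alpha', 'count': 1}; value types depend on type_guess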
Example #26
 def test_simple_zip(self):
     fh = horror_fobj('simple.zip')
     table_set = any_tableset(fh, extension='zip')
     assert isinstance(table_set, ZIPTableSet)
Example #27
    def push_to_datastore(self, context, resource):

        # Get the resource's content hash, which is used to check whether the
        # resource file has changed since last time.
        hash_dict = resource.get('hash')
        if hash_dict:
            original_content_hash = json.loads(hash_dict)['content']
            check_hash = not self.options.force
        else:
            # This resource has no hash yet, it must be a new resource.
            original_content_hash = ''
            check_hash = False

        try:
            result = fetch_resource.download(context,
                                             resource,
                                             self.max_content_length,
                                             DATA_FORMATS,
                                             check_modified=check_hash)
        except fetch_resource.ResourceNotModified as e:
            logger.info(
                u'Skipping unmodified resource: {0}'.format(resource['url'])
            )
            return {'success': True,
                    'resource': resource['id'],
                    'error': None}
        except Exception as e:
            logger.exception(e)
            return {'success': False,
                    'resource': resource['id'],
                    'error': 'Could not download resource'}

        if check_hash and (result['hash'] == original_content_hash):
            logger.info(
                u'Skipping unmodified resource: {0}'.format(resource['url'])
            )
            os.remove(result['saved_file'])
            return {'success': True,
                    'resource': resource['id'],
                    'error': None}

        content_type = result['headers'].get('content-type', '')\
                                        .split(';', 1)[0]  # remove parameters

        f = open(result['saved_file'], 'rb')
        try:
            table_sets = any_tableset(
                f,
                mimetype=content_type,
                extension=resource['format'].lower()
            )
            # only first sheet in xls for the time being
            row_set = table_sets.tables[0]
            offset, headers = headers_guess(row_set.sample)
        except Exception as e:
            logger.exception(e)
            os.remove(result['saved_file'])
            return {'success': False,
                    'resource': resource['id'],
                    'error': 'Error parsing the resource'}

        row_set.register_processor(headers_processor(headers))
        row_set.register_processor(offset_processor(offset + 1))
        row_set.register_processor(datetime_procesor())

        logger.info('Header offset: {0}.'.format(offset))

        guessed_types = type_guess(
            row_set.sample,
            [
                messytables.types.StringType,
                messytables.types.IntegerType,
                messytables.types.FloatType,
                messytables.types.DecimalType,
                messytables.types.DateUtilType
            ],
            strict=True
        )
        logger.info('Guessed types: {0}'.format(guessed_types))
        row_set.register_processor(types_processor(guessed_types, strict=True))
        row_set.register_processor(stringify_processor())

        guessed_type_names = [TYPE_MAPPING[type(gt)] for gt in
                              guessed_types]

        def send_request(data):
            data_dict = {
                'resource_id': resource['id'],
                'fields': [dict(id=name, type=typename) for name, typename
                           in zip(headers, guessed_type_names)],
                'records': data,
                'force': True,
            }
            response = toolkit.get_action('datastore_create')(
                context,
                data_dict
            )
            return response

        # Delete any existing data before proceeding. Otherwise
        # 'datastore_create' will append to the existing datastore. And if the
        # fields have significantly changed, it may also fail.
        logger.info('Trying to delete existing datastore for resource {0} '
                    '(may not exist).'.format(resource['id']))
        try:
            toolkit.get_action('datastore_delete')(
                context,
                {'resource_id': resource['id'], 'force': True}
            )
        except toolkit.ObjectNotFound:
            logger.info('Datastore not found for resource {0}.'.format(
                resource['id']))
        except Exception as e:
            logger.exception(e)

        logger.info('Creating: {0}.'.format(resource['id']))

        # generates chunks of data that can be loaded into ckan
        # n is the maximum size of a chunk
        def chunky(iterable, n):
            it = iter(iterable)
            while True:
                chunk = list(
                    itertools.imap(
                        dict, itertools.islice(it, n)))
                if not chunk:
                    return
                yield chunk

        count = 0
        try:
            for data in chunky(row_set.dicts(), 100):
                count += len(data)
                send_request(data)
        except Exception as e:
            logger.exception(e)
            os.remove(result['saved_file'])
            return {'success': False,
                    'resource': resource['id'],
                    'error': 'Error pushing data to datastore'}

        logger.info("There should be {n} entries in {res_id}.".format(
            n=count,
            res_id=resource['id']
        ))

        resource.update({
            'webstore_url': 'active',
            'webstore_last_updated': datetime.now().isoformat()
        })

        toolkit.get_action('resource_update')(context, resource)
        os.remove(result['saved_file'])
        return {'success': True,
                'resource': resource['id'],
                'error': None}
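
The CKAN-flavoured snippets above also register two custom processors, datetime_procesor and stringify_processor, which are not part of messytables and are not shown. A messytables processor is simply a callable taking (row_set, row) and returning the (possibly modified) row, so plausible sketches might be:

import datetime

def stringify_processor():
    # Hypothetical: render every non-empty cell value as a string.
    def apply_stringify(row_set, row):
        for cell in row:
            if cell.value is None:
                cell.value = ''
            elif not isinstance(cell.value, str):
                cell.value = str(cell.value)
        return row
    return apply_stringify

def datetime_procesor():
    # Hypothetical: serialise date/datetime cells to ISO 8601 strings.
    def apply_datetime(row_set, row):
        for cell in row:
            if isinstance(cell.value, (datetime.date, datetime.datetime)):
                cell.value = cell.value.isoformat()
        return row
    return apply_datetime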
Example #28
def parse_csv(filename, cfg_in):
    """
    Guess csv structure

    :param filename:
    :param cfg_in:
    :param known_structure: list of string formats in column order, starting
        from the first column; it need not cover all columns (the rest are
        auto-detected)
    :return: lst_types, offset, headers


    * quotechar - specifies a one-character string to use as the
        quoting character.  It defaults to '"'.
    * delimiter - specifies a one-character string to use as the
        field separator.  It defaults to ','.
    * skipinitialspace - specifies how to interpret whitespace which
        immediately follows a delimiter.  It defaults to False, which
        means that whitespace immediately following a delimiter is part
        of the following field.
    * lineterminator -  specifies the character sequence which should
        terminate rows.
    * quoting - controls when quotes should be generated by the writer.
        It can take on any of the following module constants:

        csv.QUOTE_MINIMAL means only when required, for example, when a
            field contains either the quotechar or the delimiter
        csv.QUOTE_ALL means that quotes are always placed around fields.
        csv.QUOTE_NONNUMERIC means that quotes are always placed around
            fields which do not parse as integers or floating point
            numbers.
        csv.QUOTE_NONE means that quotes are never placed around fields.
    * escapechar - specifies a one-character string used to escape
        the delimiter when quoting is set to QUOTE_NONE.
    * doublequote - controls the handling of quotes inside fields.  When
        True, two consecutive quotes are interpreted as one during read,
        and when writing, each quote character embedded in the data is
        written as two quotes
    Example:
    parse_csv(filename, ['%H:%M:%S'])
    """
    set_field_if_no(cfg_in, 'types', [])
    set_field_if_no(cfg_in, 'delimiter')
    with open(filename, 'rb') as fh:
        ext = os_path.splitext(filename)[1]
        # Load a file object:
        try:
            # If you are sure that file is csv use CSVTableSet(fh)
            from magic import MagicException  # because any_tableset uses libmagic
            table_set = any_tableset(fh,
                                     mimetype=None,
                                     extension=ext,
                                     delimiter=cfg_in['delimiter'])
        except (ImportError, MagicException) as e:
            print('There was an error: ', standard_error_info(e),
                  '\n=> Loading file as csv without trying other formats')
            table_set = CSVTableSet(fh, delimiter=cfg_in['delimiter'])

        # A table set is a collection of tables:
        row_set = table_set.tables[0]
        # A row set is an iterator over the table, but it can only
        # be run once. To peek, a sample is provided:

        # guess header names and the offset of the header:
        offset, headers = headers_guess(row_set.sample)  # tolerance=1
        row_set.register_processor(headers_processor(headers))
        # add one to begin with content, not the header:
        row_set.register_processor(offset_processor(offset + 1))
        # guess column types:
        lst_types = type_guess(row_set.sample, strict=True)
        row_sample = next(row_set.sample)

        # check not detected types
        def formats2types(formats_str):
            for f in formats_str:
                if f:
                    if is_date_format(f):
                        yield (types.DateType(f))
                    else:
                        yield (TimeType())
                else:
                    yield (None)

        known_types = formats2types(cfg_in['types'])

        for n, (t, s, kt) in enumerate(zip(lst_types, row_sample,
                                           known_types)):
            if t.result_type == types.StringType.result_type:
                # not auto detected? -> check known_types
                if kt.test(s.value):
                    lst_types[n] = kt  # t= kt
                else:  # the known type does not fit this element
                    print('column #{:d} value "{}" does not match the '
                          'provided type {}'.format(n, s.value, type(kt)))
                    # kt = types.DateType('mm/dd/yyyy')
                    # kt.test('0'+s.value)
                    # detect?
            else:
                pass
        # not works for time type:
        # print(jts.headers_and_typed_as_jts(headers,
        #       list(map(jts.celltype_as_string, lst_types))).as_json())
        return lst_types, offset, headers
Example #29
 def test_libreoffice_xlsx(self):
     fh = horror_fobj('libreoffice.xlsx')
     table_set = any_tableset(fh)
     row_set = table_set.tables[0]
     data = list(row_set)
     assert_equal(0, len(data))
Example #30
def _datastorer_upload(context, resource, logger):
    result = download(context, resource, data_formats=DATA_FORMATS)

    content_type = result['headers'].get('content-type', '')\
                                    .split(';', 1)[0]  # remove parameters

    f = open(result['saved_file'], 'rb')
    table_sets = any_tableset(f,
                              mimetype=content_type,
                              extension=resource['format'].lower())

    ## only first sheet in xls for the time being
    row_set = table_sets.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(datetime_procesor())

    logger.info('Header offset: {0}.'.format(offset))

    guessed_types = type_guess(row_set.sample, [
        messytables.types.StringType, messytables.types.IntegerType,
        messytables.types.FloatType, messytables.types.DecimalType,
        messytables.types.DateUtilType
    ],
                               strict=True)
    logger.info('Guessed types: {0}'.format(guessed_types))
    row_set.register_processor(types_processor(guessed_types, strict=True))
    row_set.register_processor(stringify_processor())

    ckan_url = context['site_url'].rstrip('/')

    datastore_create_request_url = '%s/api/action/datastore_create' % (
        ckan_url)

    guessed_type_names = [TYPE_MAPPING[type(gt)] for gt in guessed_types]

    def send_request(data):
        request = {
            'resource_id':
            resource['id'],
            'fields': [
                dict(id=name, type=typename)
                for name, typename in zip(headers, guessed_type_names)
            ],
            'force':
            True,
            'records':
            data
        }
        response = requests.post(
            datastore_create_request_url,
            data=json.dumps(request),
            headers={
                'Content-Type': 'application/json',
                'Authorization': context['apikey']
            },
        )
        check_response_and_retry(response, datastore_create_request_url,
                                 logger)

    # Delete any existing data before proceeding. Otherwise 'datastore_create' will
    # append to the existing datastore. And if the fields have significantly changed,
    # it may also fail.
    try:
        logger.info(
            'Deleting existing datastore (it may not exist): {0}.'.format(
                resource['id']))
        response = requests.post('%s/api/action/datastore_delete' % (ckan_url),
                                 data=json.dumps({
                                     'resource_id': resource['id'],
                                     'force': True
                                 }),
                                 headers={
                                     'Content-Type': 'application/json',
                                     'Authorization': context['apikey']
                                 })
        if not response.status_code or response.status_code not in (200, 404):
            # skips 200 (OK) or 404 (datastore does not exist, no need to delete it)
            logger.error('Deleting existing datastore failed: {0}'.format(
                get_response_error(response)))
            raise DatastorerException("Deleting existing datastore failed.")
    except requests.exceptions.RequestException as e:
        logger.error('Deleting existing datastore failed: {0}'.format(str(e)))
        raise DatastorerException("Deleting existing datastore failed.")

    logger.info('Creating: {0}.'.format(resource['id']))

    # generates chunks of data that can be loaded into ckan
    # n is the maximum size of a chunk
    def chunky(iterable, n):
        it = iter(iterable)
        while True:
            chunk = list(itertools.imap(dict, itertools.islice(it, n)))
            if not chunk:
                return
            yield chunk

    count = 0
    for data in chunky(row_set.dicts(), 100):
        count += len(data)
        send_request(data)

    logger.info("There should be {n} entries in {res_id}.".format(
        n=count, res_id=resource['id']))

    ckan_request_url = ckan_url + '/api/action/resource_update'

    resource.update({
        'webstore_url':
        'active',
        'webstore_last_updated':
        datetime.datetime.now().isoformat()
    })

    response = requests.post(ckan_request_url,
                             data=json.dumps(resource),
                             headers={
                                 'Content-Type': 'application/json',
                                 'Authorization': context['apikey']
                             })

    if response.status_code not in (201, 200):
        raise DatastorerException(
            'Ckan bad response code (%s). Response was %s' %
            (response.status_code, response.content))
Example #31
def check_filename(d):
    if not d['tableset']:
        raise SkipTest("Optional library not installed. Skipping")
    fh = horror_fobj(d['filename'])
    table_set = any_tableset(fh, extension=d['filename'], auto_detect=False)
    assert isinstance(table_set, d['tableset']), type(table_set)
Example #32
def check_no_filename(d):
    fh = horror_fobj(d['filename'])
    table_set = any_tableset(fh)
    assert isinstance(table_set, d['tableset']), type(table_set)
Example #33
 def test_scraperwiki_xlsx(self):
     fh = horror_fobj('sw_gen.xlsx')
     table_set = any_tableset(fh)
     row_set = table_set.tables[0]
     data = list(row_set)
     assert_equal(16, len(data))
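
The horror_fobj helper used throughout the test snippets appears to come from the messytables test suite and simply opens a named fixture file in binary mode. A hypothetical equivalent, assuming the fixture files live in a local directory:

import os

def horror_fobj(name, fixture_dir='horror'):
    # Hypothetical test helper: open a fixture file in binary mode.
    return open(os.path.join(fixture_dir, name), 'rb')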
Example #34
def push_to_datastore(task_id, input, dry_run=False):
    '''Download and parse a resource push its data into CKAN's DataStore.

    An asynchronous job that gets a resource from CKAN, downloads the
    resource's data file and, if the data file has changed since last time,
    parses the data and posts it into CKAN's DataStore.

    :param dry_run: Fetch and parse the data file but don't actually post the
        data to the DataStore, instead return the data headers and rows that
        would have been posted.
    :type dry_run: boolean

    '''
    handler = util.StoringHandler(task_id, input)
    logger = logging.getLogger(task_id)
    logger.addHandler(handler)
    logger.setLevel(logging.DEBUG)

    validate_input(input)

    data = input['metadata']

    ckan_url = data['ckan_url']
    resource_id = data['resource_id']
    api_key = input.get('api_key')

    try:
        resource = get_resource(resource_id, ckan_url, api_key)
    except util.JobError as e:
        # try again in 5 seconds just in case CKAN is slow at adding resource
        time.sleep(5)
        resource = get_resource(resource_id, ckan_url, api_key)

    # check if the resource url_type is a datastore
    if resource.get('url_type') == 'datastore':
        logger.info('Dump files are managed with the Datastore API')
        return

    # check scheme
    url = resource.get('url')
    scheme = urlsplit(url).scheme
    if scheme not in ('http', 'https', 'ftp'):
        raise util.JobError(
            'Only http, https, and ftp resources may be fetched.'
        )

    # fetch the resource data
    logger.info('Fetching from: {0}'.format(url))
    headers = {}
    if resource.get('url_type') == 'upload':
        # If this is a file uploaded to CKAN, authenticate the request,
        # otherwise we won't get the file from private resources
        headers['Authorization'] = api_key
    try:
        response = requests.get(
            url,
            headers=headers,
            timeout=DOWNLOAD_TIMEOUT,
            verify=SSL_VERIFY,
            stream=True,  # just gets the headers for now
        )
        response.raise_for_status()

        cl = response.headers.get('content-length')
        try:
            if cl and int(cl) > MAX_CONTENT_LENGTH:
                raise util.JobError(
                    'Resource too large to download: {cl} > max ({max_cl}).'
                    .format(cl=cl, max_cl=MAX_CONTENT_LENGTH))
        except ValueError:
            pass

        tmp = tempfile.TemporaryFile()
        length = 0
        m = hashlib.md5()
        for chunk in response.iter_content(CHUNK_SIZE):
            length += len(chunk)
            if length > MAX_CONTENT_LENGTH:
                raise util.JobError(
                    'Resource too large to process: {cl} > max ({max_cl}).'
                    .format(cl=length, max_cl=MAX_CONTENT_LENGTH))
            tmp.write(chunk)
            m.update(chunk)

        ct = response.headers.get('content-type', '').split(';', 1)[0]

    except requests.HTTPError as e:
        raise HTTPError(
            "DataPusher received a bad HTTP response when trying to download "
            "the data file", status_code=e.response.status_code,
            request_url=url, response=e.response.content)
    except requests.RequestException as e:
        raise HTTPError(
            message=str(e), status_code=None,
            request_url=url, response=None)

    file_hash = m.hexdigest()
    tmp.seek(0)

    if (resource.get('hash') == file_hash
            and not data.get('ignore_hash')):
        logger.info("The file hash hasn't changed: {hash}.".format(
            hash=file_hash))
        return

    resource['hash'] = file_hash

    try:
        table_set = messytables.any_tableset(tmp, mimetype=ct, extension=ct)
    except messytables.ReadError as e:
        # try again with format
        tmp.seek(0)
        try:
            format = resource.get('format')
            table_set = messytables.any_tableset(tmp, mimetype=format, extension=format)
        except:
            raise util.JobError(e)

    get_row_set = web.app.config.get('GET_ROW_SET',
                                     lambda table_set: table_set.tables.pop())
    row_set = get_row_set(table_set)
    offset, headers = messytables.headers_guess(row_set.sample)

    existing = datastore_resource_exists(resource_id, api_key, ckan_url)
    existing_info = None
    if existing:
        existing_info = dict((f['id'], f['info'])
            for f in existing.get('fields', []) if 'info' in f)

    # Some headers might have been converted from strings to floats and such.
    headers = [str(header) for header in headers]

    row_set.register_processor(messytables.headers_processor(headers))
    row_set.register_processor(messytables.offset_processor(offset + 1))
    types = messytables.type_guess(row_set.sample, types=TYPES, strict=True)

    # override with types user requested
    if existing_info:
        types = [{
            'text': messytables.StringType(),
            'numeric': messytables.DecimalType(),
            'timestamp': messytables.DateUtilType(),
            }.get(existing_info.get(h, {}).get('type_override'), t)
            for t, h in zip(types, headers)]

    row_set.register_processor(messytables.types_processor(types))

    headers = [header.strip() for header in headers if header.strip()]
    headers_set = set(headers)

    def row_iterator():
        for row in row_set:
            data_row = {}
            for index, cell in enumerate(row):
                column_name = cell.column.strip()
                if column_name not in headers_set:
                    continue
                if isinstance(cell.value, str):
                    try:
                        data_row[column_name] = cell.value.encode('latin-1').decode('utf-8')
                    except (UnicodeDecodeError, UnicodeEncodeError):
                        data_row[column_name] = cell.value
                else:
                    data_row[column_name] = cell.value
            yield data_row
    result = row_iterator()

    '''
    Delete the existing datastore resource before proceeding. Otherwise
    'datastore_create' will append to the existing datastore. And if
    the fields have significantly changed, it may also fail.
    '''
    if existing:
        logger.info('Deleting "{res_id}" from datastore.'.format(
            res_id=resource_id))
        delete_datastore_resource(resource_id, api_key, ckan_url)

    headers_dicts = [dict(id=field[0], type=TYPE_MAPPING[str(field[1])])
                     for field in zip(headers, types)]

    # Maintain data dictionaries from matching column names
    if existing_info:
        for h in headers_dicts:
            if h['id'] in existing_info:
                h['info'] = existing_info[h['id']]
                # create columns with types user requested
                type_override = existing_info[h['id']].get('type_override')
                if type_override in list(_TYPE_MAPPING.values()):
                    h['type'] = type_override

    logger.info('Determined headers and types: {headers}'.format(
        headers=headers_dicts))

    if dry_run:
        return headers_dicts, result

    count = 0
    for i, chunk in enumerate(chunky(result, 250)):
        records, is_it_the_last_chunk = chunk
        count += len(records)
        logger.info('Saving chunk {number} {is_last}'.format(
            number=i, is_last='(last)' if is_it_the_last_chunk else ''))
        send_resource_to_datastore(resource, headers_dicts, records,
                                   is_it_the_last_chunk, api_key, ckan_url)

    logger.info('Successfully pushed {n} entries to "{res_id}".'.format(
        n=count, res_id=resource_id))

    if data.get('set_url_type', False):
        update_resource(resource, api_key, ckan_url)
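
Unlike the inline chunky generators defined in the earlier CKAN examples, the chunky(result, 250) used here yields (records, is_it_the_last_chunk) pairs. Its definition is not shown; a plausible sketch:

import itertools

def chunky(iterable, n):
    # Hypothetical: yield (chunk, is_last_chunk) pairs with at most n items.
    it = iter(iterable)
    chunk = list(itertools.islice(it, n))
    while chunk:
        next_chunk = list(itertools.islice(it, n))
        yield chunk, not next_chunk
        chunk = next_chunk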
Example #35
 def test_simple_csv(self):
     fh = horror_fobj('simple.csv')
     table_set = any_tableset(fh, extension='csv')
     assert isinstance(table_set, CSVTableSet)
Example #36
 def test_libreoffice_xlsx(self):
     fh = horror_fobj('libreoffice.xlsx')
     table_set = any_tableset(fh)
     row_set = table_set.tables[0]
     data = list(row_set)
     assert_equal(0, len(data))
Example #37
 def test_scraperwiki_xlsx(self):
     fh = horror_fobj('sw_gen.xlsx')
     table_set = any_tableset(fh)
     row_set = table_set.tables[0]
     data = list(row_set)
     assert_equal(16, len(data))
Example #38
def check_no_filename(d):
    if not d['tableset']:
        raise SkipTest("Optional library not installed. Skipping")
    fh = horror_fobj(d['filename'])
    table_set = any_tableset(fh)
    assert isinstance(table_set, d['tableset']), type(table_set)
Example #39
    f = cStringIO.StringIO(file_content)
    ##

    #f = cStringIO.StringIO(response.read())
    file_hash = hashlib.md5(f.read()).hexdigest()
    f.seek(0)

    if (resource.get('hash') == file_hash and not data.get('ignore_hash')):
        logger.info(
            "The file hash hasn't changed: {hash}.".format(hash=file_hash))
        return

    resource['hash'] = file_hash

    try:
        table_set = messytables.any_tableset(f, mimetype=ct, extension=ct)
    except messytables.ReadError as e:
        ## try again with format
        f.seek(0)
        try:
            format = resource.get('format')
            table_set = messytables.any_tableset(f,
                                                 mimetype=format,
                                                 extension=format)
        except:
            raise util.JobError(e)

    row_set = table_set.tables.pop()
    offset, headers = messytables.headers_guess(row_set.sample)

    existing = datastore_resource_exists(resource_id, api_key, ckan_url)
Example #40
def validate_file(file_tmp, file_name, tmp_filepath):

    log.info("upload: checking file * %s * ", file_name)
    MAX_HEADER_LENGTH = 64
    # not allowed characters ( - ' " ’ ‘) regex
    inappropriate_chars = re.compile(r"[\-|\'|\"|\u2018|\u2019]")
    datastore_ext = config.get('ckan.mimetype_guess', "csv xls xlsx tsv")
    tmp_file_name, tmp_file_ext = os.path.splitext(file_name)

    tmp_file_ext_str = tmp_file_ext[1:].lower()
    #check if datastore file (csv xls xlsx tsv)
    if tmp_file_ext_str in datastore_ext:
        try:
            table_set = any_tableset(file_tmp)
        except Exception:
            log.info("file is not valid * %s * ", file_name)
            raise logic.ValidationError({'upload': ['The file is not valid']})
        #check if only one data sheet in the file
        if len(table_set.tables) > 1:
            rollback_tmp(file_tmp, tmp_filepath)
            log.error(
                "upload: the file * %s * was not uploaded - There is more than one data sheet in the file",
                file_name)
            raise logic.ValidationError({
                'upload': [_('There is more than one data sheet in the file')]
            })
        #check if table_set is not empty
        elif len(table_set.tables) > 0:
            row_set = table_set.tables[0]
            # guess header names and the offset of the header:
            offset, headers = headers_guess(row_set.sample)
            row_set.register_processor(headers_processor(headers))
            for header in headers:
                # too long header
                if len(header) > MAX_HEADER_LENGTH:
                    rollback_tmp(file_tmp, tmp_filepath)
                    log.error(
                        "upload: the file * %s * was not uploaded - too long header - * %s *",
                        file_name, header)
                    raise logic.ValidationError(
                        {'upload': [_('too long header (64 max)')]})
                # not allowed characters in header ( - ' " ’ ‘)
                #if inappropriate_chars.search(header):
                #    rollback_tmp(file_tmp, tmp_filepath)
                #    log.error("upload: the file * %s * was not uploaded - there are inappropriate characters in headers * %s *",
                #              file_name, header)
                #    raise logic.ValidationError(
                #        {'upload': [_('there are inappropriate characters in headers (apostrophe/apostrophes/dash)')]}
                #    )
            # Check for duplicate fields
            unique_fields = set(headers)
            if not len(unique_fields) == len(headers):
                rollback_tmp(file_tmp, tmp_filepath)
                log.error(
                    "upload: the file * %s * was not uploaded - Duplicate column names are not supported",
                    file_name)
                raise logic.ValidationError({
                    'upload': [_('Duplicate column names are not supported')]
                })

            log.info(
                "passed validation successfully - the file * %s * was uploaded to CKAN (filestore)",
                file_name)
        else:
            log.info(
                "no table_set was created by messytables - skipping header validation for the file * %s * ",
                file_name)

    else:
        pass
Example #41
    def test_unknown(self):
        fh = horror_fobj('simple.unknown')
        self.assertRaises(ValueError,
                          lambda: any_tableset(fh, extension='unknown'))
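
An unrecognised extension makes any_tableset raise, which is why the loader examples here retry with an explicit mimetype. A minimal sketch of that fallback pattern (the path and fallback mimetype are placeholders, not part of any of the projects above):

import os
import messytables

def open_tableset(path, fallback_mimetype='text/csv'):
    """Try extension-based detection first, then fall back to an explicit
    mimetype - a sketch of the retry pattern used in the loader examples."""
    fh = open(path, 'rb')
    extension = os.path.splitext(path)[1][1:]
    try:
        return messytables.any_tableset(fh, extension=extension)
    except (ValueError, messytables.ReadError):
        fh.seek(0)
        return messytables.any_tableset(fh, mimetype=fallback_mimetype)
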
Example #42
    file_hash = m.hexdigest()
    tmp.seek(0)

    if (resource.get('hash') == file_hash and not data.get('ignore_hash')):
        logger.info(
            "The file hash hasn't changed: {hash}.".format(hash=file_hash))
        return

    resource['hash'] = file_hash

    # Decode the data if needed
    decoded_tmp = force_decode(tmp)

    try:
        table_set = messytables.any_tableset(decoded_tmp,
                                             mimetype=ct,
                                             extension=ct)
    except messytables.ReadError as e:
        ## try again with format
        decoded_tmp.seek(0)
        try:
            format = resource.get('format')
            table_set = messytables.any_tableset(decoded_tmp,
                                                 mimetype=format,
                                                 extension=format)
        except Exception:
            raise util.JobError(e)

    row_set = table_set.tables.pop()
    offset, headers = messytables.headers_guess(row_set.sample)
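
force_decode is project-specific and not shown in this excerpt. A purely illustrative sketch of what such a helper could do (detect the encoding with chardet and re-encode to UTF-8, as the load_csv example further down also suggests); the real implementation may differ:

import io
import chardet  # assumption: chardet is available, as mentioned in load_csv below

def force_decode(fileobj):
    """Return a seekable, UTF-8 encoded copy of fileobj's contents.
    Illustrative sketch only - not the project's actual force_decode."""
    raw = fileobj.read()
    encoding = chardet.detect(raw).get('encoding') or 'utf-8'
    return io.BytesIO(raw.decode(encoding, errors='replace').encode('utf-8'))
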
Example #43
def check_filename(d):
    fh = horror_fobj(d['filename'])
    table_set = any_tableset(fh, extension=d['filename'], auto_detect=False)
    assert isinstance(table_set, d['tableset']), type(table_set)
Example #44
    def test_simple_xlsx(self):
        fh = horror_fobj('simple.xlsx')
        table_set = any_tableset(fh, extension='xlsx')
        assert isinstance(table_set, XLSXTableSet)
Example #45
def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
    '''Loads an Excel file (or other tabular data recognized by messytables)
    into Datastore and creates indexes.

    Largely copied from datapusher - see below. Is slower than load_csv.
    '''

    # use messytables to determine the header row
    logger.info('Determining column names and types')
    ct = mimetype
    format = os.path.splitext(table_filepath)[1]  # filename extension
    with open(table_filepath, 'rb') as tmp:

        #
        # Copied from datapusher/jobs.py:push_to_datastore
        #

        try:
            table_set = messytables.any_tableset(tmp,
                                                 mimetype=ct,
                                                 extension=ct)
        except messytables.ReadError as e:
            # try again with format
            tmp.seek(0)
            try:
                table_set = messytables.any_tableset(tmp,
                                                     mimetype=format,
                                                     extension=format)
            except Exception as e:
                raise LoaderError(e)

        if not table_set.tables:
            raise LoaderError('Could not parse file as tabular data')
        row_set = table_set.tables.pop()
        offset, headers = messytables.headers_guess(row_set.sample)

        existing = datastore_resource_exists(resource_id)
        existing_info = None
        if existing:
            existing_info = dict((f['id'], f['info'])
                                 for f in existing.get('fields', [])
                                 if 'info' in f)

        # Some headers might have been converted from strings to floats and such.
        headers = encode_headers(headers)

        row_set.register_processor(messytables.headers_processor(headers))
        row_set.register_processor(messytables.offset_processor(offset + 1))
        TYPES, TYPE_MAPPING = get_types()
        types = messytables.type_guess(row_set.sample,
                                       types=TYPES,
                                       strict=True)

        # override with types user requested
        if existing_info:
            types = [{
                'text': messytables.StringType(),
                'numeric': messytables.DecimalType(),
                'timestamp': messytables.DateUtilType(),
            }.get(existing_info.get(h, {}).get('type_override'), t)
                     for t, h in zip(types, headers)]

        row_set.register_processor(messytables.types_processor(types))

        headers = [
            header.strip()[:MAX_COLUMN_LENGTH] for header in headers
            if header.strip()
        ]
        headers_set = set(headers)

        def row_iterator():
            for row in row_set:
                data_row = {}
                for index, cell in enumerate(row):
                    column_name = cell.column.strip()
                    if column_name not in headers_set:
                        continue
                    data_row[column_name] = cell.value
                yield data_row

        result = row_iterator()
        '''
        Delete the existing datastore resource before proceeding. Otherwise
        'datastore_create' will append to the existing datastore. And if
        the fields have significantly changed, it may also fail.
        '''
        if existing:
            logger.info('Deleting "{res_id}" from datastore.'.format(
                res_id=resource_id))
            delete_datastore_resource(resource_id)

        headers_dicts = [
            dict(id=field[0], type=TYPE_MAPPING[str(field[1])])
            for field in zip(headers, types)
        ]

        # Maintain data dictionaries from matching column names
        if existing_info:
            for h in headers_dicts:
                if h['id'] in existing_info:
                    h['info'] = existing_info[h['id']]
                    # create columns with types user requested
                    type_override = existing_info[h['id']].get('type_override')
                    if type_override in _TYPE_MAPPING.values():
                        h['type'] = type_override

        logger.info('Determined headers and types: {headers}'.format(
            headers=headers_dicts))

        ### Commented - this is only for tests
        # if dry_run:
        #     return headers_dicts, result

        logger.info('Copying to database...')
        count = 0
        for i, records in enumerate(chunky(result, 250)):
            count += len(records)
            logger.info('Saving chunk {number}'.format(number=i))
            send_resource_to_datastore(resource_id, headers_dicts, records)
        logger.info('...copying done')

        if count:
            logger.info(
                'Successfully pushed {n} entries to "{res_id}".'.format(
                    n=count, res_id=resource_id))
        else:
            # no datastore table is created
            raise LoaderError('No entries found - nothing to load')
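
The type-override comprehension in the example above is dense. A small worked sketch with hypothetical headers and a hypothetical Data Dictionary override, showing that only the overridden column changes type while the others keep their guessed types:

import messytables

# Hypothetical guessed types and an override for the 'created' column.
headers = ['id', 'created']
types = [messytables.StringType(), messytables.StringType()]
existing_info = {'created': {'type_override': 'timestamp'}}

override_map = {
    'text': messytables.StringType(),
    'numeric': messytables.DecimalType(),
    'timestamp': messytables.DateUtilType(),
}
types = [override_map.get(existing_info.get(h, {}).get('type_override'), t)
         for t, h in zip(types, headers)]
# 'id' keeps its guessed StringType; 'created' becomes DateUtilType.
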
Example #46
    def test_xlsm(self):
        fh = horror_fobj('bian-anal-mca-2005-dols-eng-1011-0312-tab3.xlsm')
        table_set = any_tableset(fh, extension='xls')
        row_set = table_set.tables[0]
        data = list(row_set)
        assert_equal(62, len(data))
Example #47
def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
    '''Loads a CSV into DataStore. Does not create the indexes.'''

    # use messytables to determine the header row
    extension = os.path.splitext(csv_filepath)[1]

    tempdir = tempfile.mkdtemp(suffix=resource_id)
    if extension.lower() == '.zip':
        with zipfile.ZipFile(csv_filepath, "r") as zip_ref:
            csvfiles = [
                file for file in zip_ref.filelist
                if file.filename.lower().endswith('.csv')
            ]
            if len(csvfiles) == 0:
                logger.info("no csvfiles found in %s" % csv_filepath)
            if len(csvfiles) > 0:
                if len(csvfiles) > 1:
                    logger.info(
                        "multiple csv files found in %s, only one will be ingested: %s"
                        % (csv_filepath, csvfiles[0].filename))
                else:
                    logger.info("unzipping %s and ingesting %s" %
                                (csv_filepath, csvfiles[0].filename))

                zip_ref.extract(csvfiles[0], tempdir)
                new_loc = os.path.join(tempdir, csvfiles[0].filename)
                csv_filepath = new_loc
                extension = os.path.splitext(csv_filepath)[1]
                logger.info("unzipped %s" % csvfiles[0].filename)
            # close.
            zip_ref.close()
    with open(csv_filepath, 'rb') as f:
        try:
            table_set = messytables.any_tableset(f,
                                                 mimetype=mimetype,
                                                 extension=extension)
        except messytables.ReadError as e:
            # # try again with format
            # f.seek(0)
            # try:
            #     format = resource.get('format')
            #     table_set = messytables.any_tableset(f, mimetype=format,
            #                                          extension=format)
            # except Exception:
            raise LoaderError('Messytables error: {}'.format(e))

        if not table_set.tables:
            raise LoaderError('Could not detect tabular data in this file')
        row_set = table_set.tables.pop()
        header_offset, headers = messytables.headers_guess(row_set.sample)

    # Some headers might have been converted from strings to floats and such.
    headers = encode_headers(headers)

    # Guess the delimiter used in the file
    with open(csv_filepath, 'r') as f:
        header_line = f.readline()
    try:
        sniffer = csv.Sniffer()
        delimiter = sniffer.sniff(header_line).delimiter
    except csv.Error:
        logger.warning(
            'Could not determine delimiter from file, using default ","')
        delimiter = ','

    # Setup the converters that run when you iterate over the row_set.
    # With pgloader only the headers will be iterated over.
    row_set.register_processor(messytables.headers_processor(headers))
    row_set.register_processor(messytables.offset_processor(header_offset + 1))
    # types = messytables.type_guess(row_set.sample, types=TYPES, strict=True)

    headers = [
        header.strip()[:MAX_COLUMN_LENGTH] for header in headers
        if header.strip()
    ]
    # headers_dicts = [dict(id=field[0], type=TYPE_MAPPING[str(field[1])])
    #                  for field in zip(headers, types)]

    # TODO worry about csv header name problems
    # e.g. duplicate names

    # encoding (and line ending?)- use chardet
    # It is easier to reencode it as UTF8 than convert the name of the encoding
    # to one that pgloader will understand.
    logger.info('Ensuring character coding is UTF8')
    f_write = tempfile.NamedTemporaryFile(suffix=extension, delete=False)
    try:
        with open(csv_filepath, 'rb') as f_read:
            csv_decoder = messytables.commas.UTF8Recoder(f_read, encoding=None)
            for line in csv_decoder:
                f_write.write(line)
            f_write.close()  # ensures the last line is written
            csv_filepath = f_write.name
        logger.info('Ensuring character coding is UTF8 complete')
        # check tables exists

        # datastore db connection
        engine = get_write_engine()

        # get column info from existing table
        existing = datastore_resource_exists(resource_id)
        existing_info = {}
        if existing:
            existing_info = dict((f['id'], f['info'])
                                 for f in existing.get('fields', [])
                                 if 'info' in f)
            '''
            Delete existing datastore table before proceeding. Otherwise
            the COPY will append to the existing table. And if
            the fields have significantly changed, it may also fail.
            '''
            logger.info('Deleting "{res_id}" from DataStore.'.format(
                res_id=resource_id))
            delete_datastore_resource(resource_id)

        # Columns types are either set (overridden) in the Data Dictionary page
        # or default to text type (which is robust)
        fields = [
            {'id': header_name,
             'type': existing_info.get(header_name, {})\
             .get('type_override') or 'text',
             }
            for header_name in headers]

        # Maintain data dictionaries from matching column names
        if existing_info:
            for f in fields:
                if f['id'] in existing_info:
                    f['info'] = existing_info[f['id']]

        logger.info('Fields: {}'.format(fields))

        # Create table
        from ckan import model
        context = {'model': model, 'ignore_auth': True}
        data_dict = dict(
            resource_id=resource_id,
            fields=fields,
        )
        data_dict['records'] = None  # just create an empty table
        data_dict['force'] = True  # TODO check this - I don't fully
        # understand read-only/datastore resources
        try:
            p.toolkit.get_action('datastore_create')(context, data_dict)
        except p.toolkit.ValidationError as e:
            if 'fields' in e.error_dict:
                # e.g. {'message': None, 'error_dict': {'fields': [u'"***" is not a valid field name']}, '_error_summary': None}
                error_message = e.error_dict['fields'][0]
                raise LoaderError(
                    'Error with field definition: {}'.format(error_message))
            else:
                raise LoaderError(
                    'Validation error when creating the database table: {}'.
                    format(str(e)))
        except Exception as e:
            raise LoaderError(
                'Could not create the database table: {}'.format(e))
        connection = context['connection'] = engine.connect()
        if not fulltext_trigger_exists(connection, resource_id):
            _create_fulltext_trigger(connection, resource_id)
            logger.info('Trigger created')

        # datastore_active is switched on by datastore_create - TODO temporarily
        # disable it until the load is complete

        # logger.info('Disabling row index trigger')
        _disable_fulltext_trigger(connection, resource_id)
        # logger.info('Dropping indexes')
        _drop_indexes(context, data_dict, False)

        logger.info('Copying to database...')

        # Options for loading into postgres:
        # 1. \copy - can't use as that is a psql meta-command and not accessible
        #    via psycopg2
        # 2. COPY - requires the db user to have superuser privileges. This is
        #    dangerous. It is also not available on AWS, for example.
        # 3. pgloader method? - as described in its docs:
        #    Note that while the COPY command is restricted to read either from
        #    its standard input or from a local file on the server's file
        #    system, the command line tool psql implements a \copy command that
        #    knows how to stream a file local to the client over the network
        #    and into the PostgreSQL server, using the same protocol as
        #    pgloader uses.
        # 4. COPY FROM STDIN - not quite as fast as COPY from a file, but avoids
        #    the superuser issue. <-- picked

        # with psycopg2.connect(DSN) as conn:
        #     with conn.cursor() as curs:
        #         curs.execute(SQL)
        raw_connection = engine.raw_connection()
        try:
            cur = raw_connection.cursor()
            try:
                with open(csv_filepath, 'rb') as f:
                    # can't use :param for table name because params are only
                    # for filter values that are single quoted.
                    try:
                        cur.copy_expert(
                            "COPY \"{resource_id}\" ({column_names}) "
                            "FROM STDIN "
                            "WITH (DELIMITER '{delimiter}', FORMAT csv, HEADER 1, "
                            "      ENCODING '{encoding}');".format(
                                resource_id=resource_id,
                                column_names=', '.join(
                                    ['"{}"'.format(h) for h in headers]),
                                delimiter=delimiter,
                                encoding='UTF8',
                            ), f)
                    except psycopg2.DataError as e:
                        # e is a str but with foreign chars e.g.
                        # 'extra data: "paul,pa\xc3\xbcl"\n'
                        # but logging and exceptions need a normal (7 bit) str
                        error_str = str(e).decode('ascii', 'replace').encode(
                            'ascii', 'replace')
                        logger.warning(error_str)
                        raise LoaderError(
                            'Error during the load into PostgreSQL:'
                            ' {}'.format(error_str))

            finally:
                cur.close()
        finally:
            raw_connection.commit()
    finally:
        os.remove(csv_filepath)  # i.e. the tempfile
        shutil.rmtree(tempdir)
    logger.info('...copying done')

    logger.info('Creating search index...')
    _populate_fulltext(connection, resource_id, fields=fields)
    logger.info('...search index created')

    return fields
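
Option 4 above (COPY FROM STDIN) is implemented deep inside load_csv. A stripped-down sketch of the same psycopg2 copy_expert call, with placeholder DSN, table and column names, to show the design choice in isolation:

import psycopg2

def copy_csv_into_table(dsn, table_name, column_names, csv_path, delimiter=','):
    """Stream a client-side CSV into PostgreSQL with COPY FROM STDIN, which
    avoids the superuser requirement of COPY from a server-side file.
    Sketch only; dsn, table_name and column_names are placeholders."""
    sql = (
        'COPY "{table}" ({columns}) FROM STDIN '
        "WITH (DELIMITER '{delimiter}', FORMAT csv, HEADER 1, ENCODING 'UTF8')"
    ).format(
        table=table_name,
        columns=', '.join('"{0}"'.format(name) for name in column_names),
        delimiter=delimiter)
    connection = psycopg2.connect(dsn)
    try:
        with connection.cursor() as cursor, open(csv_path, 'rb') as f:
            cursor.copy_expert(sql, f)
        connection.commit()
    finally:
        connection.close()
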
Example #48
def check_no_filename(d):
    if not d['tableset']:
        raise SkipTest("Optional library not installed. Skipping")
    fh = horror_fobj(d['filename'])
    table_set = any_tableset(fh)
    assert isinstance(table_set, d['tableset']), type(table_set)
Example #49
def _datastorer_upload(context, resource, logger):
    result = download(context, resource, data_formats=DATA_FORMATS)
    logger.info('Downloaded resource %r' %(resource))

    content_type = result['headers'].get('content-type', '')\
                                    .split(';', 1)[0]  # remove parameters
    
    extension = resource['format'].lower()
    
    fp = open(result['saved_file'], 'rb')
    if zipfile.is_zipfile(result['saved_file']):
        fp, zf = open_zipped_tableset(fp, extension=extension)
        logger.info('Opened entry %s from ZIP archive %s', zf, result['saved_file'])
    else:
        logger.info('Opened file %s' %(result['saved_file']))

    table_sets = any_tableset(fp, extension=extension)
    
    if 'sample_size' in context:
        table_sets.window = max(1000, int(context['sample_size']))
        logger.info('Using a sample window of %d', table_sets.window)

    ## only the first sheet in the xls for the time being
    row_set = table_sets.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(datetime_procesor())

    logger.info('Header offset: {0}.'.format(offset))

    guessed_types = type_guess(
        row_set.sample,
        [
            messytables.types.StringType,
            messytables.types.IntegerType,
            messytables.types.FloatType,
            messytables.types.DecimalType,
            messytables.types.DateUtilType
        ],
        strict=True
    )
    logger.info('Guessed types: {0}'.format(guessed_types))
    row_set.register_processor(types_processor(guessed_types, strict=True))
    row_set.register_processor(stringify_processor())

    ckan_url = context['site_url'].rstrip('/')

    datastore_create_request_url = '%s/api/action/datastore_create' % (ckan_url)

    guessed_type_names = [TYPE_MAPPING[type(gt)] for gt in guessed_types]

    def send_request(data):
        request = {'resource_id': resource['id'],
                   'fields': [dict(id=name, type=typename)
                              for name, typename in zip(headers, guessed_type_names)],
                   'force': True,
                   'records': data}
        response = requests.post(
            datastore_create_request_url,
            data=json.dumps(request),
            headers={'Content-Type': 'application/json',
                     'Authorization': context['apikey']})
        check_response_and_retry(response, datastore_create_request_url, logger)

    # Delete any existing data before proceeding. Otherwise 'datastore_create' will
    # append to the existing datastore. And if the fields have significantly changed,
    # it may also fail.
    try:
        logger.info('Deleting existing datastore (it may not exist): {0}.'.format(resource['id']))
        response = requests.post(
            '%s/api/action/datastore_delete' % (ckan_url),
            data=json.dumps({'resource_id': resource['id'], 'force': True}),
            headers={'Content-Type': 'application/json',
                     'Authorization': context['apikey']})
        if not response.status_code or response.status_code not in (200, 404):
            # skips 200 (OK) or 404 (datastore does not exist, no need to delete it)
            logger.error('Deleting existing datastore failed: {0}'.format(get_response_error(response)))
            raise DatastorerException("Deleting existing datastore failed.")
    except requests.exceptions.RequestException as e:
        logger.error('Deleting existing datastore failed: {0}'.format(str(e)))
        raise DatastorerException("Deleting existing datastore failed.")

    logger.info('Creating: {0}.'.format(resource['id']))

    # generates chunks of data that can be loaded into ckan
    # n is the maximum size of a chunk
    def chunky(iterable, n):
        it = iter(iterable)
        while True:
            chunk = list(
                itertools.imap(
                    dict, itertools.islice(it, n)))
            if not chunk:
                return
            yield chunk

    count = 0
    for data in chunky(row_set.dicts(), 100):
        count += len(data)
        send_request(data)

    logger.info("There should be {n} entries in {res_id}.".format(n=count, res_id=resource['id']))

    ckan_request_url = ckan_url + '/api/action/resource_update'

    resource.update({
        'webstore_url': 'active',
        'webstore_last_updated': datetime.datetime.now().isoformat()
    })

    response = requests.post(
        ckan_request_url,
        data=json.dumps(resource),
        headers={'Content-Type': 'application/json',
                 'Authorization': context['apikey']})

    if response.status_code not in (201, 200):
        raise DatastorerException('Ckan bad response code (%s). Response was %s' %
                             (response.status_code, response.content))
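
The chunky generator above depends on itertools.imap, which only exists in Python 2. A minimal sketch of an equivalent helper for Python 3, where the built-in map is already lazy:

import itertools

def chunky(iterable, n):
    """Yield lists of at most n dicts from iterable (Python 3 version of the
    helper above; itertools.imap was removed, map is lazy by default)."""
    it = iter(iterable)
    while True:
        chunk = list(map(dict, itertools.islice(it, n)))
        if not chunk:
            return
        yield chunk
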
Example #50
    ct = response.info().getheader('content-type').split(';', 1)[0]

    f = cStringIO.StringIO(response.read())
    file_hash = hashlib.md5(f.read()).hexdigest()
    f.seek(0)

    if (resource.get('hash') == file_hash
            and not data.get('ignore_hash')):
        logger.info("The file hash hasn't changed: {hash}.".format(
            hash=file_hash))
        return

    resource['hash'] = file_hash

    try:
        table_set = messytables.any_tableset(f, mimetype=ct, extension=ct)
    except messytables.ReadError as e:
        ## try again with format
        f.seek(0)
        try:
            format = resource.get('format')
            table_set = messytables.any_tableset(f, mimetype=format, extension=format)
        except Exception:
            raise util.JobError(e)

    row_set = table_set.tables.pop()
    offset, headers = messytables.headers_guess(row_set.sample)
    row_set.register_processor(messytables.headers_processor(headers))
    row_set.register_processor(messytables.offset_processor(offset + 1))
    types = messytables.type_guess(row_set.sample, types=TYPES, strict=True)
    row_set.register_processor(messytables.types_processor(types))
Example #51
def check_filename(d):
    if not d['tableset']:
        raise SkipTest("Optional library not installed. Skipping")
    fh = horror_fobj(d['filename'])
    table_set = any_tableset(fh, extension=d['filename'], auto_detect=False)
    assert isinstance(table_set, d['tableset']), type(table_set)