Example #1
def send_resource_to_datastore(resource_id, headers, records):
    """
    Stores records in CKAN datastore
    """
    request = {'resource_id': resource_id,
               'fields': headers,
               'force': True,
               'records': records}

    from ckan import model
    context = {'model': model, 'ignore_auth': True}
    try:
        p.toolkit.get_action('datastore_create')(context, request)
    except p.toolkit.ValidationError as e:
        raise LoaderError('Validation error writing rows to db: {}'
                          .format(str(e)))
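
A minimal usage sketch for the helper above. The resource id, field definitions and records are made-up values, and it assumes the CKAN datastore plugin is active and `LoaderError` is defined in the surrounding module:

# Hypothetical call - the resource id and data are illustrative only.
headers = [{'id': 'name', 'type': 'text'},
           {'id': 'age', 'type': 'numeric'}]
records = [{'name': 'Alice', 'age': 34},
           {'name': 'Bob', 'age': 29}]
try:
    send_resource_to_datastore('my-resource-id', headers, records)
except LoaderError as e:
    print('Load failed: {}'.format(e))
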
Example #2
def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
    '''Loads a CSV into DataStore. Does not create the indexes.'''

    # use messytables to determine the header row
    extension = os.path.splitext(csv_filepath)[1]

    tempdir = tempfile.mkdtemp(suffix=resource_id)
    if extension.lower() == '.zip':
        with zipfile.ZipFile(csv_filepath, "r") as zip_ref:
            csvfiles = [
                file for file in zip_ref.filelist
                if file.filename.lower().endswith('.csv')
            ]
            if not csvfiles:
                logger.info("no csv files found in %s" % csv_filepath)
            else:
                if len(csvfiles) > 1:
                    logger.info(
                        "multiple csv files found in %s, only one will be ingested: %s"
                        % (csv_filepath, csvfiles[0].filename))
                else:
                    logger.info("unzipping %s and ingesting %s" %
                                (csv_filepath, csvfiles[0].filename))

                zip_ref.extract(csvfiles[0], tempdir)
                new_loc = os.path.join(tempdir, csvfiles[0].filename)
                csv_filepath = new_loc
                extension = os.path.splitext(csv_filepath)[1]
                logger.info("unzipped %s" % csvfiles[0].filename)
    with open(csv_filepath, 'rb') as f:
        try:
            table_set = messytables.any_tableset(f,
                                                 mimetype=mimetype,
                                                 extension=extension)
        except messytables.ReadError as e:
            # # try again with format
            # f.seek(0)
            # try:
            #     format = resource.get('format')
            #     table_set = messytables.any_tableset(f, mimetype=format,
            #                                          extension=format)
            # except Exception:
            raise LoaderError('Messytables error: {}'.format(e))

        if not table_set.tables:
            raise LoaderError('Could not detect tabular data in this file')
        row_set = table_set.tables.pop()
        header_offset, headers = messytables.headers_guess(row_set.sample)

    # Some headers might have been converted from strings to floats and such.
    headers = encode_headers(headers)

    # Guess the delimiter used in the file
    with open(csv_filepath, 'r') as f:
        header_line = f.readline()
    try:
        sniffer = csv.Sniffer()
        delimiter = sniffer.sniff(header_line).delimiter
    except csv.Error:
        logger.warning(
            'Could not determine delimiter from file, using default ","')
        delimiter = ','

    # Set up the converters that run when you iterate over the row_set.
    # With pgloader only the headers will be iterated over.
    row_set.register_processor(messytables.headers_processor(headers))
    row_set.register_processor(messytables.offset_processor(header_offset + 1))
    # types = messytables.type_guess(row_set.sample, types=TYPES, strict=True)

    headers = [
        header.strip()[:MAX_COLUMN_LENGTH] for header in headers
        if header.strip()
    ]
    # headers_dicts = [dict(id=field[0], type=TYPE_MAPPING[str(field[1])])
    #                  for field in zip(headers, types)]

    # TODO worry about csv header name problems
    # e.g. duplicate names

    # encoding (and line ending?)- use chardet
    # It is easier to reencode it as UTF8 than convert the name of the encoding
    # to one that pgloader will understand.
    logger.info('Ensuring character coding is UTF8')
    f_write = tempfile.NamedTemporaryFile(suffix=extension, delete=False)
    try:
        with open(csv_filepath, 'rb') as f_read:
            csv_decoder = messytables.commas.UTF8Recoder(f_read, encoding=None)
            for line in csv_decoder:
                f_write.write(line)
            f_write.close()  # ensures the last line is written
            csv_filepath = f_write.name
        logger.info('Ensuring character coding is UTF8 complete')
        # check tables exists

        # datastore db connection
        engine = get_write_engine()

        # get column info from existing table
        existing = datastore_resource_exists(resource_id)
        existing_info = {}
        if existing:
            existing_info = dict((f['id'], f['info'])
                                 for f in existing.get('fields', [])
                                 if 'info' in f)
            '''
            Delete existing datastore table before proceeding. Otherwise
            the COPY will append to the existing table. And if
            the fields have significantly changed, it may also fail.
            '''
            logger.info('Deleting "{res_id}" from DataStore.'.format(
                res_id=resource_id))
            delete_datastore_resource(resource_id)

        # Columns types are either set (overridden) in the Data Dictionary page
        # or default to text type (which is robust)
        fields = [
            {'id': header_name,
             'type': existing_info.get(header_name, {})\
             .get('type_override') or 'text',
             }
            for header_name in headers]

        # Maintain data dictionaries from matching column names
        if existing_info:
            for f in fields:
                if f['id'] in existing_info:
                    f['info'] = existing_info[f['id']]

        logger.info('Fields: {}'.format(fields))

        # Create table
        from ckan import model
        context = {'model': model, 'ignore_auth': True}
        data_dict = dict(
            resource_id=resource_id,
            fields=fields,
        )
        data_dict['records'] = None  # just create an empty table
        data_dict['force'] = True  # TODO check this - I don't fully
        # understand read-only/datastore resources
        try:
            p.toolkit.get_action('datastore_create')(context, data_dict)
        except p.toolkit.ValidationError as e:
            if 'fields' in e.error_dict:
                # e.g. {'message': None, 'error_dict': {'fields': [u'"***" is not a valid field name']}, '_error_summary': None}
                error_message = e.error_dict['fields'][0]
                raise LoaderError(
                    'Error with field definition: {}'.format(error_message))
            else:
                raise LoaderError(
                    'Validation error when creating the database table: {}'.
                    format(str(e)))
        except Exception as e:
            raise LoaderError(
                'Could not create the database table: {}'.format(e))
        connection = context['connection'] = engine.connect()
        if not fulltext_trigger_exists(connection, resource_id):
            _create_fulltext_trigger(connection, resource_id)
            logger.info('Full-text trigger created')

        # datastore_active is switched on by datastore_create - TODO temporarily
        # disable it until the load is complete

        # logger.info('Disabling row index trigger')
        _disable_fulltext_trigger(connection, resource_id)
        # logger.info('Dropping indexes')
        _drop_indexes(context, data_dict, False)

        logger.info('Copying to database...')

        # Options for loading into postgres:
        # 1. \copy - can't use as that is a psql meta-command and not accessible
        #    via psycopg2
        # 2. COPY - requires the db user to have superuser privileges. This is
        #    dangerous. It is also not available on AWS, for example.
        # 3. pgloader method? - as described in its docs:
        #    Note that while the COPY command is restricted to read either
        #    from its standard input or from a local file on the server's
        #    file system, the command line tool psql implements a \copy
        #    command that knows how to stream a file local to the client
        #    over the network and into the PostgreSQL server, using the same
        #    protocol as pgloader uses.
        # 4. COPY FROM STDIN - not quite as fast as COPY from a file, but avoids
        #    the superuser issue. <-- picked

        # with psycopg2.connect(DSN) as conn:
        #     with conn.cursor() as curs:
        #         curs.execute(SQL)
        raw_connection = engine.raw_connection()
        try:
            cur = raw_connection.cursor()
            try:
                with open(csv_filepath, 'rb') as f:
                    # can't use :param for table name because params are only
                    # for filter values that are single quoted.
                    try:
                        cur.copy_expert(
                            "COPY \"{resource_id}\" ({column_names}) "
                            "FROM STDIN "
                            "WITH (DELIMITER '{delimiter}', FORMAT csv, HEADER 1, "
                            "      ENCODING '{encoding}');".format(
                                resource_id=resource_id,
                                column_names=', '.join(
                                    ['"{}"'.format(h) for h in headers]),
                                delimiter=delimiter,
                                encoding='UTF8',
                            ), f)
                    except psycopg2.DataError as e:
                        # e is a str but with foreign chars e.g.
                        # 'extra data: "paul,pa\xc3\xbcl"\n'
                        # but logging and exceptions need a normal (7 bit) str
                        error_str = str(e).decode('ascii', 'replace').encode(
                            'ascii', 'replace')
                        logger.warning(error_str)
                        raise LoaderError(
                            'Error during the load into PostgreSQL:'
                            ' {}'.format(error_str))

            finally:
                cur.close()
        finally:
            raw_connection.commit()
    finally:
        os.remove(csv_filepath)  # i.e. the tempfile
        shutil.rmtree(tempdir)
    logger.info('...copying done')

    logger.info('Creating search index...')
    _populate_fulltext(connection, resource_id, fields=fields)
    logger.info('...search index created')

    return fields
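
A hedged usage sketch for Example #2's load_csv. The file path and resource id are placeholders, and it assumes the surrounding module is importable and the target resource already exists in CKAN with the datastore plugin enabled:

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('xloader')

# Hypothetical values - replace with a real file and resource id.
fields = load_csv('/tmp/mydata.csv', 'aaaa-bbbb-cccc',
                  mimetype='text/csv', logger=logger)
logger.info('Loaded columns: %s', [f['id'] for f in fields])
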
Example #3
def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
    '''Loads an Excel file (or other tabular data recognized by messytables)
    into Datastore and creates indexes.

    Largely copied from datapusher - see below. Is slower than load_csv.
    '''

    # use messytables to determine the header row
    logger.info('Determining column names and types')
    ct = mimetype
    format = os.path.splitext(table_filepath)[1]  # filename extension
    with open(table_filepath, 'rb') as tmp:

        #
        # Copied from datapusher/jobs.py:push_to_datastore
        #

        try:
            table_set = messytables.any_tableset(tmp,
                                                 mimetype=ct,
                                                 extension=ct)
        except messytables.ReadError as e:
            # try again with format
            tmp.seek(0)
            try:
                table_set = messytables.any_tableset(tmp,
                                                     mimetype=format,
                                                     extension=format)
            except Exception as e:
                raise LoaderError(e)

        if not table_set.tables:
            raise LoaderError('Could not parse file as tabular data')
        row_set = table_set.tables.pop()
        offset, headers = messytables.headers_guess(row_set.sample)

        existing = datastore_resource_exists(resource_id)
        existing_info = None
        if existing:
            existing_info = dict((f['id'], f['info'])
                                 for f in existing.get('fields', [])
                                 if 'info' in f)

        # Some headers might have been converted from strings to floats and such.
        headers = encode_headers(headers)

        row_set.register_processor(messytables.headers_processor(headers))
        row_set.register_processor(messytables.offset_processor(offset + 1))
        TYPES, TYPE_MAPPING = get_types()
        types = messytables.type_guess(row_set.sample,
                                       types=TYPES,
                                       strict=True)

        # override with types user requested
        if existing_info:
            types = [{
                'text': messytables.StringType(),
                'numeric': messytables.DecimalType(),
                'timestamp': messytables.DateUtilType(),
            }.get(existing_info.get(h, {}).get('type_override'), t)
                     for t, h in zip(types, headers)]

        row_set.register_processor(messytables.types_processor(types))

        headers = [
            header.strip()[:MAX_COLUMN_LENGTH] for header in headers
            if header.strip()
        ]
        headers_set = set(headers)

        def row_iterator():
            for row in row_set:
                data_row = {}
                for index, cell in enumerate(row):
                    column_name = cell.column.strip()
                    if column_name not in headers_set:
                        continue
                    data_row[column_name] = cell.value
                yield data_row

        result = row_iterator()
        '''
        Delete existing datastore resource before proceeding. Otherwise
        'datastore_create' will append to the existing datastore. And if
        the fields have significantly changed, it may also fail.
        '''
        if existing:
            logger.info('Deleting "{res_id}" from datastore.'.format(
                res_id=resource_id))
            delete_datastore_resource(resource_id)

        headers_dicts = [
            dict(id=field[0], type=TYPE_MAPPING[str(field[1])])
            for field in zip(headers, types)
        ]

        # Maintain data dictionaries from matching column names
        if existing_info:
            for h in headers_dicts:
                if h['id'] in existing_info:
                    h['info'] = existing_info[h['id']]
                    # create columns with types user requested
                    type_override = existing_info[h['id']].get('type_override')
                    if type_override in _TYPE_MAPPING.values():
                        h['type'] = type_override

        logger.info('Determined headers and types: {headers}'.format(
            headers=headers_dicts))

        ### Commented - this is only for tests
        # if dry_run:
        #     return headers_dicts, result

        logger.info('Copying to database...')
        count = 0
        for i, records in enumerate(chunky(result, 250)):
            count += len(records)
            logger.info('Saving chunk {number}'.format(number=i))
            send_resource_to_datastore(resource_id, headers_dicts, records)
        logger.info('...copying done')

        if count:
            logger.info(
                'Successfully pushed {n} entries to "{res_id}".'.format(
                    n=count, res_id=resource_id))
        else:
            # no datastore table is created
            raise LoaderError('No entries found - nothing to load')
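
Example #3 batches the rows with a `chunky` helper that is not included in this listing. A minimal sketch of a generator with the shape the loop expects (lists of at most n records at a time), written as an assumption rather than the project's actual implementation:

import itertools


def chunky(iterable, n):
    # Hypothetical sketch: yield successive lists of up to n items, so the
    # caller can send records to the DataStore in fixed-size batches.
    it = iter(iterable)
    while True:
        chunk = list(itertools.islice(it, n))
        if not chunk:
            return
        yield chunk
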
Example #4
def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
    '''Loads a CSV into DataStore. Does not create the indexes.'''

    # use messytables to determine the header row
    extension = os.path.splitext(csv_filepath)[1]
    with open(csv_filepath, 'rb') as f:
        try:
            table_set = messytables.any_tableset(f,
                                                 mimetype=mimetype,
                                                 extension=extension)
        except messytables.ReadError as e:
            # # try again with format
            # f.seek(0)
            # try:
            #     format = resource.get('format')
            #     table_set = messytables.any_tableset(f, mimetype=format,
            #                                          extension=format)
            # except Exception:
            raise LoaderError('Messytables error: {}'.format(e))

        if not table_set.tables:
            raise LoaderError('Could not detect tabular data in this file')
        row_set = table_set.tables.pop()
        header_offset, headers = messytables.headers_guess(row_set.sample)

    # Some headers might have been converted from strings to floats and such.
    headers = [unicode(header) for header in headers]

    # Set up the converters that run when you iterate over the row_set.
    # With pgloader only the headers will be iterated over.
    row_set.register_processor(messytables.headers_processor(headers))
    row_set.register_processor(messytables.offset_processor(header_offset + 1))
    # types = messytables.type_guess(row_set.sample, types=TYPES, strict=True)

    headers = [header.strip() for header in headers if header.strip()]
    # headers_dicts = [dict(id=field[0], type=TYPE_MAPPING[str(field[1])])
    #                  for field in zip(headers, types)]

    # TODO worry about csv header name problems
    # e.g. duplicate names

    # encoding (and line ending?)- use chardet
    # It is easier to reencode it as UTF8 than convert the name of the encoding
    # to one that pgloader will understand.
    logger.info('Ensuring character coding is UTF8')
    f_write = tempfile.NamedTemporaryFile(suffix=extension, delete=False)
    try:
        with open(csv_filepath, 'rb') as f_read:
            csv_decoder = messytables.commas.UTF8Recoder(f_read, encoding=None)
            for line in csv_decoder:
                f_write.write(line)
            f_write.close()  # ensures the last line is written
            csv_filepath = f_write.name

        # check tables exists

        # datastore db connection
        engine = get_write_engine()

        # get column info from existing table
        existing = datastore_resource_exists(resource_id)
        existing_info = {}
        if existing:
            existing_info = dict((f['id'], f['info'])
                                 for f in existing.get('fields', [])
                                 if 'info' in f)
            '''
            Delete existing datastore table before proceeding. Otherwise
            the COPY will append to the existing table. And if
            the fields have significantly changed, it may also fail.
            '''
            logger.info('Deleting "{res_id}" from DataStore.'.format(
                res_id=resource_id))
            delete_datastore_resource(resource_id)

        # Columns types are either set (overridden) in the Data Dictionary page
        # or default to text type (which is robust)
        fields = [
            {'id': header_name,
             'type': existing_info.get(header_name, {})\
             .get('type_override') or 'text',
             }
            for header_name in headers]

        # Maintain data dictionaries from matching column names
        if existing_info:
            for f in fields:
                if f['id'] in existing_info:
                    f['info'] = existing_info[f['id']]

        logger.info('Fields: {}'.format(fields))

        # Create table
        from ckan import model
        context = {'model': model, 'ignore_auth': True}
        try:
            p.toolkit.get_action('datastore_create')(
                context,
                dict(
                    resource_id=resource_id,
                    fields=fields,
                    records=None,  # just create an empty table
                    force=True,  # TODO check this - I don't fully understand
                    # read-only/datastore resources
                ))
        except p.toolkit.ValidationError as e:
            if 'fields' in e.error_dict:
                # e.g. {'message': None, 'error_dict': {'fields': [u'"***" is not a valid field name']}, '_error_summary': None}
                error_message = e.error_dict['fields'][0]
                raise LoaderError(
                    'Error with field definition: {}'.format(error_message))
            else:
                raise LoaderError(
                    'Validation error when creating the database table: {}'.
                    format(str(e)))
        except Exception as e:
            raise LoaderError(
                'Could not create the database table: {}'.format(e))
        connection = engine.connect()
        if not fulltext_trigger_exists(connection, resource_id):
            _create_fulltext_trigger(connection, resource_id)

        logger.info('Copying to database...')

        # Options for loading into postgres:
        # 1. \copy - can't use as that is a psql meta-command and not accessible
        #    via psycopg2
        # 2. COPY - requires the db user to have superuser privileges. This is
        #    dangerous. It is also not available on AWS, for example.
        # 3. pgloader method? - as described in its docs:
        #    Note that while the COPY command is restricted to read either
        #    from its standard input or from a local file on the server's
        #    file system, the command line tool psql implements a \copy
        #    command that knows how to stream a file local to the client
        #    over the network and into the PostgreSQL server, using the same
        #    protocol as pgloader uses.
        # 4. COPY FROM STDIN - not quite as fast as COPY from a file, but avoids
        #    the superuser issue. <-- picked

        # with psycopg2.connect(DSN) as conn:
        #     with conn.cursor() as curs:
        #         curs.execute(SQL)
        connection = engine.raw_connection()
        try:
            cur = connection.cursor()
            try:
                with open(csv_filepath, 'rb') as f:
                    # can't use :param for table name because params are only
                    # for filter values that are single quoted.
                    try:
                        cur.copy_expert(
                            "COPY \"{resource_id}\" ({column_names}) "
                            "FROM STDIN "
                            "WITH (DELIMITER ',', FORMAT csv, HEADER 1, "
                            "      ENCODING '{encoding}');".format(
                                resource_id=resource_id,
                                column_names=', '.join(
                                    ['"{}"'.format(h) for h in headers]),
                                encoding='UTF8',
                            ), f)
                    except psycopg2.DataError as e:
                        logger.error(e)
                        raise LoaderError(
                            'Error during the load into PostgreSQL:'
                            ' {}'.format(e))

            finally:
                cur.close()
        finally:
            connection.commit()
    finally:
        os.remove(csv_filepath)  # i.e. the tempfile

    logger.info('...copying done')
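
The "COPY FROM STDIN" option picked in the comment block of Examples #2 and #4, reduced to a stand-alone psycopg2 sketch. The DSN, table name and column names below are placeholders, not values from the examples:

import psycopg2

# Hypothetical connection string - point it at the real datastore database.
conn = psycopg2.connect('dbname=datastore_default user=ckan_default')
try:
    cur = conn.cursor()
    with open('/tmp/mydata.csv', 'rb') as f:
        # copy_expert streams a client-side file to the server, so the db
        # user does not need the superuser rights that a server-side
        # COPY FROM <file> would require.
        cur.copy_expert(
            'COPY "my-table" ("name", "age") FROM STDIN '
            "WITH (DELIMITER ',', FORMAT csv, HEADER 1, ENCODING 'UTF8');",
            f)
    conn.commit()
finally:
    conn.close()
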