def send_resource_to_datastore(resource_id, headers, records):
    """
    Stores records in CKAN datastore
    """
    request = {'resource_id': resource_id,
               'fields': headers,
               'force': True,
               'records': records}

    from ckan import model
    context = {'model': model, 'ignore_auth': True}
    try:
        p.toolkit.get_action('datastore_create')(context, request)
    except p.toolkit.ValidationError as e:
        raise LoaderError('Validation error writing rows to db: {}'
                          .format(str(e)))
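
# Illustrative sketch (not part of the module): `headers` above is a list of
# field dicts and `records` is a list of row dicts, matching how load_table()
# calls this function. The resource id and values below are hypothetical
# placeholders.
#
#     send_resource_to_datastore(
#         'aaaa-bbbb-cccc',
#         [{'id': 'name', 'type': 'text'}, {'id': 'age', 'type': 'numeric'}],
#         [{'name': 'paul', 'age': 28}, {'name': 'mary', 'age': 34}])
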
def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
    '''Loads a CSV into DataStore. Does not create the indexes.'''

    # use messytables to determine the header row
    extension = os.path.splitext(csv_filepath)[1]
    tempdir = tempfile.mkdtemp(suffix=resource_id)
    if extension.lower() == '.zip':
        with zipfile.ZipFile(csv_filepath, "r") as zip_ref:
            csvfiles = [
                file for file in zip_ref.filelist
                if file.filename.lower().endswith('.csv')
            ]
            if len(csvfiles) == 0:
                logger.info("no csvfiles found in %s" % csv_filepath)
            if len(csvfiles) > 0:
                if len(csvfiles) > 1:
                    logger.info(
                        "multiple csv files found in %s, "
                        "only one will be ingested: %s"
                        % (csv_filepath, csvfiles[0].filename))
                else:
                    logger.info("unzipping %s and ingesting %s"
                                % (csv_filepath, csvfiles[0].filename))
                zip_ref.extract(csvfiles[0], tempdir)
                new_loc = os.path.join(tempdir, csvfiles[0].filename)
                csv_filepath = new_loc
                extension = os.path.splitext(csv_filepath)[1]
                logger.info("unzipped %s" % csvfiles[0].filename)
            # close.
            zip_ref.close()

    with open(csv_filepath, 'rb') as f:
        try:
            table_set = messytables.any_tableset(f, mimetype=mimetype,
                                                 extension=extension)
        except messytables.ReadError as e:
            # # try again with format
            # f.seek(0)
            # try:
            #     format = resource.get('format')
            #     table_set = messytables.any_tableset(f, mimetype=format,
            #                                          extension=format)
            # except Exception:
            raise LoaderError('Messytables error: {}'.format(e))

        if not table_set.tables:
            raise LoaderError('Could not detect tabular data in this file')

        row_set = table_set.tables.pop()
        header_offset, headers = messytables.headers_guess(row_set.sample)

    # Some headers might have been converted from strings to floats and such.
    headers = encode_headers(headers)

    # Guess the delimiter used in the file
    with open(csv_filepath, 'r') as f:
        header_line = f.readline()
    try:
        sniffer = csv.Sniffer()
        delimiter = sniffer.sniff(header_line).delimiter
    except csv.Error:
        logger.warning('Could not determine delimiter from file, '
                       'using default ","')
        delimiter = ','

    # Setup the converters that run when you iterate over the row_set.
    # With pgloader only the headers will be iterated over.
    row_set.register_processor(messytables.headers_processor(headers))
    row_set.register_processor(messytables.offset_processor(header_offset + 1))
    # types = messytables.type_guess(row_set.sample, types=TYPES, strict=True)

    headers = [header.strip()[:MAX_COLUMN_LENGTH]
               for header in headers if header.strip()]
    # headers_dicts = [dict(id=field[0], type=TYPE_MAPPING[str(field[1])])
    #                  for field in zip(headers, types)]

    # TODO worry about csv header name problems
    # e.g. duplicate names

    # encoding (and line ending?) - use chardet
    # It is easier to reencode it as UTF8 than convert the name of the
    # encoding to one that pgloader will understand.
    logger.info('Ensuring character coding is UTF8')
    f_write = tempfile.NamedTemporaryFile(suffix=extension, delete=False)
    try:
        with open(csv_filepath, 'rb') as f_read:
            csv_decoder = messytables.commas.UTF8Recoder(f_read,
                                                         encoding=None)
            for line in csv_decoder:
                f_write.write(line)
            f_write.close()  # ensures the last line is written
            csv_filepath = f_write.name
        logger.info('Ensuring character coding is UTF8 complete')

        # check tables exists

        # datastore db connection
        engine = get_write_engine()

        # get column info from existing table
        existing = datastore_resource_exists(resource_id)
        existing_info = {}
        if existing:
            existing_info = dict((f['id'], f['info'])
                                 for f in existing.get('fields', [])
                                 if 'info' in f)

        '''
        Delete existing datastore table before proceeding. Otherwise the COPY
        will append to the existing table. And if the fields have
        significantly changed, it may also fail.
        '''
        logger.info('Deleting "{res_id}" from DataStore.'.format(
            res_id=resource_id))
        delete_datastore_resource(resource_id)

        # Column types are either set (overridden) in the Data Dictionary page
        # or default to text type (which is robust)
        fields = [
            {'id': header_name,
             'type': existing_info.get(header_name, {}).get('type_override')
             or 'text',
             }
            for header_name in headers]

        # Maintain data dictionaries from matching column names
        if existing_info:
            for f in fields:
                if f['id'] in existing_info:
                    f['info'] = existing_info[f['id']]

        logger.info('Fields: {}'.format(fields))

        # Create table
        from ckan import model
        context = {'model': model, 'ignore_auth': True}
        data_dict = dict(
            resource_id=resource_id,
            fields=fields,
        )
        data_dict['records'] = None  # just create an empty table
        data_dict['force'] = True  # TODO check this - I don't fully
        # understand read-only/datastore resources
        try:
            p.toolkit.get_action('datastore_create')(context, data_dict)
        except p.toolkit.ValidationError as e:
            if 'fields' in e.error_dict:
                # e.g. {'message': None,
                #       'error_dict': {'fields': [u'"***" is not a valid field name']},
                #       '_error_summary': None}
                error_message = e.error_dict['fields'][0]
                raise LoaderError(
                    'Error with field definition: {}'.format(error_message))
            else:
                raise LoaderError(
                    'Validation error when creating the database table: {}'
                    .format(str(e)))
        except Exception as e:
            raise LoaderError(
                'Could not create the database table: {}'.format(e))

        connection = context['connection'] = engine.connect()
        if not fulltext_trigger_exists(connection, resource_id):
            logger.info('Trigger created')
            _create_fulltext_trigger(connection, resource_id)

        # datastore_active is switched on by datastore_create - TODO
        # temporarily disable it until the load is complete

        # logger.info('Disabling row index trigger')
        _disable_fulltext_trigger(connection, resource_id)
        # logger.info('Dropping indexes')
        _drop_indexes(context, data_dict, False)

        logger.info('Copying to database...')

        # Options for loading into postgres:
        # 1. \copy - can't use as that is a psql meta-command and not
        #    accessible via psycopg2
        # 2. COPY - requires the db user to have superuser privileges. This is
        #    dangerous. It is also not available on AWS, for example.
        # 3. pgloader method? - as described in its docs:
        #    Note that while the COPY command is restricted to read either
        #    from its standard input or from a local file on the server's
        #    file system, the command line tool psql implements a \copy
        #    command that knows how to stream a file local to the client over
        #    the network and into the PostgreSQL server, using the same
        #    protocol as pgloader uses.
        # 4. COPY FROM STDIN - not quite as fast as COPY from a file, but
        #    avoids the superuser issue. <-- picked

        # with psycopg2.connect(DSN) as conn:
        #     with conn.cursor() as curs:
        #         curs.execute(SQL)
        raw_connection = engine.raw_connection()
        try:
            cur = raw_connection.cursor()
            try:
                with open(csv_filepath, 'rb') as f:
                    # can't use :param for table name because params are only
                    # for filter values that are single quoted.
                    try:
                        cur.copy_expert(
                            "COPY \"{resource_id}\" ({column_names}) "
                            "FROM STDIN "
                            "WITH (DELIMITER '{delimiter}', FORMAT csv, HEADER 1, "
                            "      ENCODING '{encoding}');".format(
                                resource_id=resource_id,
                                column_names=', '.join(
                                    ['"{}"'.format(h) for h in headers]),
                                delimiter=delimiter,
                                encoding='UTF8',
                            ),
                            f)
                    except psycopg2.DataError as e:
                        # e is a str but with foreign chars e.g.
                        # 'extra data: "paul,pa\xc3\xbcl"\n'
                        # but logging and exceptions need a normal (7 bit) str
                        error_str = str(e).decode('ascii', 'replace').encode(
                            'ascii', 'replace')
                        logger.warning(error_str)
                        raise LoaderError(
                            'Error during the load into PostgreSQL:'
                            ' {}'.format(error_str))
            finally:
                cur.close()
        finally:
            raw_connection.commit()
    finally:
        os.remove(csv_filepath)  # i.e. the tempfile
        shutil.rmtree(tempdir)

    logger.info('...copying done')

    logger.info('Creating search index...')
    _populate_fulltext(connection, resource_id, fields=fields)
    logger.info('...search index created')

    return fields
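
# Illustrative sketch (not part of the module): a minimal call to load_csv(),
# assuming a standard logging.Logger and an existing resource id; both values
# below are hypothetical placeholders. The returned `fields` list contains
# the {'id': ..., 'type': ...} dicts that were passed to datastore_create.
#
#     import logging
#     fields = load_csv('/tmp/data.csv', 'aaaa-bbbb-cccc',
#                       mimetype='text/csv',
#                       logger=logging.getLogger(__name__))
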
def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
    '''Loads an Excel file (or other tabular data recognized by messytables)
    into Datastore and creates indexes.

    Largely copied from datapusher - see below. Is slower than load_csv.
    '''
    # use messytables to determine the header row
    logger.info('Determining column names and types')
    ct = mimetype
    format = os.path.splitext(table_filepath)[1]  # filename extension
    with open(table_filepath, 'rb') as tmp:

        #
        # Copied from datapusher/jobs.py:push_to_datastore
        #

        try:
            table_set = messytables.any_tableset(tmp, mimetype=ct,
                                                 extension=ct)
        except messytables.ReadError as e:
            # try again with format
            tmp.seek(0)
            try:
                table_set = messytables.any_tableset(tmp, mimetype=format,
                                                     extension=format)
            except Exception as e:
                raise LoaderError(e)

        if not table_set.tables:
            raise LoaderError('Could not parse file as tabular data')

        row_set = table_set.tables.pop()
        offset, headers = messytables.headers_guess(row_set.sample)

        existing = datastore_resource_exists(resource_id)
        existing_info = None
        if existing:
            existing_info = dict((f['id'], f['info'])
                                 for f in existing.get('fields', [])
                                 if 'info' in f)

        # Some headers might have been converted from strings to floats and
        # such.
        headers = encode_headers(headers)

        row_set.register_processor(messytables.headers_processor(headers))
        row_set.register_processor(messytables.offset_processor(offset + 1))

        TYPES, TYPE_MAPPING = get_types()
        types = messytables.type_guess(row_set.sample, types=TYPES,
                                       strict=True)

        # override with types user requested
        if existing_info:
            types = [{
                'text': messytables.StringType(),
                'numeric': messytables.DecimalType(),
                'timestamp': messytables.DateUtilType(),
            }.get(existing_info.get(h, {}).get('type_override'), t)
                for t, h in zip(types, headers)]

        row_set.register_processor(messytables.types_processor(types))

        headers = [header.strip()[:MAX_COLUMN_LENGTH]
                   for header in headers if header.strip()]
        headers_set = set(headers)

        def row_iterator():
            for row in row_set:
                data_row = {}
                for index, cell in enumerate(row):
                    column_name = cell.column.strip()
                    if column_name not in headers_set:
                        continue
                    data_row[column_name] = cell.value
                yield data_row
        result = row_iterator()

        '''
        Delete existing datastore resource before proceeding. Otherwise
        'datastore_create' will append to the existing datastore. And if
        the fields have significantly changed, it may also fail.
        '''
        if existing:
            logger.info('Deleting "{res_id}" from datastore.'.format(
                res_id=resource_id))
            delete_datastore_resource(resource_id)

        headers_dicts = [dict(id=field[0], type=TYPE_MAPPING[str(field[1])])
                         for field in zip(headers, types)]

        # Maintain data dictionaries from matching column names
        if existing_info:
            for h in headers_dicts:
                if h['id'] in existing_info:
                    h['info'] = existing_info[h['id']]
                    # create columns with types user requested
                    type_override = existing_info[h['id']].get('type_override')
                    if type_override in _TYPE_MAPPING.values():
                        h['type'] = type_override

        logger.info('Determined headers and types: {headers}'.format(
            headers=headers_dicts))

        ### Commented - this is only for tests
        # if dry_run:
        #     return headers_dicts, result

        logger.info('Copying to database...')
        count = 0
        # push the rows in batches (see the note on chunky after this function)
        for i, records in enumerate(chunky(result, 250)):
            count += len(records)
            logger.info('Saving chunk {number}'.format(number=i))
            send_resource_to_datastore(resource_id, headers_dicts, records)
        logger.info('...copying done')

        if count:
            logger.info(
                'Successfully pushed {n} entries to "{res_id}".'.format(
                    n=count, res_id=resource_id))
        else:
            # no datastore table is created
            raise LoaderError('No entries found - nothing to load')
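
# `chunky(result, 250)` above is assumed to be a helper defined elsewhere in
# this module that yields the lazily-generated rows in batches of up to 250,
# so that each datastore_create call stays small. A hypothetical sketch of a
# generator with that behaviour (for illustration only):
#
#     import itertools
#
#     def chunky(iterable, n):
#         it = iter(iterable)
#         while True:
#             chunk = list(itertools.islice(it, n))
#             if not chunk:
#                 return
#             yield chunk
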