def _process_upload(context, data):
    """
    When provided with a filename this function will process each row
    within the file and then return a tuple. The tuple will contain
        - a list of error messages (if any)
        - a list of dicts where each dict contains ...
            { 'package': 'a_package_id', 'action': 'Added' or 'Updated' }
    """
    log = inventory_upload.get_logger()
    errors = []
    results = []

    filename = data['file']
    publisher_name = data['publisher']

    import urlparse
    client = CkanClient(
        base_location=urlparse.urljoin(context['site_url'], 'api'),
        api_key=context['apikey'])

    tableset = None
    try:
        _, ext = os.path.splitext(filename)
        tableset = messytables.any_tableset(
            open(filename, 'r'), extension=ext[1:])
    except Exception as e:
        if str(e) == "Unrecognized MIME type: text/plain":
            # Force a CSV parse; re-open the file rather than referencing
            # an undefined file object.
            tableset = messytables.any_tableset(
                open(filename, 'r'), mimetype="text/csv")
        else:
            errors.append("Unable to load file: {0}".format(e))

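# The "parse by extension, then retry as CSV when messytables reports
# 'Unrecognized MIME type: text/plain'" fallback above recurs in several of
# the snippets below, sometimes against a stale or undefined file handle.
# A minimal sketch of that pattern as a standalone helper; the function name
# `open_tableset` and the matched error string are assumptions, not taken
# from any one of the original projects.
import os

import messytables


def open_tableset(filename):
    # Try to let messytables pick a parser from the file extension first.
    _, ext = os.path.splitext(filename)
    try:
        return messytables.any_tableset(open(filename, 'rb'),
                                        extension=ext[1:])
    except Exception as e:
        if 'Unrecognized MIME type: text/plain' in str(e):
            # Re-open the file so messytables gets a fresh handle, and
            # force a CSV parse.
            return messytables.any_tableset(open(filename, 'rb'),
                                            mimetype='text/csv')
        raise
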
def create_new_model(self, modelname, app_label):
    """ Use messytables to guess field types and build a new model """
    nocols = False
    cols = self.csvfile[0]
    for col in cols:
        if not col:
            nocols = True
    if nocols:
        cols = ["col_%s" % num for num in range(1, len(cols))]
        print("No column names for %s columns" % len(cols))
    else:
        # strip quotes at ends and replace internal spaces with underscores
        cols = [col.strip("\r") for col in cols]
        cols = [col.strip('"') for col in cols]
        cols = [col.strip("'") for col in cols]
        cols = [cleancol.sub("_", col).lower() for col in cols]
    try:
        from messytables import any_tableset, type_guess
    except ImportError:
        self.errors.append(
            "If you want to inspect CSV files to generate model code, "
            "you must install https://messytables.readthedocs.org"
        )
        self.modelname = ""
        return
    try:
        table_set = any_tableset(self.filehandle)
        row_set = table_set.tables[0]
        types = type_guess(row_set.sample)
        types = [str(typeobj) for typeobj in types]
        # If the header has more cols than the data has cols - ignore the end ones
        if len(cols) > len(types):
            cols = cols[:len(types)]
    except Exception as err:
        self.errors.append("messytables could not run due to error")
        self.errors.append(str(err))
        self.modelname = ""
        return
    fieldset = []
    maximums = self.get_maxlengths(cols)
    for i, col in enumerate(cols):
        length = maximums[i]
        if types[i] == "String" and length > 255:
            types[i] = "Text"
        integer = length
        decimal = int(length / 2)
        if decimal > 10:
            decimal = 10
        blank = True
        default = True
        column = (col, types[i], length, length, integer, decimal, blank, default)
        fieldset.append(column)
    # Import here so that messytables is not a dependency for just using csvimport cmd
    from csvimport.make_model import MakeModel
    maker = MakeModel()
    return maker.model_from_table("%s_%s" % (app_label, modelname), fieldset)

def main(argv=None):
    args = parse_args(argv)

    if args.file is None:
        # slurp the whole input since there seems to be a bug in messytables
        # which should be able to handle streams but doesn't
        args.file = cStringIO.StringIO(sys.stdin.read())

    relation_key = args_to_relation_key(args)

    table_set = any_tableset(args.file)
    if len(table_set.tables) != 1:
        raise ValueError("Can only handle files with a single table, not %s"
                         % len(table_set.tables))

    row_set = table_set.tables[0]

    # guess header names and the offset of the header:
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(strip_processor())
    row_set.register_processor(headers_processor(headers))
    # Temporarily, mark the offset of the header
    row_set.register_processor(offset_processor(offset + 1))

    # guess types and register them
    types = type_guess(replace_empty_string(row_set.sample), strict=True,
                       types=[StringType, DecimalType, IntegerType])
    row_set.register_processor(types_processor(types))

    # Messytables seems to not handle the case where there are no headers.
    # Work around this as follows:
    # 1) offset must be 0
    # 2) if the types of the data match the headers, assume there are
    #    actually no headers
    if offset == 0:
        try:
            [t.cast(v) for (t, v) in zip(types, headers)]
        except Exception:
            pass
        else:
            # We don't need the headers_processor or the offset_processor
            row_set._processors = []
            row_set.register_processor(strip_processor())
            row_set.register_processor(types_processor(types))
            headers = None

    # Construct the Myria schema
    schema = messy_to_schema(types, headers)
    logging.info("Myria schema: {}".format(json.dumps(schema)))

    # Prepare data for writing to Myria
    data, kwargs = write_data(row_set, schema)

    if not args.dry:
        # Connect to Myria and send the data
        connection = myria.MyriaConnection(hostname=args.hostname,
                                           port=args.port,
                                           ssl=args.ssl)
        ret = connection.upload_file(relation_key, schema, data,
                                     args.overwrite, **kwargs)
        sys.stdout.write(pretty_json(ret))
    else:
        sys.stdout.write(data)

def __init__(self, filename):
    """
    When provided with a filename (to a CSV, XLS, or XLSX) the constructor
    will attempt to load the file and ensure that messytables knows how to
    process it.
    """
    self.tableset = None
    try:
        _, ext = os.path.splitext(filename)
        self.tableset = messytables.any_tableset(open(filename, "r"),
                                                 extension=ext[1:])
    except Exception as e:
        if str(e) == "Unrecognized MIME type: text/plain":
            # Attempt to force the load as a CSV file to work around
            # messytables not recognising text/plain; re-open the file so we
            # pass a valid file object.
            self.tableset = messytables.any_tableset(open(filename, "r"),
                                                     mimetype="text/csv")
        else:
            log.exception(e)
            raise Exception(u"Failed to load the file at {0}".format(filename))

def transform(self):
    handle = self.open_data(self.url)
    if not handle:
        raise ResourceError("Remote resource missing",
                            "Unable to load the remote resource")

    try:
        if self.is_csv():
            table_set = any_tableset(fileobj=handle, extension=self.type)
        else:
            table_set = any_tableset(fileobj=handle, extension=self.type,
                                     mimetype=self.mimetype)
    except Exception as e:
        # e.g. ValueError('Unrecognized MIME type: application/vnd.oasis.opendocument.spreadsheet')
        log.warn('Messytables parse error %s %s: %s',
                 self.resource_identifier, self.url, e)
        log.warn('Some data: ext: %s, mime: %s', self.type, self.mimetype)
        raise ResourceError("Resource loading error",
                            "Unable to load the resource")

def transform(self):
    handle = self.open_data(self.url)
    if not handle:
        # "Informacije" / "Udaljeni resurs nedostupan":
        # "Information" / "Remote resource unavailable"
        raise ResourceError("Informacije", "Udaljeni resurs nedostupan")

    try:
        table_set = any_tableset(fileobj=handle, extension=self.type,
                                 mimetype=self.mimetype)
    except Exception as e:
        # "Informacija" / "Resurs nedostupan": "Information" / "Resource unavailable"
        raise ResourceError("Informacija", "Resurs nedostupan")

def ingest(self, meta, local_path):
    with open(local_path, 'rb') as fh:
        table_set = any_tableset(fh,
                                 extension=meta.extension,
                                 mimetype=meta.mime_type,
                                 window=20000)
        tables = []
        for sheet, row_set in enumerate(table_set.tables):
            tables.append(self.generate_table(meta, sheet, row_set))

        meta.tables = tables
        document = self.create_document(meta)
        self.emit(document)

def __init__(self, filename):
    """
    When provided with a filename (to a CSV, XLS, or XLSX) the constructor
    will attempt to load the file and ensure that messytables knows how to
    process it.
    """
    self.tableset = None
    try:
        _, ext = os.path.splitext(filename)
        self.tableset = messytables.any_tableset(open(filename, 'r'),
                                                 extension=ext[1:])
    except Exception as e:
        if str(e) == "Unrecognized MIME type: text/plain":
            # Attempt to force the load as a CSV file to work around
            # messytables not recognising text/plain; any_tableset expects a
            # file object, so re-open the file rather than passing the path.
            self.tableset = messytables.any_tableset(open(filename, 'r'),
                                                     mimetype="text/csv")
        else:
            log.exception(e)
            raise Exception(
                u"Failed to load the file at {0}".format(filename))

def proc(f, database_name, table_name):
    table_set = messytables.any_tableset(f)
    row_set = table_set.tables[0]

    # guess header names and the offset of the header:
    offset, headers = messytables.headers_guess(row_set.sample)
    row_set.register_processor(messytables.headers_processor(headers))
    row_set.register_processor(messytables.offset_processor(offset + 1))
    types = messytables.type_guess(row_set.sample, types=[
        messytables.types.StringType,
        messytables.types.DateType,
    ], strict=True)

    hive_data_file = tempfile.NamedTemporaryFile(mode='w')

    fields_ddl = ','.join([
        ' {0} {1}\n'.format(
            canonicalize_column_name(colName),
            hive_column_type(colType)
        )
        for colName, colType in zip(headers, types)
    ])
    hive_sql = '''
DROP TABLE IF EXISTS {0};

CREATE TABLE {0} (
{1}
)
STORED AS TEXTFILE
TBLPROPERTIES ("comment"="add_messytable on {3}");

LOAD DATA LOCAL INPATH '{2}' OVERWRITE INTO TABLE {0};
'''.format(table_name, fields_ddl, hive_data_file.name,
           datetime.datetime.now().isoformat())

    hive_cmd_file = tempfile.NamedTemporaryFile(mode='w')
    print(hive_sql, file=hive_cmd_file)
    hive_cmd_file.flush()

    row_set.register_processor(messytables.types_processor(types))

    for row in row_set:
        print('\001'.join(map(str, [c.value for c in row])),
              file=hive_data_file)
    hive_data_file.flush()

    subprocess.call([
        'hive',
        '--database', database_name,
        '-f', hive_cmd_file.name,
    ])

def create_new_model(self, modelname, app_label):
    """ Use messytables to guess field types and build a new model """
    nocols = False
    cols = self.csvfile[0]
    for col in cols:
        if not col:
            nocols = True
    if nocols:
        cols = ['col_%s' % num for num in range(1, len(cols))]
        print('No column names for %s columns' % len(cols))
    else:
        cols = [cleancol.sub('_', col).lower() for col in cols]
    try:
        from messytables import any_tableset, type_guess
    except ImportError:
        self.errors.append(
            'If you want to inspect CSV files to generate model code, '
            'you must install https://messytables.readthedocs.org'
        )
        self.modelname = ''
        return
    try:
        table_set = any_tableset(self.filehandle)
        row_set = table_set.tables[0]
        types = type_guess(row_set.sample)
        types = [str(typeobj) for typeobj in types]
    except Exception:
        self.errors.append('messytables could not guess your column types')
        self.modelname = ''
        return
    fieldset = []
    maximums = self.get_maxlengths(cols)
    for i, col in enumerate(cols):
        length = maximums[i]
        if types[i] == 'String' and length > 255:
            types[i] = 'Text'
        integer = length
        decimal = int(length / 2)
        if decimal > 10:
            decimal = 10
        blank = True
        default = True
        column = (col, types[i], length, length, integer, decimal, blank, default)
        fieldset.append(column)
    # Import here so that messytables is not a dependency for just using csvimport cmd
    from csvimport.make_model import MakeModel
    maker = MakeModel()
    return maker.model_from_table('%s_%s' % (app_label, modelname), fieldset)

def transform(self):
    handle = self.open_data(self.url)
    if not handle:
        raise ResourceError("Remote resource missing",
                            "Unable to load the remote resource")

    try:
        table_set = any_tableset(fileobj=handle, extension=self.type,
                                 mimetype=self.mimetype)
    except Exception as e:
        raise ResourceError("Resource loading error",
                            "Unable to load the resource")

def validate_file(file_tmp, file_name, tmp_filepath):
    log.info("upload: checking file * %s * ", file_name)
    MAX_HEADER_LENGTH = 64
    # not allowed characters ( - ' " ’ ‘) regex
    inappropriate_chars = re.compile(r"[\-|\'|\"|\u2018|\u2019]")
    datastore_ext = config.get('ckan.mimetype_guess', "csv xls xlsx tsv")
    tmp_file_name, tmp_file_ext = os.path.splitext(file_name)

    # check if datastore file (csv xls xlsx tsv)
    if tmp_file_ext[1:].lower() in datastore_ext:
        table_set = any_tableset(file_tmp)
        # check if only one data sheet in the file
        if len(table_set.tables) > 1:
            rollback_tmp(file_tmp, tmp_filepath)
            log.error("upload: the file * %s * was not uploaded - There is more than one data sheet in the file", file_name)
            raise logic.ValidationError(
                {'upload': ['There is more than one data sheet in the file']}
            )
        else:
            row_set = table_set.tables[0]
            # guess header names and the offset of the header:
            offset, headers = headers_guess(row_set.sample)
            row_set.register_processor(headers_processor(headers))
            for header in headers:
                # too long header
                if len(header) > MAX_HEADER_LENGTH:
                    rollback_tmp(file_tmp, tmp_filepath)
                    log.error("upload: the file * %s * was not uploaded - too long header - * %s *", file_name, header)
                    raise logic.ValidationError(
                        {'upload': ['too long header (64 max)']}
                    )
                # not allowed characters in header ( - ' " ’ ‘)
                if inappropriate_chars.search(header):
                    rollback_tmp(file_tmp, tmp_filepath)
                    log.error("upload: the file * %s * was not uploaded - there are inappropriate characters in headers * %s *", file_name, header)
                    raise logic.ValidationError(
                        {'upload': ['there are inappropriate characters in headers (apostrophe/apostrophes/dash)']}
                    )
            # Check for duplicate fields
            unique_fields = set(headers)
            if not len(unique_fields) == len(headers):
                rollback_tmp(file_tmp, tmp_filepath)
                log.error("upload: the file * %s * was not uploaded - Duplicate column names are not supported", file_name)
                raise logic.ValidationError({'upload': ['Duplicate column names are not supported']})
            log.info("passed validation successfully - the file * %s * was uploaded to CKAN (filestore)", file_name)
    else:
        pass

def create_new_model(self, modelname, app_label):
    """ Use messytables to guess field types and build a new model """
    nocols = False
    cols = self.csvfile[0]
    for col in cols:
        if not col:
            nocols = True
    if nocols:
        cols = ['col_%s' % num for num in range(1, len(cols))]
        print('No column names for %s columns' % len(cols))
    else:
        cols = [cleancol.sub('_', col).lower() for col in cols]
    try:
        from messytables import any_tableset, type_guess
    except ImportError:
        self.errors.append(
            'If you want to inspect CSV files to generate model code, '
            'you must install https://messytables.readthedocs.org')
        self.modelname = ''
        return
    try:
        table_set = any_tableset(self.filehandle)
        row_set = table_set.tables[0]
        types = type_guess(row_set.sample)
        types = [str(typeobj) for typeobj in types]
    except Exception as err:
        self.errors.append('messytables could not run due to error')
        self.errors.append(str(err))
        self.modelname = ''
        return
    fieldset = []
    maximums = self.get_maxlengths(cols)
    for i, col in enumerate(cols):
        length = maximums[i]
        if types[i] == 'String' and length > 255:
            types[i] = 'Text'
        integer = length
        decimal = int(length / 2)
        if decimal > 10:
            decimal = 10
        blank = True
        default = True
        column = (col, types[i], length, length, integer, decimal, blank, default)
        fieldset.append(column)
    # Import here so that messytables is not a dependency for just using csvimport cmd
    from csvimport.make_model import MakeModel
    maker = MakeModel()
    return maker.model_from_table('%s_%s' % (app_label, modelname), fieldset)

def ingest(self, meta, local_path):
    with open(local_path, 'rb') as fh:
        table_set = any_tableset(fh,
                                 extension=meta.extension,
                                 mimetype=meta.mime_type,
                                 window=20000)
        tables = []
        document = self.create_document(meta)
        for sheet, row_set in enumerate(table_set.tables):
            tables.append(
                self.generate_table(document, meta, sheet, row_set))

        meta.tables = tables
        document.meta = meta
        self.emit(document)

def read_file(self, filename):
    """ Guess the filetype and read the file into row sets """
    # print("Reading file", filename)
    try:
        fh = open(filename, 'rb')
        table_set = any_tableset(fh)  # guess the type...
    except Exception:
        # traceback.print_exc()
        # Cannot find the schema.
        table_set = None
    return table_set

def resource_row_set(package, resource):
    """ Generate an iterator over all the rows in this resource's
    source data. """
    # Try to gather information about the source file type.
    if not resource.meta.get('extension'):
        resource.meta['extension'] = guess_extension(resource.meta.get('name'))

    # This is a work-around because messytables hangs on boto file
    # handles, so we're doing it via plain old HTTP.
    table_set = any_tableset(resource.fh(),
                             extension=resource.meta.get('extension'),
                             mimetype=resource.meta.get('mime_type'))
    tables = list(table_set.tables)
    if not len(tables):
        log.error("No tables were found in the source file.")
        return
    return tables[0]

def transform(self):
    handle = self.open_data(self.url)
    if not handle:
        raise ResourceError("Remote resource missing",
                            "Unable to load the remote resource")

    try:
        table_set = any_tableset(fileobj=handle, extension=self.type,
                                 mimetype=self.mimetype)
    except Exception as e:
        # e.g. ValueError('Unrecognized MIME type: application/vnd.oasis.opendocument.spreadsheet')
        log.warn('Messytables parse error %s %s: %s',
                 self.resource_identifier, self.url, e)
        raise ResourceError("Resource loading error",
                            "Unable to load the resource")

def parse_table(source):
    # This is a work-around because messytables hangs on boto file
    # handles, so we're doing it via plain old HTTP.
    # We're also passing in an extended window size to give more
    # reliable type detection.
    # Because Python's CSV dialect sniffer isn't the best, this also
    # constrains the field quoting character to a double quote.
    table_set = mt.any_tableset(source.fh(),
                                extension=source.meta.get('extension'),
                                mimetype=source.meta.get('mime_type'),
                                quotechar='"', window=20000)
    tables = list(table_set.tables)
    if not len(tables):
        log.error("No tables were found in the source file.")
        return

    row_set = tables[0]
    headers = [c.value for c in next(row_set.sample)]
    row_set.register_processor(mt.headers_processor(headers))
    row_set.register_processor(mt.offset_processor(1))
    types = mt.type_guess(row_set.sample, strict=True)
    row_set.register_processor(mt.types_processor(types, strict=True))

    fields, i = {}, 0
    row_iter = iter(row_set)

    while True:
        i += 1
        try:
            row = next(row_iter)
            if not len(fields):
                fields = generate_field_spec(row)

            data = convert_row(row, fields, i)
            check_empty = set(data.values())
            if None in check_empty and len(check_empty) == 1:
                continue

            yield None, fields, data
        except StopIteration:
            return
        except Exception as e:
            # log.exception(e)
            yield e, fields, None

def parse_data(input):
    fh = open(input, 'rb')
    try:
        table_set = messytables.any_tableset(fh)
    except messytables.ReadError as e:
        print(e)
        # Without re-raising, the code below would fail with a NameError
        # because table_set was never assigned.
        raise

    get_row_set = lambda table_set: table_set.tables.pop()
    row_set = get_row_set(table_set)
    offset, headers = messytables.headers_guess(row_set.sample)

    # Some headers might have been converted from strings to floats and such.
    headers = [str(header) for header in headers]
    row_set.register_processor(messytables.headers_processor(headers))
    row_set.register_processor(messytables.offset_processor(offset + 1))
    types = messytables.type_guess(row_set.sample, types=TYPES, strict=True)
    row_set.register_processor(messytables.types_processor(types))

    headers = [header.strip() for header in headers if header.strip()]
    headers_set = set(headers)

    def row_iterator():
        for row in row_set:
            data_row = {}
            for index, cell in enumerate(row):
                column_name = cell.column.strip()
                if column_name not in headers_set:
                    continue
                data_row[column_name] = cell.value
            yield data_row
    result = row_iterator()

    headers_dicts = [
        dict(id=field[0], type=TYPE_MAPPING[str(field[1])])
        for field in zip(headers, types)
    ]

    print('Determined headers and types: {headers}'.format(
        headers=headers_dicts))

    return headers_dicts, result

def resource_row_set(package, resource):
    """ Generate an iterator over all the rows in this resource's
    source data. """
    # This is a work-around because messytables hangs on boto file
    # handles, so we're doing it via plain old HTTP.
    table_set = any_tableset(resource.fh(),
                             extension=resource.meta.get('extension'),
                             mimetype=resource.meta.get('mime_type'))
    tables = list(table_set.tables)
    if not len(tables):
        log.error("No tables were found in the source file.")
        return

    row_set = tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    types = type_guess(row_set.sample, strict=True)
    row_set.register_processor(types_processor(types))
    return row_set

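# The header/offset/type processors registered above are the canonical
# messytables recipe; once they are in place, iterating the row set yields
# typed cells. A minimal, hedged sketch of consuming such a row set as plain
# dicts; the helper name `rows_as_dicts` and the sample file 'data.csv' are
# illustrative assumptions, not part of the original snippets.
from messytables import (CSVTableSet, headers_guess, headers_processor,
                         offset_processor, type_guess, types_processor)


def rows_as_dicts(row_set):
    # Guess the header row, skip past it, and cast cells to guessed types.
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    types = type_guess(row_set.sample, strict=True)
    row_set.register_processor(types_processor(types))
    for row in row_set:
        # Each cell carries its column name and an already-cast value.
        yield dict((cell.column, cell.value) for cell in row)


# Illustrative usage against a local CSV file.
with open('data.csv', 'rb') as fh:
    for record in rows_as_dicts(CSVTableSet(fh).tables[0]):
        print(record)
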
def test_simple_zip(self):
    fh = horror_fobj('simple.zip')
    table_set = any_tableset(fh, extension='zip')
    assert isinstance(table_set, ZIPTableSet)

def push_to_datastore(self, context, resource):
    # Get the resource's content hash, which is used to check whether the
    # resource file has changed since last time.
    hash_dict = resource.get('hash')
    if hash_dict:
        original_content_hash = json.loads(hash_dict)['content']
        check_hash = not self.options.force
    else:
        # This resource has no hash yet, it must be a new resource.
        original_content_hash = ''
        check_hash = False

    try:
        result = fetch_resource.download(context, resource,
                                         self.max_content_length,
                                         DATA_FORMATS,
                                         check_modified=check_hash)
    except fetch_resource.ResourceNotModified as e:
        logger.info(
            u'Skipping unmodified resource: {0}'.format(resource['url'])
        )
        return {'success': True,
                'resource': resource['id'],
                'error': None}
    except Exception as e:
        logger.exception(e)
        return {'success': False,
                'resource': resource['id'],
                'error': 'Could not download resource'}

    if check_hash and (result['hash'] == original_content_hash):
        logger.info(
            u'Skipping unmodified resource: {0}'.format(resource['url'])
        )
        os.remove(result['saved_file'])
        return {'success': True,
                'resource': resource['id'],
                'error': None}

    content_type = result['headers'].get('content-type', '')\
        .split(';', 1)[0]  # remove parameters

    f = open(result['saved_file'], 'rb')

    try:
        table_sets = any_tableset(
            f,
            mimetype=content_type,
            extension=resource['format'].lower()
        )
        # only first sheet in xls for time being
        row_set = table_sets.tables[0]
        offset, headers = headers_guess(row_set.sample)
    except Exception as e:
        logger.exception(e)
        os.remove(result['saved_file'])
        return {'success': False,
                'resource': resource['id'],
                'error': 'Error parsing the resource'}

    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(datetime_procesor())

    logger.info('Header offset: {0}.'.format(offset))

    guessed_types = type_guess(
        row_set.sample,
        [
            messytables.types.StringType,
            messytables.types.IntegerType,
            messytables.types.FloatType,
            messytables.types.DecimalType,
            messytables.types.DateUtilType
        ],
        strict=True
    )
    logger.info('Guessed types: {0}'.format(guessed_types))
    row_set.register_processor(types_processor(guessed_types, strict=True))
    row_set.register_processor(stringify_processor())

    guessed_type_names = [TYPE_MAPPING[type(gt)] for gt in guessed_types]

    def send_request(data):
        data_dict = {
            'resource_id': resource['id'],
            'fields': [dict(id=name, type=typename)
                       for name, typename in zip(headers,
                                                 guessed_type_names)],
            'records': data,
            'force': True,
        }
        response = toolkit.get_action('datastore_create')(
            context,
            data_dict
        )
        return response

    # Delete any existing data before proceeding. Otherwise
    # 'datastore_create' will append to the existing datastore. And if the
    # fields have significantly changed, it may also fail.
    logger.info('Trying to delete existing datastore for resource {0} '
                '(may not exist).'.format(resource['id']))
    try:
        toolkit.get_action('datastore_delete')(
            context, {'resource_id': resource['id'], 'force': True}
        )
    except toolkit.ObjectNotFound:
        logger.info('Datastore not found for resource {0}.'.format(
            resource['id']))
    except Exception as e:
        logger.exception(e)

    logger.info('Creating: {0}.'.format(resource['id']))

    # generates chunks of data that can be loaded into ckan
    # n is the maximum size of a chunk
    def chunky(iterable, n):
        it = iter(iterable)
        while True:
            chunk = list(
                itertools.imap(
                    dict, itertools.islice(it, n)))
            if not chunk:
                return
            yield chunk

    count = 0
    try:
        for data in chunky(row_set.dicts(), 100):
            count += len(data)
            send_request(data)
    except Exception as e:
        logger.exception(e)
        os.remove(result['saved_file'])
        return {'success': False,
                'resource': resource['id'],
                'error': 'Error pushing data to datastore'}

    logger.info("There should be {n} entries in {res_id}.".format(
        n=count,
        res_id=resource['id']
    ))

    resource.update({
        'webstore_url': 'active',
        'webstore_last_updated': datetime.now().isoformat()
    })

    toolkit.get_action('resource_update')(context, resource)

    os.remove(result['saved_file'])

    return {'success': True, 'resource': resource['id'], 'error': None}

def parse_csv(filename, cfg_in):
    """
    Guess csv structure

    :param filename:
    :param cfg_in:
    :param known_structure: list of strings formats in order of columns,
        from start but may be not all (the rest is auto treated)
    :return: lst_types, offset, headers

    * quotechar - specifies a one-character string to use as the quoting
      character. It defaults to '"'.
    * delimiter - specifies a one-character string to use as the field
      separator. It defaults to ','.
    * skipinitialspace - specifies how to interpret whitespace which
      immediately follows a delimiter. It defaults to False, which means
      that whitespace immediately following a delimiter is part of the
      following field.
    * lineterminator - specifies the character sequence which should
      terminate rows.
    * quoting - controls when quotes should be generated by the writer.
      It can take on any of the following module constants:
        csv.QUOTE_MINIMAL means only when required, for example, when a
            field contains either the quotechar or the delimiter
        csv.QUOTE_ALL means that quotes are always placed around fields.
        csv.QUOTE_NONNUMERIC means that quotes are always placed around
            fields which do not parse as integers or floating point numbers.
        csv.QUOTE_NONE means that quotes are never placed around fields.
    * escapechar - specifies a one-character string used to escape the
      delimiter when quoting is set to QUOTE_NONE.
    * doublequote - controls the handling of quotes inside fields. When
      True, two consecutive quotes are interpreted as one during read,
      and when writing, each quote character embedded in the data is
      written as two quotes.

    Example: parse_csv(filename, ['%H:%M:%S'])
    """
    set_field_if_no(cfg_in, 'types', [])
    set_field_if_no(cfg_in, 'delimiter')
    with open(filename, 'rb') as fh:
        ext = os_path.splitext(filename)[1]
        # Load a file object:
        try:
            # If you are sure that file is csv use CSVTableSet(fh)
            from magic import MagicException  # because any_tableset uses libmagic
            table_set = any_tableset(fh, mimetype=None, extension=ext,
                                     delimiter=cfg_in['delimiter'])
        except (ImportError, MagicException) as e:
            print('There is an error ', standard_error_info(e),
                  '\n=> Loading file as csv without trying other formats')
            table_set = CSVTableSet(fh, delimiter=cfg_in['delimiter'])

        # A table set is a collection of tables:
        row_set = table_set.tables[0]
        # A row set is an iterator over the table, but it can only
        # be run once. To peek, a sample is provided:

        # guess header names and the offset of the header:
        offset, headers = headers_guess(row_set.sample)  # tolerance=1
        row_set.register_processor(headers_processor(headers))
        # add one to begin with content, not the header:
        row_set.register_processor(offset_processor(offset + 1))
        # guess column types:
        lst_types = type_guess(row_set.sample, strict=True)
        row_sample = next(row_set.sample)

        # check not detected types
        def formats2types(formats_str):
            for f in formats_str:
                if f:
                    if is_date_format(f):
                        yield types.DateType(f)
                    else:
                        yield TimeType()
                else:
                    yield None

        known_types = formats2types(cfg_in['types'])

        for n, (t, s, kt) in enumerate(zip(lst_types, row_sample, known_types)):
            if t.result_type == types.StringType.result_type:
                # not auto detected? -> check known_types
                if kt.test(s.value):
                    lst_types[n] = kt  # t = kt
                else:  # known_types fits element
                    print("col's#{:d} value \"{}\" type not match provided "
                          "type of {}".format(n, s.value, type(kt)))
                    # kt = types.DateType('mm/dd/yyyy')
                    # kt.test('0' + s.value)
                    # detect?
            else:
                pass
        # not works for time type:
        # print(jts.headers_and_typed_as_jts(headers,
        #       list(map(jts.celltype_as_string, lst_types))).as_json())
    return lst_types, offset, headers

def test_libreoffice_xlsx(self):
    fh = horror_fobj('libreoffice.xlsx')
    table_set = any_tableset(fh)
    row_set = table_set.tables[0]
    data = list(row_set)
    assert_equal(0, len(data))

def _datastorer_upload(context, resource, logger):
    result = download(context, resource, data_formats=DATA_FORMATS)

    content_type = result['headers'].get('content-type', '')\
        .split(';', 1)[0]  # remove parameters

    f = open(result['saved_file'], 'rb')
    table_sets = any_tableset(f,
                              mimetype=content_type,
                              extension=resource['format'].lower())

    # only first sheet in xls for time being
    row_set = table_sets.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(datetime_procesor())

    logger.info('Header offset: {0}.'.format(offset))

    guessed_types = type_guess(row_set.sample, [
        messytables.types.StringType,
        messytables.types.IntegerType,
        messytables.types.FloatType,
        messytables.types.DecimalType,
        messytables.types.DateUtilType
    ], strict=True)
    logger.info('Guessed types: {0}'.format(guessed_types))
    row_set.register_processor(types_processor(guessed_types, strict=True))
    row_set.register_processor(stringify_processor())

    ckan_url = context['site_url'].rstrip('/')

    datastore_create_request_url = '%s/api/action/datastore_create' % (
        ckan_url)

    guessed_type_names = [TYPE_MAPPING[type(gt)] for gt in guessed_types]

    def send_request(data):
        request = {
            'resource_id': resource['id'],
            'fields': [
                dict(id=name, type=typename)
                for name, typename in zip(headers, guessed_type_names)
            ],
            'force': True,
            'records': data
        }
        response = requests.post(
            datastore_create_request_url,
            data=json.dumps(request),
            headers={
                'Content-Type': 'application/json',
                'Authorization': context['apikey']
            },
        )
        check_response_and_retry(response, datastore_create_request_url,
                                 logger)

    # Delete any existing data before proceeding. Otherwise 'datastore_create'
    # will append to the existing datastore. And if the fields have
    # significantly changed, it may also fail.
    try:
        logger.info(
            'Deleting existing datastore (it may not exist): {0}.'.format(
                resource['id']))
        response = requests.post('%s/api/action/datastore_delete' % (ckan_url),
                                 data=json.dumps({
                                     'resource_id': resource['id'],
                                     'force': True
                                 }),
                                 headers={
                                     'Content-Type': 'application/json',
                                     'Authorization': context['apikey']
                                 })
        if not response.status_code or response.status_code not in (200, 404):
            # skips 200 (OK) or 404 (datastore does not exist, no need to delete it)
            logger.error('Deleting existing datastore failed: {0}'.format(
                get_response_error(response)))
            raise DatastorerException("Deleting existing datastore failed.")
    except requests.exceptions.RequestException as e:
        logger.error('Deleting existing datastore failed: {0}'.format(str(e)))
        raise DatastorerException("Deleting existing datastore failed.")

    logger.info('Creating: {0}.'.format(resource['id']))

    # generates chunks of data that can be loaded into ckan
    # n is the maximum size of a chunk
    def chunky(iterable, n):
        it = iter(iterable)
        while True:
            chunk = list(itertools.imap(dict, itertools.islice(it, n)))
            if not chunk:
                return
            yield chunk

    count = 0
    for data in chunky(row_set.dicts(), 100):
        count += len(data)
        send_request(data)

    logger.info("There should be {n} entries in {res_id}.".format(
        n=count, res_id=resource['id']))

    ckan_request_url = ckan_url + '/api/action/resource_update'

    resource.update({
        'webstore_url': 'active',
        'webstore_last_updated': datetime.datetime.now().isoformat()
    })

    response = requests.post(ckan_request_url,
                             data=json.dumps(resource),
                             headers={
                                 'Content-Type': 'application/json',
                                 'Authorization': context['apikey']
                             })

    if response.status_code not in (201, 200):
        raise DatastorerException(
            'Ckan bad response code (%s). Response was %s' %
            (response.status_code, response.content))

def check_filename(d):
    if not d['tableset']:
        raise SkipTest("Optional library not installed. Skipping")
    fh = horror_fobj(d['filename'])
    table_set = any_tableset(fh, extension=d['filename'], auto_detect=False)
    assert isinstance(table_set, d['tableset']), type(table_set)

def check_no_filename(d):
    fh = horror_fobj(d['filename'])
    table_set = any_tableset(fh)
    assert isinstance(table_set, d['tableset']), type(table_set)

def test_scraperwiki_xlsx(self):
    fh = horror_fobj('sw_gen.xlsx')
    table_set = any_tableset(fh)
    row_set = table_set.tables[0]
    data = list(row_set)
    assert_equal(16, len(data))

def push_to_datastore(task_id, input, dry_run=False):
    '''Download and parse a resource, then push its data into CKAN's
    DataStore.

    An asynchronous job that gets a resource from CKAN, downloads the
    resource's data file and, if the data file has changed since last time,
    parses the data and posts it into CKAN's DataStore.

    :param dry_run: Fetch and parse the data file but don't actually post the
        data to the DataStore, instead return the data headers and rows that
        would have been posted.
    :type dry_run: boolean

    '''
    handler = util.StoringHandler(task_id, input)
    logger = logging.getLogger(task_id)
    logger.addHandler(handler)
    logger.setLevel(logging.DEBUG)

    validate_input(input)

    data = input['metadata']

    ckan_url = data['ckan_url']
    resource_id = data['resource_id']
    api_key = input.get('api_key')

    try:
        resource = get_resource(resource_id, ckan_url, api_key)
    except util.JobError as e:
        # try again in 5 seconds just in case CKAN is slow at adding resource
        time.sleep(5)
        resource = get_resource(resource_id, ckan_url, api_key)

    # check if the resource url_type is a datastore
    if resource.get('url_type') == 'datastore':
        logger.info('Dump files are managed with the Datastore API')
        return

    # check scheme
    url = resource.get('url')
    scheme = urlsplit(url).scheme
    if scheme not in ('http', 'https', 'ftp'):
        raise util.JobError(
            'Only http, https, and ftp resources may be fetched.'
        )

    # fetch the resource data
    logger.info('Fetching from: {0}'.format(url))
    headers = {}
    if resource.get('url_type') == 'upload':
        # If this is an uploaded file to CKAN, authenticate the request,
        # otherwise we won't get file from private resources
        headers['Authorization'] = api_key
    try:
        response = requests.get(
            url,
            headers=headers,
            timeout=DOWNLOAD_TIMEOUT,
            verify=SSL_VERIFY,
            stream=True,  # just gets the headers for now
        )
        response.raise_for_status()

        cl = response.headers.get('content-length')
        try:
            if cl and int(cl) > MAX_CONTENT_LENGTH:
                raise util.JobError(
                    'Resource too large to download: {cl} > max ({max_cl}).'
                    .format(cl=cl, max_cl=MAX_CONTENT_LENGTH))
        except ValueError:
            pass

        tmp = tempfile.TemporaryFile()
        length = 0
        m = hashlib.md5()
        for chunk in response.iter_content(CHUNK_SIZE):
            length += len(chunk)
            if length > MAX_CONTENT_LENGTH:
                raise util.JobError(
                    'Resource too large to process: {cl} > max ({max_cl}).'
                    .format(cl=length, max_cl=MAX_CONTENT_LENGTH))
            tmp.write(chunk)
            m.update(chunk)

        ct = response.headers.get('content-type', '').split(';', 1)[0]

    except requests.HTTPError as e:
        raise HTTPError(
            "DataPusher received a bad HTTP response when trying to download "
            "the data file", status_code=e.response.status_code,
            request_url=url, response=e.response.content)
    except requests.RequestException as e:
        raise HTTPError(
            message=str(e), status_code=None,
            request_url=url, response=None)

    file_hash = m.hexdigest()
    tmp.seek(0)

    if (resource.get('hash') == file_hash
            and not data.get('ignore_hash')):
        logger.info("The file hash hasn't changed: {hash}.".format(
            hash=file_hash))
        return

    resource['hash'] = file_hash

    try:
        table_set = messytables.any_tableset(tmp, mimetype=ct, extension=ct)
    except messytables.ReadError as e:
        # try again with format
        tmp.seek(0)
        try:
            format = resource.get('format')
            table_set = messytables.any_tableset(tmp, mimetype=format,
                                                 extension=format)
        except Exception:
            raise util.JobError(e)

    get_row_set = web.app.config.get('GET_ROW_SET',
                                     lambda table_set: table_set.tables.pop())
    row_set = get_row_set(table_set)
    offset, headers = messytables.headers_guess(row_set.sample)

    existing = datastore_resource_exists(resource_id, api_key, ckan_url)
    existing_info = None
    if existing:
        existing_info = dict((f['id'], f['info'])
                             for f in existing.get('fields', [])
                             if 'info' in f)

    # Some headers might have been converted from strings to floats and such.
    headers = [str(header) for header in headers]

    row_set.register_processor(messytables.headers_processor(headers))
    row_set.register_processor(messytables.offset_processor(offset + 1))
    types = messytables.type_guess(row_set.sample, types=TYPES, strict=True)

    # override with types user requested
    if existing_info:
        types = [{
            'text': messytables.StringType(),
            'numeric': messytables.DecimalType(),
            'timestamp': messytables.DateUtilType(),
        }.get(existing_info.get(h, {}).get('type_override'), t)
            for t, h in zip(types, headers)]

    row_set.register_processor(messytables.types_processor(types))

    headers = [header.strip() for header in headers if header.strip()]
    headers_set = set(headers)

    def row_iterator():
        for row in row_set:
            data_row = {}
            for index, cell in enumerate(row):
                column_name = cell.column.strip()
                if column_name not in headers_set:
                    continue
                if isinstance(cell.value, str):
                    try:
                        data_row[column_name] = cell.value.encode(
                            'latin-1').decode('utf-8')
                    except (UnicodeDecodeError, UnicodeEncodeError):
                        data_row[column_name] = cell.value
                else:
                    data_row[column_name] = cell.value
            yield data_row

    result = row_iterator()

    '''
    Delete existing datastore resource before proceeding. Otherwise
    'datastore_create' will append to the existing datastore. And if the
    fields have significantly changed, it may also fail.
    '''
    if existing:
        logger.info('Deleting "{res_id}" from datastore.'.format(
            res_id=resource_id))
        delete_datastore_resource(resource_id, api_key, ckan_url)

    headers_dicts = [dict(id=field[0], type=TYPE_MAPPING[str(field[1])])
                     for field in zip(headers, types)]

    # Maintain data dictionaries from matching column names
    if existing_info:
        for h in headers_dicts:
            if h['id'] in existing_info:
                h['info'] = existing_info[h['id']]
                # create columns with types user requested
                type_override = existing_info[h['id']].get('type_override')
                if type_override in list(_TYPE_MAPPING.values()):
                    h['type'] = type_override

    logger.info('Determined headers and types: {headers}'.format(
        headers=headers_dicts))

    if dry_run:
        return headers_dicts, result

    count = 0
    for i, chunk in enumerate(chunky(result, 250)):
        records, is_it_the_last_chunk = chunk
        count += len(records)
        logger.info('Saving chunk {number} {is_last}'.format(
            number=i, is_last='(last)' if is_it_the_last_chunk else ''))
        send_resource_to_datastore(resource, headers_dicts, records,
                                   is_it_the_last_chunk, api_key, ckan_url)

    logger.info('Successfully pushed {n} entries to "{res_id}".'.format(
        n=count, res_id=resource_id))

    if data.get('set_url_type', False):
        update_resource(resource, api_key, ckan_url)

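# The loop above unpacks each item from chunky(result, 250) as
# (records, is_it_the_last_chunk), but the helper itself is not shown here.
# A hedged sketch of a compatible generator, inferred from how the loop
# consumes it rather than taken from the project's code.
import itertools


def chunky(iterable, n):
    # Yield (chunk, is_last) pairs, each chunk holding at most n items.
    it = iter(iterable)
    chunk = list(itertools.islice(it, n))
    while chunk:
        next_chunk = list(itertools.islice(it, n))
        yield chunk, not next_chunk  # is_last is True only for the final chunk
        chunk = next_chunk
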
def test_simple_csv(self):
    fh = horror_fobj('simple.csv')
    table_set = any_tableset(fh, extension='csv')
    assert isinstance(table_set, CSVTableSet)

def check_no_filename(d):
    if not d['tableset']:
        raise SkipTest("Optional library not installed. Skipping")
    fh = horror_fobj(d['filename'])
    table_set = any_tableset(fh)
    assert isinstance(table_set, d['tableset']), type(table_set)

f = cStringIO.StringIO(file_content)
## #f = cStringIO.StringIO(response.read())

file_hash = hashlib.md5(f.read()).hexdigest()
f.seek(0)

if (resource.get('hash') == file_hash
        and not data.get('ignore_hash')):
    logger.info(
        "The file hash hasn't changed: {hash}.".format(hash=file_hash))
    return

resource['hash'] = file_hash

try:
    table_set = messytables.any_tableset(f, mimetype=ct, extension=ct)
except messytables.ReadError as e:
    ## try again with format
    f.seek(0)
    try:
        format = resource.get('format')
        table_set = messytables.any_tableset(f, mimetype=format,
                                             extension=format)
    except Exception:
        raise util.JobError(e)

row_set = table_set.tables.pop()
offset, headers = messytables.headers_guess(row_set.sample)

existing = datastore_resource_exists(resource_id, api_key, ckan_url)

def validate_file(file_tmp, file_name, tmp_filepath):
    log.info("upload: checking file * %s * ", file_name)
    MAX_HEADER_LENGTH = 64
    # not allowed characters ( - ' " ’ ‘) regex
    inappropriate_chars = re.compile(r"[\-|\'|\"|\u2018|\u2019]")
    datastore_ext = config.get('ckan.mimetype_guess', "csv xls xlsx tsv")
    tmp_file_name, tmp_file_ext = os.path.splitext(file_name)
    tmp_file_ext_str = tmp_file_ext[1:].lower()

    # check if datastore file (csv xls xlsx tsv)
    if tmp_file_ext_str in datastore_ext:
        try:
            table_set = any_tableset(file_tmp)
        except Exception:
            log.info("file is not valid * %s * ", file_name)
            raise logic.ValidationError({'upload': ['The file is not valid']})

        # check if only one data sheet in the file
        if len(table_set.tables) > 1:
            rollback_tmp(file_tmp, tmp_filepath)
            log.error(
                "upload: the file * %s * was not uploaded - There is more than one data sheet in the file",
                file_name)
            raise logic.ValidationError({
                'upload': [_('There is more than one data sheet in the file')]
            })
        # check if table_set is not empty
        elif len(table_set.tables) > 0:
            row_set = table_set.tables[0]
            # guess header names and the offset of the header:
            offset, headers = headers_guess(row_set.sample)
            row_set.register_processor(headers_processor(headers))
            for header in headers:
                # too long header
                if len(header) > MAX_HEADER_LENGTH:
                    rollback_tmp(file_tmp, tmp_filepath)
                    log.error(
                        "upload: the file * %s * was not uploaded - too long header - * %s *",
                        file_name, header)
                    raise logic.ValidationError(
                        {'upload': [_('too long header (64 max)')]})
                # not allowed characters in header ( - ' " ’ ‘)
                # if inappropriate_chars.search(header):
                #     rollback_tmp(file_tmp, tmp_filepath)
                #     log.error("upload: the file * %s * was not uploaded - there are inappropriate characters in headers * %s *",
                #               file_name, header)
                #     raise logic.ValidationError(
                #         {'upload': [_('there are inappropriate characters in headers (apostrophe/apostrophes/dash)')]}
                #     )

            # Check for duplicate fields
            unique_fields = set(headers)
            if not len(unique_fields) == len(headers):
                rollback_tmp(file_tmp, tmp_filepath)
                log.error(
                    "upload: the file * %s * was not uploaded - Duplicate column names are not supported",
                    file_name)
                raise logic.ValidationError({
                    'upload': [_('Duplicate column names are not supported')]
                })
            log.info(
                "passed validation successfully - the file * %s * was uploaded to CKAN (filestore)",
                file_name)
        else:
            log.info(
                "no table_set was created by messytables - skip headers validations in the file * %s * ",
                file_name)
    else:
        pass

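# The two validate_file variants above mix CKAN plumbing (rollback, logging,
# ValidationError) with the actual header checks. A framework-free sketch of
# the same checks; the helper name `check_headers`, the 64-character limit
# and the returned error list are illustrative assumptions.
import messytables

MAX_HEADER_LENGTH = 64


def check_headers(fileobj):
    # Return a list of header problems found in the file's single sheet.
    problems = []
    table_set = messytables.any_tableset(fileobj)
    if len(table_set.tables) != 1:
        problems.append('expected exactly one data sheet, found %d'
                        % len(table_set.tables))
        return problems
    row_set = table_set.tables[0]
    _, headers = messytables.headers_guess(row_set.sample)
    if len(set(headers)) != len(headers):
        problems.append('duplicate column names are not supported')
    for header in headers:
        if len(header) > MAX_HEADER_LENGTH:
            problems.append('header too long (%d max): %r'
                            % (MAX_HEADER_LENGTH, header))
    return problems
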
def test_unknown(self):
    fh = horror_fobj('simple.unknown')
    self.assertRaises(ValueError,
                      lambda: any_tableset(fh, extension='unknown'))

file_hash = m.hexdigest()
tmp.seek(0)

if (resource.get('hash') == file_hash
        and not data.get('ignore_hash')):
    logger.info(
        "The file hash hasn't changed: {hash}.".format(hash=file_hash))
    return

resource['hash'] = file_hash

# Decoded data if needed
decoded_tmp = force_decode(tmp)

try:
    table_set = messytables.any_tableset(decoded_tmp, mimetype=ct,
                                         extension=ct)
except messytables.ReadError as e:
    ## try again with format
    decoded_tmp.seek(0)
    try:
        format = resource.get('format')
        table_set = messytables.any_tableset(decoded_tmp, mimetype=format,
                                             extension=format)
    except Exception:
        raise util.JobError(e)

row_set = table_set.tables.pop()
offset, headers = messytables.headers_guess(row_set.sample)

def check_filename(d):
    fh = horror_fobj(d['filename'])
    table_set = any_tableset(fh, extension=d['filename'], auto_detect=False)
    assert isinstance(table_set, d['tableset']), type(table_set)

def test_simple_xlsx(self):
    fh = horror_fobj('simple.xlsx')
    table_set = any_tableset(fh, extension='xlsx')
    assert isinstance(table_set, XLSXTableSet)

def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
    '''Loads an Excel file (or other tabular data recognized by messytables)
    into Datastore and creates indexes.

    Largely copied from datapusher - see below. Is slower than load_csv.
    '''
    # use messytables to determine the header row
    logger.info('Determining column names and types')
    ct = mimetype
    format = os.path.splitext(table_filepath)[1]  # filename extension
    with open(table_filepath, 'rb') as tmp:

        #
        # Copied from datapusher/jobs.py:push_to_datastore
        #

        try:
            table_set = messytables.any_tableset(tmp, mimetype=ct,
                                                 extension=ct)
        except messytables.ReadError as e:
            # try again with format
            tmp.seek(0)
            try:
                table_set = messytables.any_tableset(tmp, mimetype=format,
                                                     extension=format)
            except Exception as e:
                raise LoaderError(e)

        if not table_set.tables:
            raise LoaderError('Could not parse file as tabular data')
        row_set = table_set.tables.pop()
        offset, headers = messytables.headers_guess(row_set.sample)

        existing = datastore_resource_exists(resource_id)
        existing_info = None
        if existing:
            existing_info = dict(
                (f['id'], f['info'])
                for f in existing.get('fields', []) if 'info' in f)

        # Some headers might have been converted from strings to floats
        # and such.
        headers = encode_headers(headers)

        row_set.register_processor(messytables.headers_processor(headers))
        row_set.register_processor(messytables.offset_processor(offset + 1))
        TYPES, TYPE_MAPPING = get_types()
        types = messytables.type_guess(row_set.sample, types=TYPES,
                                       strict=True)

        # override with types user requested
        if existing_info:
            types = [{
                'text': messytables.StringType(),
                'numeric': messytables.DecimalType(),
                'timestamp': messytables.DateUtilType(),
            }.get(existing_info.get(h, {}).get('type_override'), t)
                for t, h in zip(types, headers)]

        row_set.register_processor(messytables.types_processor(types))

        headers = [header.strip()[:MAX_COLUMN_LENGTH]
                   for header in headers if header.strip()]
        headers_set = set(headers)

        def row_iterator():
            for row in row_set:
                data_row = {}
                for index, cell in enumerate(row):
                    column_name = cell.column.strip()
                    if column_name not in headers_set:
                        continue
                    data_row[column_name] = cell.value
                yield data_row
        result = row_iterator()

        '''
        Delete existing datastore resource before proceeding. Otherwise
        'datastore_create' will append to the existing datastore. And if
        the fields have significantly changed, it may also fail.
        '''
        if existing:
            logger.info('Deleting "{res_id}" from datastore.'.format(
                res_id=resource_id))
            delete_datastore_resource(resource_id)

        headers_dicts = [
            dict(id=field[0], type=TYPE_MAPPING[str(field[1])])
            for field in zip(headers, types)
        ]

        # Maintain data dictionaries from matching column names
        if existing_info:
            for h in headers_dicts:
                if h['id'] in existing_info:
                    h['info'] = existing_info[h['id']]
                    # create columns with types user requested
                    type_override = existing_info[h['id']].get('type_override')
                    if type_override in _TYPE_MAPPING.values():
                        h['type'] = type_override

        logger.info('Determined headers and types: {headers}'.format(
            headers=headers_dicts))

        ### Commented - this is only for tests
        # if dry_run:
        #     return headers_dicts, result

        logger.info('Copying to database...')
        count = 0
        for i, records in enumerate(chunky(result, 250)):
            count += len(records)
            logger.info('Saving chunk {number}'.format(number=i))
            send_resource_to_datastore(resource_id, headers_dicts, records)
        logger.info('...copying done')

        if count:
            logger.info(
                'Successfully pushed {n} entries to "{res_id}".'.format(
                    n=count, res_id=resource_id))
        else:
            # no datastore table is created
            raise LoaderError('No entries found - nothing to load')

def test_xlsm(self):
    fh = horror_fobj('bian-anal-mca-2005-dols-eng-1011-0312-tab3.xlsm')
    table_set = any_tableset(fh, extension='xls')
    row_set = table_set.tables[0]
    data = list(row_set)
    assert_equal(62, len(data))

def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
    '''Loads a CSV into DataStore. Does not create the indexes.'''

    # use messytables to determine the header row
    extension = os.path.splitext(csv_filepath)[1]
    tempdir = tempfile.mkdtemp(suffix=resource_id)
    if extension.lower() == '.zip':
        with zipfile.ZipFile(csv_filepath, "r") as zip_ref:
            csvfiles = [file for file in zip_ref.filelist
                        if file.filename.lower().endswith('.csv')]
            if len(csvfiles) == 0:
                logger.info("no csvfiles found in %s" % csv_filepath)
            if len(csvfiles) > 0:
                if len(csvfiles) > 1:
                    logger.info(
                        "multiple csv files found in %s, only one will be "
                        "ingested: %s" % (csv_filepath, csvfiles[0].filename))
                else:
                    logger.info("unzipping %s and ingesting %s"
                                % (csv_filepath, csvfiles[0].filename))
                zip_ref.extract(csvfiles[0], tempdir)
                new_loc = os.path.join(tempdir, csvfiles[0].filename)
                csv_filepath = new_loc
                extension = os.path.splitext(csv_filepath)[1]
                logger.info("unzipped %s" % csvfiles[0].filename)
            # close.
            zip_ref.close()

    with open(csv_filepath, 'rb') as f:
        try:
            table_set = messytables.any_tableset(f, mimetype=mimetype,
                                                 extension=extension)
        except messytables.ReadError as e:
            # # try again with format
            # f.seek(0)
            # try:
            #     format = resource.get('format')
            #     table_set = messytables.any_tableset(f, mimetype=format,
            #                                          extension=format)
            # except Exception:
            raise LoaderError('Messytables error: {}'.format(e))

        if not table_set.tables:
            raise LoaderError('Could not detect tabular data in this file')
        row_set = table_set.tables.pop()
        header_offset, headers = messytables.headers_guess(row_set.sample)

    # Some headers might have been converted from strings to floats and such.
    headers = encode_headers(headers)

    # Guess the delimiter used in the file
    with open(csv_filepath, 'r') as f:
        header_line = f.readline()
    try:
        sniffer = csv.Sniffer()
        delimiter = sniffer.sniff(header_line).delimiter
    except csv.Error:
        logger.warning(
            'Could not determine delimiter from file, use default ","')
        delimiter = ','

    # Setup the converters that run when you iterate over the row_set.
    # With pgloader only the headers will be iterated over.
    row_set.register_processor(messytables.headers_processor(headers))
    row_set.register_processor(messytables.offset_processor(header_offset + 1))
    # types = messytables.type_guess(row_set.sample, types=TYPES, strict=True)

    headers = [header.strip()[:MAX_COLUMN_LENGTH]
               for header in headers if header.strip()]
    # headers_dicts = [dict(id=field[0], type=TYPE_MAPPING[str(field[1])])
    #                  for field in zip(headers, types)]

    # TODO worry about csv header name problems
    # e.g. duplicate names

    # encoding (and line ending?) - use chardet
    # It is easier to reencode it as UTF8 than convert the name of the
    # encoding to one that pgloader will understand.
    logger.info('Ensuring character coding is UTF8')
    f_write = tempfile.NamedTemporaryFile(suffix=extension, delete=False)
    try:
        with open(csv_filepath, 'rb') as f_read:
            csv_decoder = messytables.commas.UTF8Recoder(f_read,
                                                         encoding=None)
            for line in csv_decoder:
                f_write.write(line)
            f_write.close()  # ensures the last line is written
            csv_filepath = f_write.name
        logger.info('Ensuring character coding is UTF8 complete')

        # check tables exists

        # datastore db connection
        engine = get_write_engine()

        # get column info from existing table
        existing = datastore_resource_exists(resource_id)
        existing_info = {}
        if existing:
            existing_info = dict((f['id'], f['info'])
                                 for f in existing.get('fields', [])
                                 if 'info' in f)

            '''
            Delete existing datastore table before proceeding. Otherwise
            the COPY will append to the existing table. And if the fields
            have significantly changed, it may also fail.
            '''
            logger.info('Deleting "{res_id}" from DataStore.'.format(
                res_id=resource_id))
            delete_datastore_resource(resource_id)

        # Columns types are either set (overridden) in the Data Dictionary
        # page or default to text type (which is robust)
        fields = [
            {'id': header_name,
             'type': existing_info.get(header_name, {})
             .get('type_override') or 'text',
             }
            for header_name in headers]

        # Maintain data dictionaries from matching column names
        if existing_info:
            for f in fields:
                if f['id'] in existing_info:
                    f['info'] = existing_info[f['id']]

        logger.info('Fields: {}'.format(fields))

        # Create table
        from ckan import model
        context = {'model': model, 'ignore_auth': True}
        data_dict = dict(
            resource_id=resource_id,
            fields=fields,
        )
        data_dict['records'] = None  # just create an empty table
        data_dict['force'] = True  # TODO check this - I don't fully
        # understand read-only/datastore resources
        try:
            p.toolkit.get_action('datastore_create')(context, data_dict)
        except p.toolkit.ValidationError as e:
            if 'fields' in e.error_dict:
                # e.g. {'message': None, 'error_dict': {'fields': [u'"***" is not a valid field name']}, '_error_summary': None}
                error_message = e.error_dict['fields'][0]
                raise LoaderError(
                    'Error with field definition: {}'.format(error_message))
            else:
                raise LoaderError(
                    'Validation error when creating the database table: {}'
                    .format(str(e)))
        except Exception as e:
            raise LoaderError(
                'Could not create the database table: {}'.format(e))
        connection = context['connection'] = engine.connect()
        if not fulltext_trigger_exists(connection, resource_id):
            logger.info('Trigger created')
            _create_fulltext_trigger(connection, resource_id)

        # datastore_active is switched on by datastore_create - TODO
        # temporarily disable it until the load is complete

        # logger.info('Disabling row index trigger')
        _disable_fulltext_trigger(connection, resource_id)
        # logger.info('Dropping indexes')
        _drop_indexes(context, data_dict, False)

        logger.info('Copying to database...')

        # Options for loading into postgres:
        # 1. \copy - can't use as that is a psql meta-command and not
        #    accessible via psycopg2
        # 2. COPY - requires the db user to have superuser privileges.
        #    This is dangerous. It is also not available on AWS, for example.
        # 3. pgloader method? - as described in its docs:
        #    Note that while the COPY command is restricted to read either
        #    from its standard input or from a local file on the server's
        #    file system, the command line tool psql implements a \copy
        #    command that knows how to stream a file local to the client
        #    over the network and into the PostgreSQL server, using the same
        #    protocol as pgloader uses.
        # 4. COPY FROM STDIN - not quite as fast as COPY from a file, but
        #    avoids the superuser issue. <-- picked

        # with psycopg2.connect(DSN) as conn:
        #     with conn.cursor() as curs:
        #         curs.execute(SQL)
        raw_connection = engine.raw_connection()
        try:
            cur = raw_connection.cursor()
            try:
                with open(csv_filepath, 'rb') as f:
                    # can't use :param for table name because params are only
                    # for filter values that are single quoted.
                    try:
                        cur.copy_expert(
                            "COPY \"{resource_id}\" ({column_names}) "
                            "FROM STDIN "
                            "WITH (DELIMITER '{delimiter}', FORMAT csv, HEADER 1, "
                            " ENCODING '{encoding}');".format(
                                resource_id=resource_id,
                                column_names=', '.join(
                                    ['"{}"'.format(h) for h in headers]),
                                delimiter=delimiter,
                                encoding='UTF8',
                            ),
                            f)
                    except psycopg2.DataError as e:
                        # e is a str but with foreign chars e.g.
                        # 'extra data: "paul,pa\xc3\xbcl"\n'
                        # but logging and exceptions need a normal (7 bit) str
                        error_str = str(e).decode('ascii', 'replace').encode(
                            'ascii', 'replace')
                        logger.warning(error_str)
                        raise LoaderError(
                            'Error during the load into PostgreSQL:'
                            ' {}'.format(error_str))
            finally:
                cur.close()
        finally:
            raw_connection.commit()
    finally:
        os.remove(csv_filepath)  # i.e. the tempfile
        shutil.rmtree(tempdir)

    logger.info('...copying done')

    logger.info('Creating search index...')
    _populate_fulltext(connection, resource_id, fields=fields)
    logger.info('...search index created')

    return fields

def _datastorer_upload(context, resource, logger):
    result = download(context, resource, data_formats=DATA_FORMATS)
    logger.info('Downloaded resource %r' % (resource))

    content_type = result['headers'].get('content-type', '')\
        .split(';', 1)[0]  # remove parameters

    extension = resource['format'].lower()
    fp = open(result['saved_file'], 'rb')
    if zipfile.is_zipfile(result['saved_file']):
        fp, zf = open_zipped_tableset(fp, extension=extension)
        logger.info('Opened entry %s from ZIP archive %s',
                    zf, result['saved_file'])
    else:
        logger.info('Opened file %s' % (result['saved_file']))

    table_sets = any_tableset(fp, extension=extension)
    if 'sample_size' in context:
        table_sets.window = max(1000, int(context['sample_size']))
        logger.info('Using a sample window of %d', table_sets.window)

    ##only first sheet in xls for time being
    row_set = table_sets.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(datetime_procesor())

    logger.info('Header offset: {0}.'.format(offset))

    guessed_types = type_guess(
        row_set.sample,
        [
            messytables.types.StringType,
            messytables.types.IntegerType,
            messytables.types.FloatType,
            messytables.types.DecimalType,
            messytables.types.DateUtilType
        ],
        strict=True
    )
    logger.info('Guessed types: {0}'.format(guessed_types))
    row_set.register_processor(types_processor(guessed_types, strict=True))
    row_set.register_processor(stringify_processor())

    ckan_url = context['site_url'].rstrip('/')
    datastore_create_request_url = '%s/api/action/datastore_create' % (ckan_url)

    guessed_type_names = [TYPE_MAPPING[type(gt)] for gt in guessed_types]

    def send_request(data):
        request = {'resource_id': resource['id'],
                   'fields': [dict(id=name, type=typename)
                              for name, typename in zip(headers,
                                                        guessed_type_names)],
                   'force': True,
                   'records': data}
        response = requests.post(
            datastore_create_request_url,
            data=json.dumps(request),
            headers={'Content-Type': 'application/json',
                     'Authorization': context['apikey']},
        )
        check_response_and_retry(response, datastore_create_request_url,
                                 logger)

    # Delete any existing data before proceeding. Otherwise 'datastore_create'
    # will append to the existing datastore. And if the fields have
    # significantly changed, it may also fail.
    try:
        logger.info('Deleting existing datastore (it may not exist): '
                    '{0}.'.format(resource['id']))
        response = requests.post(
            '%s/api/action/datastore_delete' % (ckan_url),
            data=json.dumps({'resource_id': resource['id'],
                             'force': True}),
            headers={'Content-Type': 'application/json',
                     'Authorization': context['apikey']}
        )
        if not response.status_code or response.status_code not in (200, 404):
            # skips 200 (OK) or 404 (datastore does not exist, no need to delete it)
            logger.error('Deleting existing datastore failed: {0}'.format(
                get_response_error(response)))
            raise DatastorerException("Deleting existing datastore failed.")
    except requests.exceptions.RequestException as e:
        logger.error('Deleting existing datastore failed: {0}'.format(str(e)))
        raise DatastorerException("Deleting existing datastore failed.")

    logger.info('Creating: {0}.'.format(resource['id']))

    # generates chunks of data that can be loaded into ckan
    # n is the maximum size of a chunk
    def chunky(iterable, n):
        it = iter(iterable)
        while True:
            chunk = list(
                itertools.imap(
                    dict, itertools.islice(it, n)))
            if not chunk:
                return
            yield chunk

    count = 0
    for data in chunky(row_set.dicts(), 100):
        count += len(data)
        send_request(data)

    logger.info("There should be {n} entries in {res_id}.".format(
        n=count, res_id=resource['id']))

    ckan_request_url = ckan_url + '/api/action/resource_update'

    resource.update({
        'webstore_url': 'active',
        'webstore_last_updated': datetime.datetime.now().isoformat()
    })

    response = requests.post(
        ckan_request_url,
        data=json.dumps(resource),
        headers={'Content-Type': 'application/json',
                 'Authorization': context['apikey']})

    if response.status_code not in (201, 200):
        raise DatastorerException(
            'Ckan bad response code (%s). Response was %s'
            % (response.status_code, response.content))

ct = response.info().getheader('content-type').split(';', 1)[0]

f = cStringIO.StringIO(response.read())
file_hash = hashlib.md5(f.read()).hexdigest()
f.seek(0)

if (resource.get('hash') == file_hash
        and not data.get('ignore_hash')):
    logger.info("The file hash hasn't changed: {hash}.".format(
        hash=file_hash))
    return

resource['hash'] = file_hash

try:
    table_set = messytables.any_tableset(f, mimetype=ct, extension=ct)
except messytables.ReadError as e:
    ## try again with format
    f.seek(0)
    try:
        format = resource.get('format')
        table_set = messytables.any_tableset(f, mimetype=format,
                                             extension=format)
    except Exception:
        raise util.JobError(e)

row_set = table_set.tables.pop()
offset, headers = messytables.headers_guess(row_set.sample)
row_set.register_processor(messytables.headers_processor(headers))
row_set.register_processor(messytables.offset_processor(offset + 1))
types = messytables.type_guess(row_set.sample, types=TYPES, strict=True)
row_set.register_processor(messytables.types_processor(types))

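# Several snippets above reference module-level TYPES and TYPE_MAPPING
# constants without defining them. A hedged sketch of plausible definitions,
# modelled on the common DataPusher defaults; treat the exact type choices
# as an assumption rather than the original projects' code.
import messytables

# Candidate cell types handed to type_guess().
TYPES = [messytables.StringType, messytables.DecimalType,
         messytables.IntegerType, messytables.DateUtilType]

# Maps str(cell_type_instance) -- e.g. 'String' -- to a DataStore column type.
TYPE_MAPPING = {
    'String': 'text',
    # 'int' may not be big enough, and type detection may not realize
    # it needs to be big, so integers also map to 'numeric'.
    'Integer': 'numeric',
    'Decimal': 'numeric',
    'DateUtil': 'timestamp',
}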