def main(argv=None):
    args = parse_args(argv)
    if args.file is None:
        # slurp the whole input since there seems to be a bug in messytables
        # which should be able to handle streams but doesn't
        args.file = cStringIO.StringIO(sys.stdin.read())
    relation_key = args_to_relation_key(args)

    table_set = any_tableset(args.file)
    if len(table_set.tables) != 1:
        raise ValueError("Can only handle files with a single table, not %s"
                         % len(table_set.tables))

    row_set = table_set.tables[0]

    # guess header names and the offset of the header:
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(strip_processor())
    row_set.register_processor(headers_processor(headers))
    # Temporarily, mark the offset of the header
    row_set.register_processor(offset_processor(offset + 1))

    # guess types and register them
    types = type_guess(replace_empty_string(row_set.sample), strict=True,
                       types=[StringType, DecimalType, IntegerType])
    row_set.register_processor(types_processor(types))

    # Messytables seems to not handle the case where there are no headers.
    # Work around this as follows:
    # 1) offset must be 0
    # 2) if the types of the data match the headers, assume there are
    #    actually no headers
    if offset == 0:
        try:
            [t.cast(v) for (t, v) in zip(types, headers)]
        except:
            pass
        else:
            # We don't need the headers_processor or the offset_processor
            row_set._processors = []
            row_set.register_processor(strip_processor())
            row_set.register_processor(types_processor(types))
            headers = None

    # Construct the Myria schema
    schema = messy_to_schema(types, headers)
    logging.info("Myria schema: {}".format(json.dumps(schema)))

    # Prepare data for writing to Myria
    data, kwargs = write_data(row_set, schema)

    if not args.dry:
        # Connect to Myria and send the data
        connection = myria.MyriaConnection(hostname=args.hostname,
                                           port=args.port, ssl=args.ssl)
        ret = connection.upload_file(relation_key, schema, data,
                                     args.overwrite, **kwargs)
        sys.stdout.write(pretty_json(ret))
    else:
        sys.stdout.write(data)
def test_null_process(self):
    fh = horror_fobj('null.csv')
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    row_set.register_processor(null_processor(['null']))
    data = list(row_set)
    nones = [[x.value is None for x in row] for row in data]
    assert_equal(nones[0], [False, True, False, False])
    assert_equal(nones[1], [False, False, False, True])
    assert_equal(nones[2], [False, True, False, False])

    types = type_guess(row_set.sample, strict=True)
    expected_types = [IntegerType(), BoolType(), BoolType(), BoolType()]
    assert_equal(types, expected_types)
    row_set.register_processor(types_processor(types))

    # after applying the types, '' should become None for int columns
    data = list(row_set)
    nones = [[x.value is None for x in row] for row in data]
    assert_equal(nones[0], [False, True, False, False])
    assert_equal(nones[1], [False, False, False, True])
    assert_equal(nones[2], [False, True, True, True])
def test_null_process(self):
    fh = horror_fobj('null.csv')
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    row_set.register_processor(null_processor(['null']))
    data = list(row_set)
    nones = [[x.value is None for x in row] for row in data]
    assert_equal(nones[0], [False, True, False, False])
    assert_equal(nones[1], [False, False, False, True])
    assert_equal(nones[2], [False, True, False, False])

    types = type_guess(row_set.sample, strict=True)
    expected_types = [
        IntegerType(),
        IntegerType(),
        IntegerType(),
        IntegerType()
    ]
    assert_equal(types, expected_types)
    row_set.register_processor(types_processor(types))

    # after applying the types, '' should become None for int columns
    data = list(row_set)
    nones = [[x.value is None for x in row] for row in data]
    assert_equal(nones[0], [False, True, False, False])
    assert_equal(nones[1], [False, False, False, True])
    assert_equal(nones[2], [False, True, True, True])
def test_apply_null_values(self):
    fh = horror_fobj('null.csv')
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    types = type_guess(row_set.sample, strict=True)
    expected_types = [
        IntegerType(), StringType(), IntegerType(), StringType()
    ]
    assert_equal(types, expected_types)
    row_set.register_processor(types_processor(types))

    data = list(row_set)
    # treat null as non empty text and 0 as non empty integer
    assert [x.empty for x in data[0]] == [False, False, False, False]
    assert [x.empty for x in data[1]] == [False, False, False, False]
    assert [x.empty for x in data[2]] == [False, False, True, True]
    assert [x.empty for x in data[3]] == [False, False, False, False]
    assert [x.empty for x in data[4]] == [False, False, False, True]
    assert [x.empty for x in data[5]] == [False, False, False, True]

    # we expect None for Integers and "" for empty strings in CSV
    assert [x.value for x in data[2]] == [3, "null", None, ""], data[2]
def main(basic_config_file, batch_config_file):
    with open(basic_config_file, "r") as f:
        base_settings = yaml.load(f)

    if batch_config_file:
        # RUN MANY
        # parse csv into a list of settings-dicts
        import messytables
        with open(batch_config_file, "rb") as f:
            row_set = messytables.CSVRowSet("", f)
            offset, headers = messytables.headers_guess(row_set.sample)
            row_set.register_processor(messytables.headers_processor(headers))
            row_set.register_processor(messytables.offset_processor(offset + 1))
            types = messytables.type_guess(row_set.sample, strict=True)
            row_set.register_processor(messytables.types_processor(types))
            settings_list = row_set.dicts()
            name = batch_config_file.replace(".csv", "")
            run_many(settings_list, name, base_settings=base_settings)
    else:
        # RUN ONE
        # parse yaml into a settings-dict
        settings_file = os.path.join(base_settings["out_dir"], "settings.yml")
        with open(settings_file, "w") as f:
            yaml.dump(base_settings, f)
        training_log, exit_status = run_one(**base_settings)
        training_log_file = os.path.join(base_settings["out_dir"],
                                         "training_log.csv")
        training_log.to_csv(training_log_file)
        stats = compute_final_stats(training_log)
        stats["exit_status"] = exit_status
        training_stats_file = os.path.join(base_settings["out_dir"],
                                           "training_stats.yml")
        with open(training_stats_file, "w") as f:
            yaml.dump(stats, f)
def connect(self, host=None, port=None, database=None, username=None,
            password=None, file=None):
    # TODO: mysql, pymssql, csv, sqlite3, pymongo, cx_Oracle
    self.database = database
    conn_string = ''
    if self.engine == 'psycopg2':
        if database:
            conn_string += "dbname='%s' " % database
        if username:
            conn_string += "user='%s' " % username
        if host:
            conn_string += "host='%s' " % host
        if port:
            conn_string += "port='%s' " % port
        if password:
            conn_string += "password='%s' " % password
        self.conn = psycopg2.connect(conn_string)
    elif self.engine == 'pymssql':
        self.conn = pymssql.connect(host, username, password, database,
                                    port=port, as_dict=True,
                                    charset='LATIN1')
    elif self.engine == 'csv':
        # https://messytables.readthedocs.io/en/latest/
        fh = StringIO.StringIO(self.data)
        # dialect = csv.Sniffer().sniff(f.read(1024))
        # f.seek(0)
        # self.conn = csv.DictReader(f, dialect=dialect)
        # fh = open('messy.csv', 'rb')

        # Load a file object:
        table_set = CSVTableSet(fh)
        row_set = table_set.tables[0]
        offset, headers = headers_guess(row_set.sample)
        row_set.register_processor(headers_processor(headers))
        row_set.register_processor(offset_processor(offset + 1))
        types = type_guess(row_set.sample, strict=True)
        row_set.register_processor(types_processor(types))
        self.conn = row_set
    return self.conn
def proc(f, database_name, table_name):
    table_set = messytables.any_tableset(f)
    row_set = table_set.tables[0]

    # guess header names and the offset of the header:
    offset, headers = messytables.headers_guess(row_set.sample)
    row_set.register_processor(messytables.headers_processor(headers))
    row_set.register_processor(messytables.offset_processor(offset + 1))
    types = messytables.type_guess(row_set.sample, types=[
        messytables.types.StringType,
        messytables.types.DateType,
    ], strict=True)

    hive_data_file = tempfile.NamedTemporaryFile(mode='w')

    fields_ddl = ','.join([
        '  {0} {1}\n'.format(
            canonicalize_column_name(colName),
            hive_column_type(colType)
        )
        for colName, colType in zip(headers, types)
    ])
    hive_sql = '''
DROP TABLE IF EXISTS {0};
CREATE TABLE {0} (
{1}
)
STORED AS TEXTFILE
TBLPROPERTIES ("comment"="add_messytable on {3}");

LOAD DATA LOCAL INPATH '{2}' OVERWRITE INTO TABLE {0};
'''.format(table_name, fields_ddl, hive_data_file.name,
           datetime.datetime.now().isoformat())

    hive_cmd_file = tempfile.NamedTemporaryFile(mode='w')
    print(hive_sql, file=hive_cmd_file)
    hive_cmd_file.flush()

    row_set.register_processor(messytables.types_processor(types))

    for row in row_set:
        print('\001'.join(map(str, [c.value for c in row])),
              file=hive_data_file)
    hive_data_file.flush()

    subprocess.call([
        'hive',
        '--database', database_name,
        '-f', hive_cmd_file.name,
    ])
def test_read_type_guess_simple(self):
    fh = horror_fobj('simple.csv')
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    types = type_guess(row_set.sample)
    expected_types = [DateType("%Y-%m-%d"), IntegerType(), StringType()]
    assert_equal(types, expected_types)

    row_set.register_processor(types_processor(types))
    data = list(row_set)
    header_types = [c.type for c in data[0]]
    assert_equal(header_types, [StringType()] * 3)
    row_types = [c.type for c in data[2]]
    assert_equal(expected_types, row_types)
def csvParse(csv_file_path):
    fh = open(csv_file_path, 'rb')
    # Load a file object:
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    # guess header names and the offset of the header:
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    # add one to begin with content, not the header:
    row_set.register_processor(offset_processor(offset + 1))
    # guess column types:
    types = type_guess(row_set.sample, strict=True)
    row_set.register_processor(types_processor(types))
    return row_set, headers, offset, types
def test_read_type_guess_simple(self):
    fh = horror_fobj("simple.csv")
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    types = type_guess(row_set.sample)
    expected_types = [DateType("%Y-%m-%d"), IntegerType(), StringType()]
    assert_equal(types, expected_types)

    row_set.register_processor(types_processor(types))
    data = list(row_set)
    header_types = map(lambda c: c.type, data[0])
    assert_equal(header_types, [StringType()] * 3)
    row_types = map(lambda c: c.type, data[2])
    assert_equal(expected_types, row_types)
def prepare_csv_rows(csv_file):
    row_set = CSVTableSet(csv_file).tables[0]

    offset, headers = headers_guess(row_set.sample)
    headers = [convert_header_to_column_name(header)
               for header in (h for h in headers if h)]

    row_set.register_processor(headers_processor_remove_blank(headers))
    row_set.register_processor(offset_processor(offset + 1))

    DateType.formats = create_date_formats(day_first=False)
    # We are never wanting boolean types, so remove that from the default list
    eligible_types = [StringType, DecimalType, IntegerType, DateType]
    types = type_guess(row_set.sample, types=eligible_types, strict=True)
    row_set.register_processor(types_processor(types))

    return row_set
def csvimport_table(name):
    from messytables import CSVTableSet, type_guess
    from messytables import types_processor, headers_guess
    from messytables import headers_processor, offset_processor
    from spendb.etl.extract import parse_table

    row_set = CSVTableSet(data_fixture(name)).tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    types = type_guess(row_set.sample, strict=True)
    row_set.register_processor(types_processor(types))

    rows = []
    for num_rows, (fields, row, samples) in enumerate(parse_table(row_set)):
        rows.append(row)

    return fields, rows
def parse_table(source):
    # This is a work-around because messytables hangs on boto file
    # handles, so we're doing it via plain old HTTP.
    # We're also passing in an extended window size to give more
    # reliable type detection.
    # Because Python's CSV dialect sniffer isn't the best, this also
    # constrains the field quoting character to a double quote.
    table_set = mt.any_tableset(source.fh(),
                                extension=source.meta.get('extension'),
                                mimetype=source.meta.get('mime_type'),
                                quotechar='"', window=20000)
    tables = list(table_set.tables)
    if not len(tables):
        log.error("No tables were found in the source file.")
        return

    row_set = tables[0]
    headers = [c.value for c in next(row_set.sample)]
    row_set.register_processor(mt.headers_processor(headers))
    row_set.register_processor(mt.offset_processor(1))
    types = mt.type_guess(row_set.sample, strict=True)
    row_set.register_processor(mt.types_processor(types, strict=True))

    fields, i = {}, 0
    row_iter = iter(row_set)

    while True:
        i += 1
        try:
            row = row_iter.next()
            if not len(fields):
                fields = generate_field_spec(row)

            data = convert_row(row, fields, i)
            check_empty = set(data.values())
            if None in check_empty and len(check_empty) == 1:
                continue

            yield None, fields, data
        except StopIteration:
            return
        except Exception, e:
            # log.exception(e)
            yield e, fields, None
def parse_data(input):
    fh = open(input, 'rb')
    try:
        table_set = messytables.any_tableset(fh)
    except messytables.ReadError as e:
        print(e)

    get_row_set = lambda table_set: table_set.tables.pop()
    row_set = get_row_set(table_set)
    offset, headers = messytables.headers_guess(row_set.sample)

    # Some headers might have been converted from strings to floats and such.
    headers = [str(header) for header in headers]

    row_set.register_processor(messytables.headers_processor(headers))
    row_set.register_processor(messytables.offset_processor(offset + 1))
    types = messytables.type_guess(row_set.sample, types=TYPES, strict=True)

    row_set.register_processor(messytables.types_processor(types))

    headers = [header.strip() for header in headers if header.strip()]
    headers_set = set(headers)

    def row_iterator():
        for row in row_set:
            data_row = {}
            for index, cell in enumerate(row):
                column_name = cell.column.strip()
                if column_name not in headers_set:
                    continue
                data_row[column_name] = cell.value
            yield data_row

    result = row_iterator()

    headers_dicts = [
        dict(id=field[0], type=TYPE_MAPPING[str(field[1])])
        for field in zip(headers, types)
    ]

    print('Determined headers and types: {headers}'.format(
        headers=headers_dicts))

    return headers_dicts, result
def resource_row_set(package, resource):
    """ Generate an iterator over all the rows in this resource's
    source data. """
    # This is a work-around because messytables hangs on boto file
    # handles, so we're doing it via plain old HTTP.
    table_set = any_tableset(resource.fh(),
                             extension=resource.meta.get('extension'),
                             mimetype=resource.meta.get('mime_type'))
    tables = list(table_set.tables)
    if not len(tables):
        log.error("No tables were found in the source file.")
        return

    row_set = tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    types = type_guess(row_set.sample, strict=True)
    row_set.register_processor(types_processor(types))
    return row_set
def _guess_csv_datatype(fh):
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    offset, headers = headers_guess(row_set.sample)
    logger.info("(offset, headers) = ({}, {})".format(offset, headers))

    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    types = type_guess(row_set.sample, strict=True)
    row_set.register_processor(types_processor(types))

    counter = 0
    for row in row_set:
        logger.info(row)
        counter += 1
        if counter >= 32:
            break

    d = {h: t for h, t in zip(headers, types)}
    logger.info(d)
    return d
def parse_table(row_set, save_func):
    num_rows = 0
    fields = {}
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    types = type_guess(row_set.sample, strict=True)
    row_set.register_processor(types_processor(types))

    for i, row in enumerate(row_set):
        if not len(fields):
            fields = generate_field_spec(row)

        data = {}
        for cell, field in zip(row, fields):
            value = cell.value
            if isinstance(value, datetime):
                value = value.date()
            if isinstance(value, Decimal):
                # Baby jesus forgive me.
                value = float(value)
            if isinstance(value, basestring) and not len(value.strip()):
                value = None
            data[field['name']] = value
            random_sample(value, field, i)

        check_empty = set(data.values())
        if None in check_empty and len(check_empty) == 1:
            continue

        save_func(data)
        num_rows = i

    fields = {f.get('name'): f for f in fields}
    return num_rows, fields
def determine_messytables_types(file_handle, types=messytables.types.TYPES):
    """
    :param file_handle: file handle opened in binary mode
    :return: (headers, types, row_set)
    """
    # Load a file object:
    table_set = messytables.CSVTableSet(file_handle)
    # If you aren't sure what kind of file it is
    # table_set = messytables.any_tableset(file_handle)

    # A table set is a collection of tables:
    row_set = table_set.tables[0]

    # A row set is an iterator over the table, but it can only
    # be run once. To peek, a sample is provided:
    print(next(row_set.sample))

    # guess header names and the offset of the header:
    offset, headers = messytables.headers_guess(row_set.sample)
    row_set.register_processor(messytables.headers_processor(headers))

    # add one to begin with content, not the header:
    row_set.register_processor(messytables.offset_processor(offset + 1))

    # guess column types:
    types = messytables.type_guess(row_set.sample, types, strict=True)

    # and tell the row set to apply these types to
    # each row when traversing the iterator:
    row_set.register_processor(messytables.types_processor(types))

    # now run some operation on the data:
    return headers, types, row_set
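# A minimal usage sketch for determine_messytables_types() above (not from the
# source): 'example.csv' is a hypothetical file path, opened in binary mode as
# the docstring requires; rows are consumed while the file is still open.
with open('example.csv', 'rb') as fh:
    headers, types, row_set = determine_messytables_types(fh)
    print(headers)
    print(types)
    for row in row_set:
        print([cell.value for cell in row])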
def push_to_datastore(task_id, input, dry_run=False):
    '''Download and parse a resource, pushing its data into CKAN's DataStore.

    An asynchronous job that gets a resource from CKAN, downloads the
    resource's data file and, if the data file has changed since last time,
    parses the data and posts it into CKAN's DataStore.

    :param dry_run: Fetch and parse the data file but don't actually post the
        data to the DataStore, instead return the data headers and rows that
        would have been posted.
    :type dry_run: boolean
    '''
    handler = util.StoringHandler(task_id, input)
    logger = logging.getLogger(task_id)
    logger.addHandler(handler)
    logger.setLevel(logging.DEBUG)

    validate_input(input)

    data = input['metadata']

    ckan_url = data['ckan_url']
    resource_id = data['resource_id']
    api_key = input.get('api_key')

    try:
        resource = get_resource(resource_id, ckan_url, api_key)
    except util.JobError as e:
        # try again in 5 seconds just incase CKAN is slow at adding resource
        time.sleep(5)
        resource = get_resource(resource_id, ckan_url, api_key)

    # check if the resource url_type is a datastore
    if resource.get('url_type') == 'datastore':
        logger.info('Dump files are managed with the Datastore API')
        return

    # check scheme
    url = resource.get('url')
    scheme = urlsplit(url).scheme
    if scheme not in ('http', 'https', 'ftp'):
        raise util.JobError(
            'Only http, https, and ftp resources may be fetched.'
        )

    # fetch the resource data
    logger.info('Fetching from: {0}'.format(url))
    headers = {}
    if resource.get('url_type') == 'upload':
        # If this is an uploaded file to CKAN, authenticate the request,
        # otherwise we won't get file from private resources
        headers['Authorization'] = api_key
    try:
        response = requests.get(
            url,
            headers=headers,
            timeout=DOWNLOAD_TIMEOUT,
            verify=SSL_VERIFY,
            stream=True,  # just gets the headers for now
        )
        response.raise_for_status()

        cl = response.headers.get('content-length')
        try:
            if cl and int(cl) > MAX_CONTENT_LENGTH:
                raise util.JobError(
                    'Resource too large to download: {cl} > max ({max_cl}).'
                    .format(cl=cl, max_cl=MAX_CONTENT_LENGTH))
        except ValueError:
            pass

        tmp = tempfile.TemporaryFile()
        length = 0
        m = hashlib.md5()
        for chunk in response.iter_content(CHUNK_SIZE):
            length += len(chunk)
            if length > MAX_CONTENT_LENGTH:
                raise util.JobError(
                    'Resource too large to process: {cl} > max ({max_cl}).'
                    .format(cl=length, max_cl=MAX_CONTENT_LENGTH))
            tmp.write(chunk)
            m.update(chunk)

        ct = response.headers.get('content-type', '').split(';', 1)[0]
    except requests.HTTPError as e:
        raise HTTPError(
            "DataPusher received a bad HTTP response when trying to download "
            "the data file", status_code=e.response.status_code,
            request_url=url, response=e.response.content)
    except requests.RequestException as e:
        raise HTTPError(
            message=str(e), status_code=None,
            request_url=url, response=None)

    file_hash = m.hexdigest()
    tmp.seek(0)

    if (resource.get('hash') == file_hash
            and not data.get('ignore_hash')):
        logger.info("The file hash hasn't changed: {hash}.".format(
            hash=file_hash))
        return

    resource['hash'] = file_hash

    try:
        table_set = messytables.any_tableset(tmp, mimetype=ct, extension=ct)
    except messytables.ReadError as e:
        # try again with format
        tmp.seek(0)
        try:
            format = resource.get('format')
            table_set = messytables.any_tableset(tmp, mimetype=format,
                                                 extension=format)
        except:
            raise util.JobError(e)

    get_row_set = web.app.config.get('GET_ROW_SET',
                                     lambda table_set: table_set.tables.pop())
    row_set = get_row_set(table_set)
    offset, headers = messytables.headers_guess(row_set.sample)

    existing = datastore_resource_exists(resource_id, api_key, ckan_url)
    existing_info = None
    if existing:
        existing_info = dict((f['id'], f['info'])
                             for f in existing.get('fields', [])
                             if 'info' in f)

    # Some headers might have been converted from strings to floats and such.
    headers = [str(header) for header in headers]

    row_set.register_processor(messytables.headers_processor(headers))
    row_set.register_processor(messytables.offset_processor(offset + 1))
    types = messytables.type_guess(row_set.sample, types=TYPES, strict=True)

    # override with types user requested
    if existing_info:
        types = [{
            'text': messytables.StringType(),
            'numeric': messytables.DecimalType(),
            'timestamp': messytables.DateUtilType(),
        }.get(existing_info.get(h, {}).get('type_override'), t)
            for t, h in zip(types, headers)]

    row_set.register_processor(messytables.types_processor(types))

    headers = [header.strip() for header in headers if header.strip()]
    headers_set = set(headers)

    def row_iterator():
        for row in row_set:
            data_row = {}
            for index, cell in enumerate(row):
                column_name = cell.column.strip()
                if column_name not in headers_set:
                    continue
                if isinstance(cell.value, str):
                    try:
                        data_row[column_name] = cell.value.encode(
                            'latin-1').decode('utf-8')
                    except (UnicodeDecodeError, UnicodeEncodeError):
                        data_row[column_name] = cell.value
                else:
                    data_row[column_name] = cell.value
            yield data_row

    result = row_iterator()

    '''
    Delete existing datastore resource before proceeding. Otherwise
    'datastore_create' will append to the existing datastore. And if the
    fields have significantly changed, it may also fail.
    '''
    if existing:
        logger.info('Deleting "{res_id}" from datastore.'.format(
            res_id=resource_id))
        delete_datastore_resource(resource_id, api_key, ckan_url)

    headers_dicts = [dict(id=field[0], type=TYPE_MAPPING[str(field[1])])
                     for field in zip(headers, types)]

    # Maintain data dictionaries from matching column names
    if existing_info:
        for h in headers_dicts:
            if h['id'] in existing_info:
                h['info'] = existing_info[h['id']]
                # create columns with types user requested
                type_override = existing_info[h['id']].get('type_override')
                if type_override in list(_TYPE_MAPPING.values()):
                    h['type'] = type_override

    logger.info('Determined headers and types: {headers}'.format(
        headers=headers_dicts))

    if dry_run:
        return headers_dicts, result

    count = 0
    for i, chunk in enumerate(chunky(result, 250)):
        records, is_it_the_last_chunk = chunk
        count += len(records)
        logger.info('Saving chunk {number} {is_last}'.format(
            number=i, is_last='(last)' if is_it_the_last_chunk else ''))
        send_resource_to_datastore(resource, headers_dicts, records,
                                   is_it_the_last_chunk, api_key, ckan_url)

    logger.info('Successfully pushed {n} entries to "{res_id}".'.format(
        n=count, res_id=resource_id))

    if data.get('set_url_type', False):
        update_resource(resource, api_key, ckan_url)
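# A hedged invocation sketch for push_to_datastore() above, run as a dry run
# so nothing is written to the DataStore; the task id, CKAN URL, resource id
# and API key below are placeholders, not values from the source.
job_input = {
    'metadata': {
        'ckan_url': 'http://localhost:5000',
        'resource_id': '00000000-0000-0000-0000-000000000000',
    },
    'api_key': 'my-api-key',
}
headers_dicts, rows = push_to_datastore('task-1', job_input, dry_run=True)
print(headers_dicts)
print(list(rows))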
def push_to_datastore(self, context, resource):
    # Get the resource's content hash, which is used to check whether the
    # resource file has changed since last time.
    hash_dict = resource.get('hash')
    if hash_dict:
        original_content_hash = json.loads(hash_dict)['content']
        check_hash = not self.options.force
    else:
        # This resource has no hash yet, it must be a new resource.
        original_content_hash = ''
        check_hash = False

    try:
        result = fetch_resource.download(context, resource,
                                         self.max_content_length,
                                         DATA_FORMATS,
                                         check_modified=check_hash)
    except fetch_resource.ResourceNotModified as e:
        logger.info(u'Skipping unmodified resource: {0}'.format(
            resource['url']))
        return {'success': True, 'resource': resource['id'], 'error': None}
    except Exception as e:
        logger.exception(e)
        return {
            'success': False,
            'resource': resource['id'],
            'error': 'Could not download resource'
        }

    if check_hash and (result['hash'] == original_content_hash):
        logger.info(u'Skipping unmodified resource: {0}'.format(
            resource['url']))
        os.remove(result['saved_file'])
        return {'success': True, 'resource': resource['id'], 'error': None}

    content_type = result['headers'].get('content-type', '')\
        .split(';', 1)[0]  # remove parameters

    f = open(result['saved_file'], 'rb')
    try:
        table_sets = any_tableset(f, mimetype=content_type,
                                  extension=resource['format'].lower())
        # only first sheet in xls for time being
        row_set = table_sets.tables[0]
        offset, headers = headers_guess(row_set.sample)
    except Exception as e:
        logger.exception(e)
        os.remove(result['saved_file'])
        return {
            'success': False,
            'resource': resource['id'],
            'error': 'Error parsing the resource'
        }

    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(datetime_procesor())

    logger.info('Header offset: {0}.'.format(offset))

    guessed_types = type_guess(row_set.sample, [
        messytables.types.StringType,
        messytables.types.IntegerType,
        messytables.types.FloatType,
        messytables.types.DecimalType,
        messytables.types.DateUtilType
    ], strict=True)
    logger.info('Guessed types: {0}'.format(guessed_types))
    row_set.register_processor(types_processor(guessed_types, strict=True))
    row_set.register_processor(stringify_processor())

    guessed_type_names = [TYPE_MAPPING[type(gt)] for gt in guessed_types]

    def send_request(data):
        data_dict = {
            'resource_id': resource['id'],
            'fields': [
                dict(id=name, type=typename)
                for name, typename in zip(headers, guessed_type_names)
            ],
            'records': data,
            'force': True,
        }
        response = toolkit.get_action('datastore_create')(context, data_dict)
        return response

    # Delete any existing data before proceeding. Otherwise
    # 'datastore_create' will append to the existing datastore. And if the
    # fields have significantly changed, it may also fail.
    logger.info('Trying to delete existing datastore for resource {0} '
                '(may not exist).'.format(resource['id']))
    try:
        toolkit.get_action('datastore_delete')(
            context, {'resource_id': resource['id'], 'force': True})
    except toolkit.ObjectNotFound:
        logger.info('Datastore not found for resource {0}.'.format(
            resource['id']))
    except Exception as e:
        logger.exception(e)

    logger.info('Creating: {0}.'.format(resource['id']))

    # generates chunks of data that can be loaded into ckan
    # n is the maximum size of a chunk
    def chunky(iterable, n):
        it = iter(iterable)
        while True:
            chunk = list(itertools.imap(dict, itertools.islice(it, n)))
            if not chunk:
                return
            yield chunk

    count = 0
    try:
        for data in chunky(row_set.dicts(), 100):
            count += len(data)
            send_request(data)
    except Exception as e:
        logger.exception(e)
        os.remove(result['saved_file'])
        return {
            'success': False,
            'resource': resource['id'],
            'error': 'Error pushing data to datastore'
        }

    logger.info("There should be {n} entries in {res_id}.".format(
        n=count, res_id=resource['id']))

    resource.update({
        'webstore_url': 'active',
        'webstore_last_updated': datetime.now().isoformat()
    })

    toolkit.get_action('resource_update')(context, resource)

    os.remove(result['saved_file'])

    return {'success': True, 'resource': resource['id'], 'error': None}
logger.info('Header offset: {0}.'.format(offset))

guessed_types = type_guess(
    row_set.sample,
    [
        messytables.types.StringType,
        messytables.types.IntegerType,
        messytables.types.FloatType,
        messytables.types.DecimalType,
        messytables.types.DateUtilType
    ],
    strict=True
)
logger.info('Guessed types: {0}'.format(guessed_types))
row_set.register_processor(types_processor(guessed_types, strict=True))
row_set.register_processor(stringify_processor())

ckan_url = context['site_url'].rstrip('/')
datastore_create_request_url = '%s/api/action/datastore_create' % (ckan_url)

guessed_type_names = [TYPE_MAPPING[type(gt)] for gt in guessed_types]

def send_request(data):
    request = {'resource_id': resource['id'],
               'fields': [dict(id=name, type=typename)
                          for name, typename in zip(headers, guessed_type_names)],
               'records': data}
    response = requests.post(datastore_create_request_url,
                             data=json.dumps(request),
                             headers={'Content-Type': 'application/json',
        # print(row)
        x += 1

    print("Rows: ", x - 1)  # prints the number of rows (minus the header)
    print("Columns: ", len(row))  # prints the number of columns

    cN_array = np.array(cN)
    print("Header: ", cN_array)

    ft = open(file, "rb")
    tableTypes = CSVTableSet(ft)
    columnTypes = tableTypes.tables[0]
    offset, headers = headers_guess(columnTypes.sample)
    columnTypes.register_processor(headers_processor(headers))
    columnTypes.register_processor(offset_processor(offset + 1))
    types = type_guess(columnTypes.sample, strict=True)
    columnTypes.register_processor(types_processor(types))

    h_length = len(headers)
    h_var = 0
    dt_array = []
    for each_type in range(0, h_length):
        types[h_var] = str(types[h_var])
        if types[h_var] == 'Decimal':
            types[h_var] = "Float"
        randVar = f"{headers[h_var]} : {types[h_var]}"
        dt_array.append(randVar)
        h_var += 1
    print("Datatypes: ", dt_array)

# except Exception:
#     print("File Unavailable")
def _datastorer_upload(context, resource, logger):
    excel_types = ['xls', 'application/ms-excel', 'application/xls',
                   'application/vnd.ms-excel']
    tsv_types = ['tsv', 'text/tsv', 'text/tab-separated-values']

    result = download(context, resource, data_formats=DATA_FORMATS)

    content_type = result['headers'].get('content-type', '')\
        .split(';', 1)[0]  # remove parameters

    f = open(result['saved_file'], 'rb')

    if content_type in excel_types or resource['format'] in excel_types:
        table_sets = XLSTableSet.from_fileobj(f)
    else:
        is_tsv = (content_type in tsv_types or
                  resource['format'] in tsv_types)
        delimiter = '\t' if is_tsv else ','
        table_sets = CSVTableSet.from_fileobj(f, delimiter=delimiter)

    ##only first sheet in xls for time being
    row_set = table_sets.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(datetime_procesor())

    logger.info('Header offset: {0}.'.format(offset))

    guessed_types = type_guess(
        row_set.sample,
        [
            messytables.types.StringType,
            messytables.types.IntegerType,
            messytables.types.FloatType,
            messytables.types.DecimalType,
            messytables.types.DateUtilType
        ],
        strict=True
    )
    logger.info('Guessed types: {0}'.format(guessed_types))
    row_set.register_processor(types_processor(guessed_types, strict=True))
    row_set.register_processor(stringify_processor())

    ckan_url = context['site_url'].rstrip('/')
    datastore_create_request_url = '%s/api/action/datastore_create' % (ckan_url)

    guessed_type_names = [TYPE_MAPPING[type(gt)] for gt in guessed_types]

    def send_request(data):
        request = {'resource_id': resource['id'],
                   'fields': [dict(id=name, type=typename)
                              for name, typename in zip(headers, guessed_type_names)],
                   'records': data}
        response = requests.post(datastore_create_request_url,
                                 data=json.dumps(request),
                                 headers={'Content-Type': 'application/json',
                                          'Authorization': context['apikey']},
                                 )
        check_response_and_retry(response, datastore_create_request_url,
                                 logger)

    logger.info('Creating: {0}.'.format(resource['id']))

    # generates chunks of data that can be loaded into ckan
    # n is the maximum size of a chunk
    def chunky(iterable, n):
        it = iter(iterable)
        while True:
            chunk = list(itertools.imap(dict, itertools.islice(it, n)))
            if not chunk:
                return
            yield chunk

    count = 0
    for data in chunky(row_set.dicts(), 100):
        count += len(data)
        send_request(data)

    logger.info("There should be {n} entries in {res_id}.".format(
        n=count, res_id=resource['id']))

    ckan_request_url = ckan_url + '/api/action/resource_update'

    ckan_resource_data = {
        'id': resource["id"],
        'webstore_url': 'active',
        'webstore_last_updated': datetime.datetime.now().isoformat(),
        'url': resource['url']
    }

    response = requests.post(
        ckan_request_url,
        data=json.dumps(ckan_resource_data),
        headers={'Content-Type': 'application/json',
                 'Authorization': context['apikey']})

    if response.status_code not in (201, 200):
        raise DatastorerException('Ckan bad response code (%s). Response was %s'
                                  % (response.status_code, response.content))
def push_to_datastore(self, context, resource):
    try:
        result = download(context, resource, self.max_content_length,
                          DATA_FORMATS)
    except Exception as e:
        logger.exception(e)
        return

    content_type = result['headers'].get('content-type', '')\
        .split(';', 1)[0]  # remove parameters

    f = open(result['saved_file'], 'rb')
    table_sets = AnyTableSet.from_fileobj(
        f, mimetype=content_type, extension=resource['format'].lower())

    ##only first sheet in xls for time being
    row_set = table_sets.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(datetime_procesor())

    logger.info('Header offset: {0}.'.format(offset))

    guessed_types = type_guess(
        row_set.sample,
        [
            messytables.types.StringType,
            messytables.types.IntegerType,
            messytables.types.FloatType,
            messytables.types.DecimalType,
            messytables.types.DateUtilType
        ],
        strict=True
    )
    logger.info('Guessed types: {0}'.format(guessed_types))
    row_set.register_processor(types_processor(guessed_types, strict=True))
    row_set.register_processor(stringify_processor())

    guessed_type_names = [TYPE_MAPPING[type(gt)] for gt in guessed_types]

    def send_request(data):
        data_dict = {
            'resource_id': resource['id'],
            'fields': [dict(id=name, type=typename)
                       for name, typename in zip(headers, guessed_type_names)],
            'records': data
        }
        response = logic.get_action('datastore_create')(context, data_dict)
        return response

    # Delete any existing data before proceeding. Otherwise
    # 'datastore_create' will append to the existing datastore. And if the
    # fields have significantly changed, it may also fail.
    logger.info('Deleting existing datastore (it may not exist): '
                '{0}.'.format(resource['id']))
    try:
        logic.get_action('datastore_delete')(
            context, {'resource_id': resource['id']})
    except Exception as e:
        logger.exception(e)

    logger.info('Creating: {0}.'.format(resource['id']))

    # generates chunks of data that can be loaded into ckan
    # n is the maximum size of a chunk
    def chunky(iterable, n):
        it = iter(iterable)
        while True:
            chunk = list(itertools.imap(dict, itertools.islice(it, n)))
            if not chunk:
                return
            yield chunk

    count = 0
    for data in chunky(row_set.dicts(), 100):
        count += len(data)
        send_request(data)

    logger.info("There should be {n} entries in {res_id}.".format(
        n=count, res_id=resource['id']))

    resource.update({
        'webstore_url': 'active',
        'webstore_last_updated': datetime.datetime.now().isoformat()
    })

    logic.get_action('resource_update')(context, resource)
# A table set is a collection of tables:
row_set = table_set.tables[0]

# guess header names and the offset of the header:
offset, headers = headers_guess(row_set.sample)
row_set.register_processor(headers_processor(headers))

# add one to begin with content, not the header:
row_set.register_processor(offset_processor(offset + 1))

# guess column types:
types = type_guess(row_set.sample, strict=True)

# and tell the row set to apply these types to
# each row when traversing the iterator:
row_set.register_processor(types_processor(types))

# ------- doing rest of operations using pandas ------- #
dataFrame = pd.read_csv(datafile)
ListColumns = list(dataFrame.columns)
rw, col = dataFrame.shape  # number of rows, number of columns

ListOfFunctionalDependecies = {}
ListOfCombinationsOfColumns = GetAllCombinations(ListColumns)
for_primary_key_and_file_breaking = [[]] * (len(ListOfCombinationsOfColumns))
candidate_keys = []
one_to_one_relation = []
many_to_one_relation = []
super_keys = []
StoringAllRelationsInADictionaryOfTuples = {}

for x in xrange(0, len(ListColumns)):
def webstorer_upload(context, data):
    context = json.loads(context)
    resource = json.loads(data)

    excel_types = ['xls', 'application/ms-excel', 'application/xls']

    result = download(context, resource, data_formats=DATA_FORMATS)
    content_type = result['headers'].get('content-type', '')

    f = open(result['saved_file'], 'rb')

    if content_type in excel_types or resource['format'] in excel_types:
        table_sets = XLSTableSet.from_fileobj(f)
    else:
        table_sets = CSVTableSet.from_fileobj(f)

    ##only first sheet in xls for time being
    row_set = table_sets.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(datetime_procesor())

    types = guess_types(list(row_set.dicts(sample=True)))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(types_processor(types))

    rows = []
    for row in row_set.dicts():
        rows.append(row)

    webstore_url = context.get('webstore_url').rstrip('/')
    webstore_request_url = '%s/%s/%s' % (webstore_url,
                                         context['username'],
                                         resource['id'])

    # check if resource is already there.
    webstore_response = requests.get(webstore_request_url + '.json')
    check_response_and_retry(webstore_response, webstore_request_url + '.json')

    # should be an empty list as no tables should be there.
    if json.loads(webstore_response.content):
        raise WebstorerError('Webstore already has this resource')

    response = requests.post(webstore_request_url + '/data',
                             data=json.dumps(rows),
                             headers={'Content-Type': 'application/json',
                                      'Authorization': context['apikey']},
                             )
    check_response_and_retry(response, webstore_request_url + '.json')
    if response.status_code != 201:
        raise WebstorerError('Webstore bad response code (%s). Response was %s'
                             % (response.status_code, response.content))

    ckan_url = context['site_url'].rstrip('/')
    ckan_request_url = ckan_url + '/api/action/resource_update'

    ckan_resource_data = {
        'id': resource["id"],
        'webstore_url': webstore_request_url + '/data',
        'webstore_last_updated': datetime.datetime.now().isoformat()
    }

    response = requests.post(
        ckan_request_url,
        data=json.dumps(ckan_resource_data),
        headers={'Content-Type': 'application/json',
                 'Authorization': context['apikey']},
    )

    if response.status_code not in (201, 200):
        raise WebstorerError('Ckan bad response code (%s). Response was %s'
                             % (response.status_code, response.content))
def parse(stream, guess_types=True, **kwargs):
    '''Parse CSV file and return row iterator plus metadata (fields etc).

    Additional CSV arguments as per
    http://docs.python.org/2/library/csv.html#csv-fmt-params

    :param delimiter:
    :param quotechar:
    :param window: the size of the sample used for analysis

    There is also support for:

    :param encoding: file encoding (will be guessed with chardet if not
        provided)

    You can process csv as well as tsv files using this function. For tsv
    just pass::

        delimiter='\t'
    '''
    metadata = dict(**kwargs)
    delimiter = metadata.get('delimiter', None)
    quotechar = metadata.get('quotechar', None)
    window = metadata.get('window', None)
    encoding = metadata.get('encoding', None)

    table_set = CSVTableSet.from_fileobj(stream, delimiter=delimiter,
                                         quotechar=quotechar,
                                         encoding=encoding, window=window)
    row_set = table_set.tables.pop()
    offset, headers = headers_guess(row_set.sample)

    fields = []
    dup_columns = {}
    noname_count = 1
    if guess_types:
        guessable_types = [StringType, IntegerType, FloatType, DecimalType,
                           DateUtilType]
        row_types = type_guess(row_set.sample, guessable_types)

    for index, field in enumerate(headers):
        field_dict = {}
        if "" == field:
            field = '_'.join(['column', unicode(noname_count)])
            headers[index] = field
            noname_count += 1
        if headers.count(field) == 1:
            field_dict['id'] = field
        else:
            dup_columns[field] = dup_columns.get(field, 0) + 1
            field_dict['id'] = u'_'.join([field, unicode(dup_columns[field])])
        if guess_types:
            if isinstance(row_types[index], DateUtilType):
                field_dict['type'] = 'DateTime'
            else:
                field_dict['type'] = str(row_types[index])
        fields.append(field_dict)

    row_set.register_processor(headers_processor([x['id'] for x in fields]))
    row_set.register_processor(offset_processor(offset + 1))
    if guess_types:
        row_set.register_processor(types_processor(row_types))

    def row_iterator():
        for row in row_set:
            data_row = {}
            for index, cell in enumerate(row):
                data_row[cell.column] = cell.value
            yield data_row

    return row_iterator(), {'fields': fields}
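# A hedged usage sketch for parse() above (not from the source): 'example.tsv'
# is a hypothetical tab-separated file; delimiter and guess_types are the
# parameters described in the docstring, and rows are consumed while the
# stream is still open because the iterator is lazy.
with open('example.tsv', 'rb') as stream:
    rows, metadata = parse(stream, guess_types=True, delimiter='\t')
    print(metadata['fields'])
    for row in rows:
        print(row)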
def _datastorer_upload(context, resource, logger):
    result = download(context, resource, data_formats=DATA_FORMATS)

    content_type = result['headers'].get('content-type', '')\
        .split(';', 1)[0]  # remove parameters

    f = open(result['saved_file'], 'rb')

    table_sets = any_tableset(f, mimetype=content_type,
                              extension=resource['format'].lower())

    ##only first sheet in xls for time being
    row_set = table_sets.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(datetime_procesor())

    logger.info('Header offset: {0}.'.format(offset))

    guessed_types = type_guess(row_set.sample, [
        messytables.types.StringType,
        messytables.types.IntegerType,
        messytables.types.FloatType,
        messytables.types.DecimalType,
        messytables.types.DateUtilType
    ], strict=True)
    logger.info('Guessed types: {0}'.format(guessed_types))
    row_set.register_processor(types_processor(guessed_types, strict=True))
    row_set.register_processor(stringify_processor())

    ckan_url = context['site_url'].rstrip('/')
    datastore_create_request_url = '%s/api/action/datastore_create' % (
        ckan_url)

    guessed_type_names = [TYPE_MAPPING[type(gt)] for gt in guessed_types]

    def send_request(data):
        request = {
            'resource_id': resource['id'],
            'fields': [
                dict(id=name, type=typename)
                for name, typename in zip(headers, guessed_type_names)
            ],
            'force': True,
            'records': data
        }
        response = requests.post(
            datastore_create_request_url,
            data=json.dumps(request),
            headers={
                'Content-Type': 'application/json',
                'Authorization': context['apikey']
            },
        )
        check_response_and_retry(response, datastore_create_request_url,
                                 logger)

    # Delete any existing data before proceeding. Otherwise 'datastore_create'
    # will append to the existing datastore. And if the fields have
    # significantly changed, it may also fail.
    try:
        logger.info(
            'Deleting existing datastore (it may not exist): {0}.'.format(
                resource['id']))
        response = requests.post('%s/api/action/datastore_delete' % (ckan_url),
                                 data=json.dumps({
                                     'resource_id': resource['id'],
                                     'force': True
                                 }),
                                 headers={
                                     'Content-Type': 'application/json',
                                     'Authorization': context['apikey']
                                 })
        if not response.status_code or response.status_code not in (200, 404):
            # skips 200 (OK) or 404 (datastore does not exist, no need to
            # delete it)
            logger.error('Deleting existing datastore failed: {0}'.format(
                get_response_error(response)))
            raise DatastorerException("Deleting existing datastore failed.")
    except requests.exceptions.RequestException as e:
        logger.error('Deleting existing datastore failed: {0}'.format(str(e)))
        raise DatastorerException("Deleting existing datastore failed.")

    logger.info('Creating: {0}.'.format(resource['id']))

    # generates chunks of data that can be loaded into ckan
    # n is the maximum size of a chunk
    def chunky(iterable, n):
        it = iter(iterable)
        while True:
            chunk = list(itertools.imap(dict, itertools.islice(it, n)))
            if not chunk:
                return
            yield chunk

    count = 0
    for data in chunky(row_set.dicts(), 100):
        count += len(data)
        send_request(data)

    logger.info("There should be {n} entries in {res_id}.".format(
        n=count, res_id=resource['id']))

    ckan_request_url = ckan_url + '/api/action/resource_update'

    resource.update({
        'webstore_url': 'active',
        'webstore_last_updated': datetime.datetime.now().isoformat()
    })

    response = requests.post(ckan_request_url,
                             data=json.dumps(resource),
                             headers={
                                 'Content-Type': 'application/json',
                                 'Authorization': context['apikey']
                             })

    if response.status_code not in (201, 200):
        raise DatastorerException(
            'Ckan bad response code (%s). Response was %s' %
            (response.status_code, response.content))
# A table set is a collection of tables:
row_set = table_set.tables[0]

# guess header names and the offset of the header:
offset, headers = headers_guess(row_set.sample)
row_set.register_processor(headers_processor(headers))

# add one to begin with content, not the header:
row_set.register_processor(offset_processor(offset + 1))

# guess column types:
types = type_guess(row_set.sample, strict=True)

# and tell the row set to apply these types to
# each row when traversing the iterator:
row_set.register_processor(types_processor(types))


def update_datetypes(dataFrame, ListColumns, rw):
    for x in xrange(0, len(ListColumns)):
        datet = False
        tempList = dataFrame[ListColumns[x]].tolist()
        for i in xrange(1, rw):
            if not type(tempList[i]) == str:
                break
            if getType(tempList[i]) == datetime:
                datet = True
                break
        if datet:
            types[x] = datetime

# creating dataframe
def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
    '''Loads an Excel file (or other tabular data recognized by messytables)
    into Datastore and creates indexes.

    Largely copied from datapusher - see below. Is slower than load_csv.
    '''
    # use messytables to determine the header row
    logger.info('Determining column names and types')
    ct = mimetype
    format = os.path.splitext(table_filepath)[1]  # filename extension
    with open(table_filepath, 'rb') as tmp:

        #
        # Copied from datapusher/jobs.py:push_to_datastore
        #

        try:
            table_set = messytables.any_tableset(tmp, mimetype=ct,
                                                 extension=ct)
        except messytables.ReadError as e:
            # try again with format
            tmp.seek(0)
            try:
                table_set = messytables.any_tableset(tmp, mimetype=format,
                                                     extension=format)
            except Exception as e:
                raise LoaderError(e)

        if not table_set.tables:
            raise LoaderError('Could not parse file as tabular data')

        row_set = table_set.tables.pop()
        offset, headers = messytables.headers_guess(row_set.sample)

        existing = datastore_resource_exists(resource_id)
        existing_info = None
        if existing:
            existing_info = dict((f['id'], f['info'])
                                 for f in existing.get('fields', [])
                                 if 'info' in f)

        # Some headers might have been converted from strings to floats and
        # such.
        headers = encode_headers(headers)

        row_set.register_processor(messytables.headers_processor(headers))
        row_set.register_processor(messytables.offset_processor(offset + 1))
        TYPES, TYPE_MAPPING = get_types()
        types = messytables.type_guess(row_set.sample, types=TYPES,
                                       strict=True)

        # override with types user requested
        if existing_info:
            types = [{
                'text': messytables.StringType(),
                'numeric': messytables.DecimalType(),
                'timestamp': messytables.DateUtilType(),
            }.get(existing_info.get(h, {}).get('type_override'), t)
                for t, h in zip(types, headers)]

        row_set.register_processor(messytables.types_processor(types))

        headers = [
            header.strip()[:MAX_COLUMN_LENGTH]
            for header in headers
            if header.strip()
        ]
        headers_set = set(headers)

        def row_iterator():
            for row in row_set:
                data_row = {}
                for index, cell in enumerate(row):
                    column_name = cell.column.strip()
                    if column_name not in headers_set:
                        continue
                    data_row[column_name] = cell.value
                yield data_row

        result = row_iterator()

        '''
        Delete existing datastore resource before proceeding. Otherwise
        'datastore_create' will append to the existing datastore. And if
        the fields have significantly changed, it may also fail.
        '''
        if existing:
            logger.info('Deleting "{res_id}" from datastore.'.format(
                res_id=resource_id))
            delete_datastore_resource(resource_id)

        headers_dicts = [
            dict(id=field[0], type=TYPE_MAPPING[str(field[1])])
            for field in zip(headers, types)
        ]

        # Maintain data dictionaries from matching column names
        if existing_info:
            for h in headers_dicts:
                if h['id'] in existing_info:
                    h['info'] = existing_info[h['id']]
                    # create columns with types user requested
                    type_override = existing_info[h['id']].get('type_override')
                    if type_override in _TYPE_MAPPING.values():
                        h['type'] = type_override

        logger.info('Determined headers and types: {headers}'.format(
            headers=headers_dicts))

        ### Commented - this is only for tests
        # if dry_run:
        #     return headers_dicts, result

        logger.info('Copying to database...')
        count = 0
        for i, records in enumerate(chunky(result, 250)):
            count += len(records)
            logger.info('Saving chunk {number}'.format(number=i))
            send_resource_to_datastore(resource_id, headers_dicts, records)
        logger.info('...copying done')

        if count:
            logger.info(
                'Successfully pushed {n} entries to "{res_id}".'.format(
                    n=count, res_id=resource_id))
        else:
            # no datastore table is created
            raise LoaderError('No entries found - nothing to load')
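# A hedged sketch of how load_table() above might be invoked; the file path
# and resource id are placeholders, a standard library logger is supplied
# because the function expects one, and the mimetype is guessed from the
# filename rather than taken from CKAN metadata.
import logging
import mimetypes

def _reload_into_datastore(path, resource_id):
    mimetype = mimetypes.guess_type(path)[0] or 'text/csv'
    load_table(path, resource_id, mimetype=mimetype,
               logger=logging.getLogger(__name__))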
def parse(stream, guess_types=True, **kwargs):
    '''Parse CSV file and return row iterator plus metadata (fields etc).

    Additional CSV arguments as per
    http://docs.python.org/2/library/csv.html#csv-fmt-params

    :param delimiter:
    :param quotechar:
    :param window: the size of the sample used for analysis

    There is also support for:

    :param encoding: file encoding (will be guessed with chardet if not
        provided)

    You can process csv as well as tsv files using this function. For tsv
    just pass::

        delimiter='\t'
    '''
    metadata = dict(**kwargs)
    delimiter = metadata.get('delimiter', None)
    quotechar = metadata.get('quotechar', None)
    window = metadata.get('window', None)
    encoding = metadata.get('encoding', None)
    table_set = CSVTableSet(stream, delimiter=delimiter, quotechar=quotechar,
                            encoding=encoding, window=window)
    strict_type_guess = metadata.get('strict_type_guess', False)

    row_set = table_set.tables.pop()
    offset, headers = headers_guess(row_set.sample)

    fields = []
    dup_columns = {}
    noname_count = 1
    if guess_types:
        guessable_types = [StringType, IntegerType, FloatType, DecimalType,
                           DateUtilType]
        sample = row_set.sample
        for _ in range(offset + 1):
            sample.next()
        row_types = type_guess(sample, guessable_types,
                               strict=strict_type_guess)

    for index, field in enumerate(headers):
        field_dict = {}
        if "" == field:
            field = '_'.join(['column', unicode(noname_count)])
            headers[index] = field
            noname_count += 1
        if headers.count(field) == 1:
            field_dict['id'] = field
        else:
            dup_columns[field] = dup_columns.get(field, 0) + 1
            field_dict['id'] = u'_'.join([field, unicode(dup_columns[field])])
        if guess_types:
            if isinstance(row_types[index], DateUtilType):
                field_dict['type'] = 'DateTime'
            else:
                field_dict['type'] = str(row_types[index])
        fields.append(field_dict)

    row_set.register_processor(headers_processor([x['id'] for x in fields]))
    row_set.register_processor(offset_processor(offset + 1))
    if guess_types:
        row_set.register_processor(
            types_processor(row_types, strict=strict_type_guess))

    def row_iterator():
        for row in row_set:
            data_row = {}
            for index, cell in enumerate(row):
                data_row[cell.column] = cell.value
            yield data_row

    return row_iterator(), {'fields': fields}
headers = [unicode(header) for header in headers]

row_set.register_processor(messytables.headers_processor(headers))
row_set.register_processor(messytables.offset_processor(offset + 1))
types = messytables.type_guess(row_set.sample, types=TYPES, strict=True)

# override with types user requested
if existing_info:
    types = [{
        'text': messytables.StringType(),
        'numeric': messytables.DecimalType(),
        'timestamp': messytables.DateUtilType(),
    }.get(existing_info.get(h, {}).get('type_override'), t)
        for t, h in zip(types, headers)]

row_set.register_processor(messytables.types_processor(types))

headers = [header.strip() for header in headers if header.strip()]
headers_set = set(headers)


def row_iterator():
    for row in row_set:
        data_row = {}
        for index, cell in enumerate(row):
            column_name = cell.column.strip()
            if column_name not in headers_set:
                continue
            data_row[column_name] = cell.value
        yield data_row


result = row_iterator()
def _datastorer_upload(context, resource):
    excel_types = ['xls', 'application/ms-excel', 'application/xls',
                   'application/vnd.ms-excel']

    result = download(context, resource, data_formats=DATA_FORMATS)
    content_type = result['headers'].get('content-type', '')
    f = open(result['saved_file'], 'rb')

    if content_type in excel_types or resource['format'] in excel_types:
        table_sets = XLSTableSet.from_fileobj(f)
    else:
        table_sets = CSVTableSet.from_fileobj(f)

    ##only first sheet in xls for time being
    row_set = table_sets.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(datetime_procesor())

    types = guess_types(list(row_set.dicts(sample=True)))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(types_processor(types))

    ckan_url = context['site_url'].rstrip('/')
    webstore_request_url = '%s/api/data/%s/' % (ckan_url, resource['id'])

    def send_request(data):
        return requests.post(webstore_request_url + '_bulk',
                             data="%s%s" % ("\n".join(data), "\n"),
                             headers={'Content-Type': 'application/json',
                                      'Authorization': context['apikey']})

    data = []
    for count, dict_ in enumerate(row_set.dicts()):
        data.append(json.dumps({"index": {"_id": count + 1}}))
        data.append(json.dumps(dict_))
        if (count % 100) == 0:
            response = send_request(data)
            check_response_and_retry(response,
                                     webstore_request_url + '_mapping')
            data[:] = []

    if data:
        response = send_request(data)
        check_response_and_retry(response, webstore_request_url + '_mapping')

    ckan_request_url = ckan_url + '/api/action/resource_update'

    ckan_resource_data = {
        'id': resource["id"],
        'webstore_url': webstore_request_url,
        'webstore_last_updated': datetime.datetime.now().isoformat()
    }

    response = requests.post(
        ckan_request_url,
        data=json.dumps(ckan_resource_data),
        headers={'Content-Type': 'application/json',
                 'Authorization': context['apikey']})

    if response.status_code not in (201, 200):
        raise WebstorerError('Ckan bad response code (%s). Response was %s' %
                             (response.status_code, response.content))

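# Illustrative only, not from the original code: the "_bulk" payload built
# above is newline-delimited JSON alternating an index action with a row
# document. The rows here are made up to show the serialized shape.
import json

example_rows = [{'name': 'alice', 'age': 34}, {'name': 'bob', 'age': 29}]
example_lines = []
for i, example_row in enumerate(example_rows):
    example_lines.append(json.dumps({"index": {"_id": i + 1}}))
    example_lines.append(json.dumps(example_row))
example_payload = "%s%s" % ("\n".join(example_lines), "\n")
# example_payload now holds four JSON lines terminated by a newline,
# matching what send_request() posts to the '_bulk' endpoint.
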
        table_set = messytables.any_tableset(f, mimetype=ct, extension=ct)
    except messytables.ReadError as e:
        ## try again with format
        f.seek(0)
        try:
            format = resource.get('format')
            table_set = messytables.any_tableset(f, mimetype=format,
                                                 extension=format)
        except Exception:
            raise util.JobError(e)

    row_set = table_set.tables.pop()
    offset, headers = messytables.headers_guess(row_set.sample)
    row_set.register_processor(messytables.headers_processor(headers))
    row_set.register_processor(messytables.offset_processor(offset + 1))
    types = messytables.type_guess(row_set.sample, types=TYPES, strict=True)
    row_set.register_processor(messytables.types_processor(types))

    headers = [header.strip() for header in headers if header.strip()]
    headers_set = set(headers)

    def row_iterator():
        for row in row_set:
            data_row = {}
            for index, cell in enumerate(row):
                column_name = cell.column.strip()
                if column_name not in headers_set:
                    continue
                data_row[column_name] = cell.value
            yield data_row

    result = row_iterator()

def _datastorer_upload(context, resource, logger):

    result = download(context, resource, data_formats=DATA_FORMATS)
    logger.info('Downloaded resource %r' % (resource))

    content_type = result['headers'].get('content-type', '')\
        .split(';', 1)[0]  # remove parameters
    extension = resource['format'].lower()

    fp = open(result['saved_file'], 'rb')
    if zipfile.is_zipfile(result['saved_file']):
        fp, zf = open_zipped_tableset(fp, extension=extension)
        logger.info('Opened entry %s from ZIP archive %s', zf,
                    result['saved_file'])
    else:
        logger.info('Opened file %s' % (result['saved_file']))

    table_sets = any_tableset(fp, extension=extension)
    if 'sample_size' in context:
        table_sets.window = max(1000, int(context['sample_size']))
        logger.info('Using a sample window of %d', table_sets.window)

    ##only first sheet in xls for time being
    row_set = table_sets.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(datetime_procesor())

    logger.info('Header offset: {0}.'.format(offset))

    guessed_types = type_guess(
        row_set.sample,
        [
            messytables.types.StringType,
            messytables.types.IntegerType,
            messytables.types.FloatType,
            messytables.types.DecimalType,
            messytables.types.DateUtilType
        ],
        strict=True
    )
    logger.info('Guessed types: {0}'.format(guessed_types))
    row_set.register_processor(types_processor(guessed_types, strict=True))
    row_set.register_processor(stringify_processor())

    ckan_url = context['site_url'].rstrip('/')

    datastore_create_request_url = '%s/api/action/datastore_create' % (ckan_url)

    guessed_type_names = [TYPE_MAPPING[type(gt)] for gt in guessed_types]

    def send_request(data):
        request = {'resource_id': resource['id'],
                   'fields': [dict(id=name, type=typename)
                              for name, typename
                              in zip(headers, guessed_type_names)],
                   'force': True,
                   'records': data}
        response = requests.post(datastore_create_request_url,
                                 data=json.dumps(request),
                                 headers={'Content-Type': 'application/json',
                                          'Authorization': context['apikey']})
        check_response_and_retry(response, datastore_create_request_url,
                                 logger)

    # Delete any existing data before proceeding. Otherwise 'datastore_create'
    # will append to the existing datastore. And if the fields have
    # significantly changed, it may also fail.
    try:
        logger.info('Deleting existing datastore (it may not exist): '
                    '{0}.'.format(resource['id']))
        response = requests.post(
            '%s/api/action/datastore_delete' % (ckan_url),
            data=json.dumps({'resource_id': resource['id'], 'force': True}),
            headers={'Content-Type': 'application/json',
                     'Authorization': context['apikey']})
        if not response.status_code or response.status_code not in (200, 404):
            # skips 200 (OK) or 404 (datastore does not exist, no need to
            # delete it)
            logger.error('Deleting existing datastore failed: {0}'.format(
                get_response_error(response)))
            raise DatastorerException("Deleting existing datastore failed.")
    except requests.exceptions.RequestException as e:
        logger.error('Deleting existing datastore failed: {0}'.format(str(e)))
        raise DatastorerException("Deleting existing datastore failed.")

    logger.info('Creating: {0}.'.format(resource['id']))

    # generates chunks of data that can be loaded into ckan
    # n is the maximum size of a chunk
    def chunky(iterable, n):
        it = iter(iterable)
        while True:
            chunk = list(
                itertools.imap(
                    dict, itertools.islice(it, n)))
            if not chunk:
                return
            yield chunk

    count = 0
    for data in chunky(row_set.dicts(), 100):
        count += len(data)
        send_request(data)

    logger.info("There should be {n} entries in {res_id}.".format(
        n=count, res_id=resource['id']))

    ckan_request_url = ckan_url + '/api/action/resource_update'

    resource.update({
        'webstore_url': 'active',
        'webstore_last_updated': datetime.datetime.now().isoformat()
    })

    response = requests.post(
        ckan_request_url,
        data=json.dumps(resource),
        headers={'Content-Type': 'application/json',
                 'Authorization': context['apikey']})

    if response.status_code not in (201, 200):
        raise DatastorerException('Ckan bad response code (%s). Response was '
                                  '%s' % (response.status_code,
                                          response.content))

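# Illustrative only, not from the original code: a made-up context/resource
# pair showing the keys that _datastorer_upload() above reads (site_url,
# apikey, optional sample_size; resource id and format). The values are
# placeholders, not real endpoints or credentials.
import logging

example_context = {
    'site_url': 'http://ckan.example.org',
    'apikey': 'placeholder-api-key',
    'sample_size': 2000,
}
example_resource = {
    'id': 'placeholder-resource-id',
    'format': 'csv',
    'url': 'http://example.org/data.csv',
}
# _datastorer_upload(example_context, example_resource,
#                    logging.getLogger(__name__))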