def __init__(self, csv_file):
    self.fname_ = csv_file
    self.table_ = Table(self.fname_)
def __init__(self, meta_field_path, filepath):
    self.meta_field_path = meta_field_path
    self.filepath = filepath
    self.table = Table(self.filepath)
    self.meta_field = pd.read_csv(self.meta_field_path)
def requery(self):
    self._table = Table(self._filePath)
def test_iter():
    table = Table(DATA_MIN, schema=SCHEMA_MIN)
    expect = [['one', 1], ['two', 2]]
    actual = list(table.iter())
    assert actual == expect
from tableschema import Table

# Raw strings keep the Windows path backslashes from being read as escapes
fileCSV = r'D:\dct\enem-microdados\DADOS_ENEM_2009.csv'
fileJSON = r'D:\dct\enem-microdados\DADOS_ENEM_2009-schema.json'

# Create table and infer its schema from the first 100000 rows
table = Table(fileCSV)
table.infer(limit=100000)
# table.schema.descriptor
table.schema.save(fileJSON)
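A minimal follow-up sketch, assuming the same fileCSV and fileJSON paths as the snippet above: it reloads the saved descriptor and reads the CSV through it, so rows come back cast to the inferred types.

from tableschema import Table

# fileCSV and fileJSON are assumptions carried over from the previous snippet;
# Table accepts a schema given as a path to a saved JSON descriptor.
table = Table(fileCSV, schema=fileJSON)
for row in table.iter(keyed=True):
    # each row is a dict keyed by header, values cast per the schema
    print(row)
    break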
def test_schema_instance(apply_defaults):
    schema_instance = Schema(SCHEMA_MIN)
    actual = Table(DATA_MIN, schema=schema_instance).schema.descriptor
    expect = apply_defaults(SCHEMA_MIN)
    assert actual == expect
def test_schema_infer_tabulator():
    table = Table('data/data_infer.csv')
    table.infer()
    assert table.headers == ['id', 'age', 'name']
    assert table.schema.descriptor == SCHEMA_CSV
def test_size_remote():
    table = Table(BASE_URL % 'data/data.csv')
    table.read()
    assert table.size == SIZE
def test_size_not_read():
    table = Table(BASE_URL % 'data/data.csv')
    assert table.size is None
def test_size():
    table = Table('data/data.csv')
    table.read()
    assert table.size == SIZE
def test_size_compressed():
    table = Table('data/data.csv.zip')
    table.read()
    assert table.size == SIZE
def test_read_limit():
    table = Table(DATA_MIN, schema=SCHEMA_MIN)
    expect = [['one', 1]]
    actual = table.read(limit=1)
    assert actual == expect
def test_read_keyed():
    table = Table(DATA_MIN, schema=SCHEMA_MIN)
    expect = [{'key': 'one', 'value': 1}, {'key': 'two', 'value': 2}]
    actual = table.read(keyed=True)
    assert actual == expect
def test_iter_keyed():
    table = Table(DATA_MIN, schema=SCHEMA_MIN)
    expect = [{'key': 'one', 'value': 1}, {'key': 'two', 'value': 2}]
    actual = list(table.iter(keyed=True))
    assert actual == expect
def test_read_integrity_hash():
    table = Table('data/data.csv')
    table.read(integrity={'hash': HASH})
    assert True
def test_hash():
    table = Table('data/data.csv')
    table.read()
    assert table.hash == HASH
def test_read_integrity_hash_error():
    table = Table('data/data.csv')
    with pytest.raises(exceptions.IntegrityError) as excinfo:
        table.read(integrity={'hash': HASH + 'a'})
    assert HASH in str(excinfo.value)
def test_hash_compressed():
    table = Table('data/data.csv.zip')
    table.read()
    assert table.hash == HASH
def test_schema_descriptor(apply_defaults):
    actual = Table(DATA_MIN, schema=SCHEMA_MIN).schema.descriptor
    expect = apply_defaults(SCHEMA_MIN)
    assert actual == expect
def test_hash_remote():
    table = Table(BASE_URL % 'data/data.csv')
    table.read()
    assert table.hash == HASH
def test_iter_missing_cols_stream_closed():
    table = Table('data/data_missing_cols.csv', schema=SCHEMA_MIN)
    with pytest.raises(exceptions.CastError):
        for _ in table.iter():
            pass
    assert table._Table__stream.closed
def test_hash_not_read():
    table = Table(BASE_URL % 'data/data.csv')
    assert table.hash is None
def test_iter_csv():
    table = Table('data/data_infer.csv', schema=SCHEMA_CSV)
    expect = [[1, 39, 'Paul'], [2, 23, 'Jimmy'], [3, 36, 'Jane'], [4, 28, 'Judy']]
    actual = list(table.iter())
    assert actual == expect
def test_read_integrity():
    table = Table('data/data.csv')
    table.read(integrity={'size': SIZE, 'hash': HASH})
    assert True
def create_table_from_csv(form, table):
    """Uploads a csv file and creates a superset datasource in Hive."""
    def convert_to_hive_type(col_type):
        """maps tableschema's types to hive types"""
        tableschema_to_hive_types = {
            'boolean': 'BOOLEAN',
            'integer': 'INT',
            'number': 'DOUBLE',
            'string': 'STRING',
        }
        return tableschema_to_hive_types.get(col_type, 'STRING')

    bucket_path = config['CSV_TO_HIVE_UPLOAD_S3_BUCKET']

    if not bucket_path:
        logging.info('No upload bucket specified')
        raise Exception(
            'No upload bucket specified. You can specify one in the config file.')

    table_name = form.name.data
    schema_name = form.schema.data

    if config.get('UPLOADED_CSV_HIVE_NAMESPACE'):
        if '.' in table_name or schema_name:
            raise Exception(
                "You can't specify a namespace. "
                'All tables will be uploaded to the `{}` namespace'.format(
                    config.get('HIVE_NAMESPACE')))
        full_table_name = '{}.{}'.format(
            config.get('UPLOADED_CSV_HIVE_NAMESPACE'), table_name)
    else:
        if '.' in table_name and schema_name:
            raise Exception(
                "You can't specify a namespace both in the name of the table "
                'and in the schema field. Please remove one')
        full_table_name = '{}.{}'.format(
            schema_name, table_name) if schema_name else table_name

    filename = form.csv_file.data.filename

    upload_prefix = config['CSV_TO_HIVE_UPLOAD_DIRECTORY']
    upload_path = config['UPLOAD_FOLDER'] + \
        secure_filename(filename)

    # Optional dependency
    from tableschema import Table  # pylint: disable=import-error

    hive_table_schema = Table(upload_path).infer()
    column_name_and_type = []
    for column_info in hive_table_schema['fields']:
        column_name_and_type.append(
            '`{}` {}'.format(
                column_info['name'], convert_to_hive_type(column_info['type'])))
    schema_definition = ', '.join(column_name_and_type)

    # Optional dependency
    import boto3  # pylint: disable=import-error

    s3 = boto3.client('s3')
    location = os.path.join('s3a://', bucket_path, upload_prefix, table_name)
    s3.upload_file(
        upload_path, bucket_path,
        os.path.join(upload_prefix, table_name, filename))
    sql = f"""CREATE TABLE {full_table_name} ( {schema_definition} )
        ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS
        TEXTFILE LOCATION '{location}'
        tblproperties ('skip.header.line.count'='1')"""
    logging.info(form.con.data)
    engine = create_engine(form.con.data.sqlalchemy_uri_decrypted)
    engine.execute(sql)
def test_read_integrity_size():
    table = Table('data/data.csv')
    table.read(integrity={'size': SIZE})
    assert True
def __init__(self, filePath):
    self._filePath = filePath
    self._table = Table(filePath)
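The __init__ above and the requery() fragment earlier look like pieces of the same small wrapper class. A hedged reconstruction, purely to show how they fit together (the class name CsvSource is an assumption, not from the source):

from tableschema import Table

class CsvSource:
    """Hypothetical wrapper tying the __init__ and requery() fragments together."""

    def __init__(self, filePath):
        self._filePath = filePath
        self._table = Table(filePath)

    def requery(self):
        # rebuild the Table so subsequent reads re-open the underlying file
        self._table = Table(self._filePath)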
def test_read_integrity_size_error():
    table = Table('data/data.csv')
    with pytest.raises(exceptions.IntegrityError) as excinfo:
        table.read(integrity={'size': SIZE + 1})
    assert str(SIZE) in str(excinfo.value)
def create_table_from_csv(cls, form, table):
    """Uploads a csv file and creates a superset datasource in Hive."""

    def convert_to_hive_type(col_type):
        """maps tableschema's types to hive types"""
        tableschema_to_hive_types = {
            "boolean": "BOOLEAN",
            "integer": "INT",
            "number": "DOUBLE",
            "string": "STRING",
        }
        return tableschema_to_hive_types.get(col_type, "STRING")

    bucket_path = config["CSV_TO_HIVE_UPLOAD_S3_BUCKET"]

    if not bucket_path:
        logging.info("No upload bucket specified")
        raise Exception(
            "No upload bucket specified. You can specify one in the config file."
        )

    table_name = form.name.data
    schema_name = form.schema.data

    if config.get("UPLOADED_CSV_HIVE_NAMESPACE"):
        if "." in table_name or schema_name:
            raise Exception(
                "You can't specify a namespace. "
                "All tables will be uploaded to the `{}` namespace".format(
                    config.get("HIVE_NAMESPACE")
                )
            )
        full_table_name = "{}.{}".format(
            config.get("UPLOADED_CSV_HIVE_NAMESPACE"), table_name
        )
    else:
        if "." in table_name and schema_name:
            raise Exception(
                "You can't specify a namespace both in the name of the table "
                "and in the schema field. Please remove one"
            )
        full_table_name = (
            "{}.{}".format(schema_name, table_name) if schema_name else table_name
        )

    filename = form.csv_file.data.filename

    upload_prefix = config["CSV_TO_HIVE_UPLOAD_DIRECTORY"]
    upload_path = config["UPLOAD_FOLDER"] + secure_filename(filename)

    # Optional dependency
    from tableschema import Table  # pylint: disable=import-error

    hive_table_schema = Table(upload_path).infer()
    column_name_and_type = []
    for column_info in hive_table_schema["fields"]:
        column_name_and_type.append(
            "`{}` {}".format(
                column_info["name"], convert_to_hive_type(column_info["type"])
            )
        )
    schema_definition = ", ".join(column_name_and_type)

    # Optional dependency
    import boto3  # pylint: disable=import-error

    s3 = boto3.client("s3")
    location = os.path.join("s3a://", bucket_path, upload_prefix, table_name)
    s3.upload_file(
        upload_path, bucket_path, os.path.join(upload_prefix, table_name, filename)
    )
    sql = f"""CREATE TABLE {full_table_name} ( {schema_definition} )
        ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS
        TEXTFILE LOCATION '{location}'
        tblproperties ('skip.header.line.count'='1')"""
    logging.info(form.con.data)
    engine = create_engine(form.con.data.sqlalchemy_uri_decrypted)
    engine.execute(sql)
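The tableschema-specific step inside the two Hive upload helpers above can be isolated into a short sketch. This is a hedged illustration, not the Superset code itself: the function name hive_columns_for_csv and the example path are assumptions; Table(path).infer() returns a schema descriptor whose 'fields' list drives the Hive type mapping, with unknown types falling back to STRING as in the helpers.

from tableschema import Table

TABLESCHEMA_TO_HIVE = {
    'boolean': 'BOOLEAN',
    'integer': 'INT',
    'number': 'DOUBLE',
    'string': 'STRING',
}

def hive_columns_for_csv(csv_path):
    """Infer a Table Schema descriptor and render Hive column definitions."""
    descriptor = Table(csv_path).infer()
    return ', '.join(
        '`{}` {}'.format(field['name'],
                         TABLESCHEMA_TO_HIVE.get(field['type'], 'STRING'))
        for field in descriptor['fields'])

# Hypothetical usage against the small test fixture used elsewhere in this section:
# print(hive_columns_for_csv('data/data_infer.csv'))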
    output_row['pseudo_align_70m_SD_mm'] = ''
    output_row['mean_top_70m_mm'] = input_row['mean_top_70m_mm']
    # unvalidated extension point for non-standard geometry items
    output_row['extended_items_geometry'] = json.dumps(
        {'curvature_mm': input_row['curvature_mm']})
    output_row['accel_z_wb_ms_2'] = input_row['accel_z_wb_ms_2']
    output_row['accel_x_wc_ms_2'] = input_row['accel_x_wc_ms_2']
    output_row['accel_x_wd_ms_2'] = input_row['accel_x_wd_ms_2']
    output_row['accel_y_wd_ms_2'] = input_row['accel_y_wd_ms_2']
    output_row['accel_y_wp_ms_2'] = input_row['accel_y_wp_ms_2']
    output_row['creating_adapter_version'] = ADAPTER_VERSION
    output_row['data_row_uid'] = uuid.uuid4()
    wr.writerow(output_row)

if args.schema is not None:
    # validate the output file against the schema
    # print(args.schema.name)
    tbl = Table(out_file.name, schema=args.schema.name)
    # print('checking...')
    try:
        tbl.read(limit=2000)
        print('OK')
    except exceptions.TableSchemaException as exception:
        for error in exception.errors:
            print(error)
        time.sleep(5)
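A condensed sketch of the validation step at the end of the adapter above. It is an illustration under assumptions: validate_csv, out_path and schema_path are placeholders standing in for out_file.name and args.schema.name; it reads up to 2000 rows against the schema and reports errors instead of raising.

from tableschema import Table, exceptions

def validate_csv(out_path, schema_path, limit=2000):
    # out_path / schema_path are hypothetical stand-ins for out_file.name / args.schema.name
    table = Table(out_path, schema=schema_path)
    try:
        table.read(limit=limit)
        print('OK')
        return True
    except exceptions.TableSchemaException as exception:
        # as in the adapter above, the exception collects per-row problems in .errors
        for error in exception.errors:
            print(error)
        return False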