def getSchema(self):
    '''
    Get the data schema inferred from records. The number of records is defined
    by SCHEMA_INFER_LIMIT and the confidence threshold by SCHEMA_INFER_CONFIDENCE.
    '''
    try:
        tmpCursor = self.getCursor()
        tmpCursor.execute(self.query_sql)
        i = settings.SCHEMA_INFER_LIMIT
        tmpList = []
        row = tmpCursor.fetchone()
        while i > 0 and row is not None:
            tmpList.append(row)
            row = tmpCursor.fetchone()
            i -= 1
        tmpList = [self.getColumns()] + tmpList
        t = Table(tmpList)
        t.infer()
        t.schema.descriptor[
            'missingValues'] = settings.SCHEMA_DATABASE_MISSING_VALUES
        return t.infer(confidence=settings.SCHEMA_INFER_CONFIDENCE)
    except Exception as e:
        tmpCursor.close()
        raise e
def _is_columns_dtype_same(self, row_limit: int = 500, confidence: float = 0.85):
    train_table = Table(self.train_csv_path)
    predict_table = Table(self.predict_csv_path)
    train_schema = train_table.infer(limit=row_limit, confidence=confidence)
    predict_schema = predict_table.infer(limit=row_limit, confidence=confidence)
    train_fields = train_schema['fields']
    predict_fields = predict_schema['fields']
    train_col_names = [d['name'] for d in train_fields]
    for predict_dict in predict_fields:
        predict_dtype = predict_dict['type']
        predict_name = predict_dict['name']
        train_idx = train_col_names.index(predict_name)
        train_dict = train_fields[train_idx]
        train_dtype = train_dict['type']
        if train_dtype != predict_dtype:
            raise Exception(
                f'Column {predict_name} has different data types between the train and predict CSVs'
            )
def test_schema_infer_storage(import_module, apply_defaults):
    import_module.return_value = Mock(Storage=Mock(return_value=Mock(
        describe=Mock(return_value=SCHEMA_MIN),
        iter=Mock(return_value=DATA_MIN[1:]),
    )))
    table = Table('table', storage='storage')
    table.infer()
    assert table.headers == ['key', 'value']
    assert table.schema.descriptor == apply_defaults(SCHEMA_MIN)
def test_schema_infer_missing_values():
    table = Table('data/data_infer_missing_values.csv')
    table.infer(missing_values=['-'])
    schema = deepcopy(SCHEMA_CSV)
    schema['missingValues'] = ['-']
    assert table.schema.descriptor == schema
    assert table.read() == [
        [1, 39, 'Paul'],
        [None, 25, 'Test'],
        [2, 23, 'Jimmy'],
        [None, 25, 'Test'],
        [3, 36, 'Jane'],
        [None, 25, 'Test'],
        [4, 28, 'Judy'],
    ]
def test_read_storage(import_module):
    # Mocks
    import_module.return_value = Mock(Storage=Mock(return_value=Mock(
        describe=Mock(return_value=SCHEMA_MIN),
        iter=Mock(return_value=DATA_MIN[1:]),
    )))
    # Tests
    table = Table('table', storage='storage')
    table.infer()
    expect = [['one', 1], ['two', 2]]
    actual = table.read()
    assert actual == expect
def infer(self, limit=None, confidence=.75):
    """Infer the current type by looking at the values in the table."""
    # Do initial infer to set up headers and schema.
    Table.infer(self)
    rows = self.read(cast=False)
    headers = self.headers

    # Get descriptor
    fields = []
    type_matches = {}
    for header in headers:
        fields.append({'name': header})

    rindex = 0
    for rindex, row in enumerate(rows):
        if limit is not None and rindex == limit:
            break
        # Build a column-wise lookup of type matches.
        for cindex, value in enumerate(row):
            typeid = self.__get_type(value, type_matches.get(cindex, None))
            type_matches[cindex] = typeid
    self.row_count = rindex

    url_type = type(urlparse('foo'))
    for index, results in type_matches.items():
        type_name, type_format = None, 'default'
        if results is bool:
            type_name = 'boolean'
        elif results is int:
            type_name = 'integer'
        elif results is float:
            type_name = 'number'
        elif results is str:
            type_name = 'string'
        elif results is datetime.datetime:
            type_name = 'datetime'
            type_format = 'any'
        elif results is url_type:
            type_name = 'string'
            type_format = 'uri'
        else:
            raise DerivaCSVError(msg='Bad type in infer')
        fields[index].update({'type': type_name, 'format': type_format})

    # Now update the schema to have the inferred values.
    self.schema.descriptor['fields'] = fields
    # Reset the key constraints as they were blasted away by the infer.
    self.__set_key_constraints()
    self.schema.commit()
    return
def task5():
    table = Table('movies.csv')
    table.infer()
    # Fix fields: IDs should be strings.
    table.schema.descriptor['fields'][0]['type'] = 'string'
    table.schema.descriptor['fields'][1]['type'] = 'string'
    table.schema.commit()
    print(table.schema.valid)
    table.schema.save('schema.json')
def infer(path, limit=2000):
    table = Table(path)
    table.infer(limit=limit, confidence=0.75)
    data = pd.read_csv(path, low_memory=False)
    num_rows = len(data.index)
    rows_to_scan = limit if limit < num_rows else num_rows
    metadata_array = []
    for field in table.schema.fields:
        metadata = Metadata()
        metadata.name = field.name
        metadata.type = field.type
        metadata.format = field.format
        object_description = data[field.name].astype(object).describe()
        missing_count = num_rows - int(object_description['count'])
        metadata.missing_count = missing_count
        metadata.missing_percentage = round(
            float(missing_count) / num_rows * 100, 2)
        distinct_count = int(object_description['unique'])
        metadata.distinct_count = distinct_count
        metadata.distinct_percentage = round(
            float(distinct_count) / (num_rows - missing_count) * 100, 2)
        metadata.most_frequent = object_description['top']
        if metadata.type == "string" and metadata.missing_percentage != 100.0:
            # Reclassify as a date if every scanned value parses as one.
            if rows_to_scan == data[metadata.name].head(rows_to_scan).apply(
                    lambda x: is_date(x)).sum():
                metadata.type = "date"
        if (metadata.type == "integer" or metadata.type == "number") and (
                data.dtypes[field.name] == "int64"
                or data.dtypes[field.name] == "float64"):
            numeric_description = data[field.name].describe()
            metadata.min = numeric_description['min']
            metadata.max = numeric_description['max']
        metadata.ml_type = metadata.type
        if metadata.type == "integer" or metadata.type == "number":
            metadata.ml_type = "numeric"
        if metadata.type == "string":
            metadata.ml_type = "open_text"
        metadata_array.append(metadata)
    return metadata_array
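# Hedged usage sketch for the profiling infer() above. It relies on the Metadata
# class and the is_date() helper that the function references; the CSV path and
# limit are placeholders for illustration.
for column_profile in infer('data/example.csv', limit=1000):
    print(column_profile.name, column_profile.type, column_profile.ml_type,
          column_profile.missing_percentage)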
def test_read_storage_passed_as_instance():
    # Mocks
    storage = Mock(
        describe=Mock(return_value=SCHEMA_MIN),
        iter=Mock(return_value=DATA_MIN[1:]),
        spec=Storage,
    )
    # Tests
    table = Table('table', storage=storage)
    table.infer()
    expect = [['one', 1], ['two', 2]]
    actual = table.read()
    assert actual == expect
def getSchema(self):
    '''
    Get the data schema inferred from records. The number of records is defined
    by SCHEMA_INFER_LIMIT and the confidence threshold by SCHEMA_INFER_CONFIDENCE.
    '''
    t = Table(self._filePath)
    t.infer()
    t.schema.descriptor[
        'missingValues'] = settings.SCHEMA_CSV_MISSING_VALUES
    t.schema.commit()
    return t.infer(
        settings.SCHEMA_INFER_LIMIT,
        confidence=settings.SCHEMA_INFER_CONFIDENCE)
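# A minimal standalone sketch of the pattern used in getSchema above, written
# against plain tableschema: infer once to populate the descriptor, patch
# missingValues, commit, then return the result of a second infer call with the
# limit and confidence arguments, mirroring the wrapper. The path, limit,
# confidence and missing-value list are illustrative placeholders, not project
# settings.
from tableschema import Table


def infer_csv_schema(path='data/example.csv', limit=500, confidence=0.85):
    t = Table(path)
    t.infer()
    t.schema.descriptor['missingValues'] = ['', 'N/A']
    t.schema.commit()
    return t.infer(limit, confidence=confidence)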
def test_processors():
    # Processor
    def skip_under_30(erows):
        for row_number, headers, row in erows:
            krow = dict(zip(headers, row))
            if krow['age'] >= 30:
                yield (row_number, headers, row)

    # Create table
    table = Table('data/data_infer.csv', post_cast=[skip_under_30])
    table.infer()
    expect = [[1, 39, 'Paul'], [3, 36, 'Jane']]
    actual = table.read()
    assert actual == expect
def test_infer_schema_empty_file():
    s = Table('data/empty.csv')
    d = s.infer()
    assert d == {
        'fields': [],
        'missingValues': [''],
    }
class TableLayoutOrder:
    '''Find the table by matching the order of the column data types.'''

    def __init__(self, meta_field_path, filepath):
        self.meta_field_path = meta_field_path
        self.filepath = filepath
        self.table = Table(self.filepath)
        self.meta_field = pd.read_csv(self.meta_field_path)

    def get_input_array(self):
        # Build "type + position" tokens (e.g. "string1", "integer2") from the inferred schema.
        input_table_arr = []
        for i, value in enumerate(self.table.infer()['fields']):
            input_table_arr.append(str(value['type']) + str(i + 1))
        return input_table_arr

    def check_name(self):
        # Collect candidate tables whose column names match the file's header row.
        info = TableInformation(self.filepath)
        deli = info.show_delimeter()
        with open(self.filepath) as f:
            reader = csv.reader(f, delimiter=deli)
            line = next(reader)
        unique_tables = self.meta_field['TableName'].unique()
        array_name = []
        for i in unique_tables:
            temp = self.meta_field.loc[self.meta_field["TableName"] == i]
            if np.array_equal(sorted(line), sorted(temp["fields"].values)):
                array_name.append(i)
        return array_name

    def check_ordered(self, check_tables=None):
        # Narrow the name matches down to tables whose data types and column order also match.
        unique_tables = self.check_name()
        if check_tables:
            unique_tables = list(set(unique_tables) & set(check_tables))
        probable = []
        for i in unique_tables:
            unique_table_arr = []
            temp = self.meta_field.loc[self.meta_field['TableName'] == i].copy()
            for index, row in temp.iterrows():
                unique_table_arr.append(str(row['datatype']) + str(row['order']))
            if np.array_equal(sorted(self.get_input_array()), sorted(unique_table_arr)):
                probable.append(i)
        return probable
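# Hedged usage sketch for TableLayoutOrder above. It assumes the metadata CSV
# has TableName, fields, datatype and order columns, as the class reads; both
# paths are placeholders.
layout = TableLayoutOrder('meta_fields.csv', 'incoming_table.csv')
print(layout.check_ordered())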
def get_table_schema_fields(table: Table, max_rows: int) -> List[SchemaField]:
    table.infer(limit=max_rows)
    fields: List[SchemaField] = []
    for raw_field in table.schema.fields:
        mapped_type: Type = tableschema_type_map.get(raw_field.type, NullTypeClass)
        field = SchemaField(
            fieldPath=raw_field.name,
            type=SchemaFieldDataType(mapped_type()),
            nativeDataType=str(raw_field.type),
            recursive=False,
        )
        fields.append(field)
    return fields
def reindex(self):
    file_path, params = self.get_file_info()
    t = Table(file_path, ignore_blank_headers=True, **params)
    schema = t.infer()
    data = t.iter(keyed=True)
    self.storage.create(self.index_name, schema, reindex=True, always_recreate=True)
    # Drain the write generator so the rows are actually flushed to storage.
    for res in self.storage.write(self.index_name, data):
        pass
def task5(inputcsv, outputjson):
    # Requires the tableschema package: pip install tableschema
    from tableschema import Table

    table = Table(inputcsv)
    table.read(keyed=True)
    table.infer()
    # Mark "N/A" cells as missing; the Table Schema spec expects a list of strings.
    table.schema.descriptor["missingValues"] = ["N/A"]
    table.schema.commit()
    print(table.schema.valid, table.schema.errors)
    table.read(keyed=True)
    table.schema.save(outputjson)
def geraSchema(csvFile, tabelName):
    from tableschema import Table

    # Open the CSV
    table = Table(csvFile)  # , schema='schema.json')
    table.infer(limit=100000)
    json_data = table.schema.descriptor

    sqlCommand = "DROP TABLE IF EXISTS " + tabelName + "; \n" + "CREATE TABLE " + tabelName + " ( " + "\n"
    t = len(json_data['fields'])
    cont = 0
    for campos in json_data['fields']:
        cont = cont + 1
        comma = "," if cont < t else " "
        typeData = campos['type']
        if campos['name'] == "NU_INSCRICAO":
            typeData = "varchar(12)"
        elif typeData == "string":
            typeData = "varchar"
        elif typeData == "number":
            typeData = "varchar(20)"  # "numeric(7,2)"
        elif typeData == "integer":
            typeData = "varchar(10)"
        sqlCommand = sqlCommand + "\t" + campos['name'] + "\t" + typeData + comma + "\n"
    sqlCommand = sqlCommand + "); \t"
    return sqlCommand
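# Hypothetical usage of geraSchema above; the file and table names are
# placeholders. The returned string is a DROP TABLE / CREATE TABLE pair whose
# column types come from the inferred Table Schema, mapped to varchar variants
# as coded above.
ddl = geraSchema('DADOS_ENEM_2009.csv', 'enem_2009')
print(ddl)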
class DataCleaner(object):
    """DataCleaner has a set of functions to automatically clean tabular data.

    It does things like:
    - infer data type (numeric, categorical, date, boolean)
    - infer missing values
    - automatically describe data in simple (lay) terms
    - automatically run some inference / correlation algorithms
    """

    def __init__(self, csv_file):
        self.fname_ = csv_file
        self.table_ = Table(self.fname_)

    def InferType(self):
        # First pass: use tableschema inference to get the syntactic schema
        # (integer, float, string, date).
        syn_schema = self.table_.infer()
        # Convert to JSON
        syn_schema_json = json.dumps(syn_schema, indent=4)
        print("json:" + str(syn_schema_json))
        #
        # Semantic schema for a column:
        #   numeric: integer, float, integer range, float range, date, date range
        #   categorical: boolean, enum
        #   string: person name, geo location, etc.
        #
        # Other attributes for a column: is_identifier, is_unique, missing_id, human_description
        #
        # Signals from the name (i.e., things like "id", "number", "num", "count", "code", "name"):
        #   - numeric-ness: "num" etc.
        #   - categorical-ness
        #   - id-ness: "id", "code"
        #   - string-ness: "name"
        #
        # Signals from the distribution (for syntactic types integer, float):
        #   - basic stats: min, max, median, mode, etc.
        #   - gaussian fit?
        #   - smoothness / monotonicity: bumpiness denotes categorical (code)
        #
        # Signals from the distribution (for syntactic types integer, string, date):
        #   - num_distinct_val
        #   - num_distinct_val_for_90%
        #   - boolean-ness: "Yes"/"No"; 0/1; up/down, etc.
        #   - name-ness
        #   - camelcase: denotes categorical
        #
        return syn_schema
def schema(self, use_headers=True):
    if not self.resource.file:
        raise ValidationError(_('File does not exist'))
    if self.resource.format not in ('csv', 'tsv', 'xls', 'xlsx', 'ods'):
        raise ValidationError(_('Invalid file type'))
    if not self._schema_cache:
        _schema = self.resource.tabular_data_schema or None
        if not _schema:
            _table = Table(
                self.resource.file.path,
                ignore_blank_headers=True,
                format=self.resource.format,
                encoding=self.resource.file_encoding or 'utf-8')
            _schema = _table.infer(limit=5000)
        self._schema_cache = _schema
    return self._schema_cache
def directoryUpload():
    # Define a DataSet Schema
    dsr = DataSetRequest()
    dsr.name = "my-dataset"  # placeholder: put your dataset name logic here
    dsr.description = "my-dataset description"  # placeholder: dataset description
    dsr.schema = Schema(jeff)

    # Create a DataSet with the given Schema
    dataset = datasets.create(dsr)
    domo.logger.info("Created DataSet " + dataset['id'])

    # Get a DataSet's metadata
    retrieved_dataset = datasets.get(dataset['id'])
    domo.logger.info("Retrieved DataSet " + retrieved_dataset['id'])

    # List DataSets
    dataset_list = list(datasets.list(sort=Sorting.NAME))
    domo.logger.info("Retrieved a list containing {} DataSet(s)".format(
        len(dataset_list)))

    csv_file_path = allFiles
    datasets.data_import_from_file(dataset['id'], csv_file_path)
    domo.logger.info("Uploaded data from a file to DataSet {}".format(
        dataset['id']))


for eachCSV in os.listdir(inputdir):
    allFiles = str(os.path.abspath(eachCSV))
    table = Table(allFiles)
    table.infer()
    jeff = table.schema.descriptor
    table.read(keyed=True)
    directoryUpload()
from tableschema import Table

fileCSV = r'D:\dct\enem-microdados\DADOS_ENEM_2009.csv'
fileJSON = r'D:\dct\enem-microdados\DADOS_ENEM_2009-schema.json'

# Create table
table = Table(fileCSV)
table.infer(limit=100000)
# table.schema.descriptor
table.schema.save(fileJSON)
class CsvReader(DataReader):
    '''A wrapper class for the tableschema library.'''

    def __init__(self, filePath):
        self._filePath = filePath
        self._table = Table(filePath)

    def close(self):
        pass

    def getColumns(self):
        if not self._table.headers:
            self._table.infer(settings.SCHEMA_INFER_LIMIT)
        # Clean the headers
        result = []
        for header in self._table.headers:
            tmpheader = header.lower()
            tmpheader = tmpheader.replace(' ', '_').replace('-', '_')
            r = re.search(r'\w+', tmpheader)
            if r:
                result.append(r.group())
            else:
                raise InvalidCsvHeaderException(
                    '%s is not a valid header' % header)
        return result

    def requery(self):
        self._table = Table(self._filePath)

    def getRow(self):
        i = self._table.iter(cast=True)
        return next(i)

    def getRowsList(self):
        self._table.infer()
        self._table.schema.descriptor[
            'missingValues'] = settings.SCHEMA_CSV_MISSING_VALUES
        self._table.schema.commit()
        i = self._table.iter(cast=True)
        return list(map(tuple, i))

    def getSchema(self):
        '''
        Get the data schema inferred from records. The number of records is defined
        by SCHEMA_INFER_LIMIT and the confidence threshold by SCHEMA_INFER_CONFIDENCE.
        '''
        t = Table(self._filePath)
        t.infer()
        t.schema.descriptor[
            'missingValues'] = settings.SCHEMA_CSV_MISSING_VALUES
        t.schema.commit()
        return t.infer(
            settings.SCHEMA_INFER_LIMIT,
            confidence=settings.SCHEMA_INFER_CONFIDENCE)

    def __repr__(self):
        return settings.SOURCE_TYPE_CSV_PREFIX + self._filePath
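# Hedged usage sketch for CsvReader above. It assumes a settings module that
# exposes SCHEMA_INFER_LIMIT, SCHEMA_INFER_CONFIDENCE and
# SCHEMA_CSV_MISSING_VALUES, as the class requires; the CSV path is a
# placeholder.
reader = CsvReader('data/example.csv')
print(reader.getColumns())           # normalized snake_case headers
print(reader.getSchema()['fields'])  # inferred field descriptors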
def _get_columns_names(self, file_path, **kwargs):
    table = Table(file_path, **kwargs)
    table.read(keyed=True, limit=1)
    return [field['name'] for field in table.infer()['fields']]
def test_schema_infer_tabulator():
    table = Table('data/data_infer.csv')
    table.infer()
    assert table.headers == ['id', 'age', 'name']
    assert table.schema.descriptor == SCHEMA_CSV
def data(self):
    source = self.storage.read(self.index_name)
    t = Table(source, ignore_blank_headers=True)
    return (t.infer(), t.headers, t.read(keyed=False, limit=1000))
def __init__(self, source, schema_name, table_name=None, column_map=True,
             key_columns=None, row_number_as_key=False, schema=None):
    """
    :param source: File containing the table data
    :param schema_name: Name of the Deriva schema in which this table will be located
    :param table_name: Name of the table. If not provided, use the source file name
    :param column_map: a column name mapping of the form [n1, n2, n3] or {n1: v, n2: v}.
        In the list form, elements are the exact capitalization of words to be used in a
        name. In the dictionary form, the values are what the name should be replaced with.
        All matching is done case-insensitively. Word substitution is only done after column
        names are split. Other matches are done both before and after the mapping of the
        name into snake case.
    :param key_columns: name of columns to use as keys (non-null, unique). Can be a single
        column or a list of lists of columns. The first element of the list is used as the
        primary key.
    :param row_number_as_key: if a key column is not provided, use the row number of the CSV
        in combination with an upload ID generated by the system to identify the row.
    :param schema: existing tableschema file to use instead of inferring types
    """
    if schema is True:
        schema = None
    super(DerivaCSV, self).__init__(source, schema=schema)

    self.source = source
    self.table_name = table_name
    self._column_map = column_map
    self._key_columns = key_columns
    self.schema_name = schema_name
    self.row_count = None
    self.validation_report = None
    self.row_number_as_key = row_number_as_key if self._key_columns is None else False

    # Normalize the column map so we only have a dictionary.
    if self._column_map:
        if isinstance(self._column_map, list):
            # We have a word map.
            self._column_map = {i.upper(): i for i in self._column_map}
        elif isinstance(self._column_map, dict):
            self._column_map = {
                k.upper(): v for k, v in self._column_map.items()
            }
        else:
            self._column_map = {}

    # If table_name is not specified, use the file name of the data file as the table name.
    if not self.table_name:
        self.table_name = os.path.splitext(os.path.basename(source))[0]
    # Make the table name consistent with the naming strategy.
    self.table_name = self.map_name(self.table_name)

    # Do initial infer to set up headers and schema.
    Table.infer(self)

    # Headers have to be unique.
    if len(self.headers) != len(set(self.headers)):
        raise DerivaCSVError(msg='Duplicated column name in table')

    # Key columns can be a single column or a list of lists of columns. Normalize to a list of lists.
    if self._key_columns is None or self._key_columns == []:
        self._key_columns = []
    elif not isinstance(self._key_columns, list):
        self._key_columns = [[self._key_columns]]
    else:
        self._key_columns = [
            i if type(i) is list else [i] for i in self._key_columns
        ]

    self.__set_key_constraints()
    return
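# Hedged construction sketch for DerivaCSV above; the file, schema and column
# names are placeholders. It shows the two column_map forms described in the
# docstring: the list form (exact capitalization of words) and the dict form
# (replacement values), plus a single key column that gets normalized to a
# list of lists.
csv_list_map = DerivaCSV('samples.csv', 'demo_schema',
                         column_map=['RNA', 'ID'],
                         key_columns='sample_id')
csv_dict_map = DerivaCSV('samples.csv', 'demo_schema',
                         column_map={'id': 'identifier'},
                         row_number_as_key=True)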