def setup_augmentation(orig, aug):
    with data(orig) as d:
        orig_meta = process_dataset(d)
    with data(aug) as d:
        aug_meta = process_dataset(d)
    with tempfile.TemporaryDirectory() as tmp:
        result = os.path.join(tmp, 'result.csv')
        writer = make_writer(result)
        with data(orig) as orig_data:
            with data(aug) as aug_data:
                yield orig_data, aug_data, orig_meta, aug_meta, result, writer
def profile(self, df: pd.DataFrame, columns: Optional[Columns] = None) -> Dict:
    """Run the profiler on a given data frame.

    A new data frame with the row index reset is created first.

    Parameters
    ----------
    df: pd.DataFrame
        Input data frame.
    columns: int, string, or list(int or string), default=None
        Single column, or list of column index positions or column names,
        identifying the columns to profile. Profile the full dataset if None.

    Returns
    -------
    dict
    """
    # Filter columns if a list of columns is given. Otherwise project on all
    # columns in the schema to get a new data frame where we can safely
    # reset the row index.
    columns = list(range(len(df.columns))) if columns is None else columns
    df = select(df=df, columns=columns).reset_index(drop=True)
    return dmp.process_dataset(df, include_sample=False, plots=True)
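# For reference, a minimal sketch of calling the underlying profiler directly on a
# pandas DataFrame. The example frame and its column names are made up for
# illustration; only the process_dataset() call and the 'columns' layout of the
# result are taken from the snippets in this section.
import pandas as pd
import datamart_profiler

df = pd.DataFrame({
    'year': [2004, 2005, 2006],
    'value': [1.5, 2.5, 3.5],
})

metadata = datamart_profiler.process_dataset(df)

# The result is a plain dict; 'columns' mirrors the input columns.
for col in metadata['columns']:
    print(col['name'], col['structural_type'], col['semantic_types'])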
def indentify_feature_types(csv_path, unkown_feature_types, target_names):
    metadata = datamart_profiler.process_dataset(csv_path)

    inferred_feature_types = {}
    for index, item in enumerate(metadata['columns']):
        feature_name = item['name']
        if feature_name in unkown_feature_types:
            semantic_types = item['semantic_types'] if len(item['semantic_types']) > 0 \
                else [item['structural_type']]
            d3m_semantic_types = []
            for semantic_type in semantic_types:
                if semantic_type == 'http://schema.org/Enumeration':
                    # Changing to D3M format
                    semantic_type = 'https://metadata.datadrivendiscovery.org/types/CategoricalData'
                elif semantic_type == 'http://schema.org/identifier':
                    # Changing to D3M format
                    # semantic_type = 'https://metadata.datadrivendiscovery.org/types/PrimaryKey'
                    semantic_type = 'http://schema.org/Integer'
                elif semantic_type == 'https://metadata.datadrivendiscovery.org/types/MissingData':
                    semantic_type = 'http://schema.org/Text'
                d3m_semantic_types.append(semantic_type)

            role = 'https://metadata.datadrivendiscovery.org/types/Attribute'
            if 'https://metadata.datadrivendiscovery.org/types/PrimaryKey' in d3m_semantic_types:
                role = 'https://metadata.datadrivendiscovery.org/types/PrimaryKey'
            elif feature_name in target_names:
                role = 'https://metadata.datadrivendiscovery.org/types/TrueTarget'

            inferred_feature_types[feature_name] = (role, d3m_semantic_types, index)

    logger.info(
        'Inferred feature types:\n%s',
        '\n'.join([
            '%s = [%s]' % (k, ', '.join([i for i in v[1]]))
            for k, v in inferred_feature_types.items()
        ])
    )
    return inferred_feature_types
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-v', action='count', default=0, dest='verbosity',
                        help="augments verbosity level")
    parser.add_argument('--include-sample', action='store_true', default=False,
                        help="include a few random rows to the result")
    parser.add_argument('--no-coverage', action='store_false', default=True,
                        dest='coverage',
                        help="don't compute data ranges (using k-means)")
    parser.add_argument('--plots', action='store_true', default=False,
                        dest='plots',
                        help="compute plots (in vega format)")
    parser.add_argument('--load-max-size', action='store', nargs=1,
                        help="target size of the data to be analyzed. The "
                             "data will be randomly sampled if it is bigger")
    parser.add_argument('file', nargs=1, help="file to profile")
    args = parser.parse_args()

    # Set up logging
    level = {
        0: logging.WARNING,
        1: logging.INFO,
    }.get(args.verbosity, logging.DEBUG)
    logging.basicConfig(level=level)

    # Check for datamart-geo
    try:
        from datamart_geo import GeoData
        geo_data = GeoData.from_local_cache()
    except ImportError:
        logger.info("datamart-geo not installed")
        geo_data = None
    except FileNotFoundError:
        logger.warning("datamart-geo is installed but no data is available")
        geo_data = None

    # Parse max size
    load_max_size = None
    if args.load_max_size:
        if args.load_max_size[0] in ('0', '-1', ''):
            load_max_size = float('inf')
        else:
            load_max_size = parse_size(args.load_max_size[0])

    # Profile
    metadata = process_dataset(
        args.file[0],
        geo_data=geo_data,
        include_sample=args.include_sample,
        coverage=args.coverage,
        plots=args.plots,
        load_max_size=load_max_size,
    )
    json.dump(metadata, sys.stdout, indent=2, sort_keys=True)
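# The command-line entry point above is a thin wrapper around process_dataset().
# A hedged sketch of the equivalent library call: the file name and the numeric
# sampling cap are placeholders, and the keyword arguments simply mirror the CLI
# flags shown above.
import json
import sys

from datamart_profiler import process_dataset

metadata = process_dataset(
    'data.csv',                 # placeholder path
    include_sample=True,        # --include-sample
    coverage=True,              # on unless --no-coverage is given
    plots=True,                 # --plots
    load_max_size=10_000_000,   # --load-max-size (the CLI parses it with parse_size)
)
json.dump(metadata, sys.stdout, indent=2, sort_keys=True)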
def test_point_latlong(self):
    """Test profiling latitudes & longitudes"""
    with data('geo_latlong.csv', 'r') as data_fp:
        metadata = process_dataset(
            data_fp,
            coverage=True,
        )
    self.assertJson(
        metadata,
        {
            'types': ['numerical', 'spatial'],
            'size': 4408,
            'nb_rows': 100,
            'nb_profiled_rows': 100,
            'nb_columns': 3,
            'nb_spatial_columns': 1,
            'nb_numerical_columns': 1,
            'average_row_size': lambda n: round(n, 2) == 44.08,
            'attribute_keywords': ['id', 'coords', 'height'],
            'columns': [
                {
                    'name': 'id',
                    'structural_type': 'http://schema.org/Text',
                    'semantic_types': [],
                    'missing_values_ratio': 0.01,
                    'num_distinct_values': 99,
                },
                {
                    'name': 'coords',
                    'structural_type': 'http://schema.org/GeoCoordinates',
                    'semantic_types': [],
                    'unclean_values_ratio': 0.0,
                    'point_format': 'lat,long',
                },
                {
                    'name': 'height',
                    'structural_type': 'http://schema.org/Float',
                    'semantic_types': [],
                    'unclean_values_ratio': 0.0,
                    'mean': lambda n: round(n, 3) == 47.827,
                    'stddev': lambda n: round(n, 2) == 21.28,
                    'coverage': check_ranges(1.0, 90.0),
                },
            ],
            'spatial_coverage': [
                {
                    'type': 'point_latlong',
                    'column_names': ['coords'],
                    'column_indexes': [1],
                    'geohashes4': check_geohashes('1211302313'),
                    'ranges': check_geo_ranges(-74.006, 40.6905, -73.983, 40.7352),
                    'number': 100,
                },
            ],
        },
    )
def test_no_index(self):
    """Test profiling a DataFrame that has no index, for reference"""
    df = self.DATA
    self.assertEqual(list(df.columns), ['a', 'b', 'c'])
    metadata = process_dataset(df)
    self.assertEqual(
        [col['name'] for col in metadata['columns']],
        ['a', 'b', 'c'],
    )
def test_process(self):
    """Test pairing latitudes & longitudes in profiler"""
    with data('lat_longs.csv', 'r') as data_fp:
        dataframe = pandas.read_csv(data_fp)
    metadata = process_dataset(
        dataframe,
    )
    # Check columns
    self.assertJson(
        [
            {k: v for k, v in c.items()
             if k in ['name', 'structural_type', 'semantic_types']}
            for c in metadata['columns']
        ],
        [
            {
                'name': 'from latitude',
                'structural_type': 'http://schema.org/Float',
                'semantic_types': ['http://schema.org/latitude'],
            },
            {
                'name': 'to long',
                'structural_type': 'http://schema.org/Float',
                'semantic_types': ['http://schema.org/longitude'],
            },
            {
                'name': 'to lat',
                'structural_type': 'http://schema.org/Float',
                'semantic_types': ['http://schema.org/latitude'],
            },
            {
                'name': 'from longitude',
                'structural_type': 'http://schema.org/Float',
                'semantic_types': ['http://schema.org/longitude'],
            },
            {
                'name': 'unpaired lat',
                'structural_type': 'http://schema.org/Float',
                'semantic_types': [],
            },
        ],
    )
    # Check pairs
    self.assertJson(
        [
            {
                k: v for k, v in c.items()
                if k not in ('ranges', 'geohashes4', 'number')
            }
            for c in metadata['spatial_coverage']
        ],
        [
            {'type': 'latlong', 'column_names': ['to lat', 'to long'],
             'column_indexes': [2, 1]},
            {'type': 'latlong', 'column_names': ['from latitude', 'from longitude'],
             'column_indexes': [0, 3]},
        ],
    )
def test_multi_index(self):
    """Test profiling a DataFrame that has multiple indexes (MultiIndex)"""
    df = self.DATA.set_index(['a', 'b'])
    self.assertEqual(list(df.index.names), ['a', 'b'])
    self.assertEqual(list(df.columns), ['c'])
    metadata = process_dataset(df)
    self.assertEqual(
        [col['name'] for col in metadata['columns']],
        ['a', 'b', 'c'],
    )
def test_duplicate_column_names(self):
    """Test reading a CSV with duplicate names."""
    metadata = process_dataset(io.StringIO(textwrap.dedent('''\
        one,two,one
        a,1,c
        d,2,f
    ''')))
    self.assertEqual(
        [col['name'] for col in metadata['columns']],
        ['one', 'two', 'one'],
    )
def test_year(self):
    """Test the 'year' special-case."""
    dataframe = pandas.DataFrame({
        'year': [2004, 2005, 2006],
        'number': [2014, 2015, float('nan')],
    })
    metadata = process_dataset(dataframe)

    def year_rng(year):
        year = float(year)
        return {'range': {'gte': year, 'lte': year}}

    self.assertJson(
        metadata,
        {
            'nb_rows': 3,
            'nb_profiled_rows': 3,
            'types': ['numerical', 'temporal'],
            'attribute_keywords': ['year', 'number'],
            'columns': [
                {
                    'name': 'year',
                    'structural_type': 'http://schema.org/Text',
                    'semantic_types': ['http://schema.org/DateTime'],
                    'unclean_values_ratio': 0.0,
                    'num_distinct_values': 3,
                    'mean': 1104508800.0,
                    'stddev': lambda n: round(n, 3) == 25784316.871,
                    'coverage': [
                        year_rng(1072915200.0),
                        year_rng(1104537600.0),
                        year_rng(1136073600.0),
                    ],
                    'temporal_resolution': 'year',
                },
                {
                    'name': 'number',
                    'structural_type': 'http://schema.org/Integer',
                    'semantic_types': [],
                    'missing_values_ratio': lambda n: round(n, 2) == 0.33,
                    'unclean_values_ratio': 0.0,
                    'num_distinct_values': 2,
                    'mean': 2014.5,
                    'stddev': 0.5,
                    'coverage': [
                        {'range': {'gte': 2014.0, 'lte': 2014.0}},
                        {'range': {'gte': 2015.0, 'lte': 2015.0}},
                    ],
                },
            ],
        },
    )
def test_process(self):
    with data('lat_longs.csv', 'r') as data_fp:
        dataframe = pandas.read_csv(data_fp)
    metadata = process_dataset(dataframe)
    # Check columns
    self.assertJson(
        [
            {k: v for k, v in c.items()
             if k in ['name', 'structural_type', 'semantic_types']}
            for c in metadata['columns']
        ],
        [
            {
                'name': 'from latitude',
                'structural_type': 'http://schema.org/Float',
                'semantic_types': ['http://schema.org/latitude'],
            },
            {
                'name': 'to long',
                'structural_type': 'http://schema.org/Float',
                'semantic_types': ['http://schema.org/longitude'],
            },
            {
                'name': 'to lat',
                'structural_type': 'http://schema.org/Float',
                'semantic_types': ['http://schema.org/latitude'],
            },
            {
                'name': 'from longitude',
                'structural_type': 'http://schema.org/Float',
                'semantic_types': ['http://schema.org/longitude'],
            },
            {
                'name': 'unpaired lat',
                'structural_type': 'http://schema.org/Float',
                'semantic_types': [],
            },
        ],
    )
    # Check pairs
    self.assertJson(
        [
            {k: v for k, v in c.items() if k != 'ranges'}
            for c in metadata['spatial_coverage']
        ],
        [
            {'lat': 'to lat', 'lon': 'to long'},
            {'lat': 'from latitude', 'lon': 'from longitude'},
        ],
    )
def handle_data_parameter(self, data):
    """
    Handles the 'data' parameter.

    :param data: the input parameter
    :return: (data_profile, data_hash)
        data_profile: the profiling (metadata) of the data
        data_hash: the SHA1 hash of the data, used as cache key
    """
    if not isinstance(data, bytes):
        raise ValueError

    # Use SHA1 of file as cache key
    sha1 = hashlib.sha1(data)
    data_hash = sha1.hexdigest()

    data_profile = self.application.redis.get('profile:' + data_hash)

    if data_profile is not None:
        logger.info("Found cached profile_data")
        data_profile = pickle.loads(data_profile)
    else:
        logger.info("Profiling...")
        start = time.perf_counter()
        data_profile = process_dataset(
            data=io.BytesIO(data),
            lazo_client=self.application.lazo_client,
            nominatim=self.application.nominatim,
            search=True,
            include_sample=False,
            coverage=True,
        )
        logger.info("Profiled in %.2fs", time.perf_counter() - start)
        self.application.redis.set(
            'profile:' + data_hash,
            pickle.dumps(data_profile),
        )

    return data_profile, data_hash
def create_d3mdataset(csv_path, destination_path, version='4.0.0'):
    metadata = datamart_profiler.process_dataset(csv_path)
    dataset_path = join(destination_path, 'datasetDoc.json')

    if exists(destination_path):
        shutil.rmtree(destination_path)

    writer = D3mWriter(
        dataset_id='internal_dataset',
        destination=destination_path,
        metadata=metadata,
        format_options={'need_d3mindex': True, 'version': version},
    )
    with open(csv_path, 'rb') as source:
        with writer.open_file('wb') as dest:
            shutil.copyfileobj(source, dest)
    writer.finish()

    return dataset_path
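# Hypothetical usage of the helper above; the CSV path and destination directory
# are placeholders. create_d3mdataset() profiles the CSV, writes a D3M dataset
# under destination_path, and returns the path of the generated datasetDoc.json.
doc_path = create_d3mdataset('tables/learningData.csv', '/tmp/internal_dataset')
print(doc_path)  # ends with 'datasetDoc.json'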
def materialize_and_process_dataset(
    dataset_id, metadata,
    lazo_client, nominatim, geo_data,
    profile_semaphore,
):
    with contextlib.ExitStack() as stack:
        # Remove converters, we'll discover what's needed
        metadata = dict(metadata)
        materialize = dict(metadata.pop('materialize'))
        materialize.pop('convert', None)

        with prom_incremented(PROM_DOWNLOADING):
            dataset_path = stack.enter_context(
                get_dataset(
                    dict(metadata, materialize=materialize),
                    dataset_id,
                )
            )

        def convert_dataset(func, path):
            def convert(cache_temp):
                with open(cache_temp, 'w', newline='') as dst:
                    func(path, dst)

            converted_key = dataset_cache_key(
                dataset_id,
                dict(metadata, materialize=materialize),
                'csv',
                {},
            )
            return stack.enter_context(
                cache_get_or_set(
                    '/cache/datasets', converted_key, convert,
                )
            )

        dataset_path = detect_format_convert_to_csv(
            dataset_path,
            convert_dataset,
            materialize,
        )

        # Profile
        with profile_semaphore:
            with prom_incremented(PROM_PROFILING):
                with tracer.start_as_current_span(
                    'profile',
                    attributes={'dataset': dataset_id},
                ):
                    logger.info("Profiling dataset %r", dataset_id)
                    start = time.perf_counter()
                    metadata = process_dataset(
                        data=dataset_path,
                        dataset_id=dataset_id,
                        metadata=metadata,
                        lazo_client=lazo_client,
                        nominatim=nominatim,
                        geo_data=geo_data,
                        include_sample=True,
                        coverage=True,
                        plots=True,
                    )
                    logger.info(
                        "Profiling dataset %r took %.2fs",
                        dataset_id,
                        time.perf_counter() - start,
                    )

        metadata['materialize'] = materialize
        return metadata
def test_profile(self):
    with data('admins.csv', 'r') as data_fp:
        metadata = process_dataset(
            data_fp,
            geo_data=self.geo_data,
            coverage=True,
        )
    self.assertJson(
        metadata,
        {
            'size': 93,
            'nb_rows': 5,
            'nb_profiled_rows': 5,
            'types': ['spatial'],
            'attribute_keywords': ['zero', 'one'],
            'columns': [
                {
                    'name': 'zero',
                    'structural_type': 'http://schema.org/Text',
                    'semantic_types': [
                        'http://schema.org/AdministrativeArea',
                        'http://schema.org/Enumeration',
                    ],
                    'num_distinct_values': 2,
                    'admin_area_level': 0,
                },
                {
                    'name': 'one',
                    'structural_type': 'http://schema.org/Text',
                    'semantic_types': [
                        'http://schema.org/AdministrativeArea',
                        'http://schema.org/Enumeration',
                    ],
                    'num_distinct_values': 5,
                    'admin_area_level': 1,
                },
            ],
            'spatial_coverage': [
                {
                    'type': 'admin',
                    'column_names': ['zero'],
                    'column_indexes': [0],
                    'ranges': [
                        {
                            'range': {
                                'type': 'envelope',
                                'coordinates': [
                                    [-61.79784095, 55.065334377],
                                    [55.8545028, -21.370782159],
                                ],
                            },
                        },
                    ],
                },
                {
                    'type': 'admin',
                    'column_names': ['one'],
                    'column_indexes': [1],
                    'ranges': [
                        {
                            'range': {
                                'type': 'envelope',
                                'coordinates': [
                                    [-61.79784, 53.72778],
                                    [13.81686, 14.40811],
                                ],
                            },
                        },
                    ],
                },
            ],
        },
    )
def test_year(self):
    """Test the 'year' special-case."""
    dataframe = pandas.DataFrame({
        'year': [2004, 2005, 2006],
        'number': [2014, 2015, 2016],
    })
    metadata = process_dataset(dataframe)

    def year_rng(year):
        year = float(year)
        return {'range': {'gte': year, 'lte': year}}

    self.assertJson(
        metadata,
        {
            'nb_rows': 3,
            'nb_profiled_rows': 3,
            'columns': [
                {
                    'name': 'year',
                    'structural_type': 'http://schema.org/Integer',
                    'semantic_types': ['http://schema.org/DateTime'],
                    'unclean_values_ratio': 0.0,
                    'num_distinct_values': 3,
                    'mean': 2005.0,
                    'stddev': lambda n: round(n, 3) == 0.816,
                    'coverage': [
                        year_rng(2004),
                        year_rng(2005),
                        year_rng(2006),
                    ],
                    'temporal_resolution': 'year',
                },
                {
                    'name': 'number',
                    'structural_type': 'http://schema.org/Integer',
                    'semantic_types': [],
                    'unclean_values_ratio': 0.0,
                    'num_distinct_values': 3,
                    'mean': 2015.0,
                    'stddev': lambda n: round(n, 3) == 0.816,
                    'coverage': [
                        {'range': {'gte': 2014.0, 'lte': 2014.0}},
                        {'range': {'gte': 2015.0, 'lte': 2015.0}},
                        {'range': {'gte': 2016.0, 'lte': 2016.0}},
                    ],
                },
            ],
        },
    )
def test_profile(self):
    old_query = datamart_profiler.spatial.nominatim_query
    queries = {
        "70 Washington Square S, New York, NY 10012": [
            {'lat': 40.7294, 'lon': -73.9972},
        ],
        "6 MetroTech, Brooklyn, NY 11201": [
            {'lat': 40.6944, 'lon': -73.9857},
        ],
        "251 Mercer St, New York, NY 10012": [
            {'lat': 40.7287, 'lon': -73.9957},
        ],
    }
    datamart_profiler.spatial.nominatim_query = \
        lambda url, *, q: [queries[qe] for qe in q]
    try:
        with data('addresses.csv', 'r') as data_fp:
            metadata = process_dataset(
                data_fp,
                nominatim='http://nominatim/',
                coverage=True,
            )
    finally:
        datamart_profiler.spatial.nominatim_query = old_query

    self.assertJson(
        metadata,
        {
            'size': 142,
            'nb_rows': 3,
            'nb_profiled_rows': 3,
            'columns': [
                {
                    'name': 'place',
                    'num_distinct_values': 3,
                    'structural_type': 'http://schema.org/Text',
                    'semantic_types': [],
                },
                {
                    'name': 'loc',
                    'structural_type': 'http://schema.org/Text',
                    'semantic_types': [
                        'http://schema.org/Text',
                        'http://schema.org/address',
                    ],
                },
            ],
            'spatial_coverage': [
                {
                    'address': 'loc',
                    'ranges': check_geo_ranges(-74.00, 40.69, -73.98, 40.73),
                },
            ],
        },
    )
def test_admin(self):
    """Test profiling administrative areas"""
    with data('admins.csv', 'r') as data_fp:
        metadata = process_dataset(
            data_fp,
            geo_data=self.geo_data,
            coverage=True,
        )
    self.assertJson(
        metadata,
        {
            'size': 143,
            'nb_rows': 5,
            'nb_profiled_rows': 5,
            'nb_columns': 3,
            'nb_spatial_columns': 2,
            'average_row_size': lambda n: round(n, 2) == 28.6,
            'types': ['spatial'],
            'attribute_keywords': ['zero', 'one', 'mixed'],
            'columns': [
                {
                    'name': 'zero',
                    'structural_type': 'http://schema.org/Text',
                    'semantic_types': [
                        'http://schema.org/AdministrativeArea',
                        'http://schema.org/Enumeration',
                    ],
                    'num_distinct_values': 3,
                    'admin_area_level': 0,
                },
                {
                    'name': 'one',
                    'structural_type': 'http://schema.org/Text',
                    'semantic_types': [
                        'http://schema.org/AdministrativeArea',
                        'http://schema.org/Enumeration',
                    ],
                    'num_distinct_values': 5,
                    'admin_area_level': 1,
                },
                {
                    'name': 'mixed',
                    'structural_type': 'http://schema.org/Text',
                    'semantic_types': [],
                    'num_distinct_values': 5,
                },
            ],
            'spatial_coverage': [
                {
                    'type': 'admin',
                    'column_names': ['zero'],
                    'column_indexes': [0],
                    'ranges': [
                        {
                            'range': {
                                'type': 'envelope',
                                'coordinates': [
                                    [-18.393686294555664, 55.09916687011719],
                                    [18.784475326538086, 27.433542251586914],
                                ],
                            },
                        },
                    ],
                    'geohashes4': lambda l: sorted(l, key=lambda h: h['hash']) == [
                        {'hash': '123201', 'number': 1}, {'hash': '123203', 'number': 1},
                        {'hash': '123210', 'number': 1}, {'hash': '123211', 'number': 1},
                        {'hash': '123212', 'number': 1}, {'hash': '123213', 'number': 1},
                        {'hash': '123221', 'number': 1}, {'hash': '123223', 'number': 1},
                        {'hash': '123230', 'number': 1}, {'hash': '123231', 'number': 1},
                        {'hash': '123232', 'number': 1}, {'hash': '123233', 'number': 1},
                        {'hash': '123300', 'number': 1}, {'hash': '123301', 'number': 1},
                        {'hash': '123302', 'number': 1}, {'hash': '123303', 'number': 1},
                        {'hash': '123310', 'number': 1}, {'hash': '123311', 'number': 1},
                        {'hash': '123312', 'number': 1}, {'hash': '123313', 'number': 1},
                        {'hash': '123320', 'number': 1}, {'hash': '123321', 'number': 1},
                        {'hash': '123322', 'number': 1}, {'hash': '123323', 'number': 1},
                        {'hash': '123330', 'number': 1}, {'hash': '123331', 'number': 1},
                        {'hash': '123332', 'number': 1}, {'hash': '123333', 'number': 1},
                        {'hash': '301001', 'number': 1}, {'hash': '301010', 'number': 1},
                        {'hash': '301011', 'number': 1}, {'hash': '301100', 'number': 1},
                        {'hash': '301101', 'number': 1}, {'hash': '301102', 'number': 1},
                        {'hash': '301103', 'number': 1}, {'hash': '301110', 'number': 1},
                        {'hash': '301111', 'number': 1}, {'hash': '301112', 'number': 1},
                        {'hash': '301113', 'number': 1}, {'hash': '301120', 'number': 1},
                        {'hash': '301121', 'number': 1}, {'hash': '301122', 'number': 1},
                        {'hash': '301123', 'number': 1}, {'hash': '301130', 'number': 1},
                        {'hash': '301131', 'number': 1}, {'hash': '301132', 'number': 1},
                        {'hash': '301133', 'number': 1}, {'hash': '310002', 'number': 2},
                        {'hash': '310003', 'number': 1}, {'hash': '310012', 'number': 1},
                        {'hash': '310013', 'number': 1}, {'hash': '310020', 'number': 2},
                        {'hash': '310021', 'number': 1}, {'hash': '310022', 'number': 1},
                        {'hash': '310030', 'number': 1}, {'hash': '310031', 'number': 1},
                    ],
                    'number': 3,
                },
                {
                    'type': 'admin',
                    'column_names': ['one'],
                    'column_indexes': [1],
                    'ranges': [
                        {
                            'range': {
                                'type': 'envelope',
                                'coordinates': [
                                    [-5.144032955169678, 50.564720153808594],
                                    [13.839637756347656, 42.33274841308594],
                                ],
                            },
                        },
                    ],
                    'geohashes4': lambda l: sorted(l, key=lambda h: h['hash']) == [
                        {'hash': '12333322', 'number': 1}, {'hash': '12333323', 'number': 1},
                        {'hash': '12333332', 'number': 1}, {'hash': '12333333', 'number': 1},
                        {'hash': '13222211', 'number': 1}, {'hash': '13222213', 'number': 1},
                        {'hash': '13222222', 'number': 1}, {'hash': '13222231', 'number': 1},
                        {'hash': '13222233', 'number': 1}, {'hash': '13222300', 'number': 1},
                        {'hash': '13222301', 'number': 1}, {'hash': '13222302', 'number': 1},
                        {'hash': '13222303', 'number': 1}, {'hash': '13222320', 'number': 2},
                        {'hash': '13222321', 'number': 2}, {'hash': '13222322', 'number': 2},
                        {'hash': '13222323', 'number': 2}, {'hash': '13222330', 'number': 1},
                        {'hash': '13222331', 'number': 1}, {'hash': '13222332', 'number': 1},
                        {'hash': '13222333', 'number': 1}, {'hash': '30111100', 'number': 1},
                        {'hash': '30111101', 'number': 1}, {'hash': '30111102', 'number': 1},
                        {'hash': '30111103', 'number': 1}, {'hash': '30111110', 'number': 1},
                        {'hash': '30111111', 'number': 1}, {'hash': '30111112', 'number': 1},
                        {'hash': '30111113', 'number': 1}, {'hash': '30111120', 'number': 1},
                        {'hash': '30111121', 'number': 1}, {'hash': '30111122', 'number': 1},
                        {'hash': '30111123', 'number': 1}, {'hash': '30111130', 'number': 1},
                        {'hash': '30111131', 'number': 1}, {'hash': '30111132', 'number': 1},
                        {'hash': '30111133', 'number': 1}, {'hash': '31000000', 'number': 1},
                        {'hash': '31000002', 'number': 1}, {'hash': '31000020', 'number': 1},
                        {'hash': '31000022', 'number': 1}, {'hash': '31000100', 'number': 1},
                        {'hash': '31000101', 'number': 1}, {'hash': '31000102', 'number': 1},
                        {'hash': '31000103', 'number': 1}, {'hash': '31000110', 'number': 1},
                        {'hash': '31000111', 'number': 1}, {'hash': '31000112', 'number': 1},
                        {'hash': '31000113', 'number': 1}, {'hash': '31000231', 'number': 1},
                        {'hash': '31000233', 'number': 1}, {'hash': '31000320', 'number': 1},
                        {'hash': '31000321', 'number': 1}, {'hash': '31000322', 'number': 1},
                        {'hash': '31000323', 'number': 1}, {'hash': '31000330', 'number': 1},
                        {'hash': '31000331', 'number': 1}, {'hash': '31000332', 'number': 1},
                        {'hash': '31000333', 'number': 1}, {'hash': '31002011', 'number': 1},
                        {'hash': '31002013', 'number': 1}, {'hash': '31002100', 'number': 1},
                        {'hash': '31002101', 'number': 1}, {'hash': '31002102', 'number': 1},
                        {'hash': '31002103', 'number': 1}, {'hash': '31002110', 'number': 1},
                        {'hash': '31002111', 'number': 1}, {'hash': '31002112', 'number': 1},
                        {'hash': '31002113', 'number': 1},
                    ],
                    # FIXME: number currently 1 because of missing geo data
                    'number': lambda n: isinstance(n, int),
                },
            ],
        },
    )
def materialize_and_process_dataset(
    dataset_id, metadata,
    lazo_client, nominatim, profile_semaphore,
    cache_invalid=False,
):
    with contextlib.ExitStack() as stack:
        with prom_incremented(PROM_DOWNLOADING):
            dataset_path = stack.enter_context(
                get_dataset(metadata, dataset_id, cache_invalid=cache_invalid)
            )
        materialize = metadata.pop('materialize')

        # Check for Excel file format
        try:
            xlrd.open_workbook(dataset_path)
        except xlrd.XLRDError:
            pass
        else:
            logger.info("This is an Excel file")
            materialize.setdefault('convert', []).append({'identifier': 'xls'})
            excel_temp_path = dataset_path + '.xls'
            os.rename(dataset_path, excel_temp_path)
            try:
                with open(dataset_path, 'w', newline='') as dst:
                    xls_to_csv(excel_temp_path, dst)
            finally:
                os.remove(excel_temp_path)

        # Check for TSV file format
        with open(dataset_path, 'r') as fp:
            try:
                dialect = csv.Sniffer().sniff(fp.read(16384))
            except Exception as error:  # csv.Error, UnicodeDecodeError
                logger.error("csv.Sniffer error: %s", error)
                dialect = csv.get_dialect('excel')
        if getattr(dialect, 'delimiter', '') == '\t':
            logger.info("This is a TSV file")
            materialize.setdefault('convert', []).append({'identifier': 'tsv'})
            tsv_temp_path = dataset_path + '.tsv'
            os.rename(dataset_path, tsv_temp_path)
            try:
                with open(dataset_path, 'w', newline='') as dst:
                    tsv_to_csv(tsv_temp_path, dst)
            finally:
                os.remove(tsv_temp_path)

        # Check for pivoted temporal table
        with open(dataset_path, 'r') as fp:
            reader = csv.reader(fp)
            try:
                columns = next(iter(reader))
            except StopIteration:
                columns = []
        if len(columns) >= 3:
            non_matches = [
                i for i, name in enumerate(columns)
                if parse_date(name) is None
            ]
            if len(non_matches) <= max(2.0, 0.20 * len(columns)):
                logger.info("Detected pivoted table")
                materialize.setdefault('convert', []).append({
                    'identifier': 'pivot',
                    'except_columns': non_matches,
                })
                pivot_temp_path = dataset_path + '.pivot.csv'
                os.rename(dataset_path, pivot_temp_path)
                try:
                    with open(dataset_path, 'w', newline='') as dst:
                        pivot_table(pivot_temp_path, dst, non_matches)
                finally:
                    os.remove(pivot_temp_path)

        # Profile
        with profile_semaphore:
            with prom_incremented(PROM_PROFILING):
                logger.info("Profiling dataset %r", dataset_id)
                start = time.perf_counter()
                metadata = process_dataset(
                    data=dataset_path,
                    dataset_id=dataset_id,
                    metadata=metadata,
                    lazo_client=lazo_client,
                    nominatim=nominatim,
                    include_sample=True,
                    coverage=True,
                    plots=True,
                )
                logger.info(
                    "Profiling dataset %r took %.2fs",
                    dataset_id,
                    time.perf_counter() - start,
                )

        metadata['materialize'] = materialize
        return metadata
if len(sys.argv) != 2:
    print("Invalid number of arguments")
    exit(-1)

data_file = sys.argv[1]
temporal_columns = []
df_data = pandas.read_csv(data_file)
threshold = 0.98

for col in list(df_data):
    column_data = df_data[col].values
    temporal = 0
    for i in range(column_data.size):
        cell = str(column_data[i])
        if is_time(cell) or is_year(cell) or is_month(cell) or \
                is_timestamp(cell) or is_datetime_1(cell) or is_datetime_2(cell):
            temporal += 1
    if temporal / column_data.size >= threshold:
        temporal_columns.append(col)

prof = datamart_profiler.process_dataset(df_data)
for i in range(len(prof["columns"])):
    if "temporal_resolution" in prof["columns"][i].keys():
        temporal_columns.append(prof["columns"][i]["name"])

with open(data_file[-13:-3] + "out", 'w') as file:
    file.write("Number of columns: %d\n" % (prof["nb_columns"]))
    file.write("Number of temporal columns: %d\n" % (len(temporal_columns)))
    for i in range(len(temporal_columns)):
        file.write(temporal_columns[i] + '\n')
def test_profile(self):
    queries = {
        "70 Washington Square S, New York, NY 10012": [
            {'lat': 40.7294, 'lon': -73.9972},
        ],
        "6 MetroTech, Brooklyn, NY 11201": [
            {'lat': 40.6944, 'lon': -73.9857},
        ],
        "251 Mercer St, New York, NY 10012": [
            {'lat': 40.7287, 'lon': -73.9957},
        ],
    }

    def replacement(url, *, q):
        if not replacement.failed:
            # Fail just once
            replacement.failed = True
            response = requests.Response()
            response.status_code = 500
            raise requests.HTTPError("Fake 500 error", response=response)
        return [queries[qe] for qe in q]

    replacement.failed = False

    old_query = spatial.nominatim_query
    old_min_batch_size = spatial.NOMINATIM_MIN_SPLIT_BATCH_SIZE
    spatial.nominatim_query = replacement
    spatial.NOMINATIM_MIN_SPLIT_BATCH_SIZE = 2
    try:
        with data('addresses.csv', 'r') as data_fp:
            metadata = process_dataset(
                data_fp,
                nominatim='http://nominatim/',
                coverage=True,
            )
    finally:
        spatial.nominatim_query = old_query
        spatial.NOMINATIM_MIN_SPLIT_BATCH_SIZE = old_min_batch_size

    self.assertJson(
        metadata,
        {
            'size': 142,
            'nb_rows': 3,
            'nb_profiled_rows': 3,
            'types': ['spatial'],
            'attribute_keywords': ['place', 'loc'],
            'columns': [
                {
                    'name': 'place',
                    'num_distinct_values': 3,
                    'structural_type': 'http://schema.org/Text',
                    'semantic_types': [],
                },
                {
                    'name': 'loc',
                    'structural_type': 'http://schema.org/Text',
                    'semantic_types': [
                        'http://schema.org/Text',
                        'http://schema.org/address',
                    ],
                },
            ],
            'spatial_coverage': [
                {
                    'type': 'address',
                    'column_names': ['loc'],
                    'column_indexes': [1],
                    'ranges': check_geo_ranges(-74.00, 40.69, -73.98, 40.73),
                },
            ],
        },
    )
def handle_data_parameter(self, data):
    """
    Handles the 'data' parameter.

    :param data: the input parameter
    :return: (data_profile, data_hash)
        data_profile: the profiling (metadata) of the data
        data_hash: the SHA1 hash of the data, used as cache key
    """
    if not isinstance(data, bytes):
        raise ValueError

    # Use SHA1 of file as cache key
    sha1 = hashlib.sha1(data)
    data_hash = sha1.hexdigest()

    data_profile = self.application.redis.get('profile:' + data_hash)

    # Do format conversion
    materialize = {}

    def create_csv(cache_temp):
        with open(cache_temp, 'wb') as fp:
            fp.write(data)

        def convert_dataset(func, path):
            with tempfile.NamedTemporaryFile(
                prefix='.convert',
                dir='/cache/user_data',
            ) as tmpfile:
                os.rename(path, tmpfile.name)
                with open(path, 'w', newline='') as dst:
                    func(tmpfile.name, dst)
                return path

        ret = detect_format_convert_to_csv(
            cache_temp,
            convert_dataset,
            materialize,
        )
        assert ret == cache_temp

    with cache_get_or_set(
        '/cache/user_data', data_hash, create_csv,
    ) as csv_path:
        if data_profile is not None:
            # This is here because we want to put the data in the cache
            # even if the profile is already in Redis
            logger.info("Found cached profile_data")
            data_profile = json.loads(data_profile)
        else:
            logger.info("Profiling...")
            start = time.perf_counter()
            with open(csv_path, 'rb') as data:
                data_profile = process_dataset(
                    data=data,
                    lazo_client=self.application.lazo_client,
                    nominatim=self.application.nominatim,
                    geo_data=self.application.geo_data,
                    search=True,
                    include_sample=True,
                    coverage=True,
                )
            logger.info("Profiled in %.2fs", time.perf_counter() - start)
            data_profile['materialize'] = materialize
            self.application.redis.set(
                'profile:' + data_hash,
                json.dumps(
                    data_profile,
                    # Compact
                    sort_keys=True,
                    indent=None,
                    separators=(',', ':'),
                ),
            )

    return data_profile, data_hash