Example #1
def setup_augmentation(orig, aug):
    with data(orig) as d:
        orig_meta = process_dataset(d)
    with data(aug) as d:
        aug_meta = process_dataset(d)

    with tempfile.TemporaryDirectory() as tmp:
        result = os.path.join(tmp, 'result.csv')
        writer = make_writer(result)
        with data(orig) as orig_data:
            with data(aug) as aug_data:
                yield orig_data, aug_data, orig_meta, aug_meta, result, writer
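The generator above keeps its nested context managers open across the yield, so the caller gets both data handles plus a temporary result path, all of which are cleaned up when the generator resumes. A minimal consumption sketch, assuming the helper is wrapped with contextlib.contextmanager (the file names are placeholders):

import contextlib

# Hypothetical wrapper: re-exposes setup_augmentation as a context manager.
augmentation = contextlib.contextmanager(setup_augmentation)

with augmentation('original.csv', 'augmented.csv') as handles:
    orig_data, aug_data, orig_meta, aug_meta, result, writer = handles
    ...  # combine the datasets and write the output to `result` via `writer`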
Example #2
    def profile(self,
                df: pd.DataFrame,
                columns: Optional[Columns] = None) -> Dict:
        """Run profiler on a given data frame. Ensure to create a new data frame
        first that has the row index reset.

        Parameters
        ----------
        df: pd.DataFrame
            Input data frame.
        columns: int, string, or list(int or string), default=None
            Single column or list of column index positions or column names for
            those columns that are being profiled. Profile the full dataset if
            None.

        Returns
        -------
        dict
        """
        # Filter columns if a list of columns is given. Otherwise project on
        # all columns in the schema to get a new data frame where we can
        # safely reset the row index.
        columns = list(range(len(df.columns))) if columns is None else columns
        df = select(df=df, columns=columns).reset_index(drop=True)
        return dmp.process_dataset(df, include_sample=False, plots=True)
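A short usage sketch for the method above, assuming the enclosing class can be instantiated as Profiler() (the class is not shown in this example) and that the select helper and dmp alias it relies on are available; the column filter restricts profiling to a single column by name:

import pandas as pd

# Hypothetical class name; the example only shows the method body.
profiler = Profiler()
df = pd.DataFrame({'year': [2004, 2005, 2006], 'city': ['NYC', 'LA', 'SF']})
report = profiler.profile(df, columns=['year'])
print([col['name'] for col in report['columns']])  # expected: ['year']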
Example #3
def identify_feature_types(csv_path, unknown_feature_types, target_names):
    metadata = datamart_profiler.process_dataset(csv_path)
    inferred_feature_types = {}

    for index, item in enumerate(metadata['columns']):
        feature_name = item['name']
        if feature_name in unknown_feature_types:
            semantic_types = (item['semantic_types'] if item['semantic_types']
                              else [item['structural_type']])
            d3m_semantic_types = []
            for semantic_type in semantic_types:
                if semantic_type == 'http://schema.org/Enumeration':  # Changing to D3M format
                    semantic_type = 'https://metadata.datadrivendiscovery.org/types/CategoricalData'
                elif semantic_type == 'http://schema.org/identifier':  # Changing to D3M format
                    #semantic_type = 'https://metadata.datadrivendiscovery.org/types/PrimaryKey'
                    semantic_type = 'http://schema.org/Integer'
                elif semantic_type == 'https://metadata.datadrivendiscovery.org/types/MissingData':
                    semantic_type = 'http://schema.org/Text'
                d3m_semantic_types.append(semantic_type)

            role = 'https://metadata.datadrivendiscovery.org/types/Attribute'
            if 'https://metadata.datadrivendiscovery.org/types/PrimaryKey' in d3m_semantic_types:
                role = 'https://metadata.datadrivendiscovery.org/types/PrimaryKey'
            elif feature_name in target_names:
                role = 'https://metadata.datadrivendiscovery.org/types/TrueTarget'
            inferred_feature_types[feature_name] = (role, d3m_semantic_types,
                                                    index)

    logger.info(
        'Inferred feature types:\n%s',
        '\n'.join(
            '%s = [%s]' % (k, ', '.join(v[1]))
            for k, v in inferred_feature_types.items()
        ),
    )

    return inferred_feature_types
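A hedged usage sketch for the function above; the CSV path and the feature and target names are placeholders, and the return value maps each still-unknown feature name to a (role, D3M semantic types, column index) tuple as assembled in the loop:

# Hypothetical path and column names, for illustration only.
feature_types = identify_feature_types(
    'dataset.csv',
    unknown_feature_types={'price', 'region'},
    target_names={'price'},
)
for name, (role, semantic_types, index) in feature_types.items():
    print(index, name, role, semantic_types)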
Example #4
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-v', action='count',
                        default=0, dest='verbosity',
                        help="augments verbosity level")
    parser.add_argument('--include-sample',
                        action='store_true', default=False,
                        help="include a few random rows to the result")
    parser.add_argument('--no-coverage',
                        action='store_false', default=True, dest='coverage',
                        help="don't compute data ranges (using k-means)")
    parser.add_argument('--plots',
                        action='store_true', default=False, dest='plots',
                        help="compute plots (in vega format)")
    parser.add_argument('--load-max-size', action='store', nargs=1,
                        help="target size of the data to be analyzed. The "
                             "data will be randomly sampled if it is bigger")
    parser.add_argument('file', nargs=1, help="file to profile")
    args = parser.parse_args()

    # Set up logging
    level = {
        0: logging.WARNING,
        1: logging.INFO,
    }.get(args.verbosity, logging.DEBUG)
    logging.basicConfig(level=level)

    # Check for datamart-geo
    try:
        from datamart_geo import GeoData
        geo_data = GeoData.from_local_cache()
    except ImportError:
        logger.info("datamart-geo not installed")
        geo_data = None
    except FileNotFoundError:
        logger.warning("datamart-geo is installed but no data is available")
        geo_data = None

    # Parse max size
    load_max_size = None
    if args.load_max_size:
        if args.load_max_size[0] in ('0', '-1', ''):
            load_max_size = float('inf')
        else:
            load_max_size = parse_size(args.load_max_size[0])

    # Profile
    metadata = process_dataset(
        args.file[0],
        geo_data=geo_data,
        include_sample=args.include_sample,
        coverage=args.coverage,
        plots=args.plots,
        load_max_size=load_max_size,
    )

    json.dump(metadata, sys.stdout, indent=2, sort_keys=True)
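The same profile can also be produced without the command-line wrapper; a minimal sketch that calls process_dataset directly with keyword arguments already used above (the file name is a placeholder):

import json
from datamart_profiler import process_dataset

# 'data.csv' is a placeholder path.
metadata = process_dataset(
    'data.csv',
    include_sample=False,
    coverage=True,
    plots=False,
)
print(json.dumps(metadata, indent=2, sort_keys=True))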
Example #5
    def test_point_latlong(self):
        """Test profiling latitudes & longitudes"""
        with data('geo_latlong.csv', 'r') as data_fp:
            metadata = process_dataset(
                data_fp,
                coverage=True,
            )

        self.assertJson(
            metadata,
            {
                'types': ['numerical', 'spatial'],
                "size": 4408,
                "nb_rows": 100,
                "nb_profiled_rows": 100,
                "nb_columns": 3,
                "nb_spatial_columns": 1,
                "nb_numerical_columns": 1,
                "average_row_size": lambda n: round(n, 2) == 44.08,
                "attribute_keywords": ["id", "coords", "height"],
                "columns": [
                    {
                        "name": "id",
                        "structural_type": "http://schema.org/Text",
                        "semantic_types": [],
                        "missing_values_ratio": 0.01,
                        "num_distinct_values": 99
                    },
                    {
                        "name": "coords",
                        "structural_type": "http://schema.org/GeoCoordinates",
                        "semantic_types": [],
                        "unclean_values_ratio": 0.0,
                        "point_format": "lat,long",
                    },
                    {
                        "name": "height",
                        "structural_type": "http://schema.org/Float",
                        "semantic_types": [],
                        "unclean_values_ratio": 0.0,
                        "mean": lambda n: round(n, 3) == 47.827,
                        "stddev": lambda n: round(n, 2) == 21.28,
                        "coverage": check_ranges(1.0, 90.0),
                    }
                ],
                "spatial_coverage": [
                    {
                        "type": "point_latlong",
                        "column_names": ["coords"],
                        "column_indexes": [1],
                        "geohashes4": check_geohashes('1211302313'),
                        "ranges": check_geo_ranges(-74.006, 40.6905, -73.983, 40.7352),
                        "number": 100,
                    },
                ],
            },
        )
Example #6
    def test_no_index(self):
        """Test profiling a DataFrame that has no index, for reference"""
        df = self.DATA
        self.assertEqual(list(df.columns), ['a', 'b', 'c'])

        metadata = process_dataset(df)
        self.assertEqual(
            [col['name'] for col in metadata['columns']],
            ['a', 'b', 'c'],
        )
Example #7
 def test_process(self):
     """Test pairing latitudes & longitudes in profiler"""
     with data('lat_longs.csv', 'r') as data_fp:
         dataframe = pandas.read_csv(data_fp)
     metadata = process_dataset(
         dataframe,
     )
     # Check columns
     self.assertJson(
         [
             {k: v for k, v in c.items()
              if k in ['name', 'structural_type', 'semantic_types']}
             for c in metadata['columns']],
         [
             {
                 'name': 'from latitude',
                 'structural_type': 'http://schema.org/Float',
                 'semantic_types': ['http://schema.org/latitude'],
             },
             {
                 'name': 'to long',
                 'structural_type': 'http://schema.org/Float',
                 'semantic_types': ['http://schema.org/longitude'],
             },
             {
                 'name': 'to lat',
                 'structural_type': 'http://schema.org/Float',
                 'semantic_types': ['http://schema.org/latitude'],
             },
             {
                 'name': 'from longitude',
                 'structural_type': 'http://schema.org/Float',
                 'semantic_types': ['http://schema.org/longitude'],
             },
             {
                 'name': 'unpaired lat',
                 'structural_type': 'http://schema.org/Float',
                 'semantic_types': [],
             },
         ]
     )
     # Check pairs
     self.assertJson(
         [
             {
                 k: v for k, v in c.items()
                 if k not in ('ranges', 'geohashes4', 'number')
             }
             for c in metadata['spatial_coverage']
         ],
         [
             {'type': 'latlong', 'column_names': ['to lat', 'to long'], 'column_indexes': [2, 1]},
             {'type': 'latlong', 'column_names': ['from latitude', 'from longitude'], 'column_indexes': [0, 3]},
         ],
     )
Example #8
    def test_multi_index(self):
        """Test profiling a DataFrame that has multiple indexes (MultiIndex)"""
        df = self.DATA.set_index(['a', 'b'])
        self.assertEqual(list(df.index.names), ['a', 'b'])
        self.assertEqual(list(df.columns), ['c'])

        metadata = process_dataset(df)
        self.assertEqual(
            [col['name'] for col in metadata['columns']],
            ['a', 'b', 'c'],
        )
Example #9
 def test_duplicate_column_names(self):
     """Test reading a CSV with duplicate names."""
     metadata = process_dataset(io.StringIO(textwrap.dedent('''\
         one,two,one
         a,1,c
         d,2,f
     ''')))
     self.assertEqual(
         [col['name'] for col in metadata['columns']],
         ['one', 'two', 'one'],
     )
Example #10
    def test_year(self):
        """Test the 'year' special-case."""
        dataframe = pandas.DataFrame({
            'year': [2004, 2005, 2006],
            'number': [2014, 2015, float('nan')],
        })
        metadata = process_dataset(dataframe)

        def year_rng(year):
            year = float(year)
            return {'range': {'gte': year, 'lte': year}}

        self.assertJson(
            metadata,
            {
                'nb_rows': 3,
                'nb_profiled_rows': 3,
                'types': ['numerical', 'temporal'],
                'attribute_keywords': ['year', 'number'],
                'columns': [
                    {
                        'name': 'year',
                        'structural_type': 'http://schema.org/Text',
                        'semantic_types': ['http://schema.org/DateTime'],
                        'unclean_values_ratio': 0.0,
                        'num_distinct_values': 3,
                        'mean': 1104508800.0,
                        'stddev': lambda n: round(n, 3) == 25784316.871,
                        'coverage': [
                            year_rng(1072915200.0),
                            year_rng(1104537600.0),
                            year_rng(1136073600.0),
                        ],
                        'temporal_resolution': 'year',
                    },
                    {
                        'name': 'number',
                        'structural_type': 'http://schema.org/Integer',
                        'semantic_types': [],
                        'missing_values_ratio': lambda n: round(n, 2) == 0.33,
                        'unclean_values_ratio': 0.0,
                        'num_distinct_values': 2,
                        'mean': 2014.5,
                        'stddev': 0.5,
                        'coverage': [
                            {'range': {'gte': 2014.0, 'lte': 2014.0}},
                            {'range': {'gte': 2015.0, 'lte': 2015.0}},
                        ],
                    },
                ],
            },
        )
Example #11
 def test_process(self):
     with data('lat_longs.csv', 'r') as data_fp:
         dataframe = pandas.read_csv(data_fp)
     metadata = process_dataset(dataframe)
     # Check columns
     self.assertJson([{
         k: v
         for k, v in c.items()
         if k in ['name', 'structural_type', 'semantic_types']
     } for c in metadata['columns']], [
         {
             'name': 'from latitude',
             'structural_type': 'http://schema.org/Float',
             'semantic_types': ['http://schema.org/latitude'],
         },
         {
             'name': 'to long',
             'structural_type': 'http://schema.org/Float',
             'semantic_types': ['http://schema.org/longitude'],
         },
         {
             'name': 'to lat',
             'structural_type': 'http://schema.org/Float',
             'semantic_types': ['http://schema.org/latitude'],
         },
         {
             'name': 'from longitude',
             'structural_type': 'http://schema.org/Float',
             'semantic_types': ['http://schema.org/longitude'],
         },
         {
             'name': 'unpaired lat',
             'structural_type': 'http://schema.org/Float',
             'semantic_types': [],
         },
     ])
     # Check pairs
     self.assertJson(
         [{k: v
           for k, v in c.items() if k != 'ranges'}
          for c in metadata['spatial_coverage']],
         [
             {
                 'lat': 'to lat',
                 'lon': 'to long'
             },
             {
                 'lat': 'from latitude',
                 'lon': 'from longitude'
             },
         ],
     )
Example #12
    def handle_data_parameter(self, data):
        """
        Handles the 'data' parameter.

        :param data: the input parameter
        :return: (data, data_profile)
          data: data as bytes (either the input or loaded from the input)
          data_profile: the profiling (metadata) of the data
        """

        if not isinstance(data, bytes):
            raise ValueError

        # Use SHA1 of file as cache key
        sha1 = hashlib.sha1(data)
        data_hash = sha1.hexdigest()

        data_profile = self.application.redis.get('profile:' + data_hash)

        if data_profile is not None:
            logger.info("Found cached profile_data")
            data_profile = pickle.loads(data_profile)
        else:
            logger.info("Profiling...")
            start = time.perf_counter()
            data_profile = process_dataset(
                data=io.BytesIO(data),
                lazo_client=self.application.lazo_client,
                nominatim=self.application.nominatim,
                search=True,
                include_sample=False,
                coverage=True,
            )
            logger.info("Profiled in %.2fs", time.perf_counter() - start)

            self.application.redis.set(
                'profile:' + data_hash,
                pickle.dumps(data_profile),
            )

        return data_profile, data_hash
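The caching idea above, keyed on the SHA-1 of the raw bytes, also works without Redis; a minimal in-process sketch that keeps only the pieces shown in the handler (the dict cache is a hypothetical stand-in and the keyword arguments are a reduced set of the ones used above):

import hashlib
import io

_profile_cache = {}  # hypothetical in-process stand-in for the Redis cache

def profile_bytes(data: bytes) -> dict:
    # Same cache key as the handler: SHA-1 of the file contents.
    key = hashlib.sha1(data).hexdigest()
    if key not in _profile_cache:
        _profile_cache[key] = process_dataset(
            data=io.BytesIO(data),
            include_sample=False,
            coverage=True,
        )
    return _profile_cache[key]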
Example #13
def create_d3mdataset(csv_path, destination_path, version='4.0.0'):
    metadata = datamart_profiler.process_dataset(csv_path)
    dataset_path = join(destination_path, 'datasetDoc.json')

    if exists(destination_path):
        shutil.rmtree(destination_path)

    writer = D3mWriter(
        dataset_id='internal_dataset',
        destination=destination_path,
        metadata=metadata,
        format_options={
            'need_d3mindex': True,
            'version': version
        },
    )
    with open(csv_path, 'rb') as source:
        with writer.open_file('wb') as dest:
            shutil.copyfileobj(source, dest)
    writer.finish()

    return dataset_path
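A usage sketch for the helper above; the input CSV and the destination directory are placeholders, and the returned path is the datasetDoc.json location inside the destination directory:

# Hypothetical paths, for illustration only.
doc_path = create_d3mdataset('input.csv', '/tmp/internal_dataset')
print(doc_path)  # /tmp/internal_dataset/datasetDoc.json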
Example #14
def materialize_and_process_dataset(
    dataset_id, metadata,
    lazo_client, nominatim, geo_data,
    profile_semaphore,
):
    with contextlib.ExitStack() as stack:
        # Remove converters, we'll discover what's needed
        metadata = dict(metadata)
        materialize = dict(metadata.pop('materialize'))
        materialize.pop('convert', None)

        with prom_incremented(PROM_DOWNLOADING):
            dataset_path = stack.enter_context(
                get_dataset(
                    dict(metadata, materialize=materialize),
                    dataset_id,
                )
            )

        def convert_dataset(func, path):
            def convert(cache_temp):
                with open(cache_temp, 'w', newline='') as dst:
                    func(path, dst)
            converted_key = dataset_cache_key(
                dataset_id,
                dict(metadata, materialize=materialize),
                'csv',
                {},
            )
            return stack.enter_context(
                cache_get_or_set(
                    '/cache/datasets',
                    converted_key,
                    convert,
                )
            )

        dataset_path = detect_format_convert_to_csv(
            dataset_path,
            convert_dataset,
            materialize,
        )

        # Profile
        with profile_semaphore:
            with prom_incremented(PROM_PROFILING):
                with tracer.start_as_current_span(
                    'profile',
                    attributes={'dataset': dataset_id},
                ):
                    logger.info("Profiling dataset %r", dataset_id)
                    start = time.perf_counter()
                    metadata = process_dataset(
                        data=dataset_path,
                        dataset_id=dataset_id,
                        metadata=metadata,
                        lazo_client=lazo_client,
                        nominatim=nominatim,
                        geo_data=geo_data,
                        include_sample=True,
                        coverage=True,
                        plots=True,
                    )
                    logger.info(
                        "Profiling dataset %r took %.2fs",
                        dataset_id,
                        time.perf_counter() - start,
                    )

        metadata['materialize'] = materialize
        return metadata
Example #15
    def test_profile(self):
        with data('admins.csv', 'r') as data_fp:
            metadata = process_dataset(
                data_fp,
                geo_data=self.geo_data,
                coverage=True,
            )

        self.assertJson(
            metadata,
            {
                'size': 93,
                'nb_rows': 5,
                'nb_profiled_rows': 5,
                'types': ['spatial'],
                'attribute_keywords': ['zero', 'one'],
                'columns': [
                    {
                        'name': 'zero',
                        'structural_type': 'http://schema.org/Text',
                        'semantic_types': [
                            'http://schema.org/AdministrativeArea',
                            'http://schema.org/Enumeration',
                        ],
                        'num_distinct_values': 2,
                        'admin_area_level': 0,
                    },
                    {
                        'name': 'one',
                        'structural_type': 'http://schema.org/Text',
                        'semantic_types': [
                            'http://schema.org/AdministrativeArea',
                            'http://schema.org/Enumeration',
                        ],
                        'num_distinct_values': 5,
                        'admin_area_level': 1,
                    },
                ],
                'spatial_coverage': [
                    {
                        'type': 'admin',
                        'column_names': ['zero'],
                        'column_indexes': [0],
                        'ranges': [
                            {
                                'range': {
                                    'type': 'envelope',
                                    'coordinates': [
                                        [-61.79784095, 55.065334377],
                                        [55.8545028, -21.370782159],
                                    ],
                                },
                            },
                        ],
                    },
                    {
                        'type': 'admin',
                        'column_names': ['one'],
                        'column_indexes': [1],
                        'ranges': [
                            {
                                'range': {
                                    'type': 'envelope',
                                    'coordinates': [
                                        [-61.79784, 53.72778],
                                        [13.81686, 14.40811],
                                    ],
                                },
                            },
                        ],
                    },
                ],
            },
        )
Example #16
    def test_year(self):
        """Test the 'year' special-case."""
        dataframe = pandas.DataFrame({
            'year': [2004, 2005, 2006],
            'number': [2014, 2015, 2016],
        })
        metadata = process_dataset(dataframe)

        def year_rng(year):
            year = float(year)
            return {'range': {'gte': year, 'lte': year}}

        self.assertJson(
            metadata,
            {
                'nb_rows': 3,
                'nb_profiled_rows': 3,
                'columns': [
                    {
                        'name': 'year',
                        'structural_type': 'http://schema.org/Integer',
                        'semantic_types': ['http://schema.org/DateTime'],
                        'unclean_values_ratio': 0.0,
                        'num_distinct_values': 3,
                        'mean': 2005.0,
                        'stddev': lambda n: round(n, 3) == 0.816,
                        'coverage': [
                            year_rng(2004),
                            year_rng(2005),
                            year_rng(2006),
                        ],
                        'temporal_resolution': 'year',
                    },
                    {
                        'name': 'number',
                        'structural_type': 'http://schema.org/Integer',
                        'semantic_types': [],
                        'unclean_values_ratio': 0.0,
                        'num_distinct_values': 3,
                        'mean': 2015.0,
                        'stddev': lambda n: round(n, 3) == 0.816,
                        'coverage': [
                            {'range': {'gte': 2014.0, 'lte': 2014.0}},
                            {'range': {'gte': 2015.0, 'lte': 2015.0}},
                            {'range': {'gte': 2016.0, 'lte': 2016.0}},
                        ],
                    },
                ],
            },
        )
Example #17
    def test_profile(self):
        old_query = datamart_profiler.spatial.nominatim_query
        queries = {
            "70 Washington Square S, New York, NY 10012": [{
                'lat': 40.7294,
                'lon': -73.9972,
            }],
            "6 MetroTech, Brooklyn, NY 11201": [{
                'lat': 40.6944,
                'lon': -73.9857,
            }],
            "251 Mercer St, New York, NY 10012": [{
                'lat': 40.7287,
                'lon': -73.9957,
            }],
        }
        datamart_profiler.spatial.nominatim_query = \
            lambda url, *, q: [queries[qe] for qe in q]
        try:
            with data('addresses.csv', 'r') as data_fp:
                metadata = process_dataset(
                    data_fp,
                    nominatim='http://nominatim/',
                    coverage=True,
                )
        finally:
            datamart_profiler.spatial.nominatim_query = old_query

        self.assertJson(
            metadata,
            {
                'size': 142,
                'nb_rows': 3,
                'nb_profiled_rows': 3,
                'columns': [
                    {
                        'name': 'place',
                        'num_distinct_values': 3,
                        'structural_type': 'http://schema.org/Text',
                        'semantic_types': [],
                    },
                    {
                        'name': 'loc',
                        'structural_type': 'http://schema.org/Text',
                        'semantic_types': [
                            'http://schema.org/Text',
                            'http://schema.org/address',
                        ],
                    },
                ],
                'spatial_coverage': [
                    {
                        'address': 'loc',
                        'ranges': check_geo_ranges(-74.00, 40.69, -73.98, 40.73),
                    },
                ],
            },
        )
Example #18
    def test_admin(self):
        """Test profiling administrative areas"""
        with data('admins.csv', 'r') as data_fp:
            metadata = process_dataset(
                data_fp,
                geo_data=self.geo_data,
                coverage=True,
            )

        self.assertJson(
            metadata,
            {
                'size': 143,
                'nb_rows': 5,
                'nb_profiled_rows': 5,
                'nb_columns': 3,
                'nb_spatial_columns': 2,
                'average_row_size': lambda n: round(n, 2) == 28.6,
                'types': ['spatial'],
                'attribute_keywords': ['zero', 'one', 'mixed'],
                'columns': [
                    {
                        'name': 'zero',
                        'structural_type': 'http://schema.org/Text',
                        'semantic_types': [
                            'http://schema.org/AdministrativeArea',
                            'http://schema.org/Enumeration',
                        ],
                        'num_distinct_values': 3,
                        'admin_area_level': 0,
                    },
                    {
                        'name': 'one',
                        'structural_type': 'http://schema.org/Text',
                        'semantic_types': [
                            'http://schema.org/AdministrativeArea',
                            'http://schema.org/Enumeration',
                        ],
                        'num_distinct_values': 5,
                        'admin_area_level': 1,
                    },
                    {
                        'name': 'mixed',
                        'structural_type': 'http://schema.org/Text',
                        'semantic_types': [],
                        'num_distinct_values': 5,
                    },
                ],
                'spatial_coverage': [
                    {
                        'type': 'admin',
                        'column_names': ['zero'],
                        'column_indexes': [0],
                        'ranges': [
                            {
                                'range': {
                                    'type': 'envelope',
                                    'coordinates': [
                                        [-18.393686294555664, 55.09916687011719],
                                        [18.784475326538086, 27.433542251586914],
                                    ],
                                },
                            },
                        ],
                        'geohashes4': lambda l: sorted(l, key=lambda h: h['hash']) == [
                            {'hash': '123201', 'number': 1},
                            {'hash': '123203', 'number': 1},
                            {'hash': '123210', 'number': 1},
                            {'hash': '123211', 'number': 1},
                            {'hash': '123212', 'number': 1},
                            {'hash': '123213', 'number': 1},
                            {'hash': '123221', 'number': 1},
                            {'hash': '123223', 'number': 1},
                            {'hash': '123230', 'number': 1},
                            {'hash': '123231', 'number': 1},
                            {'hash': '123232', 'number': 1},
                            {'hash': '123233', 'number': 1},
                            {'hash': '123300', 'number': 1},
                            {'hash': '123301', 'number': 1},
                            {'hash': '123302', 'number': 1},
                            {'hash': '123303', 'number': 1},
                            {'hash': '123310', 'number': 1},
                            {'hash': '123311', 'number': 1},
                            {'hash': '123312', 'number': 1},
                            {'hash': '123313', 'number': 1},
                            {'hash': '123320', 'number': 1},
                            {'hash': '123321', 'number': 1},
                            {'hash': '123322', 'number': 1},
                            {'hash': '123323', 'number': 1},
                            {'hash': '123330', 'number': 1},
                            {'hash': '123331', 'number': 1},
                            {'hash': '123332', 'number': 1},
                            {'hash': '123333', 'number': 1},
                            {'hash': '301001', 'number': 1},
                            {'hash': '301010', 'number': 1},
                            {'hash': '301011', 'number': 1},
                            {'hash': '301100', 'number': 1},
                            {'hash': '301101', 'number': 1},
                            {'hash': '301102', 'number': 1},
                            {'hash': '301103', 'number': 1},
                            {'hash': '301110', 'number': 1},
                            {'hash': '301111', 'number': 1},
                            {'hash': '301112', 'number': 1},
                            {'hash': '301113', 'number': 1},
                            {'hash': '301120', 'number': 1},
                            {'hash': '301121', 'number': 1},
                            {'hash': '301122', 'number': 1},
                            {'hash': '301123', 'number': 1},
                            {'hash': '301130', 'number': 1},
                            {'hash': '301131', 'number': 1},
                            {'hash': '301132', 'number': 1},
                            {'hash': '301133', 'number': 1},
                            {'hash': '310002', 'number': 2},
                            {'hash': '310003', 'number': 1},
                            {'hash': '310012', 'number': 1},
                            {'hash': '310013', 'number': 1},
                            {'hash': '310020', 'number': 2},
                            {'hash': '310021', 'number': 1},
                            {'hash': '310022', 'number': 1},
                            {'hash': '310030', 'number': 1},
                            {'hash': '310031', 'number': 1},
                        ],
                        'number': 3,
                    },
                    {
                        'type': 'admin',
                        'column_names': ['one'],
                        'column_indexes': [1],
                        'ranges': [
                            {
                                'range': {
                                    'type': 'envelope',
                                    'coordinates': [
                                        [-5.144032955169678, 50.564720153808594],
                                        [13.839637756347656, 42.33274841308594],
                                    ],
                                },
                            },
                        ],
                        'geohashes4': lambda l: sorted(l, key=lambda h: h['hash']) == [
                            {'hash': '12333322', 'number': 1},
                            {'hash': '12333323', 'number': 1},
                            {'hash': '12333332', 'number': 1},
                            {'hash': '12333333', 'number': 1},
                            {'hash': '13222211', 'number': 1},
                            {'hash': '13222213', 'number': 1},
                            {'hash': '13222222', 'number': 1},
                            {'hash': '13222231', 'number': 1},
                            {'hash': '13222233', 'number': 1},
                            {'hash': '13222300', 'number': 1},
                            {'hash': '13222301', 'number': 1},
                            {'hash': '13222302', 'number': 1},
                            {'hash': '13222303', 'number': 1},
                            {'hash': '13222320', 'number': 2},
                            {'hash': '13222321', 'number': 2},
                            {'hash': '13222322', 'number': 2},
                            {'hash': '13222323', 'number': 2},
                            {'hash': '13222330', 'number': 1},
                            {'hash': '13222331', 'number': 1},
                            {'hash': '13222332', 'number': 1},
                            {'hash': '13222333', 'number': 1},
                            {'hash': '30111100', 'number': 1},
                            {'hash': '30111101', 'number': 1},
                            {'hash': '30111102', 'number': 1},
                            {'hash': '30111103', 'number': 1},
                            {'hash': '30111110', 'number': 1},
                            {'hash': '30111111', 'number': 1},
                            {'hash': '30111112', 'number': 1},
                            {'hash': '30111113', 'number': 1},
                            {'hash': '30111120', 'number': 1},
                            {'hash': '30111121', 'number': 1},
                            {'hash': '30111122', 'number': 1},
                            {'hash': '30111123', 'number': 1},
                            {'hash': '30111130', 'number': 1},
                            {'hash': '30111131', 'number': 1},
                            {'hash': '30111132', 'number': 1},
                            {'hash': '30111133', 'number': 1},
                            {'hash': '31000000', 'number': 1},
                            {'hash': '31000002', 'number': 1},
                            {'hash': '31000020', 'number': 1},
                            {'hash': '31000022', 'number': 1},
                            {'hash': '31000100', 'number': 1},
                            {'hash': '31000101', 'number': 1},
                            {'hash': '31000102', 'number': 1},
                            {'hash': '31000103', 'number': 1},
                            {'hash': '31000110', 'number': 1},
                            {'hash': '31000111', 'number': 1},
                            {'hash': '31000112', 'number': 1},
                            {'hash': '31000113', 'number': 1},
                            {'hash': '31000231', 'number': 1},
                            {'hash': '31000233', 'number': 1},
                            {'hash': '31000320', 'number': 1},
                            {'hash': '31000321', 'number': 1},
                            {'hash': '31000322', 'number': 1},
                            {'hash': '31000323', 'number': 1},
                            {'hash': '31000330', 'number': 1},
                            {'hash': '31000331', 'number': 1},
                            {'hash': '31000332', 'number': 1},
                            {'hash': '31000333', 'number': 1},
                            {'hash': '31002011', 'number': 1},
                            {'hash': '31002013', 'number': 1},
                            {'hash': '31002100', 'number': 1},
                            {'hash': '31002101', 'number': 1},
                            {'hash': '31002102', 'number': 1},
                            {'hash': '31002103', 'number': 1},
                            {'hash': '31002110', 'number': 1},
                            {'hash': '31002111', 'number': 1},
                            {'hash': '31002112', 'number': 1},
                            {'hash': '31002113', 'number': 1},
                        ],
                        # FIXME: number currently 1 because of missing geo data
                        'number': lambda n: isinstance(n, int),
                    },
                ],
            },
        )
Example #19
def materialize_and_process_dataset(
    dataset_id, metadata,
    lazo_client, nominatim,
    profile_semaphore,
    cache_invalid=False,
):
    with contextlib.ExitStack() as stack:
        with prom_incremented(PROM_DOWNLOADING):
            dataset_path = stack.enter_context(
                get_dataset(metadata, dataset_id, cache_invalid=cache_invalid)
            )
        materialize = metadata.pop('materialize')

        # Check for Excel file format
        try:
            xlrd.open_workbook(dataset_path)
        except xlrd.XLRDError:
            pass
        else:
            logger.info("This is an Excel file")
            materialize.setdefault('convert', []).append({'identifier': 'xls'})
            excel_temp_path = dataset_path + '.xls'
            os.rename(dataset_path, excel_temp_path)
            try:
                with open(dataset_path, 'w', newline='') as dst:
                    xls_to_csv(excel_temp_path, dst)
            finally:
                os.remove(excel_temp_path)

        # Check for TSV file format
        with open(dataset_path, 'r') as fp:
            try:
                dialect = csv.Sniffer().sniff(fp.read(16384))
            except Exception as error:  # csv.Error, UnicodeDecodeError
                logger.error("csv.Sniffer error: %s", error)
                dialect = csv.get_dialect('excel')
        if getattr(dialect, 'delimiter', '') == '\t':
            logger.info("This is a TSV file")
            materialize.setdefault('convert', []).append({'identifier': 'tsv'})
            tsv_temp_path = dataset_path + '.tsv'
            os.rename(dataset_path, tsv_temp_path)
            try:
                with open(dataset_path, 'w', newline='') as dst:
                    tsv_to_csv(tsv_temp_path, dst)
            finally:
                os.remove(tsv_temp_path)

        # Check for pivoted temporal table
        with open(dataset_path, 'r') as fp:
            reader = csv.reader(fp)
            try:
                columns = next(iter(reader))
            except StopIteration:
                columns = []
        if len(columns) >= 3:
            non_matches = [
                i for i, name in enumerate(columns)
                if parse_date(name) is None
            ]
            if len(non_matches) <= max(2.0, 0.20 * len(columns)):
                logger.info("Detected pivoted table")
                materialize.setdefault('convert', []).append({
                    'identifier': 'pivot',
                    'except_columns': non_matches,
                })
                pivot_temp_path = dataset_path + '.pivot.csv'
                os.rename(dataset_path, pivot_temp_path)
                try:
                    with open(dataset_path, 'w', newline='') as dst:
                        pivot_table(pivot_temp_path, dst, non_matches)
                finally:
                    os.remove(pivot_temp_path)

        # Profile
        with profile_semaphore:
            with prom_incremented(PROM_PROFILING):
                logger.info("Profiling dataset %r", dataset_id)
                start = time.perf_counter()
                metadata = process_dataset(
                    data=dataset_path,
                    dataset_id=dataset_id,
                    metadata=metadata,
                    lazo_client=lazo_client,
                    nominatim=nominatim,
                    include_sample=True,
                    coverage=True,
                    plots=True,
                )
                logger.info(
                    "Profiling dataset %r took %.2fs",
                    dataset_id,
                    time.perf_counter() - start,
                )

        metadata['materialize'] = materialize
        return metadata

Example #20
if len(sys.argv) != 2:
    print("Invalid number of arguments")
    sys.exit(-1)
data_file = sys.argv[1]
temporal_columns = []
df_data = pandas.read_csv(data_file)

threshold = 0.98
for col in list(df_data):
    column_data = df_data[col].values
    temporal = 0
    for i in range(column_data.size):
        cell = str(column_data[i])
        if (is_time(cell) or is_year(cell) or is_month(cell)
                or is_timestamp(cell) or is_datetime_1(cell)
                or is_datetime_2(cell)):
            temporal += 1
    if temporal / column_data.size >= threshold:
        temporal_columns.append(col)

prof = datamart_profiler.process_dataset(df_data)
for i in range(len(prof["columns"])):
    if "temporal_resolution" in prof["columns"][i].keys():
        temporal_columns.append(prof["columns"][i]["name"])
with open(data_file[-13:-3] + "out", 'w') as file:
    file.write("Number of columns:  %d\n" % (prof["nb_columns"]))
    file.write("Number of temporal columns: %d\n" % (len(temporal_columns)))
    for i in range(len(temporal_columns)):
        file.write(temporal_columns[i] + '\n')
Example #21
    def test_profile(self):
        queries = {
            "70 Washington Square S, New York, NY 10012": [{
                'lat': 40.7294, 'lon': -73.9972,
            }],
            "6 MetroTech, Brooklyn, NY 11201": [{
                'lat': 40.6944, 'lon': -73.9857,
            }],
            "251 Mercer St, New York, NY 10012": [{
                'lat': 40.7287, 'lon': -73.9957,
            }],
        }

        def replacement(url, *, q):
            if not replacement.failed:  # Fail just once
                replacement.failed = True
                response = requests.Response()
                response.status_code = 500
                raise requests.HTTPError("Fake 500 error", response=response)
            return [queries[qe] for qe in q]
        replacement.failed = False

        old_query = spatial.nominatim_query
        old_min_batch_size = spatial.NOMINATIM_MIN_SPLIT_BATCH_SIZE
        spatial.nominatim_query = replacement
        spatial.NOMINATIM_MIN_SPLIT_BATCH_SIZE = 2
        try:
            with data('addresses.csv', 'r') as data_fp:
                metadata = process_dataset(
                    data_fp,
                    nominatim='http://nominatim/',
                    coverage=True,
                )
        finally:
            spatial.nominatim_query = old_query
            spatial.NOMINATIM_MIN_SPLIT_BATCH_SIZE = old_min_batch_size

        self.assertJson(
            metadata,
            {
                'size': 142,
                'nb_rows': 3,
                'nb_profiled_rows': 3,
                'types': ['spatial'],
                'attribute_keywords': ['place', 'loc'],
                'columns': [
                    {
                        'name': 'place',
                        'num_distinct_values': 3,
                        'structural_type': 'http://schema.org/Text',
                        'semantic_types': [],
                    },
                    {
                        'name': 'loc',
                        'structural_type': 'http://schema.org/Text',
                        'semantic_types': [
                            'http://schema.org/Text',
                            'http://schema.org/address',
                        ],
                    },
                ],
                'spatial_coverage': [
                    {
                        'type': 'address',
                        'column_names': ['loc'],
                        'column_indexes': [1],
                        'ranges': check_geo_ranges(-74.00, 40.69, -73.98, 40.73),
                    },
                ],
            },
        )
Example #22
    def handle_data_parameter(self, data):
        """
        Handles the 'data' parameter.

        :param data: the input parameter
        :return: (data, data_profile)
          data: data as bytes (either the input or loaded from the input)
          data_profile: the profiling (metadata) of the data
        """

        if not isinstance(data, bytes):
            raise ValueError

        # Use SHA1 of file as cache key
        sha1 = hashlib.sha1(data)
        data_hash = sha1.hexdigest()

        data_profile = self.application.redis.get('profile:' + data_hash)

        # Do format conversion
        materialize = {}

        def create_csv(cache_temp):
            with open(cache_temp, 'wb') as fp:
                fp.write(data)

            def convert_dataset(func, path):
                with tempfile.NamedTemporaryFile(
                        prefix='.convert',
                        dir='/cache/user_data',
                ) as tmpfile:
                    os.rename(path, tmpfile.name)
                    with open(path, 'w', newline='') as dst:
                        func(tmpfile.name, dst)
                    return path

            ret = detect_format_convert_to_csv(
                cache_temp,
                convert_dataset,
                materialize,
            )
            assert ret == cache_temp

        with cache_get_or_set(
                '/cache/user_data',
                data_hash,
                create_csv,
        ) as csv_path:
            if data_profile is not None:
                # This is here because we want to put the data in the cache
                # even if the profile is already in Redis
                logger.info("Found cached profile_data")
                data_profile = json.loads(data_profile)
            else:
                logger.info("Profiling...")
                start = time.perf_counter()
                with open(csv_path, 'rb') as data:
                    data_profile = process_dataset(
                        data=data,
                        lazo_client=self.application.lazo_client,
                        nominatim=self.application.nominatim,
                        geo_data=self.application.geo_data,
                        search=True,
                        include_sample=True,
                        coverage=True,
                    )
                logger.info("Profiled in %.2fs", time.perf_counter() - start)

                data_profile['materialize'] = materialize

                self.application.redis.set(
                    'profile:' + data_hash,
                    json.dumps(
                        data_profile,
                        # Compact
                        sort_keys=True,
                        indent=None,
                        separators=(',', ':'),
                    ),
                )

        return data_profile, data_hash