Ejemplo n.º 1
0
    def setUp(self):
        """Prepare two small CSV-backed frames, their profiled metadata, and a joiner.

        ``self.df1`` has a single date column, ``self.df2`` splits the date
        into month/day/year columns.  Both frames are run through a
        ``DSboxProfiler`` with seed metadata that tags the date-related
        columns, and the results are bundled into ``self.args``.
        """
        left_rows = (
            'date,city,temp,description,air condition',
            '12-01-2018,New York,55,rain,3',
            '12-02-2018,Los Angeles,75,sunny,2',
            '12-05-2018,Chicago,41,wind,3',
            '12-06-2018,Chicago,42,cloudy,2',
            '12-07-2018,Chicago,43,snow,3',
            '12-08-2018,Los Angeles,72,moggy,2',
        )
        self.df1 = pd.read_csv(StringIO('\n'.join(left_rows)))

        right_rows = (
            'month,day,year,city,wind',
            '12,01,2018,new york,37.5',
            '12,02,2018,los angeles,22.1',
            '12,05,2018,chicago,58.8',
        )
        self.df2 = pd.read_csv(StringIO('\n'.join(right_rows)))

        profiler = DSboxProfiler()

        # One metadata entry per column; only the date-like columns carry a
        # semantic type, the rest start out as distinct empty dicts.
        left_seed = {
            'variables': [{'semantic_type': ['http://schema.org/Date']}]
            + [{} for _ in range(4)]
        }
        self.meta1 = profiler.profile(inputs=self.df1, metadata=left_seed)

        right_seed = {
            'variables': [
                {'semantic_type': ['http://schema.org/' + part]}
                for part in ('Month', 'Day', 'Year')
            ] + [{} for _ in range(2)]
        }
        self.meta2 = profiler.profile(inputs=self.df2, metadata=right_seed)

        # Keyword arguments describing the left/right inputs of a join.
        self.args = dict(
            left_df=self.df1,
            right_df=self.df2,
            left_metadata=self.meta1,
            right_metadata=self.meta2,
        )

        self.rltk_joiner = RLTKJoiner()
Ejemplo n.º 2
0
    def calculate_dsbox_features(data: pd.DataFrame, metadata: typing.Union[dict, None]) -> typing.Union[dict, None]:
        """Calculate dsbox features and add them to the metadata dictionary.

        Args:
            data: dataset as a pandas dataframe
            metadata: metadata dict; may be None or empty

        Returns:
            The result of ``DSboxProfiler().profile(inputs=data, metadata=metadata)``,
            or ``metadata`` itself, unchanged, when it is None or empty.
        """
        # Guard first: with nothing to profile there is no reason to import
        # (or depend on the importability of) the profiler module.
        if not metadata:
            return metadata

        # Kept as a function-local import, as in the original code.
        from datamart.profilers.dsbox_profiler import DSboxProfiler
        return DSboxProfiler().profile(inputs=data, metadata=metadata)
Ejemplo n.º 3
0
 def __init__(self) -> None:
     """Create the profiler instances held by this object."""
     # One instance of each profiler, constructed with no arguments.
     self.basic_profiler = BasicProfiler()
     self.dsbox_profiler = DSboxProfiler()
Ejemplo n.º 4
0
 def __init__(self) -> None:
     """Create the three profiler instances held by this object."""
     # One instance of each profiler, constructed with no arguments.
     self.basic_profiler = BasicProfiler()
     self.dsbox_profiler = DSboxProfiler()
     self.two_ravens_profiler = TwoRavensProfiler()
Ejemplo n.º 5
0
 def test_dsbox_profiler(self):
     """Profile ``self.df`` starting from empty per-column metadata.

     The complete metadata returned by ``DSboxProfiler.profile`` is compared
     against a hand-written expectation for the three columns.
     """
     # Seed metadata: one empty entry per dataframe column.
     self.fake_matadata = {"variables": []}
     self.fake_matadata["variables"].extend(
         {} for _ in range(self.df.shape[1]))

     metadata = DSboxProfiler().profile(inputs=self.df,
                                        metadata=self.fake_matadata)

     def seen_once(*names):
         """Build token entries for names that each have a count of 1."""
         return [{'name': name, 'count': 1} for name in names]

     first_column = {
         'ratio_of_numeric_values': 1.0,
         'number_of_outlier_numeric_values': 0,
     }
     second_column = {
         'ratio_of_numeric_values': 0.25,
         'number_std': 0,
         'number_of_outlier_numeric_values': 0,
         'most_common_tokens': seen_once(
             '2014-02-23', '2018-10-05', '2020-09-23T00:10:00', '2023213'),
         'number_of_tokens_containing_numeric_char': 4,
         'ratio_of_tokens_containing_numeric_char': 1.0,
         'number_of_values_containing_numeric_char': 4,
         'ratio_of_values_containing_numeric_char': 1.0,
     }
     third_column = {
         'most_common_tokens': seen_once('Jack', 'Ricky', 'Steve', 'Tom'),
     }

     expected = {
         'variables': [
             {'dsbox_profiled': profiled}
             for profiled in (first_column, second_column, third_column)
         ]
     }
     self.assertEqual(metadata, expected)