def test_to_numpy_dtype_integer(self):
        # Table-driven check of to_numpy_dtype() for 'integer' fields: each
        # case pairs the (min_, max_) values set on the constraints with the
        # numpy dtype the field is expected to report.
        cases = [
            ((-100, 100), np.int8),
            ((0, 240), np.uint8),
            ((-10000, 10000), np.int16),
            ((500, 40000), np.uint16),
            ((-200000000, 200000000), np.int32),
            ((25, 4000000000), np.uint32),
            ((-9000000000000000000, 2000000000), np.int64),
            ((25, 10000000000000000000), np.uint64),
            ((25, 1000000000000000000000000000), np.float128),
            ((None, None), np.int64),
        ]
        name_stub = Mock(name='name')
        statistics_stub = Mock(name='statistics')
        representations_stub = Mock(name='representations')

        for (mock_min, mock_max), expected_pandas_type in cases:
            # A fresh constraints mock per case so bounds never leak between cases.
            bounds = Mock(name='constraints')
            bounds.min_ = mock_min
            bounds.max_ = mock_max
            field = RecordsSchemaField(name=name_stub,
                                       field_type='integer',
                                       constraints=bounds,
                                       statistics=statistics_stub,
                                       representations=representations_stub)

            self.assertEqual(field.to_numpy_dtype(),
                             expected_pandas_type,
                             f"min={mock_min}, max={mock_max}")
    def test_to_numpy_dtype_decimal_float(self):
        # Table-driven check of to_numpy_dtype() for 'decimal' fields that
        # have no fixed precision/scale: each case pairs the
        # (fp_total_bits, fp_significand_bits) values set on the constraints
        # with the numpy dtype the field is expected to report.
        mock_name = Mock(name='name')
        mock_statistics = Mock(name='statistics')
        mock_representations = Mock(name='representations')
        mock_field_type = 'decimal'
        expectations = {
            (8, 4): np.float16,
            (20, 10): np.float32,
            (40, 20): np.float64,
            (80, 64): np.float128,
            (500, 250): np.float128,
            (None, None): np.float64,
        }
        for (fp_total_bits, fp_significand_bits), expected_pandas_type in expectations.items():
            mock_constraints = Mock(name='constraints')
            # Explicitly None: a bare Mock attribute would otherwise be a
            # truthy Mock object.
            mock_constraints.fixed_precision = None
            mock_constraints.fixed_scale = None
            mock_constraints.fp_total_bits = fp_total_bits
            mock_constraints.fp_significand_bits = fp_significand_bits
            field = RecordsSchemaField(name=mock_name,
                                       field_type=mock_field_type,
                                       constraints=mock_constraints,
                                       statistics=mock_statistics,
                                       representations=mock_representations)

            out = field.to_numpy_dtype()
            # Name the failing case in the message; without it a failure
            # only shows two dtypes and not which table row produced them.
            self.assertEqual(out, expected_pandas_type,
                             f"fp_total_bits={fp_total_bits}, "
                             f"fp_significand_bits={fp_significand_bits}")
Beispiel #3
0
def check_dtype(field_type, constraints, expectation):
    """Assert the dtype that results from casting a one-element int8 Series.

    Builds a RecordsSchemaField named "test" from ``field_type`` and
    ``constraints`` (with no statistics or representations), passes
    ``pd.Series(1, dtype=np.int8)`` through its ``cast_series_type`` and
    compares the resulting dtype to ``expectation`` using ``assert_equal``.
    """
    schema_field = RecordsSchemaField(name="test",
                                      field_type=field_type,
                                      constraints=constraints,
                                      statistics=None,
                                      representations=None)
    sample = pd.Series(1, dtype=np.int8)
    converted = schema_field.cast_series_type(sample)
    assert_equal(converted.dtype, expectation)
    def test_to_numpy_dtype_decimal_no_constraints(self):
        # A 'decimal' field constructed with constraints=None is expected
        # to report np.float64 from to_numpy_dtype().
        field = RecordsSchemaField(name=Mock(name='name'),
                                   field_type='decimal',
                                   constraints=None,
                                   statistics=Mock(name='statistics'),
                                   representations=Mock(name='representations'))

        self.assertEqual(field.to_numpy_dtype(), np.float64)
Beispiel #5
0
 def test_cast_series_type_time_empty(self):
     # Passing an empty Series through a 'time' field's cast_series_type()
     # must return something other than None.
     field = RecordsSchemaField(name=Mock(name='name'),
                                field_type='time',
                                constraints=Mock(name='constraints'),
                                statistics=Mock(name='statistics'),
                                representations=Mock(name='representations'))
     empty_series = pd.Series(np.array([]))
     self.assertIsNotNone(field.cast_series_type(empty_series))
Beispiel #6
0
 def test_to_sqlalchemy_column(self, mock_field_to_sqlalchemy_column):
     # to_sqlalchemy_column() is expected to call the mock passed in as
     # mock_field_to_sqlalchemy_column with (field, driver) and to return
     # that mock's return value.
     driver = Mock(name='driver')
     field = RecordsSchemaField(name=Mock(name='name'),
                                field_type=Mock(name='field_type'),
                                constraints=Mock(name='constraints'),
                                statistics=Mock(name='statistics'),
                                representations=Mock(name='representations'))
     result = field.to_sqlalchemy_column(driver)
     mock_field_to_sqlalchemy_column.assert_called_with(field, driver)
     self.assertEqual(result, mock_field_to_sqlalchemy_column.return_value)
Beispiel #7
0
 def test_cast_series_type_time_timedelta_entries_zeroed(self):
     # A zero Timedelta entry in a Series cast by a 'time' field is
     # expected to come back as datetime.time(0, 0, 0).
     field = RecordsSchemaField(name=Mock(name='name'),
                                field_type='time',
                                constraints=Mock(name='constraints'),
                                statistics=Mock(name='statistics'),
                                representations=Mock(name='representations'))
     zero_delta = pd.Timedelta(hours=0, minutes=0, seconds=0)
     series = pd.Series(np.array([zero_delta]))
     converted = field.cast_series_type(series)
     self.assertEqual(converted[0], datetime.time(0, 0, 0))
    def test_to_numpy_dtype_fixed_precision_(self):
        # A 'decimal' field whose constraints carry fixed_precision=1 and
        # fixed_scale=1 is expected to report np.float64.
        fixed_constraints = Mock(name='constraints')
        fixed_constraints.fixed_precision = 1
        fixed_constraints.fixed_scale = 1
        field = RecordsSchemaField(name=Mock(name='name'),
                                   field_type='decimal',
                                   constraints=fixed_constraints,
                                   statistics=Mock(name='statistics'),
                                   representations=Mock(name='representations'))

        self.assertEqual(field.to_numpy_dtype(), np.float64)
 def test_convert_datetime_to_datetimetz_not_datetime(self):
     # For a 'time' field, the result of convert_datetime_to_datetimetz()
     # is expected to carry the same name, field_type, constraints and
     # representations that the field was built with.
     preserved = {
         'name': Mock(name='name'),
         'field_type': 'time',
         'constraints': Mock(name='constraints'),
         'representations': Mock(name='representations'),
     }
     field = RecordsSchemaField(statistics=Mock(name='statistics'), **preserved)
     out = field.convert_datetime_to_datetimetz()
     # Dict order matches the order the attributes are checked in.
     for attribute, original_value in preserved.items():
         self.assertEqual(getattr(out, attribute), original_value)
Beispiel #10
0
 def test_from_index(self, mock_field_from_index):
     # RecordsSchemaField.from_index() is expected to call the mock passed
     # in as mock_field_from_index with both arguments as keywords, and to
     # return that mock's return value.
     index = Mock(name='index')
     instructions = Mock(name='processing_instructions')
     result = RecordsSchemaField.from_index(index, instructions)
     mock_field_from_index.assert_called_with(
         index=index,
         processing_instructions=instructions)
     self.assertEqual(result, mock_field_from_index.return_value)
Beispiel #11
0
 def test_refine_from_series(self, mock_refine_field_from_series):
     # refine_from_series() is expected to call the mock passed in as
     # mock_refine_field_from_series with the field itself followed by the
     # series and both row counts.
     field = RecordsSchemaField(name=Mock(name='name'),
                                field_type=Mock(name='field_type'),
                                constraints=Mock(name='constraints'),
                                statistics=Mock(name='statistics'),
                                representations=Mock(name='representations'))
     series = Mock(name='series')
     total_rows = Mock(name='total_rows')
     rows_sampled = Mock(name='rows_sampled')
     field.refine_from_series(series, total_rows, rows_sampled)
     mock_refine_field_from_series.assert_called_with(
         field, series, total_rows, rows_sampled)
Beispiel #12
0
 def test_from_sqlalchemy_column(self, mock_field_from_sqlalchemy_column):
     # RecordsSchemaField.from_sqlalchemy_column() is expected to call the
     # mock passed in as mock_field_from_sqlalchemy_column with the same
     # keyword arguments, and to return that mock's return value.
     call_kwargs = {
         'column': Mock(name='column'),
         'driver': Mock(name='driver'),
         'rep_type': Mock(name='rep_type'),
     }
     result = RecordsSchemaField.from_sqlalchemy_column(**call_kwargs)
     mock_field_from_sqlalchemy_column.assert_called_with(**call_kwargs)
     self.assertEqual(result, mock_field_from_sqlalchemy_column.return_value)
    def test_to_numpy_dtype_misc(self):
        # Table-driven check of to_numpy_dtype() for the field types listed
        # below: each field type is paired with the dtype (or dtype string)
        # the field is expected to report.
        mock_name = Mock(name='name')
        mock_constraints = Mock(name='constraints')
        mock_statistics = Mock(name='statistics')
        mock_representations = Mock(name='representations')
        expectations = {
            'boolean': np.bool_,
            'string': np.object_,
            'date': np.object_,
            'datetime': 'datetime64[ns]',
            'datetimetz': 'datetime64[ns, UTC]',
            'time': np.object_,
        }
        for field_type, expected_pandas_type in expectations.items():
            field = RecordsSchemaField(name=mock_name,
                                       field_type=field_type,
                                       constraints=mock_constraints,
                                       statistics=mock_statistics,
                                       representations=mock_representations)

            out = field.to_numpy_dtype()
            # Name the failing case in the message; several field types
            # share the same expected dtype, so the dtypes alone do not
            # identify which one failed.
            self.assertEqual(out, expected_pandas_type,
                             f"field_type={field_type}")
Beispiel #14
0
def schema_from_dataframe(df: DataFrame,
                          processing_instructions: ProcessingInstructions,
                          include_index: bool) -> 'RecordsSchema':
    """Build a RecordsSchema describing a DataFrame.

    Args:
        df: DataFrame whose columns (and optionally index) become fields.
        processing_instructions: Passed through to the representation and
            field factory methods.
        include_index: When true, a field built from ``df.index`` is placed
            ahead of the per-column fields.

    Returns:
        A RecordsSchema with one field per column (preceded by the index
        field if requested) and a single 'origin' known representation.
    """
    # Function-scope imports are kept as in the original; presumably they
    # avoid a circular import -- confirm before hoisting to module level.
    from records_mover.records.schema import RecordsSchema  # noqa
    from records_mover.records.schema.field import RecordsSchemaField  # noqa
    fields = []
    origin_representation = \
        RecordsSchemaKnownRepresentation.from_dataframe(df, processing_instructions)
    known_representations: Dict[str, RecordsSchemaKnownRepresentation] = {
        'origin': origin_representation
    }

    if include_index:
        fields.append(
            RecordsSchemaField.from_index(
                df.index, processing_instructions=processing_instructions))
    # DataFrame.items() yields every column as a Series, by position.
    # Looking columns up by label (df[label]) returns a DataFrame rather
    # than a Series when a label is duplicated, which from_series() would
    # then receive in place of a Series.
    fields.extend(
        RecordsSchemaField.from_series(
            series, processing_instructions=processing_instructions)
        for _, series in df.items())

    return RecordsSchema(fields=fields,
                         known_representations=known_representations)
Beispiel #15
0
 def test_python_type_to_field_type(self):
     # A type the mapping does not know (a Mock here) is expected to
     # produce None.
     unknown = Mock(name='unknown_type')
     self.assertIsNone(RecordsSchemaField.python_type_to_field_type(unknown))
Beispiel #16
0
 def test_is_more_specific_type_false(self):
     # is_more_specific_type('string', 'integer') is expected to be falsy.
     verdict = RecordsSchemaField.is_more_specific_type('string', 'integer')
     self.assertFalse(verdict)
Beispiel #17
0
 def test_is_more_specific_type_true(self):
     # is_more_specific_type('integer', 'string') is expected to be truthy.
     verdict = RecordsSchemaField.is_more_specific_type('integer', 'string')
     self.assertTrue(verdict)
Beispiel #18
0
    def test_refine_field_from_series_more_specific(self) -> None:
        # This test is designed to break when a new field type is
        # introduced, so you can add new expectations and make sure
        # the code handles the new type!
        #
        # For every entry of RECORDS_FIELD_TYPES, a 'string' field is
        # refined with a sample series of that type, and the resulting
        # field type, constraints class and statistics class are checked
        # against the table below.

        fields = {
            'integer': {
                'series': pd.Series([30, 35, 40]),
                'constraints_type': RecordsSchemaFieldIntegerConstraints,
                'statistics_type': type(None),
            },
            'decimal': {
                'series': pd.Series([30.0, 35.1, 40.2]),
                'constraints_type': RecordsSchemaFieldDecimalConstraints,
                'statistics_type': type(None),
            },
            'string': {
                'series': pd.Series(['a', 'b', 'c']),
                'constraints_type': RecordsSchemaFieldStringConstraints,
                'statistics_type': RecordsSchemaFieldStringStatistics,
            },
            'boolean': {
                'series': pd.Series([True, True, False]),
                'constraints_type': RecordsSchemaFieldConstraints,
                'statistics_type': type(None),
            },
            'date': {
                'series': pd.Series([datetime.date(2020, 1, 1)]),
                'constraints_type': RecordsSchemaFieldConstraints,
                'statistics_type': type(None),
            },
            'time': {
                'series':
                pd.Series([datetime.time(hour=12, minute=0, second=0)]),
                'constraints_type': RecordsSchemaFieldConstraints,
                'statistics_type': type(None),
            },
            'timetz': {
                'series':
                pd.Series([
                    datetime.time(hour=12,
                                  minute=0,
                                  second=0,
                                  tzinfo=pytz.timezone('US/Eastern'))
                ]),
                # refine_field_from_series() is not smart enough to
                # distinguish whether the time objects inside it all
                # have timezones or not.
                'expected_field_type':
                'time',
                'constraints_type':
                RecordsSchemaFieldConstraints,
                'statistics_type':
                type(None),
            },
            'datetime': {
                'series': pd.Series([datetime.datetime(2020, 1, 1, hour=12)]),
                'constraints_type': RecordsSchemaFieldConstraints,
                'statistics_type': type(None),
            },
            'datetimetz': {
                'series':
                pd.Series([
                    datetime.datetime(2020,
                                      1,
                                      1,
                                      hour=12,
                                      tzinfo=pytz.timezone('US/Eastern'))
                ]),
                # refine_field_from_series() is not smart enough to
                # distinguish whether the datetime objects inside it all
                # have timezones or not.
                'expected_field_type':
                'datetime',
                'constraints_type':
                RecordsSchemaFieldConstraints,
                'statistics_type':
                type(None),
            }
        }
        for field_type in RECORDS_FIELD_TYPES:
            # The starting field and its constraints are rebuilt for each
            # case rather than shared across iterations.
            constraints = RecordsSchemaFieldStringConstraints(
                required=True,
                unique=False,
                max_length_bytes=255,
                max_length_chars=255)
            pandas_representation = RecordsSchemaPandasFieldRepresentation(
                pd_df_dtype={}, pd_df_ftype=None, pd_df_coltype='series')
            field = RecordsSchemaField(
                name='testfield',
                field_type='string',
                constraints=constraints,
                statistics=None,
                representations={'pandas': pandas_representation})
            # A KeyError here means RECORDS_FIELD_TYPES has an entry with
            # no expectations in the table above.
            expected = fields[field_type]
            returned_field = refine_field_from_series(field,
                                                      expected['series'],
                                                      total_rows=10,
                                                      rows_sampled=10)
            # assertEqual replaces the deprecated alias assertEquals, which
            # was removed in Python 3.12. The field type is passed as the
            # message so a failure identifies the case.
            self.assertEqual(returned_field.field_type,
                             expected.get('expected_field_type', field_type),
                             field_type)
            self.assertEqual(type(returned_field.constraints),
                             expected['constraints_type'],
                             field_type)
            self.assertEqual(type(returned_field.statistics),
                             expected['statistics_type'],
                             field_type)