Example #1
    def test_create_schema_view_using_regex_and_unischema_fields(self):
        TestSchema = Unischema('TestSchema', [
            UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
            UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
            UnischemaField('other_string_field', np.string_, (), ScalarCodec(StringType()), False),
        ])
        view = TestSchema.create_schema_view(['int.*$', TestSchema.string_field])
        self.assertEqual(set(view.fields.keys()), {'int_field', 'string_field'})
Example #2
    def test_as_spark_schema(self):
        """Try using 'as_spark_schema' function"""
        TestSchema = Unischema('TestSchema', [
            UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
            UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
        ])

        spark_schema = TestSchema.as_spark_schema()
        self.assertEqual(spark_schema.fields[0].name, 'int_field')
        self.assertEqual(spark_schema.fields[1].name, 'string_field')

        self.assertEqual(TestSchema.fields['int_field'].name, 'int_field')
        self.assertEqual(TestSchema.fields['string_field'].name, 'string_field')
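As a follow-up, the StructType returned by as_spark_schema() can be passed anywhere Spark expects a schema. A minimal sketch, assuming a live SparkSession bound to the name spark (not created in the snippet above):

# `spark` is assumed to be an existing SparkSession.
df = spark.createDataFrame([], TestSchema.as_spark_schema())
df.printSchema()  # prints int_field and string_field with their Spark types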
Example #3
def transform_schema(schema, transform_spec):
    """Creates a post-transform given a pre-transform schema and a transform_spec with mutation instructions.

    :param schema: A pre-transform schema
    :param transform_spec: a TransformSpec object with mutation instructions.
    :return: A post-transform schema
    """
    removed_fields = set(transform_spec.removed_fields)
    unknown_field_names = removed_fields - set(schema.fields.keys())
    if unknown_field_names:
        raise ValueError(
            'Unexpected field names found in TransformSpec removed_fields list: "%s". '
            'Valid values are "%s".' % (', '.join(unknown_field_names),
                                        ', '.join(schema.fields.keys())))

    exclude_fields = {f[0] for f in transform_spec.edit_fields} | removed_fields
    fields = [v for k, v in schema.fields.items() if k not in exclude_fields]

    for field_to_edit in transform_spec.edit_fields:
        edited_unischema_field = UnischemaField(name=field_to_edit[0],
                                                numpy_dtype=field_to_edit[1],
                                                shape=field_to_edit[2],
                                                codec=None,
                                                nullable=field_to_edit[3])
        fields.append(edited_unischema_field)

    return Unischema(schema._name + '_transformed', fields)
Example #4
import warnings

def transform_schema(schema, transform_spec):
    """Creates a post-transform given a pre-transform schema and a transform_spec with mutation instructions.

    :param schema: A pre-transform schema
    :param transform_spec: a TransformSpec object with mutation instructions.
    :return: A post-transform schema
    """
    removed_fields = set(transform_spec.removed_fields)
    unknown_field_names = removed_fields - set(schema.fields.keys())
    if unknown_field_names:
        warnings.warn(
            'removed_fields specified some field names that are not part of the schema. '
            'These field names will be ignored: "{}".'.format(
                ', '.join(unknown_field_names)))

    exclude_fields = {f[0] for f in transform_spec.edit_fields} | removed_fields
    fields = [v for k, v in schema.fields.items() if k not in exclude_fields]

    for field_to_edit in transform_spec.edit_fields:
        edited_unischema_field = UnischemaField(name=field_to_edit[0],
                                                numpy_dtype=field_to_edit[1],
                                                shape=field_to_edit[2],
                                                codec=None,
                                                nullable=field_to_edit[3])
        fields.append(edited_unischema_field)

    return Unischema(schema._name + '_transformed', fields)
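The two transform_schema variants above differ only in how they treat unknown names in removed_fields: the first raises a ValueError, while the second warns and ignores them. A minimal usage sketch, assuming a pre-transform Unischema named TestSchema like the ones in the earlier examples (the field names here are illustrative):

import numpy as np
from petastorm.transform import TransformSpec

# Each edit_fields entry is a (name, numpy_dtype, shape, nullable) tuple;
# removed_fields names fields to drop from the post-transform schema.
spec = TransformSpec(edit_fields=[('int_field', np.float32, (), False)],
                     removed_fields=['string_field'])
new_schema = transform_schema(TestSchema, spec)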
Example #5
    def get_petastorm_schema(name, column_list):
        petastorm_column_list = []
        for _column in column_list:
            petastorm_column = SchemaUtils.get_petastorm_column(_column)
            petastorm_column_list.append(petastorm_column)

        petastorm_schema = Unischema(name, petastorm_column_list)
        return petastorm_schema
Example #6
def test_decode_numpy_scalar_with_unknown_dtype():
    """If numpy_dtype is None, then the value is not decoded, just passed through."""

    MatrixSchema = Unischema('TestSchema',
                             [UnischemaField('scalar', None, ())])
    row = {'scalar': [4, 2]}
    decoded_value = decode_row(row, MatrixSchema)['scalar']
    assert decoded_value == [4, 2]
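By contrast, when a field does carry a codec, decode_row delegates the decoding to it. A small self-contained sketch of that path using NdarrayCodec (not taken from the listing above):

import numpy as np
from petastorm.codecs import NdarrayCodec
from petastorm.unischema import UnischemaField

field = UnischemaField('matrix', np.float32, (2, 2), NdarrayCodec(), False)
codec = NdarrayCodec()
value = np.zeros((2, 2), dtype=np.float32)
# encode() serializes the array; decode() restores an identical copy.
assert np.array_equal(codec.decode(field, codec.encode(field, value)), value)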
Example #7
def main(source, target, test_size, under_sampling):
    source_data_dir_path = Path(source)
    target_data_dir_path = Path(target)

    # prepare dir for dataset
    application_data_dir_path = target_data_dir_path / 'application_classification'
    traffic_data_dir_path = target_data_dir_path / 'traffic_classification'

    # initialise local spark
    os.environ['PYSPARK_PYTHON'] = sys.executable
    os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
    memory_gb = psutil.virtual_memory().available // 1024 // 1024 // 1024
    spark = (SparkSession.builder
             .master('local[*]')
             .config('spark.driver.memory', f'{memory_gb}g')
             .config('spark.driver.host', '127.0.0.1')
             .getOrCreate())

    # prepare final schema
    schema = Unischema('data_schema', [
        UnischemaField('feature', np.float32,
                       (1, 1500), CompressedNdarrayCodec(), False),
        UnischemaField('flow_feature', np.float32,
                       (1, 76), CompressedNdarrayCodec(), False),
        UnischemaField('label', np.int32, (), ScalarCodec(LongType()), False),
    ])

    # read data
    df = spark.read.parquet(
        f'{source_data_dir_path.absolute().as_uri()}/*.parquet')

    # prepare data for application classification and traffic classification
    print('processing application classification dataset')
    create_train_test_for_task(df=df,
                               label_col='app_label',
                               spark=spark,
                               schema=schema,
                               test_size=test_size,
                               under_sampling=under_sampling,
                               data_dir_path=application_data_dir_path)

    print('processing traffic classification dataset')
    create_train_test_for_task(df=df,
                               label_col='traffic_label',
                               spark=spark,
                               schema=schema,
                               test_size=test_size,
                               under_sampling=under_sampling,
                               data_dir_path=traffic_data_dir_path)

    # stats
    print_df_label_distribution(spark, schema,
                                application_data_dir_path / 'train.parquet')
    print_df_label_distribution(spark, schema,
                                application_data_dir_path / 'test.parquet')
    print_df_label_distribution(spark, schema,
                                traffic_data_dir_path / 'train.parquet')
    print_df_label_distribution(spark, schema,
                                traffic_data_dir_path / 'test.parquet')
Example #8
    def test_partial_application(self):
        unischema = Unischema('foo', [])
        func = partial(dict_to_spark_row, unischema)
        func({})

        # Must pass as positional arg in the right order
        func = partial(dict_to_spark_row, {})
        with self.assertRaises(AssertionError):
            func(Unischema)
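For reference, a self-contained sketch of the call order that test enforces: dict_to_spark_row takes the Unischema first and the row dictionary second, and returns a pyspark.sql.Row with codec-encoded values.

import numpy as np
from pyspark.sql.types import IntegerType
from petastorm.codecs import ScalarCodec
from petastorm.unischema import Unischema, UnischemaField, dict_to_spark_row

TestSchema = Unischema('TestSchema', [
    UnischemaField('int_field', np.int32, (), ScalarCodec(IntegerType()), False),
])
spark_row = dict_to_spark_row(TestSchema, {'int_field': np.int32(42)})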
Example #9
def test_decode_numpy_scalar_when_codec_is_none():
    """Decoding a row that has a field with the codec set to None. The type should be deduced automatically
    from UnischemaField's numpy_dtype attribute"""

    MatrixSchema = Unischema('TestSchema',
                             [UnischemaField('scalar', np.float64, ())])
    row = {'scalar': 42.0}
    decoded_value = decode_row(row, MatrixSchema)['scalar']
    assert decoded_value == 42
    assert isinstance(decoded_value, np.float64)