def test_cast_row_to_string(self):
    """A Row holding a nested map, a null, a boolean and a double casts to one bracketed string."""
    # OrderedDict keeps the key order of the rendered map deterministic.
    nested_map = collections.OrderedDict([('value', None), ('b', {'c': 7})])
    row = Row(a=nested_map, b=None, c=True, d=5.2)

    inner_map_type = MapType(StringType(), LongType(), True)
    row_type = StructType([
        StructField('a', MapType(StringType(), inner_map_type, True), True),
        StructField('b', LongType(), True),
        StructField('c', BooleanType(), True),
        StructField('d', DoubleType(), True),
    ])

    rendered = cast_to_string(row, row_type, options=BASE_OPTIONS)

    self.assertEqual(rendered, '[[value ->, b -> [c -> 7]],, true, 5.2]')
Esempio n. 2
0
def guess_type_from_values_as_string(values, options):
    """Return the first candidate type that every string in ``values`` casts to.

    Reproduces inferences available in Spark
    PartitioningUtils.inferPartitionColumnValue()
    located in org.apache.spark.sql.execution.datasources

    Candidates are tried in order: integer, long, decimal, double,
    timestamp, string. A candidate is rejected as soon as its caster
    returns None for a value other than ``'null'``/``None``, or raises
    ValueError.

    :param values: iterable of raw values; iterated once per candidate
    :param options: forwarded unchanged to ``get_caster``
    :raises AnalysisException: if no candidate accepts every value
    """
    candidates = (
        IntegerType(),
        LongType(),
        DecimalType(),
        DoubleType(),
        TimestampType(),
        StringType(),
    )
    source_type = StringType()

    for candidate in candidates:
        cast = get_caster(from_type=source_type,
                          to_type=candidate,
                          options=options)
        try:
            # all() stops at the first rejected value, like an early exit.
            accepted = all(
                cast(raw) is not None or raw in ('null', None)
                for raw in values
            )
        except ValueError:
            continue
        if accepted:
            return candidate

    # Should never happen
    raise AnalysisException(
        'Unable to find a matching type for some fields, even StringType did not work'
    )
    def test_cast_hour_string_to_timestamp(self):
        """An hour-only string ('10:') casts to 10:00:00 on the current date."""
        current_day = datetime.date.today()

        parsed = cast_to_timestamp('10:', StringType(), options=BASE_OPTIONS)
        expected = datetime.datetime(
            current_day.year, current_day.month, current_day.day, 10, 0, 0
        )

        self.assertEqual(parsed, expected)
 def test_cast_bigger_string_to_long(self):
     """2**63, one above the signed 64-bit maximum, is expected to cast to None."""
     above_long_max = '9223372036854775808'
     result = cast_to_long(above_long_max, StringType(), options=BASE_OPTIONS)
     self.assertEqual(result, None)
 def test_cast_small_string_to_long(self):
     """2**63 - 1, the signed 64-bit maximum, is expected to cast to the same integer."""
     long_max_text = '9223372036854775807'
     result = cast_to_long(long_max_text, StringType(), options=BASE_OPTIONS)
     self.assertEqual(result, 9223372036854775807)
 def test_cast_map_to_map(self):
     """Casting byte->string map to string->float map converts both keys and values."""
     source_map = {1: '1', 2: '2'}
     source_type = MapType(ByteType(), StringType())
     target_type = MapType(StringType(), FloatType())

     converted = cast_to_map(
         source_map, source_type, target_type, options=BASE_OPTIONS
     )

     self.assertEqual(converted, {'1': 1.0, '2': 2.0})
Esempio n. 7
0
    def eval(self, row, schema):
        """Evaluate the column as a string and return its 4-character code.

        NOTE(review): the shape of this algorithm looks like Soundex --
        confirm against the enclosing class.

        Returns None for a null value, '' for an empty string, and the
        original string unchanged when ``self._encode`` gives no code for
        its first (upper-cased) character. Otherwise the result is the
        first character followed by up to three code digits, right-filled
        with '0' to exactly four characters.
        """
        text = self.column.cast(StringType()).eval(row, schema)

        if text is None:
            return None
        if text == '':
            return ''

        upper_text = text.upper()
        first_char = upper_text[0]

        previous_code = self._encode(first_char)
        if previous_code is None:
            return text

        pieces = [first_char]
        for char in upper_text:
            current_code = self._encode(char)
            # Characters with no code, or with code 7, are skipped without
            # updating the previous code.
            if current_code is None or current_code == 7:
                continue
            # Code 0 and repeats of the previous code emit nothing, but
            # still become the new "previous" code below.
            if current_code not in (0, previous_code):
                pieces.append(str(current_code))
                if len(pieces) > 3:
                    break
            previous_code = current_code

        return (''.join(pieces) + '000')[:4]
 def test_cast_random_string_to_boolean(self):
     """A string that is not a recognised boolean is expected to cast to None."""
     arbitrary_text = 'fast_pyspark_tester'
     result = cast_to_boolean(arbitrary_text, StringType(), options=BASE_OPTIONS)
     self.assertEqual(result, None)
 def test_cast_longer_tz_string_to_timestamp(self):
     """A timestamp string with a '+03:' offset suffix is parsed and shifted."""
     # NOTE(review): the expected hour (04 for a 05:40:36+03: input)
     # presumably depends on the time zone carried by BASE_OPTIONS -- confirm.
     stamp_text = '2019-10-01T05:40:36+03:'
     parsed = cast_to_timestamp(stamp_text, StringType(), options=BASE_OPTIONS)
     self.assertEqual(parsed, datetime.datetime(2019, 10, 1, 4, 40, 36))
 def test_cast_array_to_array(self):
     """Casting a byte array to a string array converts items and keeps None as None."""
     source_items = [1, 2, None, 4]
     source_type = ArrayType(ByteType())
     target_type = ArrayType(StringType())

     converted = cast_to_array(
         source_items, source_type, target_type, options=BASE_OPTIONS
     )

     self.assertEqual(converted, ['1', '2', None, '4'])
 def test_cast_weird_strings_to_date(self):
     """A date string with an over-padded day and trailing text still yields the date."""
     # Mimic Spark behavior
     odd_text = '2019-10-0001Tthis should be ignored'
     parsed = cast_to_date(odd_text, StringType(), options=BASE_OPTIONS)
     self.assertEqual(parsed, datetime.date(2019, 10, 1))
Esempio n. 12
0
def get_datetime_parser(java_time_format):
    """Build a one-argument callable that parses a string into a datetime.

    * ``None``: the parser delegates to ``cast_to_timestamp`` with a
      ``StringType`` source type and empty options.
    * ``NO_TIMESTAMP_CONVERSION``: the parser always returns None.
    * anything else: each token matched by ``JAVA_TIME_FORMAT_TOKENS`` is
      translated through ``FORMAT_MAPPING`` (unmapped tokens are kept
      verbatim) and the resulting pattern is used with
      ``datetime.datetime.strptime``.
    """
    if java_time_format is None:
        return lambda value: cast_to_timestamp(value, StringType(), {})

    if java_time_format is NO_TIMESTAMP_CONVERSION:
        return lambda value: None

    strptime_pattern = ''.join(
        FORMAT_MAPPING.get(token, token)
        for token, _ in JAVA_TIME_FORMAT_TOKENS.findall(java_time_format)
    )
    return lambda value: datetime.datetime.strptime(value, strptime_pattern)
 def test_cast_to_struct(self):
     """Casting an all-string struct converts day/month/year to integers."""
     source_row = Row(character='Alice', day='28', month='8', year='2019')
     all_strings = StructType(fields=[
         StructField(name, StringType())
         for name in ('character', 'day', 'month', 'year')
     ])
     with_integers = StructType(fields=[
         StructField('character', StringType()),
         StructField('day', IntegerType()),
         StructField('month', IntegerType()),
         StructField('year', IntegerType()),
     ])

     converted = cast_to_struct(
         source_row,
         from_type=all_strings,
         to_type=with_integers,
         options=BASE_OPTIONS,
     )

     self.assertEqual(
         converted, Row(character='Alice', day=28, month=8, year=2019)
     )
 def test_cast_map_to_string(self):
     """A boolean-keyed map of maps renders as nested 'key -> value' brackets."""
     # OrderedDict keeps the inner entries in a deterministic order.
     inner_map = collections.OrderedDict(
         [('one', 1), ('nothing', None), ('three', 3)]
     )
     map_type = MapType(BooleanType(), MapType(StringType(), IntegerType()))

     rendered = cast_to_string({True: inner_map}, map_type, options=BASE_OPTIONS)

     self.assertEqual(rendered, '[true -> [one -> 1, nothing ->, three -> 3]]')
    def read(self):
        """Parse the CSV files under ``self.paths`` into a DataFrameInternal.

        Files are resolved (with their partition schema) from the paths,
        parsed in parallel with ``parse_csv_file``, then every row is cast
        from an all-string schema to the final schema. The data schema is
        ``self.schema`` when set, otherwise guessed from the string values
        when ``options.inferSchema`` is truthy, otherwise inferred from the
        RDD.
        """
        context = self.spark._sc
        paths = self.paths

        partitions, partition_schema = resolve_partitions(paths)

        file_names = context.parallelize(sorted(partitions.keys()),
                                         len(partitions))
        parse_file = partial(parse_csv_file, partitions, partition_schema,
                             self.schema, self.options)
        rows = file_names.flatMap(parse_file)

        if self.schema is not None:
            schema = self.schema
        elif self.options.inferSchema:
            field_names = rows.take(1)[0].__fields__
            schema = guess_schema_from_strings(field_names,
                                               rows.collect(),
                                               options=self.options)
        else:
            schema = infer_schema_from_rdd(rows)

        # Parsed rows hold only strings, so the cast starts from a schema
        # with the same field names and StringType everywhere.
        string_schema = StructType(fields=[
            StructField(column.name, StringType()) for column in schema.fields
        ])

        if not partition_schema:
            full_schema = schema
        else:
            # Replace the trailing fields of the data schema with the
            # partition fields.
            partition_fields = partition_schema.fields
            data_fields = schema.fields[:-len(partition_fields)]
            full_schema = StructType(data_fields + partition_fields)

        row_caster = get_caster(from_type=string_schema,
                                to_type=full_schema,
                                options=self.options)
        casted_rows = rows.map(row_caster)
        casted_rows._name = paths

        return DataFrameInternal(context, casted_rows, schema=full_schema)
Esempio n. 16
0
 def eval(self, row, schema):
     """Return the index of ``self.substr`` in the column's string value.

     Returns None when the column evaluates to None, and 0 when the
     substring does not occur.

     NOTE(review): the index of a match is 0-based (as before), so a match
     at the very start is indistinguishable from "not found". Spark's
     ``instr`` is 1-based -- confirm whether ``+ 1`` is wanted here.
     """
     value = self.column.cast(StringType()).eval(row, schema)
     if value is None:
         return None
     try:
         return value.index(self.substr)
     except ValueError:
         # str.index signals a missing substring with ValueError (not
         # IndexError), so this is the branch that yields the 0 fallback.
         return 0
 def test_cast_bigger_string_to_short(self):
     """2**15, one above the signed 16-bit maximum, is expected to cast to None."""
     above_short_max = '32768'
     result = cast_to_short(above_short_max, StringType(), options=BASE_OPTIONS)
     self.assertEqual(result, None)
Esempio n. 18
0
 def eval(self, row, schema):
     """Return the Levenshtein distance between the two columns' string values.

     Both columns are always evaluated; the result is None when either
     value is None.
     """
     left = self.column1.cast(StringType()).eval(row, schema)
     right = self.column2.cast(StringType()).eval(row, schema)
     if left is not None and right is not None:
         return levenshtein_distance(left, right)
     return None
Esempio n. 19
0
 def eval(self, row, schema):
     """Return the column's string value with every word capitalized.

     Words are split on whitespace and re-joined with single spaces, so
     runs of whitespace collapse and leading/trailing whitespace is
     dropped. Returns None when the column evaluates to None.
     """
     value = self.column.cast(StringType()).eval(row, schema)
     if value is None:
         # Propagate nulls rather than failing on None.split().
         return None
     return ' '.join(word.capitalize() for word in value.split())
Esempio n. 20
0
 def eval(self, row, schema):
     """Return the column's string value mapped through ``self.translation_table``.

     Returns None when the column evaluates to None.
     """
     value = self.column.cast(StringType()).eval(row, schema)
     if value is None:
         # Propagate nulls rather than failing on None.translate().
         return None
     return value.translate(self.translation_table)
Esempio n. 21
0
 def eval(self, row, schema):
     """Return the column's string value repeated ``self.n`` times.

     Returns None when the column evaluates to None.
     """
     value = self.column.cast(StringType()).eval(row, schema)
     if value is None:
         # Propagate nulls rather than failing on None * n.
         return None
     return value * self.n
 def test_cast_bigger_string_to_int(self):
     """2**31, one above the signed 32-bit maximum, is expected to cast to None."""
     above_int_max = '2147483648'
     result = cast_to_int(above_int_max, StringType(), options=BASE_OPTIONS)
     self.assertEqual(result, None)
 def test_cast_small_string_to_int(self):
     """2**31 - 1, the signed 32-bit maximum, is expected to cast to the same integer."""
     int_max_text = '2147483647'
     result = cast_to_int(int_max_text, StringType(), options=BASE_OPTIONS)
     self.assertEqual(result, 2147483647)
 def test_cast_string_to_binary(self):
     """Casting a string to binary is expected to yield a bytearray of its bytes."""
     encoded = cast_to_binary('test', StringType(), options=BASE_OPTIONS)
     expected = bytearray(b'test')
     self.assertEqual(encoded, expected)
 def test_cast_year_month_as_string_to_date(self):
     """A 'YYYY-MM' string is expected to cast to the first day of that month."""
     parsed = cast_to_date('2019-02', StringType(), options=BASE_OPTIONS)
     expected = datetime.date(2019, 2, 1)
     self.assertEqual(parsed, expected)
Esempio n. 26
0
 def eval(self, row, schema):
     """Return the 1-based position of ``self.substr`` at or after ``self.start``.

     The search starts at index ``self.start`` of the column's string
     value (presumably a 0-based offset -- confirm against the
     constructor). Returns 0 when the substring is not found there, and
     None when the column evaluates to None.
     """
     value = self.column.cast(StringType()).eval(row, schema)
     if value is None:
         return None
     # str.find returns -1 when absent, so "+ 1" yields the 0 sentinel and
     # a 1-based position otherwise -- one scan, no slice copy, and no
     # ValueError for an empty substr searched past the end of the string.
     return value.find(self.substr, self.start) + 1
 def test_cast_date_as_string_to_date(self):
     """A zero-padded 'YYYY-MM-DD' string is expected to cast to the matching date."""
     parsed = cast_to_date('2019-03-01', StringType(), options=BASE_OPTIONS)
     expected = datetime.date(2019, 3, 1)
     self.assertEqual(parsed, expected)
 def test_cast_date_without_0_as_string_to_date(self):
     """A date string without zero padding ('2019-4-1') is expected to cast to the matching date."""
     parsed = cast_to_date('2019-4-1', StringType(), options=BASE_OPTIONS)
     expected = datetime.date(2019, 4, 1)
     self.assertEqual(parsed, expected)
Esempio n. 29
0
 def eval(self, row, schema):
     """Right-pad the column's string value with ``self.pad`` up to ``self.length``.

     Returns None when the column evaluates to None.

     NOTE(review): a value already longer than ``self.length`` is returned
     unchanged; Spark's ``rpad`` shortens it to ``len`` characters --
     confirm whether truncation is wanted here.
     """
     value = self.column.cast(StringType()).eval(row, schema)
     if value is None:
         # Propagate nulls rather than failing on len(None).
         return None
     missing = self.length - len(value)
     # Repeat the pad, then trim: a multi-character pad stops exactly at
     # the target length, and a non-positive `missing` gives no padding.
     padding = (self.pad * missing)[:missing]
     return value + padding
 def test_cast_date_string_to_timestamp(self):
     """A date-only string is expected to cast to midnight of that date."""
     parsed = cast_to_timestamp('2019-10-01', StringType(), options=BASE_OPTIONS)
     expected = datetime.datetime(2019, 10, 1, 0, 0, 0)
     self.assertEqual(parsed, expected)