def test_cast_row_to_string(self):
    self.assertEqual(
        cast_to_string(
            Row(
                a=collections.OrderedDict([('value', None), ('b', {'c': 7})]),
                b=None,
                c=True,
                d=5.2,
            ),
            StructType([
                StructField(
                    'a',
                    MapType(
                        StringType(),
                        MapType(StringType(), LongType(), True),
                        True,
                    ),
                    True,
                ),
                StructField('b', LongType(), True),
                StructField('c', BooleanType(), True),
                StructField('d', DoubleType(), True),
            ]),
            options=BASE_OPTIONS,
        ),
        '[[value ->, b -> [c -> 7]],, true, 5.2]',
    )
def guess_type_from_values_as_string(values, options):
    # Reproduces inferences available in Spark
    # PartitioningUtils.inferPartitionColumnValue()
    # located in org.apache.spark.sql.execution.datasources
    tested_types = (
        IntegerType(),
        LongType(),
        DecimalType(),
        DoubleType(),
        TimestampType(),
        StringType(),
    )
    string_type = StringType()
    for tested_type in tested_types:
        type_caster = get_caster(from_type=string_type, to_type=tested_type, options=options)
        try:
            for value in values:
                casted_value = type_caster(value)
                if casted_value is None and value not in ('null', None):
                    raise ValueError
            return tested_type
        except ValueError:
            pass
    # Should never happen
    raise AnalysisException(
        'Unable to find a matching type for some fields, even StringType did not work'
    )
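# Hedged usage sketch (the sample values are illustrative, not from the
# source): the first type whose caster accepts every value wins, so an
# all-integer column resolves early while mixed content falls through to
# the StringType catch-all.
#
#     guess_type_from_values_as_string(['1', '2', '3'], BASE_OPTIONS)
#     # -> IntegerType()
#     guess_type_from_values_as_string(['1', 'two'], BASE_OPTIONS)
#     # -> StringType()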
def test_cast_hour_string_to_timestamp(self):
    today = datetime.date.today()
    self.assertEqual(
        cast_to_timestamp('10:', StringType(), options=BASE_OPTIONS),
        datetime.datetime(today.year, today.month, today.day, 10, 0, 0),
    )
def test_cast_bigger_string_to_long(self):
    # One more than the maximum long (2**63 - 1): the cast overflows to None
    self.assertEqual(
        cast_to_long('9223372036854775808', StringType(), options=BASE_OPTIONS),
        None,
    )

def test_cast_small_string_to_long(self):
    # Exactly the maximum long (2**63 - 1): the cast succeeds
    self.assertEqual(
        cast_to_long('9223372036854775807', StringType(), options=BASE_OPTIONS),
        9223372036854775807,
    )
def test_cast_map_to_map(self):
    self.assertEqual(
        cast_to_map(
            {1: '1', 2: '2'},
            MapType(ByteType(), StringType()),
            MapType(StringType(), FloatType()),
            options=BASE_OPTIONS,
        ),
        {'1': 1.0, '2': 2.0},
    )
def eval(self, row, schema):
    raw_value = self.column.cast(StringType()).eval(row, schema)
    if raw_value is None:
        return None
    if raw_value == '':
        return ''
    value = raw_value.upper()
    initial = value[0]
    last_code = self._encode(initial)
    if last_code is None:
        # The first character is not encodable: return the input unchanged
        return raw_value
    res = [initial]
    for letter in value:
        code = self._encode(letter)
        if code is None:
            # Non-encodable characters are skipped
            continue
        if code == 7:
            # Letters encoded as 7 are ignored entirely and do not
            # reset the duplicate check below
            continue
        if code not in (0, last_code):
            res.append(str(code))
            if len(res) > 3:
                # One initial letter plus three digits: the code is complete
                break
        last_code = code
    # Pad with zeros up to the four-character soundex code
    return (''.join(res) + '000')[:4]
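# Illustrative expected output (hedged: assumes `_encode` implements the
# standard soundex mapping, e.g. B/F/P/V -> 1, R -> 6, vowels -> 0):
#
#     'Robert' -> 'R163'
#     'Rupert' -> 'R163'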
def test_cast_random_string_to_boolean(self):
    self.assertEqual(
        cast_to_boolean('fast_pyspark_tester', StringType(), options=BASE_OPTIONS),
        None,
    )
def test_cast_longer_tz_string_to_timestamp(self):
    self.assertEqual(
        cast_to_timestamp('2019-10-01T05:40:36+03:', StringType(), options=BASE_OPTIONS),
        datetime.datetime(2019, 10, 1, 4, 40, 36),
    )
def test_cast_array_to_array(self):
    self.assertEqual(
        cast_to_array(
            [1, 2, None, 4],
            ArrayType(ByteType()),
            ArrayType(StringType()),
            options=BASE_OPTIONS,
        ),
        ['1', '2', None, '4'],
    )
def test_cast_weird_strings_to_date(self):
    # Mimic Spark behavior
    self.assertEqual(
        cast_to_date(
            '2019-10-0001Tthis should be ignored',
            StringType(),
            options=BASE_OPTIONS,
        ),
        datetime.date(2019, 10, 1),
    )
def get_datetime_parser(java_time_format):
    if java_time_format is None:
        return lambda value: cast_to_timestamp(value, StringType(), {})
    if java_time_format is NO_TIMESTAMP_CONVERSION:
        return lambda value: None
    python_pattern = ''
    for token, _ in JAVA_TIME_FORMAT_TOKENS.findall(java_time_format):
        python_pattern += FORMAT_MAPPING.get(token, token)
    return lambda value: datetime.datetime.strptime(value, python_pattern)
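# Hedged usage sketch: assuming FORMAT_MAPPING translates Java time tokens
# such as 'yyyy' -> '%Y', 'MM' -> '%m' and 'dd' -> '%d' (illustrative
# entries, not confirmed from the source), a parser built from
# 'yyyy-MM-dd' would behave like:
#
#     parser = get_datetime_parser('yyyy-MM-dd')
#     parser('2019-10-01')  # -> datetime.datetime(2019, 10, 1, 0, 0)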
def test_cast_to_struct(self):
    self.assertEqual(
        cast_to_struct(
            Row(character='Alice', day='28', month='8', year='2019'),
            from_type=StructType(fields=[
                StructField('character', StringType()),
                StructField('day', StringType()),
                StructField('month', StringType()),
                StructField('year', StringType()),
            ]),
            to_type=StructType(fields=[
                StructField('character', StringType()),
                StructField('day', IntegerType()),
                StructField('month', IntegerType()),
                StructField('year', IntegerType()),
            ]),
            options=BASE_OPTIONS,
        ),
        Row(character='Alice', day=28, month=8, year=2019),
    )
def test_cast_map_to_string(self):
    self.assertEqual(
        cast_to_string(
            {True: collections.OrderedDict([('one', 1), ('nothing', None), ('three', 3)])},
            MapType(BooleanType(), MapType(StringType(), IntegerType())),
            options=BASE_OPTIONS,
        ),
        '[true -> [one -> 1, nothing ->, three -> 3]]',
    )
def read(self):
    sc = self.spark._sc
    paths = self.paths
    partitions, partition_schema = resolve_partitions(paths)

    rdd_filenames = sc.parallelize(sorted(partitions.keys()), len(partitions))
    rdd = rdd_filenames.flatMap(
        partial(parse_csv_file, partitions, partition_schema, self.schema, self.options))

    if self.schema is not None:
        schema = self.schema
    elif self.options.inferSchema:
        fields = rdd.take(1)[0].__fields__
        schema = guess_schema_from_strings(fields, rdd.collect(), options=self.options)
    else:
        schema = infer_schema_from_rdd(rdd)

    # CSV values are parsed as strings; cast them to the resolved schema
    schema_with_string = StructType(fields=[
        StructField(field.name, StringType()) for field in schema.fields
    ])

    if partition_schema:
        partitions_fields = partition_schema.fields
        full_schema = StructType(
            schema.fields[:-len(partitions_fields)] + partitions_fields
        )
    else:
        full_schema = schema

    cast_row = get_caster(
        from_type=schema_with_string, to_type=full_schema, options=self.options
    )
    casted_rdd = rdd.map(cast_row)
    casted_rdd._name = paths

    return DataFrameInternal(sc, casted_rdd, schema=full_schema)
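# Hedged sketch of the contract assumed of `resolve_partitions` (the
# example path and values are illustrative, not taken from the source):
# it maps each data file to its partition values and returns the partition
# schema, whose fields replace the tail of the inferred schema above.
#
#     resolve_partitions(['data/year=2019/part-0.csv'])
#     # -> ({'data/year=2019/part-0.csv': ...},
#     #     StructType([StructField('year', StringType())]))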
def eval(self, row, schema):
    value = self.column.cast(StringType()).eval(row, schema)
    try:
        # Spark's instr is 1-based; 0 means the substring was not found
        return value.index(self.substr) + 1
    except ValueError:
        # str.index raises ValueError (not IndexError) on a missing substring
        return 0
def test_cast_bigger_string_to_short(self):
    self.assertEqual(
        cast_to_short('32768', StringType(), options=BASE_OPTIONS),
        None,
    )
def eval(self, row, schema):
    value_1 = self.column1.cast(StringType()).eval(row, schema)
    value_2 = self.column2.cast(StringType()).eval(row, schema)
    if value_1 is None or value_2 is None:
        return None
    return levenshtein_distance(value_1, value_2)
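# `levenshtein_distance` is assumed to be a helper defined elsewhere in
# the package; a minimal dynamic-programming sketch of what it computes:
def _levenshtein_distance_sketch(s1, s2):
    # distances[j] holds the edit distance between the current prefix
    # of s1 and s2[:j]
    distances = list(range(len(s2) + 1))
    for i, c1 in enumerate(s1, start=1):
        new_distances = [i]
        for j, c2 in enumerate(s2, start=1):
            cost = 0 if c1 == c2 else 1
            new_distances.append(min(
                distances[j] + 1,          # deletion
                new_distances[j - 1] + 1,  # insertion
                distances[j - 1] + cost,   # substitution
            ))
        distances = new_distances
    return distances[-1]

# _levenshtein_distance_sketch('kitten', 'sitting')  # -> 3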
def eval(self, row, schema):
    value = self.column.cast(StringType()).eval(row, schema)
    return ' '.join(word.capitalize() for word in value.split())
def eval(self, row, schema):
    value = self.column.cast(StringType()).eval(row, schema)
    return value.translate(self.translation_table)
def eval(self, row, schema):
    value = self.column.cast(StringType()).eval(row, schema)
    return value * self.n
def test_cast_bigger_string_to_int(self):
    # One more than the maximum int (2**31 - 1): the cast overflows to None
    self.assertEqual(
        cast_to_int('2147483648', StringType(), options=BASE_OPTIONS),
        None,
    )

def test_cast_small_string_to_int(self):
    # Exactly the maximum int (2**31 - 1): the cast succeeds
    self.assertEqual(
        cast_to_int('2147483647', StringType(), options=BASE_OPTIONS),
        2147483647,
    )
def test_cast_string_to_binary(self):
    self.assertEqual(
        cast_to_binary('test', StringType(), options=BASE_OPTIONS),
        bytearray(b'test'),
    )
def test_cast_year_month_as_string_to_date(self):
    self.assertEqual(
        cast_to_date('2019-02', StringType(), options=BASE_OPTIONS),
        datetime.date(2019, 2, 1),
    )
def eval(self, row, schema):
    value = self.column.cast(StringType()).eval(row, schema)
    if self.substr not in value[self.start:]:
        return 0
    # Spark's locate is 1-based; 0 means the substring was not found
    return value.index(self.substr, self.start) + 1
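# Worked example of the 1-based semantics (plain Python, hypothetical
# wiring of value/substr/start):
#
#     'hello world'.index('world', 0) + 1  # -> 7, i.e. locate = 7
#     # 'xyz' not in 'hello world'         -> locate = 0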
def test_cast_date_as_string_to_date(self):
    self.assertEqual(
        cast_to_date('2019-03-01', StringType(), options=BASE_OPTIONS),
        datetime.date(2019, 3, 1),
    )
def test_cast_date_without_0_as_string_to_date(self):
    self.assertEqual(
        cast_to_date('2019-4-1', StringType(), options=BASE_OPTIONS),
        datetime.date(2019, 4, 1),
    )
def eval(self, row, schema):
    value = self.column.cast(StringType()).eval(row, schema)
    delta = self.length - len(value)
    # Handle pad with multiple characters: repeating the pad `delta` times
    # yields at least `delta` characters, and the slice trims the excess
    padding = (self.pad * delta)[:delta]
    return '{0}{1}'.format(value, padding)
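# Quick illustration of the padding trick above (plain Python, values are
# illustrative): rpad('ab', 5, 'xy') would proceed as
#
#     delta = 5 - len('ab')   # 3
#     ('xy' * 3)[:3]          # -> 'xyx'
#     'ab' + 'xyx'            # -> 'abxyx'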
def test_cast_date_string_to_timestamp(self):
    self.assertEqual(
        cast_to_timestamp('2019-10-01', StringType(), options=BASE_OPTIONS),
        datetime.datetime(2019, 10, 1, 0, 0, 0),
    )