def guess_type_from_values_as_string(values, options):
    # Reproduces inferences available in Spark
    # PartitioningUtils.inferPartitionColumnValue()
    # located in org.apache.spark.sql.execution.datasources
    tested_types = (
        IntegerType(),
        LongType(),
        DecimalType(),
        DoubleType(),
        TimestampType(),
        StringType()
    )
    string_type = StringType()
    for tested_type in tested_types:
        type_caster = get_caster(from_type=string_type, to_type=tested_type, options=options)
        try:
            for value in values:
                casted_value = type_caster(value)
                if casted_value is None and value not in ("null", None):
                    raise ValueError
            return tested_type
        except ValueError:
            pass
    # Should never happen
    raise AnalysisException(
        "Unable to find a matching type for some fields, even StringType did not work"
    )
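# Usage sketch: the inference above tries each type in order and keeps the
# first one that casts every value. A minimal standalone illustration of that
# loop shape, assuming plain Python casts stand in for get_caster (the real
# casters also handle decimals, timestamps and "null" markers):
def _guess_type(values):
    for name, cast in (("int", int), ("float", float)):
        try:
            for value in values:
                cast(value)
            return name
        except ValueError:
            pass
    return "string"  # the fallback that always matches

assert _guess_type(["1", "2"]) == "int"
assert _guess_type(["1", "2.5"]) == "float"
assert _guess_type(["1", "2019-10-01"]) == "string"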
def test_cast_map_to_map(self):
    self.assertEqual(
        cast_to_map({1: "1", 2: "2"},
                    MapType(ByteType(), StringType()),
                    MapType(StringType(), FloatType()),
                    options=BASE_OPTIONS),
        {'1': 1.0, '2': 2.0})
def test_session_create_data_frame_from_list_with_schema(self):
    schema = StructType(
        [StructField("map", MapType(StringType(), IntegerType()), True)])
    df = self.spark.createDataFrame([({'a': 1},)], schema=schema)
    self.assertEqual(df.count(), 1)
    self.assertListEqual(df.collect(), [Row(map={'a': 1})])
    self.assertEqual(df.schema, schema)
def test_session_create_data_frame_from_pandas_data_frame(self):
    try:
        # Pandas is an optional dependency
        # pylint: disable=import-outside-toplevel
        import pandas as pd
    except ImportError as e:
        raise ImportError("pandas is not importable") from e
    pdf = pd.DataFrame([(1, "one"), (2, "two"), (3, "three")])
    df = self.spark.createDataFrame(pdf)
    self.assertEqual(df.count(), 3)
    self.assertListEqual(df.collect(), [
        Row(**{"0": 1, "1": 'one'}),
        Row(**{"0": 2, "1": 'two'}),
        Row(**{"0": 3, "1": 'three'})
    ])
    self.assertEqual(
        df.schema,
        StructType([
            StructField("0", LongType(), True),
            StructField("1", StringType(), True)
        ]))
def eval(self, row, schema):
    raw_value = self.column.cast(StringType()).eval(row, schema)
    if raw_value is None:
        return None
    if raw_value == "":
        return ""
    value = raw_value.upper()
    initial = value[0]
    last_code = self._encode(initial)
    if last_code is None:
        # First character has no Soundex code: return the input unchanged
        return raw_value
    res = [initial]
    for letter in value:
        code = self._encode(letter)
        if code is None:
            # Non-letter characters are ignored entirely
            continue
        if code == 7:
            # Code 7 (H and W in standard Soundex) is skipped
            # without resetting last_code
            continue
        if code not in (0, last_code):
            res.append(str(code))
            if len(res) > 3:
                break
        last_code = code
    return ("".join(res) + "000")[:4]
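# Usage sketch: the walk above is standard American Soundex. A standalone
# version with an explicit letter table standing in for self._encode (the
# table is an assumption; the real encoding lives on the expression class):
_CODES = {**dict.fromkeys("BFPV", 1), **dict.fromkeys("CGJKQSXZ", 2),
          **dict.fromkeys("DT", 3), "L": 4, **dict.fromkeys("MN", 5),
          "R": 6, **dict.fromkeys("AEIOUY", 0), **dict.fromkeys("HW", 7)}

def _soundex(word):
    word = word.upper()
    res, last = [word[0]], _CODES.get(word[0])
    for letter in word:
        code = _CODES.get(letter)
        if code is None or code == 7:  # skip non-letters and H/W
            continue
        if code not in (0, last):  # vowels reset, adjacent duplicates collapse
            res.append(str(code))
            if len(res) > 3:
                break
        last = code
    return ("".join(res) + "000")[:4]

assert _soundex("Robert") == "R163"
assert _soundex("Tymczak") == "T522"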
def eval(self, row, schema):
    value = self.column.cast(StringType()).eval(row, schema)
    return self.convert(
        value,
        self.from_base,
        abs(self.to_base),
        positive_only=self.to_base > 0
    )
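# Usage sketch: the call above passes abs(to_base) and flags positive_only
# when to_base is positive. A minimal unsigned-only base converter for the
# core digit loop (hypothetical; it ignores the signed handling that
# positive_only appears to drive in the real self.convert):
import string as _string

_DIGITS = _string.digits + _string.ascii_uppercase

def _conv(value, from_base, to_base):
    num = int(value, from_base)
    if num == 0:
        return "0"
    out = []
    while num:
        num, rem = divmod(num, to_base)
        out.append(_DIGITS[rem])
    return "".join(reversed(out))

assert _conv("100", 2, 10) == "4"
assert _conv("255", 10, 16) == "FF"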
def test_csv_read_with_given_schema(self):
    schema = StructType([
        StructField("permalink", StringType()),
        StructField("company", StringType()),
        StructField("numEmps", IntegerType()),
        StructField("category", StringType()),
        StructField("city", StringType()),
        StructField("state", StringType()),
        StructField("fundedDate", DateType()),
        StructField("raisedAmt", IntegerType()),
        StructField("raisedCurrency", StringType()),
        StructField("round", StringType())
    ])
    df = spark.read.schema(schema).csv(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "data/fundings/"),
        header=True)
    self.assertEqual([Row(**r.asDict()) for r in df.collect()], [
        Row(permalink='mycityfaces', company='MyCityFaces', numEmps=7,
            category='web', city='Scottsdale', state='AZ',
            fundedDate=datetime.date(2008, 1, 1), raisedAmt=50000,
            raisedCurrency='USD', round='seed'),
        Row(permalink='flypaper', company='Flypaper', numEmps=None,
            category='web', city='Phoenix', state='AZ',
            fundedDate=datetime.date(2008, 2, 1), raisedAmt=3000000,
            raisedCurrency='USD', round='a'),
        Row(permalink='chosenlist-com', company='ChosenList.com', numEmps=5,
            category='web', city='Scottsdale', state='AZ',
            fundedDate=datetime.date(2008, 1, 25), raisedAmt=233750,
            raisedCurrency='USD', round='angel'),
        Row(permalink='digg', company='Digg', numEmps=60,
            category='web', city='San Francisco', state='CA',
            fundedDate=datetime.date(2006, 12, 1), raisedAmt=8500000,
            raisedCurrency='USD', round='b')
    ])
def test_cast_map_to_string(self):
    self.assertEqual(
        cast_to_string(
            {True: collections.OrderedDict([("one", 1), ("nothing", None), ("three", 3)])},
            MapType(BooleanType(), MapType(StringType(), IntegerType())),
            options=BASE_OPTIONS),
        "[true -> [one -> 1, nothing ->, three -> 3]]")
def get_datetime_parser(java_time_format):
    if java_time_format is None:
        return lambda value: cast_to_timestamp(value, StringType(), {})
    if java_time_format is NO_TIMESTAMP_CONVERSION:
        return lambda value: None
    python_pattern = ""
    for token, _ in JAVA_TIME_FORMAT_TOKENS.findall(java_time_format):
        python_pattern += FORMAT_MAPPING.get(token, token)
    return lambda value: datetime.datetime.strptime(value, python_pattern)
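# Usage sketch of the token translation above, with a tiny subset of the real
# JAVA_TIME_FORMAT_TOKENS / FORMAT_MAPPING tables (the subset is an
# assumption; the real tables cover many more Java time tokens):
import datetime as _datetime
import re as _re

_TOKENS = _re.compile(r"(yyyy|MM|dd|HH|mm|ss|.)")
_MAPPING = {"yyyy": "%Y", "MM": "%m", "dd": "%d",
            "HH": "%H", "mm": "%M", "ss": "%S"}

def _to_strptime(java_format):
    return "".join(_MAPPING.get(tok, tok) for tok in _TOKENS.findall(java_format))

assert _to_strptime("yyyy-MM-dd HH:mm:ss") == "%Y-%m-%d %H:%M:%S"
assert (_datetime.datetime.strptime("2019-10-01 05:40:36",
                                    _to_strptime("yyyy-MM-dd HH:mm:ss"))
        == _datetime.datetime(2019, 10, 1, 5, 40, 36))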
def test_cast_to_struct(self):
    self.assertEqual(
        cast_to_struct(
            Row(character='Alice', day='28', month='8', year='2019'),
            from_type=StructType(fields=[
                StructField("character", StringType()),
                StructField("day", StringType()),
                StructField("month", StringType()),
                StructField("year", StringType()),
            ]),
            to_type=StructType(fields=[
                StructField("character", StringType()),
                StructField("day", IntegerType()),
                StructField("month", IntegerType()),
                StructField("year", IntegerType()),
            ]),
            options=BASE_OPTIONS),
        Row(character='Alice', day=28, month=8, year=2019),
    )
def test_cast_row_to_string(self):
    self.assertEqual(
        cast_to_string(
            Row(a=collections.OrderedDict([("value", None), ("b", {"c": 7})]),
                b=None,
                c=True,
                d=5.2),
            StructType([
                StructField(
                    "a",
                    MapType(StringType(),
                            MapType(StringType(), LongType(), True),
                            True),
                    True),
                StructField("b", LongType(), True),
                StructField("c", BooleanType(), True),
                StructField("d", DoubleType(), True)
            ]),
            options=BASE_OPTIONS),
        "[[value ->, b -> [c -> 7]],, true, 5.2]")
def test_session_create_data_frame_from_list(self):
    df = self.spark.createDataFrame([
        (1, "one"),
        (2, "two"),
        (3, "three"),
    ])
    self.assertEqual(df.count(), 3)
    self.assertListEqual(
        df.collect(),
        [Row(_1=1, _2='one'), Row(_1=2, _2='two'), Row(_1=3, _2='three')])
    self.assertEqual(
        df.schema,
        StructType([StructField("_1", LongType(), True),
                    StructField("_2", StringType(), True)])
    )
def read(self):
    sc = self.spark._sc
    paths = self.paths

    partitions, partition_schema = resolve_partitions(paths)

    rdd_filenames = sc.parallelize(sorted(partitions.keys()), len(partitions))
    rdd = rdd_filenames.flatMap(
        partial(parse_csv_file, partitions, partition_schema, self.schema, self.options))

    if self.schema is not None:
        schema = self.schema
    elif self.options.inferSchema:
        fields = rdd.take(1)[0].__fields__
        schema = guess_schema_from_strings(fields, rdd.collect(), options=self.options)
    else:
        schema = infer_schema_from_rdd(rdd)

    schema_with_string = StructType(fields=[
        StructField(field.name, StringType()) for field in schema.fields
    ])

    if partition_schema:
        partitions_fields = partition_schema.fields
        full_schema = StructType(schema.fields[:-len(partitions_fields)] + partitions_fields)
    else:
        full_schema = schema

    cast_row = get_caster(from_type=schema_with_string, to_type=full_schema, options=self.options)
    casted_rdd = rdd.map(cast_row)
    casted_rdd._name = paths

    return DataFrameInternal(sc, casted_rdd, schema=full_schema)
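# Usage sketch: resolve_partitions discovers Hive-style key=value directories
# so partition columns can be appended to each parsed row. A minimal
# re-implementation of that discovery step for a single path (hypothetical;
# the real helper also checks that all paths share the same partition columns):
import re as _re

def _partition_values(path):
    return dict(_re.findall(r"([^/=]+)=([^/]+)/", path))

assert _partition_values("data/fundings/year=2008/month=01/part-0.csv") == \
    {"year": "2008", "month": "01"}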
def eval(self, row, schema):
    value = self.column.cast(StringType()).eval(row, schema)
    try:
        return value.index(self.substr)
    except ValueError:
        # str.index raises ValueError (not IndexError) when substr is absent
        return 0
def eval(self, row, schema):
    value_1 = self.column1.cast(StringType()).eval(row, schema)
    value_2 = self.column2.cast(StringType()).eval(row, schema)
    if value_1 is None or value_2 is None:
        return None
    return levenshtein_distance(value_1, value_2)
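# Usage sketch: a minimal standalone Levenshtein, standing in for
# levenshtein_distance (the real helper lives elsewhere in the package;
# this two-row dynamic-programming version is an assumption about it):
def _levenshtein(a, b):
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1,                 # deletion
                           cur[j - 1] + 1,              # insertion
                           prev[j - 1] + (ca != cb)))   # substitution
        prev = cur
    return prev[-1]

assert _levenshtein("kitten", "sitting") == 3
assert _levenshtein("abc", "abc") == 0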
def eval(self, row, schema):
    return self.column.cast(StringType()).eval(row, schema).translate(
        self.translation_table)
def eval(self, row, schema):
    value = self.column.cast(StringType()).eval(row, schema)
    return " ".join(word.capitalize() for word in value.split())
def test_cast_array_to_array(self):
    self.assertEqual(
        cast_to_array([1, 2, None, 4],
                      ArrayType(ByteType()),
                      ArrayType(StringType()),
                      options=BASE_OPTIONS),
        ['1', '2', None, '4'])
def eval(self, row, schema):
    value = self.column.cast(StringType()).eval(row, schema)
    return value * self.n
def test_cast_date_as_string_to_date(self):
    self.assertEqual(
        cast_to_date("2019-03-01", StringType(), options=BASE_OPTIONS),
        datetime.date(2019, 3, 1))
def test_cast_year_month_as_string_to_date(self):
    self.assertEqual(
        cast_to_date("2019-02", StringType(), options=BASE_OPTIONS),
        datetime.date(2019, 2, 1))
def test_cast_weird_strings_to_date(self):
    # Mimic Spark behavior
    self.assertEqual(
        cast_to_date("2019-10-0001Tthis should be ignored",
                     StringType(),
                     options=BASE_OPTIONS),
        datetime.date(2019, 10, 1))
def test_cast_date_without_0_as_string_to_date(self):
    self.assertEqual(
        cast_to_date("2019-4-1", StringType(), options=BASE_OPTIONS),
        datetime.date(2019, 4, 1))
def test_cast_basic_string_to_timestamp(self):
    self.assertEqual(
        cast_to_timestamp("2019-10-01T05:40:36", StringType(), options=BASE_OPTIONS),
        datetime.datetime(2019, 10, 1, 5, 40, 36))
def test_cast_longer_tz_string_to_timestamp(self):
    self.assertEqual(
        cast_to_timestamp("2019-10-01T05:40:36+03:",
                          StringType(),
                          options=BASE_OPTIONS),
        datetime.datetime(2019, 10, 1, 3, 40, 36) + self.tz_diff)
def test_cast_date_string_to_timestamp(self):
    self.assertEqual(
        cast_to_timestamp("2019-10-01", StringType(), options=BASE_OPTIONS),
        datetime.datetime(2019, 10, 1, 0, 0, 0))
def eval(self, row, schema):
    value = self.column.cast(StringType()).eval(row, schema)
    if self.substr not in value[self.start:]:
        return 0
    return value.index(self.substr, self.start) + 1
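# Usage sketch of the same 1-based locate semantics on plain strings
# (names are hypothetical; 0 means "not found", matching the eval above):
def _locate(substr, value, start=0):
    if substr not in value[start:]:
        return 0
    return value.index(substr, start) + 1

assert _locate("bar", "foobarbar") == 4
assert _locate("bar", "foobarbar", start=4) == 7
assert _locate("baz", "foobarbar") == 0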
def test_cast_string_to_binary(self):
    self.assertEqual(
        cast_to_binary("test", StringType(), options=BASE_OPTIONS),
        bytearray(b'test'))
def eval(self, row, schema):
    value = self.column.cast(StringType()).eval(row, schema)
    delta = self.length - len(value)
    padding = (self.pad * delta)[:delta]  # Handle pad with multiple characters
    return "{0}{1}".format(padding, value)
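# Usage sketch of the same left-pad rule on plain strings: a multi-character
# pad is tiled and then cut to the gap (names are hypothetical):
def _lpad(value, length, pad):
    delta = length - len(value)
    return (pad * delta)[:delta] + value

assert _lpad("hi", 7, "ab") == "ababahi"
assert _lpad("hi", 1, "ab") == "hi"  # no padding when value is already longer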
def test_cast_hour_string_to_timestamp(self):
    today = datetime.date.today()
    self.assertEqual(
        cast_to_timestamp("10:", StringType(), options=BASE_OPTIONS),
        datetime.datetime(today.year, today.month, today.day, 10, 0, 0))