def eval(self, row, schema):
    value_1 = self.column1.cast(TimestampType()).eval(row, schema)
    value_2 = self.column2.cast(TimestampType()).eval(row, schema)
    if (not isinstance(value_1, datetime.datetime)
            or not isinstance(value_2, datetime.datetime)):
        return None
    one_day = datetime.timedelta(days=1)
    value_1_is_the_last_of_its_month = (value_1.month != (value_1 + one_day).month)
    value_2_is_the_last_of_its_month = (value_2.month != (value_2 + one_day).month)
    if value_1.day == value_2.day or (
            value_1_is_the_last_of_its_month and value_2_is_the_last_of_its_month
    ):
        # Special cases where the time of day is not considered
        diff = ((value_1.year - value_2.year) * 12
                + (value_1.month - value_2.month))
    else:
        day_offset = (value_1.day - value_2.day
                      + (value_1.hour - value_2.hour) / 24
                      + (value_1.minute - value_2.minute) / 1440
                      + (value_1.second - value_2.second) / 86400)
        diff = ((value_1.year - value_2.year) * 12
                + (value_1.month - value_2.month)
                + day_offset / 31)
    if self.round_off:
        return float(round(diff, 8))
    return float(diff)
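# A minimal standalone sketch of the general-case arithmetic above (the
# end-of-month special cases are omitted); months_between_sketch is a
# hypothetical helper for illustration, not part of the library:
import datetime

def months_between_sketch(d1, d2):
    day_offset = (d1.day - d2.day
                  + (d1.hour - d2.hour) / 24
                  + (d1.minute - d2.minute) / 1440
                  + (d1.second - d2.second) / 86400)
    return (d1.year - d2.year) * 12 + (d1.month - d2.month) + day_offset / 31

# months_between_sketch(datetime.datetime(2019, 10, 16, 12, 0),
#                       datetime.datetime(2019, 9, 1))  # -> 1.5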
def test_cast_timestamp_to_decimal_with_scale(self):
    self.assertEqual(
        cast_to_decimal(
            datetime.datetime(2019, 8, 28),
            TimestampType(),
            DecimalType(precision=11, scale=1),
            options=BASE_OPTIONS
        ),
        1566939600.0 + self.tz_diff.seconds
    )
def eval(self, row, schema):
    value = self.column.cast(TimestampType()).eval(row, schema)
    if self.pytz is None:
        return value
    # Interpret the naive timestamp in the source timezone, convert it to GMT,
    # and return it as a naive datetime again.
    local_date = self.pytz.localize(value)
    gmt_date = local_date.astimezone(GMT_TIMEZONE)
    return gmt_date.replace(tzinfo=None)
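# A hedged standalone illustration of the same localize/astimezone round trip
# using pytz directly (the timezone name is chosen arbitrarily for the example):
import datetime
import pytz

naive = datetime.datetime(2019, 8, 28, 13, 5, 0)
local = pytz.timezone("America/New_York").localize(naive)
as_utc = local.astimezone(pytz.utc).replace(tzinfo=None)
# as_utc == datetime.datetime(2019, 8, 28, 17, 5)  (EDT is UTC-4 in August)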
def guess_type_from_values_as_string(values, options):
    # Reproduces inferences available in Spark
    # PartitioningUtils.inferPartitionColumnValue()
    # located in org.apache.spark.sql.execution.datasources
    tested_types = (
        IntegerType(),
        LongType(),
        DecimalType(),
        DoubleType(),
        TimestampType(),
        StringType()
    )
    string_type = StringType()
    for tested_type in tested_types:
        type_caster = get_caster(
            from_type=string_type, to_type=tested_type, options=options
        )
        try:
            for value in values:
                casted_value = type_caster(value)
                if casted_value is None and value not in ("null", None):
                    raise ValueError
            return tested_type
        except ValueError:
            pass
    # Should never happen
    raise AnalysisException(
        "Unable to find a matching type for some fields, even StringType did not work"
    )
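# A minimal sketch of the same "try increasingly general types" cascade, using
# plain Python casts instead of get_caster (all names here are hypothetical):
def guess_type_sketch(values):
    casters = (("integer", int), ("double", float), ("string", str))
    for name, caster in casters:
        try:
            for value in values:
                caster(value)
            return name
        except ValueError:
            continue

# guess_type_sketch(["1", "2"])    -> "integer"
# guess_type_sketch(["1", "2.5"])  -> "double"
# guess_type_sketch(["1", "abc"])  -> "string"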
def test_cast_timestamp_to_float_without_jump_issue(self):
    # Spark's floats have a precision issue.
    # As pysparkling uses Python, which does not have this issue,
    # there is a discrepancy in behaviours.
    # This test uses a value whose exact representation Spark can handle,
    # hence the behaviour is the same in pysparkling and PySpark.
    self.assertEqual(
        cast_to_float(
            datetime.datetime(2019, 8, 28, 0, 2, 40),
            TimestampType(),
            options=BASE_OPTIONS
        ),
        1566943360.0
    )
def eval(self, row, schema):
    value = self.column.cast(TimestampType()).eval(row, schema)
    # Try day-level truncation first, then time-level truncation;
    # None means the requested truncation level was not recognized.
    day_truncation = self.truncate_to_day(value)
    if day_truncation:
        return day_truncation
    time_truncated = self.truncate_to_time(value)
    if time_truncated:
        return time_truncated
    return None
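# A standalone sketch of what one truncation level can look like using
# datetime.replace (truncate_to_day/truncate_to_time above are the library's
# own helpers; this "month" example is only an assumption for illustration):
import datetime

def truncate_to_month_sketch(value):
    return value.replace(day=1, hour=0, minute=0, second=0, microsecond=0)

# truncate_to_month_sketch(datetime.datetime(2019, 8, 28, 13, 5))
# -> datetime.datetime(2019, 8, 1, 0, 0)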
def test_cast_timestamp_to_string(self):
    self.assertEqual(
        cast_to_string(
            datetime.datetime(2019, 8, 28, 13, 5, 0),
            TimestampType(),
            options=BASE_OPTIONS
        ),
        "2019-08-28 13:05:00"
    )
def test_cast_timestamp_to_decimal_with_too_small_precision(self):
    self.assertEqual(
        cast_to_decimal(
            datetime.datetime(2019, 8, 28),
            TimestampType(),
            DecimalType(precision=10, scale=1),
            options=BASE_OPTIONS
        ),
        None
    )
def test_csv_read_with_inferred_schema(self):
    df = spark.read.option("inferSchema", True).csv(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "data/fundings/"),
        header=True
    )
    self.assertEqual(df.count(), 4)
    self.assertEqual(
        df.schema,
        StructType([
            StructField("permalink", StringType()),
            StructField("company", StringType()),
            StructField("numEmps", IntegerType()),
            StructField("category", StringType()),
            StructField("city", StringType()),
            StructField("state", StringType()),
            StructField("fundedDate", TimestampType()),
            StructField("raisedAmt", IntegerType()),
            StructField("raisedCurrency", StringType()),
            StructField("round", StringType())
        ])
    )
    self.assertEqual(
        [Row(**r.asDict()) for r in df.collect()],
        [
            Row(permalink='mycityfaces', company='MyCityFaces', numEmps=7,
                category='web', city='Scottsdale', state='AZ',
                fundedDate=datetime.datetime(2008, 1, 1, 0, 0),
                raisedAmt=50000, raisedCurrency='USD', round='seed'),
            Row(permalink='flypaper', company='Flypaper', numEmps=None,
                category='web', city='Phoenix', state='AZ',
                fundedDate=datetime.datetime(2008, 2, 1, 0, 0),
                raisedAmt=3000000, raisedCurrency='USD', round='a'),
            Row(permalink='chosenlist-com', company='ChosenList.com', numEmps=5,
                category='web', city='Scottsdale', state='AZ',
                fundedDate=datetime.datetime(2008, 1, 25, 0, 0),
                raisedAmt=233750, raisedCurrency='USD', round='angel'),
            Row(permalink='digg', company='Digg', numEmps=60,
                category='web', city='San Francisco', state='CA',
                fundedDate=datetime.datetime(2006, 12, 1, 0, 0),
                raisedAmt=8500000, raisedCurrency='USD', round='b')
        ]
    )
def eval(self, row, schema):
    return self.column.cast(TimestampType()).eval(row, schema).hour
def eval(self, row, schema):
    timestamp = self.column.cast(TimestampType()).eval(row, schema)
    return self.formatter(timestamp)
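# A hedged sketch of the kind of callable self.formatter could be: the real
# formatter is built elsewhere from a Spark-style pattern, so this
# strftime-based stand-in is only an assumption for illustration:
import datetime

def example_formatter(ts):
    return ts.strftime("%Y/%m/%d")

# example_formatter(datetime.datetime(2019, 8, 28, 13, 5)) -> '2019/08/28'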
def test_cast_timestamp_to_decimal_without_scale(self):
    self.assertEqual(
        cast_to_decimal(
            datetime.datetime(2019, 8, 28),
            TimestampType(),
            DecimalType(),
            options=BASE_OPTIONS
        ),
        1566943200.0
    )