Exemple #1
0
    def eval(self, row, schema):
        """Return the number of months between the two timestamp columns.

        Both columns are cast to timestamps and evaluated on ``row``.

        :param row: row on which both columns are evaluated
        :param schema: schema forwarded to the column evaluation
        :return: ``None`` when either value is not a ``datetime.datetime``.
            Otherwise a float: a whole number of months when both values
            share the same day of month or are both the last day of their
            month (time of day is then ignored), else the month difference
            plus the day/time offset expressed as a fraction of a 31-day
            month. Rounded to 8 decimals when ``self.round_off`` is truthy.
        """
        # Local import: keeps the method self-contained whatever the
        # module-level imports are.
        import calendar

        value_1 = self.column1.cast(TimestampType()).eval(row, schema)
        value_2 = self.column2.cast(TimestampType()).eval(row, schema)

        if (not isinstance(value_1, datetime.datetime)
                or not isinstance(value_2, datetime.datetime)):
            return None

        # Compare the day against the length of its month instead of adding
        # one day to the value: the addition raises OverflowError for a
        # timestamp on 9999-12-31 (the last date datetime can represent).
        value_1_is_the_last_of_its_month = (
            value_1.day == calendar.monthrange(value_1.year, value_1.month)[1]
        )
        value_2_is_the_last_of_its_month = (
            value_2.day == calendar.monthrange(value_2.year, value_2.month)[1]
        )

        month_diff = ((value_1.year - value_2.year) * 12 +
                      (value_1.month - value_2.month))

        if value_1.day == value_2.day or (
                value_1_is_the_last_of_its_month and
                value_2_is_the_last_of_its_month
        ):
            # Special cases where the time of day is not considered
            diff = month_diff
        else:
            # Offset in days (time of day included), counted on 31-day months
            day_offset = (value_1.day - value_2.day +
                          (value_1.hour - value_2.hour) / 24 +
                          (value_1.minute - value_2.minute) / 1440 +
                          (value_1.second - value_2.second) / 86400)
            diff = month_diff + day_offset / 31
        if self.round_off:
            return float(round(diff, 8))
        return float(diff)
Exemple #2
0
 def test_cast_timestamp_to_decimal_with_scale(self):
     """A timestamp cast to DecimalType(precision=11, scale=1) gives the
     expected numeric value, shifted by ``self.tz_diff`` seconds."""
     moment = datetime.datetime(2019, 8, 28)
     source_type = TimestampType()
     target_type = DecimalType(precision=11, scale=1)

     actual = cast_to_decimal(moment,
                              source_type,
                              target_type,
                              options=BASE_OPTIONS)
     expected = 1566939600.0 + self.tz_diff.seconds

     self.assertEqual(actual, expected)
Exemple #3
0
 def eval(self, row, schema):
     """Evaluate the column as a timestamp and convert it to naive GMT.

     :param row: row on which the column is evaluated
     :param schema: schema forwarded to the column evaluation
     :return: the evaluated value unchanged when it is ``None`` or when
         ``self.pytz`` is ``None``; otherwise the value localized with
         ``self.pytz``, converted to ``GMT_TIMEZONE`` and stripped of its
         tzinfo.
     """
     value = self.column.cast(TimestampType()).eval(row, schema)
     # A None timestamp cannot be localized (localize would raise on it),
     # so it is propagated as-is.
     if value is None or self.pytz is None:
         return value
     local_date = self.pytz.localize(value)
     gmt_date = local_date.astimezone(GMT_TIMEZONE)
     # Drop the tzinfo so the result is a naive datetime again
     return gmt_date.replace(tzinfo=None)
Exemple #4
0
def guess_type_from_values_as_string(values, options):
    """Return the first candidate type into which every value can be cast.

    Candidates are tried in order: integer, long, decimal, double,
    timestamp, then string. A candidate is rejected as soon as a cast
    raises ``ValueError`` or returns ``None`` for a value that is neither
    ``"null"`` nor ``None``.

    :param values: string values to inspect
    :param options: options forwarded to ``get_caster``
    :raises AnalysisException: if no candidate type accepts all the values
    """
    # Reproduces inferences available in Spark
    # PartitioningUtils.inferPartitionColumnValue()
    # located in org.apache.spark.sql.execution.datasources
    candidates = (
        IntegerType(),
        LongType(),
        DecimalType(),
        DoubleType(),
        TimestampType(),
        StringType()
    )
    source_type = StringType()
    for candidate in candidates:
        cast = get_caster(from_type=source_type, to_type=candidate, options=options)
        try:
            # A None result is only acceptable for an explicit null value
            accepted = all(
                cast(item) is not None or item in ("null", None)
                for item in values
            )
        except ValueError:
            accepted = False
        if accepted:
            return candidate
    # Should never happen
    raise AnalysisException(
        "Unable to find a matching type for some fields, even StringType did not work"
    )
Exemple #5
0
 def test_cast_timestamp_to_float_without_jump_issue(self):
     """A timestamp cast to float gives the expected value."""
     # Spark's floats have precision issues while Python's do not, so
     # pysparkling and PySpark can behave differently.
     # The value used here is one Spark can represent exactly, hence
     # the behaviour is the same in pysparkling and PySpark.
     moment = datetime.datetime(2019, 8, 28, 0, 2, 40)
     source_type = TimestampType()

     actual = cast_to_float(moment, source_type, options=BASE_OPTIONS)

     self.assertEqual(actual, 1566943360.0)
Exemple #6
0
    def eval(self, row, schema):
        """Truncate the column, evaluated as a timestamp.

        Day-level truncation is tried first and time-level truncation is
        only attempted when the former gives a falsy result.

        :return: the first truthy truncation result, or ``None`` if both
            are falsy
        """
        moment = self.column.cast(TimestampType()).eval(row, schema)
        # `or` short-circuits: truncate_to_time is only called when
        # truncate_to_day returned a falsy value.
        return (self.truncate_to_day(moment)
                or self.truncate_to_time(moment)
                or None)
Exemple #7
0
 def test_cast_timestamp_to_string(self):
     """A timestamp cast to string is formatted as 'YYYY-MM-DD HH:MM:SS'."""
     moment = datetime.datetime(2019, 8, 28, 13, 5, 0)
     source_type = TimestampType()

     actual = cast_to_string(moment, source_type, options=BASE_OPTIONS)

     self.assertEqual(actual, "2019-08-28 13:05:00")
Exemple #8
0
 def test_cast_timestamp_to_decimal_with_too_small_precision(self):
     """A timestamp cast to DecimalType(precision=10, scale=1) gives None."""
     moment = datetime.datetime(2019, 8, 28)
     source_type = TimestampType()
     target_type = DecimalType(precision=10, scale=1)

     actual = cast_to_decimal(moment,
                              source_type,
                              target_type,
                              options=BASE_OPTIONS)

     self.assertEqual(actual, None)
Exemple #9
0
 def test_csv_read_with_inferred_schema(self):
     """Read the fundings CSV data with schema inference enabled and check
     the row count, the inferred schema and the collected rows."""
     reader = spark.read.option("inferSchema", True)
     data_folder = os.path.join(
         os.path.dirname(os.path.realpath(__file__)), "data/fundings/")
     df = reader.csv(data_folder, header=True)

     self.assertEqual(df.count(), 4)

     # Expected column names with the type expected for each of them
     typed_columns = (
         ("permalink", StringType),
         ("company", StringType),
         ("numEmps", IntegerType),
         ("category", StringType),
         ("city", StringType),
         ("state", StringType),
         ("fundedDate", TimestampType),
         ("raisedAmt", IntegerType),
         ("raisedCurrency", StringType),
         ("round", StringType),
     )
     inferred_schema = df.schema
     expected_schema = StructType([
         StructField(column_name, column_type())
         for column_name, column_type in typed_columns
     ])
     self.assertEqual(inferred_schema, expected_schema)

     collected_rows = [Row(**r.asDict()) for r in df.collect()]

     # Expected records, values listed in the same order as typed_columns
     column_names = [column_name for column_name, _ in typed_columns]
     records = (
         ('mycityfaces', 'MyCityFaces', 7, 'web', 'Scottsdale', 'AZ',
          datetime.datetime(2008, 1, 1, 0, 0), 50000, 'USD', 'seed'),
         ('flypaper', 'Flypaper', None, 'web', 'Phoenix', 'AZ',
          datetime.datetime(2008, 2, 1, 0, 0), 3000000, 'USD', 'a'),
         ('chosenlist-com', 'ChosenList.com', 5, 'web', 'Scottsdale', 'AZ',
          datetime.datetime(2008, 1, 25, 0, 0), 233750, 'USD', 'angel'),
         ('digg', 'Digg', 60, 'web', 'San Francisco', 'CA',
          datetime.datetime(2006, 12, 1, 0, 0), 8500000, 'USD', 'b'),
     )
     expected_rows = [
         Row(**dict(zip(column_names, record))) for record in records
     ]
     self.assertEqual(collected_rows, expected_rows)
Exemple #10
0
 def eval(self, row, schema):
     """Return the hour of the column evaluated as a timestamp.

     :param row: row on which the column is evaluated
     :param schema: schema forwarded to the column evaluation
     :return: the ``hour`` attribute of the evaluated timestamp, or
         ``None`` when the evaluation yields ``None``
     """
     value = self.column.cast(TimestampType()).eval(row, schema)
     # Propagate a missing timestamp instead of failing on `None.hour`
     if value is None:
         return None
     return value.hour
Exemple #11
0
 def eval(self, row, schema):
     """Evaluate the column as a timestamp and return it passed through
     ``self.formatter``."""
     as_timestamp = self.column.cast(TimestampType())
     moment = as_timestamp.eval(row, schema)
     return self.formatter(moment)
Exemple #12
0
 def test_cast_timestamp_to_decimal_without_scale(self):
     """A timestamp cast to a default DecimalType() gives the expected
     numeric value."""
     moment = datetime.datetime(2019, 8, 28)
     source_type = TimestampType()
     target_type = DecimalType()

     actual = cast_to_decimal(moment,
                              source_type,
                              target_type,
                              options=BASE_OPTIONS)

     self.assertEqual(actual, 1566943200.0)