Example #1
0
    def eval(self, row, schema):
        """Return the (possibly fractional) number of months between the
        two columns, cast to timestamps.

        Whole months come from the year/month difference.  The time of
        day contributes a fraction of a 31-day month, except in two
        special cases where it is not considered: both dates fall on the
        same day of the month, or both fall on the last day of their
        respective months.  Returns None when either value does not cast
        to a datetime.  When ``self.round_off`` is set, the result is
        rounded to 8 decimal places.
        """
        ts_1 = self.column1.cast(TimestampType()).eval(row, schema)
        ts_2 = self.column2.cast(TimestampType()).eval(row, schema)

        if not (isinstance(ts_1, datetime.datetime)
                and isinstance(ts_2, datetime.datetime)):
            return None

        def is_last_day_of_month(ts):
            # Adding one day changes the month only on the month's last day.
            return (ts + datetime.timedelta(days=1)).month != ts.month

        whole_months = (ts_1.year - ts_2.year) * 12 + (ts_1.month - ts_2.month)

        same_day_of_month = ts_1.day == ts_2.day
        both_month_ends = (is_last_day_of_month(ts_1)
                           and is_last_day_of_month(ts_2))
        if same_day_of_month or both_month_ends:
            # Special cases where time of day is not considered.
            diff = whole_months
        else:
            # Keep the exact term order of the float additions so results
            # match bit-for-bit.
            day_offset = (ts_1.day - ts_2.day
                          + (ts_1.hour - ts_2.hour) / 24
                          + (ts_1.minute - ts_2.minute) / 1440
                          + (ts_1.second - ts_2.second) / 86400)
            diff = whole_months + day_offset / 31
        if self.round_off:
            return float(round(diff, 8))
        return float(diff)
Example #2
0
 def eval(self, row, schema):
     """Convert the column's timestamp from ``self.pytz``'s zone to GMT.

     Returns the value unchanged when no timezone is configured;
     otherwise localizes it, converts to GMT, and strips the tzinfo so
     the result is again a naive datetime.
     """
     timestamp = self.column.cast(TimestampType()).eval(row, schema)
     if self.pytz is None:
         return timestamp
     aware_local = self.pytz.localize(timestamp)
     in_gmt = aware_local.astimezone(GMT_TIMEZONE)
     return in_gmt.replace(tzinfo=None)
def guess_type_from_values_as_string(values, options):
    """Infer the narrowest Spark type that can hold every string value.

    Reproduces inferences available in Spark
    PartitioningUtils.inferPartitionColumnValue()
    located in org.apache.spark.sql.execution.datasources

    Candidate types are tried from narrowest to widest; the first one
    whose caster succeeds on all values is returned.  StringType is the
    last candidate, so the final AnalysisException should be unreachable.
    """
    candidates = (
        IntegerType(),
        LongType(),
        DecimalType(),
        DoubleType(),
        TimestampType(),
        StringType(),
    )
    source_type = StringType()
    for candidate in candidates:
        caster = get_caster(from_type=source_type,
                            to_type=candidate,
                            options=options)
        try:
            for raw_value in values:
                # A non-null input that casts to None disqualifies this type.
                if caster(raw_value) is None and raw_value not in ('null', None):
                    raise ValueError
            return candidate
        except ValueError:
            # Fall through to the next, wider candidate.
            pass
    # Should never happen
    raise AnalysisException(
        'Unable to find a matching type for some fields, even StringType did not work'
    )
 def test_cast_timestamp_to_string(self):
     """Casting a timestamp to string yields 'YYYY-MM-DD HH:MM:SS'."""
     timestamp = datetime.datetime(2019, 8, 28, 13, 5, 0)
     result = cast_to_string(timestamp, TimestampType(), options=BASE_OPTIONS)
     self.assertEqual(result, '2019-08-28 13:05:00')
 def test_cast_timestamp_to_decimal_with_scale(self):
     """A precision-11, scale-1 decimal is wide enough for this epoch value."""
     result = cast_to_decimal(
         datetime.datetime(2019, 8, 28),
         TimestampType(),
         DecimalType(precision=11, scale=1),
         options=BASE_OPTIONS,
     )
     self.assertEqual(result, 1566943200.0)
 def test_cast_timestamp_to_decimal_with_too_small_precision(self):
     """Casting returns None when the decimal's precision cannot fit the value."""
     result = cast_to_decimal(
         datetime.datetime(2019, 8, 28),
         TimestampType(),
         DecimalType(precision=10, scale=1),
         options=BASE_OPTIONS,
     )
     self.assertEqual(result, None)
Example #7
0
    def eval(self, row, schema):
        """Truncate the column's timestamp, trying day-level units first.

        Falls back to time-level truncation when day-level truncation
        yields nothing, and returns None when neither applies.
        """
        timestamp = self.column.cast(TimestampType()).eval(row, schema)
        # `or` chains mirror the original truthiness checks exactly.
        return (self.truncate_to_day(timestamp)
                or self.truncate_to_time(timestamp)
                or None)
 def test_cast_timestamp_to_float_without_jump_issue(self):
     """Casting a timestamp to float matches PySpark for exactly-representable values.

     Spark's floats have precision issues which pure Python does not
     share, so behaviours diverge in general.  This value is exactly
     representable on the Spark side, hence fast_pyspark_tester and
     PySpark agree here.
     """
     timestamp = datetime.datetime(2019, 8, 28, 0, 2, 40)
     result = cast_to_float(timestamp, TimestampType(), options=BASE_OPTIONS)
     self.assertEqual(result, 1566943360.0)
Example #9
0
 def eval(self, row, schema):
     """Extract the minute component of the column cast to a timestamp."""
     as_timestamp = self.column.cast(TimestampType()).eval(row, schema)
     return as_timestamp.minute
Example #10
0
 def eval(self, row, schema):
     """Format the column's value, cast to a timestamp, with the bound formatter."""
     return self.formatter(self.column.cast(TimestampType()).eval(row, schema))
Example #11
0
 def test_csv_read_with_inferred_schema(self):
     """Reading a CSV with inferSchema detects int/timestamp/string columns."""
     data_path = os.path.join(
         os.path.dirname(os.path.realpath(__file__)), 'data/fundings/')
     df = spark.read.option('inferSchema', True).csv(data_path, header=True)
     self.assertEqual(df.count(), 4)
     # Expected schema as (column name, inferred type) pairs.
     expected_fields = [
         ('permalink', StringType()),
         ('company', StringType()),
         ('numEmps', IntegerType()),
         ('category', StringType()),
         ('city', StringType()),
         ('state', StringType()),
         ('fundedDate', TimestampType()),
         ('raisedAmt', IntegerType()),
         ('raisedCurrency', StringType()),
         ('round', StringType()),
     ]
     self.assertEqual(
         df.schema,
         StructType([
             StructField(name, data_type)
             for name, data_type in expected_fields
         ]),
     )
     # Expected row contents; dict insertion order fixes the Row field order.
     expected_rows = [
         dict(permalink='mycityfaces', company='MyCityFaces', numEmps=7,
              category='web', city='Scottsdale', state='AZ',
              fundedDate=datetime.datetime(2008, 1, 1, 0, 0),
              raisedAmt=50000, raisedCurrency='USD', round='seed'),
         dict(permalink='flypaper', company='Flypaper', numEmps=None,
              category='web', city='Phoenix', state='AZ',
              fundedDate=datetime.datetime(2008, 2, 1, 0, 0),
              raisedAmt=3000000, raisedCurrency='USD', round='a'),
         dict(permalink='chosenlist-com', company='ChosenList.com', numEmps=5,
              category='web', city='Scottsdale', state='AZ',
              fundedDate=datetime.datetime(2008, 1, 25, 0, 0),
              raisedAmt=233750, raisedCurrency='USD', round='angel'),
         dict(permalink='digg', company='Digg', numEmps=60,
              category='web', city='San Francisco', state='CA',
              fundedDate=datetime.datetime(2006, 12, 1, 0, 0),
              raisedAmt=8500000, raisedCurrency='USD', round='b'),
     ]
     self.assertEqual(
         [Row(**r.asDict()) for r in df.collect()],
         [Row(**fields) for fields in expected_rows],
     )