Beispiel #1
0
def guess_type_from_values_as_string(values, options):
    """Return the first candidate type that every string in ``values`` casts to.

    Reproduces inferences available in Spark
    PartitioningUtils.inferPartitionColumnValue()
    located in org.apache.spark.sql.execution.datasources

    Candidates are tried in order: integer, long, decimal, double, timestamp
    and finally string. A candidate is rejected when casting a value yields
    ``None`` although the value is neither ``None`` nor the string ``"null"``,
    or when the caster itself raises ``ValueError``.

    :param values: iterable of raw values to inspect
    :param options: options forwarded to ``get_caster``
    :raises AnalysisException: if no candidate type accepts all values
    """
    source_type = StringType()
    candidates = (
        IntegerType(),
        LongType(),
        DecimalType(),
        DoubleType(),
        TimestampType(),
        StringType(),
    )
    for candidate in candidates:
        cast = get_caster(from_type=source_type, to_type=candidate, options=options)
        try:
            for raw in values:
                # A None result for a non-null input means the cast failed
                if cast(raw) is None and raw not in ("null", None):
                    raise ValueError
        except ValueError:
            # This candidate does not fit, try the next one
            continue
        return candidate
    # Should never happen
    raise AnalysisException(
        "Unable to find a matching type for some fields, even StringType did not work"
    )
Beispiel #2
0
 def test_cast_map_to_map(self):
     """cast_to_map converts both keys and values to the target map type."""
     source_type = MapType(ByteType(), StringType())
     target_type = MapType(StringType(), FloatType())
     casted = cast_to_map({1: "1", 2: "2"},
                          source_type,
                          target_type,
                          options=BASE_OPTIONS)
     self.assertEqual(casted, {'1': 1.0, '2': 2.0})
Beispiel #3
0
 def test_session_create_data_frame_from_list_with_schema(self):
     """A list of tuples plus an explicit schema yields a matching DataFrame."""
     map_field = StructField("map", MapType(StringType(), IntegerType()), True)
     schema = StructType([map_field])
     rows = [({'a': 1}, )]

     df = self.spark.createDataFrame(rows, schema=schema)

     self.assertEqual(df.count(), 1)
     self.assertListEqual(df.collect(), [Row(map={'a': 1})])
     self.assertEqual(df.schema, schema)
Beispiel #4
0
    def test_session_create_data_frame_from_pandas_data_frame(self):
        """A pandas DataFrame with default column labels is converted into a
        DataFrame with three rows whose columns are named "0" and "1"."""
        try:
            # Pandas is an optional dependency
            # pylint: disable=import-outside-toplevel
            import pandas as pd
        except ImportError as e:
            raise ImportError("pandas is not importable") from e

        pdf = pd.DataFrame([(1, "one"), (2, "two"), (3, "three")])

        df = self.spark.createDataFrame(pdf)

        self.assertEqual(df.count(), 3)
        # Every expected row uses the two column names asserted in the schema
        # below ("0" and "1"). A key such as "2" would name a column that does
        # not exist; NOTE(review): such a typo presumably goes unnoticed
        # because Row comparison looks at values only - confirm.
        self.assertListEqual(df.collect(), [
            Row(**{
                "0": 1,
                "1": 'one'
            }),
            Row(**{
                "0": 2,
                "1": 'two'
            }),
            Row(**{
                "0": 3,
                "1": 'three'
            })
        ])
        self.assertEqual(
            df.schema,
            StructType([
                StructField("0", LongType(), True),
                StructField("1", StringType(), True)
            ]))
Beispiel #5
0
    def eval(self, row, schema):
        """Compute a 4-character code for the column's string value.

        The result is the upper-cased first character followed by up to three
        codes returned by ``self._encode``, right-padded with ``"0"``.
        ``None`` and ``""`` are returned unchanged, as is any value whose
        first character has no code.
        NOTE(review): this looks like a Soundex-style encoding - confirm.
        """
        text = self.column.cast(StringType()).eval(row, schema)

        if text is None:
            return None
        if text == "":
            return ""

        upper_text = text.upper()
        first_char = upper_text[0]

        previous = self._encode(first_char)
        if previous is None:
            # The first character cannot be encoded: give the input back
            return text

        pieces = [first_char]
        for char in upper_text:
            current = self._encode(char)
            # Characters without a code, or with code 7, are skipped and
            # do not reset the previously seen code
            if current is None or current == 7:
                continue
            if current != 0 and current != previous:
                pieces.append(str(current))
                if len(pieces) > 3:
                    break
            previous = current

        padded = "".join(pieces) + "000"
        return padded[:4]
Beispiel #6
0
 def eval(self, row, schema):
     """Convert the column's string value from ``self.from_base`` to the
     absolute value of ``self.to_base`` using ``self.convert``.

     ``positive_only`` is passed as true when ``self.to_base`` is positive.
     """
     digits = self.column.cast(StringType()).eval(row, schema)
     source_base = self.from_base
     target_base = abs(self.to_base)
     only_positive = self.to_base > 0
     return self.convert(digits,
                         source_base,
                         target_base,
                         positive_only=only_positive)
Beispiel #7
0
 def test_csv_read_with_given_schema(self):
     """Reading the fundings CSV data with an explicit schema yields rows
     whose values are typed according to that schema."""
     column_types = [
         ("permalink", StringType()),
         ("company", StringType()),
         ("numEmps", IntegerType()),
         ("category", StringType()),
         ("city", StringType()),
         ("state", StringType()),
         ("fundedDate", DateType()),
         ("raisedAmt", IntegerType()),
         ("raisedCurrency", StringType()),
         ("round", StringType()),
     ]
     schema = StructType([
         StructField(name, data_type) for name, data_type in column_types
     ])

     # The data folder lives next to this test file
     here = os.path.dirname(os.path.realpath(__file__))
     data_dir = os.path.join(here, "data/fundings/")
     df = spark.read.schema(schema).csv(data_dir, header=True)

     column_names = [name for name, _ in column_types]
     expected_values = [
         ('mycityfaces', 'MyCityFaces', 7, 'web', 'Scottsdale', 'AZ',
          datetime.date(2008, 1, 1), 50000, 'USD', 'seed'),
         ('flypaper', 'Flypaper', None, 'web', 'Phoenix', 'AZ',
          datetime.date(2008, 2, 1), 3000000, 'USD', 'a'),
         ('chosenlist-com', 'ChosenList.com', 5, 'web', 'Scottsdale', 'AZ',
          datetime.date(2008, 1, 25), 233750, 'USD', 'angel'),
         ('digg', 'Digg', 60, 'web', 'San Francisco', 'CA',
          datetime.date(2006, 12, 1), 8500000, 'USD', 'b'),
     ]
     expected = [
         Row(**dict(zip(column_names, values))) for values in expected_values
     ]

     actual = [Row(**r.asDict()) for r in df.collect()]
     self.assertEqual(actual, expected)
Beispiel #8
0
 def test_cast_map_to_string(self):
     """A nested map is rendered with "->" pairs; a None value renders as
     nothing after the arrow."""
     inner_map = collections.OrderedDict([("one", 1), ("nothing", None),
                                          ("three", 3)])
     map_type = MapType(BooleanType(), MapType(StringType(), IntegerType()))
     rendered = cast_to_string({True: inner_map},
                               map_type,
                               options=BASE_OPTIONS)
     self.assertEqual(rendered,
                      "[true -> [one -> 1, nothing ->, three -> 3]]")
Beispiel #9
0
def get_datetime_parser(java_time_format):
    """Return a one-argument callable that parses a timestamp string.

    - ``None``: the callable delegates to ``cast_to_timestamp`` with a string
      source type and empty options.
    - ``NO_TIMESTAMP_CONVERSION``: the callable always returns ``None``.
    - otherwise: the tokens found by ``JAVA_TIME_FORMAT_TOKENS`` are
      translated through ``FORMAT_MAPPING`` (unknown tokens are kept as is)
      and the resulting pattern is handed to ``datetime.datetime.strptime``.
    """
    if java_time_format is None:
        return lambda value: cast_to_timestamp(value, StringType(), {})

    if java_time_format is NO_TIMESTAMP_CONVERSION:
        return lambda value: None

    found_tokens = JAVA_TIME_FORMAT_TOKENS.findall(java_time_format)
    strptime_pattern = "".join(
        FORMAT_MAPPING.get(token, token) for token, _ in found_tokens
    )
    return lambda value: datetime.datetime.strptime(value, strptime_pattern)
Beispiel #10
0
 def test_cast_to_struct(self):
     """Casting a Row of strings to a struct with integer fields converts
     the matching values to ints."""
     source_row = Row(character='Alice', day='28', month='8', year='2019')
     string_struct = StructType(fields=[
         StructField("character", StringType()),
         StructField("day", StringType()),
         StructField("month", StringType()),
         StructField("year", StringType()),
     ])
     typed_struct = StructType(fields=[
         StructField("character", StringType()),
         StructField("day", IntegerType()),
         StructField("month", IntegerType()),
         StructField("year", IntegerType()),
     ])

     casted = cast_to_struct(source_row,
                             from_type=string_struct,
                             to_type=typed_struct,
                             options=BASE_OPTIONS)

     expected = Row(character='Alice', day=28, month=8, year=2019)
     self.assertEqual(casted, expected)
Beispiel #11
0
 def test_cast_row_to_string(self):
     """A Row holding a nested map, a None, a boolean and a double is
     rendered as a bracketed, comma-separated string."""
     nested_map = collections.OrderedDict([("value", None), ("b", {"c": 7})])
     row = Row(a=nested_map, b=None, c=True, d=5.2)
     map_of_maps = MapType(StringType(),
                           MapType(StringType(), LongType(), True),
                           True)
     row_type = StructType([
         StructField("a", map_of_maps, True),
         StructField("b", LongType(), True),
         StructField("c", BooleanType(), True),
         StructField("d", DoubleType(), True)
     ])

     rendered = cast_to_string(row, row_type, options=BASE_OPTIONS)

     self.assertEqual(rendered, "[[value ->, b -> [c -> 7]],, true, 5.2]")
Beispiel #12
0
 def test_session_create_data_frame_from_list(self):
     """Without a schema, tuples become rows with columns "_1" and "_2"
     typed as long and string."""
     records = [
         (1, "one"),
         (2, "two"),
         (3, "three"),
     ]
     df = self.spark.createDataFrame(records)

     self.assertEqual(df.count(), 3)

     expected_rows = [
         Row(_1=1, _2='one'),
         Row(_1=2, _2='two'),
         Row(_1=3, _2='three'),
     ]
     self.assertListEqual(df.collect(), expected_rows)

     expected_schema = StructType([
         StructField("_1", LongType(), True),
         StructField("_2", StringType(), True),
     ])
     self.assertEqual(df.schema, expected_schema)
Beispiel #13
0
    def read(self):
        """Parse the files under ``self.paths`` and return them as a
        ``DataFrameInternal``.

        The data schema is ``self.schema`` when given, otherwise it is
        guessed from the string values when ``self.options.inferSchema`` is
        set, otherwise it is inferred from the parsed RDD. Rows are parsed as
        strings and then cast to the final schema.
        """
        context = self.spark._sc
        paths = self.paths

        partitions, partition_schema = resolve_partitions(paths)

        # One RDD partition per resolved file
        file_names = sorted(partitions.keys())
        parse_file = partial(parse_csv_file, partitions, partition_schema,
                             self.schema, self.options)
        rows = context.parallelize(file_names, len(partitions)).flatMap(parse_file)

        if self.schema is not None:
            data_schema = self.schema
        elif self.options.inferSchema:
            field_names = rows.take(1)[0].__fields__
            data_schema = guess_schema_from_strings(field_names,
                                                    rows.collect(),
                                                    options=self.options)
        else:
            data_schema = infer_schema_from_rdd(rows)

        # Schema with the same field names where every column is a string,
        # used as the source type of the cast below
        all_strings_schema = StructType(fields=[
            StructField(data_field.name, StringType())
            for data_field in data_schema.fields
        ])

        if not partition_schema:
            target_schema = data_schema
        else:
            # Replace the trailing fields with the partition fields
            partition_fields = partition_schema.fields
            kept_fields = data_schema.fields[:-len(partition_fields)]
            target_schema = StructType(kept_fields + partition_fields)

        row_caster = get_caster(from_type=all_strings_schema,
                                to_type=target_schema,
                                options=self.options)
        typed_rows = rows.map(row_caster)
        typed_rows._name = paths

        return DataFrameInternal(context, typed_rows, schema=target_schema)
Beispiel #14
0
 def eval(self, row, schema):
     """Return the index of ``self.substr`` within the column's string value.

     The column is cast to a string before searching.

     :return: ``None`` when the value is ``None``, ``0`` when the substring
         is not found, otherwise the index reported by ``str.index``.

     NOTE(review): the index returned on success is 0-based, so a match at
     the very start is indistinguishable from "not found"; Spark's ``instr``
     is 1-based - confirm which convention callers expect.
     """
     value = self.column.cast(StringType()).eval(row, schema)
     if value is None:
         return None
     try:
         return value.index(self.substr)
     except ValueError:
         # str.index reports a missing substring with ValueError
         # (IndexError is never raised here)
         return 0
Beispiel #15
0
 def eval(self, row, schema):
     """Return ``levenshtein_distance`` of the two columns' string values,
     or ``None`` if either value is ``None``."""
     left = self.column1.cast(StringType()).eval(row, schema)
     right = self.column2.cast(StringType()).eval(row, schema)
     if left is not None and right is not None:
         return levenshtein_distance(left, right)
     return None
Beispiel #16
0
 def eval(self, row, schema):
     """Apply ``self.translation_table`` to the column's string value."""
     text = self.column.cast(StringType()).eval(row, schema)
     return text.translate(self.translation_table)
Beispiel #17
0
 def eval(self, row, schema):
     """Capitalize each whitespace-separated word of the column's string
     value and join the words with single spaces."""
     text = self.column.cast(StringType()).eval(row, schema)
     capitalized_words = [word.capitalize() for word in text.split()]
     return " ".join(capitalized_words)
Beispiel #18
0
 def test_cast_array_to_array(self):
     """Casting an array of bytes to an array of strings converts each
     element and keeps None elements as None."""
     source_type = ArrayType(ByteType())
     target_type = ArrayType(StringType())
     casted = cast_to_array([1, 2, None, 4],
                            source_type,
                            target_type,
                            options=BASE_OPTIONS)
     self.assertEqual(casted, ['1', '2', None, '4'])
Beispiel #19
0
 def eval(self, row, schema):
     """Return the column's string value repeated ``self.n`` times."""
     text = self.column.cast(StringType()).eval(row, schema)
     repeated = text * self.n
     return repeated
Beispiel #20
0
 def test_cast_date_as_string_to_date(self):
     """A "YYYY-MM-DD" string is cast to the corresponding date."""
     casted = cast_to_date("2019-03-01", StringType(), options=BASE_OPTIONS)
     expected = datetime.date(2019, 3, 1)
     self.assertEqual(casted, expected)
Beispiel #21
0
 def test_cast_year_month_as_string_to_date(self):
     """A "YYYY-MM" string is cast to the first day of that month."""
     casted = cast_to_date("2019-02", StringType(), options=BASE_OPTIONS)
     expected = datetime.date(2019, 2, 1)
     self.assertEqual(casted, expected)
Beispiel #22
0
 def test_cast_weird_strings_to_date(self):
     """A date followed by trailing text after "T" still casts to the date."""
     # Mimic Spark behavior
     weird_input = "2019-10-0001Tthis should be ignored"
     casted = cast_to_date(weird_input, StringType(), options=BASE_OPTIONS)
     expected = datetime.date(2019, 10, 1)
     self.assertEqual(casted, expected)
Beispiel #23
0
 def test_cast_date_without_0_as_string_to_date(self):
     """Month and day without zero padding are accepted."""
     casted = cast_to_date("2019-4-1", StringType(), options=BASE_OPTIONS)
     expected = datetime.date(2019, 4, 1)
     self.assertEqual(casted, expected)
Beispiel #24
0
 def test_cast_basic_string_to_timestamp(self):
     """A "YYYY-MM-DDTHH:MM:SS" string is cast to the matching datetime."""
     casted = cast_to_timestamp("2019-10-01T05:40:36",
                                StringType(),
                                options=BASE_OPTIONS)
     expected = datetime.datetime(2019, 10, 1, 5, 40, 36)
     self.assertEqual(casted, expected)
Beispiel #25
0
 def test_cast_longer_tz_string_to_timestamp(self):
     """A timestamp string ending in "+03:" is cast and compared against
     the expected value shifted by ``self.tz_diff``."""
     casted = cast_to_timestamp("2019-10-01T05:40:36+03:",
                                StringType(),
                                options=BASE_OPTIONS)
     expected = datetime.datetime(2019, 10, 1, 3, 40, 36) + self.tz_diff
     self.assertEqual(casted, expected)
Beispiel #26
0
 def test_cast_date_string_to_timestamp(self):
     """A date-only string is cast to midnight of that day."""
     casted = cast_to_timestamp("2019-10-01",
                                StringType(),
                                options=BASE_OPTIONS)
     expected = datetime.datetime(2019, 10, 1, 0, 0, 0)
     self.assertEqual(casted, expected)
Beispiel #27
0
 def eval(self, row, schema):
     """Return the 1-based position of ``self.substr`` in the column's
     string value, searching from index ``self.start``; 0 if it does not
     occur in that part of the string."""
     text = self.column.cast(StringType()).eval(row, schema)
     if self.substr in text[self.start:]:
         # Shift the 0-based index so that 0 can signal "not found"
         zero_based = text.index(self.substr, self.start)
         return zero_based + 1
     return 0
Beispiel #28
0
 def test_cast_string_to_binary(self):
     """A string is cast to a bytearray holding its bytes."""
     casted = cast_to_binary("test", StringType(), options=BASE_OPTIONS)
     expected = bytearray(b'test')
     self.assertEqual(casted, expected)
Beispiel #29
0
 def eval(self, row, schema):
     """Left-pad the column's string value with ``self.pad`` up to
     ``self.length`` characters.

     :return: ``None`` when the value is ``None``. When the value is already
         at least ``self.length`` characters long it is shortened to
         ``self.length`` characters (a negative length is treated as 0),
         which is what Spark's ``lpad`` documents. Otherwise the value is
         returned prefixed with as much of the repeated pad as is needed.
     """
     value = self.column.cast(StringType()).eval(row, schema)
     if value is None:
         return None

     # A negative slice bound would strip characters from the end instead
     # of producing an empty string, so clamp the target length at 0
     target_length = max(self.length, 0)
     missing = target_length - len(value)
     if missing <= 0:
         return value[:target_length]

     padding = (self.pad *
                missing)[:missing]  # Handle pad with multiple characters
     return padding + value
Beispiel #30
0
    def test_cast_hour_string_to_timestamp(self):
        """An hour-only string ("10:") is cast to that hour on today's date."""
        current_day = datetime.date.today()

        casted = cast_to_timestamp("10:", StringType(), options=BASE_OPTIONS)
        expected = datetime.datetime(current_day.year,
                                     current_day.month,
                                     current_day.day,
                                     10, 0, 0)

        self.assertEqual(casted, expected)