def main(inputs, output):
    # main logic starts here
    comments_schema = types.StructType([  # commented-out fields won't be read
        types.StructField('archived', types.BooleanType(), True),
        types.StructField('author', types.StringType(), True),
        types.StructField('author_flair_css_class', types.StringType(), True),
        types.StructField('author_flair_text', types.StringType(), True),
        types.StructField('body', types.StringType(), True),
        types.StructField('controversiality', types.LongType(), True),
        types.StructField('created_utc', types.StringType(), True),
        types.StructField('distinguished', types.StringType(), True),
        types.StructField('downs', types.LongType(), True),
        types.StructField('edited', types.StringType(), True),
        types.StructField('gilded', types.LongType(), True),
        types.StructField('id', types.StringType(), True),
        types.StructField('link_id', types.StringType(), True),
        types.StructField('name', types.StringType(), True),
        types.StructField('parent_id', types.StringType(), True),
        types.StructField('retrieved_on', types.LongType(), True),
        types.StructField('score', types.LongType(), True),
        types.StructField('score_hidden', types.BooleanType(), True),
        types.StructField('subreddit', types.StringType(), True),
        types.StructField('subreddit_id', types.StringType(), True),
        types.StructField('ups', types.LongType(), True),
        #types.StructField('year', types.IntegerType(), False),
        #types.StructField('month', types.IntegerType(), False),
    ])
    comments = spark.read.json(inputs, schema=comments_schema)
    average_func = {'score': 'avg'}
    comments_average = comments.groupby(
        comments['subreddit']).agg(average_func)
    averages = comments_average.sort('subreddit', ascending=True)
    averages.write.csv(output, mode='overwrite')
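The `main(inputs, output)` snippets in these examples assume a module-level SparkSession named `spark` plus the usual PySpark imports. A minimal sketch of that assumed boilerplate follows; the application name and argument handling are illustrative, not taken from the original sources.

import sys
from pyspark.sql import SparkSession, functions, types

# Assumed module-level session used as `spark` inside main(); the app name is a placeholder.
spark = SparkSession.builder.appName('reddit averages').getOrCreate()

if __name__ == '__main__':
    inputs = sys.argv[1]   # directory of JSON comment files
    output = sys.argv[2]   # output directory for the results
    main(inputs, output)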
Example 2
def main(inputs, output):

    # main logic starts here

    comments_schema = types.StructType([
        types.StructField('archived', types.BooleanType(), True),
        types.StructField('author', types.StringType(), True),
        types.StructField('author_flair_css_class', types.StringType(), True),
        types.StructField('author_flair_text', types.StringType(), True),
        types.StructField('body', types.StringType(), True),
        types.StructField('controversiality', types.LongType(), True),
        types.StructField('created_utc', types.StringType(), True),
        types.StructField('distinguished', types.StringType(), True),
        types.StructField('downs', types.LongType(), True),
        types.StructField('edited', types.StringType(), True),
        types.StructField('gilded', types.LongType(), True),
        types.StructField('id', types.StringType(), True),
        types.StructField('link_id', types.StringType(), True),
        types.StructField('name', types.StringType(), True),
        types.StructField('parent_id', types.StringType(), True),
        types.StructField('retrieved_on', types.LongType(), True),
        types.StructField('score', types.LongType(), True),
        types.StructField('score_hidden', types.BooleanType(), True),
        types.StructField('subreddit', types.StringType(), True),
        types.StructField('subreddit_id', types.StringType(), True),
        types.StructField('ups', types.LongType(), True),
    ])

    comments = spark.read.json(inputs, schema=comments_schema)
    averages = comments.groupby('subreddit').agg(
        functions.avg(comments['score']))
    averages.show()
    averages.write.csv(output, mode='overwrite')
    averages.explain()
Example 3
def main(inputs, output):
    comments_schema = types.StructType([
        types.StructField('archived', types.BooleanType()),
        types.StructField('author', types.StringType()),
        types.StructField('author_flair_css_class', types.StringType()),
        types.StructField('author_flair_text', types.StringType()),
        types.StructField('body', types.StringType()),
        types.StructField('controversiality', types.LongType()),
        types.StructField('created_utc', types.StringType()),
        types.StructField('distinguished', types.StringType()),
        types.StructField('downs', types.LongType()),
        types.StructField('edited', types.StringType()),
        types.StructField('gilded', types.LongType()),
        types.StructField('id', types.StringType()),
        types.StructField('link_id', types.StringType()),
        types.StructField('name', types.StringType()),
        types.StructField('parent_id', types.StringType()),
        types.StructField('retrieved_on', types.LongType()),
        types.StructField('score', types.LongType()),
        types.StructField('score_hidden', types.BooleanType()),
        types.StructField('subreddit', types.StringType()),
        types.StructField('subreddit_id', types.StringType()),
        types.StructField('ups', types.LongType()),
        # types.StructField('year', types.IntegerType()),
        # types.StructField('month', types.IntegerType()),
    ])

    df = spark.read.json(inputs, schema=comments_schema)
    averages = df.groupBy(df['subreddit']).agg(
        functions.avg(df['score']).alias('average_score'))
    averages.write.csv(output, mode='overwrite')
    averages.explain()
Example 4
def main(inputs, output):
    # main logic starts here
    comments_schema = types.StructType([  # commented-out fields won't be read
        types.StructField('archived', types.BooleanType(), True),
        types.StructField('author', types.StringType(), True),
        types.StructField('author_flair_css_class', types.StringType(), True),
        types.StructField('author_flair_text', types.StringType(), True),
        types.StructField('body', types.StringType(), True),
        types.StructField('controversiality', types.LongType(), True),
        types.StructField('created_utc', types.StringType(), True),
        types.StructField('distinguished', types.StringType(), True),
        types.StructField('downs', types.LongType(), True),
        types.StructField('edited', types.StringType(), True),
        types.StructField('gilded', types.LongType(), True),
        types.StructField('id', types.StringType(), True),
        types.StructField('link_id', types.StringType(), True),
        types.StructField('name', types.StringType(), True),
        types.StructField('parent_id', types.StringType(), True),
        types.StructField('retrieved_on', types.LongType(), True),
        types.StructField('score', types.LongType(), True),
        types.StructField('score_hidden', types.BooleanType(), True),
        types.StructField('subreddit', types.StringType(), True),
        types.StructField('subreddit_id', types.StringType(), True),
        types.StructField('ups', types.LongType(), True),
    ])
    comments = spark.read.json(
        inputs, schema=comments_schema)  #read input file into a dataframe
    subreddit_averages = comments.groupby('subreddit').avg(
        'score')  #calculate average for each subreddit
    subreddit_averages.write.csv(output, mode='overwrite')
Example 5
    def getRedditDataFrameSchema(self):
        return tp.StructType([
            tp.StructField('show_title', tp.StringType(), True),
            tp.StructField('show_director', tp.StringType(), True),
            tp.StructField('submission_id', tp.StringType(), True),
            tp.StructField('source', tp.StringType(), True),
            tp.StructField('title', tp.StringType(), True),
            tp.StructField('description', tp.StringType(), True),
            tp.StructField('created_utc', tp.TimestampType(), True),
            tp.StructField('author', tp.StringType(), True),
            tp.StructField('score', tp.IntegerType(), True),
            tp.StructField('spoiler', tp.BooleanType(), True),
            tp.StructField('is_original_content', tp.BooleanType(), True),
            tp.StructField('distinguished', tp.StringType(), True),
            tp.StructField('link', tp.StringType(), True),
            tp.StructField(
                'comments',
                tp.ArrayType(
                    tp.StructType([
                        tp.StructField('comment_id', tp.StringType(), True),
                        tp.StructField('body', tp.StringType(), True),
                        tp.StructField('created_utc', tp.TimestampType(), True),
                        tp.StructField('score', tp.IntegerType(), True),
                        tp.StructField('parent_id', tp.StringType(), True),
                        tp.StructField('submission_id', tp.StringType(), True)
                    ])), True)
        ])
Example 6
def main(inputs, output):
    # main logic starts here
    comments_schema = types.StructType([
        types.StructField('archived', types.BooleanType()),
        types.StructField('author', types.StringType()),
        types.StructField('author_flair_css_class', types.StringType()),
        types.StructField('author_flair_text', types.StringType()),
        types.StructField('body', types.StringType()),
        types.StructField('controversiality', types.LongType()),
        types.StructField('created_utc', types.StringType()),
        types.StructField('distinguished', types.StringType()),
        types.StructField('downs', types.LongType()),
        types.StructField('edited', types.StringType()),
        types.StructField('gilded', types.LongType()),
        types.StructField('id', types.StringType()),
        types.StructField('link_id', types.StringType()),
        types.StructField('name', types.StringType()),
        types.StructField('parent_id', types.StringType()),
        types.StructField('retrieved_on', types.LongType()),
        types.StructField('score', types.LongType()),
        types.StructField('score_hidden', types.BooleanType()),
        types.StructField('subreddit', types.StringType()),
        types.StructField('subreddit_id', types.StringType()),
        types.StructField('ups', types.LongType()),
    ])
    #reading data from json
    commentsData = spark.read.json(inputs, schema=comments_schema)
    #grouping by subreddit and calculating avg score
    averages = commentsData.groupBy('subreddit').agg(
        functions.avg(commentsData['score']))
    #writing output in csv file
    averages.write.csv(output, mode='overwrite')
    averages.explain()
Example 7
def main(inputs, output):
    # main logic starts here
    comments_schema = types.StructType([  # commented-out fields won't be read
        types.StructField('archived', types.BooleanType(), True),
        types.StructField('author', types.StringType(), True),
        types.StructField('author_flair_css_class', types.StringType(), True),
        types.StructField('author_flair_text', types.StringType(), True),
        types.StructField('body', types.StringType(), True),
        types.StructField('controversiality', types.LongType(), True),
        types.StructField('created_utc', types.StringType(), True),
        types.StructField('distinguished', types.StringType(), True),
        types.StructField('downs', types.LongType(), True),
        types.StructField('edited', types.StringType(), True),
        types.StructField('gilded', types.LongType(), True),
        types.StructField('id', types.StringType(), True),
        types.StructField('link_id', types.StringType(), True),
        types.StructField('name', types.StringType(), True),
        types.StructField('parent_id', types.StringType(), True),
        types.StructField('retrieved_on', types.LongType(), True),
        types.StructField('score', types.LongType(), True),
        types.StructField('score_hidden', types.BooleanType(), True),
        types.StructField('subreddit', types.StringType(), True),
        types.StructField('subreddit_id', types.StringType(), True),
        types.StructField('ups', types.LongType(), True),
        #types.StructField('year', types.IntegerType(), False),
        #types.StructField('month', types.IntegerType(), False),
    ])
    comments = spark.read.json(inputs, schema=comments_schema)
    find_avg = comments.groupBy((comments.subreddit).alias("Subreddit")).agg(
        avg(comments.score).alias("Average"))
    averages = find_avg.orderBy(asc("Subreddit")).coalesce(1)
    averages.write.csv(output, mode='overwrite')
Example 8
def df_regex_make(wikiqtsv):
    # make wikiq tsv into a dataframe
    tsv2df = reader.csv(wikiqtsv,
                        sep="\t",
                        inferSchema=False,
                        header=True,
                        mode="PERMISSIVE",
                        quote="")
    #tsv2df = tsv2df.repartition(args.num_partitions)

    # basic structure
    struct = types.StructType().add("anon",types.StringType(),True)
    struct = struct.add("articleid",types.LongType(),True)
    struct = struct.add("date_time",types.TimestampType(), True)
    struct = struct.add("deleted",types.BooleanType(), True)
    struct = struct.add("editor",types.StringType(),True)
    struct = struct.add("editor_id",types.LongType(), True)
    struct = struct.add("minor", types.BooleanType(), True)
    struct = struct.add("namespace", types.LongType(), True)
    struct = struct.add("revert", types.BooleanType(), True)
    struct = struct.add("reverteds", types.StringType(), True)
    struct = struct.add("revid", types.LongType(), True)
    struct = struct.add("sha1", types.StringType(), True)
    struct = struct.add("text_chars", types.LongType(), True)
    struct = struct.add("title",types.StringType(), True)

    # structure the df to get the dataframe with columns of metadata and regexes
    regex_one_df = df_structurize(tsv2df,struct)

    return regex_one_df
Example 9
def main(inputs, output):
    comments_schema = types.StructType([
        types.StructField('archived', types.BooleanType(), True),
        types.StructField('author', types.StringType(), True),
        types.StructField('author_flair_css_class', types.StringType(), True),
        types.StructField('author_flair_text', types.StringType(), True),
        types.StructField('body', types.StringType(), True),
        types.StructField('controversiality', types.LongType(), True),
        types.StructField('created_utc', types.StringType(), True),
        types.StructField('distinguished', types.StringType(), True),
        types.StructField('downs', types.LongType(), True),
        types.StructField('edited', types.StringType(), True),
        types.StructField('gilded', types.LongType(), True),
        types.StructField('id', types.StringType(), True),
        types.StructField('link_id', types.StringType(), True),
        types.StructField('name', types.StringType(), True),
        types.StructField('parent_id', types.StringType(), True),
        types.StructField('retrieved_on', types.LongType(), True),
        types.StructField('score', types.LongType(), True),
        types.StructField('score_hidden', types.BooleanType(), True),
        types.StructField('subreddit', types.StringType(), True),
        types.StructField('subreddit_id', types.StringType(), True),
        types.StructField('ups', types.LongType(), True)
    ])

    comments = spark.read.json(inputs, schema=comments_schema)
    comments_avg = comments.groupBy('subreddit').avg('score')
    comments_avg.explain()
    comments_avg.write.json(output, mode='overwrite')
Example 10
def main(inputs, output):
    # main logic starts here
    comments_schema = types.StructType([ # commented-out fields won't be read
        types.StructField('archived', types.BooleanType(), True),
        types.StructField('author', types.StringType(), True),
        types.StructField('author_flair_css_class', types.StringType(), True),
        types.StructField('author_flair_text', types.StringType(), True),
        types.StructField('body', types.StringType(), True),
        types.StructField('controversiality', types.LongType(), True),
        types.StructField('created_utc', types.StringType(), True),
        types.StructField('distinguished', types.StringType(), True),
        types.StructField('downs', types.LongType(), True),
        types.StructField('edited', types.StringType(), True),
        types.StructField('gilded', types.LongType(), True),
        types.StructField('id', types.StringType(), True),
        types.StructField('link_id', types.StringType(), True),
        types.StructField('name', types.StringType(), True),
        types.StructField('parent_id', types.StringType(), True),
        types.StructField('retrieved_on', types.LongType(), True),
        types.StructField('score', types.LongType(), True),
        types.StructField('score_hidden', types.BooleanType(), True),
        types.StructField('subreddit', types.StringType(), True),
        types.StructField('subreddit_id', types.StringType(), True),
        types.StructField('ups', types.LongType(), True),
        #types.StructField('year', types.IntegerType(), False),
        #types.StructField('month', types.IntegerType(), False),
    ])
    
    #inp = '/courses/732/reddit-1/' # or other path on your computer
    # comments = spark.read.json(inputs)
    comments = spark.read.json(inputs, schema=comments_schema)   
    averages = comments.groupby('subreddit').agg(functions.avg(comments['score']))
    averages.explain()
    averages.show()
    averages.write.csv(output, mode='overwrite')
Example 11
    def test_datatype(self):
        first = T.StructType([
            T.StructField('f1', T.BooleanType()),
            T.StructField('f2', T.ByteType()),
            T.StructField('f3', T.IntegerType()),
            T.StructField('f4', T.LongType()),
        ])
        second = T.StructType([
            T.StructField('f3', T.IntegerType()),
            T.StructField('f2', T.ByteType()),
            T.StructField('f4', T.LongType()),
            T.StructField('f1', T.BooleanType()),
        ])

        SparklyTest().assertRowsEqual(first, second, ignore_order=True)
        with self.assertRaises(AssertionError):
            self.assertEqual(first, second)

        # change entry (f4, T.LongType)
        second = T.StructType([
            T.StructField('f3', T.IntegerType()),
            T.StructField('f2', T.ByteType()),
            T.StructField('f4', T.StringType()),
            T.StructField('f1', T.BooleanType()),
        ])

        with self.assertRaises(AssertionError):
            SparklyTest().assertRowsEqual(first, second, ignore_order=True)
Example 12
def schema_extra_missing_non_nullable_field() -> T.StructType:
    """Return a sample Spark schema with an extra field defined."""
    return (
        T.StructType([
            T.StructField("name", T.StringType(), True),
            T.StructField("empid", T.IntegerType(), True),
            T.StructField("happy", T.BooleanType(), True),
            T.StructField("extra", T.BooleanType(), False)])
    )
Example 13
    def _metadata_schema() -> t.StructType:
        return t.StructType([
            t.StructField('tenant_col', t.StringType(), False),
            t.StructField('user_col', t.StringType(), False),
            t.StructField('user_vec_col', t.StringType(), False),
            t.StructField('res_col', t.StringType(), False),
            t.StructField('res_vec_col', t.StringType(), False),
            t.StructField('output_col', t.StringType(), False),

            t.StructField('has_history_access_df', t.BooleanType(), False),
            t.StructField('has_user2component_mappings_df', t.BooleanType(), False),
            t.StructField('has_res2component_mappings_df', t.BooleanType(), False),
            t.StructField('has_user_feature_vector_mapping_df', t.BooleanType(), False),
            t.StructField('has_res_feature_vector_mapping_df', t.BooleanType(), False)
        ])
Example 14
def as_spark_type(tpe) -> types.DataType:
    """
    Given a python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - python3's typing system
    """
    if tpe in (str, "str", "string"):
        return types.StringType()
    elif tpe in (bytes, ):
        return types.BinaryType()
    elif tpe in (np.int8, "int8", "byte"):
        return types.ByteType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    elif tpe in (int, "int", np.int, np.int32):
        return types.IntegerType()
    elif tpe in (np.int64, "int64", "long", "bigint"):
        return types.LongType()
    elif tpe in (float, "float", np.float):
        return types.FloatType()
    elif tpe in (np.float64, "float64", "double"):
        return types.DoubleType()
    elif tpe in (datetime.datetime, np.datetime64):
        return types.TimestampType()
    elif tpe in (datetime.date, ):
        return types.DateType()
    elif tpe in (bool, "boolean", "bool", np.bool):
        return types.BooleanType()
    elif tpe in (np.ndarray,):
        return types.ArrayType(types.StringType())
    else:
        raise TypeError("Type %s was not understood." % tpe)
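A brief, illustrative use of this version of `as_spark_type` (the calls are chosen here for demonstration, not taken from the original source); note that this variant maps `int` to `IntegerType`:

as_spark_type(int)        # types.IntegerType()
as_spark_type("double")   # types.DoubleType()
as_spark_type(bool)       # types.BooleanType()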
Example 15
def infer_spark_type(typeclass) -> t.DataType:
    if typeclass in (None, NoneType):
        return t.NullType()
    elif typeclass is str:
        return t.StringType()
    elif typeclass in {bytes, bytearray}:
        return t.BinaryType()
    elif typeclass is bool:
        return t.BooleanType()
    elif typeclass is date:
        return t.DateType()
    elif typeclass is datetime:
        return t.TimestampType()
    elif typeclass is Decimal:
        return t.DecimalType(precision=36, scale=6)
    elif isinstance(typeclass, type) and issubclass(typeclass, BoundDecimal):
        (precision, scale) = typeclass.__constraints__
        return t.DecimalType(precision=precision, scale=scale)
    elif typeclass is float:
        return t.DoubleType()
    elif typeclass is int:
        return t.IntegerType()
    elif typeclass is long:
        return t.LongType()
    elif typeclass is short:
        return t.ShortType()
    elif typeclass is byte:
        return t.ByteType()
    elif getattr(typeclass, "__origin__", None) is not None:
        return infer_complex_spark_type(typeclass)
    elif is_pyspark_class(typeclass):
        return transform(typeclass)
    else:
        raise TypeError(f"Don't know how to represent {typeclass} in Spark")
Example 16
def _numpy_to_spark_mapping():
    """Returns a mapping from numpy to pyspark.sql type. Caches the mapping dictionary inorder to avoid instantiation
    of multiple objects in each call."""

    # Refer to the function attribute used to cache the map via a variable name rather than 'dot'
    # notation, to avoid copy/paste/typo mistakes
    cache_attr_name = 'cached_numpy_to_pyspark_types_map'
    if not hasattr(_numpy_to_spark_mapping, cache_attr_name):
        import pyspark.sql.types as T

        setattr(_numpy_to_spark_mapping, cache_attr_name,
                {
                    np.int8: T.ByteType(),
                    np.uint8: T.ShortType(),
                    np.int16: T.ShortType(),
                    np.uint16: T.IntegerType(),
                    np.int32: T.IntegerType(),
                    np.int64: T.LongType(),
                    np.float32: T.FloatType(),
                    np.float64: T.DoubleType(),
                    np.string_: T.StringType(),
                    np.str_: T.StringType(),
                    np.unicode_: T.StringType(),
                    np.bool_: T.BooleanType(),
                })

    return getattr(_numpy_to_spark_mapping, cache_attr_name)
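An illustrative lookup through the cached mapping (the example values are chosen here, not taken from the original source):

import numpy as np

mapping = _numpy_to_spark_mapping()
mapping[np.int32]    # T.IntegerType()
mapping[np.float64]  # T.DoubleType()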
Example 17
    def step_03_join(self):
        # TODO:
        # - Join all results of step_02 on the group-by attributes.
        # - Rename each metric to "datasource: metric_name".
        # - For each combination of data sources, calculate a data-difference column.
        # - Calculate a test_result column indicating whether every related metric matches (only when exactly 2 input sources are provided).
        group_by = self.config["group_by"]

        # Rename every metric with prefix as source_metricname
        for source, agg in self.agg.items():
            metric_cols = list(filter(lambda x: x not in group_by,
                                      agg.columns))
            self.agg[source] = reduce(
                lambda df, metric: df.withColumnRenamed(
                    metric, source + "_" + metric), metric_cols, agg)

        # Join
        joined = reduce(lambda x, y: x.join(y, how="full", on=group_by),
                        self.agg.values())

        # Calculate differences if there are only two sources
        if len(self.agg) == 2:
            source1, source2 = tuple(self.config["data"].keys())
            source1_metrics = list(
                self.config["data"][source1]["metrics"].keys())
            source2_metrics = list(
                self.config["data"][source2]["metrics"].keys())
            # Look for same metrics in both sources
            # I know that it could be done in O(n), this is more readable
            shared_metrics = sorted(
                set(source1_metrics) & set(source2_metrics))
            for metric in shared_metrics:
                try:
                    joined = joined.withColumn(
                        "delta_" + metric,
                        F.abs(
                            F.col(source1 + "_" + metric) -
                            F.col(source2 + "_" + metric)))
                except Exception:  # cannot calculate the difference, e.g. when the metric is a string
                    pass
                # For float and double types, the relative tolerance is 0.1 percent
                if dict(joined.dtypes)[source1 + "_" + metric] in ("float", "double") \
                    or dict(joined.dtypes)[source2 + "_" + metric] in ("float", "double"):

                    def difference(number1, number2, error=1e-3):
                        return abs((number1 - number2) / number2) < error

                    joined = joined.withColumn(
                        "match_" + metric,
                        F.udf(difference,
                              T.BooleanType())(F.col(source1 + "_" + metric),
                                               F.col(source2 + "_" + metric)))
                else:
                    joined = joined.withColumn(
                        "match_" + metric,
                        F.col(source1 + "_" + metric) == F.col(source2 + "_" +
                                                               metric))
        self.joined = joined
        return joined
Example 18
def get_spark_data_type(input_value):
    return {
        "str": T.StringType(),
        "int": T.LongType(),
        "bool": T.BooleanType(),
        "float": T.DoubleType(),
        "NoneType": T.NullType(),
    }[type(input_value).__name__]
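A hypothetical use of `get_spark_data_type`: deriving StructFields from the Python values of a sample record (the sample dict below is an assumption for illustration only):

sample = {"name": "alice", "empid": 1, "happy": True}
schema = T.StructType([
    T.StructField(key, get_spark_data_type(value), True)
    for key, value in sample.items()
])
# name -> StringType, empid -> LongType, happy -> BooleanType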
Example 19
def schema_different_uncastable_data_field() -> T.StructType:
    """Return a sample Spark schema with an uncastable change in data type."""
    return (
        T.StructType([
            T.StructField("name", T.IntegerType(), True),
            T.StructField("empid", T.StringType(), True),
            T.StructField("happy", T.BooleanType(), True)])
    )
Example 20
def create_valid_schema() -> T.StructType:
    """Return a spark schema."""
    return (
        T.StructType([
            T.StructField("name", T.StringType(), True),
            T.StructField("empid", T.IntegerType(), True),
            T.StructField("happy", T.BooleanType(), True)])
    )
Example 21
def field_to_spark_field(name, python_type) -> spark_types.StructField:
    spark_type = {
        int: spark_types.IntegerType(),
        float: spark_types.DoubleType(),
        str: spark_types.StringType(),
        datetime: spark_types.TimestampType(),
        bool: spark_types.BooleanType(),
    }[python_type]
    return spark_types.StructField(name, spark_type)
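A short, assumed usage of `field_to_spark_field`, assembling a StructType from (name, python_type) pairs; the field names are illustrative:

schema = spark_types.StructType([
    field_to_spark_field("user_id", int),
    field_to_spark_field("amount", float),
    field_to_spark_field("active", bool),
])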
Example 22
def spark_data_flow():
    get_phone_label_udf = fun.udf(lambda x: 'Entity;Contact;Phone',
                                  tp.StringType())
    get_email_label_udf = fun.udf(lambda x: 'Entity;Contact;Email',
                                  tp.StringType())
    get_phone_type_udf = fun.udf(lambda x: 'PHONE', tp.StringType())
    get_email_type_udf = fun.udf(lambda x: 'EMAIL', tp.StringType())
    filter_comma_udf = fun.udf(filter_comma, tp.BooleanType())

    raw_nb_df = spark.sql("""
        SELECT 
        bbd_qyxx_id,
        phone,
        email,
        year
        FROM
        dw.qyxx_annual_report_jbxx 
        WHERE
        dt='{version}'
        """.format(version=XGXX_RELATION))

    tid_nb_df = raw_nb_df.where("bbd_qyxx_id != 'null'").where(
        "phone != 'null'").where("email != 'null'").where(
            raw_nb_df.bbd_qyxx_id.isNotNull()).where(
                raw_nb_df.phone.isNotNull()).where(
                    raw_nb_df.email.isNotNull()).where(
                        filter_comma_udf('bbd_qyxx_id')).where(
                            filter_comma_udf('phone')).where(
                                filter_comma_udf('email')).cache()

    prd_phone_node_df = tid_nb_df.select(
        tid_nb_df.phone.alias('bbd_contact_id:ID'),
        fun.unix_timestamp().alias('create_time:long'),
        fun.unix_timestamp().alias('update_time:long'),
        get_phone_label_udf('phone').alias(':LABEL')).distinct()

    prd_phone_edge_df = tid_nb_df.select(
        tid_nb_df.bbd_qyxx_id.alias(':START_ID'),
        tid_nb_df.phone.alias(':END_ID'), tid_nb_df.year.alias('year'),
        fun.unix_timestamp().alias('create_time:long'),
        get_phone_type_udf('phone').alias(':TYPE')).distinct()

    prd_email_node_df = tid_nb_df.select(
        tid_nb_df.email.alias('bbd_contact_id:ID'),
        fun.unix_timestamp().alias('create_time:long'),
        fun.unix_timestamp().alias('update_time:long'),
        get_email_label_udf('email').alias(':LABEL')).distinct()

    prd_email_edge_df = tid_nb_df.select(
        tid_nb_df.bbd_qyxx_id.alias(':START_ID'),
        tid_nb_df.email.alias(':END_ID'), tid_nb_df.year.alias('year'),
        fun.unix_timestamp().alias('create_time:long'),
        get_email_type_udf('email').alias(':TYPE')).distinct()

    return (prd_phone_node_df, prd_phone_edge_df, prd_email_node_df,
            prd_email_edge_df)
Example 23
    def find_type(x):
        if x.type in ['object', 'str']:
            return T.StringType()
        elif x.type == 'int':
            return T.IntegerType()
        elif x.dtype == 'float':
            return T.FloatType()
        elif x.dtype == 'bool':
            return T.BooleanType()
        raise TypeError('%s type is unknown' % (x.dtype))
Example 24
def build_training_set(inputs: DataFrame) -> DataFrame:

    udf_country = fn.udf(Udfs.country, st.StringType())
    udf_currency = fn.udf(Udfs.currency, st.StringType())
    udf_is_valid_label = fn.udf(Udfs.is_valid_label, st.BooleanType())

    udf_filter = fn.udf(Udfs.filter_hours_days_goal, st.BooleanType())

    replace_values = {
        'days_campaign': -1,
        'hours_prepa': -1,
        'goal': -1,
        'country_clean': 'unknown',
        'currency_clean': 'unknown'
    }

    result = inputs.withColumn('goal', fn.col('goal').cast(st.DoubleType())) \
        .withColumn('deadline', fn.col('deadline').cast(st.IntegerType())) \
        .withColumn('state_changed_at', fn.col('state_changed_at').cast(st.IntegerType())) \
        .withColumn('created_at', fn.col('created_at').cast(st.IntegerType())) \
        .withColumn('launched_at', fn.col('launched_at').cast(st.IntegerType())) \
        .drop('disable_communication') \
        .drop('state_changed_at', 'backers_count') \
        .withColumn('country_clean', udf_country(fn.col('country'), fn.col('currency'))) \
        .withColumn('currency_clean', udf_currency(fn.col('currency'))) \
        .drop('country', 'currency') \
        .filter(udf_is_valid_label(fn.col('final_status'))) \
        .withColumn("deadline_clean", fn.to_date(fn.from_unixtime(fn.col('deadline')))) \
        .withColumn("created_at_clean", fn.to_date(fn.from_unixtime(fn.col('created_at')))) \
        .withColumn("launched_at_clean", fn.to_date(fn.from_unixtime(fn.col('launched_at')))) \
        .withColumn("days_campaign", fn.datediff(fn.col('deadline_clean'), fn.col('launched_at_clean'))) \
        .withColumn("hours_prepa", fn.round((fn.col('launched_at') - fn.col('created_at')) / 3600, 2)) \
        .filter(udf_filter(fn.col('hours_prepa'), fn.col('days_campaign'), fn.col('goal'))) \
        .drop('created_at', 'launched_at', 'deadline') \
        .withColumn("name", fn.lower(fn.col('name'))) \
        .withColumn("desc", fn.lower(fn.col('desc'))) \
        .withColumn("keywords", fn.lower(fn.col('keywords'))) \
        .withColumn("text", fn.concat_ws(" ", fn.col('name'), fn.col('desc'), fn.col('keywords'))) \
        .drop("name", "desc", "keywords") \
        .na.fill(replace_values)

    return result
Example 25
def labeled_msg_schema():
    schema = types.StructType([
        types.StructField('optional_field', types.BooleanType()),
        types.StructField('required_field', types.DoubleType(),
                          nullable=False),
        types.StructField(
            'repeated_field',
            types.ArrayType(types.IntegerType(), containsNull=False)),
        types.StructField('default_field', types.StringType()),
    ])
    return schema
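As a hedged usage sketch (assuming an existing SparkSession named `spark`, which is not part of the original example), the schema can be checked by building an empty DataFrame with it:

df = spark.createDataFrame([], schema=labeled_msg_schema())
df.printSchema()  # shows the nullability and element types declared above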
Example 26
def clean(spark, rows):
    # Load Data
    df = spark.createDataFrame(Row(**row) for row in rows)

    # Clean column country
    re_country = r"[a-zA-Z][a-zA-Z\s\-]*"

    df = df.withColumn(
        "country",
        (F.lower(F.trim(F.regexp_extract("country", re_country, 0)))),
    )

    # Clean column campus
    re_campus = r"([a-zA-Z]+[_\ \-]?)+"

    df = df.withColumn(
        "campus", (F.lower(F.trim(F.regexp_extract("campus", re_campus, 0)))))

    # Clean column mobility
    re_mobility = r"([a-zA-Z0-9]+[\ \-]?)+"

    df = df.withColumn(
        "mobility",
        (F.lower(F.trim(F.regexp_extract("mobility", re_mobility, 0)))))

    # Clean column contracts
    df = df.withColumn(
        "contracts", null_negative_int(df["contracts"].cast(T.IntegerType())))

    # Clean column alternative_choice
    re_alternative_choice = r"([a-zA-Z]+[_\ \-]?)+"

    df = df.withColumn(
        "alternative_choice",
        (F.lower(
            F.trim(
                F.regexp_extract("alternative_choice", re_alternative_choice,
                                 0)))),
    )

    # Clean column distance
    re_distance = "[0-9]+"

    df = df.withColumn(
        "distance",
        (F.lower(F.trim(F.regexp_extract("distance", re_distance, 0))).cast(
            T.IntegerType())),
    )

    # Clean column pro_contract
    df = df.withColumn("pro_contract",
                       df["pro_contract"].cast(T.BooleanType()))

    return df
Example 27
def get_dtypes_spark(type):
    switcher = {
        'int32': st.IntegerType(),
        'int64': st.LongType(),
        'float32': st.FloatType(),
        'float64': st.DoubleType(),
        'date64': st.DateType(),  #TimestampType
        'str': st.StringType(),
        'boolean': st.BooleanType()
    }

    # Look up the matching Spark type; unknown names fall back to the string "nothing"
    return switcher.get(type, "nothing")
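Illustrative lookups with `get_dtypes_spark` (the values are chosen here, not taken from the original source):

get_dtypes_spark('int64')    # st.LongType()
get_dtypes_spark('str')      # st.StringType()
get_dtypes_spark('unknown')  # falls back to the string "nothing"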
Example 28
def get_schema():
    return TableSchema(
        [
            t.StructField("LoanID", t.StringType(), True),
            t.StructField("Rating", t.StringType(), True),
            t.StructField("Country", t.StringType(), True),
            t.StructField("Defaulted", t.BooleanType(), False),
            t.StructField("Year", t.IntegerType(), True),
            t.StructField("Month", t.IntegerType(), True),
        ],
        primary_key="LoanID",
        partition_by=["Month"],
        tbl_properties={"Test": "test"},
    )
Example 29
def as_spark_type(tpe) -> types.DataType:
    """
    Given a Python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in Python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - Python3's typing system
    """
    # TODO: Add "boolean" and "string" types.
    # ArrayType
    if tpe in (np.ndarray,):
        return types.ArrayType(types.StringType())
    elif hasattr(tpe, "__origin__") and issubclass(tpe.__origin__, list):
        return types.ArrayType(as_spark_type(tpe.__args__[0]))
    # BinaryType
    elif tpe in (bytes, np.character, np.bytes_, np.string_):
        return types.BinaryType()
    # BooleanType
    elif tpe in (bool, np.bool, "bool", "?"):
        return types.BooleanType()
    # DateType
    elif tpe in (datetime.date,):
        return types.DateType()
    # NumericType
    elif tpe in (np.int8, np.byte, "int8", "byte", "b"):
        return types.ByteType()
    elif tpe in (decimal.Decimal,):
        # TODO: considering about the precision & scale for decimal type.
        return types.DecimalType(38, 18)
    elif tpe in (float, np.float, np.float64, "float", "float64", "double"):
        return types.DoubleType()
    elif tpe in (np.float32, "float32", "f"):
        return types.FloatType()
    elif tpe in (np.int32, "int32", "i"):
        return types.IntegerType()
    elif tpe in (int, np.int, np.int64, "int", "int64", "long", "bigint"):
        return types.LongType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    # StringType
    elif tpe in (str, np.unicode_, "str", "U"):
        return types.StringType()
    # TimestampType
    elif tpe in (datetime.datetime, np.datetime64, "datetime64[ns]", "M"):
        return types.TimestampType()
    else:
        raise TypeError("Type %s was not understood." % tpe)
Example 30
def get_ppd_tables(
    spark: SparkSession,
    input_path,
    headers_path: str,
    output_path: Optional[str] = None,
) -> Dict[str, DataFrame]:
    """
    Process Price Paid Dataset (PPD) by extracting fact and dimension tables
    and saving them to S3 in the csv format.
    Args:
        spark: Current Spark session object.
        input_path: Path to the PPD dataset in the CSV format.
        headers_path: Path to the PPD headers TSV file.
        output_path: Optional path where the resulting CSV files are saved.

    Returns: A dictionary mapping table name to its dataframe.
    """
    df = read_ppd_table(spark, input_path, headers_path)
    property_types = {
        "D": "detached",
        "S": "semi-detached",
        "T": "terraced",
        "F": "flat",
        "O": "other",
    }
    property_types = spark.sparkContext.broadcast(property_types)

    df = (df.withColumn(
        "property_type",
        f.udf(lambda x: property_types.value[x],
              t.StringType())(f.col("property_type")),
    ).withColumn(
        "is_new",
        f.udf(lambda x: True
              if x == "Y" else False, t.BooleanType())(f.col("old_new")),
    ).withColumn(
        "duration",
        f.udf(lambda x: "freehold"
              if x == "F" else "leasehold", t.StringType())(f.col("duration")),
    ).withColumn("date", f.to_date(df["date_of_transfer"])))
    df = _normalise_address(df).select(
        [column for column in df.columns if column not in {"old_new"}] +
        ["is_new", "date", "property_address"])
    tables = {
        "property": write_property_table(df, output_path),
        "time": write_time_table(df, output_path),
        "sale": write_sale_table(df, output_path),
    }
    return tables