Code Example #1
def group_by_grid_square_and_tokenize(spark_session, latlongrid, tweets_df):
    """Calculates the grid square id from 'lat' and 'lon' columns in tweets_df, and then
    groups the tweets by grid square. Tweets are tokenized. Returned dataframe has
    columns ['grid_square', 'tokens'], where 'tokens' is a list of all tokens from every
    tweet within an entry's 'grid_square'.
    
    Args:
        spark_session    --    An active SparkSession.
        latlongrid    --    A LatLonGrid object.
        tweets_df    --    A dataframe with columns ['lat', 'lon', 'tweet'] of types
                           [DoubleType, DoubleType, StringType]."""

    sql_tokenize = functions.udf(lambda tweet: twokenize.tokenize(tweet),
                                 returnType=types.ArrayType(
                                     types.StringType()))
    tweets_df = (tweets_df.withColumn('tweet_tokens',
                                      sql_tokenize(
                                          tweets_df['tweet'])).drop('tweet'))

    row_to_gridsquare_tokens = lambda row: (latlongrid.grid_square_index(
        lat=row['lat'], lon=row['lon']), row['tweet_tokens'])

    tokens_rdd = (tweets_df.rdd.map(row_to_gridsquare_tokens).reduceByKey(
        operator.concat))

    tokens_df_schema = types.StructType([
        types.StructField('grid_square', types.IntegerType()),
        types.StructField('tokens', types.ArrayType(types.StringType()))
    ])
    tokens_df = spark_session.createDataFrame(tokens_rdd,
                                              schema=tokens_df_schema)

    return tokens_df
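The snippet above is shown without its imports; a minimal sketch of what it relies on (the exact tokenizer package is an assumption; whichever `twokenize` module the project uses must expose a `tokenize()` function):

# Assumed imports for group_by_grid_square_and_tokenize
import operator

import twokenize  # Twitter-aware tokenizer, e.g. ark-twokenize-py
from pyspark.sql import functions, types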
Code Example #2
    def frontend_result(sc, dataframe, buckets=20, prediction_col='prediction'):
        n_buckets = sc.broadcast(buckets)
        buckets_list_udf = F.udf(
            f=lambda dist, ratio, boundary: ShowResults.make_buckets(
                distances=dist, ratio=ratio, boundary=boundary,
                n_buckets=n_buckets.value),
            returnType=T.ArrayType(
                elementType=T.ArrayType(
                    elementType=T.IntegerType(),
                    containsNull=True),
                containsNull=True
            )
        )

        tmp = (dataframe
               .groupBy(prediction_col, F.col('computed_boundary'))
               .agg(F.min('distance').alias('min'), F.max('distance').alias('max'),
                    F.sum('is_outlier').alias('n_outliers'),
                    F.collect_list('distance').alias('distances'))
               .withColumn(colName='ratio', col=F.col('max')/n_buckets.value)
               .withColumn(colName='buckets', col=buckets_list_udf(
                            'distances', 'ratio', 'computed_boundary'))
               )

        return tmp.select(prediction_col, 'buckets')
Code Example #3
def calc(df):
    
    ## function to calculate the approximating function and its derivative
    def foo(x,y):

        y_arr = np.array(y)
        gy = g(y_arr)
        gp = gprime(y_arr)
        x_arr = np.array(x)
        res = np.outer(gy,x_arr)
        return([res.flatten().tolist(), gp.tolist()])

    udf_foo = f.udf(foo, t.ArrayType(t.ArrayType(t.DoubleType())))



    df2 = df.withColumn("vals", udf_foo("features","Y"))

    df2 = df2.select("id", f.col("vals").getItem(0).alias("gy"), f.col("vals").getItem(1).alias("gy_"))
    GY_ = np.array(df2.agg(f.array([f.sum(f.col("gy")[i]) 
                                for i in range(n_comp**2)])).collect()[0][0]).reshape(n_comp,n_comp)/num_rows

    GY_AVG_V  = np.array(df2.agg(f.array([f.avg(f.col("gy_")[i]) 
                                  for i in range(n_comp)])).collect()[0][0]).reshape(n_comp,1)*V

    return(GY_, GY_AVG_V)
Code Example #4
File: test_utils.py Project: luzbetak/sparkly
    def test_undefined_field(self):
        with six.assertRaisesRegex(self, KeyError, 'f2'):
            schema_has(
                T.StructType([T.StructField('f1', T.IntegerType())]),
                T.StructType([T.StructField('f2', T.LongType())]),
            )

        with six.assertRaisesRegex(self, KeyError, r'f1\.element\.s2'):
            schema_has(
                T.StructType([
                    T.StructField(
                        'f1',
                        T.ArrayType(T.StructType([T.StructField('s1', T.IntegerType())])),
                    ),
                ]),
                T.StructType([
                    T.StructField(
                        'f1',
                        T.ArrayType(T.StructType([T.StructField('s2', T.LongType())])),
                    ),
                ]),
            )

        with six.assertRaisesRegex(self, TypeError, 'element is IntegerType, expected LongType'):
            schema_has(
                T.ArrayType(T.IntegerType()),
                T.ArrayType(T.LongType()),
            )
Code Example #5
    def sum_word_vectors(
            urls_and_weighted_word_vectors: DataFrame) -> DataFrame:
        """
        Sums weighted word vectors and their corresponding coefficients for each URL.

        :param urls_and_weighted_word_vectors: A DataFrame of URLs and weighted word vectors with columns: id, url, pos,
                                               word, weighted_word_vector, coefficient.
        :return: A DataFrame of URLs and their corresponding sum of word vectors and sum of coefficients with columns:
                 id, url, split_url, coefficients, summed_vectors, summed_coefficients.
        """

        word_array_sorter_udf = F.udf(
            URLVectorCalculator.sort_list_of_2_tuples_by_0th_item,
            T.ArrayType(T.StringType()))
        coefficient_array_sorter_udf = F.udf(
            URLVectorCalculator.sort_list_of_2_tuples_by_0th_item,
            T.ArrayType(T.DoubleType()))

        vector_size = len(
            urls_and_weighted_word_vectors.select(
                'weighted_word_vector').first()[0])
        return urls_and_weighted_word_vectors \
            .groupBy("id", "url") \
            .agg(F.collect_list(F.struct("pos", "word")).alias("positions_and_words"),
                 F.collect_list(F.struct("pos", "coefficient")).alias("positions_and_coefficients"),
                 F.sum("coefficient").alias("summed_coefficients"),
                 F.array(*[F.sum(F.col("weighted_word_vector")[i])
                           for i in range(vector_size)]).alias("summed_vectors")) \
            .select("id", "url", "summed_coefficients", "summed_vectors",
                    word_array_sorter_udf("positions_and_words").alias("split_url"),
                    coefficient_array_sorter_udf("positions_and_coefficients").alias("coefficients"))
Code Example #6
 def get_df_schema(self):
     return tp.StructType([
         tp.StructField('added_date', tp.DateType(), True),
         tp.StructField('release_year', tp.IntegerType(), True),
         tp.StructField('title', tp.StringType(), False),
         tp.StructField('director', tp.StringType(), True),
         tp.StructField('type', tp.StringType(), False),
         tp.StructField('duration', tp.StringType(), True),
         tp.StructField('description', tp.StringType(), True),
         tp.StructField(
             'comments',
             tp.ArrayType(
                 tp.StructType([
                     tp.StructField('body', tp.StringType(), True),
                     tp.StructField('author', tp.StringType(), True),
                     tp.StructField('created_utc', tp.TimestampType(),
                                    True),
                     tp.StructField('score', tp.IntegerType(), True),
                     tp.StructField('sentiment', tp.StringType(), True),
                     tp.StructField('description_word', tp.StringType(),
                                    True),
                     tp.StructField('source', tp.StringType(), True)
                 ])), True),
         tp.StructField(
             'actors',
             tp.ArrayType(
                 tp.StructType(
                     [tp.StructField('name', tp.StringType(), True)])),
             True)
     ])
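A minimal usage sketch for this schema, assuming an active `spark` session and a JSON file whose records match the layout above (both the session and the path are placeholders, not part of the original project):

from pyspark.sql import functions as F

# Hypothetical usage: enforce the schema while reading nested JSON records,
# then flatten to one comment per row.
df = spark.read.schema(self.get_df_schema()).json('shows_with_comments.json')
df.select('title', F.explode('comments').alias('comment')) \
  .select('title', 'comment.author', 'comment.sentiment') \
  .show(truncate=False)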
Code Example #7
    def hook_spark_pipeline_init(self, sc, sqlc, schema, indexer):

        if self.include_external:
            schema.append(
                SparkTypes.StructField(
                    "external_links",
                    SparkTypes.ArrayType(
                        SparkTypes.StructType([
                            SparkTypes.StructField("href",
                                                   SparkTypes.StringType(),
                                                   nullable=False),
                            SparkTypes.StructField("text",
                                                   SparkTypes.StringType(),
                                                   nullable=True)
                        ])),
                    nullable=True))

        if self.include_internal:
            schema.append(
                SparkTypes.StructField(
                    "internal_links",
                    SparkTypes.ArrayType(
                        SparkTypes.StructType([
                            SparkTypes.StructField("path",
                                                   SparkTypes.StringType(),
                                                   nullable=False),
                            SparkTypes.StructField("text",
                                                   SparkTypes.StringType(),
                                                   nullable=True)
                        ])),
                    nullable=True))
Code Example #8
File: lda1.py Project: MehvishSaleem/r-meta-bot
def main():
    #input_comments = '/Users/Mehvish/Documents/SFU/BigDataLab/Metabot/comments/RC_2016-01-aaaa.json.gz'
    change_to_str = F.udf(to_text,
                          returnType=types.ArrayType(types.StringType()))

    sub_comments = spark.read.json(input_comments,
                                   schema=comments_schema).repartition(500)
    comm = sub_comments.select(
        sub_comments['subreddit'].alias('id'),
        sub_comments['body'].alias('comments'),
        sub_comments['ups'].alias('ups')
    )  #.where(sub_comments['subreddit'] == 'AskReddit').limit(10)

    preprocess = F.udf(clean_data,
                       returnType=types.ArrayType(types.StringType()))

    comm_cleaned = comm.select(comm['id'],
                               preprocess(comm['comments']).alias('comments'),
                               comm['ups'])
    #comm_cleaned.show(truncate=False)

    subreddit_group = comm_cleaned.groupBy(comm_cleaned['id']) \
        .agg(change_to_str(F.collect_list('comments')).alias('comments'),
             F.sum('ups').alias('ups'),
             F.count('id').alias('count')) \
        .select('id', 'comments', 'ups', 'count')

    #subreddit_group.show(20, False)
    #print("done")
    subreddit_group.write.format('parquet').save(output, mode='overwrite')
Code Example #9
def main():
    spark.sql("CLEAR CACHE")
    business = spark.read.parquet("yelp-etl/business_etl").repartition(8)
    business.createOrReplaceTempView("business")
    review = spark.read.parquet("yelp-etl/review_etl").repartition(16)#.cache()
    review.createOrReplaceTempView("review")

    ## Location based reviews
    # spark.sql("SELECT b.state, COUNT(*) AS bus_rev_count FROM business b INNER JOIN review r ON b.business_id = r.business_id GROUP BY b.state ORDER BY bus_rev_count DESC").show()
    #
    # ## Choosing reviews from Pennsylvania (state = "PA")
    pa_bus_rev = spark.sql("SELECT r.review_id, b.business_id, r.text, r.label FROM business b INNER JOIN review r ON b.business_id = r.business_id WHERE b.state = 'PA' AND r.label = 1")

    ## Remove punctuations and spaces
    punct_remover = functions.udf(lambda x: remove_punct(x))
    review_df = pa_bus_rev.select('review_id', 'business_id', punct_remover('text')).withColumnRenamed('<lambda>(text)', 'text')

    ## Tokenize
    tok = Tokenizer(inputCol="text", outputCol="words")

    ## Remove stop words
    stopwordList = ['','i','get','got','also','really','would','one','good','like','great','tri','love','two','three','took','awesome','me','bad','horrible','disgusting','terrible','fabulous','amazing','terrific','worst','best','fine','excellent','acceptable','my','exceptional','satisfactory','satisfying','super','awful','atrocious','unacceptable','poor','sad','gross','authentic','myself','cheap','expensive','we','our','ours','ourselves','you','your','yours','yourself','yourselves', 'he', 'him', 'his', 'himself','she','her','hers','herself','it','its','itself','they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then','once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each','few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', 'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven', 'isn', 'ma', 'mightn', 'mustn', 'needn', 'shan', 'shouldn', 'wasn','weren', 'won', 'wouldn']

    stopword_rm = StopWordsRemover(inputCol="words", outputCol="words_nsw", stopWords=stopwordList)

    pipestages = [tok,stopword_rm]
    pipeline = Pipeline(stages = pipestages)
    model = pipeline.fit(review_df)
    tokenized_df = model.transform(review_df)

    ## Lemmatizing
    lemmatize_udf = functions.udf(lambda x: lemmatize(x), types.ArrayType(types.StringType()))
    lemmatized_df = tokenized_df.withColumn("lemmatized",lemmatize_udf("words_nsw")).select("review_id","business_id","lemmatized")
    ## Stemming
    stemmer_udf = functions.udf(lambda x: stem(x), types.ArrayType(types.StringType()))
    stemmed_df = lemmatized_df.withColumn("stemmed", stemmer_udf("lemmatized")).drop(lemmatized_df["lemmatized"])


    ## Count Vectorizer
    cv = CountVectorizer(inputCol="stemmed", outputCol="vectors")
    cv_model = cv.fit(stemmed_df)
    cv_df = cv_model.transform(stemmed_df).drop(stemmed_df["stemmed"])
    cv_model.save("topic_modelling/cvmodel_pos")

    idf = IDF(inputCol="vectors",outputCol="tfidf")
    idf_model = idf.fit(cv_df)
    result = idf_model.transform(cv_df)

    result = result.select("review_id","business_id","tfidf")

    lda = LDA(featuresCol='tfidf', k=5, seed=42, maxIter=50)
    model = lda.fit(result)
    model.write().overwrite().save("topic_modelling/ldamodel_pos")
    transformed = model.transform(result)
    transformed.write.parquet("topic_modelling/review_topics_pos",mode="overwrite")
    spark.stop()
Code Example #10
 def get_resume_er_schema():
     return types.StructType([
         types.StructField('id', types.LongType(), nullable=False),
         types.StructField('job_title',
                           types.ArrayType(types.StringType()),
                           nullable=False),
         types.StructField('job_details',
                           types.ArrayType(types.StringType()),
                           nullable=False),
     ])
Code Example #11
def make_img_df(sqlContext, keys_rdd):
    kmeta_df = sqlContext.createDataFrame(keys_rdd.map(lambda x: x._asdict()))
    # applying python functions to DataFrames is more difficult and requires using typed UDFs
    twod_arr_type = sq_types.ArrayType(
        sq_types.ArrayType(sq_types.IntegerType()))
    # the pull_input_tile function is wrapped into a udf so it can be applied to create the new image column
    # numpy data is not directly supported and typed arrays must be used instead, therefore we run the .tolist command
    pull_tile_udf = F.udf(lambda x: pull_input_tile(x_to_tile(x)).tolist(),
                          returnType=twod_arr_type)
    kimg_df = kmeta_df.withColumn('Image', pull_tile_udf(kmeta_df['x']))

    s_query = kimg_df.where(kimg_df['x'] > 99)
    return s_query.show()
Code Example #12
    def __init__(self, configuration: StatsExtractionConfig):
        self._filters = FilterTypesEnum

        self._columns = DataframeColumnsEnum
        self._stats = StatsExtractionEnum
        self._purging = PurgingEnum
        self._configuration = configuration
        standardisation_config_dict = self._configuration.standardisation_config
        standardisation_config = [
            FilterConfiguration(name=name, parameters=params)
            for name, params in standardisation_config_dict.items()
        ]

        dec_separator = self._stats.DECORATION_SEPARATOR_TOKEN
        attachment_token = self._stats.ATTACHMENT_POINT_TOKEN
        self._mol_wts_udf = psf.udf(
            lambda x: ExactMolWt(Chem.MolFromSmiles(x)), pst.FloatType())
        self._num_rings_udf = psf.udf(
            lambda x: rdMolDescriptors.CalcNumRings(Chem.MolFromSmiles(x)),
            pst.IntegerType())
        self._num_atoms_udf = psf.udf(
            lambda x: Chem.MolFromSmiles(x).GetNumHeavyAtoms(),
            pst.IntegerType())
        self._num_aromatic_rings_udf = psf.udf(
            lambda x: rdMolDescriptors.CalcNumAromaticRings(
                Chem.MolFromSmiles(x)), pst.IntegerType())
        self._hbond_donors_udf = psf.udf(
            lambda x: rdMolDescriptors.CalcNumHBD(Chem.MolFromSmiles(x)),
            pst.IntegerType())
        self._hbond_acceptors_udf = psf.udf(
            lambda x: rdMolDescriptors.CalcNumHBA(Chem.MolFromSmiles(x)),
            pst.IntegerType())
        self._hetero_atom_ratio_udf = psf.udf(
            lambda x: len([
                atom for atom in Chem.MolFromSmiles(x).GetAtoms()
                if atom.GetAtomicNum() == 6
            ]) / Chem.MolFromSmiles(x).GetNumHeavyAtoms(), pst.FloatType())
        self._make_canonical_udf = psf.udf(
            lambda x: Chem.MolToSmiles(Chem.MolFromSmiles(x)),
            pst.StringType())
        self._standardise_smiles_udf = psf.udf(
            lambda x: RDKitStandardizer(standardisation_config, None).
            apply_filter(x), pst.StringType())
        pattern = self._stats.REGEX_TOKENS
        self.regex = re.compile(pattern)
        self._tokeniser_udf = psf.udf(self.regex.findall,
                                      pst.ArrayType(pst.StringType()))
        self._decoration_split_udf = psf.udf(lambda x: x.split(dec_separator),
                                             pst.ArrayType(pst.StringType()))
        self._count_decorations_udf = psf.udf(
            lambda s: list(s).count(attachment_token), pst.IntegerType())
Code Example #13
def create_credits_dataframe():
    creditsDF = (spark.read.csv(
        "/Users/butterflyeffect/Downloads/tmdb-5000-movie-dataset/tmdb_5000_credits.csv",
        header=True,
        quote='"',
        escape='"',
    ))
    # Define non string columns into their corresponding datatypes
    credits_cols = {
        "movie_id": T.IntegerType,
    }
    # Define json columns into their corresponding types
    credits_json_cols = {
        "cast":
        T.ArrayType(
            T.StructType([
                T.StructField("cast_id", T.IntegerType()),
                T.StructField("character", T.StringType()),
                T.StructField("credit_id", T.StringType()),
                T.StructField("gender", T.IntegerType()),
                T.StructField("id", T.IntegerType()),
                T.StructField("name", T.StringType()),
                T.StructField("order", T.IntegerType()),
            ])),
        "crew":
        T.ArrayType(
            T.StructType([
                T.StructField("credit_id", T.StringType()),
                T.StructField("department", T.StringType()),
                T.StructField("gender", T.IntegerType()),
                T.StructField("id", T.IntegerType()),
                T.StructField("job", T.StringType()),
                T.StructField("name", T.StringType()),
            ])),
    }
    for col, schema in credits_cols.items():
        creditsDF = creditsDF.withColumn(col, F.col(col).astype(schema()))
    for col, schema in credits_json_cols.items():
        creditsDF = creditsDF.withColumn(col, F.from_json(col, schema))

    # Validate Schema
    # creditsDF.printSchema()

    # Validate column names and types
    # print (creditsDF.columns)
    # print (creditsDF.dtypes)

    # Validate Rows
    # creditsDF.show(2, False)
    return creditsDF
Code Example #14
def as_spark_type(tpe) -> types.DataType:
    """
    Given a Python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in Python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - Python3's typing system
    """
    # TODO: Add "boolean" and "string" types.
    # ArrayType
    if tpe in (np.ndarray,):
        return types.ArrayType(types.StringType())
    elif hasattr(tpe, "__origin__") and issubclass(tpe.__origin__, list):
        return types.ArrayType(as_spark_type(tpe.__args__[0]))
    # BinaryType
    elif tpe in (bytes, np.character, np.bytes_, np.string_):
        return types.BinaryType()
    # BooleanType
    elif tpe in (bool, np.bool, "bool", "?"):
        return types.BooleanType()
    # DateType
    elif tpe in (datetime.date,):
        return types.DateType()
    # NumericType
    elif tpe in (np.int8, np.byte, "int8", "byte", "b"):
        return types.ByteType()
    elif tpe in (decimal.Decimal,):
        # TODO: considering about the precision & scale for decimal type.
        return types.DecimalType(38, 18)
    elif tpe in (float, np.float, np.float64, "float", "float64", "double"):
        return types.DoubleType()
    elif tpe in (np.float32, "float32", "f"):
        return types.FloatType()
    elif tpe in (np.int32, "int32", "i"):
        return types.IntegerType()
    elif tpe in (int, np.int, np.int64, "int", "int64", "long", "bigint"):
        return types.LongType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    # StringType
    elif tpe in (str, np.unicode_, "str", "U"):
        return types.StringType()
    # TimestampType
    elif tpe in (datetime.datetime, np.datetime64, "datetime64[ns]", "M"):
        return types.TimestampType()
    else:
        raise TypeError("Type %s was not understood." % tpe)
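For orientation, a few calls and the results that follow directly from the branches above (assumes the function and the imports from this snippet):

from typing import List

as_spark_type(int)            # -> LongType()
as_spark_type(np.float32)     # -> FloatType()
as_spark_type(List[str])      # -> ArrayType(StringType())
as_spark_type(np.ndarray)     # -> ArrayType(StringType())
as_spark_type(datetime.date)  # -> DateType()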
Code Example #15
def get_schema(data_type=None):
    type_lib = {
        'int': types.IntegerType(),
        'float': types.FloatType(),
        'str': types.StringType(),
        'dt': types.TimestampType(),
        'arr_int': types.ArrayType(types.IntegerType()),
        'arr_float': types.ArrayType(types.FloatType()),
        'arr_str': types.ArrayType(types.StringType()),
    }

    return types.StructType([
        types.StructField(fname, type_lib[ftype])
        for fname, ftype in cols_dict[data_type]
    ])
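The helper reads field names and type tags from a module-level `cols_dict` that is not shown here; a hypothetical example of the shape it expects (names and datasets are illustrative only):

# Hypothetical cols_dict: maps a dataset name to (field_name, type_tag) pairs,
# where the tags are keys of type_lib above.
cols_dict = {
    'events': [('user_id', 'int'), ('event_time', 'dt'), ('tags', 'arr_str')],
    'scores': [('user_id', 'int'), ('scores', 'arr_float')],
}

schema = get_schema('events')  # StructType with user_id, event_time, tags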
Code Example #16
def streaming_sent(dfX):
    # apply sentiment analysis to text stream
    df = pipeline.transform(dfX)

    # select sentiment column from pipeline output
    df = df.select('sentiment.result',"sentiment.metadata") \
        .withColumn('result',F.concat_ws(',','result')) \
        .withColumn("result", regexp_replace('result', "positive",'1')) \
        .withColumn("result", regexp_replace('result', "na",'0')) \
        .withColumn("result", regexp_replace('result', "negative",'-1')) \
        .select(F.split('result', ',').alias('sents'), 'metadata')

    # Convert datatypes
    mapper = F.udf(lambda x: [i['confidence'] for i in x],
                   T.ArrayType(T.StringType()))
    df = df.withColumn("metadata", mapper('metadata'))
    df = df.withColumn("metadata", df.metadata.cast("array<float>"))

    # Compute column product
    df_product = df.withColumn(
        "product",
        F.expr(
            "transform(arrays_zip(sents, metadata), x -> x.sents * x.metadata)"
        ))

    # Average array
    array_mean = F.udf(lambda x: float(np.mean(x)), T.FloatType())
    sent_df = df_product.select(array_mean("product").alias("value"))
    return sent_df
Code Example #17
    def test_generated_rings(self):
        num_samples = 500
        # make a simple unit circle
        theta = np.linspace(0, 2 * np.pi, num_samples)
        X1 = np.random.rand(num_samples, 2) + np.transpose(
            [0.5 * np.cos(theta), 0.5 * np.sin(theta)])
        X2 = np.random.rand(num_samples, 2) + np.transpose(
            [5 * np.cos(theta), 5 * np.sin(theta)])
        X = np.concatenate([X1, X2])
        db = DBSCAN(eps=0.3, min_samples=5).fit(X)
        labels = db.labels_
        # Number of clusters in labels, ignoring noise if present.
        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
        n_noise_ = list(labels).count(-1)
        labels_spark = np.zeros_like(db.labels_)
        labels_spark[:] = -1

        data = [(i, [float(item) for item in X[i]]) for i in range(X.shape[0])]
        schema = T.StructType([
            T.StructField("id", T.IntegerType(), False),
            T.StructField("value", T.ArrayType(T.FloatType()), False)
        ])
        df = self.spark.createDataFrame(data, schema=schema)
        df_clusters = dbscan.process(self.spark, df, .3, 5, dist, 2,
                                     "checkpoint")
        out = df_clusters.distinct().collect()
        for item in out:
            labels_spark[item.point] = item.component
        n_clusters_spark_ = len(set(labels_spark)) - (1 if -1 in labels else 0)
        n_noise_spark_ = list(labels_spark).count(-1)
        self.assertEqual(n_clusters_, n_clusters_spark_)
        self.assertEqual(n_noise_, n_noise_spark_)
Code Example #18
    def test_generated_blobs(self):
        centers = [[1, 1], [-1, -1], [1, -1]]
        # with following data operations with sklearn dbscan 750*749/2 = 280875 for spark  149716(.2) 217624(0.3)
        X, labels_true = make_blobs(n_samples=750,
                                    centers=centers,
                                    cluster_std=0.4,
                                    random_state=5)
        db = DBSCAN(eps=0.2, min_samples=10).fit(X)
        labels = db.labels_
        # Number of clusters in labels, ignoring noise if present.
        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
        n_noise_ = list(labels).count(-1)
        labels_spark = np.zeros_like(db.labels_)
        labels_spark[:] = -1

        data = [(i, [float(item) for item in X[i]]) for i in range(X.shape[0])]
        schema = T.StructType([
            T.StructField("id", T.IntegerType(), False),
            T.StructField("value", T.ArrayType(T.FloatType()), False)
        ])
        df = self.spark.createDataFrame(data, schema=schema)
        df_clusters = dbscan.process(self.spark, df, .2, 10, dist, 2,
                                     "checkpoint")
        out = df_clusters.distinct().collect()
        for item in out:
            labels_spark[item.point] = item.component
        n_clusters_spark_ = len(set(labels_spark)) - (1 if -1 in labels else 0)
        n_noise_spark_ = list(labels_spark).count(-1)
        self.assertEqual(n_clusters_, n_clusters_spark_)
        self.assertEqual(n_noise_, n_noise_spark_)
Code Example #19
 def getRedditDataFrameSchema(self):
     return tp.StructType([
         tp.StructField('show_title', tp.StringType(), True),
         tp.StructField('show_director', tp.StringType(), True),
         tp.StructField('submission_id', tp.StringType(), True),
         tp.StructField('source', tp.StringType(), True),
         tp.StructField('title', tp.StringType(), True),
         tp.StructField('description', tp.StringType(), True),
         tp.StructField('created_utc', tp.TimestampType(), True),
         tp.StructField('author', tp.StringType(), True),
         tp.StructField('score', tp.IntegerType(), True),
         tp.StructField('spoiler', tp.BooleanType(), True),
         tp.StructField('is_original_content', tp.BooleanType(), True),
         tp.StructField('distinguished', tp.StringType(), True),
         tp.StructField('link', tp.StringType(), True),
         tp.StructField(
             'comments',
             tp.ArrayType(
                 tp.StructType([
                     tp.StructField('comment_id', tp.StringType(), True),
                     tp.StructField('body', tp.StringType(), True),
                     tp.StructField('created_utc', tp.TimestampType(),
                                    True),
                     tp.StructField('score', tp.IntegerType(), True),
                     tp.StructField('parent_id', tp.StringType(), True),
                     tp.StructField('submission_id', tp.StringType(), True)
                 ])), True)
     ])
Code Example #20
File: test_writer.py Project: luzbetak/sparkly
    def test_group_by(self):
        df = self.spark.createDataFrame(
            data=[
                ('k4', 'k14', [1, 14, 141]),
                ('k1', 'k12', [1, 12, 121]),
                ('k1', 'k11', [1, 11, 111]),
                ('k1', 'k13', [1, 13, 131]),
            ],
            schema=T.StructType([
                T.StructField('key_1', T.StringType()),
                T.StructField('key_2', T.StringType()),
                T.StructField('aux_data', T.ArrayType(T.IntegerType())),
            ])
        )

        df.write_ext.by_url(
            'redis://redis.docker?keyBy=key_1&groupByKey=true&maxPipelineSize=2'
        )

        redis_client = redis.StrictRedis('redis.docker')

        self.assertRowsEqual(redis_client.keys(), [b'k1', b'k4'], ignore_order=True)

        written_data = [json.loads(redis_client.get(key)) for key in [b'k1', b'k4']]

        expected = [
            [
                {'key_1': 'k1', 'key_2': 'k11', 'aux_data': [1, 11, 111]},
                {'key_1': 'k1', 'key_2': 'k12', 'aux_data': [1, 12, 121]},
                {'key_1': 'k1', 'key_2': 'k13', 'aux_data': [1, 13, 131]},
            ],
            [{'key_1': 'k4', 'key_2': 'k14', 'aux_data': [1, 14, 141]}],
        ]

        self.assertRowsEqual(written_data, expected, ignore_order=True)
Code Example #21
File: test_writer.py Project: luzbetak/sparkly
    def test_redis_client_init(self):
        df = self.spark.createDataFrame(
            data=[
                ('k1', 'k14', [1, 14, 141]),
            ],
            schema=T.StructType([
                T.StructField('key_1', T.StringType()),
                T.StructField('key_2', T.StringType()),
                T.StructField('aux_data', T.ArrayType(T.IntegerType())),
            ])
        )

        df.write_ext.redis(
            key_by=['key_2'],
            max_pipeline_size=3,
            redis_client_init=partial(redis.StrictRedis, 'redis.docker'),
        )

        redis_client = redis.StrictRedis('redis.docker')

        self.assertEqual(redis_client.keys(), [b'k14'])

        written_data = json.loads(redis_client.get('k14'))
        expected = {'key_1': 'k1', 'key_2': 'k14', 'aux_data': [1, 14, 141]}
        self.assertEqual(written_data, expected)
Code Example #22
File: test_functions.py Project: Mallik-G/sparkly
    def test_coalescing_heavy_type_mismatch(self):
        first_df = self.spark.createDataFrame(
            data=[(1, None), (2, 'hi'), (3, None), (4, 'may')],
            schema=T.StructType([
                T.StructField('id', T.IntegerType()),
                T.StructField('value', T.StringType()),
            ]),
        )
        second_df = self.spark.createDataFrame(
            data=[(2, [
                2,
            ]), (3, [
                3,
            ]), (4, None)],
            schema=T.StructType([
                T.StructField('id', T.IntegerType()),
                T.StructField('value', T.ArrayType(T.IntegerType())),
            ]),
        )

        with self.assertRaises(U.AnalysisException):
            SF.multijoin([first_df, second_df],
                         on='id',
                         how='inner',
                         coalesce=['value'])
Code Example #23
    def preprocessDF(self, df, cols):
        """
            Input: $df represents a DataFrame
                   $cols represents the list of columns (in $df) that will be concatenated and be tokenized

            Output: Return a new DataFrame that adds the "joinKey" column into the input $df

            Comments: The "joinKey" column is a list of tokens, which is generated as follows:
                     (1) concatenate the $cols in $df;
                     (2) apply the tokenizer to the concatenated string
            Here is how the tokenizer should work:
                     (1) Use "re.split(r'\W+', string)" to split a string into a set of tokens
                     (2) Convert each token to its lower-case
                     (3) Remove stop words
        """
        stop_words = self.stopWordsBC

        def tokenized_filterized_string(string):
            # Remove extra whitespace, lowercase, and strip leading/trailing spaces
            string = re.sub(r'\s+', ' ', string).strip().lower()
            tokens = re.split(r'\W+', string)
            stop_words.add('')
            tokens = set(tokens) - stop_words
            return list(tokens)

        get_tokenized_string = functions.udf(
            tokenized_filterized_string, types.ArrayType(types.StringType()))
        concatanated_column = 'joinKey'
        df = df.withColumn(concatanated_column,
                           concat_ws(' ', df[cols[0]], df[cols[1]]))
        df = df.withColumn(concatanated_column,
                           get_tokenized_string(df[concatanated_column]))
        return df
Code Example #24
def polynomialExpansionCore(requestStr, df):
    # 对参数格式进行转化:json->字典,并进一步进行解析
    requestDict = json.loads(requestStr)
    columnNames = requestDict['columnNames']
    # The new column name defaults to the joined columnNames plus a "_PolynomialExpansion"
    # suffix; if the user specifies newColumnName, that takes precedence
    try:
        newColumnName = requestDict['newColumnName']
    except KeyError:
        newColumnName = "_".join(columnNames) + "_PolynomialExpansion"
    # Assemble the columns into a vector; the input columns must be numeric, otherwise return an error code
    vecAssembler = VectorAssembler(inputCols=columnNames, outputCol="features")
    try:
        df = vecAssembler.transform(df)
    except utils.IllegalArgumentException:
        return "error_numerical"
    # Configure the polynomial expansion transformer
    px = PolynomialExpansion(inputCol="features", outputCol=newColumnName)
    # Apply the expansion
    df = px.transform(df)

    # Convert the new column's data format (vector -> list of floats)
    def do_something(col):
        try:
            floatrow = []
            for i in list(col):
                floatrow.append(float(i))
            return floatrow
        except (TypeError, ValueError):
            return []

    udf_dosth = F.udf(do_something, T.ArrayType(T.FloatType()))
    df = df.withColumn(newColumnName, udf_dosth(df[newColumnName]))
    df = df.drop("features")
    # df.show()
    return df
Code Example #25
def piStrOneHotEncoding(featurename, dataframe):
    from pyspark.ml.feature import OneHotEncoder
    from pyspark.ml.feature import StringIndexer
    #from pyspark.ml.feature import VectorIndexer
    indexed = dataframe
    indexer = StringIndexer(inputCol=featurename, outputCol=featurename + "HE")
    indexed = indexer.fit(indexed).transform(indexed)
    encoder = OneHotEncoder(inputCols=[featurename + "HE"],
                            outputCols=[featurename + "OHE"])
    indexed = encoder.fit(indexed).transform(indexed)

    def convertSparseVectortoDenseVectorInt(v):
        v = DenseVector(v)
        new_array = list([int(x) for x in v])
        return new_array

    toDenseVectorUdfInt = F.udf(convertSparseVectortoDenseVectorInt,
                                T.ArrayType(T.IntegerType()))

    from pyspark.ml.feature import Interaction, VectorAssembler
    assembler1 = VectorAssembler(inputCols=[featurename + "OHE"],
                                 outputCol="vec1")
    assembled1 = assembler1.transform(indexed)
    a = assembled1.toPandas()
    indexed = indexed.drop(featurename).drop(featurename + "HE").withColumn(
        featurename,
        toDenseVectorUdfInt(featurename + "OHE")).drop(featurename + "OHE")
    #indexer = VectorIndexer(inputCol=featurename+"OHE", outputCol=featurename+"tHE", maxCategories=10)
    #indexerModel = indexer.fit(indexed)
    #indexed = indexerModel.transform(indexed)

    return indexed
Code Example #26
    def get_coefficients(
        split_urls_and_word_frequency_orders: DataFrame,
        s: float,
        additional_weight_function: Callable[[int], float] = lambda e: 1
    ) -> DataFrame:
        """

        :param split_urls_and_word_frequency_orders: A DataFrame of split URLs and word frequency orders with columns:
                                                     id, url, split_url, word_frequency_orders.
        :param s: s parameter of Zipf distribution.
        :param additional_weight_function: additional weight function applied on top of the Zipf
                                           weight for each word vector.
        :return: A DataFrame of split URLs and coefficient of each term with columns: id, url, split_url, coefficients
        """
        def calculate_coefficients(word_frequency_orders):
            coefficients = []
            for i in range(len(word_frequency_orders)):
                coefficients.append(
                    additional_weight_function(i) *
                    URLVectorCalculator.get_zipf_coefficient(
                        word_frequency_orders[i], s))
            return coefficients

        get_coefficients_udf = F.udf(calculate_coefficients,
                                     T.ArrayType(T.DoubleType()))
        split_urls_and_coefficients = split_urls_and_word_frequency_orders \
            .select("id",
                    "url",
                    "split_url",
                    get_coefficients_udf("word_frequency_orders").alias("coefficients"))
        return split_urls_and_coefficients
Code Example #27
def extract_embedding(spark, glove_model_path, output_folder):

    glove = Glove.load(glove_model_path)

    dictionary_schema = T.StructType([
        T.StructField('index', T.IntegerType(), True),
        T.StructField('standard_concept_id', T.IntegerType(), True)
    ])

    dictionary_df = spark.createDataFrame([
        Row(index=k, standard_concept_id=int(v))
        for k, v in glove.inverse_dictionary.items()
    ], dictionary_schema)

    vector_schema = T.StructType([
        T.StructField('index', T.IntegerType(), True),
        T.StructField('vector', T.ArrayType(T.DoubleType()), True)
    ])

    vector_df = spark.createDataFrame([
        Row(index=idx, vector=vector.tolist())
        for idx, vector in enumerate(glove.word_vectors)
    ], vector_schema)

    dictionary_df.join(vector_df, 'index').select(
        'standard_concept_id',
        'vector').write.mode('overwrite').parquet(output_folder)
Code Example #28
def calc_TX_PVLS(patient_agg_obs: DataFrame,
                 VL_code: str,
                 end_date_str: str = None) -> pandas.DataFrame:
    """Calculates TX_PVLS indicator with its corresponding disaggregations.

  Args:
    patient_agg_obs: A DataFrame generated by `join_patients_agg_obs()`.
    VL_code: The code for viral load values.
    end_date_str: The string representation of the last date as 'YYYY-MM-DD'.

  Returns:
    A Pandas DataFrame with suppressed viral load counts and ratios per
    aggregation bucket.
  """
    end_date = datetime.today()
    if end_date_str:
        end_date = datetime.strptime(end_date_str, '%Y-%m-%d')
    agg_buckets_udf = F.UserDefinedFunction(
        lambda a, g: agg_buckets(a, g, end_date), T.ArrayType(T.StringType()))
    VL_df = patient_agg_obs.withColumn(
        'sup_VL', patient_agg_obs[VL_code + '_max_value'] < 150).withColumn(
            'agg_buckets',
            agg_buckets_udf(patient_agg_obs['birthDate'],
                            patient_agg_obs['gender']))
    num_patients = VL_df.count()
    VL_agg_P = VL_df.select(
        VL_df.sup_VL,
        F.explode(VL_df.agg_buckets).alias('agg_bucket')).groupBy(
            'sup_VL', 'agg_bucket').agg(
                F.count('*').alias('count')).toPandas().sort_values(
                    ['agg_bucket', 'sup_VL'])
    VL_agg_P['ratio'] = VL_agg_P['count'] / num_patients
    return VL_agg_P
Code Example #29
    def run(self, data, *cols):
        """
        Runs model on each row of the data.

        :param pyspark.sql.DataFrame data: spark dataframe with one row per model.
        :param cols: column name(s) to run model on.
        :return: a spark dataframe
        """
        import pyspark.sql.functions as F
        import pyspark.sql.types as T
        from datetime import date, datetime

        def json_serialize(obj):
            """
            JSON serializer for objects not serializable by default json code
            This function currently only handles datetime and date objects

            :param obj: Object to serialize
            :return: json serialized object
            """
            if isinstance(obj, (datetime, date)):
                return obj.isoformat()
            raise TypeError("Type %s not serializable" % type(obj))

        def _run(*inp):
            """
            Function to call the model _run function

            :param tuple inp: inputs passed to the function.
                TimeSeries Example:
                ([Row(index=datetime.datetime(2019, 1, 1, 0, 0), raw=1197387.0, interpolated=1197387.0),
                Row(index=datetime.datetime(2019, 1, 2, 0, 0), raw=1449210.0, interpolated=1449210.0), ... ],
                Row(_MetricName=u'injections', email_routing_domain=u'att.net'),
                datetime.datetime(2019, 3, 1, 16, 30))

            :return: model result - list of tuples e.g. [("{}", "{}", Timestamp), ("{}", "{}", Timestamp), ...]
            """
            import json
            output = self._run(*inp)

            if isinstance(output, list):
                output = [(json.dumps(model_attribute, default=json_serialize),
                           json.dumps(model_result, default=json_serialize),
                           data_date)
                          for model_attribute, model_result, data_date in output]
            elif isinstance(output, tuple):
                output = [(json.dumps(output[0], default=json_serialize),
                           json.dumps(output[1], default=json_serialize),
                           output[2])]
            return output

        run_udf = F.udf(_run, T.ArrayType(
            T.StructType([T.StructField('model_attributes', T.StringType()),
                          T.StructField('model_results', T.StringType()),
                          T.StructField('data_date', T.TimestampType())
                          ])))

        new_df = (data.withColumn('model_output', run_udf(*cols)))

        return new_df
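Downstream, the array-of-structs column produced by `run_udf` is typically flattened; a minimal sketch of how a caller might consume it (the explode step is an assumption, not part of this class):

# Hypothetical consumption of the 'model_output' column defined above.
exploded = (new_df
            .select(F.explode('model_output').alias('out'))
            .select('out.model_attributes', 'out.model_results', 'out.data_date'))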
Code Example #30
File: convert.py Project: zhangxianbing/pb2df
def _proto3_field_to_spark_data_type(field_desc: FieldDescriptor) -> DataType:
    """Convert ProtoBuf field descriptor to Spark `DataType` or `StructField` object.

    Args:
        field_desc (FieldDescriptor): A ProtoBuf field descriptor.
    Returns:
        DataType: A Spark `DataType` or `StructField` object.
    """
    # map type field
    if _IsMapEntry(field_desc):
        key_field_desc = field_desc.message_type.fields_by_name["key"]
        value_field_desc = field_desc.message_type.fields_by_name["value"]
        key_struct_type = _proto3_field_to_spark_data_type(key_field_desc)
        value_struct_type = _proto3_field_to_spark_data_type(value_field_desc)
        return types.MapType(key_struct_type, value_struct_type)

    if field_desc.type == FieldDescriptor.TYPE_MESSAGE:
        # nested message
        field_data_type = _proto3_message_descriptor_to_spark_schema(
            field_desc.message_type)
    else:
        # scalar value types
        field_data_type = _SPARK_SQL_TYPE_MAP[field_desc.type]

    # list type field
    if field_desc.label == FieldDescriptor.LABEL_REPEATED:
        return types.ArrayType(field_data_type)

    return field_data_type
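As a concrete illustration of the repeated-field branch, assuming `_SPARK_SQL_TYPE_MAP` maps `FieldDescriptor.TYPE_STRING` to `StringType()` (the field below is hypothetical):

# For a proto3 field declared as:
#     repeated string tags = 1;
# the descriptor has type == FieldDescriptor.TYPE_STRING and
# label == FieldDescriptor.LABEL_REPEATED, so the function above returns:
#     types.ArrayType(types.StringType())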