Example #1
def text_features(p_df):
    """
    Extracts features derived from the Quora question texts.
    :param p_df: A DataFrame.
    :return: A DataFrame.
    """
    diff_len = udf(lambda arr: arr[0] - arr[1], IntegerType())
    common_words = udf(lambda arr: len(set(arr[0]).intersection(set(arr[1]))),
                       IntegerType())
    unique_chars = udf(lambda s: len(''.join(set(s.replace(' ', '')))),
                       IntegerType())

    p_df = p_df.withColumn("len_q1", length("question1")).withColumn(
        "len_q2", length("question2"))
    p_df = p_df.withColumn("diff_len", diff_len(array("len_q1", "len_q2")))
    p_df = p_df.withColumn("words_q1", size("question1_words")).withColumn(
        "words_q2", size("question2_words"))
    p_df = p_df.withColumn(
        "common_words",
        common_words(array("question1_words", "question2_words")))
    p_df = p_df.withColumn("unique_chars_q1",
                           unique_chars("question1")).withColumn(
                               "unique_chars_q2", unique_chars("question2"))

    assembler = VectorAssembler(inputCols=[
        "len_q1", "len_q2", "diff_len", "words_q1", "words_q2", "common_words",
        "unique_chars_q1", "unique_chars_q2"
    ],
                                outputCol="text_features")
    p_df = assembler.transform(p_df)
    return p_df
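A minimal usage sketch (not part of the original example), assuming an active SparkSession and that text_features with its pyspark imports (udf, array, length, size, IntegerType, VectorAssembler) is in scope; the *_words columns are tokenized inline for brevity:

from pyspark.sql import SparkSession
from pyspark.sql.functions import split

spark = SparkSession.builder.getOrCreate()
pairs = spark.createDataFrame(
    [("How do I learn Spark?", "How can I learn Spark quickly?")],
    ["question1", "question2"])

# The function expects pre-tokenized word columns alongside the raw questions.
pairs = pairs.withColumn("question1_words", split("question1", r"\s+")) \
             .withColumn("question2_words", split("question2", r"\s+"))

text_features(pairs).select("text_features").show(truncate=False)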
Example #2
def text_features(p_df):
    """
    Extracts features derived from the Quora question texts.
    :param p_df: A DataFrame.
    :return: A DataFrame.
    """
    diff_len = udf(lambda arr: arr[0] - arr[1], IntegerType())
    common_words = udf(lambda arr: len(set(arr[0]).intersection(set(arr[1]))), IntegerType())
    unique_chars = udf(lambda s: len(''.join(set(s.replace(' ', '')))), IntegerType())


    p_df = p_df.withColumn("len_q1", length("question1")).withColumn("len_q2", length("question2"))
    p_df = p_df.withColumn("diff_len", diff_len(array("len_q1", "len_q2")))
    p_df = p_df.withColumn("words_q1", size("question1_words")).withColumn("words_q2", size("question2_words"))
    p_df = p_df.withColumn("common_words", common_words(array("question1_words", "question2_words")))
    p_df = p_df.withColumn(
        "unique_chars_q1", unique_chars("question1")
    ).withColumn("unique_chars_q2", unique_chars("question2"))

    assembler = VectorAssembler(
        inputCols=["len_q1", "len_q2", "diff_len", "words_q1", "words_q2", "common_words", "unique_chars_q1", "unique_chars_q2"],
        outputCol="text_features"
    )
    p_df = assembler.transform(p_df)    
    return p_df
Example #3
def filter_str(df, col, filter_null=True, limit_length=True):
    if filter_null:
        # Drop nulls and whitespace-only strings; Column.endswith matches literally,
        # so a regex test via rlike is needed here.
        df = df.filter(df[col].isNotNull() & ~df[col].rlike(r"^\s*$"))
    if limit_length:
        df = df.filter(F.length(df[col]) < MAX_LENGTH).filter(
            F.length(df[col]) > MIN_LENGTH)
    return df
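A usage sketch, assuming the module-level MIN_LENGTH and MAX_LENGTH constants (placeholder values below) and pyspark.sql.functions imported as F:

from pyspark.sql import SparkSession, functions as F

# Placeholder bounds; the real module defines MIN_LENGTH / MAX_LENGTH itself.
MIN_LENGTH, MAX_LENGTH = 5, 2000

spark = SparkSession.builder.getOrCreate()
sample = spark.createDataFrame(
    [("a reasonably long comment",), ("   ",), (None,), ("hi",)], ["comment"])

filter_str(sample, "comment").show()
# Keeps only the first row: nulls, whitespace-only and too-short strings are dropped.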
Example #4
    def cross_district_crimes(self, df=None, img_out=None, csv_out=None, cache=False):
        nypd_df = self.nypd_df

        if df:
            nypd_df = df

        if cache:
            nypd_df = nypd_df.persist()

        nypd_df = nypd_df.filter(
            (F.length(F.col(c.BOROUGH)) > 0)
            & (F.length(F.col(c.OFFENSE_DESCRIPTION)) > 0)
        )

        data2 = nypd_df.toPandas()
        df = pd.crosstab(data2.BORO_NM, data2.OFNS_DESC)

        if img_out:
            plt.figure()
            color = plt.cm.gist_rainbow(np.linspace(0, 1, 10))

            df.div(df.sum(1).astype(float), axis=0).plot.bar(stacked=True, color=color, figsize=(18, 12))
            plt.title('District vs Category of Crime', fontweight=30, fontsize=20)

            plt.xticks(rotation=90)
            plt.savefig(img_out)

        if csv_out:
            self._save_csv(df, csv_out)

        return nypd_df
Example #5
def spark_ratio(df, left='left', right='right'):
    """Fuzzy match ratio between two string columns, based on Levenshtein distance."""
    # least() is the row-wise minimum; F.min() is an aggregate and would not work here.
    df = df.withColumn('len', F.least(F.length(left), F.length(right)))
    df = df.withColumn('levenshtein', F.levenshtein(left, right))
    df = df.withColumn('inv_edit_distance',
                       F.col('len') - F.col('levenshtein'))
    df = df.withColumn('ratio', F.col('inv_edit_distance') / F.col('len'))
    df = df.withColumnRenamed('ratio', 'fuzzy')
    df = df.select(['fuzzy'])
    return df
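A usage sketch for the helper as rewritten above, assuming pyspark.sql.functions is imported as F and a SparkSession is active:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
names = spark.createDataFrame([("kitten", "sitting"), ("spark", "spark")],
                              ["left", "right"])

spark_ratio(names).show()
# 1.0 for identical strings; lower values as the Levenshtein distance grows.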
Example #6
def prepare_features(df):
    df = df.withColumn(
        'exclam',
        length('review_body') -
        length(regexp_replace('review_body', '\!', '')))
    df = df.withColumn('age',
                       datediff(current_date(), to_date(df['review_date'])))
    df = df.withColumn('review_length', length(df['review_body']))
    df = df.withColumn('helpfulness', df['helpful_votes'] / df['total_votes'])
    df = df.withColumn('label', expr("CAST(verified_purchase='Y' As INT)"))
    select_cols = df.select(
        ['star_rating', 'helpfulness', 'age', 'review_length',
         'label']).na.fill(0)
    return select_cols
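A sketch of the input this function expects (column names inferred from the code above), assuming the pyspark functions it uses (length, regexp_replace, datediff, current_date, to_date, expr) are imported:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
reviews = spark.createDataFrame(
    [(5, "Great product! Loved it!", "2020-01-15", 3, 4, "Y")],
    ["star_rating", "review_body", "review_date", "helpful_votes",
     "total_votes", "verified_purchase"])

prepare_features(reviews).show()
# Returns star_rating, helpfulness, age, review_length and label, with nulls filled as 0.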
Example #7
    def _preprocess(self):

        input_cols = [
            'armed', 'city', 'manner_of_death', 'flee', 'gender', 'state',
            'threat_level', 'body_camera', 'signs_of_mental_illness'
        ]

        # self.shootings_df = self.shootings_df.select([c for c in self.shootings_df.columns if c in input_cols])
        # self.shootings_df.show(n=10)

        self.shootings_df = self.shootings_df.filter(
            (F.length(F.col('armed')) > 0) & (F.length(F.col('city')) > 0) & \
            (F.length(F.col('manner_of_death')) > 0) & (F.length(F.col('race')) > 0) & \
            (F.length(F.col('flee')) > 0) & (F.length(F.col('gender')) > 0) & \
            (F.length(F.col('state')) > 0) & (F.length(F.col('threat_level')) > 0) & \
            (F.length(F.col('body_camera')) > 0) & (F.length(F.col('signs_of_mental_illness')) > 0)
            )

        indexers = [
            StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c))
            for c in input_cols
        ]

        # One-hot encode the indexed values (one encoder per indexed column)
        encoders = [
            OneHotEncoder(dropLast=False,
                          inputCol=indexer.getOutputCol(),
                          outputCol="{0}_enc".format(indexer.getOutputCol()))
            for indexer in indexers
        ]

        # Vectorizing encoded values
        assembler = VectorAssembler(
            inputCols=[encoder.getOutputCol() for encoder in encoders],
            outputCol="features")

        pipeline = Pipeline(stages=indexers + encoders + [assembler])
        model = pipeline.fit(self.shootings_df)
        self.shootings_df = model.transform(self.shootings_df)

        self.shootings_df = self.shootings_df.withColumn(
            'label',
            udf_parse_race('race').cast('int'))
        self.shootings_df = self.shootings_df.select('features', 'race',
                                                     'label')
        self.shootings_df.persist().count()

        return self.shootings_df
Example #8
def visualize_tweet_data(twitter_data):
    from pyspark.sql.functions import avg, col, length
    from pyspark.sql.functions import lower, split
    import itertools
    import collections
    import seaborn as sns
    import nltk  # needed below for nltk.download('stopwords')
    from nltk.corpus import stopwords
    import pandas as pd
    from wordcloud import WordCloud
    sns.set(font_scale=1.5)
    sns.set_style("whitegrid")

    # Number of distinct tweets
    num_of_diftweet = len(twitter_data.groupBy("tweet").count().collect())
    print("Number of distinct tweets: " + str(num_of_diftweet))

    # Average tweet length
    text_length = twitter_data.withColumn("length",
                                          length(twitter_data["tweet"]))
    avg_length = round(text_length.select(avg("length")).collect()[0][0], 2)
    print("Average tweet length is: " + str(avg_length))

    # Lower-case every tweet and split it into words
    words_in_tweet = twitter_data.select(split(lower(col("tweet")),
                                               " ")).collect()

    # word_list is a list of word lists (one per tweet); it is used below when
    # removing stopwords and collection words
    word_list = []
    for each_tweet in words_in_tweet:
        for word in each_tweet:
            word_list.append(word)

    # Remove stopwords (very common English words)
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

    tweets_nsw = [[word for word in tweet_words if word not in stop_words]
                  for tweet_words in word_list]

    #Eliminate collection words
    collection_words = [
        'gucci', 'polo', 'chanel', 'burberry', 'prada', 'versace', 'fendi',
        'hermes', 'new', 'loving', 'never', 'check', 'share', 'someone',
        'fashion', 'got', 'played'
    ]

    tweets_nsw_nc = [[w for w in word if w not in collection_words]
                     for word in tweets_nsw]
    #Create a list of words after cleaning all common words
    all_words_nsw_nc = list(itertools.chain(*tweets_nsw_nc))
    counts_nsw_nc = collections.Counter(all_words_nsw_nc)
    input_data = counts_nsw_nc

    #Plot word count function
    word_count_visualization(input_data)

    #Word cloud
    create_wordcloud(input_data)
Example #9
def usuarios_features(df, categoria=-1.0):
    logger.info("Calculando features para usuarios...")

    resultado = (df.select(
        df["user.id"].alias("user_id"),
        nullToInt("user.profile_use_background_image").alias(
            "con_imagen_fondo"),
        u_parse_time("user.created_at").cast('timestamp').alias(
            "cuenta_creada"), df["user.favourites_count"].alias("n_favoritos"),
        nullToInt("user.description").alias("con_descripcion"),
        F.length("user.description").alias("longitud_descripcion"),
        nullToInt("user.verified").alias("con_perfil_verificado"),
        nullToInt("user.default_profile_image").alias("con_imagen_default"),
        df["user.listed_count"].alias("n_listas"),
        nullToInt("user.geo_enabled").alias("con_geo_activo"),
        reputacion("user.followers_count",
                   "user.friends_count").alias("reputacion"),
        df["user.statuses_count"].alias("n_tweets"),
        followersRatio("user.followers_count",
                       "user.friends_count").alias("followers_ratio"),
        df["user.screen_name"].alias("nombre_usuario"),
        entropia("lista_intertweet").alias("entropia")).withColumn(
            "ano_registro", F.year("cuenta_creada")).withColumn(
                "categoria",
                F.lit(categoria)).withColumn("createdAt",
                                             F.current_timestamp()))

    return resultado
Example #10
def preparar_df(df):
    df = df.repartition(df.user.id)

    df = df.where(F.length(df.text) > 0)
    df = df.select(
        "*",
        u_parse_time(
            df['created_at']).cast('timestamp').alias('created_at_ts'))

    df_intertweet = df.select(
        df.user.id.alias("user_id"),
        (df.created_at_ts.cast('bigint') -
         F.lag(df.created_at_ts.cast('bigint')).over(
             Window.partitionBy("user.id").orderBy("created_at_ts"))
         ).cast("bigint").alias("time_intertweet"))

    df_list_intertweet = df_intertweet.groupby(df_intertweet.user_id).agg(
        F.collect_list("time_intertweet").alias("lista_intertweet"))

    df_list_intertweet = df_list_intertweet.filter(
        F.size(df_list_intertweet.lista_intertweet) > 3)

    df = df.join(df_list_intertweet,
                 df["user.id"] == df_list_intertweet["user_id"])

    return df
Example #11
    def crimes_top(self, df=None, n=20, img_out=None, csv_out=None, cache=False):
        nypd_df = self.nypd_df

        if df:
            nypd_df = df

        if cache:
            nypd_df = nypd_df.persist()

        # data cleaning:
        # filter rows without an OFFENSE_DESCRIPTION
        df = nypd_df.filter(F.length(F.col(c.OFFENSE_DESCRIPTION)) > 0)

        # crime types
        crime_type_groups = df.groupBy(c.OFFENSE_DESCRIPTION).count()
        crime_type_counts = crime_type_groups.orderBy('count', ascending=False)

        # select the top N most frequent crimes and plot the distribution
        counts_crime_pddf = crime_type_counts.toPandas()
        counts_crime_pddf_top_N = counts_crime_pddf[:n]

        print(counts_crime_pddf_top_N)

        if img_out:
            plt.figure(figsize=(12, 8))
            counts_crime_pddf_top_N.plot.barh(x=c.OFFENSE_DESCRIPTION, y='count')
            plt.savefig(img_out)

        if csv_out:
            self._save_csv(counts_crime_pddf_top_N, csv_out)

        return crime_type_counts
Example #12
    def cross_age_race(self, df=None, img_out=None, csv_out=None, cache=False):
        nypd_df = self.nypd_df

        if df:
            nypd_df = df

        if cache:
            nypd_df = nypd_df.persist()

        age_groups = ['<18', '18-24', '25-44', '45-64', '65+']

        nypd_df = nypd_df.select(c.AGE, c.RACE)

        nypd_df = nypd_df.filter((F.length(F.col(c.AGE)) > 0) & (F.col(c.AGE) != 'false'))
        nypd_df = nypd_df.where(F.col(c.AGE).isin(age_groups))

        data3 = self_toPandas(nypd_df, 4)
        df = pd.crosstab(data3.SUSP_RACE, data3.SUSP_AGE_GROUP)

        if img_out:
            plt.figure()
            color = plt.cm.gist_rainbow(np.linspace(0, 1, 10))

            df.div(df.sum(1).astype(float), axis=0).plot.bar(stacked=True, color=color, figsize=(18, 12))
            plt.title('age vs race', fontweight=30, fontsize=20)

            plt.xticks(rotation=90)
            plt.savefig(img_out)

        if csv_out:
            self._save_csv(df, csv_out)

        return nypd_df
Example #13
    def test_regex(self, input_df):
        filter_expression = """attributes.last_name rlike "^.{7}$" """
        transformer = Sieve(filter_expression=filter_expression)
        transformed_df = transformer.transform(input_df)
        assert transformed_df.count() < input_df.count()
        assert transformed_df.count() == input_df.where(
            F.length(input_df.attributes.last_name) == 7).count()
Example #14
def test15(spark):
    """
    This demonstrates reading JSON events from Pravega. It uses chunked encoding to support
    events of up to 2 GiB.
    """
    # ssrc is the synchronization source identifier. See https://en.wikipedia.org/wiki/Real-time_Transport_Protocol.
    # It should be selected at random by each process that writes records.
    schema = 'timestamp timestamp, frame_number int, camera int, ssrc int, data binary'

    controller = os.getenv('PRAVEGA_CONTROLLER', 'tcp://127.0.0.1:9090')
    scope = os.getenv('PRAVEGA_SCOPE', 'examples')
    df = (spark.readStream.format("pravega").option(
        "controller", controller).option("scope", scope).option(
            "stream", "video").option("encoding", "chunked_v1").load())

    df = df.withColumnRenamed('event', 'raw_event')
    df = df.select('*', decode('raw_event', 'UTF-8').alias('event_string'))
    df = df.select(
        '*',
        from_json('event_string', schema=schema,
                  options=dict(mode='FAILFAST')).alias('event'))
    df = df.select('*', 'event.*')
    df = df.select('*', length('data'))
    df = df.drop('raw_event', 'event_string', 'event', 'data')
    df = df.withWatermark('timestamp', '60 second')

    df.printSchema()

    if True:
        (df.writeStream.trigger(
            processingTime='3 seconds')  # limit trigger rate
         .outputMode('append').format('console').option(
             'truncate', 'false').start().awaitTermination())
Example #15
def _get_base_cols(row: StructExpression) -> List[Column]:
    assert check_argument_types()

    contig_name_col = fx.col("`locus.contig`").alias("contigName")

    start_col = (fx.col("`locus.position`") - 1).cast("long").alias("start")

    end_col = start_col + fx.length(fx.element_at("alleles", 1))
    has_info = 'info' in row and isinstance(row.info.dtype, tstruct)
    if has_info and 'END' in row.info and row.info.END.dtype == tint:
        end_col = fx.coalesce(fx.col("`info.END`"), end_col)
    end_col = end_col.cast("long").alias("end")

    names_elems = []
    if 'varid' in row and row.varid.dtype == tstr:
        names_elems.append("varid")
    if 'rsid' in row and row.rsid.dtype == tstr:
        names_elems.append("rsid")
    names_col = fx.expr(
        f"nullif(filter(array({','.join(names_elems)}), n -> isnotnull(n)), array())").alias("names")

    reference_allele_col = fx.element_at("alleles", 1).alias("referenceAllele")

    alternate_alleles_col = fx.expr("slice(alleles, 2, size(alleles) - 1)").alias("alternateAlleles")

    base_cols = [
        contig_name_col, start_col, end_col, names_col, reference_allele_col, alternate_alleles_col
    ]
    assert check_return_type(base_cols)
    return base_cols
Example #16
def create_values(cols):
    values = []
    for col in cols:
        if col.is_lookup == 1:
            values.append(
                f.when(
                    f.col(col.demographic_key).isNull(),
                    f.concat_ws('_', f.lit(col.demographic_key),
                                f.lit('9999'))).when(
                                    f.trim(f.col(col.demographic_key)) == '',
                                    f.concat_ws('_',
                                                f.lit(col.demographic_key),
                                                f.lit('9999'))).
                when(
                    f.length(
                        f.regexp_extract(
                            f.col(col.demographic_key).astype('string'),
                            '(\d+)', 1)) > 0,
                    f.concat_ws(
                        '_', f.lit(col.demographic_key),
                        f.col(col.demographic_key).astype('int').astype(
                            'string'))).otherwise(
                                f.concat_ws('_', f.lit(col.demographic_key),
                                            f.col(col.demographic_key))))
        else:
            values.append(f.col(col.demographic_key))
    return values
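The cols argument appears to be a sequence of objects exposing demographic_key and is_lookup; a sketch with a hypothetical namedtuple standing in for them:

from collections import namedtuple
from pyspark.sql import SparkSession, functions as f

# Hypothetical column spec; the real project supplies its own objects.
ColSpec = namedtuple("ColSpec", ["demographic_key", "is_lookup"])

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("34",), (" ",), (None,)], ["age_band"])

values = create_values([ColSpec("age_band", 1)])
df.select(*values).show(truncate=False)
# Numeric values become "age_band_34"; nulls and blanks become "age_band_9999".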
Example #17
def lyrSizePandasMixed(count):
  """
  Applies the lineList pandas UDF to each value and returns the total length of
  the generated strings.
  """
  df = spark.createDataFrame(sc.range(count, 0, -1), schema=T.IntegerType())
  df = df.withColumn("lyr", F.pandas_udf(lineList, T.StringType())(F.col("value"))).select("lyr")
  return df.withColumn("lyrc", F.length(F.col("lyr"))).select(F.sum(F.col("lyrc")).alias("c")).first()["c"]
Example #18
def test2(spark):
    """
    Reads video events from Pravega and, for each micro-batch, prints the first
    bytes of one image.
    """
    schema = 'timestamp timestamp, frame_number int, camera int, ssrc int, data binary'

    # To allow for large images and avoid out-of-memory errors, limit the Arrow
    # batch size that the JVM sends to Python to a single record.
    spark.conf.set('spark.sql.execution.arrow.maxRecordsPerBatch', '1')

    controller = os.getenv('PRAVEGA_CONTROLLER', 'tcp://127.0.0.1:9090')
    scope = os.getenv('PRAVEGA_SCOPE', 'examples')

    df = (spark.readStream.format("pravega").option(
        "controller", controller).option("scope", scope).option(
            "stream", "video").option("encoding", "chunked_v1").load())

    df = df.withColumnRenamed('event', 'raw_event')
    df = df.select('*', decode('raw_event', 'UTF-8').alias('event_string'))
    df = df.select('*',
                   from_json('event_string', schema=schema).alias('event'))
    df = df.select('*', 'event.*')
    df = df.select('*', length('data'))
    df = df.withWatermark('timestamp', '60 second')

    def f(batch_df, batch_id):
        print('batch_id=%d' % batch_id)
        png0 = batch_df.select('data').limit(1).collect()[0][0]
        print('png0=%s' % png0[0:20])

    #     IPython.display.clear_output(wait=True)
    #     IPython.display.display(IPython.display.Image(data=png0))

    (df.writeStream.trigger(processingTime='3 seconds')  # limit trigger rate
     .foreachBatch(f).start().awaitTermination())
Example #19
def get_suggested_dict(df):
    '''
    :param df: data frame
    :return: dictionary of suggested types in Postgres
    '''
    # ArrayType, BinaryType are not handled yet
    suggested = {}

    for f in df.schema.fields:
        if isinstance(f.dataType, DateType):
            suggested[f.name] = 'date'
        elif isinstance(f.dataType, StringType):
            df = df.withColumn('length', F.length(F.col(f.name)))
            x = df.agg(F.max(df.length)).collect()[0][0] or 1
            # 20% extra length based on the longest string (falls back to 1 if the column is all null)
            suggested[f.name] = 'varchar({})'.format(int(x * 1.2))
        elif isinstance(f.dataType, DoubleType) or isinstance(
                f.dataType, DecimalType):
            suggested[f.name] = 'numeric(18,2)'
        elif isinstance(f.dataType, LongType):
            suggested[f.name] = 'int8'
        elif isinstance(f.dataType, FloatType):
            suggested[f.name] = 'float8'
        elif isinstance(f.dataType, ShortType):
            suggested[f.name] = 'integer'
        elif isinstance(f.dataType, BooleanType):
            suggested[f.name] = 'bool'
        elif isinstance(f.dataType, TimestampType):
            suggested[f.name] = 'timestamptz'
    return suggested
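A usage sketch, assuming the pyspark type classes used by the function are already imported and a SparkSession is active:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, "alice", 3.5)], "id long, name string, score double")

print(get_suggested_dict(df))
# e.g. {'id': 'int8', 'name': 'varchar(6)', 'score': 'numeric(18,2)'}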
Example #20
def main():
    spark = SparkSession.builder.appName('nlp').getOrCreate()
    data = spark.read.csv("./data/smsspamcollection/SMSSpamCollection",
                          inferSchema=True, sep='\t')
    data = data.withColumnRenamed('_c0', 'class').withColumnRenamed('_c1',
                                                                    'text')
    data.show()
    data = data.withColumn('length', length(data['text']))
    data.show()
    data.groupby('class').mean().show()
    tokenizer = Tokenizer(inputCol="text", outputCol="token_text")
    stopremove = StopWordsRemover(inputCol='token_text',
                                  outputCol='stop_tokens')
    count_vec = CountVectorizer(inputCol='stop_tokens', outputCol='c_vec')
    idf = IDF(inputCol="c_vec", outputCol="tf_idf")
    ham_spam_to_num = StringIndexer(inputCol='class', outputCol='label')
    clean_up = VectorAssembler(inputCols=['tf_idf', 'length'],
                               outputCol='features')
    nb = NaiveBayes()
    data_prep_pipe = Pipeline(stages=[ham_spam_to_num, tokenizer, stopremove,
                                      count_vec, idf, clean_up])
    cleaner = data_prep_pipe.fit(data)
    clean_data = cleaner.transform(data)
    clean_data = clean_data.select(['label', 'features'])
    clean_data.show()
    (training, testing) = clean_data.randomSplit([0.7, 0.3])
    spam_predictor = nb.fit(training)
    data.printSchema()
    test_results = spam_predictor.transform(testing)
    test_results.show()
    acc_eval = MulticlassClassificationEvaluator()
    acc = acc_eval.evaluate(test_results)
    print("Accuracy of model at predicting spam was: {}".format(acc))
Example #21
    def crimes_ages(self, df=None, img_out=None, csv_out=None, cache=False):
        nypd_df = self.nypd_df

        if df:
            nypd_df = df

        if cache:
            nypd_df = nypd_df.persist()

        nypd_df = nypd_df.filter(F.length(F.col(c.AGE)) > 0)

        crime_age_groups = nypd_df.groupBy(c.AGE).count()

        crime_age_counts = crime_age_groups.orderBy('count', ascending=False)

        pddf = crime_age_counts.toPandas()
        pddf.set_index(c.AGE, inplace=True)

        if img_out:
            plt.figure()
            pddf.plot.pie(y='count')
            plt.savefig(img_out)

        if csv_out:
            self._save_csv(pddf, csv_out)

        return crime_age_counts
Example #22
def parse_worker_data(spark: SparkSession, input_path: str) -> DataFrame:
    """
    Parse the asylum seeker data to the appropriate schema.
    :param spark: the SparkSession object
    :param input_path: location of the data
    :return: A Spark dataframe
    """
    csv, to_filter = "h1b_kaggle.csv", [
        "CASE_STATUS", "EMPLOYER_NAME", "YEAR", "WORKSITE"
    ]
    df1 = spark.read.csv(input_path + "legal_immigrant_data/{}".format(csv), header=True) \
        .selectExpr(*_lower_case_headers(to_filter)) \
        .dropDuplicates() \
        .withColumn("visa_class", F.lit("H-1B"))
    df1 = df1.withColumn('split', F.split(df1['worksite'], ',')) \
        .withColumn("worksite_city", F.col('split')[0]) \
        .withColumn("worksite_state", F.col('split')[1]) \
        .drop("split", "worksite")
    df1 = df1.withColumn('worksite_state',
                         _abbreviate_state(df1.worksite_state))
    csv = "H-1B_Disclosure_Data_FY17.csv"
    to_filter = [
        'CASE_STATUS', 'VISA_CLASS', 'EMPLOYMENT_START_DATE',
        'EMPLOYMENT_END_DATE', 'EMPLOYER_NAME', 'EMPLOYER_CITY',
        'EMPLOYER_STATE', 'WORKSITE_CITY', 'WORKSITE_STATE'
    ]
    df2 = spark.read.csv(input_path + "legal_immigrant_data/{}".format(csv), header=True) \
        .selectExpr(*_lower_case_headers(to_filter)) \
        .dropDuplicates()
    states = {
        'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 'HI', 'ID',
        'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS',
        'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK',
        'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV',
        'WI', 'WY'
    }
    valid_row_alignment = lambda x: (F.length(x) == 2) & (x.isin(states))
    df2 = df2.filter(valid_row_alignment(
        df2.worksite_state))  # Mini data quality check on row alignment
    for d in ['start_date', 'end_date']:
        prefix = 'arrival' if d == 'start_date' else 'expiry'
        column = 'employment_start_date' if d == 'start_date' else 'employment_end_date'
        df2 = df2.withColumn(d, F.to_date(column)) \
            .withColumn('{}_year'.format(prefix), F.year(d)) \
            .withColumn('{}_month'.format(prefix), F.month(d)) \
            .withColumn('{}_day'.format(prefix), F.dayofmonth(d)) \
            .withColumn('{}_weekday'.format(prefix), F.date_format(d, 'E')) \
            .drop(d, column)
    new_df = _fill_missing_columns(df1, df2)
    new_df = new_df.union(df2).dropDuplicates().withColumn(
        'id', F.monotonically_increasing_id())
    new_df = new_df.withColumnRenamed('visa_class', 'visa_type')
    for column in [
            'case_status', 'employer_name', 'worksite_city', 'arrival_weekday',
            'expiry_weekday'
    ]:
        new_df = _clean_string_column(new_df, column)
    schema = get_schema('worker')
    worker_df = make_empty_df(spark, schema).union(new_df.select(list(schema)))
    return worker_df
Example #23
def pipeline(df):
    print(df.head())
    df = df.withColumn("length", length(df['Speech']))
    # Create the data processing pipeline functions here (note: StringIndexer will be used to encode
    # your target variable column. This column should be named 'label' so our model will recognize it later)
    review_data = Tokenizer(inputCol="Speech", outputCol="Words")
    reviewed = review_data.transform(df)
    #reviewed.show()
    remover = StopWordsRemover(inputCol="Words", outputCol="filtered")
    newFrame = remover.transform(reviewed)
    #newFrame.show()
    hashing = HashingTF(inputCol="filtered",
                        outputCol="hashedValues",
                        numFeatures=pow(2, 10))
    # Transform in a DF
    hashed_df = hashing.transform(newFrame)
    hashed_df.show(truncate=False)
    idf = IDF(inputCol="hashedValues", outputCol="feature")
    idfModel = idf.fit(hashed_df)
    rescaledData = idfModel.transform(hashed_df)
    rescaledData.select("words", "feature").show(truncate=False)
    # indexer = StringIndexer(inputCol="Party_Affliation", outputCol="label")

    # indexed = indexer.fit(rescaledData).transform(rescaledData)

    assembler = VectorAssembler(inputCols=["feature", "length"],
                                outputCol="features")

    return assembler.transform(rescaledData)
Example #24
    def deaths_states_topN(self, n=5, df=None, img_out=None, csv_out=None, cache=False):
        pdde_df = self.pdde_df

        if df:
            pdde_df = df

        if cache:
            pdde_df = pdde_df.persist()

        # data cleaning:
        # filter rows without a state
        df = pdde_df.filter(F.length(F.col(c.STATE)) > 0)

        # states
        deaths_states = df.groupBy(c.STATE).count()
        deaths_states_counts = deaths_states.orderBy('count', ascending=False)

        # select the top N most frequent deaths causes and plot the distribution
        counts_states_ppdf = deaths_states_counts.toPandas()
        counts_deaths_pddf_top_N = counts_states_ppdf[:n]

        print(counts_deaths_pddf_top_N)

        if csv_out:
            self._save_csv(counts_deaths_pddf_top_N, csv_out)

        if img_out:
            counts_deaths_pddf_top_N.plot.barh(x=c.STATE, y='count')
            plt.xlabel('Count')  # barh puts the states on the y-axis
            plt.ylabel('State')
            plt.savefig(img_out)

        return counts_deaths_pddf_top_N
Example #25
def postJsonHandler():
    #print (request.is_json)
    #Check JSON data
    if not request.is_json:
        return ErrorMSG
    content = request.get_json()
    text = content.get('Text')
    if text is None:
        return ErrorMSG
    df = spark.createDataFrame([(0, text)], ["label", "Summary"])
    data = df.withColumn('length', length(df['Summary']))
    #print (df['Summary'])
    #print(text)
    try:
        model = PipelineModel.load('model')

        # Make predictions on test documents and print columns of interest.
        prediction = model.transform(data)
        selected = prediction.select("label", "Summary", "probability",
                                     "prediction")
        myJSON = {}
        for row in selected.collect():
            label, text, prob, prediction = row
            print("(%d, %s) --> prob=%s, prediction=%f" %
                  (label, text, str(prob), prediction))
            myJSON['text'] = text
            myJSON['prediction'] = prediction
        return jsonify(myJSON)
    except Exception as ex:
        print(ex)

    return 'JSON posted'
Example #26
File: job.py Project: reganzm/ai
def statistic_job_jd(df):
    """
    Analyze job descriptions (JD).
    :param df: input DataFrame
    :return: deduplicated DataFrame with position_name and job_desc
    """
    df = df.filter(df.job_desc.isNotNull()).filter(
        F.length(df.job_desc) > 60).filter(F.length(df.job_desc) < 2000)
    # Deduplicate
    df = df.dropDuplicates(subset=['position_name', "job_desc"])
    # df = df.withColumn("job_desc", F.udf(lambda x: emoji_pattern.sub(r'', x))(df.job_desc))
    # df = df.withColumn("job_desc", F.udf(lambda x: emoji_p.sub(r'', x))(df.job_desc))
    df = df.withColumn("job_desc", F.udf(filter_emoji)(df.job_desc))

    df = df.select("position_name", "job_desc")
    return df
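filter_emoji is defined elsewhere in the project; a plausible stand-in (an assumption, not the project's actual helper) that strips common emoji ranges:

import re

# Hypothetical helper: drop characters in common emoji / pictograph ranges.
_EMOJI_RE = re.compile(
    "["
    "\U0001F300-\U0001FAFF"   # symbols, pictographs, emoticons
    "\U00002600-\U000027BF"   # misc symbols and dingbats
    "]+",
    flags=re.UNICODE)

def filter_emoji(text):
    return _EMOJI_RE.sub("", text) if text else text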
Example #27
    def crimes_severity(self, df=None, img_out=None, csv_out=None, cache=False):
        nypd_df = self.nypd_df

        if df:
            nypd_df = df

        if cache:
            nypd_df = nypd_df.persist()

        # analyze crimes severity over years
        nypd_df = nypd_df.filter(F.length(F.col(c.LEVEL_OFFENSE)) > 0)

        grouped_severity_df = nypd_df.groupby('yearpd', c.LEVEL_OFFENSE).count()

        severity_framing_pddf = grouped_severity_df.toPandas()
        grouped_severity_df_pddf = severity_framing_pddf.groupby(by=['yearpd', c.LEVEL_OFFENSE]).sum()

        print(grouped_severity_df_pddf)

        if img_out:
            plt.figure()
            grouped_severity_df_pddf['count'].unstack().plot.bar()
            plt.xticks(rotation=0)
            plt.ylabel('Counts')
            plt.xlabel('Crimes severity year-on-year')
            plt.savefig(img_out)

        if csv_out:
            self._save_csv(grouped_severity_df_pddf, csv_out)

        return grouped_severity_df
Example #28
    def generateRxInfo(self, dataFrame):

        logger.info('in generateRxInfo')
        base_uri = 'http://{}:{}/REST/rxcui/'.format(
            Constants.POSTGRESQL_HOST_IP, Constants.RXNAV_PORT)

        def genRxInfo(RxNormId):
            d = {'BN': [], 'IN': [], 'tag': set()}
            if RxNormId != '':
                try:
                    resp = requests.get(url=base_uri + RxNormId +
                                        "/allrelatedextension")
                    root = ET.fromstring(resp.content)
                    for conceptGroup in root.findall(
                            './allRelatedGroup/conceptGroup'):
                        for tty in conceptGroup.findall('./tty'):
                            if tty.text == 'BN':
                                for brandname in conceptGroup.findall(
                                        './conceptProperties/name'):
                                    d['BN'].append(brandname.text)
                            if tty.text == 'IN':
                                for Ingredientname in conceptGroup.findall(
                                        './conceptProperties/name'):
                                    d['IN'].append(Ingredientname.text)
                                for humandrug in conceptGroup.findall(
                                        './conceptProperties/inferedhuman'):
                                    if humandrug.text == 'US':
                                        d['tag'].add('Human_Drug')
                                for animaldrug in conceptGroup.findall(
                                        './conceptProperties/inferedvet'):
                                    if animaldrug.text == 'US':
                                        d['tag'].add('Animal_Drug')
                except ET.ParseError as e:
                    print("Invalid XML received from uri {}".format(e))
            return str(d['BN']) + '|' + str(d['IN']) + '|' + ','.join(d['tag'])

        udf = Functions.UserDefinedFunction(genRxInfo, StringType())

        localDf = dataFrame.groupBy(
            Functions.col("RxNormId").alias("RxNormId2")).agg(
                Functions.lit('1'))

        localDf = localDf.withColumn("RxInfo",
                                     udf(Functions.col("RxNormId2"))).drop('1')

        localDf = dataFrame.join(
            localDf, (dataFrame['RxNormId'] == localDf['RxNormId2'])).drop(
                localDf['RxNormId2'])

        split_col = Functions.split(localDf['RxInfo'], '\|')
        localDf = localDf.withColumn("Brand_Names", split_col.getItem(0)) \
            .withColumn("Ingredients", split_col.getItem(1)) \
            .withColumn(
                "tag",
                Functions.when(localDf.tag == '', split_col.getItem(2))
                .otherwise(
                    Functions.when(Functions.length(split_col.getItem(2)) == 0, localDf.tag)
                    .otherwise(Functions.concat(localDf.tag, Functions.lit(','),
                                                split_col.getItem(2))))) \
            .drop('RxInfo')

        #self.container1.stop()
        #self.container2.stop()
        return localDf
Example #29
def getEntitiesInTweets(date, data):
  temp = data.filter("timestamp = cast('"+date+"' as date)")
  temp = temp.select("entities")
  temp = temp.withColumn("entities",f.explode("entities"))
  temp = temp.filter(f.length(f.col("entities")) > 0)
  temp = temp.groupBy("entities").agg(f.count("entities").alias("count")).orderBy("count", ascending=False)
  #display(temp)
  return temp
Example #30
def dataB_invoiced_currency(df: DataFrame) -> DataFrame:
    df = df.withColumn(
        'currency_code_len',
        F.length(F.regexp_replace(df.currency_code, "\\s+", "")))
    # Default to USD when the currency code is blank or null; comparing a Column
    # to None with == never evaluates to true, so isNull() is used instead.
    data_frame = df.withColumn(
        "invoiced_currency",
        F.when((df.currency_code_len == 0) | df.currency_code.isNull(), F.lit('USD'))
        .otherwise(df.currency_code)).drop('currency_code_len')
    return data_frame
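A usage sketch, assuming pyspark.sql.functions is imported as F, DataFrame is imported, and a SparkSession is active:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
invoices = spark.createDataFrame([("EUR",), ("   ",), (None,)], ["currency_code"])

dataB_invoiced_currency(invoices).show()
# Blank or null currency codes default to USD; "EUR" passes through unchanged.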
Example #31
def getNameList(df, num, length):
    # Get the first num popular games
    # Sorted by Name string length
    df_pandas = df.where(f.length("Name") <= length).toPandas()[0:num]
    idx = df_pandas.Name.str.len().sort_values().index
    df_pandas = df_pandas.reindex(idx)
    print(df_pandas)
    return [row for row in df_pandas.Name]
Example #32
    def create_length_feature(self, dataframe, base_field, length_field):
        """Produces a PySpark dataframe containing a field representing the length of a specified string field.

        :param dataframe: the PySpark dataframe
        :param base_field: the string field for which length is to be calculated
        :param length_field: the name to give to the field that will contain the length of the base_field
        :returns: the PySpark dataframe containing the length field and all fields in the supplied dataframe.
        """
        return(dataframe.withColumn(length_field, length(dataframe[base_field])))
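A quick usage sketch, assuming 'fe' (a hypothetical name) is an instance of the class that defines this method and pyspark.sql.functions.length is imported:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
reviews = spark.createDataFrame([("great phone",), ("ok",)], ["review_text"])

# fe is a hypothetical instance of the class defining create_length_feature.
with_len = fe.create_length_feature(reviews, "review_text", "review_length")
with_len.show()   # review_length is 11 and 2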
Example #33
Test.assertEquals(pluralDF.first()[0], 'cats', 'incorrect result: you need to add an s')
Test.assertEquals(pluralDF.columns, ['word'], "there should be one column named 'word'")

# COMMAND ----------

# MAGIC %md
# MAGIC ** (1c) Length of each word **
# MAGIC 
# MAGIC Now use the SQL `length` function to find the number of characters in each word.  The [`length` function](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.length) is found in the `pyspark.sql.functions` module.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import length
pluralLengthsDF = pluralDF.select(length('word'))
pluralLengthsDF.show()

# COMMAND ----------

# TEST Length of each word (1e)
from collections import Iterable
asSelf = lambda v: map(lambda r: r[0] if isinstance(r, Iterable) and len(r) == 1 else r, v)

Test.assertEquals(set(asSelf(pluralLengthsDF.collect())), {4, 9, 4, 4, 4},
                  'incorrect values for pluralLengths')

# COMMAND ----------

# MAGIC %md
# MAGIC #### ** Part 2: Counting with Spark SQL and DataFrames **
Example #34
# TEST Using DataFrame functions to add an 's' (1b)
Test.assertEquals(pluralDF.first()[0], 'cats', 'incorrect result: you need to add an s')
Test.assertEquals(pluralDF.columns, ['word'], "there should be one column named 'word'")

# COMMAND ----------

# MAGIC %md
# MAGIC ** (1c) Length of each word **
# MAGIC 
# MAGIC Now use the SQL `length` function to find the number of characters in each word.  The [`length` function](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.length) is found in the `pyspark.sql.functions` module.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import length
pluralLengthsDF = pluralDF.select(length(pluralDF.word).alias('length'))
pluralLengthsDF.show()

# COMMAND ----------

# TEST Length of each word (1e)
from collections import Iterable
asSelf = lambda v: map(lambda r: r[0] if isinstance(r, Iterable) and len(r) == 1 else r, v)

Test.assertEquals(asSelf(pluralLengthsDF.collect()), [4, 9, 4, 4, 4],
                  'incorrect values for pluralLengths')

# COMMAND ----------

# MAGIC %md
# MAGIC #### ** Part 2: Counting with Spark SQL and DataFrames **
Example #35
# TEST Using DataFrame functions to add an 's' (1b)
Test.assertEquals(pluralDF.first()[0], 'cats', 'incorrect result: you need to add an s')
Test.assertEquals(pluralDF.columns, ['word'], "there should be one column named 'word'")

# COMMAND ----------

# MAGIC %md
# MAGIC ** (1c) Length of each word **
# MAGIC 
# MAGIC Now use the SQL `length` function to find the number of characters in each word.  The [`length` function](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.length) is found in the `pyspark.sql.functions` module.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import length
pluralLengthsDF = pluralDF.select(length(pluralDF.word))
pluralLengthsDF.show()

# COMMAND ----------

# TEST Length of each word (1e)
from collections import Iterable
asSelf = lambda v: map(lambda r: r[0] if isinstance(r, Iterable) and len(r) == 1 else r, v)

Test.assertEquals(asSelf(pluralLengthsDF.collect()), [4, 9, 4, 4, 4],
                  'incorrect values for pluralLengths')

# COMMAND ----------

# MAGIC %md
# MAGIC #### ** Part 2: Counting with Spark SQL and DataFrames **
Example #36
schema = StructType([
    StructField("uniprot", StringType()),
    StructField("db", StringType()),
    StructField("extern", StringType())
])

df = ( sqlContext
    .read
    .format("com.databricks.spark.csv")
    .schema(schema)
    .option("header", "false")
    .option("delimiter", "\t")
    .option("mode", "DROPMALFORMED")
    .load("hdfs:///user/hbase/idmapping.new.dat")
#    .load("hdfs:///user/hbase/idmapping.10000")
    .dropDuplicates(['extern'])
    .filter( length(col("extern")) > 4) )


print(df.count())

# df.coalesce(1).write.format('com.databricks.spark.csv').options(delimiter="\t").save('/user/hbase/testall')
# df.repartition(1).coalesce(1).write.csv("/user/toniher/idmappingall.csv", header='true', sep='\t')
df.write.format('com.databricks.spark.csv').options(delimiter="\t").save('/user/hbase/idmappingall')

#df.printSchema()

sc.stop()

Example #37
# TEST Using DataFrame functions to add an 's' (1b)
Test.assertEquals(pluralDF.first()[0], 'cats', 'incorrect result: you need to add an s')
Test.assertEquals(pluralDF.columns, ['word'], "there should be one column named 'word'")

# COMMAND ----------

# MAGIC %md
# MAGIC ** (1c) Length of each word **
# MAGIC
# MAGIC Now use the SQL `length` function to find the number of characters in each word.  The [`length` function](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.length) is found in the `pyspark.sql.functions` module.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import length
pluralLengthsDF = pluralDF.select(length('word'))
pluralLengthsDF.show()

# COMMAND ----------

# TEST Length of each word (1e)
from collections import Iterable
asSelf = lambda v: map(lambda r: r[0] if isinstance(r, Iterable) and len(r) == 1 else r, v)

Test.assertEquals(asSelf(pluralLengthsDF.collect()), [4, 9, 4, 4, 4],
                  'incorrect values for pluralLengths')

# COMMAND ----------

# MAGIC %md
# MAGIC #### ** Part 2: Counting with Spark SQL and DataFrames **
Example #39
                         .map(lambda x: separate_matches(x)) \
                         .flatMap(lambda x: x)

rdd_as_csv(match_keys_rdd, "matches_per_line.csv", "\t")


def matches(line):
    global pattern
    return "<M_SEP>".join(pattern.findall(line))


def time_matches(line):
    global time_pattern
    match = time_pattern.search(line)
    if match is None:
        return None
    return match.group()

udf_matches = udf(matches, StringType())
udf_timestamp = udf(time_matches, StringType())

data_frame = file_line_numbers.toDF(["line_number", "line_text"]) \
                              .withColumn("matches", udf_matches("line_text")) \
                              .filter(length("matches") > 0) \
                              .withColumn("timestamp", udf_timestamp("line_text"))

# df = data_frame.drop("line_text")
# print df.show()
dataframe_as_csv(data_frame, "matches.csv", "%d\t%s\t%s\t%s\n", ["line_number", "matches", "timestamp", "line_text"])
sys.exit(0)
Example #40
# TEST Using DataFrame functions to add an 's' (1b)
Test.assertEquals(pluralDF.first()[0], 'cats', 'incorrect result: you need to add an s')
Test.assertEquals(pluralDF.columns, ['word'], "there should be one column named 'word'")

# COMMAND ----------

# MAGIC %md
# MAGIC ** (1c) Length of each word **
# MAGIC 
# MAGIC Now use the SQL `length` function to find the number of characters in each word.  The [`length` function](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.length) is found in the `pyspark.sql.functions` module.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import length
pluralLengthsDF = pluralDF.select(length('word').alias('length'))
pluralLengthsDF.show()

# COMMAND ----------

# TEST Length of each word (1e)
from collections import Iterable
asSelf = lambda v: map(lambda r: r[0] if isinstance(r, Iterable) and len(r) == 1 else r, v)

Test.assertEquals(asSelf(pluralLengthsDF.collect()), [4, 9, 4, 4, 4],
                  'incorrect values for pluralLengths')

# COMMAND ----------

# MAGIC %md
# MAGIC #### ** Part 2: Counting with Spark SQL and DataFrames **