def text_features(p_df):
    """
    Extracts features derived from the quora question texts.
    :param p_df: A DataFrame.
    :return: A DataFrame.
    """
    diff_len = udf(lambda arr: arr[0] - arr[1], IntegerType())
    common_words = udf(lambda arr: len(set(arr[0]).intersection(set(arr[1]))), IntegerType())
    unique_chars = udf(lambda s: len(''.join(set(s.replace(' ', '')))), IntegerType())

    p_df = p_df.withColumn("len_q1", length("question1")).withColumn("len_q2", length("question2"))
    p_df = p_df.withColumn("diff_len", diff_len(array("len_q1", "len_q2")))
    p_df = p_df.withColumn("words_q1", size("question1_words")).withColumn("words_q2", size("question2_words"))
    p_df = p_df.withColumn("common_words", common_words(array("question1_words", "question2_words")))
    p_df = p_df.withColumn("unique_chars_q1", unique_chars("question1")).withColumn("unique_chars_q2", unique_chars("question2"))

    assembler = VectorAssembler(
        inputCols=["len_q1", "len_q2", "diff_len", "words_q1", "words_q2",
                   "common_words", "unique_chars_q1", "unique_chars_q2"],
        outputCol="text_features")
    p_df = assembler.transform(p_df)
    return p_df
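# Minimal usage sketch for text_features (not from the original source). It assumes the
# snippet's own imports (udf, IntegerType, length, array, size, VectorAssembler) are in scope,
# plus an active SparkSession named `spark`; the sample questions are made up, and the
# `*_words` columns are produced here with a Tokenizer as a stand-in for the real upstream step.
from pyspark.ml.feature import Tokenizer

pairs = spark.createDataFrame(
    [("What is Spark?", "What is Apache Spark?")],
    ["question1", "question2"])
for q in ("question1", "question2"):
    pairs = Tokenizer(inputCol=q, outputCol=f"{q}_words").transform(pairs)
text_features(pairs).select("text_features").show(truncate=False)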
def filter_str(df, col, filter_null=True, limit_length=True):
    if filter_null:
        df = df.filter(df[col].isNotNull() & (~df[col].endswith("\s*")))
    if limit_length:
        df = df.filter(F.length(df[col]) < MAX_LENGTH).filter(F.length(df[col]) > MIN_LENGTH)
    return df
def cross_district_crimes(self, df=None, img_out=None, csv_out=None, cache=False):
    nypd_df = self.nypd_df
    if df:
        nypd_df = df
    if cache:
        nypd_df = nypd_df.persist()
    nypd_df = nypd_df.filter(
        (F.length(F.col(c.BOROUGH)) > 0) &
        (F.length(F.col(c.OFFENSE_DESCRIPTION)) > 0))
    data2 = nypd_df.toPandas()
    df = pd.crosstab(data2.BORO_NM, data2.OFNS_DESC)
    if img_out:
        plt.figure()
        color = plt.cm.gist_rainbow(np.linspace(0, 1, 10))
        df.div(df.sum(1).astype(float), axis=0).plot.bar(stacked=True, color=color, figsize=(18, 12))
        plt.title('District vs Category of Crime', fontweight=30, fontsize=20)
        plt.xticks(rotation=90)
        plt.savefig(img_out)
    if csv_out:
        self._save_csv(df, csv_out)
    return nypd_df
def spark_ratio(left, right):
    # TODO: sparkify this function
    # Repaired so it runs: the original `df = df(['left', 'right'])` referenced an undefined
    # name, and F.min is an aggregate that cannot compare two columns row-wise (F.least can).
    # `spark` is assumed to be an active SparkSession; left/right are sequences of strings.
    df = spark.createDataFrame(list(zip(left, right)), ['left', 'right'])
    df = df.withColumn('len', F.least(F.length('left'), F.length('right')))
    df = df.withColumn('levenshtein', F.levenshtein('left', 'right'))
    df = df.withColumn('inv_edit_distance', F.col('len') - F.col('levenshtein'))
    df = df.withColumn('ratio', F.col('inv_edit_distance') / F.col('len'))
    df = df.withColumnRenamed('ratio', 'fuzzy')
    df = df.select(['fuzzy'])
    return df
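# Hedged usage sketch for the repaired spark_ratio above (not part of the original source);
# `spark` is assumed to be an active SparkSession and the sample strings are made up.
left = ["kitten", "flaw"]
right = ["sitting", "lawn"]
spark_ratio(left, right).show()
# fuzzy = (min_len - levenshtein) / min_len, e.g. (6 - 3) / 6 = 0.5 for "kitten" vs "sitting".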
def prepare_features(df):
    df = df.withColumn(
        'exclam',
        length('review_body') - length(regexp_replace('review_body', '\!', '')))
    df = df.withColumn('age', datediff(current_date(), to_date(df['review_date'])))
    df = df.withColumn('review_length', length(df['review_body']))
    df = df.withColumn('helfulness', df['helpful_votes'] / df['total_votes'])
    df = df.withColumn('label', expr("CAST(verified_purchase='Y' As INT)"))
    select_cols = df.select(
        ['star_rating', 'helfulness', 'age', 'review_length', 'label']).na.fill(0)
    return select_cols
def _preprocess(self):
    input_cols = [
        'armed', 'city', 'manner_of_death', 'flee', 'gender', 'state',
        'threat_level', 'body_camera', 'signs_of_mental_illness'
    ]
    # self.shootings_df = self.shootings_df.select([c for c in self.shootings_df.columns if c in input_cols])
    # self.shootings_df.show(n=10)
    self.shootings_df = self.shootings_df.filter(
        (F.length(F.col('armed')) > 0) & (F.length(F.col('city')) > 0) &
        (F.length(F.col('manner_of_death')) > 0) & (F.length(F.col('race')) > 0) &
        (F.length(F.col('flee')) > 0) & (F.length(F.col('gender')) > 0) &
        (F.length(F.col('state')) > 0) & (F.length(F.col('threat_level')) > 0) &
        (F.length(F.col('body_camera')) > 0) & (F.length(F.col('signs_of_mental_illness')) > 0))
    indexers = [
        StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c))
        for c in input_cols
    ]
    # One-hot encode the indexed values of the multiple columns
    encoders = [
        OneHotEncoder(dropLast=False,
                      inputCol=indexer.getOutputCol(),
                      outputCol="{0}_enc".format(indexer.getOutputCol()))
        for indexer in indexers
    ]
    # Vectorize the encoded values
    assembler = VectorAssembler(
        inputCols=[encoder.getOutputCol() for encoder in encoders],
        outputCol="features")
    pipeline = Pipeline(stages=indexers + encoders + [assembler])
    model = pipeline.fit(self.shootings_df)
    self.shootings_df = model.transform(self.shootings_df)
    self.shootings_df = self.shootings_df.withColumn(
        'label', udf_parse_race('race').cast('int'))
    self.shootings_df = self.shootings_df.select('features', 'race', 'label')
    self.shootings_df.persist().count()
    return self.shootings_df
def visualize_tweet_data(twitter_data):
    from pyspark.sql.functions import avg, col, length
    from pyspark.sql.functions import lower, split
    import itertools
    import collections
    import seaborn as sns
    import nltk  # needed below for nltk.download('stopwords')
    from nltk.corpus import stopwords
    import pandas as pd
    from wordcloud import WordCloud

    sns.set(font_scale=1.5)
    sns.set_style("whitegrid")

    # Number of different tweets
    num_of_diftweet = len(twitter_data.groupBy("tweet").count().collect())
    print("Number of different tweets are: " + str(num_of_diftweet))

    # Average length of the tweet text
    text_length = twitter_data.withColumn("length", length(twitter_data["tweet"]))
    avg_length = round(text_length.select(avg("length")).collect()[0][0], 2)
    print("Average length of text is: " + str(avg_length))

    # Lower-case every tweet and split it into words
    words_in_tweet = twitter_data.select(split(lower(col("tweet")), " ")).collect()
    # This is a list containing one word list per tweet.
    """This list will be used for applying stopword and collection-word filtering"""
    word_list = []
    for each_tweet in words_in_tweet:
        for word in each_tweet:
            word_list.append(word)

    # Eliminate stopwords to remove the most common words
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))
    tweets_nsw = [[word for word in tweet_words if not word in stop_words]
                  for tweet_words in word_list]

    # Eliminate collection words
    collection_words = [
        'gucci', 'polo', 'chanel', 'burberry', 'prada', 'versace', 'fendi',
        'hermes', 'new', 'loving', 'never', 'check', 'share', 'someone',
        'fashion', 'got', 'played'
    ]
    tweets_nsw_nc = [[w for w in word if not w in collection_words]
                     for word in tweets_nsw]

    # Create a flat list of words after cleaning out all common words
    all_words_nsw_nc = list(itertools.chain(*tweets_nsw_nc))
    counts_nsw_nc = collections.Counter(all_words_nsw_nc)
    input_data = counts_nsw_nc

    # Plot word counts
    word_count_visualization(input_data)
    # Word cloud
    create_wordcloud(input_data)
def usuarios_features(df, categoria=-1.0):
    logger.info("Calculando features para usuarios...")
    resultado = (df.select(
        df["user.id"].alias("user_id"),
        nullToInt("user.profile_use_background_image").alias("con_imagen_fondo"),
        u_parse_time("user.created_at").cast('timestamp').alias("cuenta_creada"),
        df["user.favourites_count"].alias("n_favoritos"),
        nullToInt("user.description").alias("con_descripcion"),
        F.length("user.description").alias("longitud_descripcion"),
        nullToInt("user.verified").alias("con_perfil_verificado"),
        nullToInt("user.default_profile_image").alias("con_imagen_default"),
        df["user.listed_count"].alias("n_listas"),
        nullToInt("user.geo_enabled").alias("con_geo_activo"),
        reputacion("user.followers_count", "user.friends_count").alias("reputacion"),
        df["user.statuses_count"].alias("n_tweets"),
        followersRatio("user.followers_count", "user.friends_count").alias("followers_ratio"),
        df["user.screen_name"].alias("nombre_usuario"),
        entropia("lista_intertweet").alias("entropia"))
        .withColumn("ano_registro", F.year("cuenta_creada"))
        .withColumn("categoria", F.lit(categoria))
        .withColumn("createdAt", F.current_timestamp()))
    return resultado
def preparar_df(df):
    # Assign the repartition result (the original call discarded it and was a no-op).
    df = df.repartition(df.user.id)
    df = df.where(F.length(df.text) > 0)
    df = df.select(
        "*",
        u_parse_time(df['created_at']).cast('timestamp').alias('created_at_ts'))
    df_intertweet = df.select(
        df.user.id.alias("user_id"),
        (df.created_at_ts.cast('bigint') -
         F.lag(df.created_at_ts.cast('bigint'), 1).over(
             Window.partitionBy("user.id").orderBy("created_at_ts"))
         ).cast("bigint").alias("time_intertweet"))
    df_list_intertweet = df_intertweet.groupby(df_intertweet.user_id).agg(
        F.collect_list("time_intertweet").alias("lista_intertweet"))
    df_list_intertweet = df_list_intertweet.filter(
        F.size(df_list_intertweet.lista_intertweet) > 3)
    df = df.join(df_list_intertweet,
                 df["user.id"] == df_list_intertweet["user_id"])
    return df
def crimes_top(self, df=None, n=20, img_out=None, csv_out=None, cache=False):
    nypd_df = self.nypd_df
    if df:
        nypd_df = df
    if cache:
        nypd_df = nypd_df.persist()
    # data cleaning:
    # filter rows without an OFFENSE_DESCRIPTION
    df = nypd_df.filter(F.length(F.col(c.OFFENSE_DESCRIPTION)) > 0)
    # crime types
    crime_type_groups = df.groupBy(c.OFFENSE_DESCRIPTION).count()
    crime_type_counts = crime_type_groups.orderBy('count', ascending=False)
    # select the top N most frequent crimes and plot the distribution
    counts_crime_pddf = crime_type_counts.toPandas()
    counts_crime_pddf_top_N = counts_crime_pddf[:n]
    print(counts_crime_pddf_top_N)
    if img_out:
        plt.figure(figsize=(12, 8))
        counts_crime_pddf_top_N.plot.barh(x=c.OFFENSE_DESCRIPTION, y='count')
        plt.savefig(img_out)
    if csv_out:
        self._save_csv(counts_crime_pddf_top_N, csv_out)
    return crime_type_counts
def cross_age_race(self, df=None, img_out=None, csv_out=None, cache=False):
    nypd_df = self.nypd_df
    if df:
        nypd_df = df
    if cache:
        nypd_df = nypd_df.persist()
    age_groups = ['<18', '18-24', '25-44', '45-64', '65+']
    nypd_df = nypd_df.select(c.AGE, c.RACE)
    nypd_df = nypd_df.filter((F.length(F.col(c.AGE)) > 0) & (F.col(c.AGE) != 'false'))
    nypd_df = nypd_df.where(F.col(c.AGE).isin(age_groups))
    data3 = self_toPandas(nypd_df, 4)
    df = pd.crosstab(data3.SUSP_RACE, data3.SUSP_AGE_GROUP)
    if img_out:
        plt.figure()
        color = plt.cm.gist_rainbow(np.linspace(0, 1, 10))
        df.div(df.sum(1).astype(float), axis=0).plot.bar(stacked=True, color=color, figsize=(18, 12))
        plt.title('age vs race', fontweight=30, fontsize=20)
        plt.xticks(rotation=90)
        plt.savefig(img_out)
    if csv_out:
        self._save_csv(df, csv_out)
    return nypd_df
def test_regex(self, input_df):
    filter_expression = """attributes.last_name rlike "^.{7}$" """
    transformer = Sieve(filter_expression=filter_expression)
    transformed_df = transformer.transform(input_df)
    assert transformed_df.count() < input_df.count()
    assert transformed_df.count() == input_df.where(
        F.length(input_df.attributes.last_name) == 7).count()
def test15(spark):
    """
    This demonstrates reading JSON events from Pravega.
    It uses chunked encoding to support events of 2 GiB.
    """
    # ssrc is the synchronization source identifier. See https://en.wikipedia.org/wiki/Real-time_Transport_Protocol.
    # It should be selected at random by each process that writes records.
    schema = 'timestamp timestamp, frame_number int, camera int, ssrc int, data binary'
    controller = os.getenv('PRAVEGA_CONTROLLER', 'tcp://127.0.0.1:9090')
    scope = os.getenv('PRAVEGA_SCOPE', 'examples')
    df = (spark.readStream.format("pravega")
          .option("controller", controller)
          .option("scope", scope)
          .option("stream", "video")
          .option("encoding", "chunked_v1")
          .load())
    df = df.withColumnRenamed('event', 'raw_event')
    df = df.select('*', decode('raw_event', 'UTF-8').alias('event_string'))
    df = df.select('*', from_json('event_string', schema=schema,
                                  options=dict(mode='FAILFAST')).alias('event'))
    df = df.select('*', 'event.*')
    df = df.select('*', length('data'))
    df = df.drop('raw_event', 'event_string', 'event', 'data')
    df = df.withWatermark('timestamp', '60 second')
    df.printSchema()
    if True:
        (df.writeStream
         .trigger(processingTime='3 seconds')  # limit trigger rate
         .outputMode('append')
         .format('console')
         .option('truncate', 'false')
         .start()
         .awaitTermination())
def _get_base_cols(row: StructExpression) -> List[Column]:
    assert check_argument_types()
    contig_name_col = fx.col("`locus.contig`").alias("contigName")
    start_col = (fx.col("`locus.position`") - 1).cast("long").alias("start")
    end_col = start_col + fx.length(fx.element_at("alleles", 1))
    has_info = 'info' in row and isinstance(row.info.dtype, tstruct)
    if has_info and 'END' in row.info and row.info.END.dtype == tint:
        end_col = fx.coalesce(fx.col("`info.END`"), end_col)
    end_col = end_col.cast("long").alias("end")
    names_elems = []
    if 'varid' in row and row.varid.dtype == tstr:
        names_elems.append("varid")
    if 'rsid' in row and row.rsid.dtype == tstr:
        names_elems.append("rsid")
    names_col = fx.expr(
        f"nullif(filter(array({','.join(names_elems)}), n -> isnotnull(n)), array())").alias("names")
    reference_allele_col = fx.element_at("alleles", 1).alias("referenceAllele")
    alternate_alleles_col = fx.expr("slice(alleles, 2, size(alleles) - 1)").alias("alternateAlleles")
    base_cols = [
        contig_name_col, start_col, end_col, names_col, reference_allele_col,
        alternate_alleles_col
    ]
    assert check_return_type(base_cols)
    return base_cols
def create_values(cols):
    values = []
    for col in cols:
        if col.is_lookup == 1:
            values.append(
                f.when(
                    f.col(col.demographic_key).isNull(),
                    f.concat_ws('_', f.lit(col.demographic_key), f.lit('9999')))
                .when(
                    f.trim(f.col(col.demographic_key)) == '',
                    f.concat_ws('_', f.lit(col.demographic_key), f.lit('9999')))
                .when(
                    f.length(
                        f.regexp_extract(
                            f.col(col.demographic_key).astype('string'),
                            '(\d+)', 1)) > 0,
                    f.concat_ws(
                        '_', f.lit(col.demographic_key),
                        f.col(col.demographic_key).astype('int').astype('string')))
                .otherwise(
                    f.concat_ws('_', f.lit(col.demographic_key),
                                f.col(col.demographic_key))))
        else:
            values.append(f.col(col.demographic_key))
    return values
def lyrSizePandasMixed(count):
    """ using pandas udf """
    df = spark.createDataFrame(sc.range(count, 0, -1), schema=T.IntegerType())
    df = df.withColumn(
        "lyr",
        F.pandas_udf(lineList, T.StringType())(F.col("value"))).select("lyr")
    return df.withColumn("lyrc", F.length(F.col("lyr"))).select(
        F.sum(F.col("lyrc")).alias("c")).first()["c"]
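# Self-contained sketch of the same pandas UDF pattern (not from the original source).
# It assumes Spark 3.x with pandas/pyarrow installed and an active SparkSession named `spark`;
# `repeat_line` is a hypothetical stand-in for the external `lineList` function used above.
import pandas as pd
import pyspark.sql.functions as F
import pyspark.sql.types as T

@F.pandas_udf(T.StringType())
def repeat_line(v: pd.Series) -> pd.Series:
    # Build one line of text per input integer, repeating "la " that many times.
    return v.apply(lambda n: "la " * int(n))

demo = spark.range(1, 4).withColumn("lyr", repeat_line(F.col("id")))
demo.select(F.sum(F.length("lyr")).alias("total_chars")).show()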
def test2(spark):
    """
    """
    schema = 'timestamp timestamp, frame_number int, camera int, ssrc int, data binary'
    # To allow for large images and avoid out-of-memory, the JVM will
    # send to the Python UDF this batch size.
    spark.conf.set('spark.sql.execution.arrow.maxRecordsPerBatch', '1')
    controller = os.getenv('PRAVEGA_CONTROLLER', 'tcp://127.0.0.1:9090')
    scope = os.getenv('PRAVEGA_SCOPE', 'examples')
    df = (spark.readStream.format("pravega")
          .option("controller", controller)
          .option("scope", scope)
          .option("stream", "video")
          .option("encoding", "chunked_v1")
          .load())
    df = df.withColumnRenamed('event', 'raw_event')
    df = df.select('*', decode('raw_event', 'UTF-8').alias('event_string'))
    df = df.select('*', from_json('event_string', schema=schema).alias('event'))
    df = df.select('*', 'event.*')
    df = df.select('*', length('data'))
    df = df.withWatermark('timestamp', '60 second')

    def f(batch_df, batch_id):
        print('batch_id=%d' % batch_id)
        png0 = batch_df.select('data').limit(1).collect()[0][0]
        print('png0=%s' % png0[0:20])
        # IPython.display.clear_output(wait=True)
        # IPython.display.display(IPython.display.Image(data=png0))

    (df.writeStream
     .trigger(processingTime='3 seconds')  # limit trigger rate
     .foreachBatch(f)
     .start()
     .awaitTermination())
def get_suggested_dict(df):
    '''
    :param df: data frame
    :return: dictionary of suggested types in Postgres
    '''
    # ArrayType, BinaryType are not handled yet
    suggested = {}
    for f in df.schema.fields:
        if isinstance(f.dataType, DateType):
            suggested[f.name] = 'date'
        elif isinstance(f.dataType, StringType):
            df = df.withColumn('length', F.length(F.col(f.name)))
            x = df.agg(F.max(df.length)).collect()[0][0]
            # 20% extra length based on the longest string
            suggested[f.name] = 'varchar({})'.format(int(x * 1.2))
        elif isinstance(f.dataType, DoubleType) or isinstance(f.dataType, DecimalType):
            suggested[f.name] = 'numeric(18,2)'
        elif isinstance(f.dataType, LongType):
            suggested[f.name] = 'int8'
        elif isinstance(f.dataType, FloatType):
            suggested[f.name] = 'float8'
        elif isinstance(f.dataType, ShortType):
            suggested[f.name] = 'integer'
        elif isinstance(f.dataType, BooleanType):
            suggested[f.name] = 'Bool'
        elif isinstance(f.dataType, TimestampType):
            suggested[f.name] = 'timestamptz'
    return suggested
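# Hedged usage sketch for get_suggested_dict (not from the original source); it assumes an
# active SparkSession named `spark` and that F plus the pyspark.sql type classes used above
# are imported. The sample data is made up.
sample = spark.createDataFrame(
    [("alice", 30, True), ("bob", 12345, False)],
    ["name", "score", "active"])
print(get_suggested_dict(sample))
# e.g. {'name': 'varchar(6)', 'score': 'int8', 'active': 'Bool'}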
def main():
    spark = SparkSession.builder.appName('nlp').getOrCreate()
    data = spark.read.csv("./data/smsspamcollection/SMSSpamCollection",
                          inferSchema=True, sep='\t')
    data = data.withColumnRenamed('_c0', 'class').withColumnRenamed('_c1', 'text')
    data.show()
    data = data.withColumn('length', length(data['text']))
    data.show()
    data.groupby('class').mean().show()
    tokenizer = Tokenizer(inputCol="text", outputCol="token_text")
    stopremove = StopWordsRemover(inputCol='token_text', outputCol='stop_tokens')
    count_vec = CountVectorizer(inputCol='stop_tokens', outputCol='c_vec')
    idf = IDF(inputCol="c_vec", outputCol="tf_idf")
    ham_spam_to_num = StringIndexer(inputCol='class', outputCol='label')
    clean_up = VectorAssembler(inputCols=['tf_idf', 'length'], outputCol='features')
    nb = NaiveBayes()
    data_prep_pipe = Pipeline(stages=[ham_spam_to_num, tokenizer, stopremove,
                                      count_vec, idf, clean_up])
    cleaner = data_prep_pipe.fit(data)
    clean_data = cleaner.transform(data)
    clean_data = clean_data.select(['label', 'features'])
    clean_data.show()
    (training, testing) = clean_data.randomSplit([0.7, 0.3])
    spam_predictor = nb.fit(training)
    data.printSchema()
    test_results = spam_predictor.transform(testing)
    test_results.show()
    acc_eval = MulticlassClassificationEvaluator()
    acc = acc_eval.evaluate(test_results)
    print("Accuracy of model at predicting spam was: {}".format(acc))
def crimes_ages(self, df=None, img_out=None, csv_out=None, cache=False):
    nypd_df = self.nypd_df
    if df:
        nypd_df = df
    if cache:
        nypd_df = nypd_df.persist()
    nypd_df = nypd_df.filter(F.length(F.col(c.AGE)) > 0)
    crime_age_groups = nypd_df.groupBy(c.AGE).count()
    crime_age_counts = crime_age_groups.orderBy('count', ascending=False)
    pddf = crime_age_counts.toPandas()
    pddf.set_index(c.AGE, inplace=True)
    if img_out:
        plt.figure()
        pddf.plot.pie(y='count')
        plt.savefig(img_out)
    if csv_out:
        self._save_csv(pddf, csv_out)
    return crime_age_counts
def parse_worker_data(spark: SparkSession, input_path: str) -> DataFrame:
    """
    Parse the H-1B worker data to the appropriate schema.
    :param spark: the SparkSession object
    :param input_path: location of the data
    :return: A Spark dataframe
    """
    csv, to_filter = "h1b_kaggle.csv", [
        "CASE_STATUS", "EMPLOYER_NAME", "YEAR", "WORKSITE"
    ]
    df1 = spark.read.csv(input_path + "legal_immigrant_data/{}".format(csv), header=True) \
        .selectExpr(*_lower_case_headers(to_filter)) \
        .dropDuplicates() \
        .withColumn("visa_class", F.lit("H-1B"))
    df1 = df1.withColumn('split', F.split(df1['worksite'], ',')) \
        .withColumn("worksite_city", F.col('split')[0]) \
        .withColumn("worksite_state", F.col('split')[1]) \
        .drop("split", "worksite")
    df1 = df1.withColumn('worksite_state', _abbreviate_state(df1.worksite_state))

    csv = "H-1B_Disclosure_Data_FY17.csv"
    to_filter = [
        'CASE_STATUS', 'VISA_CLASS', 'EMPLOYMENT_START_DATE',
        'EMPLOYMENT_END_DATE', 'EMPLOYER_NAME', 'EMPLOYER_CITY',
        'EMPLOYER_STATE', 'WORKSITE_CITY', 'WORKSITE_STATE'
    ]
    df2 = spark.read.csv(input_path + "legal_immigrant_data/{}".format(csv), header=True) \
        .selectExpr(*_lower_case_headers(to_filter)) \
        .dropDuplicates()
    states = {
        'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 'HI', 'ID',
        'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS',
        'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK',
        'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV',
        'WI', 'WY'
    }
    valid_row_alignment = lambda x: (F.length(x) == 2) & (x.isin(states))
    # Mini data quality check on row alignment
    df2 = df2.filter(valid_row_alignment(df2.worksite_state))
    for d in ['start_date', 'end_date']:
        prefix = 'arrival' if d == 'start_date' else 'expiry'
        column = 'employment_start_date' if d == 'start_date' else 'employment_end_date'
        df2 = df2.withColumn(d, F.to_date(column)) \
            .withColumn('{}_year'.format(prefix), F.year(d)) \
            .withColumn('{}_month'.format(prefix), F.month(d)) \
            .withColumn('{}_day'.format(prefix), F.dayofmonth(d)) \
            .withColumn('{}_weekday'.format(prefix), F.date_format(d, 'E')) \
            .drop(d, column)
    new_df = _fill_missing_columns(df1, df2)
    new_df = new_df.union(df2).dropDuplicates().withColumn(
        'id', F.monotonically_increasing_id())
    new_df = new_df.withColumnRenamed('visa_class', 'visa_type')
    for column in [
            'case_status', 'employer_name', 'worksite_city',
            'arrival_weekday', 'expiry_weekday'
    ]:
        new_df = _clean_string_column(new_df, column)
    schema = get_schema('worker')
    worker_df = make_empty_df(spark, schema).union(new_df.select(list(schema)))
    return worker_df
def pipeline(df):
    print(df.head())
    df = df.withColumn("length", length(df['Speech']))
    # Create the data processing pipeline functions here (note: StringIndexer will be used to encode
    # your target variable column. This column should be named 'label' so our model will recognize it later)
    review_data = Tokenizer(inputCol="Speech", outputCol="Words")
    reviewed = review_data.transform(df)
    # reviewed.show()
    remover = StopWordsRemover(inputCol="Words", outputCol="filtered")
    newFrame = remover.transform(reviewed)
    # newFrame.show()
    hashing = HashingTF(inputCol="filtered", outputCol="hashedValues",
                        numFeatures=pow(2, 10))
    # Transform in a DF
    hashed_df = hashing.transform(newFrame)
    hashed_df.show(truncate=False)
    idf = IDF(inputCol="hashedValues", outputCol="feature")
    idfModel = idf.fit(hashed_df)
    rescaledData = idfModel.transform(hashed_df)
    rescaledData.select("words", "feature").show(truncate=False)
    # indexer = StringIndexer(inputCol="Party_Affliation", outputCol="label")
    # indexed = indexer.fit(rescaledData).transform(rescaledData)
    assembler = VectorAssembler(inputCols=["feature", "length"], outputCol="features")
    return assembler.transform(rescaledData)
def deaths_states_topN(self, n=5, df=None, img_out=None, csv_out=None, cache=False):
    pdde_df = self.pdde_df
    if df:
        pdde_df = df
    if cache:
        pdde_df = pdde_df.persist()
    # data cleaning:
    # filter rows without a state
    df = pdde_df.filter(F.length(F.col(c.STATE)) > 0)
    # states
    deaths_states = df.groupBy(c.STATE).count()
    deaths_states_counts = deaths_states.orderBy('count', ascending=False)
    # select the top N most frequent death causes and plot the distribution
    counts_states_ppdf = deaths_states_counts.toPandas()
    counts_deaths_pddf_top_N = counts_states_ppdf[:n]
    print(counts_deaths_pddf_top_N)
    if csv_out:
        self._save_csv(counts_deaths_pddf_top_N, csv_out)
    if img_out:
        counts_deaths_pddf_top_N.plot.barh(x=c.STATE, y='count')
        plt.xlabel('States with more deaths in Police')
        plt.ylabel('Count')
        plt.savefig(img_out)
    return counts_deaths_pddf_top_N
def postJsonHandler():
    # print(request.is_json)
    # Check JSON data
    if not request.is_json:
        return ErrorMSG
    content = request.get_json()
    text = content['Text']
    if text is None:
        return ErrorMSG
    df = spark.createDataFrame([(0, text)], ["label", "Summary"])
    data = df.withColumn('length', length(df['Summary']))
    # print(df['Summary'])
    # print(text)
    try:
        model = PipelineModel.load('model')
        # Make predictions on test documents and print columns of interest.
        prediction = model.transform(data)
        selected = prediction.select("label", "Summary", "probability", "prediction")
        myJSON = {}
        for row in selected.collect():
            label, text, prob, prediction = row
            print("(%d, %s) --> prob=%s, prediction=%f" %
                  (label, text, str(prob), prediction))
            myJSON['text'] = text
            myJSON['prediction'] = prediction
        return jsonify(myJSON)
    except Exception as ex:
        print(ex)
    return 'JSON posted'
def statistic_job_jd(df):
    """
    Analyze the job description (JD) of each position.
    :param df:
    :return:
    """
    df = df.filter(df.job_desc.isNotNull()).filter(
        F.length(df.job_desc) > 60).filter(F.length(df.job_desc) < 2000)
    # Deduplicate
    df = df.dropDuplicates(subset=['position_name', "job_desc"])
    # df = df.withColumn("job_desc", F.udf(lambda x: emoji_pattern.sub(r'', x))(df.job_desc))
    # df = df.withColumn("job_desc", F.udf(lambda x: emoji_p.sub(r'', x))(df.job_desc))
    df = df.withColumn("job_desc", F.udf(filter_emoji)(df.job_desc))
    df = df.select("position_name", "job_desc")
    return df
def crimes_severity(self, df=None, img_out=None, csv_out=None, cache=False):
    nypd_df = self.nypd_df
    if df:
        nypd_df = df
    if cache:
        nypd_df = nypd_df.persist()
    # analyze crime severity over the years
    nypd_df = nypd_df.filter(F.length(F.col(c.LEVEL_OFFENSE)) > 0)
    grouped_severity_df = nypd_df.groupby('yearpd', c.LEVEL_OFFENSE).count()
    severity_framing_pddf = grouped_severity_df.toPandas()
    grouped_severity_df_pddf = severity_framing_pddf.groupby(by=['yearpd', c.LEVEL_OFFENSE]).sum()
    print(grouped_severity_df_pddf)
    if img_out:
        plt.figure()
        grouped_severity_df_pddf['count'].unstack().plot.bar()
        plt.xticks(rotation=0)
        plt.ylabel('Counts')
        plt.xlabel('Crimes severity year-on-year')
        plt.savefig(img_out)
    if csv_out:
        self._save_csv(grouped_severity_df_pddf, csv_out)
    return grouped_severity_df
def generateRxInfo(self, dataFrame):
    logger.info('in generateRxInfo')
    base_uri = 'http://{}:{}/REST/rxcui/'.format(
        Constants.POSTGRESQL_HOST_IP, Constants.RXNAV_PORT)

    def genRxInfo(RxNormId):
        d = {'BN': [], 'IN': [], 'tag': set()}
        if RxNormId != '':
            try:
                resp = requests.get(url=base_uri + RxNormId + "/allrelatedextension")
                root = ET.fromstring(resp.content)
                for conceptGroup in root.findall('./allRelatedGroup/conceptGroup'):
                    for tty in conceptGroup.findall('./tty'):
                        if tty.text == 'BN':
                            for brandname in conceptGroup.findall('./conceptProperties/name'):
                                d['BN'].append(brandname.text)
                        if tty.text == 'IN':
                            for Ingredientname in conceptGroup.findall('./conceptProperties/name'):
                                d['IN'].append(Ingredientname.text)
                    for humandrug in conceptGroup.findall('./conceptProperties/inferedhuman'):
                        if humandrug.text == 'US':
                            d['tag'].add('Human_Drug')
                    for animaldrug in conceptGroup.findall('./conceptProperties/inferedvet'):
                        if animaldrug.text == 'US':
                            d['tag'].add('Animal_Drug')
            except ET.ParseError as e:
                print("Invalid XML received from uri {}".format(e))
        return str(d['BN']) + '|' + str(d['IN']) + '|' + ','.join(d['tag'])

    udf = Functions.UserDefinedFunction(genRxInfo, StringType())
    localDf = dataFrame.groupBy(
        Functions.col("RxNormId").alias("RxNormId2")).agg(Functions.lit('1'))
    localDf = localDf.withColumn("RxInfo", udf(Functions.col("RxNormId2"))).drop('1')
    localDf = dataFrame.join(
        localDf, (dataFrame['RxNormId'] == localDf['RxNormId2'])).drop(localDf['RxNormId2'])
    split_col = Functions.split(localDf['RxInfo'], '\|')
    localDf = localDf.withColumn("Brand_Names", split_col.getItem(0)) \
        .withColumn("Ingredients", split_col.getItem(1)) \
        .withColumn("tag",
                    Functions.when(localDf.tag == '', split_col.getItem(2))
                    .otherwise(Functions.when(Functions.length(split_col.getItem(2)) == 0, localDf.tag)
                               .otherwise(Functions.concat(localDf.tag, Functions.lit(','),
                                                           split_col.getItem(2))))) \
        .drop('RxInfo')
    # self.container1.stop()
    # self.container2.stop()
    return localDf
def getEntitiesInTweets(date, data):
    temp = data.filter("timestamp = cast('" + date + "' as date)")
    temp = temp.select("entities")
    temp = temp.withColumn("entities", f.explode("entities"))
    temp = temp.filter(f.length(col("entities")) > 0)
    temp = temp.groupBy("entities").agg(count("entities").alias("count")) \
        .orderBy("count", ascending=False)
    # display(temp)
    return temp
def dataB_invoiced_currency(df: DataFrame) -> DataFrame:
    df = df.withColumn(
        'currency_code_len',
        F.length(F.regexp_replace(df.currency_code, "\\s+", "")))
    # Use isNull() instead of `== None`: comparing a column to None with == evaluates to
    # NULL in Spark SQL, so the original condition never matched missing currency codes.
    data_frame = df.withColumn(
        "invoiced_currency",
        F.when((df.currency_code_len == 0) | (df.currency_code.isNull()), F.lit('USD'))
        .otherwise(df.currency_code)).drop('currency_code_len')
    return data_frame
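# Hedged usage sketch (not part of the original source); it assumes an active SparkSession
# named `spark` and the snippet's F / DataFrame imports. The sample rows are made up.
sample = spark.createDataFrame([("EUR",), ("   ",), (None,)], ["currency_code"])
dataB_invoiced_currency(sample).show()
# Expected: "EUR" is kept, while blank and missing codes default to "USD".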
def getNameList(df, num, length):
    # Get the first num popular games
    # Sorted by Name string length
    df_pandas = df.where(f.length("Name") <= length).toPandas()[0:num]
    idx = df_pandas.Name.str.len().sort_values().index
    df_pandas = df_pandas.reindex(idx)
    print(df_pandas)
    return [row for row in df_pandas.Name]
def create_length_feature(self, dataframe, base_field, length_field):
    """Produces a PySpark dataframe containing a field representing the length
    of a specified string field.

    :param dataframe: the PySpark dataframe
    :param base_field: the string field for which length is to be calculated
    :param length_field: the name to give to the field that will contain the length of the base_field
    :returns: the PySpark dataframe containing the length field and all fields
        in the supplied dataframe.
    """
    return dataframe.withColumn(length_field, length(dataframe[base_field]))
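# Hedged usage sketch (not part of the original source): `fe` stands in for a hypothetical
# instance of the class this method belongs to, and `spark` for an active SparkSession.
demo_df = spark.createDataFrame([("hello",), ("spark",)], ["text"])
fe.create_length_feature(demo_df, "text", "text_length").show()
# Adds a "text_length" column holding 5 for both sample rows.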
Test.assertEquals(pluralDF.first()[0], 'cats', 'incorrect result: you need to add an s')
Test.assertEquals(pluralDF.columns, ['word'], "there should be one column named 'word'")

# COMMAND ----------

# MAGIC %md
# MAGIC ** (1c) Length of each word **
# MAGIC
# MAGIC Now use the SQL `length` function to find the number of characters in each word. The [`length` function](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.length) is found in the `pyspark.sql.functions` module.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import length
# pluralLengthsDF = pluralDF.select(length(pluralDF.word).alias('word'))
pluralLengthsDF = pluralDF.select(length('word'))  # .alias('word'))
pluralLengthsDF.show()

# COMMAND ----------

# TEST Length of each word (1e)
from collections import Iterable
asSelf = lambda v: map(lambda r: r[0] if isinstance(r, Iterable) and len(r) == 1 else r, v)
Test.assertEquals(set(asSelf(pluralLengthsDF.collect())), {4, 9, 4, 4, 4},
                  'incorrect values for pluralLengths')

# COMMAND ----------

# MAGIC %md
# MAGIC #### ** Part 2: Counting with Spark SQL and DataFrames **
# TEST Using DataFrame functions to add an 's' (1b)
Test.assertEquals(pluralDF.first()[0], 'cats', 'incorrect result: you need to add an s')
Test.assertEquals(pluralDF.columns, ['word'], "there should be one column named 'word'")

# COMMAND ----------

# MAGIC %md
# MAGIC ** (1c) Length of each word **
# MAGIC
# MAGIC Now use the SQL `length` function to find the number of characters in each word. The [`length` function](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.length) is found in the `pyspark.sql.functions` module.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import length
pluralLengthsDF = pluralDF.select(length(pluralDF.word).alias('length'))
pluralLengthsDF.show()

# COMMAND ----------

# TEST Length of each word (1e)
from collections import Iterable
asSelf = lambda v: map(lambda r: r[0] if isinstance(r, Iterable) and len(r) == 1 else r, v)
Test.assertEquals(asSelf(pluralLengthsDF.collect()), [4, 9, 4, 4, 4],
                  'incorrect values for pluralLengths')

# COMMAND ----------

# MAGIC %md
# MAGIC #### ** Part 2: Counting with Spark SQL and DataFrames **
# TEST Using DataFrame functions to add an 's' (1b)
Test.assertEquals(pluralDF.first()[0], 'cats', 'incorrect result: you need to add an s')
Test.assertEquals(pluralDF.columns, ['word'], "there should be one column named 'word'")

# COMMAND ----------

# MAGIC %md
# MAGIC ** (1c) Length of each word **
# MAGIC
# MAGIC Now use the SQL `length` function to find the number of characters in each word. The [`length` function](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.length) is found in the `pyspark.sql.functions` module.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import length
pluralLengthsDF = pluralDF.select(length(pluralDF.word))
pluralLengthsDF.show()

# COMMAND ----------

# TEST Length of each word (1e)
from collections import Iterable
asSelf = lambda v: map(lambda r: r[0] if isinstance(r, Iterable) and len(r) == 1 else r, v)
Test.assertEquals(asSelf(pluralLengthsDF.collect()), [4, 9, 4, 4, 4],
                  'incorrect values for pluralLengths')

# COMMAND ----------

# MAGIC %md
# MAGIC #### ** Part 2: Counting with Spark SQL and DataFrames **
schema = StructType([
    StructField("uniprot", StringType()),
    StructField("db", StringType()),
    StructField("extern", StringType())
])

df = (
    sqlContext
    .read
    .format("com.databricks.spark.csv")
    .schema(schema)
    .option("header", "false")
    .option("delimiter", "\t")
    .option("mode", "DROPMALFORMED")
    .load("hdfs:///user/hbase/idmapping.new.dat")
    # .load("hdfs:///user/hbase/idmapping.10000")
    .dropDuplicates(['extern'])
    .filter(length(col("extern")) > 4)
)

print(df.count())

# df.coalesce(1).write.format('com.databricks.spark.csv').options(delimiter="\t").save('/user/hbase/testall')
# df.repartition(1).coalesce(1).write.csv("/user/toniher/idmappingall.csv", header='true', sep='\t')
df.write.format('com.databricks.spark.csv').options(delimiter="\t").save('/user/hbase/idmappingall')

# df.printSchema()
sc.stop()
# TEST Using DataFrame functions to add an 's' (1b)
Test.assertEquals(pluralDF.first()[0], 'cats', 'incorrect result: you need to add an s')
Test.assertEquals(pluralDF.columns, ['word'], "there should be one column named 'word'")

# COMMAND ----------

# MAGIC %md
# MAGIC ** (1c) Length of each word **
# MAGIC
# MAGIC Now use the SQL `length` function to find the number of characters in each word. The [`length` function](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.length) is found in the `pyspark.sql.functions` module.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import length
pluralLengthsDF = pluralDF.select(length('word'))
pluralLengthsDF.show()

# COMMAND ----------

# TEST Length of each word (1e)
from collections import Iterable
asSelf = lambda v: map(lambda r: r[0] if isinstance(r, Iterable) and len(r) == 1 else r, v)
Test.assertEquals(asSelf(pluralLengthsDF.collect()), [4, 9, 4, 4, 4],
                  'incorrect values for pluralLengths')

# COMMAND ----------

# MAGIC %md
# MAGIC #### ** Part 2: Counting with Spark SQL and DataFrames **
    .map(lambda x: separate_matches(x)) \
    .flatMap(lambda x: x)

rdd_as_csv(match_keys_rdd, "matches_per_line.csv", "\t")


def matches(line):
    global pattern
    return "<M_SEP>".join(pattern.findall(line))


def time_matches(line):
    global time_pattern
    match = time_pattern.search(line)
    if match is None:
        return None
    return match.group()


udf_matches = udf(matches, StringType())
udf_timestamp = udf(time_matches, StringType())

data_frame = file_line_numbers.toDF(["line_number", "line_text"]) \
    .withColumn("matches", udf_matches("line_text")) \
    .filter(length("matches") > 0) \
    .withColumn("timestamp", udf_timestamp("line_text"))

# df = data_frame.drop("line_text")
# print df.show()
dataframe_as_csv(data_frame, "matches.csv", "%d\t%s\t%s\t%s\n",
                 ["line_number", "matches", "timestamp", "line_text"])
sys.exit(0)
# TEST Using DataFrame functions to add an 's' (1b)
Test.assertEquals(pluralDF.first()[0], 'cats', 'incorrect result: you need to add an s')
Test.assertEquals(pluralDF.columns, ['word'], "there should be one column named 'word'")

# COMMAND ----------

# MAGIC %md
# MAGIC ** (1c) Length of each word **
# MAGIC
# MAGIC Now use the SQL `length` function to find the number of characters in each word. The [`length` function](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.length) is found in the `pyspark.sql.functions` module.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import length
pluralLengthsDF = pluralDF.select(length('word').alias('length'))
pluralLengthsDF.show()

# COMMAND ----------

# TEST Length of each word (1e)
from collections import Iterable
asSelf = lambda v: map(lambda r: r[0] if isinstance(r, Iterable) and len(r) == 1 else r, v)
Test.assertEquals(asSelf(pluralLengthsDF.collect()), [4, 9, 4, 4, 4],
                  'incorrect values for pluralLengths')

# COMMAND ----------

# MAGIC %md
# MAGIC #### ** Part 2: Counting with Spark SQL and DataFrames **