def nzpolice_link(env, inputs, settings):
    def set_progress(percentage):
        if env['ui'] is not None:
            widget: OWWidget = env['ui']
            widget.progressBarSet(percentage)

    df = inputs['DataFrame']
    print('Start group offender...')
    # offender -> crimes
    offender_count_list = df.groupby('offender').count().collect()
    offender_count_dict = {
        row['offender']: row['count'] for row in offender_count_list
    }
    set_progress(5)

    @udf(returnType=IntegerType())
    def offender_count(offender):
        return offender_count_dict[offender]

    print('associate offence count...')
    df = df.withColumn('offender_offence_count', offender_count('offender'))
    set_progress(10)

    #####################################
    print('Start group reports by narrative...')
    groups_by_narrative = df.groupby('narrative_hash').agg(
        F.collect_list(struct(*df.columns)).alias('reports')).collect()
    narrative_primaryoffender = dict()
    for row in groups_by_narrative:
        primary_offender = max(
            row['reports'],
            key=lambda row: row['offender_offence_count'])['offender']
        narrative_primaryoffender[row['narrative_hash']] = primary_offender
    print('%d independent offence reports' % len(narrative_primaryoffender))

    @udf(returnType=StringType())
    def replace_offender_for_group_offence(narrative_hash):
        return narrative_primaryoffender[narrative_hash]

    df = df.withColumnRenamed('offender', 'origin_offender')
    df = df.withColumn('offender',
                       replace_offender_for_group_offence('narrative_hash'))
    df = df.drop('origin_offender')
    reports = df.collect()
    # reports = [max(row['reports'], key=lambda row: row['offender_offence_count'])
    #            for row in groups_by_narrative]
    groups = {}  # group_by_offender
    for report in reports:
        groups.setdefault(report['offender'], []).append(report)
    set_progress(15)

    print('Start statistics for selection...')
    NUM_GROUPS = len(groups)
    NUM_LINKED = 0
    for group in groups:
        length = len(groups[group])
        if length == 1:
            continue
        # number of linked pairs: C(length, 2) == length * (length - 1) / 2
        # (scipy.misc.comb was removed from scipy; scipy.special.comb is its home)
        NUM_LINKED += scipy.special.comb(length, 2, exact=True)
    NUM_TO_SELECT = int(math.ceil(
        NUM_LINKED / NUM_GROUPS)) * settings['select_ratio']
    print('%d groups, %d linked, %d unlinked with %d selected/report on average'
          % (NUM_GROUPS, NUM_LINKED, NUM_TO_SELECT * len(reports), NUM_TO_SELECT))
    set_progress(20)
    balancing_ratio = NUM_TO_SELECT * len(reports) / (
        NUM_LINKED + NUM_TO_SELECT * len(reports))

    print('Start links combination...')
    links = []
    for group in groups:
        group_weight = 1 / len(groups[group])
        internal_group_links = [
            t + (group_weight, 1) for t in combinations(groups[group], 2)
        ]
        external_group_links = []
        for report in groups[group]:
            random_groups = random.sample([g for g in groups if g != group],
                                          NUM_TO_SELECT)
            external_group_links += [(report, random.choice(groups[g]), 1.0, 0)
                                     for g in random_groups]
        links.extend(internal_group_links)
        links.extend(external_group_links)
    print('Links combination finished: %d' % len(links))
    set_progress(30)

    print('Start links with distance transformation...')
    linked_rows = []
    progress = 0
    for link in links:
        row1 = link[0]
        row2 = link[1]
        row = {
            feature: FEATURES_TO_USE[feature][2](row1[feature], row2[feature])
            for feature in FEATURES_TO_USE
            if FEATURES_TO_USE[feature][2] is not None
        }
        row['weight'] = link[2]
        row['class'] = link[3]
        linked_rows.append(Row(**row))
        progress += 1
        set_progress(30 + progress * 60 / len(links))

    fields = [
        StructField(feature, FEATURES_TO_USE[feature][3], True)
        for feature in FEATURES_TO_USE
        if FEATURES_TO_USE[feature][2] is not None
    ]
    fields.append(StructField('weight', DoubleType(), False))
    fields.append(StructField('class', IntegerType(), False))
    df = env['sqlContext'].createDataFrame(linked_rows,
                                           schema=StructType(fields))

    attributes = df.columns
    raw_df = _handle_missing(df)
    df = _vector_assembly(raw_df)
    df = _normalize(df)
    _write_arff(attributes, df)
    return {'DataFrame': df, 'RawDataFrame': raw_df}
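# Minimal sketches of some of the helper functions referenced above — these are
# assumptions for illustration, not the originals. They assume VectorAssembler
# and MinMaxScaler are imported from pyspark.ml.feature (as at the top of this
# module) and that every column except 'weight' and 'class' is a feature.
def _handle_missing(df):
    # assumption: treat missing feature values as 0.0
    return df.na.fill(0.0)


def _vector_assembly(df):
    feature_cols = [c for c in df.columns if c not in ('weight', 'class')]
    assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
    return assembler.transform(df)


def _normalize(df):
    scaler = MinMaxScaler(inputCol='features', outputCol='scaled_features')
    return scaler.fit(df).transform(df)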
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf


def match_func(predict, fraud_label):
    if (predict == 0) & (fraud_label == 1):
        return "no_match_predict_no_fraud"
    elif (predict == 1) & (fraud_label == 0):
        return "no_match_predict_fraud"
    else:
        return "match"


match_udf = udf(match_func, StringType())
# match_udf = udf(lambda prediction, fraud_label:
#                 "no_match" if prediction != fraud_label else "match",
#                 StringType())

# Arguments
import argparse

# Parse date of execution
parser = argparse.ArgumentParser()
parser.add_argument("--datev1", help="Execution Date")
args = parser.parse_args()
if args.datev1:
    processdate = args.datev1

# GENERAL PREPARATION SCRIPT
# Date in format YYYYMMDD
# train the model
model = ALS.train(dfRates.rdd, 20, 20)  # you could tune these numbers, but these are reasonable choices
print("trained ...")

# use this model to predict what the user would rate accommodations that she has not rated
allPredictions = None
for USER_ID in range(0, 100):
    dfUserRatings = dfRates.filter(
        dfRates.userId == USER_ID).rdd.map(lambda r: r.accoId).collect()
    rddPotential = dfAccos.rdd.filter(lambda x: x[0] not in dfUserRatings)
    pairsPotential = rddPotential.map(lambda x: (USER_ID, x[0]))
    predictions = model.predictAll(pairsPotential).map(
        lambda p: (str(p[0]), str(p[1]), float(p[2])))
    predictions = predictions.takeOrdered(5, key=lambda x: -x[2])  # top 5
    print("predicted for user={0}".format(USER_ID))
    if allPredictions is None:
        allPredictions = predictions
    else:
        allPredictions.extend(predictions)

# write them
schema = StructType([
    StructField("userId", StringType(), True),
    StructField("accoId", StringType(), True),
    StructField("prediction", FloatType(), True)
])
dfToSave = sqlContext.createDataFrame(allPredictions, schema)
dfToSave.write.jdbc(url=jdbcUrl, table='Recommendation', mode='overwrite')
# parse the datasets into row tuples
yellow_rows = yellow.mapPartitions(parse_yellow)
citi_rows = citi.mapPartitions(parse_citi)

# define dataframe schemas
yellow_schema = StructType([
    StructField('dropoff_time', TimestampType(), True),
    StructField('dropoff_lat', FloatType(), True),
    StructField('dropoff_lng', FloatType(), True)
])
citi_schema = StructType([
    StructField('station_id', IntegerType(), True),
    StructField('ride_id', StringType(), True),
    StructField('start_time', TimestampType(), True)
])

# instantiate the dataframes
yellow_df = sqlContext.createDataFrame(yellow_rows, yellow_schema)
citi_df = sqlContext.createDataFrame(citi_rows, citi_schema)


# filtering function to check if the taxi dropoff location is within 0.25 miles of the citibike station
def is_dropoff_close(lat, lng):
    # greenwich and 8th ave station
    station = (40.73901691, -74.00263761)
    # taxi dropoff location
    dropoff = (lat, lng)
    # the original body is truncated here; the rest is a sketch: haversine
    # great-circle distance, returning True when within 0.25 miles
    from math import radians, sin, cos, asin, sqrt
    lat1, lng1 = map(radians, station)
    lat2, lng2 = map(radians, dropoff)
    a = sin((lat2 - lat1) / 2) ** 2 + \
        cos(lat1) * cos(lat2) * sin((lng2 - lng1) / 2) ** 2
    distance_miles = 2 * 3958.8 * asin(sqrt(a))  # Earth radius in miles
    return distance_miles <= 0.25
def test_parse_schema():
    struct = parse_schema("struct<foo:int, bar:string>")
    assert struct == StructType(
        [StructField("foo", IntegerType()), StructField("bar", StringType())])
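# A minimal sketch of a parse_schema that would satisfy the test above — an
# assumption, not the project's actual implementation. It only handles flat,
# non-nested struct<...> strings with the two type names used in the test.
def parse_schema(schema_str):
    type_map = {"int": IntegerType(), "string": StringType()}
    inner = schema_str.strip()[len("struct<"):-1]  # drop "struct<" and ">"
    fields = []
    for part in inner.split(","):
        name, type_name = part.strip().split(":")
        fields.append(StructField(name, type_map[type_name]))
    return StructType(fields)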
def classify_spark(training, testing, target_domains, target_domains_dict):
    # Adjust
    target_domains_dict["_other"] = len(target_domains)
    target_domains.append("_other")  # append the label itself, not a one-element list, so idx2str returns a string
    feature_list = [c for c in training.columns if c.startswith("_")]
    assembler = VectorAssembler(inputCols=feature_list,
                                outputCol="features",
                                handleInvalid="skip")
    str2idx = udf(lambda s: float(target_domains_dict[s]), FloatType())
    idx2str = udf(lambda f: target_domains[int(f)], StringType())

    training = assembler.transform(training)
    testing = assembler.transform(testing)
    training = training.withColumn("label_idx", str2idx("label"))
    testing = testing.withColumn("label_idx", str2idx("label"))

    bins = np.zeros(len(target_domains))
    freqs = {
        row["label_idx"]: row["count"]
        for row in training.select("label_idx")
                           .groupBy("label_idx").count().collect()
    }
    for i in freqs:
        bins[int(i)] = freqs[i]
    class_weights = np.sum(bins) / (len(bins) * bins)
    idx2cw = udf(lambda f: float(class_weights[int(f)]), FloatType())
    training = training.withColumn("weight", idx2cw("label_idx"))

    # model = pyspark.ml.classification.DecisionTreeClassifier(
    #     labelCol="label_idx", featuresCol="features",
    #     predictionCol="prediction_idx")
    model = pyspark.ml.classification.LogisticRegression(
        labelCol="label_idx",
        weightCol="weight",
        featuresCol="features",
        predictionCol="prediction_idx")
    model_fit = model.fit(training)
    training_predictions = model_fit.transform(training)
    testing_predictions = model_fit.transform(testing)
    training_predictions = training_predictions.withColumn(
        "prediction", idx2str("prediction_idx"))
    testing_predictions = testing_predictions.withColumn(
        "prediction", idx2str("prediction_idx"))

    labels_training = training_predictions.select("label").toPandas().values
    labels_test = testing_predictions.select("label").toPandas().values
    pred_training = training_predictions.select("prediction").toPandas().values
    pred_test = testing_predictions.select("prediction").toPandas().values
    training_report = classification_report(labels_training, pred_training,
                                            output_dict=True)
    testing_report = classification_report(labels_test, pred_test,
                                           output_dict=True)
    return model_fit, training_report, testing_report
""" 读取CSV文件信息 """ # 创建sparkSession spark = SparkSession.builder.appName("chronic").master("local[*]").enableHiveSupport().getOrCreate() # 使用spark读取csv文件 manbing = spark.read.csv("manbin.csv", header=True, mode="DROPMALFORMED") """ 对所有主治功能的列进行合并, 把所有String类型的字段合成 一个总String字段 """ # 使用udf来注册方法,并且指定输入和输出类型 toUnionUDF = udf(union_col, StringType()) # 指定需要合并的列并使用withColumn方法和udf自定义定义方法来进行转换,并生成一个新的列 manbing = manbing.withColumn('d_func', toUnionUDF( manbing.d_func_1, manbing.d_func_2, manbing.d_func_3, manbing.d_func_4, manbing.d_func_5, manbing.d_func_6, manbing.d_func_7, manbing.d_func_8, manbing.d_func_9, manbing.d_func_10, manbing.d_func_11, manbing.d_func_12, manbing.d_func_13, manbing.d_func_14, manbing.d_func_15, manbing.d_func_16)) """ 对所有主治功能总字段进行分词 使用jieba分词的方法进行 """ start=time.time() # 使用udf来注册方法,并且指定输入和输出类型
def main(context):
    # TASK 1
    try:
        commentsDF = context.read.load('comments.parquet')
        submissionsDF = context.read.load('submissions.parquet')
        labeled_dataDF = context.read.load('label.parquet')
    except:
        commentsDF = sqlContext.read.json('comments-minimal.json.bz2')
        submissionsDF = sqlContext.read.json('submissions.json.bz2')
        labeled_dataDF = sqlContext.read.load('labeled_data.csv', format='csv',
                                              sep=',', header="true")
        commentsDF.write.parquet('comments.parquet')
        submissionsDF.write.parquet('submissions.parquet')
        labeled_dataDF.write.parquet('label.parquet')

    # TASK 2
    joined_data = commentsDF.join(
        labeled_dataDF, commentsDF.id == labeled_dataDF.Input_id,
        'inner').select(col('id'), col('body'), col('labeldjt'))

    # TASK 4,5
    ngrams_udf = udf(get_ngrams, ArrayType(StringType()))
    joined_col = joined_data.withColumn('ngrams',
                                        ngrams_udf(joined_data['body']))

    try:
        model = CountVectorizerModel.load('cv.model')
    except:
        # task 6A
        cv = CountVectorizer(inputCol='ngrams', outputCol="features",
                             binary=True)
        model = cv.fit(joined_col)
        vectors = model.transform(joined_col)
        # task 6B
        positive_udf = udf(lambda x: 1 if x == '1' else 0, IntegerType())
        negative_udf = udf(lambda x: 1 if x == '-1' else 0, IntegerType())
        vectors = vectors.withColumn('positive', positive_udf(col('labeldjt')))
        vectors = vectors.withColumn('negative', negative_udf(col('labeldjt')))
        pos = vectors.select(col('positive').alias('label'), col('features'))
        neg = vectors.select(col('negative').alias('label'), col('features'))
        pos.write.parquet('positive_ROC.parquet')
        neg.write.parquet('negative_ROC.parquet')
        model.save('cv.model')

    try:
        posModel = CrossValidatorModel.load('pos.model')
        negModel = CrossValidatorModel.load('neg.model')
    except:
        # Task 7
        # Initialize two logistic regression models.
        # Replace labelCol with the column containing the label, and
        # featuresCol with the column containing the features.
        poslr = LogisticRegression(labelCol="label", featuresCol="features",
                                   maxIter=10)
        neglr = LogisticRegression(labelCol="label", featuresCol="features",
                                   maxIter=10)
        # This is a binary classifier so we need an evaluator that knows how
        # to deal with binary classifiers.
        posEvaluator = BinaryClassificationEvaluator()
        negEvaluator = BinaryClassificationEvaluator()
        # There are a few parameters associated with logistic regression. We
        # do not know what they are a priori. We do a grid search to find the
        # best parameters. We can replace [1.0] with a list of values to try.
        # We will assume the parameter is 1.0. Grid search takes forever.
        posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
        negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()
        # We initialize a 5 fold cross-validation pipeline.
        posCrossval = CrossValidator(estimator=poslr,
                                     evaluator=posEvaluator,
                                     estimatorParamMaps=posParamGrid,
                                     numFolds=5)
        negCrossval = CrossValidator(estimator=neglr,
                                     evaluator=negEvaluator,
                                     estimatorParamMaps=negParamGrid,
                                     numFolds=5)
        # Although crossvalidation creates its own train/test sets for tuning,
        # we still need a labeled test set, because it is not accessible from
        # the crossvalidator (argh!)
        # Split the data 50/50
        posTrain, posTest = pos.randomSplit([0.5, 0.5])
        negTrain, negTest = neg.randomSplit([0.5, 0.5])
        # Train the models
        print("Training positive classifier...")
        posModel = posCrossval.fit(posTrain)
        # Once we train the models, we don't want to do it again. We can save
        # the models and load them again later.
        posModel.save("pos.model")
        print("Training negative classifier...")
        negModel = negCrossval.fit(negTrain)
        # Once we train the models, we don't want to do it again. We can save
        # the models and load them again later.
        negModel.save("neg.model")

    # Task 8,9
    try:
        finalDF = context.read.load('final.parquet')
    except:
        extract_id_udf = udf(lambda x: x[3:], StringType())
        comments = commentsDF.select(
            col('id').alias('comment_id'),
            extract_id_udf(col('link_id')).alias('link_id'),
            col('created_utc'), col('body'), col('author_flair_text'),
            col('score').alias('comment_score'))
        submissions = submissionsDF.select(
            col('id').alias('submission_id'), col('title'),
            col('score').alias('submission_score'))
        finalDF = comments.join(
            submissions, comments.link_id == submissions.submission_id,
            'inner')
        # sample 2% of the joined data
        finalDF = finalDF.sample(False, 0.02, None)
        pos_threshold_udf = udf(lambda x: 1 if x[1] > 0.2 else 0,
                                IntegerType())
        neg_threshold_udf = udf(lambda x: 1 if x[1] > 0.25 else 0,
                                IntegerType())
        finalDF = finalDF.filter(
            "body NOT LIKE '%/s%' and body NOT LIKE '>%'")
        finalDF = finalDF.withColumn('ngrams', ngrams_udf(finalDF['body']))
        finalDF = model.transform(finalDF)
        posResult = posModel.transform(finalDF)
        temp = posResult.withColumn(
            'pos', pos_threshold_udf(posResult['probability']))
        temp = temp.select(col('comment_id'), col('link_id'),
                           col('created_utc'), col('body'),
                           col('author_flair_text'), col('comment_score'),
                           col('submission_id'), col('title'),
                           col('submission_score'), col('ngrams'), col('pos'))
        temp = model.transform(temp)
        negResult = negModel.transform(temp)
        temp = negResult.withColumn(
            'neg', neg_threshold_udf(negResult['probability']))
        finalDF = temp.select(col('comment_id'), col('link_id'),
                              col('created_utc'), col('body'),
                              col('author_flair_text'), col('comment_score'),
                              col('submission_id'), col('title'),
                              col('submission_score'), col('ngrams'),
                              col('pos'), col('neg'))
        finalDF.write.parquet('final.parquet')

    # Task 10
    # percentage of positive and negative comments
    try:
        task1 = context.read.load('percentage_value.csv/*.csv', format='csv',
                                  sep=',', header="true")
    except:
        total_rows = finalDF.count()
        total_pos_comments = finalDF.filter(col('pos') == '1').count()
        total_neg_comments = finalDF.filter(col('neg') == '1').count()
        pos_percentage = total_pos_comments / total_rows
        neg_percentage = total_neg_comments / total_rows
        values = [{
            'Total Rows': total_rows,
            'Percentage of Positive Comments': pos_percentage,
            'Percentage of Negative Comments': neg_percentage
        }]
        task1 = sqlContext.createDataFrame(values)
        task1.repartition(1).write.format("com.databricks.spark.csv").option(
            "header", "true").save("percentage_value.csv")

    # percent over date
    try:
        task2 = context.read.load('time_data.csv/*.csv', format='csv',
                                  sep=',', header="true")
    except:
        task2 = finalDF.withColumn(
            'date', F.from_unixtime(col('created_utc')).cast(DateType()))
        task2 = task2.groupBy('date').agg(
            (F.sum('pos') / F.count('pos')).alias('Positive'),
            (F.sum('neg') / F.count('neg')).alias('Negative'))
        task2.repartition(1).write.format("com.databricks.spark.csv").option(
            "header", "true").save("time_data.csv")

    # percent over states
    try:
        task3 = context.read.load('state_data.csv/*.csv', format='csv',
                                  sep=',', header="true")
    except:
        state = sqlContext.createDataFrame(states, StringType())
        task3 = finalDF.groupBy('author_flair_text').agg(
            (F.sum('pos') / F.count('pos')).alias('Positive'),
            (F.sum('neg') / F.count('neg')).alias('Negative'))
        task3 = task3.join(state, task3.author_flair_text == state.value,
                           'inner').na.drop(subset=['value']).select(
                               col('author_flair_text').alias('state'),
                               col('Positive'), col('Negative'))
        task3.repartition(1).write.format("com.databricks.spark.csv").option(
            "header", "true").save("state_data.csv")

    # percent over submission score
    try:
        task4 = context.read.load('submission_score.csv/*.csv', format='csv',
                                  sep=',', header="true")
    except:
        task4 = finalDF.groupBy('submission_score').agg(
            (F.sum('pos') / F.count('pos')).alias('Positive'),
            (F.sum('neg') / F.count('neg')).alias('Negative'))
        task4.repartition(1).write.format("com.databricks.spark.csv").option(
            "header", "true").save("submission_score.csv")

    # percent over comment score
    try:
        task5 = context.read.load('comment_score.csv/*.csv', format='csv',
                                  sep=',', header="true")
    except:
        task5 = finalDF.groupBy('comment_score').agg(
            (F.sum('pos') / F.count('pos')).alias('Positive'),
            (F.sum('neg') / F.count('neg')).alias('Negative'))
        task5.repartition(1).write.format("com.databricks.spark.csv").option(
            "header", "true").save("comment_score.csv")

    # list top 10 stories of each sentiment
    try:
        top_positive = context.read.load('top_positive.csv/*.csv',
                                         format='csv', sep=',', header="true")
        top_negative = context.read.load('top_negative.csv/*.csv',
                                         format='csv', sep=',', header="true")
    except:
        top_positive = finalDF.groupBy('title').agg(
            (F.sum('pos') / F.count('pos')).alias('Percentage')).orderBy(
                F.desc('Percentage')).limit(10)
        top_negative = finalDF.groupBy('title').agg(
            (F.sum('neg') / F.count('neg')).alias('Percentage')).orderBy(
                F.desc('Percentage')).limit(10)
        top_positive.repartition(1).write.format(
            "com.databricks.spark.csv").option("header",
                                               "true").save("top_positive.csv")
        top_negative.repartition(1).write.format(
            "com.databricks.spark.csv").option("header",
                                               "true").save("top_negative.csv")
def run(gz_paths_cols: List[Tuple[str, str]], ref_set_cols, ref_set_vals):
    spark = SparkSession.builder.getOrCreate()
    sc = spark.sparkContext

    if not os.path.isdir('results_similarities'):
        os.makedirs('results_similarities')

    rand = get_rand_arg()
    if rand:
        random.shuffle(gz_paths_cols)
    # unzip
    gz_paths: List[str] = [gz_paths for gz_paths, _ in gz_paths_cols]

    to_sort = get_sort_arg()
    if to_sort:
        files_n_sizes = get_ds_file_sizes(gz_paths)
        sizes = [size for f, size in files_n_sizes]
        gz_paths_cols = list(zip(
            *sorted(list(zip(gz_paths_cols, sizes)),
                    key=lambda x: x[1])))[0]  # sort on sizes
        gz_paths: List[str] = [gz_paths for gz_paths, _ in gz_paths_cols]  # unzip
    cols: List[str] = [cols for _, cols in gz_paths_cols]

    cache_col_name = {}  # {col_name: semantic_type} | may help if cols are repeated across dfs
    cache_col_val = {}  # there are many values repeated in a column, so this helps

    # def _match_semantic_cols(col_name):
    #     if col_name not in cache_col_name:
    #         cache_col_name[col_name] = str(match_preprocess(col_name, ref_set_cols)[S_TYPE])
    #     return cache_col_name[col_name]

    def _match_semantic_vals(col_val, s_type_col):
        """
        stage 1: run the value matcher ('match_preprocess') on only the matched s_type_col.
        if the cutoff is not passed (avg distance from the column is too high):
            stage 2: use heuristics (from manually examining frequent data for each col
                (ref_set)) to limit the number of s_type_vals in ref_set_vals to compare to.
                E.g. null is automatically assigned the matched s_type_col.
                E.g. check for substrings: if 'com' is in the val, then check 'website'
                s_type_vals for similarity. 'co' is implicitly in 'com', so check
                business_name as well, etc. This is to minimize misclassifications.
                Place them in 'check' to later build another s_type_vals using only those s_types.
            stage 3: run 'match_preprocess' again on all s_types except the matched s_type_col,
                or only on the heuristic matches from stage 2 (if the heuristic check yielded
                results).
            stage 4: check whether the stage 3 result is significantly better than the stage 1
                result, by checking whether the avg_dist is some percentage ('IMPROVE_RATIO')
                better than it was. If not, assign the val to the matched s_type_col, as would
                happen if the value were null.
            stage 5 (doesn't work in spark): if the min_dist is less than some similarity
                cutoff 'MIN_DIST' (meaning it is sufficiently small) and larger than some
                similarity cutoff 'IDENTICAL_CUTOFF' (meaning it isn't nearly identical to
                something already in the ref_set), add it to the ref_set. If initial matches
                are correct, later matches should be more accurate. The ref_set tops out at
                some sufficient size to prevent slowdown and redundant matching.
        all {col_val: s_type} combinations are cached so that identical column values aren't
        recomputed, and so that spark can assign each to the dataframe by using a udf after
        they are computed outside of Spark. The cache is cleared after each dataset.
        """
        col_val = str(col_val)
        s_type_col = str(s_type_col)
        # print(col_val, s_type_col, {s_type_col: [ref_set_vals[s_type_col]]})
        if col_val not in cache_col_val:
            AVG_CUTOFF = 0.9  # similarity measure worse than this triggers a second, more general run
            MIN_CUTOFF = 0.65
            IDENTICAL_CUTOFF = 0.10
            IMPROVE_RATIO = 0.5  # second run improved by some percent
            str_col_val = str(col_val).lower()
            # print(str_col_val)
            if str_col_val in ('null', '-', '0', 'none', '') or col_val is None:
                res_final = (s_type_col, col_val, 0.0, 0.0)  # default to s_type_col
            else:
                # compare to values of the matched (based on col_name) semantic type
                res0 = match_preprocess(col_val,
                                        {s_type_col: ref_set_vals[s_type_col]},
                                        match_jacc_avg)
                # print('res0:', res0)
                # res0[MIN_DIST] != 0.0
                if AVG_CUTOFF < res0[AVG_DIST] or res0 is None:
                    # was the cutoff passed, i.e. was the value present for this
                    # semantic type based on the col_name match?
                    # check only these semantic types based on the content of the
                    # col_val (more explicit rules after examining data)
                    check = []
                    if len(str_col_val) == 1 and str_col_val.isalpha():
                        possibles = ['person_name (middle_initial)', 'borough']
                        for pos_s_type in possibles:
                            if s_type_col == pos_s_type:  # which of these is the s_type of the col?
                                check.extend([pos_s_type])
                                break
                    if len(str_col_val) == 2 and str_col_val.isalpha():
                        check.extend(['color'])
                    if len(str_col_val) >= 3:  # can have numbers and other chars
                        if 'llc' in str_col_val or 'inc' in str_col_val or 'co' in str_col_val:
                            check.extend(['business_name'])
                        if 'http' in str_col_val or 'www' in str_col_val or 'org' in str_col_val or 'com' in str_col_val:
                            check.extend(['website'])
                    if len(str_col_val) == 5 and str_col_val.isdigit():
                        check.extend(['zip_code'])
                    if len(str_col_val) >= 6 and 'school' in str_col_val:
                        check.extend(['city_agency', 'street_number',
                                      'phone_number', 'building_classification'])
                    if len(str_col_val) >= 3 and str_col_val.isdigit():
                        check.extend(['city_agency', 'street_number',
                                      'phone_number', 'building_classification'])
                    if len(str_col_val) >= 1 and str_col_val.isdigit():
                        check.extend(['street_number'])
                    # if len(check) > 0:
                    #     print('check:', check)
                    check = list(set(check))
                    if len(check) == 0:  # compare to every semantic type but already checked
                        ref_set_diff = copy.deepcopy(ref_set_vals)  # clone
                        for key, val in ref_set_cols.items():  # compare to column names as well (for ms_core)
                            ref_set_diff[key].extend(copy.deepcopy(val))
                    else:  # compare to only those in check
                        ref_set_diff = {}
                        for s_type in check:
                            ref_set_diff[s_type] = copy.deepcopy(ref_set_vals[s_type])
                    ref_set_diff[s_type_col] = []  # prevent key error and delete all values for already matched
                    # find similarity with other semantic value types
                    res1 = match_preprocess(col_val, ref_set_diff, match_jacc_avg)
                    if res0 is None and res1 is None:
                        res_final = (s_type_col, col_val, 0.0, 0.0)
                    elif res0 is None:
                        res_final = res1
                    elif res1 is None:
                        res_final = res0
                    else:  # neither is None
                        res_final = min([res0, res1], key=lambda x: x[AVG_DIST])
                        # if AVG_CUTOFF < res_final[AVG_DIST]:  # still greater than cutoff and therefore unknown
                        if not (res_final[AVG_DIST] < res0[AVG_DIST] * (1 - IMPROVE_RATIO)):  # dist has not improved
                            res_final = _default(s_type_col, col_val)  # default to s_type_col
                            # ^ should the distance be non-0 to add to ref_set?
                else:
                    # print('FALSE')
                    res_final = res0  # cutoff passed, return initial result
            # not an exact match, and up to n different values stored
            if res_final[MIN_DIST] <= MIN_CUTOFF and res_final[MIN_DIST] >= IDENTICAL_CUTOFF \
                    and len(ref_set_vals[res_final[S_TYPE]]) < 100:
                ref_set_vals[res_final[S_TYPE]].append(col_val)  # append to ref_set
            cache_col_val[col_val] = str(res_final[S_TYPE])
        # print('res_final:', res_final)
        return cache_col_val[col_val]

    # match_semantic_cols = udf(_match_semantic_cols, StringType())
    match_semantic_vals = udf(_match_semantic_vals, StringType())

    master_dct = {}

    def _run(df, i):
        print("col_name:", cols[i])
        col = None
        match_col = match_preprocess(cols[i], {'foo': df.columns})  # match the col from ta name to ds cols name
        if match_col is not None:
            col = match_col[COL]
        else:  # shouldn't exec
            raise Exception(f'{cols[i]} not matched in {str(df.columns)}')
        df_cols = map_cols(df.select(col))  # filter single col
        # df_cols = df_cols.sample(0.5, seed=3).limit(500)  # TEST
        if col not in cache_col_name:  # currently unnecessary since cache_col_name is cleared after every ds
            cache_col_name[col] = match_preprocess(col, ref_set_cols)[S_TYPE]  # match col to s_type
        s_type_col = cache_col_name[col]
        print('s_type_col:', s_type_col)
        print('ref_set_vals[s_type_col]:', ref_set_vals[s_type_col])
        df_cols = df_cols.withColumn('s_type_col', lit(s_type_col))  # populate df with col s_type

        # if i < 35:  # run on small datasets (before it gets slow)
        s_types_all = []
        ### Python method: no spark, to add to ref_set_vals
        for row in df_cols.select('value', 's_type_col').collect():
            s_type_i = _match_semantic_vals(row['value'], row['s_type_col'])
            s_types_all.append(s_type_i)
        # get (s_type, count)
        s_types_distinct = sc.parallelize(s_types_all).countByValue().items()
        ###

        # the below udf call just pulls out the s_types from the cache
        df_cols = df_cols.withColumn(
            's_type_val',
            match_semantic_vals('value', 's_type_col'))  # match unknown col value to semantic type
        df_test = df_cols.groupby('s_type_col', 'value', 's_type_val').count()
        df_test = df_test.sort('count', ascending=False)
        df_test.filter('s_type_val != s_type_col').show(25)
        df_test.show(25)
        # results = [str(list(row.asDict().values())) + '\n' for row in df_test.collect()]
        # print(results[:10])
        # with open('results_similarities/test.txt', '+a') as f:
        #     for s in results:
        #         f.write(s)

        ds_dict = {
            'column_name': col,
            'semantic_types': []
        }
        for s_type, count in s_types_distinct:
            if s_type in LABEL_LIST_TA:
                ds_dict['semantic_types'].append({
                    'semantic_type': s_type,
                    'count': count
                })
            else:
                ds_dict['semantic_types'].append({
                    'semantic_type': 'other',
                    'label': s_type,
                    'count': count
                })
        if gz_paths[i] not in master_dct:
            master_dct[gz_paths[i]] = {}
        master_dct[gz_paths[i]].update({col: ds_dict})
        print('gz_paths[i]:', {gz_paths[i]: master_dct[gz_paths[i]]})
        with open("results_similarities/master_dct.json", "w") as json_file:
            json.dump(master_dct, json_file, indent=4)
        cache_col_name.clear()
        cache_col_val.clear()

    timed(_run, gz_paths)
spark.sparkContext.setLogLevel("WARN")
spark

# In[3]:

# paths to files
artistdata_path = 'AdvancedML_MusicRecommenderData2/artist_data.csv'
userartist_path = 'AdvancedML_MusicRecommenderData2/user_artist_data_train.csv'
test_path = 'AdvancedML_MusicRecommenderData2/LastFM_Test_Sample.csv'

# In[4]:

# Schemas for both files
artistdata_struct = StructType(
    [StructField('artistId', LongType()), StructField('name', StringType())])
userartist_struct = StructType([
    StructField('userId', LongType()),
    StructField('artistId', LongType()),
    StructField('song_count', LongType())
])

# In[5]:

# read artist_data file
artistdata_df = spark.read.csv(artistdata_path, sep='\t',
                               schema=artistdata_struct)
artistdata_df.cache()
# read user_artist_data file
class CCSparkJob:

    name = 'CCSparkJob'

    output_schema = StructType([
        StructField("key", StringType(), True),
        StructField("val", LongType(), True)
    ])

    warc_parse_http_header = True

    args = None
    records_processed = None
    warc_input_processed = None
    warc_input_failed = None
    log_level = 'INFO'
    logging.basicConfig(level=log_level, format=LOGGING_FORMAT)

    num_input_partitions = 400
    num_output_partitions = 10

    def parse_arguments(self):
        """Returns the parsed arguments from the command line"""

        description = self.name
        if self.__doc__ is not None:
            description += " - "
            description += self.__doc__
        arg_parser = argparse.ArgumentParser(description=description)

        arg_parser.add_argument("input",
                                help="Path to file listing input paths")
        arg_parser.add_argument("output",
                                help="Name of output table"
                                " (saved in spark.sql.warehouse.dir)")

        arg_parser.add_argument("--num_input_partitions", type=int,
                                default=self.num_input_partitions,
                                help="Number of input splits/partitions")
        arg_parser.add_argument("--num_output_partitions", type=int,
                                default=self.num_output_partitions,
                                help="Number of output partitions")
        arg_parser.add_argument("--local_temp_dir", default=None,
                                help="Local temporary directory, used to"
                                " buffer content from S3")

        arg_parser.add_argument("--log_level", default=self.log_level,
                                help="Logging level")

        self.add_arguments(arg_parser)
        args = arg_parser.parse_args()
        self.validate_arguments(args)
        self.init_logging(args.log_level)

        return args

    def add_arguments(self, parser):
        pass

    def validate_arguments(self, args):
        return True

    def init_logging(self, level=None):
        if level is None:
            level = self.log_level
        else:
            self.log_level = level
        logging.basicConfig(level=level, format=LOGGING_FORMAT)

    def get_logger(self, spark_context=None):
        """Get logger from SparkContext or (if None) from logging module"""
        if spark_context is None:
            return logging.getLogger(self.name)
        return spark_context._jvm.org.apache.log4j.LogManager \
            .getLogger(self.name)

    def run(self):
        self.args = self.parse_arguments()

        conf = SparkConf().setAll((
            ("spark.task.maxFailures", "10"),
            ("spark.locality.wait", "20s"),
            ("spark.serializer", "org.apache.spark.serializer.KryoSerializer"),
        ))
        sc = SparkContext(appName=self.name, conf=conf)
        sqlc = SQLContext(sparkContext=sc)

        self.records_processed = sc.accumulator(0)
        self.warc_input_processed = sc.accumulator(0)
        self.warc_input_failed = sc.accumulator(0)

        self.run_job(sc, sqlc)

        sc.stop()

    def log_aggregator(self, sc, agg, descr):
        self.get_logger(sc).info(descr.format(agg.value))

    def log_aggregators(self, sc):
        self.log_aggregator(sc, self.warc_input_processed,
                            'WARC input files processed = {}')
        self.log_aggregator(sc, self.warc_input_failed,
                            'WARC input files failed = {}')
        self.log_aggregator(sc, self.records_processed,
                            'records processed = {}')

    @staticmethod
    def reduce_by_key_func(a, b):
        return a + b

    def run_job(self, sc, sqlc):
        input_data = sc.textFile(
            self.args.input,
            minPartitions=self.args.num_input_partitions)

        output = input_data.mapPartitionsWithIndex(self.process_warcs) \
            .reduceByKey(self.reduce_by_key_func)

        sqlc.createDataFrame(output, schema=self.output_schema) \
            .coalesce(self.args.num_output_partitions) \
            .write \
            .format("parquet") \
            .saveAsTable(self.args.output)

        self.get_logger(sc).info('records processed = {}'.format(
            self.records_processed.value))

    def process_warcs(self, id_, iterator):
        s3pattern = re.compile('^s3://([^/]+)/(.+)')
        base_dir = os.path.abspath(os.path.dirname(__file__))

        # S3 client (not thread-safe, initialize outside parallelized loop)
        no_sign_request = botocore.client.Config(
            signature_version=botocore.UNSIGNED)
        s3client = boto3.client('s3', config=no_sign_request)

        for uri in iterator:
            self.warc_input_processed.add(1)
            if uri.startswith('s3://'):
                self.get_logger().info('Reading from S3 {}'.format(uri))
                s3match = s3pattern.match(uri)
                if s3match is None:
                    self.get_logger().error("Invalid S3 URI: " + uri)
                    continue
                bucketname = s3match.group(1)
                path = s3match.group(2)
                warctemp = TemporaryFile(mode='w+b',
                                         dir=self.args.local_temp_dir)
                try:
                    s3client.download_fileobj(bucketname, path, warctemp)
                except botocore.client.ClientError as exception:
                    self.get_logger().error(
                        'Failed to download {}: {}'.format(uri, exception))
                    self.warc_input_failed.add(1)
                    continue
                warctemp.seek(0)
                stream = warctemp
            elif uri.startswith('hdfs://'):
                self.get_logger().error("HDFS input not implemented: " + uri)
                continue
            else:
                self.get_logger().info('Reading local stream {}'.format(uri))
                if uri.startswith('file:'):
                    uri = uri[5:]
                uri = os.path.join(base_dir, uri)
                try:
                    stream = open(uri, 'rb')
                except IOError as exception:
                    self.get_logger().error(
                        'Failed to open {}: {}'.format(uri, exception))
                    self.warc_input_failed.add(1)
                    continue

            no_parse = (not self.warc_parse_http_header)
            try:
                for record in ArchiveIterator(stream,
                                              no_record_parse=no_parse):
                    for res in self.process_record(record):
                        yield res
                    self.records_processed.add(1)
            except ArchiveLoadFailed as exception:
                self.warc_input_failed.add(1)
                self.get_logger().error(
                    'Invalid WARC: {} - {}'.format(uri, exception))
            finally:
                if uri.startswith('file:'):
                    stream.close()

    def process_record(self, record):
        raise NotImplementedError('Processing record needs to be customized')

    @staticmethod
    def is_wet_text_record(record):
        """Return true if WARC record is a WET text/plain record"""
        return (record.rec_type == 'conversion' and
                record.content_type == 'text/plain')

    @staticmethod
    def is_wat_json_record(record):
        """Return true if WARC record is a WAT record"""
        return (record.rec_type == 'metadata' and
                record.content_type == 'application/json')
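# A minimal sketch of how CCSparkJob is meant to be specialized — this subclass
# is illustrative, not part of the original codebase: process_record() emits
# (key, count) pairs that run_job() then reduces with reduce_by_key_func.
class WordCountJob(CCSparkJob):
    """Count words in Common Crawl WET (plain text) records"""

    name = 'WordCountJob'

    def process_record(self, record):
        if not self.is_wet_text_record(record):
            return
        text = record.content_stream().read().decode('utf-8', errors='ignore')
        for word in text.split():
            yield word.lower(), 1


if __name__ == '__main__':
    job = WordCountJob()
    job.run()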
    persist()
donnees06 = donnees05.map(
    lambda ligne: transformLignePoste(ligne)).persist()
donnees07 = donnees06.join(donnees03).persist()
donnees08 = donnees07.sortByKey().persist()
donnees09 = donnees08.map(
    lambda ligne: tuple([ligne[0]] +
                        [x for x in ligne[1][0]] +
                        [x for x in ligne[1][1]])).persist()

schema = StructType([
    StructField('Id',          StringType(),  True),
    StructField('ville',       StringType(),  True),
    StructField('latitude',    StringType(),  True),
    StructField('longitude',   StringType(),  True),
    StructField('altitude',    StringType(),  True),
    StructField('annee',       IntegerType(), True),
    StructField('mois',        IntegerType(), True),
    StructField('jour',        IntegerType(), True),
    StructField('temperature', FloatType(),   True),
    StructField('humidite',    FloatType(),   True),
    StructField('visibilite',  FloatType(),   True),
    StructField('pression',    FloatType(),   True)])

donnees10 = spark.createDataFrame(donnees09, schema).cache()
donnees11 = donnees10.filter(donnees10.Id < '08000')
donnees12 = donnees11.groupBy('ville').\
class PhoneNumbers:
    filt = r"[\(\)\- ]*"
    mid_zero = "(?:{0}\( *?0 *?\))".format(filt)
    phone_regex = "(?:(?<=\D)00{0}3{0}1|\+{0}3{0}1){1}?(?:{0}[0-9]){{9}}".format(
        filt, mid_zero)
    replace_regex = "{0}|{1}".format(mid_zero, filt)
    zeroplus_regex = "^00"

    phone_nl_filter = re.compile(phone_regex)
    replace_filter = re.compile(replace_regex)
    zeroplus_filter = re.compile(zeroplus_regex)

    output_schema = StructType([
        StructField("num", StringType(), True),
        StructField("urls", ArrayType(StringType()), True)
    ])

    def __init__(self, input_file, output_dir, name, partitions=None):
        self.name = name
        self.input_file = input_file
        self.output_dir = output_dir
        self.partitions = partitions

    def run(self):
        sc = SparkContext(appName=self.name)
        sqlc = SQLContext(sparkContext=sc)
        self.failed_record_parse = sc.accumulator(0)
        self.failed_segment = sc.accumulator(0)
        if self.partitions is None:
            self.partitions = sc.defaultParallelism

        input_data = sc.textFile(self.input_file,
                                 minPartitions=self.partitions)
        phone_numbers = input_data.flatMap(self.process_warcs)
        phone_numb_agg_web = phone_numbers.groupByKey().mapValues(list)

        sqlc.createDataFrame(phone_numb_agg_web, schema=self.output_schema) \
            .write \
            .mode("overwrite") \
            .format("parquet") \
            .save(self.output_dir)

        self.log(sc, "Failed segments: {}".format(self.failed_segment.value))
        self.log(sc, "Failed parses: {}".format(
            self.failed_record_parse.value))

    def log(self, sc, message, level="warn"):
        log = sc._jvm.org.apache.log4j.LogManager.getLogger(self.name)
        if level == "info":
            log.info(message)
        elif level == "warn":
            log.warn(message)
        else:
            log.warn("Level unknown for logging: {}".format(level))

    def process_warcs(self, input_uri):
        stream = None
        if input_uri.startswith('file:'):
            stream = self.process_file_warc(input_uri)
        elif input_uri.startswith('s3:/'):
            stream = self.process_s3_warc(input_uri)
        if stream is None:
            return []
        return self.process_records(stream)

    def process_s3_warc(self, uri):
        try:
            no_sign_request = botocore.client.Config(
                signature_version=botocore.UNSIGNED)
            s3client = boto3.client('s3', config=no_sign_request)
            s3pattern = re.compile('^s3://([^/]+)/(.+)')
            s3match = s3pattern.match(uri)
            if s3match is None:
                print("Invalid URI: {}".format(uri))
                self.failed_segment.add(1)
                return None
            bucketname = s3match.group(1)
            path = s3match.group(2)
            warctemp = TemporaryFile(mode='w+b')
            s3client.download_fileobj(bucketname, path, warctemp)
            warctemp.seek(0)
            return warctemp
        except BaseException as e:
            print("Failed fetching {}\nError: {}".format(uri, e))
            self.failed_segment.add(1)
            return None

    def process_file_warc(self, input_file):
        try:
            return open(input_file[5:], 'rb')
        except BaseException as e:
            print("Error occurred loading file: {}".format(input_file))
            self.failed_segment.add(1)
            return None

    def process_records(self, stream):
        try:
            for rec in ArchiveIterator(stream):
                uri = rec.rec_headers.get_header("WARC-Target-URI")
                if uri is None:
                    continue
                try:
                    for num in self.find_phone_numbers(rec.content_stream()):
                        yield (num, uri)
                except UnicodeDecodeError as e:
                    print("Error: {}".format(e))
                    self.failed_record_parse.add(1)
                    continue
        except BaseException as e:
            print("Failed parsing.\nError: {}".format(e))
            self.failed_segment.add(1)

    def find_phone_numbers(self, content):
        content = content.read().decode('utf-8')
        numbers = self.phone_nl_filter.findall(content)
        nums_filt = {
            re.sub(self.zeroplus_filter, "+",
                   re.sub(self.replace_filter, "", num))
            for num in numbers
        }
        for num in nums_filt:
            yield num
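# Hypothetical usage of the class above — the paths and job name here are
# placeholders for illustration, not taken from the original script:
if __name__ == '__main__':
    job = PhoneNumbers(input_file='file:input/wet.paths',
                       output_dir='phone_numbers.parquet',
                       name='PhoneNumbersNL')
    job.run()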
nargs = int(sys.argv[argctr])
for x in range(0, nargs):
    argctr = argctr + 1
    vecSizes.append(int(sys.argv[argctr]))

# get sample sizes
sampleSizes = []
argctr = argctr + 1
nargs = int(sys.argv[argctr])
for x in range(0, nargs):
    argctr = argctr + 1
    sampleSizes.append(int(sys.argv[argctr]))

argctr = argctr + 1
testHistory = sys.argv[argctr]
argctr = argctr + 1
confidence = float(sys.argv[argctr])

switch = 10
catSizeLog = 10

seqs = spark.read.csv(
    seqFile, sep=',',
    schema=StructType([StructField('word', StringType(), False)]))
seqs.show(10, False)
main(f, seqs, outDir)
sys.exit()
def register(self, name, f, returnType=StringType()):
    return self.sqlContext.registerFunction(name, f, returnType)
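# Hypothetical usage of the wrapper above, assuming it lives on an object that
# carries a sqlContext (the names below are illustrative):
# registry.register("to_upper", lambda s: s.upper())            # default StringType()
# registry.register("strlen", lambda s: len(s), IntegerType())
# sqlContext.sql("SELECT to_upper(name), strlen(name) FROM people")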
def text_clustering(dataFrame, k_value, w2v=False, w2v_value=None, seed=2137,
                    normalize=True, plot=True):
    """
    args:
    - dataFrame: spark Data Frame
    - k_value: number of clusters in the k-means algorithm
    - w2v: if True, Word2Vec is used and w2v_value must be specified;
      otherwise tf-idf is used
    - w2v_value: number of parameters to be returned by Word2Vec
    - seed: seed
    - normalize: should normalization after Word2Vec be performed?
    - plot: if True, clusters are visualized with the use of PCA
    """
    # Data preprocessing
    tokenizer = Tokenizer(inputCol="text", outputCol="words_raw")
    dataFrame = tokenizer.transform(dataFrame)
    remover = StopWordsRemover(inputCol="words_raw", outputCol="words")
    dataFrame = remover.transform(dataFrame)

    if w2v and w2v_value is None:
        raise ValueError('You have to give the w2v_value parameter')

    if not w2v:
        # tf-idf
        hashingTF = HashingTF(inputCol="words_raw", outputCol="rawFeatures",
                              numFeatures=20)
        featurizedData = hashingTF.transform(dataFrame)
        idf = IDF(inputCol="rawFeatures", outputCol="features")
        idfModel = idf.fit(featurizedData)
        memes_df = idfModel.transform(featurizedData)
    else:
        # word2vec
        word2Vec = Word2Vec(vectorSize=w2v_value, seed=seed, inputCol="words",
                            outputCol="features_unnormalized")
        model_w2v = word2Vec.fit(dataFrame)
        memes_df = model_w2v.transform(dataFrame)
        model_w2v.write().overwrite().save("hdfs:///models/model_w2v")

    # normalization applies only to the Word2Vec path: the tf-idf branch
    # already writes its output to "features"
    if normalize and w2v:
        scaler = StandardScaler(inputCol="features_unnormalized",
                                outputCol="features",
                                withStd=True, withMean=True)
        scalerModel = scaler.fit(memes_df)
        memes_df = scalerModel.transform(memes_df)

    # kmeans
    kmeans = KMeans(k=k_value, seed=seed)
    model_kmeans = kmeans.fit(memes_df)
    memes_df = model_kmeans.transform(memes_df)
    model_kmeans.write().overwrite().save("hdfs:///models/model_kmeans")

    # clustering evaluation
    evaluator = ClusteringEvaluator()
    silhouette = evaluator.evaluate(memes_df)
    centers = model_kmeans.clusterCenters()

    if plot:
        # a virtual environment might have problems if imported "the classical" way
        import matplotlib.pyplot as plt

        # pca
        pca = PCA(k=2, inputCol="features", outputCol="pcaFeatures")
        model_pca = pca.fit(memes_df)
        memes_df = model_pca.transform(memes_df)
        # memes_df.show()
        centers_pca = [None] * len(centers)
        for i in range(len(centers)):
            centers_pca[i] = np.multiply(model_pca.pc.toArray().T,
                                         centers[i]).sum(axis=1)
        centers_pca = np.array(centers_pca)

        # plot section
        split_col = functions.split(
            memes_df["pcaFeatures"].cast(StringType()), ',')
        memes_df = memes_df.withColumn(
            'x', translate(split_col.getItem(0), "[", "").cast(DoubleType()))
        memes_df = memes_df.withColumn(
            'y', translate(split_col.getItem(1), "]", "").cast(DoubleType()))
        # memes_df.show(truncate=False)
        df = memes_df.toPandas()
        groups = df.groupby('prediction')

        fig, ax = plt.subplots()
        ax.margins(0.05)
        for name, group in groups:
            ax.plot(group.x, group.y, marker='o', linestyle='', ms=5,
                    label=name)
            ax.text(centers_pca[name, 0], centers_pca[name, 1], s=name,
                    fontsize=10)
        ax.legend()
        ax.title.set_text("k={0}, wn={1}, Silhouette={2}".format(
            k_value, w2v_value, silhouette))
        plt.show()
        print("PCA, explained variance= {0}".format(
            model_pca.explainedVariance))

    return memes_df
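# Hypothetical call, assuming a DataFrame with a "text" column (the variable
# names are illustrative, not from the original script):
# clustered = text_clustering(memes_raw_df, k_value=5, w2v=True, w2v_value=50)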
        return '50-54'
    elif age >= 55 and age < 60:
        return '55-59'
    elif age >= 60 and age < 65:
        return '60-64'
    elif age >= 65 and age < 70:
        return '65-69'
    elif age >= 70 and age < 75:
        return '70-74'
    elif age >= 75:
        return '75+'
    else:
        return ''


age_banding_udf = udf(bandingFunction, StringType())
trainingSDF = trainingSDF.withColumn('age_banding',
                                     age_banding_udf(trainingSDF.age))
trainingSDF = trainingSDF.drop('age')

temp_table_name = 'trainingSDF'
trainingSDF.createOrReplaceTempView(temp_table_name)

# COMMAND ----------

# MAGIC %md
# MAGIC Let's now visualize the distribution.

# COMMAND ----------
conf = SparkConf().setAppName("Max Temperature").setMaster("local[3]")
sc = SparkContext(conf=conf)
spark = SparkSession.builder.config(conf=conf).getOrCreate()


def myFields(data_passed):
    field = data_passed.split(",")
    zone_id = field[0]
    date = field[1]
    temp_type = field[2]
    # tenths of a degree Celsius -> degrees Fahrenheit
    temperature = round(float(field[3]) * 0.1 * (9.0 / 5.0) + 32.0, 2)
    return zone_id, date, temp_type, temperature


my_schema = StructType([
    StructField("zone_id", StringType()),
    StructField("date", StringType()),
    StructField("temp_type", StringType()),
    StructField("temperature", FloatType())
])

# By rdd+df
# rdd1 = sc.textFile(r"D:\pythonProject\tammingBigDataSparkPython\1800.csv")
# rdd2 = rdd1.map(lambda x: myFields(x))
# df2 = spark.createDataFrame(rdd2, my_schema)
# df2.show(5)
# rdd3 = df2.filter("temp_type = 'TMAX'").select("zone_id", "temperature").rdd\
#     .reduceByKey(lambda x, y: max(x, y))
# df3 = spark.createDataFrame(rdd3)
# df3.show(5)
    .appName('first_app') \
    .getOrCreate()

# create an RDD from a collection
rdd = spark.sparkContext.parallelize([
    (1001, "张飞", 8341, "坦克"),
    (1002, "关羽", 7107, "战士"),
    (1003, "刘备", 6900, "战士")
])

# specify the schema, StructField(name, dataType, nullable)
# name: the field's name; dataType: the field's data type;
# nullable: whether the field's value may be null
from pyspark.sql.types import StructType, StructField, LongType, StringType  # import the types

schema = StructType([
    StructField("id", LongType(), True),
    StructField("name", StringType(), True),
    StructField("hp", LongType(), True),  # hit points
    StructField("role_main", StringType(), True)
])

# apply the schema to the RDD and create a DataFrame
heros = spark.createDataFrame(rdd, schema)
heros.show()

# register a temporary view from the DataFrame
heros.registerTempTable("HeroGames")
# print the DataFrame's row count
print(heros.count())

# create a dataframe using automatic type inference
data = [(1001, "张飞", 8341, "坦克"),
os.environ["PYSPARK_DRIVER_PYTHON"] = "python3" os.environ["PYSPARK_SUBMIT_ARGS"] = "pyspark-shell" # - sp = SparkSession.builder.master("local").appName("data").getOrCreate() sp.stop() # + data = [ ('PNT', 'point', 1, 1, "точка"), ('LNS', 'line segment', 2, 2, "отрезок"), ('TRI', 'triangle', 3, 2, "треугольник"), ('QDR', 'quadrangle', 4, 2, "четырехугольник"), ('QDT', 'quadrant', 4, 2, "квадрат"), ] schema = StructType([ StructField('id', StringType(), True), StructField('name', StringType(), True), StructField('points', IntegerType(), True), StructField('dimensions', IntegerType(), True), StructField('description', StringType(), True) ]) # Convert list to RDD rdd = sp.sparkContext.parallelize(data) # Create data frame df = sp.createDataFrame(rdd, schema) df.show()
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import from_json, col, to_timestamp, window, expr, sum
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("Tumbling Window Stream") \
        .master("local[3]") \
        .config("spark.streaming.stopGracefullyOnShutdown", "true") \
        .config("spark.sql.shuffle.partitions", 2) \
        .getOrCreate()

    stock_schema = StructType([
        StructField("CreatedTime", StringType()),
        StructField("Type", StringType()),
        StructField("Amount", IntegerType()),
        StructField("BrokerCode", StringType())
    ])

    kafka_df = spark.readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("subscribe", "trades") \
        .option("startingOffsets", "earliest") \
        .load()

    value_df = kafka_df.select(
        from_json(col("value").cast("string"), stock_schema).alias("value"))
    # value_df.printSchema()
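    # A sketch of the tumbling-window aggregation this script is building
    # toward — the exact columns and window length below are assumptions based
    # on the imports (to_timestamp, window, expr, sum) and the schema above:
    trade_df = value_df.select("value.*") \
        .withColumn("CreatedTime",
                    to_timestamp(col("CreatedTime"), "yyyy-MM-dd HH:mm:ss")) \
        .withColumn("Buy", expr("case when Type = 'BUY' then Amount else 0 end")) \
        .withColumn("Sell", expr("case when Type = 'SELL' then Amount else 0 end"))

    window_agg_df = trade_df \
        .groupBy(window(col("CreatedTime"), "15 minute")) \
        .agg(sum("Buy").alias("TotalBuy"),
             sum("Sell").alias("TotalSell"))

    window_query = window_agg_df.writeStream \
        .format("console") \
        .outputMode("update") \
        .option("checkpointLocation", "chk-point-dir") \
        .start()

    window_query.awaitTermination()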
hc = HiveContext(ss)

#---------------------------------------------------------------------------------------------------------------------------------------------------
first_month = "'2017-01-%'"
second_month = "'2017-02-%'"
non_dropouts_table = "temp_user_store.rt_nondropouts_jan2017"
dropouts_table = "temp_user_store.rt_dropouts_jan2017"


def rename_device(device):
    if device in ('games console', 'tv', 'set top box', 'desktop', ''):
        return 'desktop'
    elif device in ('mobile phone', 'media player', 'tablet', 'ereader'):
        return 'mobile'
    else:
        return 'other'


rename_device = F.udf(rename_device, StringType())

#---------------------------------------------------------------------------------------------------------------------------------------------------
# Extract individual events
events = hc.sql("SELECT \
    visitor_id, \
    hit_time_gmt, \
    file_date, \
    month(file_date) as month, \
    session_id, \
    pageview_event_cnt, \
    video_start_cnt, \
    video_time_spent_secs, \
    browser_typ_dsc, \
    device_type_dsc as device \
    FROM user_business_defined_dataset.cnn_adobe_bdd_web \
def register(self, name, f, returnType=None):
    """Register a Python function (including lambda function) or a user-defined function
    as a SQL function.

    .. versionadded:: 1.3.1

    Parameters
    ----------
    name : str,
        name of the user-defined function in SQL statements.
    f : function, :meth:`pyspark.sql.functions.udf` or :meth:`pyspark.sql.functions.pandas_udf`
        a Python function, or a user-defined function. The user-defined function can
        be either row-at-a-time or vectorized. See :meth:`pyspark.sql.functions.udf` and
        :meth:`pyspark.sql.functions.pandas_udf`.
    returnType : :class:`pyspark.sql.types.DataType` or str, optional
        the return type of the registered user-defined function. The value can
        be either a :class:`pyspark.sql.types.DataType` object or a DDL-formatted
        type string. `returnType` can be optionally specified when `f` is a Python
        function but not when `f` is a user-defined function. Please see the
        examples below.

    Returns
    -------
    function
        a user-defined function

    Notes
    -----
    To register a nondeterministic Python function, users need to first build
    a nondeterministic user-defined function for the Python function and then
    register it as a SQL function.

    Examples
    --------
    1. When `f` is a Python function:

        `returnType` defaults to string type and can be optionally specified. The
        produced object must match the specified type. In this case, this API works
        as if `register(name, f, returnType=StringType())`.

        >>> strlen = spark.udf.register("stringLengthString", lambda x: len(x))
        >>> spark.sql("SELECT stringLengthString('test')").collect()
        [Row(stringLengthString(test)='4')]

        >>> spark.sql("SELECT 'foo' AS text").select(strlen("text")).collect()
        [Row(stringLengthString(text)='3')]

        >>> from pyspark.sql.types import IntegerType
        >>> _ = spark.udf.register("stringLengthInt", lambda x: len(x), IntegerType())
        >>> spark.sql("SELECT stringLengthInt('test')").collect()
        [Row(stringLengthInt(test)=4)]

    2. When `f` is a user-defined function (from Spark 2.3.0):

        Spark uses the return type of the given user-defined function as the return
        type of the registered user-defined function. `returnType` should not be
        specified. In this case, this API works as if `register(name, f)`.

        >>> from pyspark.sql.types import IntegerType
        >>> from pyspark.sql.functions import udf
        >>> slen = udf(lambda s: len(s), IntegerType())
        >>> _ = spark.udf.register("slen", slen)
        >>> spark.sql("SELECT slen('test')").collect()
        [Row(slen(test)=4)]

        >>> import random
        >>> from pyspark.sql.functions import udf
        >>> from pyspark.sql.types import IntegerType
        >>> random_udf = udf(lambda: random.randint(0, 100), IntegerType()).asNondeterministic()
        >>> new_random_udf = spark.udf.register("random_udf", random_udf)
        >>> spark.sql("SELECT random_udf()").collect()  # doctest: +SKIP
        [Row(random_udf()=82)]

        >>> import pandas as pd  # doctest: +SKIP
        >>> from pyspark.sql.functions import pandas_udf
        >>> @pandas_udf("integer")  # doctest: +SKIP
        ... def add_one(s: pd.Series) -> pd.Series:
        ...     return s + 1
        ...
        >>> _ = spark.udf.register("add_one", add_one)  # doctest: +SKIP
        >>> spark.sql("SELECT add_one(id) FROM range(3)").collect()  # doctest: +SKIP
        [Row(add_one(id)=1), Row(add_one(id)=2), Row(add_one(id)=3)]

        >>> @pandas_udf("integer")  # doctest: +SKIP
        ... def sum_udf(v: pd.Series) -> int:
        ...     return v.sum()
        ...
        >>> _ = spark.udf.register("sum_udf", sum_udf)  # doctest: +SKIP
        >>> q = "SELECT sum_udf(v1) FROM VALUES (3, 0), (2, 0), (1, 1) tbl(v1, v2) GROUP BY v2"
        >>> spark.sql(q).collect()  # doctest: +SKIP
        [Row(sum_udf(v1)=1), Row(sum_udf(v1)=5)]
    """

    # This is to check whether the input function is from a user-defined function or
    # Python function.
    if hasattr(f, 'asNondeterministic'):
        if returnType is not None:
            raise TypeError(
                "Invalid return type: data type can not be specified when f is "
                "a user-defined function, but got %s." % returnType)
        if f.evalType not in [PythonEvalType.SQL_BATCHED_UDF,
                              PythonEvalType.SQL_SCALAR_PANDAS_UDF,
                              PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF,
                              PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF,
                              PythonEvalType.SQL_MAP_PANDAS_ITER_UDF]:
            raise ValueError(
                "Invalid f: f must be SQL_BATCHED_UDF, SQL_SCALAR_PANDAS_UDF, "
                "SQL_SCALAR_PANDAS_ITER_UDF, SQL_GROUPED_AGG_PANDAS_UDF or "
                "SQL_MAP_PANDAS_ITER_UDF.")
        register_udf = UserDefinedFunction(f.func, returnType=f.returnType,
                                           name=name,
                                           evalType=f.evalType,
                                           deterministic=f.deterministic)
        return_udf = f
    else:
        if returnType is None:
            returnType = StringType()
        register_udf = UserDefinedFunction(f, returnType=returnType,
                                           name=name,
                                           evalType=PythonEvalType.SQL_BATCHED_UDF)
        return_udf = register_udf._wrapped()
    self.sparkSession._jsparkSession.udf().registerPython(name, register_udf._judf)
    return return_udf
                       header=True, inferSchema=True)
display(dfFire)

# COMMAND ----------

dfFire.printSchema()

# COMMAND ----------

from pyspark.sql.types import StructField, StructType, IntegerType, StringType, BooleanType

# COMMAND ----------

fire_schema = StructType([
    StructField('CallNumber', StringType(), True),
    StructField('UnitID', StringType(), True),
    StructField('IncidentNumber', IntegerType(), True),
    StructField('CallType', StringType(), True),
    StructField('CallDate', StringType(), True),
    StructField('WatchDate', StringType(), True),
    StructField('ReceivedDtTm', StringType(), True),
    StructField('EntryDtTm', StringType(), True),
    StructField('DispatchDtTm', StringType(), True),
    StructField('ResponseDtTm', StringType(), True),
    StructField('OnSceneDtTm', StringType(), True),
    StructField('TransportDtTm', StringType(), True),
    StructField('HospitalDtTm', StringType(), True),
    StructField('CallFinalDisposition', StringType(), True),
    StructField('AvailableDtTm', StringType(), True),
    StructField('Address', StringType(), True),
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StringType, IntegerType, StructType

base_path = '/home/edoardo/Udemy/PySpark/Python-and-Spark-for-Big-Data-master/Spark_DataFrames/'
file_name = 'people.json'

spark = SparkSession.builder.appName('Basics').getOrCreate()

# Read the 'age' column as IntegerType values; True marks the field as nullable
data_schema = [StructField('age', IntegerType(), True),
               StructField('name', StringType(), True)]
final_struct = StructType(fields=data_schema)

df = spark.read.json(base_path + file_name, schema=final_struct)

# select() returns a DataFrame with a single column; df['age'] returns a pyspark.Column object
print(df.select('age').show())

# withColumn() ---> creates a new column in the dataframe
print(df.withColumn('newage', df['age'] * 2).show())

# Just rename a column
print(df.withColumnRenamed('age', 'my_new_age').show())

df.createOrReplaceTempView('people')
    return result


# Get movie name by given movie id
def getMovieName(movieNames, movieId):
    result = movieNames.filter(func.col("movieID") == movieId) \
        .select("movieTitle").collect()[0]
    return result[0]


spark = SparkSession.builder.appName("MovieSimilarities").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

movieNamesSchema = StructType([
    StructField("movieID", IntegerType(), True),
    StructField("movieTitle", StringType(), True)
])

moviesSchema = StructType([
    StructField("userID", IntegerType(), True),
    StructField("movieID", IntegerType(), True),
    StructField("rating", IntegerType(), True),
    StructField("timestamp", LongType(), True)
])

# Create a broadcast dataset of movieID and movieTitle.
# Apply the ISO-8859-1 charset
movieNames = spark.read \
    .option("sep", "|") \
    .option("charset", "ISO-8859-1") \
    .schema(movieNamesSchema) \
import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('rdd-to-dataframe').getOrCreate()

dept = [("Finance", 10), ("Marketing", 20), ("Sales", 30), ("IT", 40)]
rdd = spark.sparkContext.parallelize(dept)

df = rdd.toDF()
df.printSchema()
df.show(truncate=False)

deptColumns = ["dept_name", "dept_id"]
df2 = rdd.toDF(deptColumns)
df2.printSchema()
df2.show(truncate=False)

deptDF = spark.createDataFrame(data=dept, schema=deptColumns)
deptDF.printSchema()
deptDF.show(truncate=False)

from pyspark.sql.types import StructType, StructField, StringType, LongType

deptSchema = StructType([
    StructField('dept_name', StringType(), True),
    StructField('dept_id', LongType(), True)  # dept_id values are integers, so StringType would be rejected
])

deptDF1 = spark.createDataFrame(data=dept, schema=deptSchema)
deptDF1.printSchema()
deptDF1.show(truncate=False)
app_conf = yaml.load(conf, Loader=yaml.FullLoader)
secret = open(app_secrets_path)
app_secret = yaml.load(secret, Loader=yaml.FullLoader)

# Setup spark to use s3
hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.access.key", app_secret["s3_conf"]["access_key"])
hadoop_conf.set("fs.s3a.secret.key", app_secret["s3_conf"]["secret_access_key"])

print("\nCreating dataframe ingestion CSV file using 'SparkSession.read.format()'")

crime_schema = StructType() \
    .add("Id", StringType(), True) \
    .add("City_Name", StringType(), True) \
    .add("Crime_Name", StringType(), True) \
    .add("Damages", StringType(), True) \
    .add("No_Of_Case", IntegerType(), True) \
    .add("Year", IntegerType(), True) \
    .add("Total_Case", IntegerType(), True)

crime_df = spark.read \
    .option("header", "false") \
    .option("delimiter", ",") \
    .format("csv") \
    .schema(crime_schema) \
    .load("s3a://" + app_conf["s3_conf"]["s3_bucket"] + "/crime_dataset/xac.csv")

crime_df.printSchema()
"float": FloatType, "double": DoubleType, "boolean": BooleanType, "struct": StructType, "array": ArrayType, "bigint": LongType, "date": DateType, "byte": ByteType, "short": ShortType, "datetime": TimestampType, "binary": BinaryType, "null": NullType } SPARK_DTYPES_DICT_OBJECTS = \ {"string": StringType(), "int": IntegerType(), "float": FloatType(), "double": DoubleType(), "boolean": BooleanType(), "struct": StructType(), "array": ArrayType(StringType()), "bigint": LongType(), "date": DateType(), "byte": ByteType(), "short": ShortType(), "datetime": TimestampType(), "binary": BinaryType(), "null": NullType() } # Profiler PROFILER_TYPES = { "int", "float", "string", "bool", "date", "null", "array", "double" } PROFILER_LEGEND_TYPES = { "string": "ABC", "int": "#", "integer": "#", "float": "##.#", "double": "##.#",
import datetime  # needed by p_ordinalDate below
import math
import random
from itertools import combinations

import numpy
# import scipy.misc
# from scipy import spatial
import scipy.special  # comb() lives here in current scipy
from Orange.widgets.widget import OWWidget
from nltk.corpus import wordnet
from nltk.wsd import lesk
from pyspark.ml.feature import VectorAssembler, StandardScaler, MinMaxScaler
from pyspark.sql import functions as F
from pyspark.sql.functions import udf, struct
from pyspark.sql.types import IntegerType, StringType, ArrayType, Row, DoubleType, StructField, StructType
from scipy.optimize import linear_sum_assignment


@udf(returnType=StringType())
def p_locationType(string):
    # the udf is declared StringType, so return string codes rather than the
    # bare ints the original returned (Spark would have turned those into nulls)
    if "Farm" in string:
        return "1"
    if "Residenti" in string:
        return "2"
    if "Commerci" in string:
        return "3"
    if "Publi" in string:
        return "4"
    return "UNKNOWN"


@udf(returnType=IntegerType())
def p_ordinalDate(string):
    start = datetime.datetime.strptime(string.strip(), '%d/%m/%Y')