    columns_to_return = [
        'trip_id', 'stop_id', 'prev_stop', 'stop_sequence', 'route_id',
        'filled_ts', 'day', 'hour', 'week', 'diff_schedule_real'
    ]
    df = df.filter(df.filled_ts.isNotNull()) \
        .filter(df.filled_ts > 0) \
        .filter(df.filled_ts != 6666.6666) \
        .select(*columns_to_return)
    return df


if __name__ == '__main__':
    scSpark = SparkSession.builder.appName('Stop to Stop').getOrCreate()
    scSpark.sparkContext.setLogLevel("ERROR")
    sqlCon = SQLContext(scSpark)

    stops_file = 'clean_stops.csv'
    trips_file = 'trips_agglom.csv'
    data_file = 'miniminiset.csv'
    if len(sys.argv) == 4:
        stops_file = sys.argv[1]
        data_file = sys.argv[2]
        trips_file = sys.argv[3]

    # Begin with a straightforward read from csv.
    trips_df = scSpark.read.csv(trips_file, header=True, sep=',')

    # Stop times require casting. Convert to DataFrame.
    stops_df = stop_times_to_df(stops_file, scSpark, trips_df)
from pyspark.sql import SQLContext
from pyspark import SparkContext

if __name__ == "__main__":
    # create Spark context with necessary configuration
    spark = SparkContext("local", "Stock Returns")

    # read csv data from the stock_prices file
    df = SQLContext(spark).read.option("header", True).csv("stock_prices.csv")

    # calculate daily percentage returns
    df = df.withColumn("return", ((df["close"] - df["open"]) / df["open"]) * 100)

    # average on date
    df = df.groupBy("date").avg().alias("avg_return")

    # save the average returns to output
    df.write.csv('./stockreturns/')
def get_sql_context_instance(spark_context):
    if 'sqlContextSingletonInstance' not in globals():
        globals()['sqlContextSingletonInstance'] = SQLContext(spark_context)
    return globals()['sqlContextSingletonInstance']
import sys

from pyspark import SparkContext
from pyspark.sql import SQLContext, Row, HiveContext
from pyspark.sql.window import Window
from pyspark.sql import functions as psf

HadoopLink = sys.argv[1]
HadoopLink2 = sys.argv[2]

# conf = SparkConf()
# conf.setMaster("local[*]")
# conf.setAppName("BuildChurnChannelVariables")
# conf.set("spark.executor.memory", "4g")
# conf.set("spark.executor.cores", 2)
# conf.set("spark.jars.packages", "com.databricks:spark-csv_2.11:1.4.0")

sc = SparkContext()
sq = SQLContext(sc)
hq = HiveContext(sc)

### HadoopLink = "hdfs://10.82.187.10:8020/hadoop/hdfs/INPUTPARQUET/"
CreditHistoryLog = hq.read.parquet(HadoopLink + "contr/CreditHistory_parquet")
CreditHistoryLog.registerTempTable("CreditHistory")

Client = hq.read.parquet(HadoopLink + "cli/Client_parquet")
Client.registerTempTable("Client")

ClientBlackList = hq.read.parquet(HadoopLink + "cli/ClientBlackList_parquet")
ClientBlackList.registerTempTable("ClientBlackList")

seip = hq.sql("SELECT DISTINCT ClientID, ReportingDate FROM CreditHistory")
seip.registerTempTable("seip")
from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.sql import SQLContext
import numpy as np
import imutils
import cv2
from imageai.Detection import ObjectDetection
import tensorflow as tf
import time

conf = SparkConf().setAppName("object detection streaming").setMaster("yarn")
conf.set("spark.scheduler.mode", "FAIR")
conf.set("spark.scheduler.allocation.file", "/opt/spark-2.4.3-bin-hadoop2.7/conf/fairscheduler.xml")

sc = SparkContext(conf=conf)
sc.setLocalProperty("spark.scheduler.pool", "pool3")
ssc = StreamingContext(sc, 0.5)
sql_sc = SQLContext(sc)

input_topic = 'input'
output_topic = 'output3'
brokers = "G01-01:2181,G01-02:2181,G01-03:2181,G01-04:2181,G01-05:2181,G01-06:2181,G01-07:2181,G01-08:2181," \
          "G01-09:2181,G01-10:2181,G01-11:2181,G01-12:2181,G01-13:2181,G01-14:2181,G01-15:2181,G01-16:2181"


def my_decoder(s):
    return s


kafkaStream = KafkaUtils.createStream(ssc, brokers, 'test-consumer-group-3', {input_topic: 15},
                                      valueDecoder=my_decoder)
# import path
import pandas as pd
from pyspark.sql import *

# create spark sql session
myspark = SparkSession \
    .builder \
    .config("spark.executor.instances", 3) \
    .config("spark.executor.memory", "5g") \
    .config("spark.executor.cores", 2) \
    .config("spark.dynamicAllocation.maxExecutors", 10) \
    .config("spark.scheduler.listenerbus.eventqueue.size", 10000) \
    .config("spark.sql.parquet.compression.codec", "snappy") \
    .appName("Sample_07_kmeans") \
    .getOrCreate()
sc = myspark.sparkContext

from pyspark.sql import SQLContext

print(sc)

df = pd.read_csv("test.csv")
print(type(df))
print(df)

sqlCtx = SQLContext(sc)
sqlCtx.createDataFrame(df).show()
""" This is a Spark script for curating the historical options """ sc.install_pypi_package('scipy') from pyspark.sql import SparkSession from pyspark.sql import SQLContext import numpy as np spark = SparkSession.builder.appName('MapRedBook').getOrCreate() ctx = SQLContext(spark) options = ctx.read.load('s3n://nbachmei.finsurf.data-us-west-2/data/csv/L2_options_*.csv', header="true", inferSchema="true", format="csv") import pyspark.sql.functions as F import datetime import numpy as np import scipy.stats as si @F.udf def fix_date(dt): if dt is None: return dt parts = dt.split('/') return "{}-{}-{}".format(parts[2],parts[0],parts[1]) @F.udf
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import FloatType
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier, LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.ml.stat import Correlation, Summarizer
import sys

import config

DATA_CSV = config.BUILDDIR / 'bd_lab_small_sample.csv'
DATA_PARQUET = DATA_CSV.with_suffix('.parquet')

spark = SparkContext.getOrCreate()
sql = SQLContext(spark)


def convert():
    """Convert CSV to Parquet."""
    df = sql.read.csv(str(DATA_CSV), header=True, inferSchema='true')
    for field in [
            'cost', 'call_duration_minutes', 'data_volume_mb', 'LAT', 'LON'
    ]:
        df = df.withColumn(field, df[field].cast(FloatType()))
    df.write.parquet(str(DATA_PARQUET))


def explore():
    df = sql.read.parquet(str(DATA_PARQUET))
def test(csv_report, all, dummy_test, separate_test, all_but_test,
         primary_pairs_test, custom_combos_test, non_zero_users_from_file):
    logging.info('Testing started')
    if csv_report:
        if cfg.reporting.use_uuid:
            uuid = uuid4()
            reporter = CSVReport(cfg.reporting.csv_dir, uuid)
        else:
            reporter = CSVReport(cfg.reporting.csv_dir, None)
    else:
        reporter = ExcelReport(cfg.reporting.file)

    logging.info('Spark context initialization')
    sc = SparkContext(cfg.spark.master, 'map_test: train')
    sqlContext = SQLContext(sc)

    logging.info('Train data reading')
    test_df = sqlContext.read.json(cfg.splitting.test_file).cache()
    test_data = test_df.filter("event = '%s'" % (cfg.testing.primary_event)).collect()
    # non_zero_users = set([r[0] for r in test_data][500:650])
    # Because actually all our users have 0.0 scores -- too few data

    if all or dummy_test:
        logging.info('Train data reading')
        train_df = sqlContext.read.json(cfg.splitting.train_file).cache()
        counts = train_df.filter("event = '%s'" % (cfg.testing.primary_event)) \
            .groupBy("targetEntityId").count().collect()
        sorted_rating = sorted([(row.asDict()['count'], row.asDict()['targetEntityId'])
                                for row in counts], reverse=True)
        elements = np.array([item for cnt, item in sorted_rating])
        probs = np.array([cnt for cnt, item in sorted_rating])
        probs = 1.0 * probs / probs.sum()

        logging.info('Process dummy test')
        # case 1. Random sampling from items (uniform)
        dummy_uniform_res = run_map_test_dummy(test_data, items=elements, probs=probs,
                                               uniform=True, top=False, K=cfg.testing.map_k)
        # case 2. Random sampling from items (according to their distribution in training data)
        dummy_res = run_map_test_dummy(test_data, items=elements, probs=probs,
                                       uniform=False, top=False, K=cfg.testing.map_k)
        # case 3. Top-N items from training data
        dummy_top_res = run_map_test_dummy(test_data, items=elements, probs=probs,
                                           uniform=True, top=True, K=cfg.testing.map_k)

        reporter.start_new_sheet('Dummy MAP benchmark')
        reporter.report(
            ['', 'Random uniform', 'Random sampled from train', 'Top - N'],
            [[('MAP @ %d' % i) for i in range(1, len(dummy_res) + 1)]] +
            [dummy_uniform_res, dummy_res, dummy_top_res],
            cfg=cfg
        )
        reporter.finish_sheet()

        logging.info('Process top 20 dummy test')
        scores = []
        for i in range(20):
            scores.append(run_map_test_dummy(test_data, items=elements[i:], uniform=True,
                                             top=True, K=1, no_progress=True)[0])

        reporter.start_new_sheet('Top-20 perfomance')
        reporter.report(
            ['Rank', 'MAP@1'],
            [list(range(1, 21)), scores],
            bold_first_column=False,
            cfg=cfg
        )
        reporter.finish_sheet()

    if all or separate_test or all_but_test or primary_pairs_test or custom_combos_test:
        logging.info('Non zero users')
        if non_zero_users_from_file:
            with open(cfg.testing.non_zero_users_file) as input:
                non_zero_users = set(input.read().split(','))
        else:
            _, r_data, _ = run_map_test(test_data, [cfg.testing.primary_event], test=False)
            non_zero_users = get_nonzero(r_data)
            with open(cfg.testing.non_zero_users_file, 'w') as output:
                output.write(','.join(non_zero_users))

    if all or separate_test:
        logging.info('Process "map separate events" test')
        columns = []
        for ev in cfg.testing.events:
            (r_scores, r_data, ipu) = run_map_test(test_data, [ev], users=non_zero_users, test=False)
            columns.append(r_scores + [len(non_zero_users)])
        first_column = [('MAP @ %d' % i) for i in range(1, len(columns[0]))] + ['non-zero users']

        reporter.start_new_sheet('MAP separate events')
        reporter.report(
            ['event'] + cfg.testing.events,
            [first_column] + columns,
            selected_columns=[cfg.testing.events.index(cfg.testing.primary_event) + 1],
            cfg=cfg
        )
        reporter.finish_sheet()

    if all or all_but_test:
        logging.info('Process "map all but..." test')
        events_scores = []
        for ev in cfg.testing.events:
            evs = list(cfg.testing.events)
            evs.remove(ev)
            (r_scores, r_data, ipu) = run_map_test(test_data, evs, users=non_zero_users, test=False)
            events_scores.append(r_scores + [len(non_zero_users)])

        evl = cfg.testing.events
        all_scores, r_data, ipu = run_map_test(test_data, evl, users=non_zero_users, test=False)
        all_scores.append(len(non_zero_users))
        first_column = [('MAP @ %d' % i) for i in range(1, len(all_scores))] + ['non-zero users']

        reporter.start_new_sheet('MAP all but...')
        reporter.report(
            ['event'] + cfg.testing.events + ['All'],
            [first_column] + events_scores + [all_scores],
            selected_columns=[cfg.testing.events.index(cfg.testing.primary_event) + 1],
            cfg=cfg
        )
        reporter.finish_sheet()

    if all or primary_pairs_test:
        logging.info('Process "map pairs with primary" test')
        columns = []
        events_without_primary = [event for event in cfg.testing.events
                                  if event != cfg.testing.primary_event]
        for event in events_without_primary:
            (r_scores, r_data, ipu) = run_map_test(test_data, [cfg.testing.primary_event, event],
                                                   users=non_zero_users, test=False)
            columns.append(r_scores + [len(non_zero_users)])
        first_column = [('MAP @ %d' % i) for i in range(1, len(columns[0]))] + ['non-zero users']

        reporter.start_new_sheet('MAP pairs with primary')
        reporter.report(
            ['event'] + events_without_primary,
            [first_column] + columns,
            cfg=cfg
        )
        reporter.finish_sheet()

    if all or custom_combos_test:
        logging.info('Process "custom combos" test')
        columns = []
        for event_group in cfg.testing.custom_combos.event_groups:
            if len(event_group) == 2 and cfg.testing.primary_event in event_group and primary_pairs_test:
                logging.warn("Report for group %s already generated in 'MAP pairs with primary'" % str(event_group))
                continue
            if len(event_group) == 1 and separate_test:
                logging.warn("Report for group %s already generated in 'MAP separate events'" % str(event_group))
                continue
            if len(event_group) >= len(cfg.testing.events) - 1 and all_but_test:
                logging.warn("Report for group %s already generated in 'All but...'" % str(event_group))
                continue
            if not (set(cfg.testing.events) & set(event_group)):
                logging.warn("Event group is not correct!")
                continue
            (r_scores, r_data, ipu) = run_map_test(test_data, event_group, users=non_zero_users, test=False)
            columns.append(r_scores + [len(non_zero_users)])
        if columns:
            first_column = [('MAP @ %d' % i) for i in range(1, len(columns[0]))] + ['non-zero users']
            reporter.start_new_sheet('Custom combos')
            reporter.report(
                ['event'] + [str([s.encode('utf-8') for s in group])
                             for group in cfg.testing.custom_combos.event_groups],
                [first_column] + columns,
                cfg=cfg
            )
            reporter.finish_sheet()

    reporter.finish_document()
    logging.info('Testing finished successfully')
def run():
    # Creating the Spark Context
    sc = SparkContext(master="local[2]", appName="WindowWordCount")
    sc.setLogLevel("ERROR")

    # creating the streaming context
    ssc = StreamingContext(sc, 10)
    ssc.checkpoint("checkpoint")

    # creating the SQL context
    sqlContext = SQLContext(sc)

    host = "localhost"
    port = 5599
    lines = ssc.socketTextStream(host, port)

    hashtags = lines.filter(lambda text: len(text) > 0) \
        .flatMap(lambda text: text.split(" ")) \
        .filter(lambda text: text.lower().startswith('#'))

    Word = namedtuple('Word', ("word", "count"))
    Hashtag = namedtuple('Hashtag', ("tag", "count"))
    Tweet = namedtuple('Tweet', ('text', 'sentiment'))

    stop_words = set(stopwords.words('english'))
    list_punct = list(string.punctuation)
    lemmatizer = WordNetLemmatizer()

    # processing to obtain data about tweets text and sentiment
    lines.window(40) \
        .map(lambda p: clean_tweet(p)) \
        .filter(lambda text: len(text) > 0) \
        .map(lambda p: Tweet(p, analyze_sentiment_polarity(p))) \
        .foreachRDD(lambda rdd: rdd.toDF().registerTempTable("tweets"))

    # processing to obtain data about single words in text and their count. NLP tools applied.
    lines.window(40) \
        .map(lambda p: clean_tweet(p)) \
        .filter(lambda text: len(text) > 0) \
        .flatMap(lambda text: text.split(" ")) \
        .map(lambda word: word.lower()) \
        .filter(lambda word: word not in stop_words) \
        .map(lambda word: ''.join(char for char in word if char not in list_punct)) \
        .map(lambda word: lemmatizer.lemmatize(word)) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda a, b: a + b) \
        .map(lambda p: Word(p[0], p[1])) \
        .foreachRDD(lambda rdd: rdd.toDF().registerTempTable("words"))

    # processing to obtain data about hashtags in text and their count.
    hashtags.window(40) \
        .map(lambda word: ''.join(char for char in word if char not in list_punct)) \
        .map(lambda word: (word.lower(), 1)) \
        .reduceByKey(lambda a, b: a + b) \
        .map(lambda p: Hashtag(p[0], p[1])) \
        .foreachRDD(lambda rdd: rdd.toDF().registerTempTable("hashtags"))

    time_to_wait = 80
    ssc.start()
    print("Session Started.....")
    print("Collecting tweets...waiting for " + str(time_to_wait) + " seconds..")
    time.sleep(time_to_wait)  # waiting to ensure that some data have already been collected.
    print("Tweets Collected....")

    all_hashtags_df = None
    all_tweets_df = None
    all_words_df = None

    count = 1
    count_max = 4
    while count <= count_max:
        print('Count: ' + str(count) + "/" + str(count_max))
        print("Waiting for 30 Seconds.....")
        time.sleep(40)

        words = sqlContext.sql('Select word, count from words')
        words_df = words.toPandas()
        print(words_df)
        if all_words_df is None:
            all_words_df = words_df
        else:
            all_words_df = pd.concat([all_words_df, words_df], join='inner', ignore_index=True)

        tags = sqlContext.sql('Select tag, count from hashtags')
        tags_df = tags.toPandas()
        print(tags_df)
        if all_hashtags_df is None:
            all_hashtags_df = tags_df
        else:
            all_hashtags_df = pd.concat([all_hashtags_df, tags_df], join='inner', ignore_index=True)

        tweets = sqlContext.sql('Select text, sentiment from tweets')
        tweets_df = tweets.toPandas()
        if all_tweets_df is None:
            all_tweets_df = tweets_df
        else:
            all_tweets_df = pd.concat([all_tweets_df, tweets_df], join='inner', ignore_index=True)

        count += 1

    ssc.stop()

    # Saving all dataframes as csv.
    if all_hashtags_df is not None:
        all_hashtags_df.to_csv('hashtags.csv')
    if all_words_df is not None:
        all_words_df.to_csv('words.csv')
    if all_tweets_df is not None:
        all_tweets_df.to_csv('tweets.csv')
def Predict(self):
    self._scriptWeightDict = self._dataframe_context.get_ml_model_prediction_weight()
    self._scriptStages = {
        "initialization": {
            "summary": "Initialized The Generalized Linear Regression Scripts",
            "weight": 2
        },
        "predictionStart": {
            "summary": "Generalized Linear Regression Model Prediction Started",
            "weight": 2
        },
        "predictionFinished": {
            "summary": "Generalized Linear Regression Model Prediction Finished",
            "weight": 6
        }
    }
    CommonUtils.create_update_and_save_progress_message(
        self._dataframe_context, self._scriptWeightDict, self._scriptStages,
        self._slug, "initialization", "info",
        display=True, emptyBin=False, customMsg=None, weightKey="total")

    SQLctx = SQLContext(sparkContext=self._spark.sparkContext, sparkSession=self._spark)

    dataSanity = True
    categorical_columns = self._dataframe_helper.get_string_columns()
    uid_col = self._dataframe_context.get_uid_column()
    if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
        categorical_columns = list(set(categorical_columns) - {uid_col})
    allDateCols = self._dataframe_context.get_date_columns()
    categorical_columns = list(set(categorical_columns) - set(allDateCols))
    numerical_columns = self._dataframe_helper.get_numeric_columns()
    result_column = self._dataframe_context.get_result_column()
    test_data_path = self._dataframe_context.get_input_file()

    CommonUtils.create_update_and_save_progress_message(
        self._dataframe_context, self._scriptWeightDict, self._scriptStages,
        self._slug, "predictionStart", "info",
        display=True, emptyBin=False, customMsg=None, weightKey="total")

    test_data_path = self._dataframe_context.get_input_file()
    score_data_path = self._dataframe_context.get_score_path() + "/data.csv"
    trained_model_path = "file://" + self._dataframe_context.get_model_path()
    trained_model_path += "/model"
    pipeline_path = "/".join(trained_model_path.split("/")[:-1]) + "/pipeline"
    print("trained_model_path", trained_model_path)
    print("pipeline_path", pipeline_path)
    print("score_data_path", score_data_path)

    pipelineModel = MLUtils.load_pipeline(pipeline_path)
    trained_model = MLUtils.load_generalized_linear_regresssion_pyspark_model(trained_model_path)

    df = self._data_frame
    indexed = pipelineModel.transform(df)
    transformed = trained_model.transform(indexed)
    if result_column in transformed.columns:
        transformed = transformed.withColumnRenamed(result_column, "originalLabel")
    transformed = transformed.withColumnRenamed("prediction", result_column)
    pandas_scored_df = transformed.select(
        list(set(self._data_frame.columns + [result_column]))).toPandas()
    if score_data_path.startswith("file"):
        score_data_path = score_data_path[7:]
    pandas_scored_df.to_csv(score_data_path, header=True, index=False)

    CommonUtils.create_update_and_save_progress_message(
        self._dataframe_context, self._scriptWeightDict, self._scriptStages,
        self._slug, "predictionFinished", "info",
        display=True, emptyBin=False, customMsg=None, weightKey="total")

    print("STARTING Measure ANALYSIS ...")
    columns_to_keep = []
    columns_to_drop = []
    columns_to_keep = self._dataframe_context.get_score_consider_columns()
    if len(columns_to_keep) > 0:
        columns_to_drop = list(set(df.columns) - set(columns_to_keep))
    else:
        columns_to_drop += ["predicted_probability"]
    columns_to_drop = [
        x for x in columns_to_drop if x in df.columns and x != result_column
    ]
    print("columns_to_drop", columns_to_drop)

    spark_scored_df = transformed.select(list(set(columns_to_keep + [result_column])))
    df_helper = DataFrameHelper(spark_scored_df, self._dataframe_context, self._metaParser)
    df_helper.set_params()
    df = df_helper.get_data_frame()
    # self._dataframe_context.set_dont_send_message(True)

    try:
        fs = time.time()
        descr_stats_obj = DescriptiveStatsScript(
            df, df_helper, self._dataframe_context, self._result_setter,
            self._spark, self._prediction_narrative,
            scriptWeight=self._scriptWeightDict,
            analysisName="Descriptive analysis")
        descr_stats_obj.Run()
        print("DescriptiveStats Analysis Done in ", time.time() - fs, " seconds.")
    except:
        print("Frequency Analysis Failed ")

    try:
        fs = time.time()
        df_helper.fill_na_dimension_nulls()
        df = df_helper.get_data_frame()
        dt_reg = DecisionTreeRegressionScript(
            df, df_helper, self._dataframe_context, self._result_setter,
            self._spark, self._prediction_narrative, self._metaParser,
            scriptWeight=self._scriptWeightDict,
            analysisName="Predictive modeling")
        dt_reg.Run()
        print("DecisionTrees Analysis Done in ", time.time() - fs, " seconds.")
    except:
        print("DTREE FAILED")

    try:
        fs = time.time()
        two_way_obj = TwoWayAnovaScript(
            df, df_helper, self._dataframe_context, self._result_setter,
            self._spark, self._prediction_narrative, self._metaParser,
            scriptWeight=self._scriptWeightDict,
            analysisName="Measure vs. Dimension")
        two_way_obj.Run()
        print("OneWayAnova Analysis Done in ", time.time() - fs, " seconds.")
    except:
        print("Anova Analysis Failed")
import re
import nltk
from pyspark import SparkConf, SparkContext
from pyspark import sql
from pyspark.sql import SQLContext
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import tokenize
from nltk.stem import WordNetLemmatizer
import time

start_time = time.time()
nltk.download('vader_lexicon')

# Set of all stopwords
conf = SparkConf().setAppName("Yelp")
sc = SparkContext(conf=conf)
spark = SQLContext(sc)

"""
Schema
 |-- business_id: string (nullable = true)
 |-- cool: long (nullable = true)
 |-- date: string (nullable = true)
 |-- funny: long (nullable = true)
 |-- review_id: string (nullable = true)
 |-- stars: double (nullable = true)
 |-- text: string (nullable = true)
 |-- useful: long (nullable = true)
 |-- user_id: string (nullable = true)
"""

loc = 'c'  # 'c'

# Full
if loc == 'f':
def main():
    """Executes Batch pipeline to store dataset into Cloud Spanner table."""
    parser = argparse.ArgumentParser(
        description='Perform Batch processing to send session data to Spanner')
    parser.add_argument('--input',
                        help='''Path to data set in cloud storage
                        Example: --input gs://project/path/to/GCS/file''',
                        required=True)
    parser.add_argument('--instance_id',
                        help='''Cloud Spanner instance ID
                        Example: --instance_id spanner_instance_id''',
                        required=True)
    parser.add_argument('--database_id',
                        help='''Cloud Spanner database ID
                        Example: --database-id spanner_database_id''',
                        required=True)
    args = parser.parse_args()

    logging.info('Reading Dataset')
    user_sessions_chunks_df = pd.read_csv(args.input, encoding='utf-8', chunksize=int(10**2))

    conf = SparkConf().setAppName("Batch Processing with Spark").setMaster("local")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    instance = get_instance(args.instance_id)
    logging.info('Creating user_sessions Spanner Database')
    create_database(instance, args.database_id)

    product_attributes = [
        'category', 'sub_category', 'product', 'product_details'
    ]
    schema = StructType([
        StructField("event_time", StringType(), True),
        StructField("event_type", StringType(), True),
        StructField("product_id", StringType(), True),
        StructField("category_id", StringType(), True),
        StructField("category_code", StringType(), True),
        StructField("brand", StringType(), True),
        StructField("price", StringType(), True),
        StructField("user_id", StringType(), True),
        StructField("user_session", StringType(), True)
    ])

    for user_sessions_chunk_df in user_sessions_chunks_df:
        logging.info('Transforming data from the Batch')
        # print(user_sessions_chunk_df.count())
        user_sessions_df = transform_data(sqlContext, user_sessions_chunk_df,
                                          product_attributes, schema)
        logging.info('Loading DF Data from the Batch into events_batch Spanner Table')
        user_sessions_rows = user_sessions_df.to_records(index=True, index_dtypes=int)
        user_sessions_values = list(user_sessions_rows)
        write_to_spanner(instance, args.database_id, user_sessions_values)

    spanner_success_message = ('Finished Loading DF Data from all' +
                               ' Batches into events_batch Spanner Table')
    logging.info(spanner_success_message)
def main(sc, src_s3_bucket, target_gremlin_server):
    gremlin_method_insert_pkg_version = """
    def insert_package_version(g, ecosystem, name, version) {
        def pred_pkg = g.V().has('vertex_label', 'Package').has('name', name).has('ecosystem', ecosystem);
        def pkg_vertex = (pred_pkg.hasNext()) ? pred_pkg.next() : g.addV('vertex_label', 'Package', 'name', name, 'ecosystem', ecosystem).next()
        def pred_version = g.V().has('vertex_label', 'Version').has('pecosystem', ecosystem).has('pname', name).has('version', version);
        if (!pred_version.hasNext()) {
            def version_vertex = g.addV('vertex_label', 'Version', 'pecosystem', ecosystem, 'pname', name, 'version', version).next();
            pkg_vertex.addEdge('has_version', version_vertex);
        }
    }
    """

    gremlin_method_insert_ref_stack = """
    def insert_ref_stack(g, sid, sname, secosystem, usage, source, is_ref_stack, dependencies) {
        def pred_stack = g.V().has('vertex_label', 'Stack').has('sname', sname).has('secosystem', secosystem)
        if (!pred_stack.hasNext()) {
            def stack_vertex = g.addV('vertex_label', 'Stack', 'sname', sname, 'secosystem', secosystem, 'usage', usage, 'source', source, 'is_ref_stack', is_ref_stack, 'sid', sid).next();
            for (k in dependencies.keySet()) {
                def version_vertex = g.V().has('vertex_label', 'Version').has('pecosystem', secosystem).has('pname', k).has('version', dependencies.get(k)).next();
                stack_vertex.addEdge('has_dependency', version_vertex);
            }
        }
    }
    """

    sqlContext = SQLContext(sc)

    input_data = sc.wholeTextFiles("s3n://" + src_s3_bucket + "/")
    not_null_data = input_data.filter(lambda x: x[1].strip() not in ['null', ''])
    json_formatted = not_null_data.map(lambda x: (x[0], json.loads(x[1])))
    only_npm = json_formatted.filter(lambda x: 'NPM' in extract_ecosystem(x[1]))
    package_versions = only_npm.map(lambda x: (x[0], map_package_versions(x[1])))
    non_fail_package_versions = package_versions.map(
        lambda x: (x[0], filter(lambda pv: pv[0] != 'fail' and pv[1] != 'fail', x[1])))
    non_empty_package_versions = non_fail_package_versions.filter(lambda x: len(x[1]) > 0)
    transactions = non_empty_package_versions.map(
        lambda x: map(lambda pv: "%s@@%s" % (pv[0], pv[1]), x[1]))
    unique_transactions = transactions.map(lambda x: list(set(x)))
    truncated_transactions = unique_transactions.map(lambda x: x[:MAX_WIDTH]).cache()
    count_transactions = truncated_transactions.count()

    model = FPGrowth.train(truncated_transactions, minSupport=0.5,
                           numPartitions=truncated_transactions.getNumPartitions())
    rddJsons = model.freqItemsets().map(
        lambda x: freqItemsetToRefStack(x.items, float(x.freq) / float(count_transactions)))
    # rddJsons = rddRefStacks.filter(lambda x: len(x.get('dependencies').items()) > 4 and len(x.get('dependencies').items()) <= 10)

    # Save packages and versions
    rddVersions = rddJsons.flatMap(lambda x: x.get('dependencies').items())
    dfVersions = rddVersions.toDF().distinct()
    rddGremlinVersions = dfVersions.rdd.map(lambda x: gremlin_str_pkg_version('trial', x[0], x[1]))
    str_gremlin = gremlin_method_insert_pkg_version + ' '.join(rddGremlinVersions.collect())
    fire_gremlin(target_gremlin_server, str_gremlin)

    # Save stacks
    rdd_gremlin_stacks = rddJsons.map(lambda x: gremlin_str_ref_stack(x))
    str_gremlin = gremlin_method_insert_ref_stack + ' '.join(rdd_gremlin_stacks.collect())
    fire_gremlin(target_gremlin_server, str_gremlin)
def process_log_data(spark, input_data, output_data):
    """
    Load the log_data files and create the songplays, users and time tables.

    Input:
        spark: SparkSession
        input_data: filepath for the log data
        output_data: filepath for the output parquet files
    Output:
        Parquet files for the songplays, users and time tables.
    """
    # get filepath to log data file
    log_data = input_data

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.where(col("page") == "NextSong")

    # extract columns for users table
    users_table = df['userId', 'firstName', 'lastName', 'gender', 'level']
    # drop duplicates
    users_table = users_table.drop_duplicates(subset=['userId'])

    # write users table to parquet files
    users_table = users_table.write.partitionBy('userId').parquet(
        os.path.join(output_data, 'users.parquet'), 'overwrite')
    print("users_table partitioned!")

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: tstodatetime(x))
    df = df.withColumn('daytime', get_timestamp(col("ts")))

    # extract columns to create time table
    time_table = df.select(
        col("ts").alias('start_time'),
        year('daytime').alias('year'),
        month('daytime').alias('month'),
        dayofmonth('daytime').alias('day'),
        hour('daytime').alias('hour'),
        weekofyear('daytime').alias('weekofyear'))
    # We are going to partition later in the code!

    # read in song data to use for songplays table
    sqlContext = SQLContext(spark)
    songs_table = sqlContext.read.parquet('data/outputs/song_data/songs.parquet')

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df['ts', 'userId', 'level', 'sessionId', 'location', 'userAgent', 'song']

    # add artist id and song id by joining with songs_table
    songplays_table = songplays_table.alias('s') \
        .join(songs_table.alias('e'), col('e.title') == col('s.song')) \
        .select(col('s.ts').alias('start_time'),
                col('s.userId'),
                col('s.level'),
                col('s.sessionId'),
                col('s.location'),
                col('s.userAgent'),
                col('s.song'),
                col('e.artist_id').alias('artist_id'),
                col('e.song_id').alias('song_id'))

    # add month and year for partitioning later based on those
    time_table_short = time_table['start_time', 'month', 'year']
    songplays_table = songplays_table.alias('s') \
        .join(time_table_short.alias('t'), col('t.start_time') == col('s.start_time')) \
        .select(col('s.start_time'),
                col('s.userId'),
                col('s.level'),
                col('s.sessionId'),
                col('s.location'),
                col('s.userAgent'),
                col('s.song'),
                col('s.artist_id'),
                col('s.song_id'),
                col('t.year'),
                col('t.month'))

    # write time table to parquet files partitioned by year and month
    time_table = time_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_data, 'times.parquet'), 'overwrite')
    print("time_table partitioned!")

    # write songplays table to parquet files partitioned by year and month
    songplays_table = songplays_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_data, 'songplays.parquet'), 'overwrite')
    print("songplays_table partitioned!")
def split(intersections, csv_report):
    logging.info('Splitting started')
    if csv_report:
        if cfg.reporting.use_uuid:
            uuid = uuid4()
            reporter = CSVReport(cfg.reporting.csv_dir, uuid)
        else:
            reporter = CSVReport(cfg.reporting.csv_dir, None)
    else:
        reporter = ExcelReport(cfg.reporting.file)

    logging.info('Spark initialization')
    sc = SparkContext(cfg.spark.master, 'map_test: split')
    sqlContext = SQLContext(sc)

    logging.info('Source file reading')
    df = sqlContext.read.json(cfg.splitting.source_file)
    df = df.withColumn("Date", F.from_utc_timestamp("eventTime", "UTC"))
    df = df[(df.event != '$set') & (df.event != '$unset')]
    users_with_event_count = df.groupBy(F.col("entityId").alias("user")).count()

    logging.info('Filter users with small number of events')
    min_events = 10
    users_with_few_events = (users_with_event_count
                             .filter("count < %d" % (min_events))
                             .select(F.col("user").alias("user_with_few_events")))
    ndf = df.join(users_with_few_events,
                  F.col("entityId") == F.col("user_with_few_events"),
                  how="left_outer")
    df1 = ndf.filter("user_with_few_events is NULL").drop("user_with_few_events")

    logging.info('Split data into train and test')
    train_df, test_df = split_data(df)
    train_df.coalesce(1).write.format('json').save(cfg.splitting.train_file)
    test_df.coalesce(1).write.format('json').save(cfg.splitting.test_file)
    train_df = train_df.select("entityId", "event", "targetEntityId").cache()
    test_df = test_df.select("entityId", "event", "targetEntityId").cache()

    logging.info('Calculation of different stat metrics of datasets')
    events_by_type = (df
                      .groupBy("event")
                      .count()
                      .select(F.col("event"), F.col("count").alias("count_total"))
                      .toPandas())
    events_by_type_test = (test_df
                           .groupBy("event")
                           .count()
                           .select(F.col("event"), F.col("count").alias("count_test"))
                           .toPandas()
                           .set_index("event"))
    events_by_type_train = (train_df
                            .groupBy("event")
                            .count()
                            .select(F.col("event"), F.col("count").alias("count_train"))
                            .toPandas()
                            .set_index("event"))
    unique_users_by_event = (df
                             .select(F.col("entityId"), F.col("event"))
                             .distinct()
                             .groupBy("event")
                             .count()
                             .select(F.col("event"), F.col("count").alias("unique_users_total"))
                             .toPandas()
                             .set_index("event"))
    unique_users_by_event_train = (train_df
                                   .select(F.col("entityId"), F.col("event"))
                                   .distinct()
                                   .groupBy("event")
                                   .count()
                                   .select(F.col("event"), F.col("count").alias("unique_users_train"))
                                   .toPandas()
                                   .set_index("event"))
    unique_users_by_event_test = (test_df
                                  .select(F.col("entityId"), F.col("event"))
                                  .distinct()
                                  .groupBy("event")
                                  .count()
                                  .select(F.col("event"), F.col("count").alias("unique_users_test"))
                                  .toPandas()
                                  .set_index("event"))
    unique_items_by_event = (df
                             .select(F.col("targetEntityId"), F.col("event"))
                             .distinct()
                             .groupBy("event")
                             .count()
                             .select(F.col("event"), F.col("count").alias("unique_items_total"))
                             .toPandas()
                             .set_index("event"))
    unique_items_by_event_train = (train_df
                                   .select(F.col("targetEntityId"), F.col("event"))
                                   .distinct()
                                   .groupBy("event")
                                   .count()
                                   .select(F.col("event"), F.col("count").alias("unique_items_train"))
                                   .toPandas()
                                   .set_index("event"))
    unique_items_by_event_test = (test_df
                                  .select(F.col("targetEntityId"), F.col("event"))
                                  .distinct()
                                  .groupBy("event")
                                  .count()
                                  .select(F.col("event"), F.col("count").alias("unique_items_test"))
                                  .toPandas()
                                  .set_index("event"))

    logging.info('Calculate total counts')
    events = df.count()
    events_train = train_df.count()
    events_test = test_df.count()
    unique_users = df.select("entityId").distinct().count()
    unique_users_train = train_df.select("entityId").distinct().count()
    unique_users_test = test_df.select("entityId").distinct().count()
    unique_items = df.select(F.col("targetEntityId")).distinct().count()
    unique_items_train = train_df.select(F.col("targetEntityId")).distinct().count()
    unique_items_test = test_df.select(F.col("targetEntityId")).distinct().count()

    info_df = events_by_type
    dfs = [unique_users_by_event, unique_items_by_event,
           events_by_type_train, events_by_type_test,
           unique_users_by_event_train, unique_users_by_event_test,
           unique_items_by_event_train, unique_items_by_event_test]
    for data_frame in dfs:
        info_df = info_df.join(data_frame, on="event")

    n_rows, n_cols = info_df.shape
    # totals
    info_df.loc[n_rows] = ['ANY EVENT', events, unique_users, unique_items,
                           events_train, events_test,
                           unique_users_train, unique_users_test,
                           unique_items_train, unique_items_test]
    info_df.insert(4, 'events per user', info_df.ix[:, 1] / info_df.ix[:, 2])
    info_df.insert(5, 'events per item', info_df.ix[:, 1] / info_df.ix[:, 3])
    info_df = info_df.fillna(0)

    logging.info('Create event stat worksheet')
    reporter.start_new_sheet('Events stat')
    reporter.report(
        ['event', 'event count', 'unique users', 'unique items',
         'events per user', 'events per item',
         'event count train', 'event count test',
         'unique users train', 'unique users test',
         'unique items train', 'unique items test'],
        [column.tolist() for _, column in info_df.iteritems()],
        selected_rows=[next(info_df.iteritems())[1].tolist().index(cfg.testing.primary_event)],
        cfg=cfg)
    reporter.finish_sheet()

    if intersections:
        logging.info('Start intersections calculation')
        reporter.start_new_sheet('Intersections')
        columns_for_matrix = cfg.testing.events

        logging.info('Process train / train user intersection')
        train_train_users = (
            train_df
            .select(F.col("entityId").alias("user"), F.col("event").alias("event_left"))
            .distinct()
            .join(train_df.select(F.col("entityId").alias("user"),
                                  F.col("event").alias("event_right")).distinct(),
                  on="user", how="inner")
            .groupBy(["event_left", "event_right"])
            .count()
            .collect())
        trtru = mk_intersection_matrix(train_train_users, columns_for_matrix)
        reporter.report(
            [''] + list(trtru.columns.values),
            [trtru.index.tolist()] + [column for _, column in trtru.iteritems()],
            title='Train / train user intersection')

        logging.info('Process train / test user intersection')
        train_test_users = (
            train_df
            .select(F.col("entityId").alias("user"), F.col("event").alias("event_left"))
            .distinct()
            .join(test_df.select(F.col("entityId").alias("user"),
                                 F.col("event").alias("event_right")).distinct(),
                  on="user", how="inner")
            .groupBy(["event_left", "event_right"])
            .count()
            .collect())
        trtsu = mk_intersection_matrix(train_test_users, columns_for_matrix,
                                       horizontal_suffix=" train", vertical_suffix=" test")
        reporter.report(
            [''] + list(trtsu.columns.values),
            [trtsu.index.tolist()] + [column for _, column in trtsu.iteritems()],
            title='Train / test user intersection')

        logging.info('Process train / train item intersection')
        train_train_items = (
            train_df
            .select(F.col("targetEntityId").alias("item"), F.col("event").alias("event_left"))
            .distinct()
            .join(train_df.select(F.col("targetEntityId").alias("item"),
                                  F.col("event").alias("event_right")).distinct(),
                  on="item", how="inner")
            .groupBy(["event_left", "event_right"])
            .count()
            .collect())
        trtri = mk_intersection_matrix(train_train_items, columns_for_matrix)
        reporter.report(
            [''] + list(trtri.columns.values),
            [trtri.index.tolist()] + [column for _, column in trtri.iteritems()],
            title='Train / train item intersection'
        )

        logging.info('Process train / test item intersection')
        train_test_items = (
            train_df
            .select(F.col("targetEntityId").alias("item"), F.col("event").alias("event_left"))
            .distinct()
            .join(test_df.select(F.col("targetEntityId").alias("item"),
                                 F.col("event").alias("event_right")).distinct(),
                  on="item", how="inner")
            .groupBy(["event_left", "event_right"])
            .count()
            .collect())
        trtsi = mk_intersection_matrix(train_test_items, columns_for_matrix,
                                       horizontal_suffix=" train", vertical_suffix=" test")
        reporter.report(
            [''] + list(trtsi.columns.values),
            [trtsi.index.tolist()] + [column for _, column in trtsi.iteritems()],
            title='Train / test item intersection'
        )

    reporter.report_config(cfg)
    reporter.finish_document()
    logging.info('Splitting finished successfully')
from pyspark.sql.session import SparkSession as spark
import pandas as pd
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from matplotlib import pyplot
import os

sc = SparkContext(appName="Task_usage")
sql_context = SQLContext(sc)

# folder_path = '/mnt/volume/ggcluster/clusterdata-2011-2/task_usage/'
folder_path = '/mnt/volume/ggcluster/spark-2.1.1-bin-hadoop2.7/thangbk2209/Jobid-NumberOfTask/'

dataSchema = StructType([
    StructField('JobId', LongType(), True),
    StructField('taskIndex', LongType(), True)
])

file_name = 'JobID-numberOfTask.csv'
df = (sql_context.read.format('com.databricks.spark.csv')
      .schema(dataSchema).load("%s%s" % (folder_path, file_name)))
df.createOrReplaceTempView("dataFrame")

sumCPUUsage = sql_context.sql(
    "SELECT JobId, sum(taskIndex) as numberOfTask from dataFrame group by Jobid order by numberOfTask DESC"
)
# sumCPUUsage.show(5000)

schema_df = ["Jobid", "numberOfTaskIndex"]
sumCPUUsage.toPandas().to_csv('thangbk2209/Predictive_Scaling/results/%s' % (file_name),
                              index=False, header=None)
# sumCPUUsage.write.save("results/test.csv", format="csv", columns=schema_df)
from pyspark.ml.classification import RandomForestClassificationModel
from pyspark.sql import SparkSession, SQLContext
try:
    import ConfigParser  # Python 2
except ImportError:
    import configparser as ConfigParser  # Python 3


# SparkSession singleton generator needed to operate on Dataframes within stream
def getSparkSessionInstance(sparkConf):
    if 'sparkSessionSingletonInstance' not in globals():
        globals()['sparkSessionSingletonInstance'] = SparkSession\
            .builder\
            .config(conf=sparkConf)\
            .getOrCreate()
    return globals()['sparkSessionSingletonInstance']


spark = SparkSession.builder.appName("Realtime Sensor Analytics").getOrCreate()
sc = spark.sparkContext
sqc = SQLContext(sc)

# Read in Kudu information
config = ConfigParser.ConfigParser()
config.read('config.ini')
kuduMaster = config.get('hadoop', 'kudu_masters')
kuduPort = config.get('hadoop', 'kudu_port')
kafkaTopic = config.get('hadoop', 'kafka_topic')
kafkaBroker = config.get('hadoop', 'kafka_brokers') + ':' + '9092'

# Read in Tag ID/Entity mappings from Kudu to join with sensor data
tag_mappings = sqc.read.format('org.apache.kudu.spark.kudu')\
    .option('kudu.master', kuduMaster)\
    .option('kudu.table', 'tag_mappings')\
    .load()
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import DataType, IntegerType
# mllib for clustering
from pyspark.mllib.linalg import Vectors, DenseMatrix
from pyspark.mllib.clustering import GaussianMixture
# JSON
import json
import collections
# numpy
from numpy.testing import assert_equal
import numpy as np
from shutil import rmtree
from numpy import array
from datetime import timedelta, date

if __name__ == "__main__":
    sc = SparkContext(appName="telegram_clustering")
    sqlsc = SQLContext(sc)

    # credentials are left blank here; fill in before running
    MYSQL_USERNAME = ""
    MYSQL_PWD = ""
    MYSQL_CONNECTION_URL = ("jdbc:mysql://localhost:3306/telegramdb?autoReconnect=true&useSSL=false"
                            + "&user=" + MYSQL_USERNAME + "&password=" + MYSQL_PWD)

    information = sqlsc.read.format("jdbc").options(
        url=MYSQL_CONNECTION_URL, dbtable="information",
        driver="com.mysql.jdbc.Driver").load()
    tag_df = sqlsc.read.format("jdbc").options(
        url=MYSQL_CONNECTION_URL, dbtable="tags",
        driver="com.mysql.jdbc.Driver").load()

    # columns
    tags = tag_df.filter(
        tag_df.high == 'IT').map(lambda line: line.low).collect()
from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext()
path = "eventLogging/local-1588781816130.inprogress"
sqlConf = SQLContext(sc)
df = sqlConf.read.json(path)
df.createOrReplaceTempView("Events")

# The number of events so far
print(f'The number of events So far = {df.count()}')

dist_events = df.groupBy('Event').count()
print(f'The number of distinct events So far = {dist_events.count()}')

event_list = []
event_timestamp = dict()
for event in dist_events.collect():
    event_dict = event.asDict()
    event_list.append(event_dict['Event'])
    event_timestamp[event_dict['Event']] = event_dict['Event']

open_event = []
for event in event_list:
    if event[-5:] == 'Start':
def getSqlContextInstance(sparkContext):
    if 'sqlContextSingletonInstance' not in globals():
        globals()['sqlContextSingletonInstance'] = SQLContext(sparkContext)
    return globals()['sqlContextSingletonInstance']
    with open(output, 'w') as outfile:  # write out into output file
        for lst in final:
            for x in lst:
                if len(x) == 1:
                    outfile.write("'")
                    outfile.write(x[0])
                    outfile.write("'")
                    outfile.write('\n')
                else:
                    for i in x:
                        outfile.write("'")
                        outfile.write(i)
                        outfile.write("'")
                        if i != x[-1]:
                            outfile.write(', ')
                    outfile.write('\n')
    outfile.close()

    end_time = datetime.datetime.now()
    difference1 = end_time - start_time
    difference_duration1 = round(difference1.total_seconds(), 2)
    print('Duration: ' + str(difference_duration1) + ' seconds')


if __name__ == "__main__":
    start_time = datetime.datetime.now()
    input_file = sys.argv[1]
    output = sys.argv[2]
    gf = SparkContext.getOrCreate()
    sqlContext = SQLContext(gf)
    Graph()
import os
from os.path import abspath

from pyspark.sql import SparkSession, SQLContext

warehouse_location = abspath('spark-warehouse')
os.environ["PYSPARK_SUBMIT_ARGS"] = '--jars /data/jupyter/kudu-spark2_2.11-1.8.0.jar pyspark-shell'

sc = SparkSession.builder \
    .appName("Special Day Weekly") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("spark.num.executors", '10') \
    .config("spark.executor.memory", '15G') \
    .config("spark.executor.cores", '20') \
    .enableHiveSupport() \
    .getOrCreate()
sqlc = SQLContext(sc)

sql_query = f"""
select
    trxn.item_id,
    trxn.sub_id,
    trxn.store_code,
    trxn.date_key,
    trxn.daily_sales_sum,
    cal.week_key
from {database_name}.forecast_sprint4_add_dm_to_daily trxn
join ods.dim_calendar cal
    on trxn.date_key >= '{start_date}'
    and trxn.date_key < '{end_date}'
    and trxn.date_key = cal.date_key
""".replace("\n", " ")
def getSqlContextInstance(sparkContext):
    """Lazily instantiated global instance of SQLContext.

    Below from https://spark.apache.org/docs/1.5.2/streaming-programming-guide.html#dataframe-and-sql-operations.
    """
    if 'sqlContextSingletonInstance' not in globals():
        globals()['sqlContextSingletonInstance'] = SQLContext(sparkContext)
    return globals()['sqlContextSingletonInstance']
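A minimal usage sketch for the lazily instantiated SQLContext above, following the foreachRDD pattern from the streaming guide it references; the DStream name `words` and the one-column record shape are assumptions for illustration only.

from pyspark.sql import Row

def process(time, rdd):
    # Hypothetical illustration: convert each RDD of a DStream to a DataFrame
    # using the singleton SQLContext, so one context is reused per SparkContext.
    if rdd.isEmpty():
        return
    sql_context = getSqlContextInstance(rdd.context)
    row_rdd = rdd.map(lambda word: Row(word=word))  # assumed record shape
    words_df = sql_context.createDataFrame(row_rdd)
    words_df.registerTempTable("words")
    sql_context.sql("SELECT word, COUNT(*) AS total FROM words GROUP BY word").show()

# words.foreachRDD(process)  # 'words' is an assumed DStream of strings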
from pyspark.sql import SQLContext
from pyspark.mllib.linalg import Vectors
import numpy as np
import random
from pyspark import SparkContext
from pyspark import SparkConf

conf = SparkConf()
conf.set("spark.executor.cores", '3')
conf.set("spark.executor.instances", '1')
conf.set("spark.executor.memory", "1g")
conf.set("spark.locality.wait", "0")
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")

sc = SparkContext(conf=conf)
sqlcontext = SQLContext(sc)


def shuffle_csv(csv_file):
    lines = open(csv_file).readlines()
    random.shuffle(lines)
    open(csv_file, 'w').writelines(lines)


def load_data_frame(csv_file, shuffle=True, train=True):
    if shuffle:
        shuffle_csv(csv_file)
    # This is an RDD, which will later be transformed to a data frame
    data = sc.textFile('/home/minglu/dist_spark/data/' + csv_file)
    data = data.filter(lambda x: x.split(',')[0] != 'label').map(
        lambda line: line.split(','))
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType
from pyspark.sql import SQLContext
from pyspark.sql.functions import col, max as max_

import load_data

rdd_join = load_data.rdd_join
sqlContext = SQLContext(load_data.sc)

schema = StructType([
    StructField("trade_dt", StringType(), True),
    StructField("rec_type", StringType(), True),
    StructField("symbol", StringType(), True),
    StructField("exchange", StringType(), True),
    StructField("event_tm", StringType(), True),
    StructField("event_seq_nb", StringType(), True),
    StructField("arrival_tm", StringType(), True),
    StructField("trade_pr", StringType(), True),
    StructField("bid_pr", StringType(), True),
    StructField("bid_size", StringType(), True),
    StructField("ask_pr", StringType(), True),
    StructField("ask_size", StringType(), True),
    StructField("execution_id", StringType(), True),
    StructField("trade_size", StringType(), True)
])

# Creating a dataframe for all data
common_df = sqlContext.createDataFrame(rdd_join, schema)

# Working with TRADES data:
trade = common_df.filter(common_df.rec_type == "T")

# Selecting most important columns to save space
    enumerator = args[7]
else:
    k = 10
    w = 0.5
    alpha = 6
    b_update = True
    debug = True
    loss_type = 0
    enumerator = "union"

conf = SparkConf().setAppName("salary_test").setMaster('local[2]')
num_partitions = 2
model_type = "regression"
label = 'salary'

sparkContext = SparkContext(conf=conf)
sqlContext = SQLContext(sparkContext)

fileRDD = sparkContext.textFile('salaries.csv', num_partitions)
header = fileRDD.first()
head_split = header.split(",")
head_split[0] = '_c0'
fileRDD = fileRDD.filter(lambda line: line != header)
data = fileRDD.map(lambda row: row.split(","))
dataset_df = sqlContext.createDataFrame(data, head_split)

cat_features = ["rank", "discipline", "sincephd_bin", "service_bin", "sex"]
# initializing stages of main transformation pipeline
stages = []

dataset_df = dataset_df.drop('_c0')
dataset_df = dataset_df.withColumn("id", sf.monotonically_increasing_id())
# binning numeric features by local binner udf function (specified for current dataset if needed)
dataset_df = dataset_df.withColumn('sincephd_bin',
print('Trying to get spark connection...')

warehouse_location = os.path.abspath('spark-warehouse')
spark = (SparkSession \
    .builder \
    .appName("Qiang (Charles)") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("spark.num.executors", '15') \
    .config("spark.executor.memory", '20G') \
    .config("spark.executor.cores", '25') \
    .enableHiveSupport() \
    .getOrCreate()
)
print('Spark connection created!')

sqlc = SQLContext(spark)

# /* ===================== getopt added ==========================
# example: python3.6 ./sqls/9.7grouped_to_be_shipment_groupped_0729.py -d {config['database']}
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("-d", "--database_name", help="database name")
args = parser.parse_args()
print(args.database_name)

config = {}
config['database'] = args.database_name
# config = {}
import pandas as pd
import numpy as np
import re
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import functions as func
from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import *
from pyspark.sql import Row
from pyspark.sql.types import IntegerType
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import when
import ds_config
import logging

if __name__ == "__main__":
    logging.getLogger("py4j").setLevel(logging.ERROR)
    conf = SparkConf().setAppName("preprocess_04")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    df = sqlContext.read.format("com.databricks.spark.csv") \
        .option("header", "true").option("inferSchema", "true") \
        .option("delimiter", '|').load(ds_config.preprocess_02_output_01)
    gsdf = sqlContext.read.format("com.databricks.spark.csv") \
        .option("header", "true").option("inferSchema", "true") \
        .option("delimiter", ds_config.gs_customer_prof_after_delim) \
        .load(ds_config.gs_customer_prof_after)
    gsdf.registerTempTable("GS_SUMMARY")

    selected = sqlContext.sql(
        "SELECT analytic_id, foreigner_flag, mobile_region, service_month, urbanflag, billing_region from GS_SUMMARY")
    present_df = df.join(selected, ["analytic_id"], "left_outer")
    present_df.registerTempTable("present_df")

    service_month = sqlContext.sql("SELECT analytic_id, service_month from present_df")
    means = service_month.agg(
        *[func.mean(c).alias(c) for c in service_month.columns
          if c != 'analytic_id']).toPandas().to_dict('records')[0]
    means['foreigner_flag'] = 'N'
    means['mobile_region'] = 'NA'
    means['urbanflag'] = 'Y'
    means['billing_region'] = 'NA'
    means['service_month'] = 12

    present_df_eliminateGSna = present_df.fillna(means)
    present_df_eliminateGSna.registerTempTable("present_df_eliminateGSna")
    df = spark.createDataFrame(mapped, schema=schema).distinct()

    # Extract child and parent domains so we can easily use them in filtering after loading the parquet file
    df = df.select(
        '*',
        url_to_domain('childTLD').alias('childDomain'),
        url_to_domain('parentTLD').alias('parentDomain'))

    df.write.format("parquet").saveAsTable(output_file,
                                           path="../data/ETLout/{}".format(output_file))


if __name__ == '__main__':
    conf = SparkConf().setAll((
        ("spark.task.maxFailures", "10"),
        ("spark.locality.wait", "20s"),
        ("spark.serializer", "org.apache.spark.serializer.KryoSerializer"),
    ))
    rec = re.compile(r"(https?://)?(www\.)?")  # Regex to clean parent/child links

    sc = SparkContext(appName='etl-common-crawl', conf=conf)
    spark = SQLContext(sparkContext=sc)

    parser = argparse.ArgumentParser(description='Perform ETL on CommonCrawl')
    parser.add_argument('input', type=str, help='Input path')
    parser.add_argument('output', type=str, help='Output path')
    parser.add_argument('file_type', type=str, help='file or s3')
    parser.add_argument('crawl_path', type=str, help='file path or bucket name in case of s3')
    args = parser.parse_args()

    main(args.input, args.output, args.file_type, args.crawl_path)