def main():
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    spark.conf.set("spark.sql.session.timeZone", "GMT+07:00")

    ho_chi_minh_timezone = pytz.timezone('Asia/Ho_Chi_Minh')
    today = datetime.now(ho_chi_minh_timezone)
    today = today.replace(hour=0, minute=0, second=0, microsecond=0)
    first_day_of_month = today.replace(day=1)  # assumption: day=1; the original replace() call had no arguments
    print('today: ', today)
    yesterday = today - timedelta(1)
    print('yesterday: ', yesterday)
    today_id = long(today.strftime("%Y%m%d"))
    yesterday_id = long(yesterday.strftime("%Y%m%d"))
    today_id_0h00 = long(today.strftime("%s"))
    print('today_id: ', today_id)
    print('yesterday_id: ', yesterday_id)
    print('today_id_0h00: ', today_id_0h00)

    date_end = 1573232400L

    General = 'General'
    Vocabulary = 'Vocabulary'
    Grammar = 'Grammar'
    Speaking = 'Speaking'
    Listening = 'Listening'
    Phrasal_Verb = 'Phrasal'
    Pronunciation = 'Pronunciation'

    # Phrasal Verb
    # Speaking        2
    # General         3
    # Phrasal Verb    4
    # Grammar         5
    # Vocabulary      6
    # Pronunciation   7
    # Listening

    is_dev = True
    is_just_monthly_exam = False
    is_limit_test = False

    start_load_date = 0L

    BEHAVIOR_ID_TEST_TUAN = 22L
    BEHAVIOR_ID_TEST_THANG = 23L

    PERIOD_DAYLY = 1L
    PERIOD_WEEKLY = 2L
    PERIOD_MONTHLY = 3L

    def doCheckClassID(code):
        if code is None:
            return None
        code = str(code)
        if code == General:
            return 61L
        if code == Vocabulary:
            return 62L
        if code == Grammar:
            return 63L
        if code == Speaking:
            return 64L
        if code == Listening:
            return 65L
        if code == Pronunciation:
            return 66L
        if Phrasal_Verb in code:
            return 67L
        return None

    check_class_id = udf(doCheckClassID, LongType())

    # ------------------------------------------------------------------------------------------------------------------#
    my_partition_predicate = "(behavior_id=='22' or behavior_id=='23')"

    dyf_student_behavior = glueContext.create_dynamic_frame.from_catalog(
        database="od_student_behavior",
        table_name="student_behavior",
        push_down_predicate=my_partition_predicate)

    dyf_student_behaviors = dyf_student_behavior.resolveChoice(
        specs=[('behavior_id', 'cast:long'), ('transformed_at', 'cast:long')])

    # try:
    #     # read the flag (checkpoint) from S3
    #     df_flag = spark.read.parquet("s3://dts-odin/flag/flag_student_testing_history.parquet")
    #     max_key = df_flag.collect()[0]['flag']
    #     print('read from index: ', max_key)
    #
    #     # compare the datasource _key with the flag and keep only rows with key > flag
    #     dyf_student_behaviors = Filter.apply(frame=dyf_student_behaviors,
    #                                          f=lambda x: x['transformed_at'] > max_key)
    # except:
    #     print('read flag error ')

    if dyf_student_behaviors.count() > 0:
        dyf_student_behaviors = Filter.apply(
            frame=dyf_student_behaviors,
            f=lambda x: x["student_behavior_id"] is not None
            and x["student_id"] is not None
            # and x["behavior_id"] in [BEHAVIOR_ID_TEST_TUAN,
            #                          BEHAVIOR_ID_TEST_THANG
            #                          ]
            and start_load_date <= x["student_behavior_date"] < today_id_0h00)

        number_dyf_student_behavior = dyf_student_behaviors.count()
        print('number_dyf_student_behavior after filtering: ', number_dyf_student_behavior)
        if number_dyf_student_behavior == 0:
            return

        dyf_student_behavior = dyf_student_behaviors \
            .select_fields(['student_behavior_id', 'student_behavior_date', 'student_id', 'behavior_id'])

        df_student_behavior = dyf_student_behavior.toDF()
        df_student_behavior = df_student_behavior.drop_duplicates(['student_behavior_id'])

        if is_limit_test:
            df_student_behavior = df_student_behavior.limit(1000)

        df_student_behavior = df_student_behavior.repartition('behavior_id')
        df_student_behavior.cache()
        student_behavior_number = df_student_behavior.count()

        if is_dev:
            print('dy_student_behavior')
            print('student_behavior_number: ', student_behavior_number)
            df_student_behavior.printSchema()
            df_student_behavior.show(3)

        if student_behavior_number == 0:
            return

        # ------------------------------------------------------------------------------------------------------------------#
        dyf_student_test_mark = glueContext.create_dynamic_frame.from_catalog(
            database="od_student_behavior",
            table_name="student_test_mark",
            push_down_predicate=my_partition_predicate)

        dyf_student_test_mark = dyf_student_test_mark.select_fields(
            ['student_behavior_id', 'question_category', 'grade'])

        # dyf_student_test_mark = Filter.apply(frame=dyf_student_test_mark,
        #                                      f=lambda x: x["behavior_id"] in [BEHAVIOR_ID_TEST_TUAN,
        #                                                                       BEHAVIOR_ID_TEST_THANG
        #                                                                       ]
        #                                      )

        df_student_test_mark = dyf_student_test_mark.toDF()
        number_student_test_mark = df_student_test_mark.count()

        if is_dev:
            print('df_student_test_mark')
            print('df_student_test_mark: ', number_student_test_mark)
            df_student_test_mark.printSchema()
            df_student_test_mark.show(3)

        if number_student_test_mark == 0:
            return

        df_student_behavior_mark = df_student_behavior \
            .join(df_student_test_mark, on='student_behavior_id', how='left')

        if is_dev:
            print('df_student_behavior_mark')
            print('df_student_behavior_mark: ', df_student_behavior_mark)
            df_student_behavior_mark.printSchema()
            df_student_behavior_mark.show(3)

        df_student_behavior_mark = df_student_behavior_mark.dropDuplicates(
            ['student_behavior_id', 'student_id', 'behavior_id', 'question_category'])

        df_student_behavior_mark_week = df_student_behavior_mark \
            .filter(df_student_behavior_mark.behavior_id == BEHAVIOR_ID_TEST_TUAN)

        df_student_behavior_mark_month = df_student_behavior_mark.filter(
            df_student_behavior_mark.behavior_id == BEHAVIOR_ID_TEST_THANG)

        df_student_behavior_mark_week = df_student_behavior_mark_week \
            .withColumn('agg_week_id',
                        from_unixtime(df_student_behavior_mark_week.student_behavior_date, "yyyyww"))

        df_student_behavior_mark_month = df_student_behavior_mark_month \
            .withColumn('agg_month_id',
                        from_unixtime(df_student_behavior_mark_month.student_behavior_date, "yyyyMM"))

        if is_dev:
            print('df_student_behavior_mark_week')
            df_student_behavior_mark_week.printSchema()
            df_student_behavior_mark_week.show(3)
            print('df_student_behavior_mark_month')
            df_student_behavior_mark_month.printSchema()
            df_student_behavior_mark_month.show(3)

        df_student_behavior_mark_week = df_student_behavior_mark_week \
            .withColumn("class_id", check_class_id(df_student_behavior_mark_week.question_category))

        df_student_behavior_mark_week_agg = df_student_behavior_mark_week.groupby(
            'student_id', 'agg_week_id', 'class_id').agg(
                f.round(f.max(df_student_behavior_mark_week.grade)).cast('long').alias('grade_total'),
                f.lit(PERIOD_WEEKLY).alias('period_type_id'),
                f.lit(None).cast('string').alias('agg_date_id'),
                f.lit(None).cast('string').alias('agg_month_id'))

        df_student_behavior_mark_month = df_student_behavior_mark_month.na.fill({'grade': 0})

        df_student_behavior_mark_month = df_student_behavior_mark_month.groupby(
            'student_behavior_id').agg(
                f.first('student_id').alias('student_id'),
                f.first('agg_month_id').alias('agg_month_id'),
                f.round(f.sum('grade')).cast('long').alias('grade_total_attempt'))

        df_student_behavior_mark_month_agg = df_student_behavior_mark_month.groupby(
            'student_id', 'agg_month_id').agg(
                f.max(df_student_behavior_mark_month.grade_total_attempt).alias('grade_total'),
                f.lit(PERIOD_MONTHLY).alias('period_type_id'),
                f.lit(None).cast('string').alias('agg_date_id'),
                f.lit(None).cast('string').alias('agg_week_id'),
                f.lit(68L).cast('long').alias('class_id'))

        df_student_behavior_mark_month_agg = df_student_behavior_mark_month_agg.select(
            'student_id', 'agg_week_id', 'class_id', 'grade_total',
            'period_type_id', 'agg_date_id', 'agg_month_id')

        if is_dev:
            print('df_student_behavior_mark_week_agg')
            df_student_behavior_mark_week_agg.printSchema()
            df_student_behavior_mark_week_agg.show(3)
            print('df_student_behavior_mark_month_agg')
            df_student_behavior_mark_month_agg.printSchema()
            df_student_behavior_mark_month_agg.show(3)

        df_student_behavior_mark_agg = df_student_behavior_mark_week_agg.union(
            df_student_behavior_mark_month_agg)

        if is_dev:
            print('df_student_behavior_mark_agg')
            df_student_behavior_mark_agg.printSchema()
            df_student_behavior_mark_agg.show(3)

        dyf_student_behavior_mark_agg = DynamicFrame.fromDF(
            df_student_behavior_mark_agg, glueContext, 'dyf_student_behavior_mark_agg')

        dyf_student_behavior_mark_agg = Filter.apply(
            frame=dyf_student_behavior_mark_agg,
            f=lambda x: x["class_id"] is not None)

        dyf_student_behavior_mark_agg.show(3)

        apply_output_month = ApplyMapping.apply(
            frame=dyf_student_behavior_mark_agg,
            mappings=[("student_id", "long", "student_id", "long"),
                      ("class_id", "long", "class_id", "long"),
                      ("period_type_id", "long", "period_type_id", "long"),
                      ("agg_date_id", "string", "created_date_id", "long"),
                      ("agg_week_id", "string", "created_week_id", "long"),
                      ("agg_month_id", "string", "created_month_id", "long"),
                      ("grade_total", "long", "measure1", "long")])

        dfy_output_month = ResolveChoice.apply(
            frame=apply_output_month,
            choice="make_cols",
            transformation_ctx="resolvechoice2")

        datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
            frame=dfy_output_month,
            catalog_connection="nvn_knowledge",
            connection_options={
                "dbtable": "student_learning_history",
                "database": "nvn_knowledge_v2"
            },
            redshift_tmp_dir="s3n://dtsodin/temp/nvn_knowledge_v2/student_learning_history",
            transformation_ctx="datasink4")

        df_temp = dyf_student_behaviors.toDF()
        flag = df_temp.agg({"transformed_at": "max"}).collect()[0][0]
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')

        # overwrite the _key flag on S3
        df.write.parquet(
            "s3a://dts-odin/flag/flag_student_testing_history.parquet",
            mode="overwrite")
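# --- Illustrative sketch (not part of the original job) --------------------------------------#
# The job above ends by writing the max `transformed_at` value to a parquet "flag" file, and the
# commented-out block near its top would read that flag back to skip rows that were already
# processed. A minimal helper for the read-back step could look like this; the path and the
# default value of 0 are assumptions taken from the commented-out code.
def read_checkpoint(spark, flag_path, default=0):
    """Return the last processed key from the flag file, or `default` if it cannot be read."""
    try:
        return spark.read.parquet(flag_path).collect()[0]['flag']
    except Exception:
        return default

# Usage mirroring the commented-out block in the job above:
# max_key = read_checkpoint(spark, "s3://dts-odin/flag/flag_student_testing_history.parquet")
# dyf_student_behaviors = Filter.apply(frame=dyf_student_behaviors,
#                                      f=lambda x: x['transformed_at'] > max_key)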
    )

    if ctx.job_name == 'chief':
        print("Exporting saved_model to {}".format(args.export_dir))
        classifier.export_saved_model(args.export_dir, serving_input_receiver_fn)


if __name__ == "__main__":
    from pyspark.context import SparkContext
    from pyspark.conf import SparkConf
    from tensorflowonspark import TFCluster
    import argparse

    sc = SparkContext(conf=SparkConf().setAppName("mnist_estimator"))
    executors = sc._conf.get("spark.executor.instances")
    num_executors = int(executors) if executors is not None else 1

    parser = argparse.ArgumentParser()
    parser.add_argument("--batch_size", help="number of records per batch", type=int, default=64)
    parser.add_argument("--buffer_size", help="size of shuffle buffer", type=int, default=10000)
    parser.add_argument("--cluster_size", help="number of nodes in the cluster", type=int,
            cnt += 1
            # output_file.write("lbl: {} pred: {}\n".format(truelbl, np.argmax(p)))
            if truelbl != np.argmax(p):
                output_file.write("lbl: {} pred: {}\n".format(truelbl, np.argmax(p)))
        except tf.errors.OutOfRangeError:
            break

    output_file.close()


if __name__ == '__main__':
    from pyspark.context import SparkContext
    from pyspark.conf import SparkConf

    sc = SparkContext(conf=SparkConf().setAppName("mnist_inference"))
    executors = sc._conf.get("spark.executor.instances")
    num_executors = int(executors) if executors is not None else 1

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--cluster_size",
        help="number of nodes in the cluster (for Spark Standalone)",
        type=int,
        default=num_executors)
    parser.add_argument('--images_labels', type=str, help='Directory for input images with labels')
    parser.add_argument("--export", help="HDFS path to export model",


        """
        Gets the value of stepSize or its default value.
        """
        return self.getOrDefault(self.stepSize)


class GBTRegressionModel(TreeEnsembleModels):
    """
    Model fitted by GBTRegressor.

    .. versionadded:: 1.4.0
    """


if __name__ == "__main__":
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import SQLContext
    globs = globals().copy()
    # The small batch size here ensures that we see multiple batches,
    # even in these small test examples:
    sc = SparkContext("local[2]", "ml.regression tests")
    sqlContext = SQLContext(sc)
    globs['sc'] = sc
    globs['sqlContext'] = sqlContext
    (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
    sc.stop()
    if failure_count:
        exit(-1)
        required=True,
        help='hdfs path to output data')

    existing_model_group = parser.add_mutually_exclusive_group(required=True)
    existing_model_group.add_argument(
        '--model_pkl',
        dest='model_pkl',
        type=str,
        default=None,
        help='a pickled LOPQModel to evaluate on the data')
    existing_model_group.add_argument(
        '--model_proto',
        dest='model_proto',
        type=str,
        default=None,
        help='a protobuf LOPQModel to evaluate on the data')

    args = parser.parse_args()

    sc = SparkContext(appName='LOPQ code computation')

    # Load UDF module if provided
    if args.data_udf:
        udf_module = __import__(args.data_udf, fromlist=['udf'])
        load_udf = udf_module.udf
        main(sc, args, data_load_fn=load_udf)
    else:
        main(sc, args)

    sc.stop()
        >>> algo.getInitSteps()
        10
        """
        self._paramMap[self.initSteps] = value
        return self

    @since("1.5.0")
    def getInitSteps(self):
        """
        Gets the value of `initSteps`
        """
        return self.getOrDefault(self.initSteps)


if __name__ == "__main__":
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import SQLContext
    globs = globals().copy()
    # The small batch size here ensures that we see multiple batches,
    # even in these small test examples:
    sc = SparkContext("local[2]", "ml.clustering tests")
    sqlContext = SQLContext(sc)
    globs['sc'] = sc
    globs['sqlContext'] = sqlContext
    (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
    sc.stop()
    if failure_count:
        exit(-1)
                                                   y_: batch_ys})))

            if sv.is_chief:
                summary_writer.add_summary(summary, step)

        if sv.should_stop() or step >= args.steps:
            tf_feed.terminate()

        # Ask for all the services to stop.
        print("{0} stopping supervisor".format(datetime.now().isoformat()))
        sv.stop()


if __name__ == '__main__':
    sc = SparkContext(conf=SparkConf().setAppName("read hdfs save to hdfs "))
    hive_context = HiveContext(sc)
    executors = sc._conf.get("spark.executor.instances")
    num_executors = int(executors) if executors is not None else 1
    num_ps = 1

    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", help="input hdfs path")
    parser.add_argument(
        "-m", "--model",
        help="HDFS path to save/load model during train/inference",
        default="mnist_model")
    parser.add_argument("-tb", "--tensorboard", help="launch tensorboard process",
                           steps_per_epoch=steps_per_epoch,
                           callbacks=callbacks)

    from tensorflow_estimator.python.estimator.export import export_lib
    export_dir = export_lib.get_timestamped_export_dir(args.export_dir)
    compat.export_saved_model(multi_worker_model, export_dir, ctx.job_name == 'chief')


if __name__ == '__main__':
    import argparse
    from pyspark.context import SparkContext
    from pyspark.conf import SparkConf
    from tensorflowonspark import TFCluster

    sc = SparkContext(conf=SparkConf().setAppName("mnist_keras"))
    executors = sc._conf.get("spark.executor.instances")
    num_executors = int(executors) if executors is not None else 1

    parser = argparse.ArgumentParser()
    parser.add_argument("--batch_size", help="number of records per batch", type=int, default=64)
    parser.add_argument("--buffer_size", help="size of shuffle buffer", type=int, default=10000)
    parser.add_argument("--cluster_size", help="number of nodes in the cluster", type=int,
    def copy(self, extra=None):
        """
        Creates a copy of this instance with a randomly generated uid
        and some extra params. This copies the underlying bestModel,
        creates a deep copy of the embedded paramMap, and
        copies the embedded and extra parameters over.

        :param extra: Extra parameters to copy to the new instance
        :return: Copy of this instance
        """
        if extra is None:
            extra = dict()
        return CrossValidatorModel(self.bestModel.copy(extra))


if __name__ == "__main__":
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import SQLContext
    globs = globals().copy()
    # The small batch size here ensures that we see multiple batches,
    # even in these small test examples:
    sc = SparkContext("local[2]", "ml.tuning tests")
    sqlContext = SQLContext(sc)
    globs['sc'] = sc
    globs['sqlContext'] = sqlContext
    (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
    sc.stop()
    if failure_count:
        exit(-1)
import redis_logger_handler


def parseFile(images_path, labels_path, fmt):
    if fmt == "csv":
        images = sc.textFile(images_path).map(
            lambda ln: [int(x) for x in ln.split(',')])
        labels = sc.textFile(labels_path).map(
            lambda ln: [int(x) for x in ln.split(',')])
    else:
        images = sc.pickleFile(images_path)
        labels = sc.pickleFile(labels_path)
    return images, labels


sc = SparkContext(conf=SparkConf().setAppName("lstm_ctc_ocr_spark"))
executors = sc._conf.get("spark.executor.instances")
num_executors = int(executors) if executors is not None else 1
num_ps = 1

parser = argparse.ArgumentParser()
parser.add_argument("-b", "--batch_size", help="number of records per batch", type=int, default=64)
parser.add_argument("-e", "--epochs", help="number of epochs", type=int, default=1)
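# Illustrative usage of parseFile() above (paths are placeholders, not from the original script):
# both branches return a pair of RDDs whose rows stay aligned by position, so they can be zipped
# into (image, label) records before feeding the cluster.
# images, labels = parseFile("hdfs:///data/ocr/images.csv", "hdfs:///data/ocr/labels.csv", "csv")
# dataRDD = images.zip(labels)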
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from pyspark.context import SparkContext
from pyspark.conf import SparkConf

import argparse
from datetime import datetime

from tensorflowonspark import TFCluster
import criteo_dist

if __name__ == "__main__":
    sc = SparkContext(conf=SparkConf().setAppName("criteo_spark"))
    executors = sc._conf.get("spark.executor.instances")
    if executors is None:
        raise Exception(
            "Could not retrieve the number of executors from the SparkContext")
    num_executors = int(executors)
    num_ps = 1

    parser = argparse.ArgumentParser()
    parser.add_argument("-b", "--batch_size", help="number of records per batch", type=int, default=100)
    parser.add_argument("-e", "--epochs",
def main():
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    spark.conf.set("spark.sql.session.timeZone", "GMT+07:00")

    # get the dynamic frame source
    # ------------------------------------------------------------------------------------------------------------------#
    dyf_native_talk = glueContext.create_dynamic_frame.from_catalog(
        database='native_talk',
        table_name='native_talk_history_log_api')

    dyf_native_talk = dyf_native_talk.resolveChoice(specs=[('id', 'cast:long')])

    try:
        df_flag = spark.read.parquet("s3a://dts-odin/flag/student_status/tu_hoc/tu_hoc_native_talk.parquet")
        read_from_index = df_flag.collect()[0]['flag']
        print('read from index: ', read_from_index)
        dyf_native_talk = Filter.apply(frame=dyf_native_talk,
                                       f=lambda x: x["id"] > read_from_index)
    except:
        print('read flag file error ')

    dyf_native_talk = dyf_native_talk.select_fields(
        ['id', 'learning_date', 'speaking_dialog_score', 'username', 'updated_time'])

    dy_cache = dyf_native_talk.toDF()
    dy_cache = dy_cache.cache()
    dyf_native_talk = DynamicFrame.fromDF(dy_cache, glueContext, 'dyf_native_talk')

    print('dy_cache------------')
    dy_cache.printSchema()
    print('dy_cache: ', dy_cache.count())
    dy_cache.show(2)

    # ------------------------------------------------------------------------------------------------------------------#
    if dyf_native_talk.count() > 0:
        # ---------------------------------------------------datasource0-----------------------------------------------#
        dyf_native_talk = Filter.apply(
            frame=dyf_native_talk,
            f=lambda x: x["username"] is not None and x["username"] != ''
            and x["speaking_dialog_score"] is not None
            and x["learning_date"] is not None and x["learning_date"] != '')

        # ---------------------------------------------------datasource1-----------------------------------------------#
        if dyf_native_talk.count() > 0:
            dyf_nt_account_mapping = glueContext.create_dynamic_frame.from_catalog(
                database='native_talk',
                table_name='native_talk_account_mapping')
            dyf_nt_account_mapping = dyf_nt_account_mapping.select_fields(
                ['contact_id', 'username']).rename_field('username', 'nativetalk_user')

            dy_cache_2 = dyf_nt_account_mapping.toDF()
            dy_cache_2 = dy_cache_2.cache()
            dyf_nt_account_mapping = DynamicFrame.fromDF(dy_cache_2, glueContext, 'dyf_nt_account_mapping')

            dyf_nt_account_mapping = Filter.apply(
                frame=dyf_nt_account_mapping,
                f=lambda x: x["nativetalk_user"] is not None and x["nativetalk_user"] != '')
            # -----------------------------------------------datasource1-----------------------------------------------#

            # ----------------------------------------------------------------------------------------------------------#
            join = Join.apply(dyf_native_talk, dyf_nt_account_mapping, 'username', 'nativetalk_user')
            if join.count() > 0:
                df_nativetalk = join.toDF()
                df_nativetalk = df_nativetalk.withColumn('sogio', f.lit(0.083333))  # 5 minutes
                df_nativetalk = df_nativetalk.withColumn(
                    'id_time',
                    from_unixtime(unix_timestamp(df_nativetalk.learning_date, "yyyy-MM-dd"), "yyyyMMdd"))
                df_nativetalk = df_nativetalk.where("contact_id IS NOT NULL")

                data_nativetalk = DynamicFrame.fromDF(df_nativetalk, glueContext, 'data_nativetalk')
                data_nativetalk = data_nativetalk.resolveChoice(specs=[('sogio', 'cast:float')])
                # ------------------------------------------------------------------------------------------------------#
                print('data_nativetalk----------')
                data_nativetalk.printSchema()

                # build the "fact_hieusuathoctap" (study performance) table
                df_hieusuathoctap = data_nativetalk.toDF()
                # compute the number of study sessions and study hours per student for each day (id_time)
                df_hieusuathoctap = df_hieusuathoctap.groupby('contact_id', 'id_time').agg(
                    f.sum('sogio'), f.count('contact_id'))
                df_hieusuathoctap = df_hieusuathoctap.withColumn('tu_hoc_type_id', f.lit(400))

                data_hieusuathoctap = DynamicFrame.fromDF(df_hieusuathoctap, glueContext, 'data_hieusuathoctap')
                data_hieusuathoctap = data_hieusuathoctap.resolveChoice(specs=[('sum(sogio)', 'cast:double')])
                print('data_hieusuathoctap::data_hieusuathoctap::data_hieusuathoctap------------------------------------------')
                data_hieusuathoctap.printSchema()

                applymapping2 = ApplyMapping.apply(
                    frame=data_hieusuathoctap,
                    mappings=[("contact_id", "string", "contact_id", "string"),
                              ("id_time", 'string', 'id_time', 'bigint'),
                              ("count(contact_id)", 'long', 'soca', 'int'),
                              ("sum(sogio)", 'double', 'sogio', 'double'),
                              ("tu_hoc_type_id", 'int', "tu_hoc_type_id", "int")])
                resolvechoice2 = ResolveChoice.apply(frame=applymapping2, choice="make_cols",
                                                     transformation_ctx="resolvechoice2")
                dropnullfields2 = DropNullFields.apply(frame=resolvechoice2, transformation_ctx="dropnullfields2")
                print('dropnullfields2 number: ', dropnullfields2.count())

                datasink2 = glueContext.write_dynamic_frame.from_jdbc_conf(
                    frame=dropnullfields2,
                    catalog_connection="glue_redshift",
                    connection_options={
                        "dbtable": "temp_staging_lich_su_tu_hoc_native_talk",
                        "database": "dts_odin",
                        "postactions": """INSERT into mapping_changed_status_student(user_id, change_status_date_id, to_status_id, measure1, measure2)
                                          SELECT um.user_id, hwb.id_time, 53, hwb.soca, round(hwb.sogio, 4)
                                          FROM temp_staging_lich_su_tu_hoc_native_talk hwb
                                          LEFT JOIN user_map um
                                               ON um.source_type = 1
                                               AND um.source_id = hwb.contact_id;
                                          DROP TABLE IF EXISTS public.temp_staging_lich_su_tu_hoc_native_talk
                                       """
                    },
                    redshift_tmp_dir="s3n://dts-odin/temp/tu-hoc/hwb/",
                    transformation_ctx="datasink2")

                df_datasource = dyf_native_talk.toDF()
                flag = df_datasource.agg({"id": "max"}).collect()[0][0]
                print('flag: ', flag)
                flag_data = [flag]
                df = spark.createDataFrame(flag_data, "long").toDF('flag')
                df.write.parquet("s3a://dts-odin/flag/student_status/tu_hoc/tu_hoc_native_talk.parquet", mode="overwrite")

                dy_cache.unpersist()
                dy_cache_2.unpersist()
SOURCE_SYSTEM = ARGS['source_system']  # must
JSON_FILE_NAME = TARGET_TABLE + "_" + SOURCE_SYSTEM  # ar_invc_hdr_f_must
STAGE_TABLE = ARGS['rs_stage_table']  # ar_invc_hdr_f_stage_must
CTLG_CONNECTION = ARGS['glue_conn']  # TestRedshift3
REDSHIFTDB = ARGS['rs_db']  # usinnovationredshift
S3_BUCKET = ARGS['bkt_name']  # "odp-us-innovation-raw"
MD5_COLUMN_SCD1 = TARGET_TABLE + "_md5_scd1"  # ar_invc_hdr_f_md5_scd1
TARGET_TABLE_COLUMNS = ARGS['target_cols']  # As per DDL (col1,col2,col3)
STAGE_TABLE_COLUMNS = ARGS['stage_cols']  # As per DDL (col1,col2,col3)
DBTABLE_STG = STAGE_DATABASE_NAME + "." + STAGE_TABLE
URL = ARGS["jdbc_url"]
IAM_ROLE = ARGS["iam_role"]

SC = SparkContext()
GLUECONTEXT = GlueContext(SC)
SPARK = GLUECONTEXT.spark_session
JOB = Job(GLUECONTEXT)
JOB.init(ARGS['JOB_NAME'], ARGS)
RUN_ID = ARGS['JOB_RUN_ID']
JOB_NAME = ARGS['JOB_NAME']
TEMPDIR = ARGS['TempDir']

SRC_NOTEMPTY = True
try:
    # @type: DataSource
    # @args: [database = "db_mrr_must",
    #         table_name = "billing",
    #         transformation_ctx = "billing_df"]
    # @return: DynamicFrame
                tags=[tag_constants.SERVING],
                signature_def_map={'predict': signature},
                clear_devices=True)
            builder.save()

        if args.input_mode == 'spark':
            tf_feed.terminate()


if __name__ == '__main__':
    import argparse
    from pyspark.context import SparkContext
    from pyspark.conf import SparkConf
    from tensorflowonspark import TFCluster

    sc = SparkContext(conf=SparkConf().setAppName("mnist_mlp"))
    executors = sc._conf.get("spark.executor.instances")
    num_executors = int(executors) if executors is not None else 1
    num_ps = 1

    parser = argparse.ArgumentParser()
    parser.add_argument("--cluster_size", help="number of nodes in the cluster", type=int, default=num_executors)
    parser.add_argument("--epochs", help="number of epochs of training data", type=int, default=20)
    parser.add_argument("--export_dir", help="directory to export saved_model")
    parser.add_argument(
def dohash(value):
    return abs(hash(value))


if __name__ == "__main__":
    # ensure number of inputs is 4: py file, input files, output files
    if len(sys.argv) != 4:
        print("This script requires 3 input arguments to run: 1 inputFile and 1 outputFile")
        # break it
        sys.exit(1)

    # create an interface between pyspark and spark server
    sc = SparkContext('local[*]')

    # to simplify output
    # sc.setLogLevel("ERROR")
    """ DEFAULT APPROACH """
    # start timer
    startTimer1 = time.time()

    # get input file and import into the SparkContext object
    task2Input1 = sc.textFile(sys.argv[1])
    answer1, builder1 = task2Processor(task2Input1)
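# Illustrative use of dohash() above (not from the original script): because it returns a
# non-negative integer, it can bucket arbitrary keys into a fixed number of groups, e.g.
# num_buckets = 16
# bucketed = task2Input1.map(lambda line: (dohash(line) % num_buckets, line))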
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from pyspark.context import SparkContext
from pyspark.conf import SparkConf

import argparse
import numpy
import tensorflow as tf
from datetime import datetime

from tensorflowonspark import TFCluster
import wiki_dist

sc = SparkContext(conf=SparkConf().setAppName("wiki_spark"))
executors = sc._conf.get("spark.executor.instances")
num_executors = int(executors) if executors is not None else 1
num_ps = 1

parser = argparse.ArgumentParser()
parser.add_argument("--batch_size", help="number of records per batch", type=int, default=100)
parser.add_argument("--epochs", help="number of epochs", type=int, default=1)
parser.add_argument("--format", help="example format: (csv|pickle|tfr)",
                    choices=["csv", "pickle", "tfr"], default="csv")
parser.add_argument("--images",
'''
Created on 2015/12/08

@author: charles
'''
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
from pyspark.context import SparkContext

sc = SparkContext("local")

# Load and parse the data
data = sc.textFile("data/mllib/als/test.data")
ratings = data.map(lambda l: l.split(',')).map(
    lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))

# Build the recommendation model using Alternating Least Squares
rank = 10
numIterations = 10
model = ALS.train(ratings, rank, numIterations)

# Evaluate the model on training data
testdata = ratings.map(lambda p: (p[0], p[1]))
predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
print("Mean Squared Error = " + str(MSE))

# Save and load model
model.save(sc, "myModelPath")
sameModel = MatrixFactorizationModel.load(sc, "myModelPath")
import calendar
import datetime
import pdb

from pyspark.context import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import StringType
from pyspark.sql.functions import lit, col, udf


def findDay(date):
    ref = [
        'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday',
        'Sunday'
    ]
    born = datetime.datetime.strptime(date, '%d %m %Y').weekday()
    return ref[int(born)]


sc = SparkContext('local')
# spark = SparkSession(sc)
sql = SQLContext(sc)
sample_udf = udf(lambda x: findDay(x), StringType())

# df = sql.read.csv("KCcrime2010To2018.csv", inferSchema=True, header=True)
df = sql.read.csv("joined_date - Copy.csv", inferSchema=True, header=True)
df.createTempView(name='kc_crime')
pdb.set_trace()

# getting the number of crimes in 2010, counted by month
df2 = sql.sql(
    "select Reported_month, count(1) from kc_crime where Reported_year = '2010' group by Reported_month order by Reported_month"
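# Illustrative application of sample_udf above (the 'Reported_date' column name is an assumption;
# findDay expects dates formatted as '%d %m %Y'):
# df = df.withColumn('Reported_day', sample_udf(col('Reported_date')))
# df.select('Reported_date', 'Reported_day').show(5)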
from __future__ import print_function

from pyspark.conf import SparkConf
from pyspark.context import SparkContext

config = SparkConf()
config.setAppName("SPARK_WORD_COUNT_JOB")
config.setMaster("local[*]")

sc = SparkContext(conf=config)
sc.setLogLevel("info")

text_file_rdd = sc.textFile("/home/dharshekthvel/history_1.txt")
flat_mapped_rdd = text_file_rdd.flatMap(lambda each: each.split(' '))
mapped_rdd = flat_mapped_rdd.map(lambda each: (each, 1))
mapped_rdd.reduceByKey(lambda x, y: x + y)\
    .foreach(print)
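# Note: foreach(print) runs on the executors, so the (word, count) pairs only appear in the
# driver console when running in local mode; on a cluster they end up in executor logs. An
# alternative sketch (the output path is an assumption) collects or writes the counts instead:
# counts = mapped_rdd.reduceByKey(lambda x, y: x + y)
# print(counts.collect())
# counts.saveAsTextFile("/home/dharshekthvel/word_counts")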
def main():
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    # job = Job(glueContext)
    # job.init(args['JOB_NAME'], args)
    spark.conf.set("spark.sql.session.timeZone", "GMT+07:00")

    dyf_care_call = glueContext.create_dynamic_frame.from_catalog(
        database='tig_advisor', table_name='care_call')

    dyf_care_call = dyf_care_call.resolveChoice(specs=[('_key', 'cast:long')])

    # print schema and select fields
    print('original schema')
    dyf_care_call.printSchema()
    dyf_care_call.show(10)

    # try:
    #     df_flag = spark.read.parquet("s3a://dts-odin/flag/student_status/temp_ls_a1_dong_tien_tc.parquet")
    #     read_from_index = df_flag.collect()[0]['flag']
    #     print('read from index: ', read_from_index)
    #     dyf_care_call = Filter.apply(frame=dyf_care_call,
    #                                  f=lambda x: x["_key"] > read_from_index)
    # except:
    #     print('read flag file error ')
    # print('the number of new contacts: ', dyf_care_call.count())

    dyf_care_call = dyf_care_call.select_fields(
        ['_key', 'id', 'phone', 'duration', 'call_status', 'time_created']).rename_field('time_created', 'call_date')

    dy_source_care_call_cache = dyf_care_call.toDF()
    dy_source_care_call_cache = dy_source_care_call_cache.dropDuplicates(['id'])
    dy_source_care_call_cache = dy_source_care_call_cache.cache()
    dyf_care_call = DynamicFrame.fromDF(dy_source_care_call_cache, glueContext, 'dyf_care_call')

    # if (dyf_care_call.count() > 0):
    dyf_care_call = Filter.apply(
        frame=dyf_care_call,
        f=lambda x: x["phone"] is not None and x["phone"] != ''
        and (x["call_status"] == 'success' or x["call_status"] == 'call_success')
        and x["call_date"] is not None and x["call_date"] != ''
        and x["duration"] is not None and x["duration"] > 30)

    # print('dyf_care_call::correct')
    print('dyf_care_call number', dyf_care_call.count())

    if dyf_care_call.count() > 0:
        dyf_ad_contact_phone = glueContext.create_dynamic_frame.from_catalog(
            database='tig_advisor', table_name='student_contact_phone')

        dyf_ad_contact_phone = dyf_ad_contact_phone.select_fields(['phone', 'contact_id'])

        dyf_ad_contact_phone = Filter.apply(
            frame=dyf_ad_contact_phone,
            f=lambda x: x["phone"] is not None and x["phone"] != ''
            and x["contact_id"] is not None and x["contact_id"] != '')

        print('dyf_ad_contact_phone::schema')
        dyf_ad_contact_phone.printSchema()

        # dyf_advisor_ip_phone = glueContext.create_dynamic_frame.from_catalog(database='callcenter',
        #                                                                      table_name='advisor_ip_phone')
        #
        # dyf_advisor_ip_phone = Filter.apply(frame=dyf_advisor_ip_phone,
        #                                     f=lambda x: x["ip_phone"] is not None and x["ip_phone"] != '')

        # -----------------------------------------------------------------------------------------------------------#
        join_call_contact = Join.apply(dyf_care_call, dyf_ad_contact_phone, 'phone', 'phone')
        # join_call_contact = join_call_contact.select_fields(['id_time', 'answertime', 'calldate', 'phonenumber_correct', 'calldate', 'ipphone', 'contact_id'])

        # print('join_call_contact::schema------------')
        join_call_contact.printSchema()
        join_call_contact.show(2)
        print('join: ', join_call_contact.count())

        # -----------------------------------------------------------------------------------------------------------#
        dyf_source_ls_dong_tien = glueContext.create_dynamic_frame.from_catalog(
            database='poss', table_name='nvn_poss_lich_su_dong_tien')

        dyf_source_ls_dong_tien = Filter.apply(
            frame=dyf_source_ls_dong_tien,
            f=lambda x: x["contact_id"] is not None and x["contact_id"] != ''
            and x["ngay_thanhtoan"] is not None and x["ngay_thanhtoan"] != '')

        dyf_source_ls_dong_tien = dyf_source_ls_dong_tien.select_fields([
            '_key', 'id', 'contact_id', 'ngay_thanhtoan', 'ngay_tao', 'makh'
        ]).rename_field('ngay_tao', 'ngay_a0')

        dy_source_ls_dt_cache = dyf_source_ls_dong_tien.toDF()
        dy_source_ls_dt_cache = dy_source_ls_dt_cache.dropDuplicates(['id'])
        dy_source_ls_dt_cache = dy_source_ls_dt_cache.cache()
        dyf_source_ls_dong_tien = DynamicFrame.fromDF(
            dy_source_ls_dt_cache, glueContext, 'dyf_source_ls_dong_tien')

        join_call_contact_ao = Join.apply(join_call_contact, dyf_source_ls_dong_tien, 'contact_id', 'contact_id')

        # print('join_call_contact_ao::schema------------')
        join_call_contact_ao.printSchema()
        join_call_contact_ao.show(2)
        print('join: ', join_call_contact_ao.count())

        # join_call_contact_ao = join_call_contact_ao.resolveChoice(specs=[('calldate', 'cast:timestamp'),
        #                                                                  ('ngay_a0', 'cast:timestamp')])

        join_call_contact_ao = Filter.apply(
            frame=join_call_contact_ao,
            f=lambda x: x["call_date"] is not None and x["ngay_a0"] is not None
            and x["call_date"] > x["ngay_a0"])

        print('join_call_contact_ao::after filter calldate > ngay_a0------------')
        # join_call_contact_ao.printSchema()
        join_call_contact_ao.show(2)
        print('join_call_contact_ao: ', join_call_contact_ao.count())

        # get the history of successful welcome calls
        df_join_call_contact_ao = join_call_contact_ao.toDF()
        df_join_call_contact_ao = df_join_call_contact_ao.groupby(
            'contact_id', 'makh').agg(f.min('call_date').alias("ngay_a1"))
        df_join_call_contact_ao = df_join_call_contact_ao.withColumn(
            'id_time',
            from_unixtime(
                unix_timestamp(df_join_call_contact_ao.ngay_a1, "yyyy-MM-dd HH:mm:ss"), "yyyyMMdd"))

        dyf_result = DynamicFrame.fromDF(df_join_call_contact_ao, glueContext, 'dyf_result')

        # print('dyf_result------------')
        # join_call_contact_ao.printSchema()
        dyf_result.show(2)
        print('dyf_result: ', dyf_result.count())

        # select the fields to keep
        applymapping1 = ApplyMapping.apply(
            frame=dyf_result,
            mappings=[("contact_id", "string", "contact_id", "string"),
                      ("id_time", "string", "id_time", "bigint"),
                      ("makh", "int", "makh", "int"),
                      ("ngay_a1", "string", "ngay_a1", "timestamp")])

        resolvechoice2 = ResolveChoice.apply(
            frame=applymapping1, choice="make_cols", transformation_ctx="resolvechoice2")

        dropnullfields3 = DropNullFields.apply(frame=resolvechoice2, transformation_ctx="dropnullfields3")
        # print('dropnullfields3::printSchema')
        # dropnullfields3.printSchema()
        # dropnullfields3.show(2)

        # write data to Redshift
        datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
            frame=dropnullfields3,
            catalog_connection="glue_redshift",
            connection_options={
                "dbtable": "temp_ls_dong_tien_a1_v3",
                "database": "dts_odin",
                "postactions": """
                    INSERT into mapping_changed_status_student(description, user_id, change_status_date_id, to_status_id, timestamp1)
                    SELECT 'contact_id: ' + temp_a1.contact_id + ' - makh: ' + temp_a1.makh, um.user_id, temp_a1.id_time, 2, temp_a1.ngay_a1
                    FROM temp_ls_dong_tien_a1_v3 temp_a1
                    LEFT JOIN user_map um
                        ON um.source_type = 1
                        AND um.source_id = temp_a1.contact_id;
                    DROP TABLE IF EXISTS public.temp_ls_dong_tien_a1_v3;
                    CALL update_a1_exception_from_eg()
                """
            },
            redshift_tmp_dir="s3n://dts-odin/temp/temp_ls_dong_tien/v2",
            transformation_ctx="datasink4")

        df_datasource = dyf_care_call.toDF()
        flag = df_datasource.agg({"_key": "max"}).collect()[0][0]
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        df.write.parquet(
            "s3a://dts-odin/flag/student_status/temp_ls_a1_dong_tien_tc.parquet",
            mode="overwrite")

        dy_source_care_call_cache.unpersist()
    return sorted(connectedGraph, key=lambda x: -len(x))


def calculateModularity(communities, modularityDict):
    Q = 0
    for c in communities:
        for i in c:
            for j in c:
                Q += modularityDict[(i, j)]
    return Q / (2 * M)


if __name__ == "__main__":
    startTime = time.time()
    sc = SparkContext('local[*]', '578task')
    sc.setLogLevel("WARN")
    input_file = "edgeList.csv"
    output_file = "output_cluster.csv"

    rdd = sc.textFile(input_file)
    edges = rdd.map(lambda x: x.split(","))\
        .map(lambda x: (x[0], x[1]))
    M = edges.count()
    print("edgesNumber:", M)

    # record every user's neighbors, (user1, [user2, user3...])
    userNeighbor = edges.flatMap(lambda x: [(x[0], [x[1]]), (x[1], [x[0]])]
# coding: utf-8

# In[1]:

from pyspark.context import SparkContext
from pyspark.sql import SparkSession
from pyspark import SparkConf

conf = SparkConf()
sc = SparkContext('local', conf=conf)
spark = SparkSession(sc)

# In[2]:

import pyspark.sql.functions as F
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.clustering import KMeans

# In[3]:

spark.read.json('C:/Users/maxen/Downloads/Test_DS/Test_DS/Brisbane_CityBike.json').show(150, False)

# In[42]:
def spark_context(request):
    sc = SparkContext('local', 'tests_practicas_spark')
    request.addfinalizer(lambda: sc.stop())

    logger = logging.getLogger('py4j')
    logger.setLevel(logging.WARN)
    return sc
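# Assuming the function above is registered as a pytest fixture (e.g. decorated with
# @pytest.fixture in the surrounding file), a test receives the SparkContext by parameter name.
# A hypothetical example, not part of the original suite:
# def test_parallelize_sum(spark_context):
#     rdd = spark_context.parallelize([1, 2, 3, 4])
#     assert rdd.sum() == 10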
def stablish_spark_connection():
    # SparkConf's first positional argument is loadDefaults, so the master URL must be set explicitly
    sConf = SparkConf().setMaster("spark://localhost:7077")
    sc = SparkContext(conf=sConf)
    spark = SparkSession(sc)
    return sConf, sc, spark
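# Hypothetical usage of the helper above (assumes a standalone master is reachable at
# spark://localhost:7077):
# conf, sc, spark = stablish_spark_connection()
# print(sc.master, spark.version)
# sc.stop()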
from pyspark.context import SparkContext
from pyspark.conf import SparkConf

import argparse
import os
import numpy
import sys
import tensorflow as tf
import threading
import time
from datetime import datetime

# from com.yahoo.ml.tf import TFCluster
from tensorflowonspark import TFCluster
import mnist_dist

sc = SparkContext(conf=SparkConf().setAppName("mnist_spark"))  # the app name "mnist_spark" can be changed as needed
executors = sc._conf.get("spark.executor.instances")  # number of Spark worker (executor) instances
num_executors = int(executors) if executors is not None else 1
num_ps = 1  # number of parameter-server (ps) nodes

parser = argparse.ArgumentParser()
parser.add_argument("-b", "--batch_size", help="number of records per batch",
                    type=int, default=128)  # samples per training step
parser.add_argument("-e", "--epochs", help="number of epochs",
                    type=int, default=1)  # total number of passes over the samples
if sparkVersion.isImportAllPackageUnderSparkSql():
    java_import(gateway.jvm, "org.apache.spark.sql.*")
    java_import(gateway.jvm, "org.apache.spark.sql.hive.*")
else:
    java_import(gateway.jvm, "org.apache.spark.sql.SQLContext")
    java_import(gateway.jvm, "org.apache.spark.sql.hive.HiveContext")
    java_import(gateway.jvm, "org.apache.spark.sql.hive.LocalHiveContext")
    java_import(gateway.jvm, "org.apache.spark.sql.hive.TestHiveContext")

java_import(gateway.jvm, "scala.Tuple2")

_zcUserQueryNameSpace = {}

jconf = intp.getSparkConf()
conf = SparkConf(_jvm=gateway.jvm, _jconf=jconf)
sc = _zsc_ = SparkContext(jsc=jsc, gateway=gateway, conf=conf)
_zcUserQueryNameSpace["_zsc_"] = _zsc_
_zcUserQueryNameSpace["sc"] = sc

if sparkVersion.isSpark2():
    spark = __zSpark__ = SparkSession(sc, intp.getSparkSession())
    sqlc = __zSqlc__ = __zSpark__._wrapped
    _zcUserQueryNameSpace["sqlc"] = sqlc
    _zcUserQueryNameSpace["__zSqlc__"] = __zSqlc__
    _zcUserQueryNameSpace["spark"] = spark
    _zcUserQueryNameSpace["__zSpark__"] = __zSpark__
else:
    sqlc = __zSqlc__ = SQLContext(sparkContext=sc, sqlContext=intp.getSQLContext())
    _zcUserQueryNameSpace["sqlc"] = sqlc
    _zcUserQueryNameSpace["__zSqlc__"] = sqlc
    @keyword_only
    def setParams(self, predictionCol="prediction", labelCol="label",
                  metricName="f1"):
        """
        setParams(self, predictionCol="prediction", labelCol="label", \
                  metricName="f1")
        Sets params for multiclass classification evaluator.
        """
        kwargs = self.setParams._input_kwargs
        return self._set(**kwargs)


if __name__ == "__main__":
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import SQLContext
    globs = globals().copy()
    # The small batch size here ensures that we see multiple batches,
    # even in these small test examples:
    sc = SparkContext("local[2]", "ml.evaluation tests")
    sqlContext = SQLContext(sc)
    globs['sc'] = sc
    globs['sqlContext'] = sqlContext
    (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
    sc.stop()
    if failure_count:
        exit(-1)
        `features`
        """
        return self._call_java("userFactors")

    @property
    @since("1.4.0")
    def itemFactors(self):
        """
        a DataFrame that stores item factors in two columns: `id` and
        `features`
        """
        return self._call_java("itemFactors")


if __name__ == "__main__":
    import doctest
    import pyspark.ml.recommendation
    from pyspark.context import SparkContext
    from pyspark.sql import SQLContext
    globs = pyspark.ml.recommendation.__dict__.copy()
    # The small batch size here ensures that we see multiple batches,
    # even in these small test examples:
    sc = SparkContext("local[2]", "ml.recommendation tests")
    sqlContext = SQLContext(sc)
    globs['sc'] = sc
    globs['sqlContext'] = sqlContext
    (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
    sc.stop()
    if failure_count:
        exit(-1)
import sys

from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
import boto3

target_format = "parquet"

## @params: [JOB_NAME]
args = getResolvedOptions(
    sys.argv,
    ['JOB_NAME', 'DL_BUCKET', 'DL_PREFIX', 'DL_REGION', 'GLUE_SRC_DATABASE'])

sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)

dataLakeBucket = args["DL_BUCKET"]
dataLakePrefix = args["DL_PREFIX"]
aws_region = args["DL_REGION"]
glue_database = args["GLUE_SRC_DATABASE"]

job.init(args['JOB_NAME'], args)

client = boto3.client(service_name='glue', region_name=aws_region)
responseGetTables = client.get_tables(DatabaseName=glue_database)


            "gameweek": i['gameweek'],
            "goals": i['goals'],
            "own_goals": i['own_goals'],
            "yellow_cards": i['yellow_cards'],
            "red_cards": i['red_cards']
        }
        json_object = json.dumps(dictionary, indent=4)

        # Writing to sample.json
        with open("output_req_2.json", "w") as outfile:
            print("Writing....to JSON")
            outfile.write(json_object)
        break


if __name__ == "__main__":
    sp_context = SparkContext('local[2]', "UI")
    sp_sess = SparkSession.builder.appName('user_input').getOrCreate()
    sp_context.addFile("model.py")

    input_file = sys.argv[1]
    with open(input_file, 'r') as file:
        content = file.read()
    input_data = eval(content)

    if input_data["req_type"] == 1:
        # calling predict function:
        """
        output = predict(input_)
        """
        predict_helper(input_data)
    elif input_data["req_type"] == 2:
        # calling profile function