def extract_imits_tsv_allele_2(spark_session: SparkSession, file_path: str) -> DataFrame:
    imits_df = utils.extract_tsv(spark_session, file_path)
    imits_df = imits_df.withColumn(
        "allele_mgi_accession_id",
        when(
            (col("allele_mgi_accession_id").isNull()) & (col("type") == "Allele"),
            concat(lit("NOT-RELEASED-"), substring(md5(col("allele_symbol")), 0, 10)),
        ).otherwise(col("allele_mgi_accession_id")),
    )
    imits_df = imits_df.withColumn(
        "marker_mgi_accession_id",
        when(
            (col("marker_mgi_accession_id").isNull()) & (col("type") == "Gene"),
            concat(lit("NOT-RELEASED-"), substring(md5(col("marker_symbol")), 0, 10)),
        ).otherwise(col("marker_mgi_accession_id")),
    )
    imits_df = imits_df.withColumn(
        "allele2_id", monotonically_increasing_id().astype(StringType())
    )
    for col_name in ALLELE2_MULTIVALUED:
        imits_df = imits_df.withColumn(
            col_name,
            when(
                col(col_name).contains("|"),
                split(col_name, r"\|"),
            ).otherwise(array(col_name)),
        )
    return imits_df
def get_body_weight_curve_observations(unidimensional_observations_df: DataFrame):
    body_weight_curve_df = None
    for (
        parameter_stable_id,
        parameter_data,
    ) in Constants.BODY_WEIGHT_CURVE_PARAMETERS.items():
        bwt_observations = unidimensional_observations_df.where(
            col("parameter_stable_id").isin(parameter_data["parameters"])
        )
        bwt_observations = bwt_observations.withColumn(
            "procedure_stable_id", lit(parameter_data["procedure_stable_id"])
        )
        bwt_observations = bwt_observations.withColumn(
            "parameter_stable_id", lit(parameter_stable_id)
        )
        bwt_observations = bwt_observations.withColumn(
            "procedure_group", lit(parameter_data["procedure_group"])
        )
        bwt_observations = bwt_observations.withColumn(
            "procedure_name", lit(parameter_data["procedure_name"])
        )
        bwt_observations = bwt_observations.withColumn(
            "parameter_name", lit(parameter_data["parameter_name"])
        )
        bwt_observations = bwt_observations.withColumn(
            "experiment_id",
            md5(concat(lit(parameter_stable_id + "_"), col("experiment_id"))),
        )
        bwt_observations = bwt_observations.withColumn(
            "observation_id",
            md5(concat(lit(parameter_stable_id + "_"), col("observation_id"))),
        )
        bwt_observations = bwt_observations.withColumn(
            "experiment_source_id",
            concat(lit(parameter_stable_id + "_"), col("experiment_source_id")),
        )
        bwt_observations = bwt_observations.withColumn("metadata_group", md5(lit("")))
        bwt_observations = bwt_observations.withColumn(
            "metadata",
            array(concat(lit("Source experiment id: "), col("experiment_id"))),
        )
        bwt_observations = bwt_observations.withColumn(
            "observation_type", lit("time_series")
        )
        bwt_observations = bwt_observations.withColumn(
            "discrete_point", col("age_in_weeks")
        )
        bwt_observations = bwt_observations.withColumn(
            "time_point", col("date_of_experiment")
        )
        if body_weight_curve_df is None:
            body_weight_curve_df = bwt_observations
        else:
            body_weight_curve_df = body_weight_curve_df.union(bwt_observations)
    return body_weight_curve_df.drop_duplicates()
def create_md5_hash_keys(df, grouping_cols):
    return (
        df
        .withColumn('hash_key', F.md5(F.concat_ws('__', *grouping_cols)))
        .select(*grouping_cols, 'hash_key', 'amount')
        .sort(grouping_cols)
    )
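# Hedged usage sketch for create_md5_hash_keys above. It assumes a local SparkSession and
# that pyspark.sql.functions is available as F in the module defining the function; the
# column names ("month", "store", "amount") are illustrative, not from the original code.
# It shows how concat_ws('__', ...) plus md5 yields a deterministic hash key per grouping.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[*]").appName("hash_key_demo").getOrCreate()
demo_df = spark.createDataFrame(
    [("2021-01", "store_a", 10.0), ("2021-01", "store_b", 20.0)],
    ["month", "store", "amount"],
)
create_md5_hash_keys(demo_df, ["month", "store"]).show(truncate=False)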
def generate_unique_id(dcc_experiment_df: DataFrame):
    """
    Generates a unique_id column using as input every column except those that
    have non-unique values and the ones that correspond to parameter values.
    Because _sequenceID can be null, it is transformed to the string "NA" when null
    so that a null value does not nullify the whole concat.
    It concatenates the resulting set of values and then applies an MD5 hash function
    to the resulting string.
    :param dcc_experiment_df:
    :return: DataFrame
    """
    non_unique_columns = [
        "_type",
        "_sourceFile",
        "_VALUE",
        "procedureMetadata",
        "statusCode",
        "_sequenceID",
        "_project",
    ]
    if "_sequenceID" in dcc_experiment_df.columns:
        dcc_experiment_df = dcc_experiment_df.withColumn(
            "_sequenceIDStr",
            when(col("_sequenceID").isNull(), lit("NA")).otherwise(col("_sequenceID")),
        )
    unique_columns = [
        col_name
        for col_name in dcc_experiment_df.columns
        if col_name not in non_unique_columns and not col_name.endswith("Parameter")
    ]
    dcc_experiment_df = dcc_experiment_df.withColumn(
        "unique_id", md5(concat(*unique_columns))
    )
    return dcc_experiment_df
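# Minimal self-contained sketch of why the "NA" substitution above matters: pyspark's
# concat() returns null as soon as any input column is null, which would nullify the md5
# hash. The column names and values below are illustrative assumptions.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, concat, lit, md5, when

spark = SparkSession.builder.master("local[*]").appName("unique_id_demo").getOrCreate()
demo = spark.createDataFrame([("exp1", None), ("exp2", "1")], ["experimentID", "_sequenceID"])
demo = demo.withColumn(
    "_sequenceIDStr",
    when(col("_sequenceID").isNull(), lit("NA")).otherwise(col("_sequenceID")),
)
demo.select(
    md5(concat("experimentID", "_sequenceID")).alias("hash_with_null"),   # null for exp1
    md5(concat("experimentID", "_sequenceIDStr")).alias("hash_with_na"),  # defined for exp1
).show(truncate=False)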
def map_line_columns(line_df: DataFrame):
    for field, value in Constants.LINE_TO_OBSERVATION_MAP.items():
        if value is not None:
            line_df = line_df.withColumn(field, col(value))
        else:
            line_df = line_df.withColumn(field, lit(None))
    line_df = line_df.withColumn("biological_sample_group", lit("experimental"))
    line_df = line_df.withColumn("zygosity", lit("homozygote"))
    line_df = line_df.withColumn(
        "datasource_name",
        when(col("_dataSource") == "impc", lit("IMPC")).otherwise(
            when(col("_dataSource") == "europhenome", lit("EuroPhenome")).otherwise(
                col("_dataSource")
            )
        ),
    )
    line_df = line_df.withColumn(
        "allele_accession_id",
        when(col("biological_sample_group") == "control", lit(None)).otherwise(
            when(
                col("allele.mgiAlleleID").isNull(),
                concat(
                    lit("NOT-RELEASED-"),
                    substring(md5(line_df["allele_symbol"]), 0, 10),
                ),
            ).otherwise(col("allele.mgiAlleleID"))
        ),
    )
    return line_df
def save_to_stage(rdd):
    """
    This method handles the kafka messages - we simply want to save them to the
    staging index for processing.
    """
    # If we get an empty message, do nothing (should not happen!)
    if rdd.isEmpty():
        return

    esconf = {}
    esconf["es.mapping.id"] = 'message_id'
    esconf["es.index.auto.create"] = "true"
    esconf["es.nodes"] = ip
    esconf["es.port"] = port
    esconf["es.nodes.wan.only"] = "true"
    esconf["es.write.operation"] = "index"

    sqlContext = SQLContext.getOrCreate(SparkContext.getOrCreate())
    df = sqlContext.createDataFrame(rdd, samplingRatio=1).toDF("topic", "key", "value")

    # Add identifier, received timestamp, and boolean flag to indicate not processed
    df = df.drop(df.key)
    df = df.withColumn('message_id', f.md5(df.value))
    df = df.withColumn("message_recieved_ts", f.lit(f.current_timestamp()))
    df = df.withColumn("message_processed", f.lit('false'))

    df.write.format("org.elasticsearch.spark.sql").options(**esconf).mode("append").save(resource)
def convertAndSaveS3(df):
    df = df.withColumn(
        "year_month_id",
        f.from_unixtime('student_behavior_date', format="yyyyMM"))
    df = df.withColumn(
        'student_behavior_id',
        f.md5(concaText(
            df.student_behavior_date,
            df.behavior_id,
            df.student_id,
            df.contact_id,
            df.package_code,
            df.package_status_code,
            df.student_level_code,
            df.transformed_at)))

    dyf = DynamicFrame.fromDF(df, glueContext, "dyf")
    if is_dev:
        print('dyf____________________________')
        dyf.printSchema()
        dyf.show(10)

    behavior_mapping = mappingForAll(dyf, MAPPING)
    if behavior_mapping.count() > 0:
        parquetToS3(
            dyf=behavior_mapping,
            path="s3://toxd-olap/transaction_log/student_behavior/sb_student_behavior")

    rate_mapping = mappingForAll(dyf, RATE_MAPPING)
    if rate_mapping.count() > 0:
        parquetToS3(
            dyf=rate_mapping,
            path="s3://toxd-olap/transaction_log/student_behavior/sb_student_rating")
def extract_imits_tsv_by_entity_type(spark_session: SparkSession, file_path: str,
                                     entity_type: str) -> DataFrame:
    """
    Uses a Spark Session to generate a DataFrame from a TSV file and a specific entity type.
    Can extract Genes or Alleles from an Alleles report file produced by IMITS.
    :param spark_session: spark SQL session to be used in the extraction
    :param file_path: path to the TSV file
    :param entity_type: 'Allele' or 'Gene'
    :return: Spark DataFrame with the extracted data
    """
    imits_df = utils.extract_tsv(spark_session, file_path)
    imits_entity_df = imits_df.where(imits_df.type == entity_type)
    if entity_type == "Allele":
        imits_entity_df = imits_entity_df.withColumn(
            "acc",
            when(
                col("allele_mgi_accession_id").isNull(),
                concat(lit("NOT-RELEASED-"), substring(md5(col("allele_symbol")), 0, 10)),
            ).otherwise(col("allele_mgi_accession_id")),
        )
        imits_entity_df = imits_entity_df.withColumn(
            "allele2_id", monotonically_increasing_id().astype(StringType())
        )
        for col_name in ALLELE2_MULTIVALUED:
            imits_entity_df = imits_entity_df.withColumn(
                col_name,
                when(
                    col(col_name).contains("|"),
                    split(col_name, r"\|"),
                ).otherwise(array(col_name)),
            )
    return imits_entity_df
def convertAndSaveS3(df):
    df = df.dropDuplicates()
    prinDev(df, "before convert")
    df = df.withColumn(
        "year_month_id",
        f.from_unixtime('student_behavior_date', format="yyyyMM"))

    prinDev(df, "before mapping")
    df = df.withColumn(
        'student_behavior_id',
        f.md5(
            concaText(df.student_behavior_date, df.behavior_id, df.student_id,
                      df.contact_id, df.package_code, df.package_status_code,
                      df.student_level_code, df.transformed_at)))

    dyf = DynamicFrame.fromDF(df, glueContext, "dyf")
    behavior_mapping = mappingForAll(dyf, MAPPING)
    prinDev(behavior_mapping, "after mapping")
    if behavior_mapping.count() > 0:
        parquetToS3(
            dyf=behavior_mapping,
            path="s3://toxd-olap/transaction_log/student_behavior/sb_student_behavior")
def hash_pseudo_anonymization(inputDF: DataFrame, columns: list) -> DataFrame:
    from pyspark.sql import functions
    outputDataFrame: DataFrame = inputDF
    for column in columns:
        outputDataFrame = outputDataFrame.withColumn(
            column + '_anon',
            functions.md5(functions.col(column).cast('String')))
    return outputDataFrame
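# Hedged usage sketch for hash_pseudo_anonymization above; the SparkSession setup and the
# sample column names ("user_id", "email") are illustrative assumptions. Each listed column
# gains a <name>_anon companion holding the MD5 hex digest of its string-cast value.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").appName("anon_demo").getOrCreate()
people = spark.createDataFrame([(1, "a@example.com"), (2, "b@example.com")],
                               ["user_id", "email"])
hash_pseudo_anonymization(people, ["user_id", "email"]).show(truncate=False)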
def generate_unique_id(dcc_specimen_df: DataFrame) -> DataFrame:
    unique_columns = ["_productionCentre", "_specimenID"]
    unique_columns = [
        col_name for col_name in dcc_specimen_df.columns if col_name in unique_columns
    ]
    dcc_specimen_df = dcc_specimen_df.withColumn(
        "unique_id", md5(concat(*unique_columns))
    )
    return dcc_specimen_df
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("csv_filename", help="Input CSV filename", type=str)
    parser.add_argument("masked_csv_folder", help="Masked CSV folder", type=str)
    args = parser.parse_args()
    print(args)
    csv_filename = args.csv_filename

    spark = SparkSession.builder.master('local[*]').appName("processCSV").getOrCreate()
    df_masked = spark.read.csv(csv_filename, header=True)\
        .withColumn('first_name', F.md5('first_name'))\
        .withColumn('last_name', F.md5('last_name'))\
        .withColumn('address', F.md5('address'))
    df_masked.write.mode('overwrite').csv(args.masked_csv_folder, header=False)
    spark.stop()
def process_event(self):
    event_cols = ["event_id", 'collector_tstamp', 'domain_sessionid', 'dvce_tstamp',
                  'event_sub_type', 'se_label', 'se_property_type', 'user_token',
                  'device_id', 'user_ipaddress', 'hour', 'se_property', 'isSuspect',
                  'suspect_reason']
    # with user
    event_to_user = self.df_init.filter(F.col("se_property_type") == "user") \
        .select(*event_cols).withColumn("dst_node", F.md5("se_property")) \
        .drop_duplicates(["event_id"])
    # with note
    event_to_note = self.df_init.filter(F.col("se_property_type") == "note") \
        .select(*event_cols).withColumn("dst_node", F.md5("se_property")) \
        .drop_duplicates(["event_id"]).select(event_to_user.columns)
    event_df = event_to_user.unionAll(event_to_note)
    # write one CSV output per hour partition
    event_df.repartition(1).write.partitionBy("hour").csv(
        os.path.join(self.save_dir, "event_by_hour.csv"))
    print("events have been processed successfully.\n")
    return event_df
def run():
    any_is_null = None
    for col in cols:
        col_is_null = f.isnull(f.col(col))
        any_is_null = col_is_null if any_is_null is None else (any_is_null | col_is_null)
    col_sk = (f.when(any_is_null, '-1')
              .otherwise(f.md5(f.concat_ws('-', *cols)))
              .cast('string'))
    return col_sk
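# Hedged sketch of how the surrogate-key expression returned by run() might be used.
# `f` and `cols` are free names in the snippet above, so they are bound here at module
# level, and the table and column names are illustrative assumptions. Rows with any null
# key column get the sentinel '-1'; all others get an MD5 surrogate key.
from pyspark.sql import SparkSession, functions as f

spark = SparkSession.builder.master("local[*]").appName("sk_demo").getOrCreate()
cols = ["customer_id", "order_id"]
orders = spark.createDataFrame([("c1", "o1"), (None, "o2")], cols)
orders.withColumn("order_sk", run()).show(truncate=False)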
def calculate_md5(df):
    """Calculate MD5 on all columns of input data frame

    Args:
        df : spark data frame
    Returns:
        dataframe
    """
    col_list = []
    for i in df.columns:
        col_list.append(i)
    resultdf = df.withColumn('md5', md5(concat_ws('_', *col_list)))
    return resultdf
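# Hedged usage sketch for calculate_md5 above; it assumes the function is defined in a
# module where md5 and concat_ws are imported, and the sample data is illustrative.
# Note that concat_ws skips null inputs, so rows differing only in a null column can
# still produce the same hash.
from pyspark.sql import SparkSession
from pyspark.sql.functions import concat_ws, md5

spark = SparkSession.builder.master("local[*]").appName("md5_demo").getOrCreate()
rows = spark.createDataFrame([("a", "1"), ("b", "2")], ["name", "value"])
calculate_md5(rows).show(truncate=False)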
def process_parameter_values(exp_df, pipeline_df, parameter_column, exp_type="experiment"):
    parameter_cols = [
        "simpleParameter",
        "mediaParameter",
        "ontologyParameter",
        "seriesMediaParameter",
        "seriesParameter",
    ]
    if parameter_column not in exp_df.columns:
        return None
    parameter_observation_df = exp_df
    for column in parameter_cols:
        if column != parameter_column:
            parameter_observation_df = parameter_observation_df.drop(column)
    parameter_observation_df = (
        parameter_observation_df.selectExpr(
            "*",
            "posexplode(" + parameter_column + ") as (experimentPos, "
            + parameter_column + "Exploded )",
        )
        .withColumn(parameter_column, col(parameter_column + "Exploded"))
        .drop(parameter_column + "Exploded")
    )
    parameter_observation_df = parameter_observation_df.withColumn(
        "observation_id",
        md5(
            concat(
                col("experiment_id"),
                lit("_" + parameter_column + "_"),
                col("experimentPos"),
            )
        ),
    )
    if exp_type == "experiment":
        parameter_observation_df = map_experiment_columns(parameter_observation_df)
    else:
        parameter_observation_df = map_line_columns(parameter_observation_df)
    parameter_observation_df = add_impress_info(
        parameter_observation_df, pipeline_df, parameter_column, exp_type=exp_type
    )
    if has_column(parameter_observation_df, parameter_column + ".parameterStatus"):
        parameter_observation_df = parameter_observation_df.withColumn(
            "parameter_status", col(parameter_column + ".parameterStatus")
        )
    else:
        parameter_observation_df = parameter_observation_df.withColumn(
            "parameter_status", lit(None)
        )
    return parameter_observation_df
def benchmark2():
    print("===Benchmark 2===")
    print("Comparing JDBC writes to InnoDB and API writes to ColumnStore with larger datasets")
    print("")
    emptyDatabase()

    print("creating dataframe 1: two random generated doubles")
    randDF = sqlContext.range(0, 7000000).withColumn(
        'uniform', rand(seed=23)).withColumn('normal', randn(seed=42)).cache()
    randDFRows = randDF.count()
    randDFItems = randDFRows * len(randDF.columns)
    randDF.printSchema()

    print("benchmarking dataframe 1")
    rand_benchmark = benchmark2execution(
        "rand", randDF, "id BIGINT, uniform DOUBLE, normal DOUBLE")
    randDF.unpersist()

    print("creating dataframe 2: sha1, sha256, sha512 and md5 hashes of integers")
    tmpDF = sqlContext.createDataFrame(
        sc.parallelize(range(0, 3000000)).map(lambda i: Row(number=i, string=str(i))))
    hashDF = tmpDF.select(tmpDF.number,
                          sha1(tmpDF.string).alias("sha1"),
                          sha2(tmpDF.string, 256).alias("sha256"),
                          sha2(tmpDF.string, 512).alias("sha512"),
                          md5(tmpDF.string).alias("md5")).cache()
    hashDFRows = hashDF.count()
    hashDFItems = hashDFRows * len(hashDF.columns)
    hashDF.printSchema()

    print("benchmarking dataframe 2")
    hash_benchmark = benchmark2execution(
        "hash", hashDF,
        "number BIGINT, sha1 VARCHAR(40), sha256 VARCHAR(64), sha512 VARCHAR(128), md5 VARCHAR(32)")
    hashDF.unpersist()

    print("jdbc_innodb\tapi_columnstore\t\trows\t\titems")
    print("%.3fs\t\t%.3fs\t\t%i\t\t%i" %
          (rand_benchmark[0], rand_benchmark[1], randDFRows, randDFItems))
    print("%.3fs\t\t%.3fs\t\t%i\t\t%i" %
          (hash_benchmark[0], hash_benchmark[1], hashDFRows, hashDFItems))
def process_user(self, behave_dir, active_dir):
    """
    process node user
    @return:
    """
    source_user = self.df_init.select("user_token", "user_app_first_time", "user_create_time") \
        .filter(~F.isnull("user_token")) \
        .drop_duplicates(["user_token"]) \
        .withColumn("has_behavior", F.lit(1))
    dest_user = self.df_init.filter(F.col("se_property_type") == "user") \
        .select("se_property").drop_duplicates() \
        .select(F.md5("se_property").alias("user_token")) \
        .withColumn("isFollow", F.lit(1))
    user_df = source_user.join(dest_user, on="user_token", how="outer") \
        .withColumn("node_type", F.lit("user"))
    save2csv(user_df.repartition(1), self.save_dir, "node_user.csv")
    print("node user has been processed successfully.\n")
    return user_df
def generate_unique_id(self, dcc_specimen_df: DataFrame) -> DataFrame:
    """
    Generates a unique identifier for the Specimen using the production centre,
    the phenotyping centre and the specimen ID.
    """
    dcc_specimen_df = dcc_specimen_df.withColumn(
        "unique_id",
        md5(
            concat(*[
                when(
                    col("_productionCentre").isNotNull(),
                    col("_productionCentre"),
                ).when(
                    col("_phenotypingCentre").isNotNull(),
                    col("_phenotypingCentre"),
                ).otherwise(lit("")),
                col("_specimenID"),
            ])
        ),
    )
    return dcc_specimen_df
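# Minimal sketch of the centre-fallback logic used above: the chained when().when().otherwise()
# behaves like coalesce(_productionCentre, _phenotypingCentre, '') before the specimen ID is
# appended and hashed. The session, centre codes and specimen IDs below are illustrative.
from pyspark.sql import SparkSession
from pyspark.sql.functions import coalesce, col, concat, lit, md5

spark = SparkSession.builder.master("local[*]").appName("specimen_id_demo").getOrCreate()
specimens = spark.createDataFrame(
    [("JAX", None, "SP001"), (None, "TCP", "SP002"), (None, None, "SP003")],
    ["_productionCentre", "_phenotypingCentre", "_specimenID"],
)
specimens.withColumn(
    "unique_id",
    md5(concat(coalesce(col("_productionCentre"), col("_phenotypingCentre"), lit("")),
               col("_specimenID"))),
).show(truncate=False)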
def resolve_image_record_value(image_record_observation_df: DataFrame):
    image_record_observation_df = image_record_observation_df.selectExpr(
        "*",
        "posexplode(seriesMediaParameter.value) as (seriesMediaParameterPos, seriesMediaParameterValue)",
    )
    image_record_observation_df = image_record_observation_df.withColumn(
        "observation_id",
        md5(
            concat(
                col("observation_id"),
                lit("_seriesMediaParameterValue_"),
                col("seriesMediaParameterPos"),
            )
        ),
    )
    image_record_observation_df = image_record_observation_df.withColumn(
        "download_file_path", col("seriesMediaParameterValue._URI")
    )
    image_record_observation_df = image_record_observation_df.withColumn(
        "file_type", col("seriesMediaParameterValue._fileType")
    )
    image_record_observation_df = image_record_observation_df.withColumn(
        "observation_type", lit("image_record")
    )
    return image_record_observation_df
def handle_patient_registration(source_df):
    """
    This method applies transformations to tha_patient_registration to make it
    compatible with the abstract schema.
    Args:
        source_df: tha_patient_registration Dataframe
    Returns:
        pat_reg_df: transformed tha_patient_registration Dataframe
    """
    pat_reg_df = source_df.withColumn(
        "claim_id",
        F.md5(F.concat(F.lit("tha"),
                       F.coalesce(source_df.fac, F.lit("")),
                       F.coalesce(source_df.dkey, F.lit("")),
                       F.coalesce(source_df.source_code, F.lit("")),
                       F.coalesce(source_df.source_year, F.lit("")),
                       F.coalesce(source_df.source_qtr, F.lit("")))))

    pat_reg_final_df = pat_reg_df.withColumn("claim_type_cd", claim_type_code(pat_reg_df.source_code)) \
        .withColumn("claim_type_txt", claim_type_txt(pat_reg_df.source_code)) \
        .withColumn("claim_start_dt", pat_reg_df.adat) \
        .withColumn("claim_end_dt", pat_reg_df.ddat) \
        .withColumnRenamed("pdx", "primary_diagnosis_cd") \
        .withColumn("primary_diagnosis_code_type_cd", F.lit("icd10")) \
        .withColumnRenamed("pphysdocid", "attending_provider_npi") \
        .withColumn("organization_state_cd", organization_state_cd(pat_reg_df.hospstateabbr)) \
        .withColumn("utilization_day_cnt", pat_reg_df.los) \
        .withColumn("admit_dt", pat_reg_df.adat) \
        .withColumn("admit_type_cd", admit_type_cd(pat_reg_df.atype)) \
        .withColumn("length_of_stay_val", pat_reg_df.los) \
        .withColumn("discharge_dt", pat_reg_df.ddat) \
        .withColumn("admission_referral_source_cd", admission_referral_source_cd(pat_reg_df.asource)) \
        .withColumnRenamed("drg", "drg_cd") \
        .select("fac", "dkey", "claim_id", "claim_type_cd", "claim_type_txt", "claim_start_dt",
                "claim_end_dt", "primary_diagnosis_cd", "primary_diagnosis_code_type_cd",
                "attending_provider_npi", "organization_state_cd", "utilization_day_cnt",
                "admit_dt", "admit_type_cd", "length_of_stay_val", "discharge_dt",
                "admission_referral_source_cd", "drg_cd", "source_code", "source_year",
                "source_qtr")
    return pat_reg_final_df
def add_hashed_id(df, columns=[], hashed_col='Hashed_ID', hash_type='md5'):
    """
    This method adds a hashed id column to the dataframe, built from the given columns.

    Returns
    --------
    Dataframe with hashed Id as a column
    ------
    Parameters
    --------
    df : spark dataframe
        dataframe to create hashed id on
    columns : list of strings
        columns to use for the hash, default is an empty list which uses all columns of df
    hashed_col : string
        column name for hashed id
    hash_type : string
        'md5' (default) or any other value for sha2
    --------
    """
    if len(columns) == 0:
        columns = df.columns
    else:
        illegal_columns = []
        for column in columns:
            if column not in df.columns:
                illegal_columns.append(column)
        if len(illegal_columns) > 0:
            raise IllegalArgumentException(
                'Column {} does not exist in dataframe'.format(', '.join(illegal_columns)))
    if hashed_col is None or hashed_col == '':
        hashed_col = 'Hashed_ID'
    if hash_type == 'md5':
        df = df.withColumn(hashed_col, F.md5(F.concat(*columns)))
    else:
        df = df.withColumn(hashed_col, F.sha2(F.concat(*columns), 256))
    return df
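# Hedged usage sketch for add_hashed_id above; the session and sample data are assumptions.
# Note that F.concat returns null when any input column is null, so the hashed id is null
# for such rows unless the caller fills nulls first.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[*]").appName("hashed_id_demo").getOrCreate()
accounts = spark.createDataFrame([("acc-1", "US"), ("acc-2", "CA")], ["account_id", "country"])
add_hashed_id(accounts, columns=["account_id", "country"], hashed_col="Hashed_ID").show(truncate=False)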
def main(): glueContext = GlueContext(SparkContext.getOrCreate()) spark = glueContext.spark_session spark.conf.set("spark.sql.session.timeZone", "GMT+07:00") ho_chi_minh_timezone = pytz.timezone('Asia/Ho_Chi_Minh') today = datetime.now(ho_chi_minh_timezone) today_second = long(today.strftime("%s")) print('today_id: ', today_second) # f.lit(today_second).cast('long').alias('transformed_at') rangeid = [14, 15, 16, 17, 18] student_id_unavailable = '0' package_endtime_unavailable = 99999999999L package_starttime_unavailable = 0L student_level_code_unavailable = 'UNAVAILABLE' student_status_code_unavailable = 'UNAVAILABLE' package_endtime = 'package_endtime' package_starttime = 'package_starttime' student_level_code = 'student_level_code' student_status_code = 'student_status_code' def doCheckModified(val1, val2): if val1 is not None: return val1 return val2 check_modified_null = udf(doCheckModified, StringType()) def doCheckStudentID(code): code = str(code) if code is None: return student_id_unavailable return code check_student_id = udf(doCheckStudentID, StringType()) def doCheckData(code, key): key = str(key) if code is None: if key == package_endtime: return package_endtime_unavailable else: return package_starttime_unavailable return code check_data = udf(doCheckData, IntegerType()) def doCheckDataNull(code, key): code = str(code) key = str(key) if (code is None) & (key == student_level_code): return student_level_code_unavailable if (code is None) & (key == student_status_code): return student_status_code_unavailable return code check_data_null = udf(doCheckDataNull, StringType()) def concaText(student_behavior_date, behavior_id, student_id, contact_id, package_code, package_endtime,package_starttime, student_level_code, student_status_code, transformed_at): text_concat = "" if student_behavior_date is not None: text_concat += str(student_behavior_date) if behavior_id is not None: text_concat += str(behavior_id) if student_id is not None: text_concat += str(student_id) if contact_id is not None: text_concat += str(contact_id) if package_code is not None: text_concat += str(package_code) if package_endtime is not None: text_concat += str(package_endtime) if package_starttime is not None: text_concat += str(package_starttime) if student_level_code is not None: text_concat += str(student_level_code) if student_status_code is not None: text_concat += str(student_status_code) if transformed_at is not None: text_concat += str(transformed_at) return text_concat concaText = f.udf(concaText, StringType()) dyf_tblreply_rating = glueContext.create_dynamic_frame.from_catalog( database="native_smile", table_name="tblreply_rating" ) dyf_tblreply_rating = dyf_tblreply_rating.select_fields( ['_key', 'userid', 'ratingid', 'time_rating'] ) dyf_tblreply_rating = dyf_tblreply_rating.resolveChoice(specs=[('_key', 'cast:long')]) # try: # df_flag_1 = spark.read.parquet("s3://dtsodin/flag/flag_hoc_vien_rating_native_smile_h2472.parquet") # max_key = df_flag_1.collect()[0]['flag'] # print("max_key: ", max_key) # # Chi lay nhung ban ghi lon hon max_key da luu, ko load full # dyf_tblreply_rating = Filter.apply(frame=dyf_tblreply_rating, f=lambda x: x["_key"] > max_key) # except: # print('read flag file error ') if dyf_tblreply_rating.count()> 0: dyf_mdl_user = glueContext.create_dynamic_frame.from_catalog( database="topicalms", table_name="mdl_user" ) dyf_mdl_user = dyf_mdl_user.select_fields( ['id', 'email'] ) dyf_tma_dm_tu_dien = glueContext.create_dynamic_frame.from_catalog( database="native_smile", 
table_name="tma_dm_tu_dien" ) dyf_tma_dm_tu_dien = dyf_tma_dm_tu_dien.select_fields( ['id', 'ma_tu_dien', 'id_dm_loai_tu_dien'] ) dyf_tma_dm_tu_dien = Filter.apply(frame=dyf_tma_dm_tu_dien, f=lambda x: x['id_dm_loai_tu_dien'] == 7 and x['id'] in rangeid) ################ join_rating_user = Join.apply(dyf_tblreply_rating, dyf_mdl_user, 'userid', 'id') join_rating_user01 = Join.apply(join_rating_user, dyf_tma_dm_tu_dien, 'ratingid', 'id') ################ dyf_student_contact = glueContext.create_dynamic_frame.from_catalog( database="tig_advisor", table_name="student_contact" ) dyf_student_contact = dyf_student_contact.select_fields( ['contact_id', 'student_id', 'level_study', 'time_lms_created'])\ dyf_log_student_status = glueContext.create_dynamic_frame.from_catalog( database="do_tig_advisor", table_name="log_student_status" ) dyf_log_student_status = dyf_log_student_status.select_fields( ['contact_id', 'status_code', 'last_status_code', 'start_date', 'end_date']) \ .rename_field('contact_id', 'contact_id_status') dyf_log_student_package = glueContext.create_dynamic_frame.from_catalog( database="do_tig_advisor", table_name="log_student_package" ) dyf_log_student_package = dyf_log_student_package.select_fields( ['student_id', 'package_code', 'start_time', 'end_time']) \ .rename_field('student_id', 'student_id_package') \ .rename_field('start_time', 'start_time_package') \ .rename_field('end_time', 'end_time_package') dyf_log_student_level_study = glueContext.create_dynamic_frame.from_catalog( database="tig_advisor", table_name="log_student_level_study" ) dyf_log_student_level_study = dyf_log_student_level_study.select_fields( ['contact_id', 'level_current', 'level_modified', 'package_code', 'time_created']) \ .rename_field('contact_id', 'contact_id_level') join_rating_user01.printSchema() print join_rating_user01.count() print join_rating_user01.count() try: df_rating_class = join_rating_user01.toDF() df_student_contact = dyf_student_contact.toDF() df_log_student_level_study = dyf_log_student_level_study.toDF() df_temp = dyf_log_student_level_study.toDF() df_log_student_status = dyf_log_student_status.toDF() df_log_student_package = dyf_log_student_package.toDF() df_temp = df_temp.groupby('contact_id_level', 'level_current', 'package_code').agg( f.max("time_created").alias("time_created_max")) df_temp = df_temp.withColumnRenamed('contact_id_level', 'contact_id_join') \ .withColumnRenamed('package_code', 'package_code_join') df_join0 = df_temp.join(df_log_student_level_study, (df_temp['contact_id_join'] == df_log_student_level_study['contact_id_level']) & (df_temp['package_code_join'] == df_log_student_level_study['package_code']) & (df_temp['time_created_max'] == df_log_student_level_study['time_created']), "left") print "=========== . ===========" df_join0.printSchema() dyf_join = DynamicFrame.fromDF(df_join0, glueContext, "dyf_join") dyf_join = dyf_join.select_fields( ['contact_id_level', 'level_current', 'level_modified', 'package_code', 'time_created']) df_join = dyf_join.toDF() df_join.printSchema() df_join.show(10) print "########## . 
###########" df_join01 = df_rating_class.join(df_student_contact, (df_rating_class['userid'] == df_student_contact['student_id'])) df_join01.printSchema() df_join02 = df_join01.join(df_join, (df_join['contact_id_level'] == df_join01['contact_id']) & (df_join['time_created'] <= df_join01['time_lms_created']), "left") df_join02 = df_join02\ .withColumn("level_modified_new", check_modified_null(df_join02.level_modified, df_join02.level_study)) df_join02.printSchema() df_join02.show(10) dyf_join = DynamicFrame.fromDF(df_join02, glueContext, "dyf_join") dyf_join = dyf_join.select_fields(['time_rating', 'contact_id', 'student_id', 'level_study', 'time_lms_created', 'ratingid', 'level_current', 'level_modified', 'package_code', 'time_created', 'level_modified_new']) # dyf_join_temp = Filter.apply(frame=dyf_join, # f=lambda x: x["level_modified_new"] is None) # print "count: ", dyf_join_temp.count() ############ df_join02 = dyf_join.toDF() df_join03 = df_join02.join(df_log_student_status, (df_log_student_status['contact_id_status'] == df_join02['contact_id']) & (df_log_student_status['start_date'] <= df_join02['time_rating']) & (df_log_student_status['end_date'] >= df_join02['time_rating']), "left") df_join04 = df_join03.join(df_log_student_package, (df_log_student_package['student_id_package'] == df_join03['student_id']) & (df_log_student_package['start_time_package'] <= df_join03['time_rating']) & (df_log_student_package['end_time_package'] >= df_join03['time_rating']), "left") dyf_join = DynamicFrame.fromDF(df_join04, glueContext, "dyf_join") dyf_join = Filter.apply(frame=dyf_join, f=lambda x: x["start_time_package"] is not None and x["end_time_package"] is not None) print "dyf_join: ", dyf_join.count() dyf_join.printSchema() dyf_join.show(10) dyf_join = dyf_join.select_fields( ['time_rating', 'student_id', 'contact_id', 'package_code', 'ratingid', 'start_time_package', 'end_time_package', 'level_modified_new', 'status_code'] ) print "TTTTTTTTTTTTTT" # dyf_join01 = Filter.apply(frame=dyf_join, # f=lambda x: x["level_current"] is not None) # # print "Check null ", dyf_join01.count() df_join04 = dyf_join.toDF() df_join04 = df_join04.withColumn("transformed_at", unix_timestamp(f.current_timestamp())) \ .withColumn("student_id", check_student_id(df_join04.student_id)) \ .withColumn("package_endtime", check_data(df_join04.end_time_package, f.lit(package_endtime))) \ .withColumn("package_starttime", check_data(df_join04.start_time_package, f.lit(package_starttime))) \ .withColumn("student_level_code", check_data_null(df_join04.level_modified_new, f.lit(student_level_code))) \ .withColumn("student_status_code", check_data_null(df_join04.status_code, f.lit(student_status_code))) \ .withColumn("behavior_id", f.lit(27)) \ .withColumn("rating_type", f.lit("rating_native_smile_h2472")) \ .withColumn("comment", f.lit("")) \ .withColumn("rating_about", f.lit(None)) \ .withColumn("number_rating", f.lit(1)) \ .withColumn("value_rating", (df_join04.ratingid - f.lit(13))) df_join04.printSchema() print df_join04.count() df_join04.show(10) dyf_join = DynamicFrame.fromDF(df_join04, glueContext, "dyf_join") # dyf_join.printSchema() # print dyf_join.count() # dyf_join.show(10) dyf_rating_cara = ApplyMapping.apply(frame=dyf_join, mappings=[("time_rating", "int", "student_behavior_date", "long"), ("behavior_id", "int", "behavior_id", "long"), ("student_id", "string", "student_id", "long"), ("contact_id", "string", "contact_id", "string"), ("package_code", "string", "package_code", "string"), ("package_endtime", "int", 
"package_endtime", "long"), ("package_starttime", "int", "package_starttime", "long"), ("student_level_code", "string", "student_level_code", "string"), ("student_status_code", "string", "student_status_code", "string"), ("transformed_at", "long", "transformed_at", "long"), ("rating_type", "string", "rating_type", "string"), ("comment", "string", "comment", "string"), ("rating_about", "int", "rating_about", "long"), ("number_rating", "int", "number_rating", "long"), ("value_rating", "int", "value_rating", "long")]) df_rating_cara = dyf_rating_cara.toDF() df_rating_cara2 = df_rating_cara.withColumn('student_behavior_id', f.md5(concaText( df_rating_cara.student_behavior_date, df_rating_cara.behavior_id, df_rating_cara.student_id, df_rating_cara.contact_id, df_rating_cara.package_code, df_rating_cara.package_endtime, df_rating_cara.package_starttime, df_rating_cara.student_level_code, df_rating_cara.student_status_code, df_rating_cara.transformed_at))) dyf_rating_cara = DynamicFrame.fromDF(df_rating_cara2, glueContext, 'dyf_rating_cara') dyf_rating_cara = Filter.apply(frame=dyf_rating_cara, f=lambda x: x["contact_id"] is not None and x["contact_id"] != '') applymapping0 = ApplyMapping.apply(frame=dyf_rating_cara, mappings=[ ("student_behavior_id", "string", "student_behavior_id", "string"), ("rating_type", "string", "rating_type", "string"), ("comment", "string", "comment", "string"), ("rating_about", "long", "rating_about", "long"), ("number_rating", "long", "number_rating", "long"), ("value_rating", "long", "value_rating", "long"), ("behavior_id", "long", "behavior_id", "long")]) applymapping0.printSchema() print applymapping0.count() applymapping0.show(5) resolvechoice0 = ResolveChoice.apply(frame=applymapping0, choice="make_cols", transformation_ctx="resolvechoice1") dropnullfields0 = DropNullFields.apply(frame=resolvechoice0, transformation_ctx="dropnullfields0") print resolvechoice0.count() # resolvechoice0.printSchema() # resolvechoice0.show(10) print('START WRITE TO S3-------------------------') datasink0 = glueContext.write_dynamic_frame.from_options(frame=dropnullfields0, connection_type="s3", connection_options={ "path": "s3://dtsodin/student_behavior/student_rating/", "partitionKeys": ["behavior_id"]}, format="parquet", transformation_ctx="datasink0") print('END WRITE TO S3-------------------------') # datasink0 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields0, # catalog_connection="glue_redshift", # connection_options={ # "dbtable": "student_rating_temp", # "database": "dts_odin" # }, # redshift_tmp_dir="s3a://dtsodin/temp/student_rating_temp/", # transformation_ctx="datasink0") applymapping1 = ApplyMapping.apply(frame=dyf_rating_cara, mappings=[("student_behavior_id", "string", "student_behavior_id", "string"), ("student_behavior_date", "long", "student_behavior_date", "long"), ("behavior_id", "long", "behavior_id", "long"), ("student_id", "long", "student_id", "long"), ("contact_id", "string", "contact_id", "string"), ("package_code", "string", "package_code", "string"), ("package_endtime", "long", "package_endtime", "long"), ("package_starttime", "long", "package_starttime", "long"), ("student_level_code", "string", "student_level_code", "string"), ("student_status_code", "string", "student_status_code", "string"), ("transformed_at", "long", "transformed_at", "long")]) applymapping1.printSchema() print applymapping1.count() applymapping1.show(10) resolvechoice1 = ResolveChoice.apply(frame=applymapping1, choice="make_cols", 
transformation_ctx="resolvechoice1") dropnullfields1 = DropNullFields.apply(frame=resolvechoice1, transformation_ctx="dropnullfields1") print resolvechoice1.count() # resolvechoice1.printSchema() # resolvechoice1.show(10) print('START WRITE TO S3-------------------------') datasink6 = glueContext.write_dynamic_frame.from_options(frame=dropnullfields1, connection_type="s3", connection_options={ "path": "s3://dtsodin/student_behavior/student_behavior/", "partitionKeys": ["behavior_id"]}, format="parquet", transformation_ctx="datasink6") print('END WRITE TO S3-------------------------') # datasink1 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields1, # catalog_connection="glue_redshift", # connection_options={ # "dbtable": "student_behavior", # "database": "dts_odin" # }, # redshift_tmp_dir="s3a://dtsodin/temp/student_behavior", # transformation_ctx="datasink1") df_temp = dyf_tblreply_rating.toDF() flag = df_temp.agg({"_key": "max"}).collect()[0][0] flag_data = [flag] df = spark.createDataFrame(flag_data, "long").toDF('flag') # ghi de _key vao s3 df.write.parquet("s3a://dtsodin/flag/flag_hoc_vien_rating_native_smile_h2472.parquet", mode="overwrite") except Exception as e: print e
def generate_metadata_group(
    experiment_specimen_df: DataFrame,
    impress_df: DataFrame,
    exp_type="experiment",
) -> DataFrame:
    """
    Takes in an Experiment-Specimen DataFrame and the IMPReSS dataframe,
    and generates a hash value with the parameters marked as 'isImportant' on IMPReSS.
    This hash is used to identify experiments that are comparable
    (i.e. share the same experimental conditions).
    """
    # Explode the experiments by procedureMetadata so each row contains data
    # for each procedureMetadata value
    experiment_metadata = experiment_specimen_df.withColumn(
        "procedureMetadata", explode("procedureMetadata")
    )

    # Filter the IMPReSS to leave only those that generate a metadata split: isImportant = True
    impress_df_required = impress_df.where(
        (col("parameter.isImportant") == True)
        & (col("parameter.type") == "procedureMetadata")
    )

    # Join the experiment DF with the IMPReSS DF
    experiment_metadata = experiment_metadata.join(
        impress_df_required,
        (
            (experiment_metadata["_pipeline"] == impress_df_required["pipelineKey"])
            & (experiment_metadata["_procedureID"]
               == impress_df_required["procedure.procedureKey"])
            & (experiment_metadata["procedureMetadata._parameterID"]
               == impress_df_required["parameter.parameterKey"])
        ),
    )

    # Create a new column by concatenating the parameter name and the parameter value
    experiment_metadata = experiment_metadata.withColumn(
        "metadataItem",
        when(
            col("procedureMetadata.value").isNotNull(),
            concat(col("parameter.name"), lit(" = "), col("procedureMetadata.value")),
        ).otherwise(concat(col("parameter.name"), lit(" = "), lit("null"))),
    )

    # Select the right column name for production and phenotyping centre
    # depending on experiment type
    if exp_type == "experiment":
        production_centre_col = "_productionCentre"
        phenotyping_centre_col = "_phenotypingCentre"
    else:
        production_centre_col = "production_centre"
        phenotyping_centre_col = "phenotyping_centre"

    # Create a window for the DataFrame over experiment id, production and phenotyping centre
    window = Window.partitionBy(
        "unique_id", production_centre_col, phenotyping_centre_col
    ).orderBy("parameter.name")

    # Use the window to create for every experiment an array containing the set
    # of "parameter = value" pairs.
    experiment_metadata_input = experiment_metadata.withColumn(
        "metadataItems", collect_set(col("metadataItem")).over(window)
    )

    # Add the production centre to the metadata group when this is different from the
    # phenotyping centre. This is because in that given case we would like to generate a
    # metadata split among specimens that have been produced and phenotyped on the same centre.
    experiment_metadata_input = experiment_metadata_input.withColumn(
        "metadataItems",
        when(
            (col(production_centre_col).isNotNull())
            & (col(production_centre_col) != col(phenotyping_centre_col)),
            array_union(
                col("metadataItems"),
                array(concat(lit("ProductionCenter = "), col(production_centre_col))),
            ),
        ).otherwise(col("metadataItems")),
    )

    # Create a string with the concatenation of the metadata items "parameter = value"
    # separated by '::'.
    experiment_metadata = experiment_metadata_input.groupBy(
        "unique_id", production_centre_col, phenotyping_centre_col
    ).agg(
        concat_ws("::", sort_array(max(col("metadataItems")))).alias("metadataGroupList")
    )

    # Hash the list to generate a metadata group identifier.
    experiment_metadata = experiment_metadata.withColumn(
        "metadataGroup", md5(col("metadataGroupList"))
    )

    # Select the experiment IDs and the metadata group IDs
    experiment_metadata = experiment_metadata.select("unique_id", "metadataGroup")

    # Join the original experiment DataFrame with the result of the metadata group generation
    experiment_specimen_df = experiment_specimen_df.join(
        experiment_metadata, "unique_id", "left_outer"
    )

    # Add the hashed version of an empty string to those rows without a metadata group.
    experiment_specimen_df = experiment_specimen_df.withColumn(
        "metadataGroup",
        when(
            experiment_specimen_df["metadataGroup"].isNull(), md5(lit(""))
        ).otherwise(experiment_specimen_df["metadataGroup"]),
    )
    return experiment_specimen_df
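# Minimal sketch (with assumed column names, not the original IMPC schema) of the hashing
# idea used in generate_metadata_group: collect the "parameter = value" items per experiment,
# sort and join them with '::', then md5 the resulting string so that experiments sharing the
# same important metadata end up with the same metadataGroup.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, collect_set, concat_ws, md5, sort_array

spark = SparkSession.builder.master("local[*]").appName("metadata_group_demo").getOrCreate()
items = spark.createDataFrame(
    [("exp1", "Anesthesia = isoflurane"), ("exp1", "Equipment = scale-A"),
     ("exp2", "Equipment = scale-A"), ("exp2", "Anesthesia = isoflurane")],
    ["unique_id", "metadataItem"],
)
groups = (items.groupBy("unique_id")
          .agg(sort_array(collect_set("metadataItem")).alias("metadataItems"))
          .withColumn("metadataGroup", md5(concat_ws("::", col("metadataItems")))))
groups.show(truncate=False)  # exp1 and exp2 share the same metadataGroup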
def main(): glueContext = GlueContext(SparkContext.getOrCreate()) spark = glueContext.spark_session spark.conf.set("spark.sql.session.timeZone", "GMT+07:00") student_id_unavailable = '0' package_endtime_unavailable = 99999999999L package_starttime_unavailable = 0L student_level_code_unavailable = 'UNAVAILABLE' student_status_code_unavailable = 'UNAVAILABLE' package_endtime = 'package_endtime' package_starttime = 'package_starttime' student_level_code = 'student_level_code' student_status_code = 'student_status_code' def doCheckModified(val1, val2): if val1 is not None: return val1 return val2 check_modified_null = udf(doCheckModified, StringType()) def doCheckStudentID(code): code = str(code) if code is None: return student_id_unavailable return code check_student_id = udf(doCheckStudentID, StringType()) def doCheckData(code, key): key = str(key) if code is None: if key == package_endtime: return package_endtime_unavailable else: return package_starttime_unavailable return code check_data = udf(doCheckData, IntegerType()) def doCheckDataNull(code, key): code = str(code) key = str(key) if (code is None) & (key == student_level_code): return student_level_code_unavailable if (code is None) & (key == student_status_code): return student_status_code_unavailable return code check_data_null = udf(doCheckDataNull, StringType()) def concaText(student_behavior_date, behavior_id, student_id, contact_id, package_code, package_endtime, package_starttime, student_level_code, student_status_code, transformed_at): text_concat = "" if student_behavior_date is not None: text_concat += str(student_behavior_date) if behavior_id is not None: text_concat += str(behavior_id) if student_id is not None: text_concat += str(student_id) if contact_id is not None: text_concat += str(contact_id) if package_code is not None: text_concat += str(package_code) if package_endtime is not None: text_concat += str(package_endtime) if package_starttime is not None: text_concat += str(package_starttime) if student_level_code is not None: text_concat += str(student_level_code) if student_status_code is not None: text_concat += str(student_status_code) if transformed_at is not None: text_concat += str(transformed_at) return text_concat concaText = udf(concaText, StringType()) # get dynamic frame source dyf_student_assignments = glueContext.create_dynamic_frame.from_catalog( database="tig_advisor", table_name="student_assignments") # chon cac field dyf_student_assignments = dyf_student_assignments.select_fields([ '_key', 'contact_id', 'advisor_id', 'time_created', 'time_modified', 'status' ]) df_student_assignments = dyf_student_assignments.toDF() df_student_assignments = df_student_assignments\ .withColumn("time_created", unix_timestamp(df_student_assignments.time_created).cast("long"))\ .withColumn("time_modified", unix_timestamp(df_student_assignments.time_modified).cast("long"))\ .withColumn("_key", unix_timestamp(df_student_assignments._key).cast("long")) dyf_student_assignments = DynamicFrame.fromDF(df_student_assignments, glueContext, "dyf_student_assignments") dyf_student_assignments.printSchema() dyf_student_assignments.show(2) # check bucket is not null try: # # doc moc flag tu s3 df_flag = spark.read.parquet( "s3a://dtsodin/flag/toa_L3150/toa_student_assignments.parquet") start_read = df_flag.collect()[0]['flag'] print('read from index: ', start_read) # so sanh _key datasource voi flag, lay nhung gia tri co key > flag dyf_student_assignments = Filter.apply( frame=dyf_student_assignments, f=lambda x: x['_key'] > 
start_read) except: print('read flag file error ') print('the number of new contacts: ', dyf_student_assignments.count()) if dyf_student_assignments.count() > 0: dyf_student_contact = glueContext.create_dynamic_frame.from_catalog( database="tig_advisor", table_name="student_contact") dyf_student_contact = dyf_student_contact.select_fields( ['contact_id', 'student_id', 'level_study', 'time_lms_created'])\ .rename_field('contact_id', 'contactid') dyf_log_student_status = glueContext.create_dynamic_frame.from_catalog( database="do_tig_advisor", table_name="log_student_status") dyf_log_student_status = dyf_log_student_status.select_fields( ['contact_id', 'status_code', 'last_status_code', 'start_date', 'end_date']) \ .rename_field('contact_id', 'contact_id_status') dyf_log_student_package = glueContext.create_dynamic_frame.from_catalog( database="do_tig_advisor", table_name="log_student_package") dyf_log_student_package = dyf_log_student_package.select_fields( ['student_id', 'package_code', 'start_time', 'end_time']) \ .rename_field('student_id', 'student_id_package') \ .rename_field('start_time', 'start_time_package') \ .rename_field('end_time', 'end_time_package') dyf_log_student_level_study = glueContext.create_dynamic_frame.from_catalog( database="tig_advisor", table_name="log_student_level_study") dyf_log_student_level_study = dyf_log_student_level_study.select_fields( ['contact_id', 'level_current', 'level_modified', 'package_code', 'time_created']) \ .rename_field('contact_id', 'contact_id_level') dyf_student_assignment = Filter.apply( frame=dyf_student_assignments, f=lambda x: x['contact_id'] is not None and x['contact_id'] != '') try: dyf_assignments_contact = Join.apply(dyf_student_assignment, dyf_student_contact, "contact_id", "contactid") dyf_assignments_contact.printSchema() dyf_assignments_contact.show(2) df_assignments_contact = dyf_assignments_contact.toDF( ).dropDuplicates() df_log_student_level_study = dyf_log_student_level_study.toDF() df_temp = dyf_log_student_level_study.toDF() df_log_student_status = dyf_log_student_status.toDF() df_log_student_package = dyf_log_student_package.toDF() df_temp = df_temp.groupby( 'contact_id_level', 'level_current', 'package_code').agg( f.max("time_created").alias("time_created_max")) df_temp = df_temp.withColumnRenamed('contact_id_level', 'contact_id_join') \ .withColumnRenamed('package_code', 'package_code_join') df_student_level_study = df_temp.join( df_log_student_level_study, (df_temp['contact_id_join'] == df_log_student_level_study['contact_id_level']) & (df_temp['package_code_join'] == df_log_student_level_study['package_code']) & (df_temp['time_created_max'] == df_log_student_level_study['time_created']), "left") print "=========== . 
===========" df_student_level_study.printSchema() dyf_student_level_study = DynamicFrame.fromDF( df_student_level_study, glueContext, "dyf_student_level_study") dyf_student_level_study = dyf_student_level_study.select_fields([ 'contact_id_level', 'level_current', 'level_modified', 'package_code', 'time_created' ]) df_student_level_study = dyf_student_level_study.toDF() df_student_level_study.printSchema() df_assignments_contact_level = df_student_level_study.join( df_assignments_contact, (df_student_level_study['contact_id_level'] == df_assignments_contact['contact_id']) & (df_student_level_study['time_created'] <= df_assignments_contact['time_lms_created']), "right") df_assignments_contact_level = df_assignments_contact_level.withColumn( "level_modified_new", check_modified_null( df_assignments_contact_level.level_modified, df_assignments_contact_level.level_study)) df_assignments_contact_level.printSchema() df_assignments_contact_level.count() df_assignments_contact_level.show(10) dyf_assignments_contact_level = DynamicFrame.fromDF( df_assignments_contact_level, glueContext, "dyf_assignments_contact_level") dyf_assignments_contact_level = dyf_assignments_contact_level.select_fields( [ 'time_created', 'contact_id', 'student_id', 'advisor_id', 'level_study', 'time_lms_created', 'level_current', 'level_modified', 'package_code', 'time_modified', 'level_modified_new' ]) # dyf_join_temp = Filter.apply(frame=dyf_join, # f=lambda x: x["level_modified_new"] is None) # print "count: ", dyf_join_temp.count() ############ df_assignments_contact_level = dyf_assignments_contact_level.toDF() df_join_data = df_assignments_contact_level.join( df_log_student_status, (df_log_student_status['contact_id_status'] == df_assignments_contact_level['contact_id']) & (df_log_student_status['start_date'] <= df_assignments_contact_level['time_created']) & (df_log_student_status['end_date'] >= df_assignments_contact_level['time_created']), "left") df_join_full_data = df_join_data.join( df_log_student_package, (df_log_student_package['student_id_package'] == df_join_data['student_id']) & (df_log_student_package['start_time_package'] <= df_join_data['time_created']) & (df_log_student_package['end_time_package'] >= df_join_data['time_created']), "left") df_join_full_data = df_join_full_data.dropDuplicates() dyf_join_full_data = DynamicFrame.fromDF(df_join_full_data, glueContext, "dyf_join_full_data") dyf_join_full_data = Filter.apply( frame=dyf_join_full_data, f=lambda x: x["start_time_package"] is not None and x[ "end_time_package"] is not None) print "dyf_join_full_data: ", dyf_join_full_data.count() dyf_join_full_data.show(10) dyf_join_full_data = dyf_join_full_data.select_fields([ 'time_created', 'student_id', 'contact_id', 'package_code', 'time_modified', 'advisor_id', 'start_time_package', 'end_time_package', 'level_modified_new', 'status_code' ]) df_join_full_data = dyf_join_full_data.toDF() df_join_full_data = df_join_full_data.withColumn("transformed_at", unix_timestamp(f.current_timestamp())) \ .withColumn("student_id", check_student_id(df_join_full_data.student_id)) \ .withColumn("package_endtime", check_data(df_join_full_data.end_time_package, f.lit(package_endtime))) \ .withColumn("package_starttime", check_data(df_join_full_data.start_time_package, f.lit(package_starttime))) \ .withColumn("student_level_code", check_data_null(df_join_full_data.level_modified_new, f.lit(student_level_code))) \ .withColumn("student_status_code", check_data_null(df_join_full_data.status_code, f.lit(student_status_code))) \ 
.withColumn("behavior_id", f.lit(234).cast("long")) df_join_full_data.printSchema() print df_join_full_data.count() df_join_full_data.show(10) dyf_join_full_data = DynamicFrame.fromDF(df_join_full_data, glueContext, "dyf_join_full_data") # dyf_join.printSchema() # print dyf_join.count() # dyf_join.show(10) dyf_dong_tien_student = ApplyMapping.apply( frame=dyf_join_full_data, mappings=[ ("time_created", "long", "student_behavior_date", "long"), ("behavior_id", "long", "behavior_id", "long"), ("student_id", "string", "student_id", "long"), ("contact_id", "string", "contact_id", "string"), ("package_code", "string", "package_code", "string"), ("package_endtime", "int", "package_endtime", "long"), ("package_starttime", "int", "package_starttime", "long"), ("student_level_code", "string", "student_level_code", "string"), ("student_status_code", "string", "student_status_code", "string"), ("transformed_at", "long", "transformed_at", "long") ]) df_dong_tien_student = dyf_dong_tien_student.toDF() df_dong_tien_student2 = df_dong_tien_student.withColumn( 'student_behavior_id', f.md5( concaText(df_dong_tien_student.student_behavior_date, df_dong_tien_student.behavior_id, df_dong_tien_student.student_id, df_dong_tien_student.contact_id, df_dong_tien_student.package_code, df_dong_tien_student.package_endtime, df_dong_tien_student.package_starttime, df_dong_tien_student.student_level_code, df_dong_tien_student.student_status_code, df_dong_tien_student.transformed_at))) # df_dong_tien_student2 = df_dong_tien_student2.dropDuplicates() print "==============", df_dong_tien_student2.count() df_dong_tien_student2 = df_dong_tien_student2\ .groupby('student_behavior_id', 'student_behavior_date', 'behavior_id', 'transformed_at', 'contact_id', 'student_id', 'package_code', 'package_endtime', 'package_starttime', 'student_level_code')\ .agg(f.first('student_status_code')) print "==============", df_dong_tien_student2.count() dyf_dong_tien_student = DynamicFrame.fromDF( df_dong_tien_student2, glueContext, 'dyf_dong_tien_student') dyf_dong_tien_student = Filter.apply( frame=dyf_dong_tien_student, f=lambda x: x["contact_id"] is not None and x["contact_id" ] != '') applymapping1 = ApplyMapping.apply( frame=dyf_dong_tien_student, mappings=[ ("student_behavior_id", "string", "student_behavior_id", "string"), ("student_behavior_date", "long", "student_behavior_date", "long"), ("behavior_id", "long", "behavior_id", "long"), ("student_id", "long", "student_id", "long"), ("contact_id", "string", "contact_id", "string"), ("package_code", "string", "package_code", "string"), ("package_endtime", "long", "package_endtime", "long"), ("package_starttime", "long", "package_starttime", "long"), ("student_level_code", "string", "student_level_code", "string"), ("student_status_code", "string", "student_status_code", "string"), ("transformed_at", "long", "transformed_at", "long") ]) resolvechoice1 = ResolveChoice.apply( frame=applymapping1, choice="make_cols", transformation_ctx="resolvechoice1") dropnullfields1 = DropNullFields.apply( frame=resolvechoice1, transformation_ctx="dropnullfields1") print resolvechoice1.count() resolvechoice1.printSchema() resolvechoice1.show(10) print('START WRITE TO S3-------------------------') datasink6 = glueContext.write_dynamic_frame.from_options( frame=dropnullfields1, connection_type="s3", connection_options={ "path": "s3://dtsodin/student_behavior/student_behavior/", "partitionKeys": ["behavior_id"] }, format="parquet", transformation_ctx="datasink6") print('END WRITE TO 
S3-------------------------') # datasink1 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields1, # catalog_connection="glue_redshift", # connection_options={ # "dbtable": "student_behavior", # "database": "dts_odin" # }, # redshift_tmp_dir="s3a://dtsodin/temp/student_behavior/", # transformation_ctx="datasink1") # ghi flag # lay max key trong data source datasourceTmp = dyf_student_assignments.toDF() flag = datasourceTmp.agg({"_key": "max"}).collect()[0][0] flag_data = [flag] df = spark.createDataFrame(flag_data, "long").toDF('flag') # ghi de _key vao s3 df.write.parquet( "s3a://dtsodin/flag/toa_student_assignments.parquet", mode="overwrite") except Exception as e: print e
def _to_pandas(rows):
    pd_df = pd.DataFrame(list(rows))
    return [pd_df]


def to_pandas(df, num_partitions=None):
    """
    :param df: spark DataFrame
    :param num_partitions: number of partitions to repartition the spark DataFrame into
                           (default: None, i.e. keep the current partitioning)
    :return:
    """
    if num_partitions is not None:
        df = df.repartition(num_partitions)
    pd_dfs = df.rdd.mapPartitions(_to_pandas).collect()
    pd_df = pd.concat(pd_dfs)
    pd_df.columns = df.columns
    return pd_df


spark = SparkSession.builder \
    .appName("smartCity") \
    .master("local[5]") \
    .getOrCreate()

df = spark.range(50).withColumn("value", f.md5(f.lit("abcd")))
df2 = df.cache()
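# Small usage sketch continuing from the session and cached df2 defined above: it converts
# the Spark DataFrame to pandas partition by partition via to_pandas. The expected shape
# comment reflects the 50-row, two-column ('id', 'value') frame built above.
pdf = to_pandas(df2, num_partitions=5)
print(pdf.shape)   # (50, 2)
print(pdf.head())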
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.transforms import ApplyMapping, DropNullFields, Filter, Join, ResolveChoice
from pyspark.context import SparkContext
from pyspark.sql import functions as f
from pyspark.sql.functions import from_unixtime, udf
from pyspark.sql.types import StringType


def main():
    def checknull(level_modified, level_study):
        # Prefer the modified level when it exists, otherwise fall back to the study level.
        if level_modified is not None:
            return level_modified
        else:
            return level_study

    checknull_ = udf(checknull, StringType())

    def concaText(student_behavior_date, behavior_id, student_id, contact_id,
                  package_code, package_endtime, package_starttime,
                  student_level_code, student_package_status_code, transformed_at):
        # Concatenate every non-null value into one string; the result is hashed with md5 below.
        text_concat = ""
        if student_behavior_date is not None:
            text_concat += str(student_behavior_date)
        if behavior_id is not None:
            text_concat += str(behavior_id)
        if student_id is not None:
            text_concat += str(student_id)
        if contact_id is not None:
            text_concat += str(contact_id)
        if package_code is not None:
            text_concat += str(package_code)
        if package_endtime is not None:
            text_concat += str(package_endtime)
        if package_starttime is not None:
            text_concat += str(package_starttime)
        if student_level_code is not None:
            text_concat += str(student_level_code)
        if student_package_status_code is not None:
            text_concat += str(student_package_status_code)
        if transformed_at is not None:
            text_concat += str(transformed_at)
        return text_concat

    concaText = udf(concaText, StringType())

    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    dyf_student_contact = glueContext.create_dynamic_frame.from_catalog(
        database="tig_advisor", table_name="student_contact")
    dyf_student_contact = dyf_student_contact.select_fields(
        ['student_id', 'contact_id', 'level_study'])

    dyf_log_student_level_study = glueContext.create_dynamic_frame.from_catalog(
        database="tig_advisor", table_name="log_student_level_study")
    dyf_log_student_level_study = dyf_log_student_level_study.select_fields([
        'contact_id', 'level_current', 'level_modified', 'package_code', 'time_created'
    ])
    dyf_log_student_level_study = dyf_log_student_level_study.resolveChoice(
        specs=[('_key', 'cast:int')])

    dyf_tpe_invoice_product = glueContext.create_dynamic_frame.from_catalog(
        database="tig_market", table_name="tpe_invoice_product")
    dyf_tpe_invoice_product = dyf_tpe_invoice_product.select_fields([
        '_key', 'timecreated', 'user_id', 'buyer_id', 'invoice_packages_price',
        'invoice_price', 'invoice_code'
    ])
    dyf_tpe_invoice_product = dyf_tpe_invoice_product.resolveChoice(
        specs=[('_key', 'cast:long')])

    dyf_tpe_invoice_product_details = glueContext.create_dynamic_frame.from_catalog(
        database="tig_market", table_name="tpe_invoice_product_details")
    dyf_tpe_invoice_product_details = dyf_tpe_invoice_product_details.select_fields(
        ['cat_code', 'package_time', 'invoice_code'])

    dyf_student_package = glueContext.create_dynamic_frame.from_catalog(
        database="tig_advisor", table_name="log_student_package")
    # select only the fields we need
    dyf_student_package = dyf_student_package.select_fields(
        ['student_id', 'start_time', 'end_time', 'package_code']
    ).rename_field('student_id', 'student_id1')

    dyf_student_package.printSchema()
    dyf_student_package.show(2)

    # read the processing flag from S3
    try:
        # read the flag checkpoint from S3
        df_flag = spark.read.parquet(
            "s3a://dtsodin/flag/student_behavior/flag_hoc_vien_duoc_mua_goi_nap_tien.parquet")
        start_read = df_flag.collect()[0]['flag']
        print('read from index: ', start_read)

        # compare the datasource _key with the flag and keep only rows whose _key is greater
        dyf_tpe_invoice_product = Filter.apply(
            frame=dyf_tpe_invoice_product, f=lambda x: x['_key'] > start_read)
    except Exception:
        print('read flag file error')

    print('the number of new contacts: ', dyf_tpe_invoice_product.count())

    if dyf_tpe_invoice_product.count() > 0:
        df_log_student_level_study = dyf_log_student_level_study.toDF()
        # keep only the latest level record per contact/level/package
        df_log_student_level_study = df_log_student_level_study.groupby(
            'contact_id', 'level_current', 'level_modified', 'package_code'
        ).agg(f.max('time_created').alias('time_created'))

        dyf_join0 = Join.apply(dyf_tpe_invoice_product,
                               dyf_tpe_invoice_product_details,
                               'invoice_code', 'invoice_code')
        print("@@@@@@@@@@@@")
        dyf_join0.printSchema()
        dyf_join0.show(2)

        dyf_log_student_level_study = DynamicFrame.fromDF(
            df_log_student_level_study, glueContext, "dyf_log_student_level_study")

        dyf_join1 = Join.apply(dyf_student_contact, dyf_join0, "contact_id", "user_id")
        dyf_join = Join.apply(dyf_join1, dyf_log_student_level_study, "user_id", "contact_id")
        print("@@@@@@@@@@@@")
        dyf_join.printSchema()
        dyf_join.show(2)

        # only keep level records created before the purchase
        dyf_join = Filter.apply(
            frame=dyf_join, f=lambda x: x['time_created'] <= x['timecreated'])

        dyf_data_join3 = Join.apply(dyf_join, dyf_student_package, "student_id", "student_id1")
        dyf_data_join3 = Filter.apply(
            frame=dyf_data_join3, f=lambda x: x['package_code'] == x['cat_code'])

        df_data_join3 = dyf_data_join3.toDF()
        df_data_join3 = df_data_join3 \
            .withColumn("student_level_code",
                        checknull_(df_data_join3.level_modified, df_data_join3.level_study)) \
            .withColumn("behavior_id", f.lit(3)) \
            .withColumn("student_package_status_code", f.lit("DEACTIVED")) \
            .withColumn("student_behavior_date", from_unixtime(df_data_join3.timecreated)) \
            .withColumn("package_starttime", df_data_join3['start_time']) \
            .withColumn("package_endtime", df_data_join3['end_time']) \
            .withColumn("transformed_at", f.lit(None))

        # build a deterministic surrogate key from the concatenated behaviour attributes
        df_data_join3 = df_data_join3.withColumn(
            'student_behavior_id',
            f.md5(
                concaText(df_data_join3.student_behavior_date,
                          df_data_join3.behavior_id,
                          df_data_join3.student_id,
                          df_data_join3.contact_id,
                          df_data_join3.package_code,
                          df_data_join3.package_endtime,
                          df_data_join3.package_starttime,
                          df_data_join3.student_level_code,
                          df_data_join3.student_package_status_code,
                          df_data_join3.transformed_at)))
        df_data_join3 = df_data_join3.dropDuplicates()

        dyf_data_join3 = DynamicFrame.fromDF(df_data_join3, glueContext, "dyf_data_join3")
        dyf_data_join3 = dyf_data_join3.resolveChoice(
            specs=[('behavior_id', 'cast:int'),
                   ('student_behavior_date', 'cast:timestamp')])
        dyf_data_join3.printSchema()
        dyf_data_join3.show(2)

        applymapping = ApplyMapping.apply(
            frame=dyf_data_join3,
            mappings=[("student_behavior_id", "string", "student_behavior_id", "string"),
                      ("contact_id", "string", "contact_id", "string"),
                      ("student_behavior_date", "timestamp", "student_behavior_date", "long"),
                      ("student_id", "string", "student_id", "long"),
                      ("cat_code", "string", "package_code", "string"),
                      ("package_starttime", "int", "package_starttime", "long"),
                      ("package_endtime", "int", "package_endtime", "long"),
                      ("student_package_status_code", "string", "student_status_code", "string"),
                      ("behavior_id", "int", "behavior_id", "long"),
                      ("student_level_code", "string", "student_level_code", "string")])
        resolvechoice = ResolveChoice.apply(
            frame=applymapping, choice="make_cols", transformation_ctx="resolvechoice")
        dropnullfields = DropNullFields.apply(
            frame=resolvechoice, transformation_ctx="dropnullfields")
        print(dropnullfields.count())
        dropnullfields.toDF().show()

        glueContext.write_dynamic_frame.from_options(
            frame=dropnullfields,
            connection_type="s3",
            connection_options={
                "path": "s3://dtsodin/student_behavior/student_behavior",
                "partitionKeys": ["behavior_id"]
            },
            format="parquet")

        applymapping1 = ApplyMapping.apply(
            frame=dyf_data_join3,
            mappings=[("invoice_packages_price", "int", "measure1", "long"),
                      ("behavior_id", "int", "behavior_id", "long"),
                      ("invoice_price", "int", "measure2", "long")])
        resolvechoice1 = ResolveChoice.apply(
            frame=applymapping1, choice="make_cols", transformation_ctx="resolvechoice1")
        dropnullfields1 = DropNullFields.apply(
            frame=resolvechoice1, transformation_ctx="dropnullfields1")
        print(dropnullfields1.count())
        dropnullfields1.toDF().show()

        glueContext.write_dynamic_frame.from_options(
            frame=dropnullfields1,
            connection_type="s3",
            connection_options={
                "path": "s3://dtsodin/student_behavior/student_general_behavior",
                "partitionKeys": ["behavior_id"]
            },
            format="parquet")

        df_tpe_invoice_product = dyf_tpe_invoice_product.toDF()
        flag = df_tpe_invoice_product.agg({"_key": "max"}).collect()[0][0]
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')

        # overwrite the _key checkpoint in S3
        df.write.parquet(
            "s3a://dtsodin/flag/student_behavior/flag_hoc_vien_duoc_mua_goi_nap_tien.parquet",
            mode="overwrite")
# Helper functions (retrieve_dynamic_frame, retrieve_data_frame_from_redshift, display,
# data_frame_filter_not_null, round_duration_udf, concat_text_udf, from_data_frame, get_flag,
# save_data_to_s3, save_flag) and the job configuration values (glue_context, spark, IS_DEV,
# flag_file and the S3 paths/partitions) are defined elsewhere in this job.
from datetime import datetime

from pyspark import StorageLevel
from pyspark.sql.functions import col, count, expr, from_unixtime, lit, max, md5, min, sum


def main():
    # ========== init

    # =========== create_dynamic_frame
    df_log_in_out = retrieve_dynamic_frame(glue_context, 'native_livestream', 'log_in_out',
                                           casts=[('time_in', 'cast:long'),
                                                  ('time_out', 'cast:long'),
                                                  ('thoigianhoc', 'cast:long')])
    if IS_DEV:
        print('df_log_in_out')
        df_log_in_out.printSchema()
        df_log_in_out.show(3)
    # display(data_frame=df_log_in_out, message="dyf_log_in_out")

    # ========== clear data
    # df_log_in_out = filter_latest(spark=spark, data_frame=df_log_in_out, config_file=flag_file)
    df_log_in_out_origin = df_log_in_out
    if df_log_in_out.count() < 1:
        # nothing new to process
        return

    df_log_in_out = df_log_in_out.dropDuplicates(['student_id', 'room_id', 'time_in'])
    df_log_in_out = df_log_in_out.groupBy('student_id', 'room_id').agg(
        min('time_in').alias('time_in'),
        max('time_out').alias('time_out'),
        sum('thoigianhoc').alias('thoigianhoc'),
        count('time_in').cast('long').alias('number_in_out'))

    if IS_DEV:
        print('df_log_in_out_after_group_by_room')
        df_log_in_out.printSchema()
        df_log_in_out.show(3)

    df_log_in_out = df_log_in_out \
        .withColumn('student_behavior_date', df_log_in_out['time_in']) \
        .withColumn('end_learning_time', df_log_in_out['time_out']) \
        .withColumn('duration', df_log_in_out['thoigianhoc'])

    # timestamps arrive in milliseconds; convert to seconds
    df_log_in_out = df_log_in_out.withColumn(
        'student_behavior_date', expr("student_behavior_date div 1000"))
    df_log_in_out = df_log_in_out.withColumn(
        'start_learning_time', df_log_in_out['student_behavior_date'])
    df_log_in_out = df_log_in_out.withColumn(
        'end_learning_time', expr("end_learning_time div 1000"))
    df_log_in_out = df_log_in_out.withColumn(
        'duration', round_duration_udf(expr("duration div 1000")))

    df_log_in_out = data_frame_filter_not_null(df_log_in_out, [
        'student_behavior_date', 'student_id', 'start_learning_time',
        'end_learning_time', 'duration'
    ])

    display(data_frame=df_log_in_out, message="dyf_log_in_out clear data")

    # =========== create_dynamic_frame
    df_streaming_calendar_teach = retrieve_dynamic_frame(
        glue_context, 'topicalms', 'streaming_calendar_teach',
        ['id', 'type_class', 'teacher_available_id', 'assistant_id', 'hour_id'],
        [('room_id', 'cast:int')])

    # dyf_streaming_material = retrieve_dynamic_frame(glue_context, 'topicalms', 'streaming_material',
    #                                                 ['calendar_teach_id', 'subject'])

    df_student_contact = retrieve_dynamic_frame(glue_context, 'tig_advisor', 'student_contact',
                                                ['contact_id', 'student_id'])

    df_student_level = retrieve_data_frame_from_redshift(
        glue_context, 'transaction_log', 'ad_student_level',
        ['contact_id', 'level_code', 'start_date', 'end_date'])
    df_student_level = df_student_level.withColumnRenamed('contact_id', 'contact_id_level')
    display(df_student_level, "df_student_level")

    df_student_package = retrieve_data_frame_from_redshift(
        glue_context, 'transaction_log', 'ad_student_package',
        ['contact_id', 'package_code', 'package_status_code',
         'package_start_time', 'package_end_time'])
    df_student_package = df_student_package.withColumnRenamed('contact_id', 'contact_id_package')
    display(df_student_package, "df_student_package")

    df_student_advisor = retrieve_data_frame_from_redshift(
        glue_context, 'transaction_log', 'ad_student_advisor',
        ['contact_id', 'advisor_id', 'start_date', 'end_date'])
    df_student_advisor = df_student_advisor \
        .withColumnRenamed('contact_id', 'contact_id_advisor') \
        .withColumnRenamed('start_date', 'start_date_advisor') \
        .withColumnRenamed('end_date', 'end_date_advisor')
    display(df_student_advisor, "df_student_advisor")

    # ============ join student_contact table
    df_result = df_log_in_out.join(df_student_contact, on=['student_id'], how='left_outer')
    df_result = data_frame_filter_not_null(df_result, ['contact_id'])
    display(df_result, "join student_contact table")

    # ============ join streaming_calendar_teach table
    df_result = df_result.join(
        df_streaming_calendar_teach,
        df_result.room_id == df_streaming_calendar_teach.id,
        how='left_outer')
    df_result = df_result.withColumnRenamed('teacher_available_id', 'teacher_id')
    df_result = df_result.withColumnRenamed('type_class', 'class_type')
    df_result = df_result \
        .where(col('class_type').eqNullSafe('NORMAL')) \
        .withColumn('class_type', lit('LIVESTREAM').cast('string'))
    df_result = data_frame_filter_not_null(df_result, ['room_id', 'class_type', 'hour_id'])
    display(df_result, "join streaming_calendar_teach table")

    # match each behaviour into the validity window of level, package and advisor
    df_result = df_result \
        .join(df_student_level,
              (df_result.contact_id == df_student_level.contact_id_level)
              & (df_result.student_behavior_date >= df_student_level.start_date)
              & (df_result.student_behavior_date < df_student_level.end_date),
              'left') \
        .join(df_student_package,
              (df_result.contact_id == df_student_package.contact_id_package)
              & (df_result.student_behavior_date >= df_student_package.package_start_time)
              & (df_result.student_behavior_date < df_student_package.package_end_time),
              'left') \
        .join(df_student_advisor,
              (df_result.contact_id == df_student_advisor.contact_id_advisor)
              & (df_result.student_behavior_date >= df_student_advisor.start_date_advisor)
              & (df_result.student_behavior_date < df_student_advisor.end_date_advisor),
              'left')
    display(df_result, "join df_student_level, df_student_package, df_student_advisor table")

    # ============ add column
    df_result = df_result \
        .withColumn('behavior_id', lit(13).cast('long')) \
        .withColumnRenamed('level_code', 'student_level_code') \
        .withColumn('transformed_at', lit(datetime.now()).cast("long")) \
        .withColumn('platform', lit('MOBILE').cast('string')) \
        .withColumn('teacher_type', lit(None).cast('string')) \
        .withColumn('year_month_id',
                    lit(from_unixtime(df_result['student_behavior_date'], format="yyyyMM")).cast('long')) \
        .withColumn('role_in_class', lit('UNAVAILABLE')) \
        .withColumn('vcr_type', lit('UNAVAILABLE'))

    contact_id_unavailable = 0
    student_id_unavailable = 0
    package_endtime_unavailable = 99999999999
    package_starttime_unavailable = 0
    student_level_code_unavailable = 'UNAVAILABLE'
    student_status_code_unavailable = 'UNAVAILABLE'
    package_code_unavailable = 'UNAVAILABLE'
    class_type_unavailable = 'UNAVAILABLE'
    teacher_type_unavailable = 'UNAVAILABLE'
    advisor_unavailable = 0
    measure1_unavailable = float(0.0)
    measure2_unavailable = float(0.0)
    measure3_unavailable = float(0.0)
    measure4_unavailable = float(0.0)
    role_in_class_unavailable = 'UNAVAILABLE'
    number_in_out_unavailable = 1

    df_result = df_result.na.fill({
        'package_code': package_code_unavailable,
        'student_level_code': student_level_code_unavailable,
        'package_status_code': student_status_code_unavailable,
        'class_type': class_type_unavailable,
        'teacher_type': teacher_type_unavailable,
        'advisor_id': advisor_unavailable,
        'role_in_class': role_in_class_unavailable,
        'number_in_out': number_in_out_unavailable
    })

    df_result = df_result.dropDuplicates()

    # deterministic surrogate key from the concatenated behaviour attributes
    df_result = df_result.withColumn(
        'student_behavior_id',
        md5(
            concat_text_udf(df_result.student_behavior_date,
                            df_result.behavior_id,
                            df_result.student_id,
                            df_result.contact_id,
                            df_result.package_code,
                            df_result.student_level_code,
                            df_result.package_status_code,
                            df_result.transformed_at)))

    df_result.persist(StorageLevel.DISK_ONLY_2)
    display(df_result, "df_result add column")

    # ============ select student_behavior
    dyf_result = from_data_frame(data_frame=df_result, glue_context=glue_context, name='dyf_result')

    dyf_student_behavior = dyf_result.select_fields([
        'student_behavior_id', 'student_behavior_date', 'behavior_id',
        'student_id', 'contact_id', 'package_code', 'student_level_code',
        'package_status_code', 'advisor_id', 'transformed_at', 'year_month_id'
    ])
    dyf_student_behavior = dyf_student_behavior.resolveChoice([
        ('student_behavior_id', 'cast:string'),
        ('student_behavior_date', 'cast:long'),
        ('behavior_id', 'cast:int'),
        ('student_id', 'cast:long'),
        ('contact_id', 'cast:string'),
        ('package_code', 'cast:string'),
        ('student_level_code', 'cast:string'),
        ('package_status_code', 'cast:string'),
        ('advisor_id', 'cast:long'),
        ('transformed_at', 'cast:long'),
        ('year_month_id', 'cast:long')
    ])

    # ============ select student_learning
    dyf_student_learning = dyf_result.select_fields([
        'student_behavior_id', 'class_type', 'platform', 'teacher_id',
        'teacher_type', 'assistant_id', 'hour_id', 'start_learning_time',
        'end_learning_time', 'duration', 'behavior_id', 'year_month_id',
        'role_in_class', 'number_in_out', 'vcr_type'
    ])
    dyf_student_learning = dyf_student_learning.resolveChoice([
        ('student_behavior_id', 'cast:string'),
        ('class_type', 'cast:string'),
        ('platform', 'cast:string'),
        ('teacher_id', 'cast:long'),
        ('teacher_type', 'cast:string'),
        ('assistant_id', 'cast:long'),
        ('hour_id', 'cast:int'),
        ('start_learning_time', 'cast:long'),
        ('end_learning_time', 'cast:long'),
        ('duration', 'cast:long'),
        ('behavior_id', 'cast:int'),
        ('year_month_id', 'cast:long'),
        ('role_in_class', 'cast:string'),
        ('number_in_out', 'cast:long'),
        ('vcr_type', 'cast:string')
    ])

    df_flag = get_flag(spark=spark, data_frame=df_log_in_out_origin)

    # ============ save
    display(dyf_student_behavior, 'dyf_student_behavior')
    display(dyf_student_learning, 'dyf_student_learning')
    display(df_flag, "df_flag")

    save_data_to_s3(glue_context, dyf_student_behavior, student_behavior_s3_path,
                    student_behavior_s3_partition)
    save_data_to_s3(glue_context, dyf_student_learning, student_learning_s3_path,
                    student_learning_s3_partition)

    if df_flag.collect()[0]['flag'] is not None:
        print('save_flag done')
        save_flag(df_flag, flag_file)

    df_result.unpersist()
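# All three Redshift lookups above are joined the same way: the behaviour timestamp must fall
# inside the lookup record's validity window. A minimal, self-contained sketch of that half-open
# interval join (column names follow the job above; the rows are invented for illustration only):
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

behaviors = spark.createDataFrame(
    [('c1', 1650000000), ('c1', 1700000000)],
    ['contact_id', 'student_behavior_date'])

levels = spark.createDataFrame(
    [('c1', 'A1', 1600000000, 1660000000),
     ('c1', 'A2', 1660000000, 9999999999)],
    ['contact_id_level', 'level_code', 'start_date', 'end_date'])

# Keep the level that was active when the behaviour happened: [start_date, end_date).
matched = behaviors.join(
    levels,
    (behaviors.contact_id == levels.contact_id_level)
    & (behaviors.student_behavior_date >= levels.start_date)
    & (behaviors.student_behavior_date < levels.end_date),
    'left')

matched.select('contact_id', 'student_behavior_date', 'level_code').show()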
logging.shutdown()
dbutils.notebook.exit('Failed')

# COMMAND ----------

# MAGIC %md
# MAGIC ##### Adding additional columns to the `DataFrame`
# MAGIC * The command below adds two columns to the `DataFrame`:
# MAGIC * a `channels_key` column generated with `md5` from the `CHANNEL_ID` column of the source table
# MAGIC * a `load_date` column populated with `current_timestamp()`

# COMMAND ----------

from pyspark.sql.functions import md5, col, concat, lit, current_timestamp

# Derive a deterministic surrogate key from the source ID and stamp the load time.
df_channels = df_channels.withColumn("channels_key", md5(concat(df_channels.CHANNEL_ID))) \
    .withColumn('load_date', lit(current_timestamp()))

# COMMAND ----------

# MAGIC %md
# MAGIC ##### Creating a `DataFrame` from the target table based on a single ID column
# MAGIC * We use the command below to load the existing data from the target table so the incremental data from the source can be compared against it.

# COMMAND ----------

# Since Spark 2.3, queries over raw JSON/CSV files are disallowed when the referenced columns
# only include the internal corrupt record column (named _corrupt_record by default).
# For example: spark.read.schema(schema).csv(file).filter($"_corrupt_record".isNotNull).count()
# and spark.read.schema(schema).csv(file).select("_corrupt_record").show().
# Instead, you can cache or save the parsed results and then send the same query.

# Cache the parsed results before filtering out corrupt records
df_channels.cache()
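# COMMAND ----------

# MAGIC %md
# MAGIC ##### Sketch: comparing source rows against the target on `channels_key`
# MAGIC * The target-table read itself is not shown in this excerpt; the cell below is a minimal sketch of one way to do the comparison.
# MAGIC * The target table name `dim_channels` is an assumption for illustration only; it is expected to already carry a `channels_key` column.

# COMMAND ----------

# Hypothetical target table name; adjust to the actual target of this load.
df_target_keys = spark.table("dim_channels").select("channels_key")

# Keep only the source rows whose md5 key is not present in the target yet.
df_new_channels = df_channels.join(df_target_keys, on="channels_key", how="left_anti")
df_new_channels.show(5)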
from pyspark.sql import functions as F
from pyspark.sql.functions import md5


def add_md5(target_df):
    # `func` is assumed to map each input Row to a Row that carries a `concat_val` field
    # holding the concatenated values to be hashed.
    row_rdd = target_df.rdd.map(func)
    concat_df = row_rdd.toDF()
    # Deterministic hash of the concatenated value; keep concat_val around for debugging.
    hash_df = concat_df.withColumn("hash_id", md5(F.col("concat_val")))  # .drop(F.col("concat_val"))
    return hash_df
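# Example use of add_md5, assuming `func` maps each Row to a Row with an extra `concat_val`
# field. The original `func` is not shown, so the stand-in below is hypothetical:
from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.getOrCreate()


def func(row):
    # Hypothetical stand-in: join every value of the row into one delimited string.
    return Row(**row.asDict(), concat_val='__'.join(str(v) for v in row))


target_df = spark.createDataFrame([(1, 'alice'), (2, 'bob')], ['id', 'name'])
add_md5(target_df).show(truncate=False)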