Example 1
def extract_imits_tsv_allele_2(spark_session: SparkSession,
                               file_path: str) -> DataFrame:
    imits_df = utils.extract_tsv(spark_session, file_path)
    # Generate a placeholder accession ID for alleles that have not been released yet
    imits_df = imits_df.withColumn(
        "allele_mgi_accession_id",
        when(
            (col("allele_mgi_accession_id").isNull()) &
            (col("type") == "Allele"),
            concat(lit("NOT-RELEASED-"),
                   substring(md5(col("allele_symbol")), 0, 10)),
        ).otherwise(col("allele_mgi_accession_id")),
    )
    imits_df = imits_df.withColumn(
        "marker_mgi_accession_id",
        when(
            (col("marker_mgi_accession_id").isNull()) &
            (col("type") == "Gene"),
            concat(lit("NOT-RELEASED-"),
                   substring(md5(col("marker_symbol")), 0, 10)),
        ).otherwise(col("marker_mgi_accession_id")),
    )
    imits_df = imits_df.withColumn(
        "allele2_id",
        monotonically_increasing_id().astype(StringType()))
    # Split pipe-separated multivalued columns into arrays
    for col_name in ALLELE2_MULTIVALUED:
        imits_df = imits_df.withColumn(
            col_name,
            when(
                col(col_name).contains("|"),
                split(col_name, r"\|"),
            ).otherwise(array(col_name)),
        )
    return imits_df
Example 2
def get_body_weight_curve_observations(
        unidimensional_observations_df: DataFrame):
    body_weight_curve_df = None
    for (
            parameter_stable_id,
            parameter_data,
    ) in Constants.BODY_WEIGHT_CURVE_PARAMETERS.items():
        bwt_observations = unidimensional_observations_df.where(
            col("parameter_stable_id").isin(parameter_data["parameters"]))
        bwt_observations = bwt_observations.withColumn(
            "procedure_stable_id", lit(parameter_data["procedure_stable_id"]))
        bwt_observations = bwt_observations.withColumn(
            "parameter_stable_id", lit(parameter_stable_id))
        bwt_observations = bwt_observations.withColumn(
            "procedure_group", lit(parameter_data["procedure_group"]))
        bwt_observations = bwt_observations.withColumn(
            "procedure_name", lit(parameter_data["procedure_name"]))
        bwt_observations = bwt_observations.withColumn(
            "parameter_name", lit(parameter_data["parameter_name"]))
        bwt_observations = bwt_observations.withColumn(
            "experiment_id",
            md5(concat(lit(parameter_stable_id + "_"), col("experiment_id"))),
        )
        bwt_observations = bwt_observations.withColumn(
            "observation_id",
            md5(concat(lit(parameter_stable_id + "_"), col("observation_id"))),
        )
        bwt_observations = bwt_observations.withColumn(
            "experiment_source_id",
            concat(lit(parameter_stable_id + "_"),
                   col("experiment_source_id")),
        )
        bwt_observations = bwt_observations.withColumn("metadata_group",
                                                       md5(lit("")))
        bwt_observations = bwt_observations.withColumn(
            "metadata",
            array(concat(lit("Source experiment id: "), col("experiment_id"))),
        )

        bwt_observations = bwt_observations.withColumn("observation_type",
                                                       lit("time_series"))
        bwt_observations = bwt_observations.withColumn("discrete_point",
                                                       col("age_in_weeks"))
        bwt_observations = bwt_observations.withColumn(
            "time_point", col("date_of_experiment"))
        if body_weight_curve_df is None:
            body_weight_curve_df = bwt_observations
        else:
            body_weight_curve_df = body_weight_curve_df.union(bwt_observations)
    return body_weight_curve_df.drop_duplicates()
Example 3
def create_md5_hash_keys(df, grouping_cols):
    return (
        df
        .withColumn('hash_key', F.md5(F.concat_ws('__', *grouping_cols)))
        .select(*grouping_cols, 'hash_key', 'amount')
        .sort(grouping_cols)
    )
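A minimal usage sketch for create_md5_hash_keys, assuming a local SparkSession; the store/month/amount data below is invented and only exists to satisfy the hard-coded 'amount' column selection.
# Hypothetical usage of create_md5_hash_keys on invented data.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").getOrCreate()
sales = spark.createDataFrame(
    [("shop_a", "2021-01", 10.0), ("shop_b", "2021-01", 25.5)],
    ["store", "month", "amount"],
)
keyed = create_md5_hash_keys(sales, ["store", "month"])
keyed.show(truncate=False)  # hash_key = md5 of the grouping values joined by '__'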
Example 4
def generate_unique_id(dcc_experiment_df: DataFrame):
    """
    Generates a unique_id column using as input every column
    except those that have non-unique values and
    the ones that correspond to parameter values.
    Given that _sequenceID can be null, the function transforms it to the string "NA"
    when it is null, to avoid nullifying the concat.
    It concatenates the unique set of values and then applies
    an MD5 hash function to the resulting string.
    :param dcc_experiment_df:
    :return: DataFrame
    """
    non_unique_columns = [
        "_type",
        "_sourceFile",
        "_VALUE",
        "procedureMetadata",
        "statusCode",
        "_sequenceID",
        "_project",
    ]
    if "_sequenceID" in dcc_experiment_df.columns:
        dcc_experiment_df = dcc_experiment_df.withColumn(
            "_sequenceIDStr",
            when(col("_sequenceID").isNull(),
                 lit("NA")).otherwise(col("_sequenceID")),
        )
    unique_columns = [
        col_name for col_name in dcc_experiment_df.columns
        if col_name not in non_unique_columns
        and not col_name.endswith("Parameter")
    ]
    dcc_experiment_df = dcc_experiment_df.withColumn(
        "unique_id", md5(concat(*unique_columns)))
    return dcc_experiment_df
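A hedged sketch of how generate_unique_id could be exercised on a toy frame; the column names below are illustrative stand-ins, not the real DCC experiment schema.
# Illustrative only: a tiny DataFrame standing in for DCC experiment data.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").getOrCreate()
toy_experiments = spark.createDataFrame(
    [("EXP001", "centre_a", None), ("EXP002", "centre_b", "1")],
    ["_experimentID", "_centreID", "_sequenceID"],
)
# _sequenceID is null for the first row, so it is replaced with "NA"
# before the MD5 hash is computed over the remaining columns.
generate_unique_id(toy_experiments).select("_experimentID", "unique_id").show(truncate=False)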
Example 5
def map_line_columns(line_df: DataFrame):
    for field, value in Constants.LINE_TO_OBSERVATION_MAP.items():
        if value is not None:
            line_df = line_df.withColumn(field, col(value))
        else:
            line_df = line_df.withColumn(field, lit(None))
    line_df = line_df.withColumn("biological_sample_group",
                                 lit("experimental"))
    line_df = line_df.withColumn("zygosity", lit("homozygote"))
    line_df = line_df.withColumn(
        "datasource_name",
        when(col("_dataSource") == "impc", lit("IMPC")).otherwise(
            when(col("_dataSource") == "europhenome",
                 lit("EuroPhenome")).otherwise(col("_dataSource"))),
    )
    line_df = line_df.withColumn(
        "allele_accession_id",
        when(col("biological_sample_group") == "control", lit(None)).otherwise(
            when(
                col("allele.mgiAlleleID").isNull(),
                concat(
                    lit("NOT-RELEASED-"),
                    substring(md5(line_df["allele_symbol"]), 0, 10),
                ),
            ).otherwise(col("allele.mgiAlleleID"))),
    )
    return line_df
Example 6
def save_to_stage(rdd):
    """
    This method handles the Kafka messages - we simply want to save them to the staging index for processing.
    """
    # If we get an empty message, do nothing (should not happen!)
    if rdd.isEmpty():
        return

    esconf = {}
    esconf["es.mapping.id"] = 'message_id'
    esconf["es.index.auto.create"] = "true"
    esconf["es.nodes"] = ip
    esconf["es.port"] = port
    esconf["es.nodes.wan.only"] = "true"
    esconf["es.write.operation"] = "index"
    sqlContext = SQLContext.getOrCreate(SparkContext.getOrCreate())
    df = sqlContext.createDataFrame(rdd, samplingRatio=1).toDF(
        "topic", "key", "value")
    # Add Identifier, Received Timestamp, and Boolean Flag to indicate not processed
    df = df.drop(df.key)
    df = df.withColumn('message_id', f.md5(df.value))
    df = df.withColumn("message_recieved_ts", f.lit(f.current_timestamp()))
    df = df.withColumn("message_processed", f.lit('false'))
    df.write.format("org.elasticsearch.spark.sql").options(
        **esconf).mode("append").save(resource)
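A hedged sketch of driving save_to_stage by hand with an RDD of (topic, key, value) tuples standing in for a Kafka micro-batch; it assumes the module-level ip, port and resource settings are configured and an Elasticsearch node is reachable.
# Hedged sketch: exercising save_to_stage with a hand-built RDD of
# (topic, key, value) tuples in place of a real Kafka micro-batch.
from pyspark import SparkContext

sc = SparkContext.getOrCreate()
batch = sc.parallelize([
    ("staging-topic", "k1", '{"event": "created"}'),
    ("staging-topic", "k2", '{"event": "updated"}'),
])
save_to_stage(batch)  # indexes both messages into the staging index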
Example 7
def convertAndSaveS3(df):
    df = df.withColumn("year_month_id", f.from_unixtime('student_behavior_date', format="yyyyMM"))
    df = df.withColumn('student_behavior_id',
                       f.md5(concaText(
                           df.student_behavior_date,
                           df.behavior_id,
                           df.student_id,
                           df.contact_id,
                           df.package_code,
                           df.package_status_code,
                           df.student_level_code,
                           df.transformed_at)))

    dyf = DynamicFrame.fromDF(df, glueContext, "dyf")

    if is_dev:
        print('dyf____________________________')
        dyf.printSchema()
        dyf.show(10)

    behavior_mapping = mappingForAll(dyf, MAPPING)

    if behavior_mapping.count() > 0:
        parquetToS3(dyf=behavior_mapping, path="s3://toxd-olap/transaction_log/student_behavior/sb_student_behavior")
    rate_mapping = mappingForAll(dyf, RATE_MAPPING)
    if rate_mapping.count() > 0:
        parquetToS3(dyf=rate_mapping, path="s3://toxd-olap/transaction_log/student_behavior/sb_student_rating")
Example 8
def extract_imits_tsv_by_entity_type(spark_session: SparkSession,
                                     file_path: str,
                                     entity_type: str) -> DataFrame:
    """
    Uses a Spark Session to generate a DataFrame from a TSV file and a specific entity type.
    Can extract Genes or Alleles from an Alleles report file produced by IMITS.
    :param spark_session: spark SQL session to be used in the extraction
    :param file_path: path to the TSV file
    :param entity_type: 'Allele' or 'Gene'
    :return: Spark DataFrame with the extracted data
    """
    imits_df = utils.extract_tsv(spark_session, file_path)
    imtis_entity_df = imits_df.where(imits_df.type == entity_type)
    if entity_type == "Allele":
        imtis_entity_df = imtis_entity_df.withColumn(
            "acc",
            when(
                col("allele_mgi_accession_id").isNull(),
                concat(lit("NOT-RELEASED-"),
                       substring(md5(col("allele_symbol")), 0, 10)),
            ).otherwise(col("allele_mgi_accession_id")),
        )
        imtis_entity_df = imtis_entity_df.withColumn(
            "allele2_id",
            monotonically_increasing_id().astype(StringType()))
    # Split pipe-separated multivalued columns into arrays on the filtered entity DataFrame
    for col_name in ALLELE2_MULTIVALUED:
        imtis_entity_df = imtis_entity_df.withColumn(
            col_name,
            when(
                col(col_name).contains("|"),
                split(col_name, r"\|"),
            ).otherwise(array(col_name)),
        )
    return imtis_entity_df
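A short, hedged usage sketch; the report path below is a placeholder, not the real file location.
# Hypothetical call sites for both entity types; the TSV path is a placeholder.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
alleles_df = extract_imits_tsv_by_entity_type(spark, "/data/imits/allele2_report.tsv", "Allele")
genes_df = extract_imits_tsv_by_entity_type(spark, "/data/imits/allele2_report.tsv", "Gene")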
Example 9
def convertAndSaveS3(df):
    df = df.dropDuplicates()
    prinDev(df, "before convert")
    df = df.withColumn(
        "year_month_id",
        f.from_unixtime('student_behavior_date', format="yyyyMM"))
    prinDev(df, "before mapping")
    df = df.withColumn(
        'student_behavior_id',
        f.md5(
            concaText(df.student_behavior_date, df.behavior_id, df.student_id,
                      df.contact_id, df.package_code, df.package_status_code,
                      df.student_level_code, df.transformed_at)))

    dyf = DynamicFrame.fromDF(df, glueContext, "dyf")

    behavior_mapping = mappingForAll(dyf, MAPPING)

    prinDev(behavior_mapping, "after mapping")
    if behavior_mapping.count() > 0:
        parquetToS3(
            dyf=behavior_mapping,
            path=
            "s3://toxd-olap/transaction_log/student_behavior/sb_student_behavior"
        )
Example 10
def hash_pseudo_anonymization(inputDF: DataFrame, columns: list) -> DataFrame:
    from pyspark.sql import functions
    outputDataFrame: DataFrame = inputDF
    for column in columns:
        outputDataFrame = outputDataFrame.withColumn(
            column + '_anon',
            functions.md5(functions.col(column).cast('String')))
    return outputDataFrame
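A minimal sketch of hash_pseudo_anonymization on invented PII columns.
# Minimal sketch: pseudo-anonymising two PII columns on invented data.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").getOrCreate()
people = spark.createDataFrame(
    [("Alice", "alice@example.com"), ("Bob", "bob@example.com")],
    ["name", "email"],
)
hash_pseudo_anonymization(people, ["name", "email"]).show(truncate=False)
# adds name_anon and email_anon columns holding the MD5 of the original values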
Example 11
def generate_unique_id(dcc_specimen_df: DataFrame) -> DataFrame:
    unique_columns = ["_productionCentre", "_specimenID"]
    unique_columns = [
        col_name for col_name in dcc_specimen_df.columns
        if col_name in unique_columns
    ]
    dcc_specimen_df = dcc_specimen_df.withColumn("unique_id",
                                                 md5(concat(*unique_columns)))
    return dcc_specimen_df
Example 12
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("csv_filename", help="Input CSV filename", type=str)
    parser.add_argument("masked_csv_folder",
                        help="Masked CSV folder",
                        type=str)
    args = parser.parse_args()

    print(args)
    csv_filename = args.csv_filename
    spark = SparkSession.builder.master('local[*]').appName(
        "processCSV").getOrCreate()

    df_masked = spark.read.csv(csv_filename, header=True)\
        .withColumn('first_name', F.md5('first_name'))\
        .withColumn('last_name', F.md5('last_name'))\
        .withColumn('address', F.md5('address'))

    df_masked.write.mode('overwrite').csv(args.masked_csv_folder, header=False)
    spark.stop()
Example 13
    def process_event(self):
        event_cols = ["event_id", 'collector_tstamp', 'domain_sessionid', 'dvce_tstamp',
                      'event_sub_type', 'se_label', 'se_property_type', 'user_token',
                      'device_id', 'user_ipaddress', 'hour', 'se_property',
                      'isSuspect', 'suspect_reason']
        # with user
        event_to_user = self.df_init.filter(F.col("se_property_type") == "user") \
            .select(*event_cols).withColumn("dst_node", F.md5("se_property")) \
            .drop_duplicates(["event_id"])

        # with note
        event_to_note = self.df_init.filter(F.col("se_property_type") == "note") \
            .select(*event_cols).withColumn("dst_node", F.md5("se_property")) \
            .drop_duplicates(["event_id"]).select(event_to_user.columns)

        event_df = event_to_user.unionAll(event_to_note)
        # save one CSV file per hour partition
        event_df.repartition(1).write.partitionBy("hour").csv(os.path.join(self.save_dir, "event_by_hour.csv"))
        print("event data has been processed successfully.\n")
        return event_df
Example 14
    def run():
        any_is_null = None
        for col in cols:
            col_is_null = f.isnull(f.col(col))
            any_is_null = col_is_null if any_is_null is None else (
                any_is_null | col_is_null)

        col_sk = (f.when(any_is_null, '-1').otherwise(
            f.md5(f.concat_ws('-', *cols))).cast('string'))

        return col_sk
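The closure above returns a Column expression rather than a DataFrame, relying on cols and the functions alias f from its enclosing scope. A hedged sketch of the same surrogate-key pattern applied directly, with invented column names:
# Hedged sketch: building the same surrogate-key expression inline and
# attaching it to a DataFrame. Rows with any null key column get '-1'.
from pyspark.sql import SparkSession, functions as f

spark = SparkSession.builder.master("local[*]").getOrCreate()
orders = spark.createDataFrame(
    [("c1", "2021-01-01"), (None, "2021-01-02")],
    ["customer_id", "order_date"],
)
cols = ["customer_id", "order_date"]
any_is_null = None
for c in cols:
    c_is_null = f.isnull(f.col(c))
    any_is_null = c_is_null if any_is_null is None else (any_is_null | c_is_null)
col_sk = f.when(any_is_null, '-1').otherwise(f.md5(f.concat_ws('-', *cols))).cast('string')
orders.withColumn("order_sk", col_sk).show(truncate=False)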
Example 15
def calculate_md5(df):
    """Calculate MD5 over all columns of the input data frame.

    Args:
        df: Spark data frame

    Returns:
        DataFrame with an added 'md5' column
    """
    col_list = list(df.columns)
    resultdf = df.withColumn('md5', md5(concat_ws('_', *col_list)))
    return resultdf
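A minimal usage sketch for calculate_md5; the sample rows are invented.
# Minimal usage sketch for calculate_md5 on invented data.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").getOrCreate()
sample = spark.createDataFrame([("1", "a"), ("2", "b")], ["id", "code"])
calculate_md5(sample).show(truncate=False)  # adds an 'md5' column over "id_code"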
Example 16
def process_parameter_values(exp_df,
                             pipeline_df,
                             parameter_column,
                             exp_type="experiment"):
    parameter_cols = [
        "simpleParameter",
        "mediaParameter",
        "ontologyParameter",
        "seriesMediaParameter",
        "seriesParameter",
    ]
    if parameter_column not in exp_df.columns:
        return None
    parameter_observation_df = exp_df
    for column in parameter_cols:
        if column != parameter_column:
            parameter_observation_df = parameter_observation_df.drop(column)
    parameter_observation_df = (parameter_observation_df.selectExpr(
        "*",
        "posexplode(" + parameter_column + ") as (experimentPos, " +
        parameter_column + "Exploded )",
    ).withColumn(parameter_column,
                 col(parameter_column + "Exploded")).drop(parameter_column +
                                                          "Exploded"))
    parameter_observation_df = parameter_observation_df.withColumn(
        "observation_id",
        md5(
            concat(
                col("experiment_id"),
                lit("_" + parameter_column + "_"),
                col("experimentPos"),
            )),
    )
    if exp_type == "experiment":
        parameter_observation_df = map_experiment_columns(
            parameter_observation_df)
    else:
        parameter_observation_df = map_line_columns(parameter_observation_df)

    parameter_observation_df = add_impress_info(parameter_observation_df,
                                                pipeline_df,
                                                parameter_column,
                                                exp_type=exp_type)
    if has_column(parameter_observation_df,
                  parameter_column + ".parameterStatus"):
        parameter_observation_df = parameter_observation_df.withColumn(
            "parameter_status", col(parameter_column + ".parameterStatus"))
    else:
        parameter_observation_df = parameter_observation_df.withColumn(
            "parameter_status", lit(None))
    return parameter_observation_df
Example 17
def benchmark2():
    print("===Benchmark 2===")
    print(
        "Comparing JDBC writes to InnoDB and API writes to ColumnStore with larger datasets"
    )
    print("")

    emptyDatabase()

    print("creating dataframe 1: two random generated doubles")
    randDF = sqlContext.range(0, 7000000).withColumn(
        'uniform', rand(seed=23)).withColumn('normal', randn(seed=42)).cache()
    randDFRows = randDF.count()
    randDFItems = randDFRows * len(randDF.columns)
    randDF.printSchema()
    print("benchmarking dataframe 1")
    rand_benchmark = benchmark2execution(
        "rand", randDF, "id BIGINT, uniform DOUBLE, normal DOUBLE")
    randDF.unpersist()

    print(
        "creating dataframe 2: sha1, sha256, sha512 and md5 hashes of integers"
    )
    tmpDF = sqlContext.createDataFrame(
        sc.parallelize(range(
            0, 3000000)).map(lambda i: Row(number=i, string=str(i))))
    hashDF = tmpDF.select(tmpDF.number,
                          sha1(tmpDF.string).alias("sha1"),
                          sha2(tmpDF.string, 256).alias("sha256"),
                          sha2(tmpDF.string, 512).alias("sha512"),
                          md5(tmpDF.string).alias("md5")).cache()
    hashDFRows = hashDF.count()
    hashDFItems = hashDFRows * len(hashDF.columns)
    hashDF.printSchema()
    print("benchmarking dataframe 2")
    hash_benchmark = benchmark2execution(
        "hash", hashDF,
        "number BIGINT, sha1 VARCHAR(40), sha256 VARCHAR(64), sha512 VARCHAR(128), md5 VARCHAR(32)"
    )
    hashDF.unpersist()

    print("jdbc_innodb\tapi_columnstore\t\trows\t\titems")
    print("%.3fs\t\t%.3fs\t\t%i\t\t%i" %
          (rand_benchmark[0], rand_benchmark[1], randDFRows, randDFItems))
    print("%.3fs\t\t%.3fs\t\t%i\t\t%i" %
          (hash_benchmark[0], hash_benchmark[1], hashDFRows, hashDFItems))
Example 18
    def process_user(self, behave_dir, active_dir):
        """
        process node user
        @return:
        """
        source_user = self.df_init.select("user_token", "user_app_first_time", "user_create_time") \
            .filter(~F.isnull("user_token")) \
            .drop_duplicates(["user_token"]) \
            .withColumn("has_behavior", F.lit(1))

        dest_user = self.df_init.filter(F.col("se_property_type") == "user") \
            .select("se_property").drop_duplicates() \
            .select(F.md5("se_property").alias("user_token")) \
            .withColumn("isFollow", F.lit(1))
        user_df = source_user.join(dest_user, on="user_token", how="outer").withColumn("node_type", F.lit("user"))
        save2csv(user_df.repartition(1), self.save_dir, "node_user.csv")
        print("node user has been processed successfully.\n")
        return user_df
Example 19
    def generate_unique_id(self, dcc_specimen_df: DataFrame) -> DataFrame:
        """
        Generates a unique identifier for the Specimen using production centre,
        phenotyping centre and specimen ID.
        """
        dcc_specimen_df = dcc_specimen_df.withColumn(
            "unique_id",
            md5(
                concat(*[
                    when(
                        col("_productionCentre").isNotNull(),
                        col("_productionCentre"),
                    ).when(
                        col("_phenotypingCentre").isNotNull(),
                        col("_phenotypingCentre"),
                    ).otherwise(lit("")),
                    col("_specimenID"),
                ])),
        )
        return dcc_specimen_df
Example 20
def resolve_image_record_value(image_record_observation_df: DataFrame):
    image_record_observation_df = image_record_observation_df.selectExpr(
        "*",
        "posexplode(seriesMediaParameter.value) as (seriesMediaParameterPos, seriesMediaParameterValue)",
    )
    image_record_observation_df = image_record_observation_df.withColumn(
        "observation_id",
        md5(
            concat(
                col("observation_id"),
                lit("_seriesMediaParameterValue_"),
                col("seriesMediaParameterPos"),
            )),
    )
    image_record_observation_df = image_record_observation_df.withColumn(
        "download_file_path", col("seriesMediaParameterValue._URI"))
    image_record_observation_df = image_record_observation_df.withColumn(
        "file_type", col("seriesMediaParameterValue._fileType"))
    image_record_observation_df = image_record_observation_df.withColumn(
        "observation_type", lit("image_record"))
    return image_record_observation_df
Example 21
def handle_patient_registration(source_df):
    """
     This method applies transformations to tha_patient_registration to make it compatible with the abstract schema
    Args:
        source_df: tha_patient_registration Dataframe

    Returns:
        pat_reg_df: transformed tha_patient_registration Dataframe
    """
    pat_reg_df = source_df.withColumn("claim_id", F.md5(F.concat(F.lit("tha"),
                                                                 F.coalesce(source_df.fac, F.lit("")),
                                                                 F.coalesce(source_df.dkey, F.lit("")),
                                                                 F.coalesce(source_df.source_code, F.lit("")),
                                                                 F.coalesce(source_df.source_year, F.lit("")),
                                                                 F.coalesce(source_df.source_qtr, F.lit(""))
                                                                 )))

    pat_reg_final_df = pat_reg_df.withColumn("claim_type_cd", claim_type_code(pat_reg_df.source_code)) \
        .withColumn("claim_type_txt", claim_type_txt(pat_reg_df.source_code)) \
        .withColumn("claim_start_dt", pat_reg_df.adat) \
        .withColumn("claim_end_dt", pat_reg_df.ddat) \
        .withColumnRenamed("pdx", "primary_diagnosis_cd") \
        .withColumn("primary_diagnosis_code_type_cd", F.lit("icd10")) \
        .withColumnRenamed("pphysdocid", "attending_provider_npi") \
        .withColumn("organization_state_cd", organization_state_cd(pat_reg_df.hospstateabbr)) \
        .withColumn("utilization_day_cnt", pat_reg_df.los) \
        .withColumn("admit_dt", pat_reg_df.adat) \
        .withColumn("admit_type_cd", admit_type_cd(pat_reg_df.atype)) \
        .withColumn("length_of_stay_val", pat_reg_df.los) \
        .withColumn("discharge_dt", pat_reg_df.ddat) \
        .withColumn("admission_referral_source_cd", admission_referral_source_cd(pat_reg_df.asource)) \
        .withColumnRenamed("drg", "drg_cd") \
        .select("fac", "dkey", "claim_id", "claim_type_cd", "claim_type_txt", "claim_start_dt", "claim_end_dt",
                "primary_diagnosis_cd", "primary_diagnosis_code_type_cd", "attending_provider_npi",
                "organization_state_cd", "utilization_day_cnt", "admit_dt", "admit_type_cd", "length_of_stay_val",
                "discharge_dt", "admission_referral_source_cd", "drg_cd", "source_code", "source_year", "source_qtr"
                )

    return pat_reg_final_df
Example 22
    def add_hashed_id(df, columns=[], hashed_col='Hashed_ID', hash_type='md5'):
        """
            This method will create a hashed ID column from the selected columns for each record in the dataframe.

            Returns
                --------
                Dataframe with hashed Id as a column
                ------
            Parameters
                --------
                df : spark dataframe
                    dataframe to create hashed id on
                columns : list of strings
                    columns to hash; default is an empty list, which uses all columns of df
                hashed_col : string
                    column name for hashed id
                --------
        """
        if len(columns) == 0:
            columns = df.columns
        else:
            illegal_columns = []
            for column in columns:
                if column not in df.columns:
                    illegal_columns.append(column)
            if len(illegal_columns) > 0:
                raise IllegalArgumentException(
                    'Column {} does not exist in dataframe'.format(', '.join(illegal_columns)))
        
        if hashed_col is None or hashed_col == '':
            hashed_col = 'Hashed_ID'
        
        if hash_type == 'md5':
            df = df.withColumn(hashed_col, F.md5(F.concat(*columns)))
        else:
            df = df.withColumn(hashed_col, F.sha2(F.concat(*columns), 256))
        return df
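A hedged usage sketch, assuming the helper is reachable as a static method or plain function; the account rows are invented.
# Hedged usage sketch for add_hashed_id on invented account records.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").getOrCreate()
accounts = spark.createDataFrame(
    [("A-1", "2020-01-01"), ("A-2", "2020-02-01")],
    ["account_id", "opened_on"],
)
hashed = add_hashed_id(accounts, columns=["account_id"], hashed_col="account_hash")
hashed.show(truncate=False)  # account_hash holds the MD5 of account_id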
Example 23
def main():
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session
    spark.conf.set("spark.sql.session.timeZone", "GMT+07:00")

    ho_chi_minh_timezone = pytz.timezone('Asia/Ho_Chi_Minh')
    today = datetime.now(ho_chi_minh_timezone)
    today_second = long(today.strftime("%s"))
    print('today_id: ', today_second)
    # f.lit(today_second).cast('long').alias('transformed_at')

    rangeid = [14, 15, 16, 17, 18]

    student_id_unavailable = '0'
    package_endtime_unavailable = 99999999999L
    package_starttime_unavailable = 0L
    student_level_code_unavailable = 'UNAVAILABLE'
    student_status_code_unavailable = 'UNAVAILABLE'

    package_endtime = 'package_endtime'
    package_starttime = 'package_starttime'
    student_level_code = 'student_level_code'
    student_status_code = 'student_status_code'

    def doCheckModified(val1, val2):
        if val1 is not None:
            return val1
        return val2

    check_modified_null = udf(doCheckModified, StringType())

    def doCheckStudentID(code):
        code = str(code)
        if code is None:
            return student_id_unavailable
        return code

    check_student_id = udf(doCheckStudentID, StringType())

    def doCheckData(code, key):
        key = str(key)
        if code is None:
            if key == package_endtime:
                return package_endtime_unavailable
            else:
                return package_starttime_unavailable
        return code

    check_data = udf(doCheckData, IntegerType())

    def doCheckDataNull(code, key):
        code = str(code)
        key = str(key)
        if (code is None) & (key == student_level_code):
            return student_level_code_unavailable

        if (code is None) & (key == student_status_code):
            return student_status_code_unavailable

        return code

    check_data_null = udf(doCheckDataNull, StringType())

    def concaText(student_behavior_date, behavior_id, student_id, contact_id,
                package_code, package_endtime,package_starttime,
                student_level_code, student_status_code, transformed_at):
        text_concat = ""
        if student_behavior_date is not None:
            text_concat += str(student_behavior_date)
        if behavior_id is not None:
            text_concat += str(behavior_id)
        if student_id is not None:
            text_concat += str(student_id)
        if contact_id is not None:
            text_concat += str(contact_id)
        if package_code is not None:
            text_concat += str(package_code)
        if package_endtime is  not None:
            text_concat += str(package_endtime)
        if package_starttime is not None:
            text_concat += str(package_starttime)
        if student_level_code is not None:
            text_concat += str(student_level_code)
        if student_status_code is not None:
            text_concat += str(student_status_code)
        if transformed_at is not None:
            text_concat += str(transformed_at)
        return text_concat

    concaText = f.udf(concaText, StringType())


    dyf_tblreply_rating = glueContext.create_dynamic_frame.from_catalog(
        database="native_smile",
        table_name="tblreply_rating"
    )
    dyf_tblreply_rating = dyf_tblreply_rating.select_fields(
            ['_key', 'userid', 'ratingid', 'time_rating']
    )
    dyf_tblreply_rating = dyf_tblreply_rating.resolveChoice(specs=[('_key', 'cast:long')])
    # try:
    #     df_flag_1 = spark.read.parquet("s3://dtsodin/flag/flag_hoc_vien_rating_native_smile_h2472.parquet")
    #     max_key = df_flag_1.collect()[0]['flag']
    #     print("max_key:  ", max_key)
    #     # Only keep records with _key greater than the saved max_key, do not load everything
    #     dyf_tblreply_rating = Filter.apply(frame=dyf_tblreply_rating, f=lambda x: x["_key"] > max_key)
    # except:
    #     print('read flag file error ')

    if dyf_tblreply_rating.count()> 0:

        dyf_mdl_user = glueContext.create_dynamic_frame.from_catalog(
            database="topicalms",
            table_name="mdl_user"
        )
        dyf_mdl_user = dyf_mdl_user.select_fields(
            ['id', 'email']
        )

        dyf_tma_dm_tu_dien = glueContext.create_dynamic_frame.from_catalog(
            database="native_smile",
            table_name="tma_dm_tu_dien"
        )
        dyf_tma_dm_tu_dien = dyf_tma_dm_tu_dien.select_fields(
            ['id', 'ma_tu_dien', 'id_dm_loai_tu_dien']
        )

        dyf_tma_dm_tu_dien = Filter.apply(frame=dyf_tma_dm_tu_dien,
                                        f=lambda x: x['id_dm_loai_tu_dien'] == 7
                                                    and x['id'] in rangeid)

        ################
        join_rating_user = Join.apply(dyf_tblreply_rating, dyf_mdl_user, 'userid', 'id')
        join_rating_user01 = Join.apply(join_rating_user, dyf_tma_dm_tu_dien, 'ratingid', 'id')

        ################
        dyf_student_contact = glueContext.create_dynamic_frame.from_catalog(
            database="tig_advisor",
            table_name="student_contact"
        )
        dyf_student_contact = dyf_student_contact.select_fields(
            ['contact_id', 'student_id', 'level_study', 'time_lms_created'])\

        dyf_log_student_status = glueContext.create_dynamic_frame.from_catalog(
            database="do_tig_advisor",
            table_name="log_student_status"
        )
        dyf_log_student_status = dyf_log_student_status.select_fields(
            ['contact_id', 'status_code', 'last_status_code', 'start_date', 'end_date']) \
            .rename_field('contact_id', 'contact_id_status')

        dyf_log_student_package = glueContext.create_dynamic_frame.from_catalog(
            database="do_tig_advisor",
            table_name="log_student_package"
        )
        dyf_log_student_package = dyf_log_student_package.select_fields(
            ['student_id', 'package_code', 'start_time', 'end_time']) \
            .rename_field('student_id', 'student_id_package') \
            .rename_field('start_time', 'start_time_package') \
            .rename_field('end_time', 'end_time_package')

        dyf_log_student_level_study = glueContext.create_dynamic_frame.from_catalog(
            database="tig_advisor",
            table_name="log_student_level_study"
        )
        dyf_log_student_level_study = dyf_log_student_level_study.select_fields(
            ['contact_id', 'level_current', 'level_modified', 'package_code', 'time_created']) \
            .rename_field('contact_id', 'contact_id_level')

        join_rating_user01.printSchema()
        print join_rating_user01.count()
        print join_rating_user01.count()
        try:
            df_rating_class = join_rating_user01.toDF()
            df_student_contact = dyf_student_contact.toDF()
            df_log_student_level_study = dyf_log_student_level_study.toDF()
            df_temp = dyf_log_student_level_study.toDF()
            df_log_student_status = dyf_log_student_status.toDF()
            df_log_student_package = dyf_log_student_package.toDF()

            df_temp = df_temp.groupby('contact_id_level', 'level_current', 'package_code').agg(
                f.max("time_created").alias("time_created_max"))
            df_temp = df_temp.withColumnRenamed('contact_id_level', 'contact_id_join') \
                .withColumnRenamed('package_code', 'package_code_join')

            df_join0 = df_temp.join(df_log_student_level_study,
                                    (df_temp['contact_id_join'] == df_log_student_level_study['contact_id_level'])
                                    & (df_temp['package_code_join'] == df_log_student_level_study['package_code'])
                                    & (df_temp['time_created_max'] == df_log_student_level_study['time_created']), "left")
            print "=========== . ==========="
            df_join0.printSchema()
            dyf_join = DynamicFrame.fromDF(df_join0, glueContext, "dyf_join")
            dyf_join = dyf_join.select_fields(
                ['contact_id_level', 'level_current', 'level_modified', 'package_code', 'time_created'])
            df_join = dyf_join.toDF()
            df_join.printSchema()
            df_join.show(10)
            print "########## . ###########"

            df_join01 = df_rating_class.join(df_student_contact,
                                         (df_rating_class['userid'] == df_student_contact['student_id']))
            df_join01.printSchema()
            df_join02 = df_join01.join(df_join,
                                       (df_join['contact_id_level'] == df_join01['contact_id'])
                                       & (df_join['time_created'] <= df_join01['time_lms_created']), "left")

            df_join02 = df_join02\
                .withColumn("level_modified_new", check_modified_null(df_join02.level_modified, df_join02.level_study))

            df_join02.printSchema()
            df_join02.show(10)
            dyf_join = DynamicFrame.fromDF(df_join02, glueContext, "dyf_join")
            dyf_join = dyf_join.select_fields(['time_rating', 'contact_id', 'student_id', 'level_study', 'time_lms_created', 'ratingid',
                                               'level_current', 'level_modified', 'package_code', 'time_created', 'level_modified_new'])
            # dyf_join_temp = Filter.apply(frame=dyf_join,
            #                              f=lambda x: x["level_modified_new"] is None)
            # print "count: ", dyf_join_temp.count()

            ############
            df_join02 = dyf_join.toDF()
            df_join03 = df_join02.join(df_log_student_status,
                                       (df_log_student_status['contact_id_status'] == df_join02['contact_id'])
                                       & (df_log_student_status['start_date'] <= df_join02['time_rating'])
                                       & (df_log_student_status['end_date'] >= df_join02['time_rating']), "left")

            df_join04 = df_join03.join(df_log_student_package,
                                       (df_log_student_package['student_id_package'] == df_join03['student_id'])
                                       & (df_log_student_package['start_time_package'] <= df_join03['time_rating'])
                                       & (df_log_student_package['end_time_package'] >= df_join03['time_rating']), "left")

            dyf_join = DynamicFrame.fromDF(df_join04, glueContext, "dyf_join")
            dyf_join = Filter.apply(frame=dyf_join,
                                    f=lambda x: x["start_time_package"] is not None
                                                and x["end_time_package"] is not None)
            print "dyf_join: ", dyf_join.count()
            dyf_join.printSchema()
            dyf_join.show(10)
            dyf_join = dyf_join.select_fields(
                ['time_rating', 'student_id', 'contact_id', 'package_code', 'ratingid',
                 'start_time_package', 'end_time_package', 'level_modified_new', 'status_code']
            )
            print "TTTTTTTTTTTTTT"
            # dyf_join01 = Filter.apply(frame=dyf_join,
            #                           f=lambda x: x["level_current"] is not None)
            #
            # print "Check null ", dyf_join01.count()

            df_join04 = dyf_join.toDF()
            df_join04 = df_join04.withColumn("transformed_at", unix_timestamp(f.current_timestamp())) \
                .withColumn("student_id", check_student_id(df_join04.student_id)) \
                .withColumn("package_endtime", check_data(df_join04.end_time_package, f.lit(package_endtime))) \
                .withColumn("package_starttime", check_data(df_join04.start_time_package, f.lit(package_starttime))) \
                .withColumn("student_level_code", check_data_null(df_join04.level_modified_new, f.lit(student_level_code))) \
                .withColumn("student_status_code", check_data_null(df_join04.status_code, f.lit(student_status_code))) \
                .withColumn("behavior_id", f.lit(27)) \
                .withColumn("rating_type", f.lit("rating_native_smile_h2472")) \
                .withColumn("comment", f.lit("")) \
                .withColumn("rating_about", f.lit(None)) \
                .withColumn("number_rating", f.lit(1)) \
                .withColumn("value_rating", (df_join04.ratingid - f.lit(13)))

            df_join04.printSchema()
            print df_join04.count()
            df_join04.show(10)

            dyf_join = DynamicFrame.fromDF(df_join04, glueContext, "dyf_join")
            # dyf_join.printSchema()
            # print dyf_join.count()
            # dyf_join.show(10)

            dyf_rating_cara = ApplyMapping.apply(frame=dyf_join,
                                                       mappings=[("time_rating", "int", "student_behavior_date", "long"),
                                                                 ("behavior_id", "int", "behavior_id", "long"),
                                                                 ("student_id", "string", "student_id", "long"),
                                                                 ("contact_id", "string", "contact_id", "string"),
                                                                 ("package_code", "string", "package_code", "string"),
                                                                 ("package_endtime", "int", "package_endtime", "long"),
                                                                 ("package_starttime", "int", "package_starttime", "long"),
                                                                 ("student_level_code", "string", "student_level_code", "string"),
                                                                 ("student_status_code", "string", "student_status_code", "string"),
                                                                 ("transformed_at", "long", "transformed_at", "long"),
                                                                 ("rating_type", "string", "rating_type", "string"),
                                                                 ("comment", "string", "comment", "string"),
                                                                 ("rating_about", "int", "rating_about", "long"),
                                                                 ("number_rating", "int", "number_rating", "long"),
                                                                 ("value_rating", "int", "value_rating", "long")])

            df_rating_cara = dyf_rating_cara.toDF()
            df_rating_cara2 = df_rating_cara.withColumn('student_behavior_id',
                                                                    f.md5(concaText(
                                                                        df_rating_cara.student_behavior_date,
                                                                        df_rating_cara.behavior_id,
                                                                        df_rating_cara.student_id,
                                                                        df_rating_cara.contact_id,
                                                                        df_rating_cara.package_code,
                                                                        df_rating_cara.package_endtime,
                                                                        df_rating_cara.package_starttime,
                                                                        df_rating_cara.student_level_code,
                                                                        df_rating_cara.student_status_code,
                                                                        df_rating_cara.transformed_at)))

            dyf_rating_cara = DynamicFrame.fromDF(df_rating_cara2, glueContext, 'dyf_rating_cara')

            dyf_rating_cara = Filter.apply(frame=dyf_rating_cara,
                                           f=lambda x: x["contact_id"] is not None and x["contact_id"] != '')

            applymapping0 = ApplyMapping.apply(frame=dyf_rating_cara,
                                               mappings=[
                                                   ("student_behavior_id", "string", "student_behavior_id", "string"),
                                                   ("rating_type", "string", "rating_type", "string"),
                                                   ("comment", "string", "comment", "string"),
                                                   ("rating_about", "long", "rating_about", "long"),
                                                   ("number_rating", "long", "number_rating", "long"),
                                                   ("value_rating", "long", "value_rating", "long"),
                                                   ("behavior_id", "long", "behavior_id", "long")])

            applymapping0.printSchema()
            print applymapping0.count()
            applymapping0.show(5)
            resolvechoice0 = ResolveChoice.apply(frame=applymapping0, choice="make_cols",
                                                 transformation_ctx="resolvechoice1")
            dropnullfields0 = DropNullFields.apply(frame=resolvechoice0, transformation_ctx="dropnullfields0")
            print resolvechoice0.count()
            # resolvechoice0.printSchema()
            # resolvechoice0.show(10)

            print('START WRITE TO S3-------------------------')
            datasink0 = glueContext.write_dynamic_frame.from_options(frame=dropnullfields0, connection_type="s3",
                                                                     connection_options={
                                                                         "path": "s3://dtsodin/student_behavior/student_rating/",
                                                                         "partitionKeys": ["behavior_id"]},
                                                                     format="parquet",
                                                                     transformation_ctx="datasink0")
            print('END WRITE TO S3-------------------------')

            # datasink0 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields0,
            #                                                            catalog_connection="glue_redshift",
            #                                                            connection_options={
            #                                                                "dbtable": "student_rating_temp",
            #                                                                "database": "dts_odin"
            #                                                            },
            #                                                            redshift_tmp_dir="s3a://dtsodin/temp/student_rating_temp/",
            #                                                            transformation_ctx="datasink0")

            applymapping1 = ApplyMapping.apply(frame=dyf_rating_cara,
                                               mappings=[("student_behavior_id", "string", "student_behavior_id", "string"),
                                                         ("student_behavior_date", "long", "student_behavior_date", "long"),
                                                         ("behavior_id", "long", "behavior_id", "long"),
                                                         ("student_id", "long", "student_id", "long"),
                                                         ("contact_id", "string", "contact_id", "string"),
                                                         ("package_code", "string", "package_code", "string"),
                                                         ("package_endtime", "long", "package_endtime", "long"),
                                                         ("package_starttime", "long", "package_starttime", "long"),
                                                         ("student_level_code", "string", "student_level_code", "string"),
                                                         ("student_status_code", "string", "student_status_code", "string"),
                                                         ("transformed_at", "long", "transformed_at", "long")])

            applymapping1.printSchema()
            print applymapping1.count()
            applymapping1.show(10)

            resolvechoice1 = ResolveChoice.apply(frame=applymapping1, choice="make_cols",
                                                 transformation_ctx="resolvechoice1")
            dropnullfields1 = DropNullFields.apply(frame=resolvechoice1, transformation_ctx="dropnullfields1")
            print resolvechoice1.count()
            # resolvechoice1.printSchema()
            # resolvechoice1.show(10)

            print('START WRITE TO S3-------------------------')
            datasink6 = glueContext.write_dynamic_frame.from_options(frame=dropnullfields1, connection_type="s3",
                                                                     connection_options={
                                                                         "path": "s3://dtsodin/student_behavior/student_behavior/",
                                                                         "partitionKeys": ["behavior_id"]},
                                                                     format="parquet",
                                                                     transformation_ctx="datasink6")
            print('END WRITE TO S3-------------------------')

            # datasink1 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields1,
            #                                                            catalog_connection="glue_redshift",
            #                                                            connection_options={
            #                                                                "dbtable": "student_behavior",
            #                                                                "database": "dts_odin"
            #                                                            },
            #                                                            redshift_tmp_dir="s3a://dtsodin/temp/student_behavior",
            #                                                            transformation_ctx="datasink1")

            df_temp = dyf_tblreply_rating.toDF()
            flag = df_temp.agg({"_key": "max"}).collect()[0][0]

            flag_data = [flag]
            df = spark.createDataFrame(flag_data, "long").toDF('flag')
            # overwrite the _key flag in S3
            df.write.parquet("s3a://dtsodin/flag/flag_hoc_vien_rating_native_smile_h2472.parquet", mode="overwrite")
        except Exception as e:
            print e
Example 24
def generate_metadata_group(
    experiment_specimen_df: DataFrame,
    impress_df: DataFrame,
    exp_type="experiment",
) -> DataFrame:
    """
    Takes in an Experiment-Specimen DataFrame and the IMPReSS dataframe,
    and generates a hash value from the parameters marked as 'isImportant' on IMPReSS.
    This hash is used to identify experiments that are comparable (i.e. share the same experimental conditions).
    """

    # Explode the experiments by procedureMetadata so each row contains data for each procedureMetadata value
    experiment_metadata = experiment_specimen_df.withColumn(
        "procedureMetadata", explode("procedureMetadata"))

    # Filter the IMPReSS to leave only those that generate a metadata split: isImportant = True
    impress_df_required = impress_df.where(
        (col("parameter.isImportant") == True)
        & (col("parameter.type") == "procedureMetadata"))

    # Join the experiment DF with the IMPReSS DF
    experiment_metadata = experiment_metadata.join(
        impress_df_required,
        ((experiment_metadata["_pipeline"]
          == impress_df_required["pipelineKey"])
         & (experiment_metadata["_procedureID"]
            == impress_df_required["procedure.procedureKey"])
         & (experiment_metadata["procedureMetadata._parameterID"]
            == impress_df_required["parameter.parameterKey"])),
    )

    # Create a new column by concatenating the parameter name and the parameter value
    experiment_metadata = experiment_metadata.withColumn(
        "metadataItem",
        when(
            col("procedureMetadata.value").isNotNull(),
            concat(col("parameter.name"), lit(" = "),
                   col("procedureMetadata.value")),
        ).otherwise(concat(col("parameter.name"), lit(" = "), lit("null"))),
    )

    # Select the right column name for production and phenotyping centre depending on experiment type
    if exp_type == "experiment":
        production_centre_col = "_productionCentre"
        phenotyping_centre_col = "_phenotypingCentre"
    else:
        production_centre_col = "production_centre"
        phenotyping_centre_col = "phenotyping_centre"

    # Create a window for the DataFrame over experiment id, production and phenotyping centre
    window = Window.partitionBy(
        "unique_id", production_centre_col,
        phenotyping_centre_col).orderBy("parameter.name")

    # Use the window to create, for every experiment, an array containing the set of "parameter = value" pairs.
    experiment_metadata_input = experiment_metadata.withColumn(
        "metadataItems",
        collect_set(col("metadataItem")).over(window))

    # Add the production centre to the metadata group when it is different from the phenotyping centre.
    # This is because in that case we would like to generate a metadata split among specimens
    # that have been produced and phenotyped at the same centre.
    experiment_metadata_input = experiment_metadata_input.withColumn(
        "metadataItems",
        when(
            (col(production_centre_col).isNotNull())
            & (col(production_centre_col) != col(phenotyping_centre_col)),
            array_union(
                col("metadataItems"),
                array(
                    concat(lit("ProductionCenter = "),
                           col(production_centre_col))),
            ),
        ).otherwise(col("metadataItems")),
    )

    # Create a string with the concatenation of the metadata items "parameter = value" separated by '::'.
    experiment_metadata = experiment_metadata_input.groupBy(
        "unique_id", production_centre_col, phenotyping_centre_col).agg(
            concat_ws("::", sort_array(max(
                col("metadataItems")))).alias("metadataGroupList"))

    # Hash the list to generate a metadata group identifier.
    experiment_metadata = experiment_metadata.withColumn(
        "metadataGroup", md5(col("metadataGroupList")))

    # Select the experiment IDs and the metadata group IDs
    experiment_metadata = experiment_metadata.select("unique_id",
                                                     "metadataGroup")

    # Join the original experiment DataFrame with the result of the metadata group generation
    experiment_specimen_df = experiment_specimen_df.join(
        experiment_metadata, "unique_id", "left_outer")

    # Add the hashed version of an empty string to those rows without a metadata group.
    experiment_specimen_df = experiment_specimen_df.withColumn(
        "metadataGroup",
        when(experiment_specimen_df["metadataGroup"].isNull(),
             md5(lit(""))).otherwise(experiment_specimen_df["metadataGroup"]),
    )
    return experiment_specimen_df
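A hedged, self-contained sketch of the property the comments above rely on: because the metadata items are sorted before being concatenated and hashed, two experiments with the same set of "parameter = value" pairs receive the same metadataGroup regardless of collection order. The values below are invented.
# Hedged sketch: identical metadata item sets hash to the same metadataGroup
# even when collected in a different order. Values are invented.
from pyspark.sql import SparkSession
from pyspark.sql.functions import concat_ws, md5, sort_array

spark = SparkSession.builder.master("local[*]").getOrCreate()
items = spark.createDataFrame(
    [(1, ["Anesthetic = isoflurane", "Equipment = scale A"]),
     (2, ["Equipment = scale A", "Anesthetic = isoflurane"])],
    ["experiment", "metadataItems"],
)
items.withColumn(
    "metadataGroup", md5(concat_ws("::", sort_array("metadataItems")))
).show(truncate=False)  # both rows get the same hash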
Example 25
def main():
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session
    spark.conf.set("spark.sql.session.timeZone", "GMT+07:00")

    student_id_unavailable = '0'
    package_endtime_unavailable = 99999999999L
    package_starttime_unavailable = 0L
    student_level_code_unavailable = 'UNAVAILABLE'
    student_status_code_unavailable = 'UNAVAILABLE'

    package_endtime = 'package_endtime'
    package_starttime = 'package_starttime'
    student_level_code = 'student_level_code'
    student_status_code = 'student_status_code'

    def doCheckModified(val1, val2):
        if val1 is not None:
            return val1
        return val2

    check_modified_null = udf(doCheckModified, StringType())

    def doCheckStudentID(code):
        code = str(code)
        if code is None:
            return student_id_unavailable
        return code

    check_student_id = udf(doCheckStudentID, StringType())

    def doCheckData(code, key):
        key = str(key)
        if code is None:
            if key == package_endtime:
                return package_endtime_unavailable
            else:
                return package_starttime_unavailable
        return code

    check_data = udf(doCheckData, IntegerType())

    def doCheckDataNull(code, key):
        code = str(code)
        key = str(key)
        if (code is None) & (key == student_level_code):
            return student_level_code_unavailable

        if (code is None) & (key == student_status_code):
            return student_status_code_unavailable

        return code

    check_data_null = udf(doCheckDataNull, StringType())

    def concaText(student_behavior_date, behavior_id, student_id, contact_id,
                  package_code, package_endtime, package_starttime,
                  student_level_code, student_status_code, transformed_at):
        text_concat = ""
        if student_behavior_date is not None:
            text_concat += str(student_behavior_date)
        if behavior_id is not None:
            text_concat += str(behavior_id)
        if student_id is not None:
            text_concat += str(student_id)
        if contact_id is not None:
            text_concat += str(contact_id)
        if package_code is not None:
            text_concat += str(package_code)
        if package_endtime is not None:
            text_concat += str(package_endtime)
        if package_starttime is not None:
            text_concat += str(package_starttime)
        if student_level_code is not None:
            text_concat += str(student_level_code)
        if student_status_code is not None:
            text_concat += str(student_status_code)
        if transformed_at is not None:
            text_concat += str(transformed_at)
        return text_concat

    concaText = udf(concaText, StringType())

    # get dynamic frame source
    dyf_student_assignments = glueContext.create_dynamic_frame.from_catalog(
        database="tig_advisor", table_name="student_assignments")

    # select the required fields
    dyf_student_assignments = dyf_student_assignments.select_fields([
        '_key', 'contact_id', 'advisor_id', 'time_created', 'time_modified',
        'status'
    ])

    df_student_assignments = dyf_student_assignments.toDF()
    df_student_assignments = df_student_assignments\
        .withColumn("time_created", unix_timestamp(df_student_assignments.time_created).cast("long"))\
        .withColumn("time_modified", unix_timestamp(df_student_assignments.time_modified).cast("long"))\
        .withColumn("_key", unix_timestamp(df_student_assignments._key).cast("long"))

    dyf_student_assignments = DynamicFrame.fromDF(df_student_assignments,
                                                  glueContext,
                                                  "dyf_student_assignments")
    dyf_student_assignments.printSchema()
    dyf_student_assignments.show(2)

    #  check bucket is not null
    try:
        # # read the flag checkpoint from S3
        df_flag = spark.read.parquet(
            "s3a://dtsodin/flag/toa_L3150/toa_student_assignments.parquet")
        start_read = df_flag.collect()[0]['flag']
        print('read from index: ', start_read)

        # so sanh _key datasource voi flag, lay nhung gia tri co key > flag
        dyf_student_assignments = Filter.apply(
            frame=dyf_student_assignments, f=lambda x: x['_key'] > start_read)
    except:
        print('read flag file error ')

    print('the number of new contacts: ', dyf_student_assignments.count())

    if dyf_student_assignments.count() > 0:
        dyf_student_contact = glueContext.create_dynamic_frame.from_catalog(
            database="tig_advisor", table_name="student_contact")
        dyf_student_contact = dyf_student_contact.select_fields(
            ['contact_id', 'student_id', 'level_study', 'time_lms_created'])\
            .rename_field('contact_id', 'contactid')

        dyf_log_student_status = glueContext.create_dynamic_frame.from_catalog(
            database="do_tig_advisor", table_name="log_student_status")
        dyf_log_student_status = dyf_log_student_status.select_fields(
            ['contact_id', 'status_code', 'last_status_code', 'start_date', 'end_date']) \
            .rename_field('contact_id', 'contact_id_status')

        dyf_log_student_package = glueContext.create_dynamic_frame.from_catalog(
            database="do_tig_advisor", table_name="log_student_package")
        dyf_log_student_package = dyf_log_student_package.select_fields(
            ['student_id', 'package_code', 'start_time', 'end_time']) \
            .rename_field('student_id', 'student_id_package') \
            .rename_field('start_time', 'start_time_package') \
            .rename_field('end_time', 'end_time_package')

        dyf_log_student_level_study = glueContext.create_dynamic_frame.from_catalog(
            database="tig_advisor", table_name="log_student_level_study")
        dyf_log_student_level_study = dyf_log_student_level_study.select_fields(
            ['contact_id', 'level_current', 'level_modified', 'package_code', 'time_created']) \
            .rename_field('contact_id', 'contact_id_level')

        dyf_student_assignment = Filter.apply(
            frame=dyf_student_assignments,
            f=lambda x: x['contact_id'] is not None and x['contact_id'] != '')

        try:
            dyf_assignments_contact = Join.apply(dyf_student_assignment,
                                                 dyf_student_contact,
                                                 "contact_id", "contactid")
            dyf_assignments_contact.printSchema()
            dyf_assignments_contact.show(2)
            df_assignments_contact = dyf_assignments_contact.toDF(
            ).dropDuplicates()
            df_log_student_level_study = dyf_log_student_level_study.toDF()
            df_temp = dyf_log_student_level_study.toDF()
            df_log_student_status = dyf_log_student_status.toDF()
            df_log_student_package = dyf_log_student_package.toDF()

            df_temp = df_temp.groupby(
                'contact_id_level', 'level_current', 'package_code').agg(
                    f.max("time_created").alias("time_created_max"))
            df_temp = df_temp.withColumnRenamed('contact_id_level', 'contact_id_join') \
                .withColumnRenamed('package_code', 'package_code_join')

            df_student_level_study = df_temp.join(
                df_log_student_level_study,
                (df_temp['contact_id_join']
                 == df_log_student_level_study['contact_id_level'])
                & (df_temp['package_code_join']
                   == df_log_student_level_study['package_code'])
                & (df_temp['time_created_max']
                   == df_log_student_level_study['time_created']), "left")

            print("=========== . ===========")
            df_student_level_study.printSchema()

            dyf_student_level_study = DynamicFrame.fromDF(
                df_student_level_study, glueContext, "dyf_student_level_study")

            dyf_student_level_study = dyf_student_level_study.select_fields([
                'contact_id_level', 'level_current', 'level_modified',
                'package_code', 'time_created'
            ])

            df_student_level_study = dyf_student_level_study.toDF()
            df_student_level_study.printSchema()

            df_assignments_contact_level = df_student_level_study.join(
                df_assignments_contact,
                (df_student_level_study['contact_id_level']
                 == df_assignments_contact['contact_id'])
                & (df_student_level_study['time_created'] <=
                   df_assignments_contact['time_lms_created']), "right")

            df_assignments_contact_level = df_assignments_contact_level.withColumn(
                "level_modified_new",
                check_modified_null(
                    df_assignments_contact_level.level_modified,
                    df_assignments_contact_level.level_study))
            df_assignments_contact_level.printSchema()
            df_assignments_contact_level.count()
            df_assignments_contact_level.show(10)
            dyf_assignments_contact_level = DynamicFrame.fromDF(
                df_assignments_contact_level, glueContext,
                "dyf_assignments_contact_level")
            dyf_assignments_contact_level = dyf_assignments_contact_level.select_fields(
                [
                    'time_created', 'contact_id', 'student_id', 'advisor_id',
                    'level_study', 'time_lms_created', 'level_current',
                    'level_modified', 'package_code', 'time_modified',
                    'level_modified_new'
                ])
            # dyf_join_temp = Filter.apply(frame=dyf_join,
            #                              f=lambda x: x["level_modified_new"] is None)
            # print "count: ", dyf_join_temp.count()

            ############
            df_assignments_contact_level = dyf_assignments_contact_level.toDF()
            df_join_data = df_assignments_contact_level.join(
                df_log_student_status,
                (df_log_student_status['contact_id_status']
                 == df_assignments_contact_level['contact_id'])
                & (df_log_student_status['start_date'] <=
                   df_assignments_contact_level['time_created'])
                & (df_log_student_status['end_date'] >=
                   df_assignments_contact_level['time_created']), "left")

            df_join_full_data = df_join_data.join(
                df_log_student_package,
                (df_log_student_package['student_id_package']
                 == df_join_data['student_id'])
                & (df_log_student_package['start_time_package'] <=
                   df_join_data['time_created'])
                & (df_log_student_package['end_time_package'] >=
                   df_join_data['time_created']), "left")

            df_join_full_data = df_join_full_data.dropDuplicates()
            dyf_join_full_data = DynamicFrame.fromDF(df_join_full_data,
                                                     glueContext,
                                                     "dyf_join_full_data")
            dyf_join_full_data = Filter.apply(
                frame=dyf_join_full_data,
                f=lambda x: x["start_time_package"] is not None
                and x["end_time_package"] is not None)
            print("dyf_join_full_data: ", dyf_join_full_data.count())
            dyf_join_full_data.show(10)
            dyf_join_full_data = dyf_join_full_data.select_fields([
                'time_created', 'student_id', 'contact_id', 'package_code',
                'time_modified', 'advisor_id', 'start_time_package',
                'end_time_package', 'level_modified_new', 'status_code'
            ])

            df_join_full_data = dyf_join_full_data.toDF()
            df_join_full_data = df_join_full_data.withColumn("transformed_at", unix_timestamp(f.current_timestamp())) \
                .withColumn("student_id", check_student_id(df_join_full_data.student_id)) \
                .withColumn("package_endtime", check_data(df_join_full_data.end_time_package, f.lit(package_endtime))) \
                .withColumn("package_starttime", check_data(df_join_full_data.start_time_package, f.lit(package_starttime))) \
                .withColumn("student_level_code",
                            check_data_null(df_join_full_data.level_modified_new, f.lit(student_level_code))) \
                .withColumn("student_status_code", check_data_null(df_join_full_data.status_code, f.lit(student_status_code))) \
                .withColumn("behavior_id", f.lit(234).cast("long"))

            df_join_full_data.printSchema()
            print(df_join_full_data.count())
            df_join_full_data.show(10)

            dyf_join_full_data = DynamicFrame.fromDF(df_join_full_data,
                                                     glueContext,
                                                     "dyf_join_full_data")
            # dyf_join.printSchema()
            # print dyf_join.count()
            # dyf_join.show(10)

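            # Each mapping tuple below is (source field, source type, target field, target type):
            # ApplyMapping renames the fields and casts them to the target types in one pass.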
            dyf_dong_tien_student = ApplyMapping.apply(
                frame=dyf_join_full_data,
                mappings=[
                    ("time_created", "long", "student_behavior_date", "long"),
                    ("behavior_id", "long", "behavior_id", "long"),
                    ("student_id", "string", "student_id", "long"),
                    ("contact_id", "string", "contact_id", "string"),
                    ("package_code", "string", "package_code", "string"),
                    ("package_endtime", "int", "package_endtime", "long"),
                    ("package_starttime", "int", "package_starttime", "long"),
                    ("student_level_code", "string", "student_level_code",
                     "string"),
                    ("student_status_code", "string", "student_status_code",
                     "string"),
                    ("transformed_at", "long", "transformed_at", "long")
                ])

            df_dong_tien_student = dyf_dong_tien_student.toDF()
            df_dong_tien_student2 = df_dong_tien_student.withColumn(
                'student_behavior_id',
                f.md5(
                    concaText(df_dong_tien_student.student_behavior_date,
                              df_dong_tien_student.behavior_id,
                              df_dong_tien_student.student_id,
                              df_dong_tien_student.contact_id,
                              df_dong_tien_student.package_code,
                              df_dong_tien_student.package_endtime,
                              df_dong_tien_student.package_starttime,
                              df_dong_tien_student.student_level_code,
                              df_dong_tien_student.student_status_code,
                              df_dong_tien_student.transformed_at)))

            # df_dong_tien_student2 = df_dong_tien_student2.dropDuplicates()
            print("==============", df_dong_tien_student2.count())
            df_dong_tien_student2 = df_dong_tien_student2\
                .groupby('student_behavior_id', 'student_behavior_date', 'behavior_id',
                         'transformed_at', 'contact_id', 'student_id', 'package_code',
                         'package_endtime', 'package_starttime', 'student_level_code')\
                .agg(f.first('student_status_code').alias('student_status_code'))
            print("==============", df_dong_tien_student2.count())

            dyf_dong_tien_student = DynamicFrame.fromDF(
                df_dong_tien_student2, glueContext, 'dyf_dong_tien_student')

            dyf_dong_tien_student = Filter.apply(
                frame=dyf_dong_tien_student,
                f=lambda x: x["contact_id"] is not None and x["contact_id"] != '')

            applymapping1 = ApplyMapping.apply(
                frame=dyf_dong_tien_student,
                mappings=[
                    ("student_behavior_id", "string", "student_behavior_id",
                     "string"),
                    ("student_behavior_date", "long", "student_behavior_date",
                     "long"), ("behavior_id", "long", "behavior_id", "long"),
                    ("student_id", "long", "student_id", "long"),
                    ("contact_id", "string", "contact_id", "string"),
                    ("package_code", "string", "package_code", "string"),
                    ("package_endtime", "long", "package_endtime", "long"),
                    ("package_starttime", "long", "package_starttime", "long"),
                    ("student_level_code", "string", "student_level_code",
                     "string"),
                    ("student_status_code", "string", "student_status_code",
                     "string"),
                    ("transformed_at", "long", "transformed_at", "long")
                ])

            resolvechoice1 = ResolveChoice.apply(
                frame=applymapping1,
                choice="make_cols",
                transformation_ctx="resolvechoice1")
            dropnullfields1 = DropNullFields.apply(
                frame=resolvechoice1, transformation_ctx="dropnullfields1")
            print(resolvechoice1.count())
            resolvechoice1.printSchema()
            resolvechoice1.show(10)

            print('START WRITE TO S3-------------------------')
            datasink6 = glueContext.write_dynamic_frame.from_options(
                frame=dropnullfields1,
                connection_type="s3",
                connection_options={
                    "path": "s3://dtsodin/student_behavior/student_behavior/",
                    "partitionKeys": ["behavior_id"]
                },
                format="parquet",
                transformation_ctx="datasink6")
            print('END WRITE TO S3-------------------------')

            # datasink1 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields1,
            #                                                            catalog_connection="glue_redshift",
            #                                                            connection_options={
            #                                                                "dbtable": "student_behavior",
            #                                                                "database": "dts_odin"
            #                                                            },
            #                                                            redshift_tmp_dir="s3a://dtsodin/temp/student_behavior/",
            #                                                            transformation_ctx="datasink1")

            # write the flag
            # take the max _key from the data source
            datasourceTmp = dyf_student_assignments.toDF()
            flag = datasourceTmp.agg({"_key": "max"}).collect()[0][0]

            flag_data = [flag]
            df = spark.createDataFrame(flag_data, "long").toDF('flag')

            # overwrite the _key flag in S3
            df.write.parquet(
                "s3a://dtsodin/flag/toa_student_assignments.parquet",
                mode="overwrite")
        except Exception as e:
            print(e)
Esempio n. 26
0

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as f


def _to_pandas(rows):
    pd_df = pd.DataFrame(list(rows))
    return [pd_df]


def to_pandas(df, num_partitions=None):
    """Collect a Spark DataFrame into a local pandas DataFrame, partition by partition.

    :param df: Spark DataFrame
    :param num_partitions: number of partitions to repartition the Spark DataFrame to (default: None)
    :return: pandas DataFrame
    """
    if num_partitions is not None:
        df = df.repartition(num_partitions)
    pd_dfs = df.rdd.mapPartitions(_to_pandas).collect()
    pd_df = pd.concat(pd_dfs)
    pd_df.columns = df.columns

    return pd_df

spark = SparkSession.builder \
            .appName("smartCity")\
            .master("local[5]")\
            .getOrCreate()

df = spark.range(50).withColumn("value", f.md5(f.lit("abcd")))

df2 = df.cache()
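
# A small usage sketch for to_pandas() defined above: each partition is converted to a
# pandas DataFrame on the executors, collected, and concatenated locally; the partition
# count of 4 is arbitrary and for illustration only.
pdf = to_pandas(df2, num_partitions=4)
print(pdf.head())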
Esempio n. 27
0
def main():
    def checknull(level_modified, level_study):
        if level_modified is not None:
            return level_modified
        else:
            return level_study

    checknull_ = udf(checknull, StringType())

    def concaText(student_behavior_date, behavior_id, student_id, contact_id,
                  package_code, package_endtime, package_starttime,
                  student_level_code, student_package_status_code,
                  transformed_at):
        text_concat = ""
        if student_behavior_date is not None:
            text_concat += str(student_behavior_date)
        if behavior_id is not None:
            text_concat += str(behavior_id)
        if student_id is not None:
            text_concat += str(student_id)
        if contact_id is not None:
            text_concat += str(contact_id)
        if package_code is not None:
            text_concat += str(package_code)
        if package_endtime is not None:
            text_concat += str(package_endtime)
        if package_starttime is not None:
            text_concat += str(package_starttime)
        if student_level_code is not None:
            text_concat += str(student_level_code)
        if student_package_status_code is not None:
            text_concat += str(student_package_status_code)
        if transformed_at is not None:
            text_concat += str(transformed_at)
        return text_concat

    concaText = udf(concaText, StringType())
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    dyf_student_contact = glueContext.create_dynamic_frame.from_catalog(
        database="tig_advisor", table_name="student_contact")

    dyf_student_contact = dyf_student_contact.select_fields(
        ['student_id', 'contact_id', 'level_study'])

    dyf_log_student_level_study = glueContext.create_dynamic_frame.from_catalog(
        database="tig_advisor", table_name="log_student_level_study")

    dyf_log_student_level_study = dyf_log_student_level_study.select_fields([
        'contact_id', 'level_current', 'level_modified', 'package_code',
        'time_created'
    ])
    dyf_log_student_level_study = dyf_log_student_level_study.resolveChoice(
        specs=[('_key', 'cast:int')])

    dyf_tpe_invoice_product = glueContext.create_dynamic_frame.from_catalog(
        database="tig_market", table_name="tpe_invoice_product")
    dyf_tpe_invoice_product = dyf_tpe_invoice_product.select_fields([
        '_key', 'timecreated', 'user_id', 'buyer_id', 'invoice_packages_price',
        'invoice_price', 'invoice_code'
    ])
    dyf_tpe_invoice_product = dyf_tpe_invoice_product.resolveChoice(
        specs=[('_key', 'cast:long')])
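    # resolveChoice casts the ambiguous _key column to long so it can be compared
    # numerically with the flag (watermark) read from S3 below.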
    dyf_tpe_invoice_product_details = glueContext.create_dynamic_frame.from_catalog(
        database="tig_market", table_name="tpe_invoice_product_details")

    dyf_tpe_invoice_product_details = dyf_tpe_invoice_product_details.select_fields(
        ['cat_code', 'package_time', 'invoice_code'])

    dyf_student_package = glueContext.create_dynamic_frame.from_catalog(
        database="tig_advisor", table_name="log_student_package")

    # select the required fields
    dyf_student_package = dyf_student_package.select_fields(
        ['student_id', 'start_time', 'end_time',
         'package_code']).rename_field('student_id', 'student_id1')
    dyf_student_package.printSchema()
    dyf_student_package.show(2)
    # read the flag (watermark) from S3
    try:
        df_flag = spark.read.parquet(
            "s3a://dtsodin/flag/student_behavior/flag_hoc_vien_duoc_mua_goi_nap_tien.parquet"
        )
        start_read = df_flag.collect()[0]['flag']
        print('read from index: ', start_read)

        # compare the datasource _key with the flag and keep only rows with _key > flag
        dyf_tpe_invoice_product = Filter.apply(
            frame=dyf_tpe_invoice_product, f=lambda x: x['_key'] > start_read)
    except Exception:
        print('error reading flag file')

    print('the number of new contacts: ', dyf_tpe_invoice_product.count())

    if (dyf_tpe_invoice_product.count() > 0):
        df_log_student_level_study = dyf_log_student_level_study.toDF()
        df_log_student_level_study = df_log_student_level_study.groupby(
            'contact_id', 'level_current', 'level_modified',
            'package_code').agg(f.max('time_created').alias('time_created'))

        dyf_join0 = Join.apply(dyf_tpe_invoice_product,
                               dyf_tpe_invoice_product_details, 'invoice_code',
                               'invoice_code')
        print("@@@@@@@@@@@@")
        dyf_join0.printSchema()
        dyf_join0.show(2)
        dyf_log_student_level_study = DynamicFrame.fromDF(
            df_log_student_level_study, glueContext,
            "dyf_log_student_level_study")

        dyf_join1 = Join.apply(dyf_student_contact, dyf_join0, "contact_id",
                               "user_id")
        dyf_join = Join.apply(dyf_join1, dyf_log_student_level_study,
                              "user_id", "contact_id")
        print("@@@@@@@@@@@@")
        dyf_join.printSchema()
        dyf_join.show(2)
        dyf_join = Filter.apply(
            frame=dyf_join, f=lambda x: x['time_created'] <= x['timecreated'])

        dyf_data_join3 = Join.apply(dyf_join, dyf_student_package,
                                    "student_id", "student_id1")
        dyf_data_join3 = Filter.apply(
            frame=dyf_data_join3,
            f=lambda x: x['package_code'] == x['cat_code'])
        df_data_join3 = dyf_data_join3.toDF()
        df_data_join3 = df_data_join3 \
            .withColumn("student_level_code",
                        checknull_(df_data_join3.level_modified, df_data_join3.level_study)) \
            .withColumn("behavior_id", f.lit(3)) \
            .withColumn("student_package_status_code", f.lit("DEACTIVED")) \
            .withColumn("student_behavior_date", from_unixtime(df_data_join3.timecreated)) \
            .withColumn("package_starttime", df_data_join3['start_time']) \
            .withColumn("package_endtime", df_data_join3['end_time']) \
            .withColumn("transformed_at", f.lit(None))
        df_data_join3 = df_data_join3.withColumn(
            'student_behavior_id',
            f.md5(
                concaText(df_data_join3.student_behavior_date,
                          df_data_join3.behavior_id, df_data_join3.student_id,
                          df_data_join3.contact_id, df_data_join3.package_code,
                          df_data_join3.package_endtime,
                          df_data_join3.package_starttime,
                          df_data_join3.student_level_code,
                          df_data_join3.student_package_status_code,
                          df_data_join3.transformed_at)))
        df_data_join3 = df_data_join3.dropDuplicates()
        dyf_data_join3 = DynamicFrame.fromDF(df_data_join3, glueContext,
                                             "dyf_data_join3")
        dyf_data_join3 = dyf_data_join3.resolveChoice(
            specs=[('behavior_id',
                    'cast:int'), ('student_behavior_date', 'cast:timestamp')])
        dyf_data_join3.printSchema()
        dyf_data_join3.show(2)
        applymapping = ApplyMapping.apply(
            frame=dyf_data_join3,
            mappings=[("student_behavior_id", "string", "student_behavior_id",
                       "string"),
                      ("contact_id", "string", "contact_id", "string"),
                      ("student_behavior_date", "timestamp",
                       "student_behavior_date", "long"),
                      ("student_id", "string", "student_id", "long"),
                      ("cat_code", "string", "package_code", "string"),
                      ("package_starttime", "int", "package_starttime",
                       "long"),
                      ("package_endtime", "int", "package_endtime", "long"),
                      ("student_package_status_code", "string",
                       "student_status_code", "string"),
                      ("behavior_id", "int", "behavior_id", "long"),
                      ("student_level_code", "string", "student_level_code",
                       "string")])

        resolvechoice = ResolveChoice.apply(frame=applymapping,
                                            choice="make_cols",
                                            transformation_ctx="resolvechoice")

        dropnullfields = DropNullFields.apply(
            frame=resolvechoice, transformation_ctx="dropnullfields")

        print(dropnullfields.count())
        dropnullfields.toDF().show()

        glueContext.write_dynamic_frame.from_options(
            frame=dropnullfields,
            connection_type="s3",
            connection_options={
                "path": "s3://dtsodin/student_behavior/student_behavior",
                "partitionKeys": ["behavior_id"]
            },
            format="parquet")

        applymapping1 = ApplyMapping.apply(
            frame=dyf_data_join3,
            mappings=[("invoice_packages_price", "int", "measure1", "long"),
                      ("behavior_id", "int", "behavior_id", "long"),
                      ("invoice_price", "int", "measure2", "long")])

        resolvechoice1 = ResolveChoice.apply(
            frame=applymapping1,
            choice="make_cols",
            transformation_ctx="resolvechoice1")

        dropnullfields1 = DropNullFields.apply(
            frame=resolvechoice1, transformation_ctx="dropnullfields1")

        print(dropnullfields1.count())
        dropnullfields1.toDF().show()
        glueContext.write_dynamic_frame.from_options(
            frame=dropnullfields1,
            connection_type="s3",
            connection_options={
                "path":
                "s3://dtsodin/student_behavior/student_general_behavior",
                "partitionKeys": ["behavior_id"]
            },
            format="parquet")

        df_tpe_invoice_product = dyf_tpe_invoice_product.toDF()
        flag = df_tpe_invoice_product.agg({"_key": "max"}).collect()[0][0]

        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')

        # overwrite the _key flag in S3
        df.write.parquet(
            "s3a://dtsodin/flag/student_behavior/flag_hoc_vien_duoc_mua_goi_nap_tien.parquet",
            mode="overwrite")
Esempio n. 28
0
def main():
    # ========== init

    # =========== create_dynamic_frame
    df_log_in_out = retrieve_dynamic_frame(glue_context,
                                           'native_livestream',
                                           'log_in_out',
                                           casts=[('time_in', 'cast:long'),
                                                  ('time_out', 'cast:long'),
                                                  ('thoigianhoc', 'cast:long')
                                                  ])

    if IS_DEV:
        print('df_log_in_out')
        df_log_in_out.printSchema()
        df_log_in_out.show(3)

    # display(data_frame=df_log_in_out, message="dyf_log_in_out")

    # ========== clear data
    # df_log_in_out = filter_latest(spark=spark, data_frame=df_log_in_out, config_file=flag_file)

    df_log_in_out_origin = df_log_in_out

    if df_log_in_out.count() == 0:
        return

    df_log_in_out = df_log_in_out.dropDuplicates(
        ['student_id', 'room_id', 'time_in'])
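    # Collapse the individual in/out events per (student_id, room_id) into one learning
    # session: earliest time_in, latest time_out, total study time (thoigianhoc), and the
    # number of in/out events.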
    df_log_in_out = df_log_in_out.groupBy('student_id', 'room_id').agg(
        min('time_in').alias('time_in'),
        max('time_out').alias('time_out'),
        sum('thoigianhoc').alias('thoigianhoc'),
        count('time_in').cast('long').alias('number_in_out'))

    if IS_DEV:
        print('df_log_in_out_after_group_by_room')
        df_log_in_out.printSchema()
        df_log_in_out.show(3)

    df_log_in_out = df_log_in_out \
        .withColumn('student_behavior_date', df_log_in_out['time_in']) \
        .withColumn('end_learning_time', df_log_in_out['time_out']) \
        .withColumn('duration', df_log_in_out['thoigianhoc'])

    df_log_in_out = df_log_in_out.withColumn(
        'student_behavior_date', expr("student_behavior_date div 1000"))
    df_log_in_out = df_log_in_out.withColumn(
        'start_learning_time', df_log_in_out['student_behavior_date'])
    df_log_in_out = df_log_in_out.withColumn(
        'end_learning_time', expr("end_learning_time div 1000"))
    df_log_in_out = df_log_in_out.withColumn(
        'duration', round_duration_udf(expr("duration div 1000")))

    df_log_in_out = data_frame_filter_not_null(df_log_in_out, [
        'student_behavior_date', 'student_id', 'start_learning_time',
        'end_learning_time', 'duration'
    ])

    display(data_frame=df_log_in_out, message="dyf_log_in_out clear data")

    # =========== create_dynamic_frame
    df_streaming_calendar_teach = retrieve_dynamic_frame(
        glue_context, 'topicalms', 'streaming_calendar_teach', [
            'id', 'type_class', 'teacher_available_id', 'assistant_id',
            'hour_id'
        ], [('room_id', 'cast:int')])
    # dyf_streaming_material = retrieve_dynamic_frame(glue_context, 'topicalms', 'streaming_material',
    #                                                 ['calendar_teach_id', 'subject'])
    df_student_contact = retrieve_dynamic_frame(glue_context, 'tig_advisor',
                                                'student_contact',
                                                ['contact_id', 'student_id'])

    df_student_level = retrieve_data_frame_from_redshift(
        glue_context, 'transaction_log', 'ad_student_level',
        ['contact_id', 'level_code', 'start_date', 'end_date'])
    df_student_level = df_student_level.withColumnRenamed(
        'contact_id', 'contact_id_level')
    display(df_student_level, "df_student_level")

    df_student_package = retrieve_data_frame_from_redshift(
        glue_context, 'transaction_log', 'ad_student_package', [
            'contact_id', 'package_code', 'package_status_code',
            'package_start_time', 'package_end_time'
        ])
    df_student_package = df_student_package.withColumnRenamed(
        'contact_id', 'contact_id_package')
    display(df_student_package, "df_student_package")

    df_student_advisor = retrieve_data_frame_from_redshift(
        glue_context, 'transaction_log', 'ad_student_advisor',
        ['contact_id', 'advisor_id', 'start_date', 'end_date'])
    df_student_advisor = df_student_advisor\
        .withColumnRenamed('contact_id', 'contact_id_advisor')\
        .withColumnRenamed('start_date', 'start_date_advisor') \
        .withColumnRenamed('end_date', 'end_date_advisor')

    display(df_student_advisor, "df_student_advisor")

    # ============ join student_contact table
    df_result = df_log_in_out.join(df_student_contact,
                                   on=['student_id'],
                                   how='left_outer')

    df_result = data_frame_filter_not_null(df_result, ['contact_id'])

    display(df_result, "join student_contact table")

    # ============ join streaming_calendar_teach table
    df_result = df_result.join(
        df_streaming_calendar_teach,
        df_result.room_id == df_streaming_calendar_teach.id,
        how='left_outer')
    df_result = df_result.withColumnRenamed('teacher_available_id',
                                            'teacher_id')
    df_result = df_result.withColumnRenamed('type_class', 'class_type')
    df_result = df_result \
        .where(col('class_type').eqNullSafe('NORMAL')) \
        .withColumn('class_type', lit('LIVESTREAM').cast('string'))

    df_result = data_frame_filter_not_null(
        df_result, ['room_id', 'class_type', 'hour_id'])

    display(df_result, "join streaming_calendar_teach table")

    df_result = df_result \
        .join(df_student_level,
              (df_result.contact_id == df_student_level.contact_id_level)
              & (df_result.student_behavior_date >= df_student_level.start_date)
              & (df_result.student_behavior_date < df_student_level.end_date),
              'left'
              ) \
        .join(df_student_package,
              (df_result.contact_id == df_student_package.contact_id_package)
              & (df_result.student_behavior_date >= df_student_package.package_start_time)
              & (df_result.student_behavior_date < df_student_package.package_end_time),
              'left'
              ) \
        .join(df_student_advisor,
              (df_result.contact_id == df_student_advisor.contact_id_advisor)
              & (df_result.student_behavior_date >= df_student_advisor.start_date_advisor)
              & (df_result.student_behavior_date < df_student_advisor.end_date_advisor),
              'left'
              )

    display(
        df_result,
        "join df_student_level, df_student_package, df_student_advisor table")

    # ============ add column
    df_result = df_result \
        .withColumn('behavior_id', lit(13).cast('long')) \
        .withColumnRenamed('level_code', 'student_level_code') \
        .withColumn('transformed_at', lit(datetime.now()).cast("long")) \
        .withColumn('platform', lit('MOBILE').cast('string')) \
        .withColumn('teacher_type', lit(None).cast('string')) \
        .withColumn('year_month_id',
                    lit(from_unixtime(df_result['student_behavior_date'], format="yyyyMM")).cast('long')) \
        .withColumn('role_in_class', lit('UNAVAILABLE')) \
        .withColumn('vcr_type', lit('UNAVAILABLE'))

    contact_id_unavailable = 0
    student_id_unavailable = 0
    package_endtime_unavailable = 99999999999
    package_starttime_unavailable = 0
    student_level_code_unavailable = 'UNAVAILABLE'
    student_status_code_unavailable = 'UNAVAILABLE'
    package_code_unavailable = 'UNAVAILABLE'
    class_type_unavailable = 'UNAVAILABLE'
    teacher_type_unavailable = 'UNAVAILABLE'
    advisor_unavailable = 0
    measure1_unavailable = float(0.0)
    measure2_unavailable = float(0.0)
    measure3_unavailable = float(0.0)
    measure4_unavailable = float(0.0)
    role_in_class_unavailable = 'UNAVAILABLE'
    number_in_out_unavailable = 1

    df_result = df_result.na.fill({
        'package_code': package_code_unavailable,
        'student_level_code': student_level_code_unavailable,
        'package_status_code': student_status_code_unavailable,
        'class_type': class_type_unavailable,
        'teacher_type': teacher_type_unavailable,
        'advisor_id': advisor_unavailable,
        'role_in_class': role_in_class_unavailable,
        'number_in_out': number_in_out_unavailable
    })

    df_result = df_result.dropDuplicates()

    df_result = df_result.withColumn(
        'student_behavior_id',
        md5(
            concat_text_udf(df_result.student_behavior_date,
                            df_result.behavior_id, df_result.student_id,
                            df_result.contact_id, df_result.package_code,
                            df_result.student_level_code,
                            df_result.package_status_code,
                            df_result.transformed_at)))

    df_result.persist(StorageLevel.DISK_ONLY_2)

    display(df_result, "df_result add column")

    # ============ select student_behavior
    dyf_result = from_data_frame(data_frame=df_result,
                                 glue_context=glue_context,
                                 name='dyf_result')
    dyf_student_behavior = dyf_result.select_fields([
        'student_behavior_id', 'student_behavior_date', 'behavior_id',
        'student_id', 'contact_id', 'package_code', 'student_level_code',
        'package_status_code', 'advisor_id', 'transformed_at', 'year_month_id'
    ])

    dyf_student_behavior = dyf_student_behavior.resolveChoice([
        ('student_behavior_id', 'cast:string'),
        ('student_behavior_date', 'cast:long'), ('behavior_id', 'cast:int'),
        ('student_id', 'cast:long'), ('contact_id', 'cast:string'),
        ('package_code', 'cast:string'), ('student_level_code', 'cast:string'),
        ('package_status_code', 'cast:string'), ('advisor_id', 'cast:long'),
        ('transformed_at', 'cast:long'), ('year_month_id', 'cast:long')
    ])

    # -----------------------------------------------------------------------------------------------------------------#
    dyf_student_learning = dyf_result.select_fields([
        'student_behavior_id', 'class_type', 'platform', 'teacher_id',
        'teacher_type', 'assistant_id', 'hour_id', 'start_learning_time',
        'end_learning_time', 'duration', 'behavior_id', 'year_month_id',
        'role_in_class', 'number_in_out', 'vcr_type'
    ])

    dyf_student_learning = dyf_student_learning.resolveChoice([
        ('student_behavior_id', 'cast:string'), ('class_type', 'cast:string'),
        ('platform', 'cast:string'), ('teacher_id', 'cast:long'),
        ('teacher_type', 'cast:string'), ('assistant_id', 'cast:long'),
        ('hour_id', 'cast:int'), ('start_learning_time', 'cast:long'),
        ('end_learning_time', 'cast:long'), ('duration', 'cast:long'),
        ('behavior_id', 'cast:int'), ('year_month_id', 'cast:long'),
        ('role_in_class', 'cast:string'), ('number_in_out', 'cast:long'),
        ('vcr_type', 'cast:string')
    ])

    # -----------------------------------------------------------------------------------------------------------------#
    df_flag = get_flag(spark=spark, data_frame=df_log_in_out_origin)

    # ============ save
    display(dyf_student_behavior, 'dyf_student_behavior')
    display(dyf_student_learning, 'dyf_student_learning')
    display(df_flag, "df_flag")

    save_data_to_s3(glue_context, dyf_student_behavior,
                    student_behavior_s3_path, student_behavior_s3_partition)
    save_data_to_s3(glue_context, dyf_student_learning,
                    student_learning_s3_path, student_learning_s3_partition)

    if df_flag.collect()[0]['flag'] is not None:
        print('save_flag done')
        save_flag(df_flag, flag_file)

    df_result.unpersist()
Esempio n. 29
0
    logging.shutdown()
    dbutils.notebook.exit('Failed')

# COMMAND ----------

# MAGIC %md
# MAGIC ##### Adding additional columns to the `DataFrame`
# MAGIC * The command below creates two additional columns in the `DataFrame`:
# MAGIC * a key column (`channels_key`) generated with `md5` from the `CHANNEL_ID` column of the source table
# MAGIC * a `load_date` column populated with `current_timestamp()`

# COMMAND ----------

from pyspark.sql.functions import md5, col, concat, lit, current_timestamp

df_channels = df_channels.withColumn("channels_key",md5(concat(df_channels.CHANNEL_ID)))\
              .withColumn('load_date',lit(current_timestamp()))
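
# Note: concat() over the single CHANNEL_ID column mainly casts it to a string before
# hashing; more columns could be appended inside concat() to form a composite key.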

# COMMAND ----------

# MAGIC %md
# MAGIC ##### Creating a `DataFrame` from the target table data based on a single ID column
# MAGIC * We use the command below to read the existing data from the target table so it can be compared against the incremental data from the source.

# COMMAND ----------
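
# The cell that actually reads the target table does not appear in this snippet; the sketch
# below shows one way it could look. The table name "silver.channels" and the single ID
# column "CHANNEL_ID" are assumptions for illustration only.
df_channels_target = spark.read.table("silver.channels").select("CHANNEL_ID", "channels_key", "load_date")

# Incremental rows are then those whose CHANNEL_ID is not already present in the target table.
df_channels_new = df_channels.join(df_channels_target, on="CHANNEL_ID", how="left_anti")

# COMMAND ----------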

# Since Spark 2.3, the queries from raw JSON/CSV files are disallowed when the referenced columns only include the internal corrupt record column (named _corrupt_record by default).
# For example: spark.read.schema(schema).csv(file).filter($"_corrupt_record".isNotNull).count() and spark.read.schema(schema).csv(file).select("_corrupt_record").show().
# Instead, you can cache or save the parsed results and then send the same query.
# Remove corrupt records
df_channels.cache()
Esempio n. 30
0
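# `func` is referenced by add_md5 below but is not included in this snippet; the following
# is a minimal, hypothetical sketch of what it might do: keep the original fields of each
# row and add a "concat_val" field that concatenates every value, ready for hashing.
from pyspark.sql import Row
from pyspark.sql import functions as F
from pyspark.sql.functions import md5


def func(row):
    d = row.asDict()
    # join the string form of every value with a separator; None becomes an empty string
    d["concat_val"] = "|".join("" if v is None else str(v) for v in d.values())
    return Row(**d)
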
def add_md5(target_df):
    # map each row through func to build "concat_val", then hash it into "hash_id"
    row_rdd = target_df.rdd.map(func)
    concat_df = row_rdd.toDF()
    hash_df = concat_df.withColumn("hash_id", md5(F.col("concat_val")))  # .drop(F.col("concat_val"))
    return hash_df