Example #1
def process_data(spark, data_folder):
    """Reads the gzip files from a folder and parses them into a Spark DataFrame
    
    Arguments:
        spark -- Spark Session
        data_folder -- folder containing the gzip files

    Returns:
        Spark DataFrame
    """
    
    # Reading files from folder
    df_nasa = spark.read.text(data_folder)

    # Regex for group matching
    parse_regex = '(.*) - - \[([\w:/]+\s[+\-]\d{4})\] \"(.*)\" (\d{3}) ([0-9]*|-)'
    
    # Create columns based on Regex group match
    df = df_nasa.withColumn('host',F.regexp_extract(F.col('value'),parse_regex,1)) \
                .withColumn('timestamp',F.regexp_extract(F.col('value'),parse_regex,2)) \
                .withColumn('request',F.regexp_extract(F.col('value'),parse_regex,3)) \
                .withColumn('status_code',F.regexp_extract(F.col('value'),parse_regex,4).cast(IntegerType())) \
                .withColumn('bytes',F.regexp_extract(F.col('value'),parse_regex,5).cast(IntegerType())) \
                .drop('value')

    return df
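A minimal usage sketch for the function above, assuming a local SparkSession and a hypothetical folder of gzip logs; the imports mirror the ones the snippet itself relies on (F and IntegerType):

from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import IntegerType

spark = SparkSession.builder.appName("nasa-logs").getOrCreate()
df_logs = process_data(spark, "data/nasa_logs/")  # hypothetical path to the gzip files
df_logs.printSchema()
df_logs.show(5, truncate=False)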
Example #2
        def calculateDifficulty(self, dfIngridients):
            dfIngridients.persist(StorageLevel.MEMORY_AND_DISK)
            dfIngridients = dfIngridients.filter(
                dfIngridients.ingridents.contains("beef"))
            dfIngridients = dfIngridients.withColumn("cookTimeInt", regexp_extract(dfIngridients.cookTime, r"(\d+)", 1)) \
                .withColumn("prepTimeInt", regexp_extract(dfIngridients.prepTime, r"(\d+)", 1))

            dfIngridients = dfIngridients.withColumn(
                "totalTime",
                dfIngridients.cookTimeInt + dfIngridients.prepTimeInt)

            dfIngridients_filtered = dfIngridients.withColumn(
                "difficulty",
                when(dfIngridients.totalTime > 60, lit("Hard"))
                .when((dfIngridients.totalTime > 30) & (dfIngridients.totalTime < 60), lit("Medium"))
                .when(dfIngridients.totalTime < 30, lit("Easy"))
                .otherwise(lit("Unknown"))
            ).withColumn("currentDate", unix_timestamp() * 1000)

            dfIngridients_filtered = dfIngridients_filtered.repartition("difficulty").persist(
                StorageLevel.MEMORY_AND_DISK)

            self.log.info("dfIngridients_filtered: {}".format(
                dfIngridients_filtered.rdd.count()))

            self.writeToImpala(
                dataToWrite=dfIngridients_filtered,
                table=self.config.get('impala').get('tablename'),
                properties=self.ImpalaProperties.get('impala'))
Example #3
def analysis_7(units_df, damages_df, log):
    """Logs the result for Query 7.
    :param units_df: DataFrame Units_use.
    :param damages_df: DataFrame Damages_use.
    :param log: Logger.
    :return None
    """
    filtered_units = (
        units_df
        .filter(col("FIN_RESP_TYPE_ID").contains("INSURANCE"))
        .withColumn("VEH_DMAG_SCL_1_ID", when(units_df.VEH_DMAG_SCL_1_ID.contains("DAMAGED"),
                                              regexp_extract(col("VEH_DMAG_SCL_1_ID"), "(\\d{1})", 1)).otherwise(0))
        .withColumn("VEH_DMAG_SCL_2_ID", when(units_df.VEH_DMAG_SCL_2_ID.contains("DAMAGED"),
                                              regexp_extract(col("VEH_DMAG_SCL_2_ID"), "(\\d{1})", 1)).otherwise(0))
        .filter((col("VEH_DMAG_SCL_1_ID") > 4) | (col("VEH_DMAG_SCL_2_ID") > 4))
    )
    units_damages_left_join_filtered = (
        filtered_units.alias("U")
        .join(damages_df.alias("D"), col("U.CRASH_ID") == col("D.CRASH_ID"), "left")
        .filter(col("DAMAGED_PROPERTY").contains("NONE")
                | col("DAMAGED_PROPERTY").contains("NO DAMAGE")
                | col("DAMAGED_PROPERTY").isNull())
        .select("U.CRASH_ID")
        .distinct()
    )
    crash_count = units_damages_left_join_filtered.count()
    log.warn("Result for Query 7")
    log.warn(
        "Distinct Crash IDs where No Damaged Property was observed and Damage Level (VEH_DMAG_SCL~) is above 4 and "
        "car avails Insurance = {}".format(crash_count))

    return None
Example #4
    def custom_to_dataframe(self, filename):
        # custom_schema = StructType([self.schema[i] for i in [0,9,10]])
        custom_data = spark.read.text(self.path + filename)

        r = "user_id=(.+)feature_9=(.+)feature_10=(.+)"
        custom_data = custom_data.select(regexp_extract('value',r,1).alias('user_id'), \
                 regexp_extract('value',r,2).cast("double").alias('feature_9'), \
                 regexp_extract('value',r,3).cast("double").alias('feature_10'))
        return custom_data
Example #5
def main(log_file):
    sc = utils.setup_spark_context()

    sqlContext = SQLContext(sc)
    try:
        sql_log_data = sqlContext.read.text(log_file)
    except Exception:
        print("######################")
        print("Bad file name!")
        return

    splited_data_frame = sql_log_data.select(
        regexp_extract('value', r'^([^\s]+\s)', 1).alias('host'),
        regexp_extract('value',
                       r'^.*\[(\d\d/\w{3}/\d{4}:\d{2}:\d{2}:\d{2} -\d{4})]',
                       1).alias('timestamp'),
        regexp_extract('value', r'^.*"\w+\s+([^\s]+)\s+HTTP.*"',
                       1).alias('request'),
        regexp_extract('value', r'^.*"\s+([^\s]+)',
                       1).cast('integer').alias('http_status'),
        regexp_extract('value', r'^.*\s+(\d+)$',
                       1).cast('integer').alias('content_size_in_bytes'))

    splited_data_frame.cache()

    data_frames = {}

    data_frames['unique_hosts'] = splited_data_frame.groupBy(
        'host').count().filter('count = 1').select('host')

    data_frames['top_20_request'] = splited_data_frame.groupBy(
        'request').count().sort(desc("count")).limit(20)

    data_frames['total_http_404'] = splited_data_frame.groupBy(
        'http_status').count().filter('http_status = "404"')

    data_frames['frequency_status'] = splited_data_frame.groupBy(
        'http_status').count()

    data_frames['top_5_hosts_http_404'] = splited_data_frame.filter(
        'http_status = "404"').groupBy('request').count().sort(
            col("count").desc()).limit(5)

    data_frames['qty_http_404_per_day'] = splited_data_frame.filter(
        'http_status = "404"').groupBy(
            splited_data_frame.timestamp.substr(1,
                                                11).alias('day')).count().sort(
                                                    desc('day'))
    data_frames['sum_bytes'] = splited_data_frame.select(
        'content_size_in_bytes').groupBy().sum()

    data_frames['bytes_per_day'] = splited_data_frame.select(
        'content_size_in_bytes', 'timestamp').groupBy(
            splited_data_frame.timestamp.substr(
                1, 11).alias('day')).sum().sort('day')

    utils.export_all_queries_to_csv(data_frames)
Example #6
def main():
    sc = SparkContext()
    sqlContext = SQLContext(sc)
    # Collect all the logs from the files folder
    files = listdir('./files')

    schema_blank = StructType([StructField("value", StringType(), True)])

    # Empty DataFrame used to union all the files in ./files
    main_df = sqlContext.createDataFrame([], schema_blank)

    for file in files:
        path_file = './files/' + file
        temp_df = sqlContext.read.text(path_file)
        main_df = main_df.union(temp_df)

    main_df_format = main_df.select(
        regexp_extract('value', r'^([^\s]+\s)', 1).alias('host'),
        regexp_extract('value',
                       r'^.*\[(\d\d/\w{3}/\d{4}:\d{2}:\d{2}:\d{2} -\d{4})]',
                       1).alias('timestamp'),
        regexp_extract('value', r'^.*"\w+\s+([^\s]+)\s+HTTP.*"',
                       1).alias('URL'),
        regexp_extract('value', r'^.*"\s+([^\s]+)',
                       1).cast('integer').alias('codeHTTP'),
        regexp_extract('value', r'^.*\s+(\d+)$',
                       1).cast('integer').alias('byte'))

    # 1. Number of unique hosts:
    host_uniques = main_df_format.groupBy('host').count().filter(
        'count = 1').count()
    with open('./resultados/host_uniques.csv', 'w') as file:
        writer = csv.writer(file)
        writer.writerow([',host_uniques'])
        writer.writerow(['0,{}'.format(host_uniques)])

    # 2. Total number of 404 errors:
    main_df_format.groupBy('codeHTTP').count().filter(
        'codeHTTP = "404"').toPandas().to_csv('./resultados/total_404.csv')

    # 3. The 5 URLs that caused the most 404 errors
    main_df_format.filter('codeHTTP = "404"').groupBy('URL').count().sort(
        col("count").desc()).limit(5).toPandas().to_csv(
            './resultados/top_five_404.csv')

    # 4. Number of 404 errors per day
    main_df_format.filter('codeHTTP = "404"').groupBy(
        main_df_format.timestamp.substr(1, 11).alias(
            'day')).count().toPandas().to_csv('./resultados/per_day_404.csv')

    # 5. Total number of bytes returned:
    main_df_format.select('byte').groupBy().sum().toPandas().to_csv(
        './resultados/total_bytes.csv')

    print('Files exported to the resultados folder')
Example #7
def clean(spark, rows):
    # Load Data
    df = spark.createDataFrame(Row(**row) for row in rows)

    # Clean column country
    re_country = "[a-zA-Z][a-zA-Z\s\-]*"

    df = df.withColumn(
        "country",
        (F.lower(F.trim(F.regexp_extract("country", re_country, 0)))),
    )

    # Clean column campus
    re_campus = "([a-zA-Z]+[_\ \-]?)+"

    df = df.withColumn(
        "campus", (F.lower(F.trim(F.regexp_extract("campus", re_campus, 0)))))

    # Clean column mobility
    re_mobility = "([a-zA-Z0-9]+[\ \-]?)+"

    df = df.withColumn(
        "mobility",
        (F.lower(F.trim(F.regexp_extract("mobility", re_mobility, 0)))))

    # Clean column contracts
    df = df.withColumn(
        "contracts", null_negative_int(df["contracts"].cast(T.IntegerType())))

    # Clean column alternative_choice
    re_alternative_choice = "([a-zA-Z]+[_\ \-]?)+"

    df = df.withColumn(
        "alternative_choice",
        (F.lower(
            F.trim(
                F.regexp_extract("alternative_choice", re_alternative_choice,
                                 0)))),
    )

    # Clean column distance
    re_distance = "[0-9]+"

    df = df.withColumn(
        "distance",
        (F.lower(F.trim(F.regexp_extract("distance", re_distance, 0))).cast(
            T.IntegerType())),
    )

    # Clean column pro_contract
    df = df.withColumn("pro_contract",
                       df["pro_contract"].cast(T.BooleanType()))

    return df
Example #8
def house_number_extract(df):
    #make address_line_1 all uppercase
    df = df.withColumn('address_line_1', f.upper('address_line_1'))
    
    #extract house number or box number into column housenumber
    df = df.withColumn('housenumber',
                      f.when(
                          f.col('address_line_1').rlike('^[A-Z]{2}'),
                          f.regexp_extract(f.col('address_line_1'),'(BOX\\s)([0-9]+[0-9A-Z.*-]*)', 2))
                       .otherwise(f.regexp_extract(f.col('address_line_1'),'^([A-Z]*[0-9]+[0-9A-Z.*-]*)', 1)))
    return df
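A small illustration of the extraction above on two made-up address strings (the sample rows are assumptions, not data from the original source):

import pyspark.sql.functions as f
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sample = spark.createDataFrame([("123 Main St",), ("PO Box 456",)], ["address_line_1"])
house_number_extract(sample).show()
# "123 MAIN ST" -> housenumber "123"; "PO BOX 456" -> housenumber "456"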
Example #9
    def prepare_google_trend():
        # Extract week start date and state.
        google_trend_all = google_trend_csv \
            .withColumn('Date', F.regexp_extract(google_trend_csv.week, '(.*?) -', 1)) \
            .withColumn('State', F.regexp_extract(google_trend_csv.file, 'Rossmann_DE_(.*)', 1))

        # Map state NI -> HB,NI to align with other data sources.
        google_trend_all = google_trend_all \
            .withColumn('State', F.when(google_trend_all.State == 'NI', 'HB,NI').otherwise(google_trend_all.State))

        # Expand dates.
        return expand_date(google_trend_all)
Example #10
def nasa_ingestao(df):
    return (df.select(
        regexp_extract('value', r'^([^\s]+\s)', 1).alias('host'),
        regexp_extract('value',
                       r'^.*\[(\d\d/\w{3}/\d{4}:\d{2}:\d{2}:\d{2} -\d{4})]',
                       1).alias('timestamp'),
        regexp_extract('value', r'^.*"\w+\s+([^\s]+)\s+HTTP.*"',
                       1).alias('url'),
        regexp_extract('value', r'^.*"\s+([^\s]+)',
                       1).cast('integer').alias('status'),
        regexp_extract('value', r'^.*\s+(\d+)$',
                       1).cast('integer').alias('bytes')))
Example #11
def server_finder(column):
    column = F.lower(column)
    regex_exp = r'(apache|nginx|microsoft)([^\/]+|)(\/|)((\d+\.|\d+\b|)+)'
    
    server  = F.regexp_extract(column, regex_exp, 1)
    version = F.regexp_extract(column, regex_exp, 4)
    
    server = F.when(server == '', 'other')                            \
              .otherwise(server)
    
    column = F.when(version == '', server)                            \
              .otherwise(F.concat_ws('/', server, version))
    
    return column
Example #12
def process_equipment_failure_sensors(spark, input_data, output_data):
    '''
    Writes parquet files
    to the gold path

    Parameters:
    spark : Spark Session
    input_data (str): Path to input data
    output_data (str): Path to output data
    
    '''
    try:
        df_data = spark.read.format('csv').option("sep", '\t').load(input_data)
        print('1->Read {} - OK'.format(input_data))
    except IOError:
        print('read error')

    df_data = df_data.withColumn(
        'date',
        regexp_extract('_c0', r'(\d+-\d+-\d+\s\d+:\d+:\d+)',
                       1).alias('Date').cast('timestamp')).drop('_c0')

    df_data = df_data.withColumn('error',
                                 when(df_data._c1 == 'ERROR',
                                      1).otherwise(0)).drop('_c1')

    df_data = df_data.withColumn(
        'sensor_id',
        regexp_replace('_c2', '(\D)', '').cast('integer')).drop('_c2')

    df_data = df_data.drop('_c3')

    df_data = df_data.withColumn(
        'temperature',
        regexp_extract('_c4', r'(\d+.\d+)', 1).cast('float')).drop('_c4')

    df_data = df_data.withColumn(
        'vibration',
        regexp_extract('_c5', r'([\-\+]?\d+.\d+)',
                       1).cast('float')).drop('_c5')

    print('2--->Format and clean data {} - OK'.format(input_data))

    try:
        df_data.write.format('parquet').mode('overwrite').save(output_data)
        print('3----->Write OK')
    except IOError:
        print('write error')
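A minimal call sketch for the routine above; the app name and the bronze/gold paths below are placeholders, not the original pipeline's configuration:

from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_extract, regexp_replace, when

spark = SparkSession.builder.appName("equipment-failure-sensors").getOrCreate()
process_equipment_failure_sensors(
    spark,
    input_data="bronze/equipment_failure_sensors/",   # placeholder input path
    output_data="gold/equipment_failure_sensors/")    # placeholder output path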
Example #13
    def prepare_dataset(self):
        """
        Compute common intermediate DataFrames and cache to reduce the execution time.
        """

        # A DataFrame of movies where the genre cells are split into several rows
        self.__movies_df_split_genres = self.__movies_df \
            .withColumn('genres', explode(split(self.__movies_df.genres, "\\|"))) \
            .filter(self.__movies_df.genres != "(no genres listed)") \
            .filter(self.__movies_df.genres != "(s listed)") \
            .dropna()

        # A DataFrame of the movies where the title and year of the movie are in separated columns
        self.__movies_df_with_year_col = self.__movies_df \
            .withColumn('year',
                        regexp_extract(self.__movies_df['title'], '[1-2][0-9][0-9][0-9]', 0).cast(IntegerType())) \
            .withColumn('title', split(self.__movies_df['title'], '\([1-2][0-9][0-9][0-9]\)').getItem(0))

        # A DataFrame that contains only the movies that have been rated or tagged
        self.__reduced_ratings = self.__ratings_df.select(
            col("userId"), col("movieId")).distinct()
        self.__reduced_tags = self.__tags_df.select(col("userId"),
                                                    col("movieId")).distinct()
        self.__movies_user_df = self.__reduced_ratings.union(
            self.__reduced_tags).distinct().cache()

        # A DataFrame combining average rating per movie where genres are split in rows
        self.__favor_genre_df = self.__movies_df_split_genres \
            .join(self.__ratings_df, self.__movies_df_split_genres.movieId == self.__ratings_df.movieId) \
            .drop(self.__ratings_df.movieId) \
            .drop(self.__ratings_df.timestamp)
Example #14
def dccon_parse(df, col):
    return df.withColumn(
        col,
        F.when(
            F.col(col).startswith('<video'),
            F.concat(F.lit('<dccon> '),
                     F.regexp_extract(col, r'data-src="[^?]*\?no=([^"]+)"', 1),
                     F.lit(' '),
                     F.regexp_extract(col, r'title="([^"]*)"', 1))).when(
                         F.col(col).startswith('<img'),
                         F.concat(
                             F.lit('<dccon> '),
                             F.regexp_extract(col, r'src="[^?]*\?no=([^"]+)"',
                                              1), F.lit(' '),
                             F.regexp_extract(col, r'title="([^"]*)"',
                                              1))).otherwise(F.col(col)))
Example #15
def main(inputFile, outputFile, configFile, contentMapping):

    #config
    uc = popularityCalculator(configFile)

    df = spark.read.parquet(inputFile + '/*').dropDuplicates().na.drop()
    contentMapping = spark.read.csv(contentMapping, header='true')

    #get rid of ".mp3" in item_name
    df = df.withColumnRenamed("item_name", "to_del")
    df = df.withColumn("item_name", F.split(df['to_del'], '\.')[0])
    df = df.drop('to_del')

    #turn Content Mapping String Length to TimeDelta Object
    strp_time = udf(lambda x: datetime.strptime(x, "%M:%S"))
    time_delta = udf(lambda y: timedelta(minutes=y.minute, seconds=y.second))

    contentMapping = contentMapping.withColumn("strptime",
                                               strp_time(F.col("Length")))
    contentMapping = contentMapping.withColumn("Content Length",
                                               time_delta(F.col("strptime")))
    contentMapping = contentMapping.drop('strpTime')
    contentMapping = contentMapping.withColumnRenamed("Title", "item_name")

    #Merge df and contentMapping
    df = df.join(contentMapping, ["item_name"], "outer")

    #get time played for
    df = df.withColumn(
        "Played For",
        F.unix_timestamp(df["end"]) - F.unix_timestamp(df["start"]))

    #get total seconds of song as String, convert to bigInt
    df = df.withColumn(
        "Song Duration Str",
        F.regexp_extract(df["Content Length"], "(?<=total: )(.*)(?= seconds)",
                         0))
    df = df.withColumn("Song Duration Int",
                       df["Song Duration Str"].cast(IntegerType()))

    #Let's get Percentage Played
    df = df.withColumn("PercentPlayed",
                       df["Played For"] / df["Song Duration Int"])

    #Let's keep only the columns we need at this point
    df = df.select(["device_id", "item_name", "PercentPlayed"])

    #assign weights based on Percent Played
    df = df.withColumn(
    'weight',
    F.when((F.col("PercentPlayed") >= 0.0) & (F.col("PercentPlayed") < 0.25), uc.first)\
    .when((F.col("PercentPlayed") >= 0.25) & (F.col("PercentPlayed") < 0.50), uc.second)\
    .when((F.col("PercentPlayed") >= 0.50) & (F.col("PercentPlayed") < 0.75), uc.third)\
    .when((F.col("PercentPlayed") >= 0.75) & (F.col("PercentPlayed") <= 1.00), uc.fourth)\
    .otherwise(-999.999)
    )
    #drop the rows with invalid percent played
    df = df.filter((df.weight != -999.999))

    df.write.parquet(outputFile)  # Write onto output Parquet
Example #16
def create_values(cols):
    values = []
    for col in cols:
        if col.is_lookup == 1:
            values.append(
                f.when(
                    f.col(col.demographic_key).isNull(),
                    f.concat_ws('_', f.lit(col.demographic_key),
                                f.lit('9999'))).when(
                                    f.trim(f.col(col.demographic_key)) == '',
                                    f.concat_ws('_',
                                                f.lit(col.demographic_key),
                                                f.lit('9999'))).
                when(
                    f.length(
                        f.regexp_extract(
                            f.col(col.demographic_key).astype('string'),
                            '(\d+)', 1)) > 0,
                    f.concat_ws(
                        '_', f.lit(col.demographic_key),
                        f.col(col.demographic_key).astype('int').astype(
                            'string'))).otherwise(
                                f.concat_ws('_', f.lit(col.demographic_key),
                                            f.col(col.demographic_key))))
        else:
            values.append(f.col(col.demographic_key))
    return values
Example #17
    def compute(
        self,
        biomarkers_table: str,
        source_table: str,
        disease_table: str,
        drug_index: str,
        output_file: str
    ) -> None:
        """Loads and processes inputs to generate the Cancer Biomarkers evidence strings"""

        # Import data
        biomarkers_df = self.spark.read.csv(biomarkers_table, sep='\t', header=True)
        source_df = self.spark.read.json(source_table).select(
            col('label').alias('niceName'),
            'source', 'url')
        disease_df = self.spark.read.json(disease_table).select(
            regexp_replace(col('name'), '_', '').alias('tumor_type'),
            regexp_extract(col('url'), r'[^/]+$', 0).alias('diseaseFromSourceMappedId'))
        drugs_df = self.spark.read.parquet(drug_index).select(
            col('id').alias('drugId'), col('name').alias('drug'))

        # Process inputs to generate evidence strings
        evidence = self.process_biomarkers(
            biomarkers_df, source_df, disease_df, drugs_df
        )

        # Write evidence strings
        write_evidence_strings(evidence, output_file)
        logging.info(f'{evidence.count()} evidence strings have been saved to {output_file}.')
Example #18
def transform(retail_df):
    """
    transformations:
        extract color name from the Description attribute
        select 'Country', 'Quantity', 'UnitPrice' and update product_color by replacing empty values with 'NOCOLOR'
        groupBy country and product color
        sum Quantity and UnitPrice as total_quantity and total_price respectively
        add column avg_spent = total_price / total_quantity
    :param retail_df:
    :return:
    """
    from pyspark.sql.functions import regexp_extract, col, count, sum, expr, regexp_replace

    extract_str = "(BLACK|WHITE|RED|GREEN|BLUE)"

    transformed_retail = (retail_df.withColumn(
        'product_color',
        regexp_extract(col("Description"), extract_str, 1)).select(
            'Country', 'Quantity', 'UnitPrice',
            regexp_replace(
                col("product_color"), '^$',
                "NOCOLOR").alias('product_color')).groupBy(
                    'Country', 'product_color').agg(
                        sum('Quantity').alias('total_quantity'),
                        sum('UnitPrice').alias('total_price')).withColumn(
                            'avg_spent (dollars)',
                            expr('total_price/total_quantity')))

    return transformed_retail
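A quick way to exercise the transformation above on a tiny, invented retail DataFrame (column names follow the snippet; the two rows are made up for illustration):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
retail_df = spark.createDataFrame(
    [("United Kingdom", 6, 2.55, "WHITE HANGING HEART T-LIGHT HOLDER"),
     ("France", 3, 4.25, "SET OF 3 CAKE TINS PANTRY DESIGN")],
    ["Country", "Quantity", "UnitPrice", "Description"])
transform(retail_df).show(truncate=False)
# rows whose Description has no colour keyword fall into the NOCOLOR group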
Example #19
def create_cmpgn_event_pivot_col(spark_df_model_email):
    '''
    The following function generates a new column named CMPGN_NM_EVENT. This column is 
    the concatenation of the campaign name and the vendor event type for each row. 
    
    The input spark dataframe must have the following columns:
    
      - CMPGN_NM,
      - VENDOR_EVENT_TYPE_TXT
      
    ACCEPTS:
     - spark dataframe
     
    RETURNS:
    - spark dataframe
    '''

    # Start of String Up To Third Occurrence of "_"
    regex_str = '^(?:[^_]*\_){2}([^_]*)'

    # IDX = 0 to grab entire string
    idx = 0

    spark_df_model_email = spark_df_model_email.withColumn('CMPGN_NM_REG', F.regexp_extract(F.col('CMPGN_NM'), regex_str, idx)) \
                                               .withColumn('CMPGN_NM_EVENT', F.concat_ws("_", F.col('CMPGN_NM_REG'), F.col('VENDOR_EVENT_TYPE_TXT'))) \
                                               .drop('CMPGN_NM_REG')

    return spark_df_model_email
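A small check of the column logic above on one invented row (the campaign name and event type are illustrative assumptions):

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
sample = spark.createDataFrame(
    [("EMAIL_2021_SPRING_SALE_V2", "OPEN")],
    ["CMPGN_NM", "VENDOR_EVENT_TYPE_TXT"])
create_cmpgn_event_pivot_col(sample).show(truncate=False)
# CMPGN_NM_EVENT -> "EMAIL_2021_SPRING_OPEN": the campaign name keeps only its first three "_"-separated tokens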
Example #20
    def invalid_dbs_present_phedex(self):
        '''

        Returns a dataframe with datasets which have "INVALID" status in DBS and are "PRESENT" in phedex

        :func: run_consistency.invalid_dbs_present_phedex()

        for reference dbs d_dataset_access_type_id:
             1 :  valid
             2 :  invalid
             42 : Deprecated
             41 : Production
             81 : Deleted
        '''
        invalid_dbs_present_phedex = (self.dbs_datasets
                .filter(col('d_dataset_access_type_id')=='2')
                .join(self.dbs_blocks,col('d_dataset_id')==col('b_dataset_id'))
                .join(self.phedex_block_replicas,col('d_dataset')==col('dataset_name'))
                .filter(col('dataset_name').isNotNull())
                .withColumn('input_campaign', fn.regexp_extract(col('d_dataset'), "^/[^/]*/((?:HI|PA|PN|XeXe|)Run201\d\w-[^-]+|CMSSW_\d+|[^-]+)[^/]*/", 1))
                .select('input_campaign','d_dataset','d_last_modified_by')    # you can select more columns for detail info
                .distinct())

        invalid_dbs_present_phedex.groupby("input_campaign").agg((fn.count(fn.col("d_dataset")))).show()
        return invalid_dbs_present_phedex.select("d_dataset")
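For reference, the input_campaign regex above can be sanity-checked outside Spark with plain Python re; the dataset name below is a hypothetical example:

import re

campaign_re = r"^/[^/]*/((?:HI|PA|PN|XeXe|)Run201\d\w-[^-]+|CMSSW_\d+|[^-]+)[^/]*/"
sample = "/SingleMuon/Run2018A-PromptReco-v1/AOD"  # hypothetical dataset name
print(re.search(campaign_re, sample).group(1))     # -> Run2018A-PromptReco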
Example #21
 def _add_special_dates(self, dcc_experiment_df: DataFrame):
     """
     Takes in a DataFrame with experimental data, parses out the metadata values for special dates,
     and adds those values as new columns.
     """
     for col_name, date_prefixes in {
             "_dateOfBloodCollection": [
                 "date and time of blood collection = ",
                 "date/time of blood collection = ",
             ],
             "_dateOfSacrifice": [
                 "date and time of sacrifice = ",
                 "date of sacrifice = ",
             ],
     }.items():
         escaped_prefixes = [
             prefix.replace("/", ".") for prefix in date_prefixes
         ]
         prefix_regex = f"(?i)(.*)({'|'.join(escaped_prefixes)})(.*)"
         dcc_experiment_df = dcc_experiment_df.withColumn(
             col_name + "Array",
             expr(
                 f'filter(metadata, metadataValue ->  metadataValue rlike "{prefix_regex}" )'
             ),
         )
         dcc_experiment_df = dcc_experiment_df.withColumn(
             col_name,
             regexp_extract(
                 col(col_name + "Array").getItem(0), prefix_regex,
                 3).astype(DateType()),
         )
     return dcc_experiment_df
Example #22
def compile_regex_extract(t, expr, scope, **kwargs):
    op = expr.op()

    src_column = t.translate(op.arg, scope)
    pattern = op.pattern.op().value
    idx = op.index.op().value
    return F.regexp_extract(src_column, pattern, idx)
Example #23
def get_memedroid_data(memes_df):
    memedroid_df = memes_df.filter(memes_df.source == 'memedroid')
    
    memedroid_schema = StructType().add(
        'title', StringType(), True).add(
        'tags', StringType(), True).add(
        'date', StringType(), True).add(
        'popularity', StringType(), True)
    
    memedroid_data =  memedroid_df.select(
        functions.col('id'),
        functions.from_json(
            functions.col('additional_data'),
            schema=memedroid_schema
        ).alias("data")
    ).select('id','data.*')
    
    upvote_percentage = pyspark.sql.functions.split(memedroid_data['popularity'], '%').getItem(0)
    number_of_votes = pyspark.sql.functions.split(memedroid_data['popularity'], '%').getItem(1)
    
    memedroid_data = memedroid_data.withColumn(
        'upvote_percentage', upvote_percentage.cast("Integer")).withColumn(
        'number_of_votes', regexp_extract(number_of_votes, '[0-9]+',0).cast("Integer"))

    upvotes = (memedroid_data.upvote_percentage * memedroid_data.number_of_votes * 0.01)
    memedroid_data = memedroid_data.withColumn('upvotes', upvotes.cast("Integer"))
    
    memedroid_data = memedroid_data.filter(memedroid_data.upvotes > 100)
    
    return memedroid_data
Example #24
def extract_state(df):
    data = df.withColumn('State', 
                    regexp_replace(
                        regexp_extract('Location', r'(, )(\w\w)', 2),
                        r'^$', 'none'
                    ))
    return data
Example #25
    def _process_pipeline(self, read_stream):
        # filter useless data
        filtered_stream = read_stream.where(
            (col("duration_ms").cast("long") != 0) &
            ~ (col("requested_url").startswith("GET /info") | col("requested_url").startswith("GET /prometheus"))
        )

        mapped_stream = filtered_stream \
            .withColumn("country",
                        when(col("stack").isNotNull(),
                             regexp_extract("stack", r".*-(\w+)$", 1))
                        .otherwise("undefined"))

        average_duration = mapped_stream.aggregate(
            Avg(group_fields=["country", "host", "app", "app_version", "api_method"],
                aggregation_field="duration_ms",
                aggregation_name=self._component_name))

        count_by_status = mapped_stream.aggregate(
            Count(group_fields=["country", "host", "app", "app_version", "api_method", "status"],
                  aggregation_name=self._component_name))

        request_stream = read_stream \
            .where(col("header_x-dev").isNotNull()) \
            .withColumn("country",
                        when(col("stack").isNotNull(),
                             regexp_extract("stack", r".*-(\w+)$", 1))
                        .otherwise("undefined"))

        count_by_app = request_stream.aggregate(
            Count(group_fields=["country", "app"],
                  aggregation_name=self._component_name + ".requests"))

        count_by_app_with_status = request_stream \
            .where(col("status").isNotNull()) \
            .withColumn("status", custom_translate_regex(
                source_field=col("status"),
                mapping={r"^2\d\d": "successful"},
                default_value="failure")) \
            .aggregate(Count(group_fields=["country", "app", "status"],
                             aggregation_name=self._component_name + ".requests"))

        count_stb_requests = request_stream \
            .aggregate(Count(group_fields=["country", "header_x-dev"],
                             aggregation_name=self._component_name + ".requests"))

        return [average_duration, count_by_status, count_stb_requests, count_by_app, count_by_app_with_status]
Example #26
def get_crab_popularity_ds(start_date,
                           end_date,
                           verbose=False,
                           base=_BASE_PATH):
    """
    Queries the HDFS data and returns a pandas dataframe with:
    Datatier, Dataset, CMSPrimaryPrimaryDataset, job_count, workflow_count, ChirpCMSSWReadBytes
    args:
        - start_date datetime Start of the query period (RecordTime)
        - end_date datetime End of the query period
    """
    start = int(start_date.timestamp() * 1000)
    end = int(end_date.timestamp() * 1000)
    spark = get_spark_session(yarn=True, verbose=verbose)

    dfs_crabdb = (spark.read.option("basePath", base).json(
        _get_candidate_files(start_date, end_date, spark, base=base),
        schema=_get_crab_condor_schema(),
    ).select("metadata.timestamp",
             "data.*").filter("""Status in ('Completed', 'Removed') AND
                              CRAB_DataBlock is not NULL  AND
                              timestamp >= {} AND
                              timestamp <= {}""".format(
                 start, end)).repartition("CRAB_DataBlock").drop_duplicates([
                     "GlobalJobId"
                 ]).withColumnRenamed(
                     "CMSPrimaryPrimaryDataset", "PrimaryDataset").withColumn(
                         "Dataset",
                         regexp_extract("CRAB_DataBlock", "^(.*)/([^/]*)#.*$",
                                        1)).withColumn(
                                            "Datatier",
                                            regexp_extract(
                                                "CRAB_DataBlock",
                                                "^(.*)/([^/]*)#.*$", 2)))
    dfs_crabdb = (dfs_crabdb.groupBy(
        "Datatier", "PrimaryDataset", "Dataset").agg(
            _max(col("RecordTime")),
            _min(col("RecordTime")),
            count(lit(1)),
            countDistinct("CRAB_Workflow"),
            _sum(col("ChirpCMSSWReadBytes")),
        ).withColumnRenamed("count(1)", "job_count").withColumnRenamed(
            "count(DISTINCT CRAB_Workflow)",
            "workflow_count").withColumnRenamed(
                "sum(ChirpCMSSWReadBytes)", "ChirpCMSSWReadBytes").na.fill(
                    "Unknown", ["Datatier", "PrimaryDataset", "Dataset"]))
    return dfs_crabdb.toPandas()
Example #27
 def __ring_status_node_warnings(self, events):
     return events \
         .where("message like '%Unable to determine external address "
                "of node with internal address %'") \
         .withColumn("host", regexp_extract("message", r".*Unable\s+to\s+determine\s+external\s+address\s+of\s+"
                                                       r"node\s+with\s+internal\s+address\s+'(\S+)'.*", 1)) \
         .aggregate(Count(group_fields=["hostname", "host"],
                          aggregation_name=self._component_name + ".ring_status_node_warnings"))
Example #28
 def parse(self, raw_df):
     input = raw_df
     cols = []
     for col_name in self.groups_to_cols.keys():
         meta = self.groups_to_cols[col_name]
         cols.append(col_name)
         if meta['type'] == 'timestamp':
             input = input.withColumn(
                 col_name, 
                 udf.parse_ts_udf(regexp_extract(input.value, self.regexp, meta['group'])).cast(meta['type'])
             )
         else:
             input = input.withColumn(
                 col_name, 
                 regexp_extract(input.value, self.regexp, meta['group']).cast(meta['type'])
             )
     return input.select(*cols)
Example #29
def prepare_google_trend(
    google_trend_csv: pyspark.sql.DataFrame, ) -> pyspark.sql.DataFrame:
    google_trend_all = google_trend_csv.withColumn(
        "Date",
        F.regexp_extract(google_trend_csv.week, "(.*?) -", 1)).withColumn(
            "State",
            F.regexp_extract(google_trend_csv.file, "Rossmann_DE_(.*)", 1))

    # map state NI -> HB,NI to align with other data sources
    google_trend_all = google_trend_all.withColumn(
        "State",
        F.when(google_trend_all.State == "NI",
               "HB,NI").otherwise(google_trend_all.State),
    )

    # expand dates
    return expand_date(google_trend_all)
Example #30
def parse_genetics_evidence(genetics_df: DataFrame) -> DataFrame:
    """The JSON Schema format is applied to the df."""

    return (genetics_df.withColumn(
        'literature',
        when(
            col('pmid') != '',
            array(regexp_extract(col('pmid'), r'PMID:(\d+)$', 1))).when(
                col('study_id').contains('SAIGE'), array(lit('30104761')))
    ).withColumn(
        'cohortId',
        when(col('study_id').contains('SAIGE'),
             array(lit('UK Biobank 500k'))).when(
                 col('study_id').contains('NEALE'),
                 array(lit('UK Biobank 500k'))),
    ).select(
        lit('ot_genetics_portal').alias('datasourceId'),
        lit('genetic_association').alias('datatypeId'),
        col('gene_id').alias('targetFromSourceId'),
        col('efo').alias('diseaseFromSourceMappedId'),
        col('literature'),
        col('pub_author').alias('publicationFirstAuthor'),
        'projectId',
        substring(col('pub_date'), 1,
                  4).cast(IntegerType()).alias('publicationYear'),
        col('trait_reported').alias('diseaseFromSource'),
        col('study_id').alias('studyId'),
        col('sample_size').alias('studySampleSize'),
        col('pval_mantissa').alias('pValueMantissa'),
        col('pval_exponent').alias('pValueExponent'),
        col('odds_ratio').alias('oddsRatio'),
        col('oddsr_ci_lower').alias('oddsRatioConfidenceIntervalLower'),
        col('oddsr_ci_upper').alias('oddsRatioConfidenceIntervalUpper'),
        col('beta').alias('beta'),
        col('beta_ci_lower').alias('betaConfidenceIntervalLower'),
        col('beta_ci_upper').alias('betaConfidenceIntervalUpper'),
        col('y_proba_full_model').alias('resourceScore'),
        col('rsid').alias('variantRsId'),
        concat_ws('_', col('chrom'), col('pos'), col('ref'),
                  col('alt')).alias('variantId'),
        regexp_extract(col('consequence_link'), r'\/(SO.+)$',
                       1).alias('variantFunctionalConsequenceId'),
    ).dropDuplicates([
        'variantId', 'studyId', 'targetFromSourceId',
        'diseaseFromSourceMappedId'
    ]))
# MAGIC | _status_      | The HTTP status code the server sent back to the client.               |
# MAGIC | _bytes_       | The number of bytes (`Content-Length`) transferred to the client.      |
# MAGIC 
# MAGIC 
# MAGIC Next, we have to parse it into individual columns. We'll use the special built-in [regexp\_extract()](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.regexp_extract)
# MAGIC function to do the parsing. This function matches a column against a regular expression with one or more [capture groups](http://regexone.com/lesson/capturing_groups) and allows you to extract one of the matched groups. We'll use one regular expression for each field we wish to extract.
# MAGIC 
# MAGIC If you can't read these regular expressions, don't worry. Trust us: They work. If you find regular expressions confusing (and they certainly _can_ be), and you want to learn more about them, start with the
# MAGIC [RegexOne web site](http://regexone.com/). You might also find [_Regular Expressions Cookbook_](http://shop.oreilly.com/product/0636920023630.do), by Jan Goyvaerts and Steven Levithan, to be helpful.
# MAGIC 
# MAGIC _Some people, when confronted with a problem, think "I know, I'll use regular expressions." Now they have two problems._ (attributed to Jamie Zawinski)

# COMMAND ----------

from pyspark.sql.functions import split, regexp_extract
split_df = base_df.select(regexp_extract('value', r'^([^\s]+\s)', 1).alias('host'),
                          regexp_extract('value', r'^.*\[(\d\d/\w{3}/\d{4}:\d{2}:\d{2}:\d{2} -\d{4})]', 1).alias('timestamp'),
                          regexp_extract('value', r'^.*"\w+\s+([^\s]+)\s+HTTP.*"', 1).alias('path'),
                          regexp_extract('value', r'^.*"\s+([^\s]+)', 1).cast('integer').alias('status'),
                          regexp_extract('value', r'^.*\s+(\d+)$', 1).cast('integer').alias('content_size'))
split_df.show(truncate=False)


# COMMAND ----------

# MAGIC %md
# MAGIC ### (2c) Data Cleaning
# MAGIC 
# MAGIC Let's see how well our parsing logic worked. First, let's verify that there are no null rows in the original data set.

# COMMAND ----------
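The notebook cell that actually runs that check is not part of this excerpt; a minimal sketch of one way to do it, assuming the base_df loaded earlier in the notebook, would be:

bad_rows_df = base_df.filter(base_df['value'].isNull())
print('Rows with a null value column: {0}'.format(bad_rows_df.count()))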
def main():
    "Main function"
    optmgr  = OptionParser()
    opts = optmgr.parser.parse_args()

    # setup spark/sql context to be used for communication with HDFS
    sc = SparkContext(appName="phedex_br")
    if not opts.yarn:
        sc.setLogLevel("ERROR")
    sqlContext = HiveContext(sc)

    schema_def = schema()

    # read given file(s) into RDD
    if opts.fname:
        pdf = sqlContext.read.format('com.databricks.spark.csv')\
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')\
                        .load(opts.fname, schema = schema_def)
    elif opts.basedir:
        fromdate, todate = defDates(opts.fromdate, opts.todate)
        files = getFileList(opts.basedir, fromdate, todate)
        msg = "Between dates %s and %s found %d directories" % (fromdate, todate, len(files))
        print(msg)

        if not files:
            return
        pdf = unionAll([sqlContext.read.format('com.databricks.spark.csv')
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')\
                        .load(file_path, schema = schema_def) \
                        for file_path in files])
    else:
        raise ValueError("File or directory not specified. Specify fname or basedir parameters.")

    # parsing additional data (to given data adding: group name, node kind, acquisition era, data tier, now date)
    groupdic, nodedic = getJoinDic()
    acquisition_era_reg = r"^/[^/]*/([^/^-]*)-[^/]*/[^/]*$"	
    data_tier_reg = r"^/[^/]*/[^/^-]*-[^/]*/([^/]*)$"
    groupf = udf(lambda x: groupdic[x], StringType())
    nodef = udf(lambda x: nodedic[x], StringType())

    ndf = pdf.withColumn("br_user_group", groupf(pdf.br_user_group_id)) \
         .withColumn("node_kind", nodef(pdf.node_id)) \
         .withColumn("now", from_unixtime(pdf.now_sec, "YYYY-MM-dd")) \
         .withColumn("acquisition_era", when(regexp_extract(pdf.dataset_name, acquisition_era_reg, 1) == "",\
                    lit("null")).otherwise(regexp_extract(pdf.dataset_name, acquisition_era_reg, 1))) \
        .withColumn("data_tier", when(regexp_extract(pdf.dataset_name, data_tier_reg, 1) == "",\
                    lit("null")).otherwise(regexp_extract(pdf.dataset_name, data_tier_reg, 1)))

    # print dataframe schema
    if opts.verbose:
        ndf.show()
        print("pdf data type", type(ndf))
        ndf.printSchema()

    # process aggregation parameters
    keys = [key.lower().strip() for key in opts.keys.split(',')]
    results = [result.lower().strip() for result in opts.results.split(',')]
    aggregations = [agg.strip() for agg in opts.aggregations.split(',')]
    order = [orde.strip() for orde in opts.order.split(',')] if opts.order else []
    asc = [asce.strip() for asce in opts.asc.split(',')] if opts.order else []
    filtc, filtv = opts.filt.split(":") if opts.filt else (None,None)

    validateAggregationParams(keys, results, aggregations, order, filtc)

    if filtc and filtv:
        ndf = ndf.filter(getattr(ndf, filtc) == filtv)

    # if delta aggregation is used
    if DELTA in aggregations:
        validateDeltaParam(opts.interval, results)			
        result = results[0]

        #1 for all dates generate interval group dictionary
        datedic = generateDateDict(fromdate, todate, opts.interval)
        boundic = generateBoundDict(datedic)
        max_interval = max(datedic.values())

        interval_group = udf(lambda x: datedic[x], IntegerType())
        interval_start = udf(lambda x: boundic[x][0], StringType())		
        interval_end = udf(lambda x: boundic[x][1], StringType())

        #2 group data by block, node, interval and last result in the interval
        ndf = ndf.select(ndf.block_name, ndf.node_name, ndf.now, getattr(ndf, result))
        idf = ndf.withColumn("interval_group", interval_group(ndf.now))
        win = Window.partitionBy(idf.block_name, idf.node_name, idf.interval_group).orderBy(idf.now.desc())	
        idf = idf.withColumn("row_number", rowNumber().over(win))
        rdf = idf.where((idf.row_number == 1) & (idf.interval_group != 0))\
                 .withColumn(result, when(idf.now == interval_end(idf.interval_group), getattr(idf, result)).otherwise(lit(0)))
        rdf = rdf.select(rdf.block_name, rdf.node_name, rdf.interval_group, getattr(rdf, result))
        rdf.cache()

        #3 create intervals that not exist but has minus delta
        win = Window.partitionBy(idf.block_name, idf.node_name).orderBy(idf.interval_group)
        adf = rdf.withColumn("interval_group_aft", lead(rdf.interval_group, 1, 0).over(win))
        hdf = adf.filter(((adf.interval_group + 1) != adf.interval_group_aft) & (adf.interval_group != max_interval))\
                 .withColumn("interval_group", adf.interval_group + 1)\
                 .withColumn(result, lit(0))\
                 .drop(adf.interval_group_aft)

        #4 join data frames
        idf = rdf.unionAll(hdf)
		
        #3 join every interval with previous interval
        win = Window.partitionBy(idf.block_name, idf.node_name).orderBy(idf.interval_group)
        fdf = idf.withColumn("delta", getattr(idf, result) - lag(getattr(idf, result), 1, 0).over(win))

        #5 calculate delta_plus and delta_minus columns and aggregate by date and node
        ddf =fdf.withColumn("delta_plus", when(fdf.delta > 0, fdf.delta).otherwise(0)) \
                .withColumn("delta_minus", when(fdf.delta < 0, fdf.delta).otherwise(0))

        aggres = ddf.groupBy(ddf.node_name, ddf.interval_group).agg(sum(ddf.delta_plus).alias("delta_plus"),\
                                                                    sum(ddf.delta_minus).alias("delta_minus"))

        aggres = aggres.select(aggres.node_name, interval_end(aggres.interval_group).alias("date"), aggres.delta_plus, aggres.delta_minus)
		
    else:	
        resAgg_dic = zipResultAgg(results, aggregations)
        order, asc = formOrdAsc(order, asc, resAgg_dic)

        # perform aggregation
        if order:
            aggres = ndf.groupBy(keys).agg(resAgg_dic).orderBy(order, ascending=asc)
        else:
            aggres = ndf.groupBy(keys).agg(resAgg_dic)

    # output results
    if opts.fout:
        fout_header = formFileHeader(opts.fout)
        if opts.header:
            aggres.write.format('com.databricks.spark.csv').options(header = 'true').save(fout_header)
        else:
            aggres.write.format('com.databricks.spark.csv').save(fout_header)
    else:
        aggres.show(50)
  col("Description")).show(2)


# COMMAND ----------

from pyspark.sql.functions import translate
df.select(translate(col("Description"), "LEET", "1337"),col("Description"))\
  .show(2)


# COMMAND ----------

from pyspark.sql.functions import regexp_extract
extract_str = "(BLACK|WHITE|RED|GREEN|BLUE)"
df.select(
     regexp_extract(col("Description"), extract_str, 1).alias("color_clean"),
     col("Description")).show(2)


# COMMAND ----------

from pyspark.sql.functions import instr
containsBlack = instr(col("Description"), "BLACK") >= 1
containsWhite = instr(col("Description"), "WHITE") >= 1
df.withColumn("hasSimpleColor", containsBlack | containsWhite)\
  .where("hasSimpleColor")\
  .select("Description").show(3, False)


# COMMAND ----------