Example #1
def df_erros_dia(_logs):
    return (_logs.filter(f.col('codigo_retorno_http') == 404).groupby(
        f.substring('timestamp', 1, 11).alias('timestamp')).agg(
            f.count(f.col('codigo_retorno_http')).alias(
                'total_erros_dia')).orderBy(f.unix_timestamp(
                    f.substring('timestamp', 1, 11), 'dd/MMM/yyyy'),
                                            ascending=False))
Example #2
def main():
    args = parseArguments()

    spark = SparkSession.builder.getOrCreate()

    Logger = spark._jvm.org.apache.log4j.Logger
    joblogger = Logger.getLogger(__name__)
    joblogger.info(
        '****************************************************************')
    joblogger.info('')
    joblogger.info('Starting creation of test data file with {0} rows and {1} '
                   'partitions at {2}'.format(args.rows, args.partitions,
                                              args.outfile))
    joblogger.info('')
    joblogger.info(
        '****************************************************************')

    udfGetUUID = F.udf(getUUID, T.StringType())

    df = (spark.range(0, args.rows, numPartitions=args.partitions).withColumn(
        'value', udfGetUUID()).withColumn(
            'prefix2', F.substring(F.col('value'), 1, 2)).withColumn(
                'prefix4', F.substring(F.col('value'), 1, 4)).withColumn(
                    'prefix8', F.substring(F.col('value'), 1, 8)).withColumn(
                        'float_val',
                        F.rand(seed=8675309) * 1000000).withColumn(
                            'integer_val',
                            F.col('float_val').cast(T.LongType())).drop('id'))

    df.write.csv(args.outfile, mode='overwrite', header=True)
    joblogger.info('Done writing to {0}'.format(args.outfile))
Example #3
    def hashstr(self, method='crc32', salt=''):
        f = {
            'crc32': F.crc32,
            'md5': F.md5,
            'md5-8': F.md5,
            'md5-4': F.md5,
            'sha1': F.sha1
        }.get(method, F.crc32)

        df = self.df
        for c in self.scols:
            col = F.col(c).cast('string')
            h = f(F.concat(col, F.lit(salt)))

            if method == 'crc32':
                res = F.conv(h.cast('string'), 10, 16)
            elif method == 'md5-8':
                res = F.substring(h, 0, 16)
            elif method == 'md5-4':
                res = F.substring(h, 0, 8)
            else:
                res = h

            df = df.withColumn(c, res)

        return df
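For context, the salted-hash pattern used above can be reproduced outside the class. The following is a minimal, self-contained sketch (the DataFrame, the "email" column and the salt value are made up for illustration):

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("alice@example.com",)], ["email"])

salted = F.concat(F.col("email").cast("string"), F.lit("some-salt"))
df = (df
      .withColumn("email_crc32_hex", F.conv(F.crc32(salted).cast("string"), 10, 16))  # like method='crc32'
      .withColumn("email_md5_16", F.substring(F.md5(salted), 1, 16)))                 # like method='md5-8'
df.show(truncate=False)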
Example #4
def extract_imits_tsv_allele_2(spark_session: SparkSession,
                               file_path: str) -> DataFrame:
    imits_df = utils.extract_tsv(spark_session, file_path)
    imits_df = imits_df.withColumn(
        "allele_mgi_accession_id",
        when(
            (col("allele_mgi_accession_id").isNull()) &
            (col("type") == "Allele"),
            concat(lit("NOT-RELEASED-"),
                   substring(md5(col("allele_symbol")), 0, 10)),
        ).otherwise(col("allele_mgi_accession_id")),
    )
    imits_df = imits_df.withColumn(
        "marker_mgi_accession_id",
        when(
            (col("marker_mgi_accession_id").isNull()) &
            (col("type") == "Gene"),
            concat(lit("NOT-RELEASED-"),
                   substring(md5(col("marker_symbol")), 0, 10)),
        ).otherwise(col("marker_mgi_accession_id")),
    )
    imits_df = imits_df.withColumn(
        "allele2_id",
        monotonically_increasing_id().astype(StringType()))
    for col_name in ALLELE2_MULTIVALUED:
        imits_df = imits_df.withColumn(
            col_name,
            when(
                col(col_name).contains("|"),
                split(col_name, r"\|"),
            ).otherwise(array(col_name)),
        )
    return imits_df
Example #5
def split_colstxt(df, cols):
    """Function that splits fixed-size columns and casts to corresponding data type
    
    :param df: Spark data frame containing the texts to split
    :param cols: column schemes with name, data type and length
    :return: new Spark data frame with the columns divided according to the 'cols' parameter 
    """
    i = 1
    for c in cols:
        if (c.get('Datatype') != 'timestamp'):
            df = df.withColumn(
                c.get('name'),
                fspark.substring(df.data_column, i,
                                 c.get('lenght')).cast(c.get('Datatype')))
        else:
            df = df.withColumn(
                c.get('name'),
                fspark.to_timestamp(
                    fspark.substring(df.data_column, i, c.get('lenght')),
                    "dd/MM/yyyy HH:mm:ss"))
        i = i + c.get('lenght')
    # trim string columns once, after all fixed-width fields have been extracted
    for c in df.dtypes:
        if c[1] == "string":
            df = df.withColumn(c[0], fspark.trim(df[c[0]]))
    return df
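A hypothetical call of the function above, using a made-up fixed-width layout (the function and its fspark alias for pyspark.sql.functions are assumed to be in scope; note the scheme keys 'name', 'Datatype' and 'lenght' it expects):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
raw = spark.createDataFrame([("001John      25/12/2020 10:30:00",)], ["data_column"])
cols = [
    {"name": "id", "Datatype": "int", "lenght": 3},
    {"name": "nome", "Datatype": "string", "lenght": 10},
    {"name": "dt_evento", "Datatype": "timestamp", "lenght": 19},
]
split_colstxt(raw, cols).show(truncate=False)
# expected: id=1, nome='John', dt_evento=2020-12-25 10:30:00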
Example #6
def format_output(df):
    df = df.withColumn("uniqueKey",
                       f.upper(f.concat(f.lit("RY"),
                                        f.substring(f.col('year'), 3, 2),
                                        f.lit("_"),
                                        f.col("channel"),
                                        f.lit("_"),
                                        f.col("division"),
                                        f.lit("_"),
                                        f.col("gender"),
                                        f.lit("_"),
                                        f.col("category"),
                                        ))) \
        .withColumn("channel", f.upper(f.col("channel"))) \
        .withColumn("year", f.concat(f.lit("RY"), f.substring(f.col('year'), 3, 2))) \
        .withColumn("week_1", f.concat(f.lit("W"), f.col("week")))

    output = df.orderBy("week").groupBy('uniqueKey', 'division', 'gender', 'category', 'channel', 'year').agg(
        f.to_json(
            f.collect_list(
                f.create_map('week_1', 'netSales')
            )
        ).alias('Net Sales'),
        f.to_json(
            f.collect_list(
                f.create_map('week_1', 'salesUnits')
            )
        ).alias('Sales Units')
    )

    return output
Example #7
def filter_df(spark, path):
    merged = spark.read.parquet(join(path, "train/merged_transformed/"))
    em = spark.read.parquet(join(path, "train/EM_transformed/"))

    merged_date = merged.withColumn(
        "year",
        F.substring(merged.articleUID, 1, 4).cast("int")).withColumn(
            "month",
            F.substring(merged.articleUID, 5, 2).cast("int"))
    merged_filtered = merged_date.filter(
        "year > 2014 OR (year = 2014 AND month >= 6)").drop("year", "month")
    em_date = em.withColumn("year",
                            F.substring(em.articleUID, 1,
                                        4).cast("int")).withColumn(
                                            "month",
                                            F.substring(em.articleUID, 5,
                                                        2).cast("int"))
    em_filtered = em_date.filter(
        "year > 2014 OR (year = 2014 AND month >= 6)").drop("year", "month")

    merge_rand = merged_filtered.withColumn("rand", F.rand(seed=SEED))
    em_rand = em_filtered.withColumn("rand", F.rand(seed=SEED))

    merge_nambiguous = merge_rand.filter(~merge_rand.ambiguous)
    em_nambiguous = em_rand.filter(~em_rand.ambiguous)
    merge_ambiguous = merge_rand.filter(merge_rand.ambiguous)
    em_ambiguous = em_rand.filter(em_rand.ambiguous)

    return merge_nambiguous, em_nambiguous, merge_ambiguous, em_ambiguous
Example #8
def df_logs_layout(_logs):
    return (_logs.withColumn(
        'host_requisicao',
        f.split(_logs['value'], ' ').getItem(0)).withColumn(
            'timestamp',
            f.concat(
                f.substring(f.split(_logs['value'], ' ').getItem(3), 2, 20),
                f.substring(f.split(_logs['value'], ' ').getItem(4), 1,
                            5))).withColumn(
                                'requisicao',
                                f.concat(
                                    f.substring(
                                        f.split(_logs['value'],
                                                ' ').getItem(5), 2, 3),
                                    f.split(_logs['value'], ' ').getItem(6),
                                    f.substring(
                                        f.split(_logs['value'],
                                                ' ').getItem(7), 1, 8))).
            withColumn(
                'codigo_retorno_http',
                f.split(_logs['value'],
                        ' ').getItem(8).cast(IntegerType())).withColumn(
                            'total_bytes',
                            f.split(_logs['value'],
                                    ' ').getItem(9).cast(
                                        IntegerType())).drop('value'))
Example #9
def main():

    try:
        with open("airly_param.json", "r") as file:
            param_dict = json.load(file)
        print(param_dict["broker"])
    except Exception as ex:
        print("Problem z odczytem pliku parametrow: airly_param.json")
        print(str(ex))
        exit()

    v_broker = param_dict["broker"]
    v_s3_bucket = param_dict["s3_bucket"]

    spark = SparkSession.builder.appName("Structured").getOrCreate()

    raw = (spark.readStream.format("kafka")
           .option("kafka.bootstrap.servers", v_broker)
           .option("startingOffsets", "earliest")
           .option("subscribe", "sensor")
           .load())

    # Schema of the incoming data
    schema = (
        StructType()
        .add("current", StructType()
             .add("fromDateTime", StringType())
             .add("indexes", ArrayType(StructType()
                  .add("advice", StringType())
                  .add("color", StringType())
                  .add("description", StringType())
                  .add("name", StringType())
                  .add("value", DoubleType())))
             .add("standards", ArrayType(StructType()
                  .add("averaging", StringType())
                  .add("limit", StringType())
                  .add("name", StringType())
                  .add("percent", DoubleType())
                  .add("pollutant", StringType())))
             .add("tillDateTime", StringType())
             .add("values", ArrayType(StructType()
                  .add("name", StringType())
                  .add("value", DoubleType())))
             )
        .add("forecast", ArrayType(StructType()
             .add("fromDateTime", StringType())
             .add("indexes", ArrayType(StructType()
                  .add("advice", StringType())
                  .add("color", StringType())
                  .add("description", StringType())
                  .add("value", DoubleType())))
             .add("standards", ArrayType(StructType()
                  .add("averaging", StringType())
                  .add("limit", StringType())
                  .add("name", StringType())
                  .add("percent", DoubleType())
                  .add("pollutant", StringType())))
             .add("tillDateTime", StringType())
             .add("values", ArrayType(StructType()
                  .add("name", StringType())
                  .add("value", StringType())))
             ))
        .add("history", ArrayType(StructType()
             .add("fromDateTime", StringType())
             .add("indexes", ArrayType(StructType()
                  .add("advice", StringType())
                  .add("color", StringType())
                  .add("description", StringType())
                  .add("value", DoubleType())))
             .add("standards", ArrayType(StructType()
                  .add("averaging", StringType())
                  .add("limit", StringType())
                  .add("name", StringType())
                  .add("percent", DoubleType())
                  .add("pollutant", StringType())))
             .add("tillDateTime", StringType())
             .add("values", ArrayType(StructType()
                  .add("name", StringType())
                  .add("value", StringType())))
             ))
    )
    # DataFrame with the parsed data
    df = raw.select(
        f.col("key").cast("string").cast("integer"),
        f.from_json(f.col("value").cast("string"), schema).alias("parsed_value"))

    stream = df.select(
        f.col("key").cast("integer"),
        f.concat(
            f.split(f.col("parsed_value.current.fromDateTime"), "T")[0],
            f.lit(" "),
            f.substring(f.split(f.col("parsed_value.current.fromDateTime"), "T")[1], 0, 8)
        ).alias("fromTime"),
        f.concat(
            f.split(f.col("parsed_value.current.tillDateTime"), "T")[0],
            f.lit(" "),
            f.substring(f.split(f.col("parsed_value.current.tillDateTime"), "T")[1], 0, 8)
        ).alias("untilTime"),
        f.explode("parsed_value.current.values").alias("value")
    ).select("key", "fromTime", "untilTime", "value.name",
             f.col("value.value").cast("float"))
    
    # write each micro-batch out to S3 as JSON
    # (v_ckpt_loc, the checkpoint location, is assumed to be defined alongside the other parameters)
    def write_s3(batch_df, batch_id):
        batch_df.write.format('json').mode("append") \
            .option("checkpointLocation", v_ckpt_loc) \
            .option("path", v_s3_bucket).save()

    stream.writeStream.foreachBatch(write_s3).start().awaitTermination()
Example #10
def main(in_directory, out_directory):
    df = spark.read.csv(in_directory, schema=comments_schema,
                        sep=' ').withColumn('filename',
                                            functions.input_file_name())
    path_to_hour = functions.udf(getFileName, returnType=types.StringType())
    df = df.select(df['language'], df['title'], df['number_of_requests'],
                   path_to_hour(df['filename']).alias('filename'))

    df = df.filter((df['language'] == 'en')
                   & (df['title'] != 'Main Page')
                   & (functions.substring(df['title'], 0, 8) != 'Special:'))

    grouped = df.groupBy('filename', 'title')
    count = grouped.agg(functions.sum(df['number_of_requests']).alias('count'))

    grouped2 = count.groupBy('filename')
    maxi = grouped2.agg(functions.max(count['count']))

    joined = count.join(maxi, on='filename')
    joined = joined.filter(joined['count'] == joined['max(count)'])\
    .select(joined['filename'], joined['title'], joined['count'])
    joined = joined.cache()
    sorted_ = joined.sort('filename', 'title')

    sorted_.write.csv(out_directory, mode='overwrite')
Example #11
def map_line_columns(line_df: DataFrame):
    for field, value in Constants.LINE_TO_OBSERVATION_MAP.items():
        if value is not None:
            line_df = line_df.withColumn(field, col(value))
        else:
            line_df = line_df.withColumn(field, lit(None))
    line_df = line_df.withColumn("biological_sample_group",
                                 lit("experimental"))
    line_df = line_df.withColumn("zygosity", lit("homozygote"))
    line_df = line_df.withColumn(
        "datasource_name",
        when(col("_dataSource") == "impc", lit("IMPC")).otherwise(
            when(col("_dataSource") == "europhenome",
                 lit("EuroPhenome")).otherwise(col("_dataSource"))),
    )
    line_df = line_df.withColumn(
        "allele_accession_id",
        when(col("biological_sample_group") == "control", lit(None)).otherwise(
            when(
                col("allele.mgiAlleleID").isNull(),
                concat(
                    lit("NOT-RELEASED-"),
                    substring(md5(line_df["allele_symbol"]), 0, 10),
                ),
            ).otherwise(col("allele.mgiAlleleID"))),
    )
    return line_df
Example #12
    def get_column_spec(self, source_df: Optional[DataFrame],
                        current_column: Optional[Column]) -> Column:
        column_spec = substring(
            self.column.get_column_spec(source_df=source_df,
                                        current_column=current_column),
            self.start, self.length)
        return column_spec
Example #13
    def _clean_airports(self):
        """
        Class method to clean airports data

        Operations:
                    - select only US airports
                    - select only airports where type is in ("large_airport", "medium_airport", "small_airport")
                    - isolate region substring from iso_region field
                    - cast elevation in feet to float
        Returns:
                [dict] - object with source-name: SparkDF key-value pairs
        """
        df = self.data_dict.get('airports', None)
        if df is not None:
            data = df\
                .where(
                    (col("iso_country") == "US") &
                    (col("type").isin("large_airport", "medium_airport", "small_airport"))
                )\
                .withColumn("iso_region", substring(col("iso_region"), 4, 2))\
                .withColumn("elevation_ft", col("elevation_ft").cast("float"))\
                .withColumnRenamed("local_code", "port_code")

            return dict(airports=data)
        else:
            logger.error(
                ValueError(
                    'No dataset named "airports" found in sources dict.'))
            raise ValueError(
                'No dataset named "airports" found in sources dict.')
Example #14
    def repartitionData(self, data):
        """
        Perform table repartitioning in spark depending on settings in config
        :param data:
        :return:
        """
        print(
            f"{self.__class__.__name__} received microdata dataframe with number of partitions: {data.rdd.getNumPartitions()}"
        )
        print(f"& schema: {data.printSchema()}")
        print(
            f"{self.__class__.__name__} is repartitioning microdata dataframe to {self.reader.num_reader_partitions} partitions, "
        )
        print(
            "collapsing to a sparse histogram representation, and passing back to driver..."
        )
        if self.reader.num_reader_partitions > 0:
            data = data.withColumn(
                self.PARTITION_CODE_COLUMN,
                substring(self.geography_variables[0], 0,
                          self.reader.reader_partition_len))
            if not self.reader.range_partition:
                print(
                    f"Using df hash partitioner by {self.PARTITION_CODE_COLUMN}"
                )
                return data.repartition(self.reader.num_reader_partitions,
                                        self.PARTITION_CODE_COLUMN).drop(
                                            self.PARTITION_CODE_COLUMN)

            print(
                f"Using df range partitioner by {self.PARTITION_CODE_COLUMN}")
            return data.repartitionByRange(self.reader.num_reader_partitions,
                                           self.PARTITION_CODE_COLUMN).drop(
                                               self.PARTITION_CODE_COLUMN)
        return data
Example #15
    def do(self, workflow, etl_process):
        
        from pyspark.sql.functions import substring, substring_index, split, col

        self.new_column = self.action_details.pop("name")
        self.target = self.action_details.pop("target")

        self.type = self.action_details.pop("type", "simple")
        
        if self.type == "simple":
            self.pos = self.action_details.pop("pos", 1)
            self.len = self.action_details.pop("len")
            workflow.df = workflow.df \
                .withColumn(self.new_column, substring(col(self.target), self.pos, self.len))
        
        else:
            self.delim = self.action_details.pop("delim")
            self.index = self.action_details.pop("index", 1)

            if self.type == "delim": 
                workflow.df = workflow.df \
                    .withColumn(self.new_column, substring_index(col(self.target), self.delim, self.index))

            elif self.type == "delim_index":
                workflow.df = workflow.df \
                    .withColumn(self.new_column, split(self.target, self.delim).getItem(self.index - 1))
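For reference, the three modes handled above reduce to the following column expressions. A minimal sketch on a made-up 'code' column:

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("AB-123-XYZ",)], ["code"])
df.select(
    F.substring(F.col("code"), 1, 2).alias("simple"),            # fixed position/length -> 'AB'
    F.substring_index(F.col("code"), "-", 2).alias("delim"),      # everything before the 2nd '-' -> 'AB-123'
    F.split(F.col("code"), "-").getItem(1).alias("delim_index"),  # 2nd token -> '123'
).show()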
Example #16
def fill_days(year: str, month: str, day: str) -> None:
    S3_FULL_PATH = f"{S3_PATH}/{year}/{month}/{day}"
    try:
        # read all parquet files from specific yyyy/mm/dd
        df = spark.read.format("parquet").load(f"{S3_FULL_PATH}/*.parquet")

        # creates extracted_at column from date path
        df = df.withColumn(
            "extracted_at",
            from_unixtime(  # covert unixtime to timestamp
                unix_timestamp(
                    substring(
                        regexp_replace(input_file_name(), f"{S3_PATH}/", ""), 1, 10
                    ),
                    "yyyy/MM/dd",
                )
            ),
        )
        # overwrite dataframe with the new column
        df.write.mode("overwrite").parquet(f"{S3_FULL_PATH}/")
        print(f"files updated from {S3_FULL_PATH}/")

    # some dates don't have paths.
    except AnalysisException as err:
        print(err)
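The extracted_at expression simply parses the leading yyyy/MM/dd left over after stripping the bucket prefix from the file name. A minimal check with a literal, made-up key:

from pyspark.sql import SparkSession
from pyspark.sql.functions import from_unixtime, unix_timestamp, substring

spark = SparkSession.builder.getOrCreate()
demo = spark.createDataFrame([("2021/03/05/part-00000.parquet",)], ["stripped_name"])
demo.select(
    from_unixtime(unix_timestamp(substring("stripped_name", 1, 10), "yyyy/MM/dd")).alias("extracted_at")
).show()
# -> 2021-03-05 00:00:00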
Example #17
def billing_events(df):
    import datetime

    MAX_MONTH = 72

    def get_last_month(col):
        h = F.abs(F.xxhash64(col))
        h1 = (h.bitwiseAND(0xff)) % (MAX_MONTH // 2)
        h2 = (F.shiftRight(h, 8).bitwiseAND(0xff)) % (MAX_MONTH // 3)
        h3 = (F.shiftRight(h, 16).bitwiseAND(0xff)) % (MAX_MONTH // 5)
        h4 = (F.shiftRight(h, 24).bitwiseAND(0xff)) % (MAX_MONTH // 7)
        h5 = (F.shiftRight(h, 32).bitwiseAND(0xff)) % (MAX_MONTH // 11)
        return -(h1 + h2 + h3 + h4 + h5)

    w = pyspark.sql.Window.orderBy(F.lit("")).partitionBy(df.customerID)

    charges = (df.select(
        df.customerID,
        F.lit("Charge").alias("kind"),
        F.explode(
            F.array_repeat(
                (df.TotalCharges / df.tenure).cast(get_currency_type()),
                df.tenure.cast("int"))).alias("value"),
        F.when(df.Churn == "Yes", get_last_month(
            df.customerID)).otherwise(0).alias("last_month")).withColumn(
                "now",
                F.lit(now).cast("date")).withColumn(
                    "month_number",
                    -(F.row_number().over(w) +
                      F.col("last_month"))).withColumn(
                          "date",
                          F.expr("add_months(now, month_number)")).drop(
                              "now", "month_number", "last_month"))

    serviceStarts = (df.withColumn(
        "last_month",
        F.when(df.Churn == "Yes", get_last_month(
            df.customerID)).otherwise(0)).select(
                df.customerID,
                F.lit("AccountCreation").alias("kind"),
                F.lit(0.0).cast(get_currency_type()).alias("value"),
                F.lit(now).alias("now"),
                (-df.tenure - 1 + F.col("last_month")).alias("month_number"),
            ).withColumn("date", F.expr("add_months(now, month_number)")).drop(
                "now", "month_number"))

    serviceTerminations = df.withColumn(
        "last_month",
        F.when(df.Churn == "Yes",
               get_last_month(df.customerID)).otherwise(0)).where(
                   df.Churn == "Yes").withColumn("now", F.lit(now)).select(
                       df.customerID,
                       F.lit("AccountTermination").alias("kind"),
                       F.lit(0.0).cast(get_currency_type()).alias("value"),
                       F.expr("add_months(now, last_month)").alias("date"))

    billingEvents = charges.union(serviceStarts).union(
        serviceTerminations).orderBy("date").withColumn(
            "month", F.substring("date", 0, 7))
    return billingEvents
Example #18
def extract_imits_tsv_by_entity_type(spark_session: SparkSession,
                                     file_path: str,
                                     entity_type: str) -> DataFrame:
    """
    Uses a Spark Session to generate a DataFrame from a TSV file and a specific entity type.
    Can extract Genes or Alleles from an Alleles report file produced by IMITS.
    :param spark_session: spark SQL session to be used in the extraction
    :param file_path: path to the TSV file
    :param entity_type: 'Allele' or 'Gene'
    :return: Spark DataFrame with the extracted data
    """
    imits_df = utils.extract_tsv(spark_session, file_path)
    imits_entity_df = imits_df.where(imits_df.type == entity_type)
    if entity_type == "Allele":
        imits_entity_df = imits_entity_df.withColumn(
            "acc",
            when(
                col("allele_mgi_accession_id").isNull(),
                concat(lit("NOT-RELEASED-"),
                       substring(md5(col("allele_symbol")), 0, 10)),
            ).otherwise(col("allele_mgi_accession_id")),
        )
        imits_entity_df = imits_entity_df.withColumn(
            "allele2_id",
            monotonically_increasing_id().astype(StringType()))
    for col_name in ALLELE2_MULTIVALUED:
        imits_entity_df = imits_entity_df.withColumn(
            col_name,
            when(
                col(col_name).contains("|"),
                split(col_name, r"\|"),
            ).otherwise(array(col_name)),
        )
    return imits_entity_df
Example #19
def read_and_process_airport_data(spark, filename, df_dimension_state_table):
    """ Load the airport codes join with state dimension data to get airports with state_key"""
    logging.info("Reading airport data")
    # load the airport codes so we can map them to states
    airport_schema = R([
        Fld("ident", Str()),
        Fld("type", Str()),
        Fld("name", Str()),
        Fld("elevation_ft", Int()),
        Fld("continent", Str()),
        Fld("iso_country", Str()),
        Fld("iso_region", Str()),
        Fld("municipality", Str()),
        Fld("gps_code", Str()),
        Fld("iata_code", Str()),
        Fld("local_code", Str()),
        Fld("coordinates", Str())
    ])

    df_airport = spark.read.options(header=True,
                                    delimiter=",").csv(filename, airport_schema)

    # cleanse: we only want the airports in the US which map to the states that we have in the states table

    df_airport = df_airport.filter(df_airport.iso_country == "US") \
        .join(df_dimension_state_table, F.substring(df_airport.iso_region, 4, 2) == df_dimension_state_table.state_key,
              "inner") \
        .select(df_airport.ident, df_airport.local_code, df_dimension_state_table.state_key)

    return df_airport
Example #20
    def read_parquet(self, inputfilepath):
        out_df = self.spark.read.parquet('file:///' + inputfilepath + '/out_parquet')

        ldf2 = (out_df
                .orderBy(out_df['ScreenTemperature'].desc())
                .limit(1)
                .select(out_df['ForecastSiteCode'],
                        substring(out_df['ObservationDate'], 1, 10).alias('Date'),
                        out_df['Region'],
                        out_df['ScreenTemperature']))
        ldf2.show(10, truncate=False)
Example #21
def erro_404_dia(df):
    df1 = (df.filter(df['retorno_http'] == '404'))
    df2 = (df1.select(
        f.substring(df1['date'], 2, 11).alias('date'),
        df1['retorno_http']).groupBy('date').agg(
            f.count('retorno_http').alias('tot_erro_404')).sort(
                f.col('date').asc()))
    return df2
Example #22
def extract_hes_year(hes_type_regex):
    table_yyYY_col = F.regexp_extract(F.col('tableName'), r"(?<=" + hes_type_regex + r"_{0,1})\d{4}", 0)
    table_yy_col = F.substring(table_yyYY_col, 0, 2)
    table_year_col = (F.when(table_yy_col.cast('integer') > 80,
                             F.concat(F.lit('19'), table_yy_col))
                      .otherwise(F.concat(F.lit('20'), table_yy_col))
                      .cast('integer'))
    return table_year_col
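A quick sanity check of the century pivot, applying the helper above to a toy DataFrame of hypothetical table names (assumes pyspark.sql.functions is imported as F, as in the function itself):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
names = spark.createDataFrame([("hes_apc_9798",), ("hes_apc_1213",)], ["tableName"])
names.withColumn("year", extract_hes_year("hes_apc")).show()
# '97' -> 1997 (two-digit years above 80 are read as 19xx), '12' -> 2012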
Example #23
def make_wikidata_movies():
    wd = spark.read.parquet(sys.argv[1])

    # just movies
    wd = wd.where(wd['imdb_id'].isNotNull()).where(
        functions.substring(wd['imdb_id'].getItem(0), 0, 2) == 'tt')

    # ... with reasonable depth of data
    wd = wd.where(wd['genre'].isNotNull())
    wd = wd.where(wd['enwiki_title'].isNotNull())
    wd = wd.where(wd['rotten_tomatoes_id'].isNotNull())

    # calculate made_profit boolean
    wd = wd.withColumn(
        'nbox',
        wd.box_office.getItem(0).getItem(0).cast(types.FloatType()))
    wd = wd.withColumn('ncost',
                       wd.cost.getItem(0).getItem(0).cast(types.FloatType()))
    wd = wd.withColumn('made_profit', (wd.nbox - wd.ncost) > 0)
    wd = wd.withColumn('profit', wd.nbox - wd.ncost)

    # actual fields we want to output: other fields could be included and/or modified here.
    output_data = wd.select(
        wd['id'].alias('wikidata_id'),
        wd['label'],
        take_first(wd, 'imdb_id'),
        take_first(wd, 'rotten_tomatoes_id'),
        take_first(wd, 'metacritic_id'),
        wd['enwiki_title'],
        wd['genre'],
        wd['main_subject'],
        wd['filming_location'],
        wd['director'],
        #wd['screenwriter'],
        wd['cast_member'],
        #wd['narrative_location'],
        #wd['director_of_photography'],
        #wd['film_editor'],
        #wd['filming_location'],
        take_first(wd, 'series'),
        #wd['voice_actor'],
        #wd['executive_producer'],
        #wd['composer'],
        #take_first(wd, 'production_company'),
        #take_first(wd, 'distributor'),
        first_publication_date(wd['publication_date']
                               ).alias('publication_date'),
        take_first(wd, 'based_on'),
        take_first(wd, 'country_of_origin'),
        take_first(wd, 'original_language'),
        wd['made_profit'],
        wd['profit'],
        wd['nbox'],
        wd['ncost'])
    # output is about 4MB compressed: safe to .coalesce().
    output_data.coalesce(1).write.json('./wikidata-movies',
                                       mode='overwrite',
                                       compression='gzip')
Example #24
def mturkify(
    spark: SparkSession,
    *,
    quotes_context_path: str,
    quootstrap_path: str,
    speakers_path: str,
    output_path: str,
    nb_partition: int,
    compression: str = "gzip",
):
    qc = spark.read.json(quotes_context_path)
    qc_date = qc.withColumn("year",
                            F.substring(qc.articleUID, 1,
                                        4).cast("int")).withColumn(
                                            "month",
                                            F.substring(qc.articleUID, 5,
                                                        2).cast("int"))
    qc_filtered = qc_date.filter("year > 2014 OR (year = 2014 AND month >= 6)")
    qc_key = qc_filtered.withColumn(
        "key",
        F.substring(qc_filtered.articleUID, 1,
                    6).cast("int")).drop("year", "month")

    quootstrap_df = spark.read.json(quootstrap_path)
    q2 = quootstrap_df.select(F.explode("occurrences").alias("occurrence"))
    fields_to_keep = [
        q2.occurrence.articleUID.alias("articleUID"),
        q2.occurrence.articleOffset.alias("articleOffset"),
    ]

    attributed_quotes_df = q2.select(*fields_to_keep).withColumn(
        "in_quootstrap", F.lit(True))

    speakers = spark.read.json(speakers_path)
    joined = qc_key.join(speakers, on="articleUID")
    transformed = (joined.rdd.map(transform).filter(
        lambda x: x is not None).toDF().withColumn("nb_entities",
                                                   F.size("entities")))

    transformed_quootstrap = transformed.join(
        attributed_quotes_df, on=["articleUID",
                                  "articleOffset"], how="left").na.fill(False)
    transformed_quootstrap.write.parquet(output_path,
                                         "overwrite",
                                         compression=compression)
Example #25
    def loadDiscovery(self):
        self.log.info('Exception Handling starts')

        validSourceFormat = self.isValidFormatInSource()

        if not validSourceFormat:
            self.log.error(
                "StoreDailyGoalForecast Source files not in csv format.")

        validSourceSchema = self.isValidSchemaInSource()
        if not validSourceSchema:
            self.log.error(
                "StoreDailyGoalForecast Source schema does not have all the required columns."
            )

        if not validSourceFormat or not validSourceSchema:
            self.log.info(
                "Copy the source files to data processing error path and return."
            )
            self.copyFile(
                self.storeDailyGoalForecastFile,
                self.dataProcessingErrorPath + self.tableName +
                datetime.now().strftime('%Y%m%d%H%M') + self.fileFormat)
            return

        self.log.info('Source format and schema validation successful.')
        self.log.info('Reading the input CSV file')

        dfStoreDailyGoalForecast = self.sparkSession.read.format("com.databricks.spark.csv"). \
            option("encoding", "UTF-8"). \
            option("ignoreLeadingWhiteSpace", "true"). \
            option("ignoreTrailingWhiteSpace", "true"). \
            option("header", "true"). \
            option("treatEmptyValuesAsNulls", "true"). \
            option("inferSchema", "true"). \
            option("escape", '"'). \
            option("quote", "\""). \
            option("multiLine", "true"). \
            load(self.storeDailyGoalForecastPath).toDF(*self.storeDailyGoalForecastCols)

        # dfStoreDailyGoalForecast.withColumnRenamed("Date", "date"). \
        #     withColumnRenamed("Day % to Forecast", "daypercentforecast"). \
        #     withColumnRenamed("Daily Forecast", "dailyforecast").registerTempTable("StoreDailyGoalForecast")

        # dfStoreDailyGoalForecastFinal = self.sparkSession.sql("select date, daypercentforecast, dailyforecast from "
        #                                                       "StoreDailyGoalForecast")

        dfStoreDailyGoalForecast.coalesce(1).write.mode('overwrite').format('parquet').\
            save(self.storeDailyGoalForecastWorkingPath)

        dfStoreDailyGoalForecast.coalesce(1).withColumn("year", year(from_unixtime(unix_timestamp()))).\
            withColumn("month", substring(from_unixtime(unix_timestamp()), 6, 2)).\
            write.mode('append').partitionBy('year', 'month').format('parquet').save(
            self.storeDailyGoalForecastPartitionPath)

        self.sparkSession.stop()
Example #26
def prepare_data(spark, sample_size):
    """
    Clean up raw data and split the data into train and split

    :param spark: SparkSession
    :param sample_size: int, number of data size to select
    :return : train, test, both of them are spark dataframes
    """
    movies = spark.read.load('data/ml-20m/movies.csv',
                             format='csv',
                             sep=',',
                             inferSchema='true',
                             header='true')
    ratings = spark.read.load('data/ml-20m/ratings.csv',
                              format='csv',
                              sep=',',
                              inferSchema='true',
                              header='true')
    """
    take subset of database (can't do ordering here as it is really slow over
    a distributed database)
    """
    n_ratings = ratings.limit(sample_size)

    # remove movies and users with only 1 rating
    user_filter = (n_ratings.groupBy('userId').agg(
        count('userId').alias('count')).filter(
            col('count') == 1).select('userId'))
    movie_filter = (n_ratings.groupBy('movieId').agg(
        count('movieId').alias('count')).filter(
            col('count') == 1).select('movieId'))
    n_ratings = n_ratings.join(user_filter, ['userId'], how='left_anti')
    n_ratings = n_ratings.join(movie_filter, ['movieId'], how='left_anti')

    # movies with valid genre
    movies_genre = movies.filter(col('genres') != '(no genres listed)')
    # extract the release year from the end of the title, e.g. 'Toy Story (1995)' -> '1995'
    movies_genre = movies_genre.withColumn('year',
                                           substring(col('title'), -5, 4))
    genre_filter = movies_genre.select('movieId')

    # keep only movies with genre
    n_ratings = n_ratings.join(genre_filter, ['movieId'], how='left_semi')

    # test train split
    train, test = n_ratings.randomSplit([0.8, 0.2], seed=12345)

    # take union set of users, movies in both data pieces
    train = train.join(test.select('userId'), ['userId'], how='left_semi')
    train = train.join(test.select('movieId'), ['movieId'], how='left_semi')

    test = test.join(train.select('userId'), ['userId'], how='left_semi')
    test = test.join(train.select('movieId'), ['movieId'], how='left_semi')

    return train, test
Example #27
def predict(args):
    flight_features = args["feature"].split(",")
    features = spark.createDataFrame(
        [(flight_features[0], flight_features[1], flight_features[2],
          flight_features[3], float(
              flight_features[4]), float(flight_features[5]))],
        schema=feature_schema)
    features = features.withColumn(
        'CRS_DEP_HOUR', substring(convert_time_to_hour("CRS_DEP_TIME"), 0, 2))
    result = model.transform(features).collect()[0].prediction
    return {"result": result}
Example #28
def createDataFrameHTMLPreview(dataframe):
    from pyspark.sql.functions import substring, col
    # remove the _id column
    # cast every remaining column to string and truncate it to 100 characters
    removeid = filter(lambda x: x != "_id", dataframe.columns)
    limitto50 = list(map(lambda x: substring(col(x).cast("string"), 0, 100).alias(x), removeid))
    pandas = dataframe.limit(displaylimit).select(limitto50).toPandas()
    html = pandas.to_html()
    

    return html
Example #29
    async def run(
        self,
        spark,
        filter='2014*',
        start='2014-01-01',
        end='2015-01-01',
        mode='append',
        **inputs,
    ):
        print('bitmex trades csv->parquet')

        df = spark.read \
            .csv(
                f's3a://stackpoint-spark/data/bitmex/trades/{filter}.csv.gz',
                header=True,
                schema=BitmexTrades,
            )

        df = df.filter(df.timestamp >= start) \
               .filter(df.timestamp < end)

        df = df.drop('grossValue')
        df = df.drop('homeNotional')
        df = df.drop('foreignNotional')

        # cut milli/nanoseconds and convert to timestamp
        conv_timestamp = F.to_timestamp(F.substring(df.timestamp, 0, 23),
                                        TIMESTAMP)
        df = df.withColumn("timestamp", conv_timestamp)

        df = df.withColumn("time", F.unix_timestamp(df.timestamp))
        df = df.withColumn("date", F.from_unixtime(df.time, "yyyyMMdd"))
        df = df.drop('timestamp')

        df.limit(5).show()

        count = df.count()
        print(f'{count} trades')

        volume = df.agg(F.sum("size")).collect()[0][0]
        print(f'total volume: ${volume}')

        print('writing parquet')
        df.write \
            .mode(mode) \
            .partitionBy(['date']) \
            .parquet('s3a://stackpoint-spark/data/bitmex/parquet/trades')

        print('done writing')
        return {
            'trades': count,
            'volume': volume,
        }
Example #30
def apply_basic_aggregations(df):
    """
    Take all transactions and apply some basic aggregations at the harp_property_id_no grain

    :param df:
    :return:
    """
    # Where for a given harp ID, hotel chain, lat/long, hotel_name has changed over the course of the analysis window,
    # simply take the most recent hotel_name
    df_current_hotel_names = \
        df \
            .groupBy('harp_property_id_no',
                     'hotel_name',
                     'hotel_chain_name',
                     'city_code',
                     'x',
                     'y',
                     ) \
            .agg(F.max('created_date').alias('max_created_date')) \
            .withColumn('hotel_name_rank',
                        F.dense_rank().over(Window
                                            .partitionBy(F.col('harp_property_id_no'))
                                            .orderBy(F.col('max_created_date').desc()))) \
            .filter('hotel_name_rank = 1') \
            .drop('hotel_name_rank', 'max_created_date')

    # Replace hotel names with most current iteration and aggregate down to the individual property grain
    return \
        df \
            .drop('hotel_name') \
            .join(df_current_hotel_names,
                  on=['harp_property_id_no',
                      'hotel_chain_name',
                      'city_code',
                      'x',
                      'y', ]) \
            .filter('star_rating is not null AND length(star_rating) > 1') \
            .withColumn('star_rating_number',
                        F.substring('star_rating', 0, 1).cast('integer')) \
            .groupBy('harp_property_id_no',
                     'hotel_name',
                     'hotel_chain_name',
                     'city_code',
                     'x',
                     'y',
                     ) \
            .agg(F.count(F.lit(1)).alias('num_bookings'),
                 F.max('star_rating').alias('star_rating'),
                 F.max('star_rating_number').alias('star_rating_number'),
                 F.avg('yield').alias('avg_yield'),
                 F.avg('adr').alias('avg_adr'),
                 )