Example #1
# imports needed by this snippet; 'builtin' exposes Python's own min/max
# (pyspark.sql.functions also defines min/max)
import builtins as builtin
from pyspark.sql.functions import lit, split, substring_index


def CreateSubstring(df, inCol, outCol, strLen, delim, startPos, endPos, makeList=False):

	# degenerate range: the result is an empty string
	if endPos <= startPos:
		df = df.withColumn(outCol, lit(''))

	# clamp the requested positions to the [0, strLen] range
	startPos = builtin.min(builtin.max(0, startPos), strLen)
	endPos = builtin.min(builtin.max(startPos, endPos), strLen)

	# substring starts at the first token: keep the first endPos tokens
	if startPos == 0:
		df = df.withColumn(outCol, substring_index(inCol, delim, endPos))

	# substring ends at the last token: keep the last (endPos - startPos) tokens
	elif endPos == strLen:
		df = df.withColumn(outCol, substring_index(inCol, delim, startPos - endPos))

	# substring is in the middle: keep the first endPos tokens, then take their right end
	else:
		df = df.withColumn(outCol, substring_index(inCol, delim, endPos)) \
			.withColumn(outCol, substring_index(outCol, delim, startPos - endPos))

	# optionally break the substring into a list of tokens
	if makeList:
		df = df.withColumn(outCol, split(outCol, delim))

	return df
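
A minimal usage sketch of the helper above (the DataFrame, column names, and delimiter are hypothetical; a SparkSession named spark and the imports above are assumed):

# One row with a "/"-delimited string of 4 tokens.
data = spark.createDataFrame([("a/b/c/d",)], ["path"])

# Keep tokens 1 and 2 (startPos=1, endPos=3).
result = CreateSubstring(data, "path", "middle", strLen=4, delim="/", startPos=1, endPos=3)
result.show()  # the 'middle' column should contain "b/c"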
Example #2
def multi_day_client_df(client_addons_df):
    """A single-locale version of the `clients_daily` data."""
    df = (client_addons_df.where("locale = 'en-US'").withColumn(
        "client_id", substring_index("client_id", "_", -1)).where(
            substring_index("client_id", "-", -1).isin([1, 2, 3, 4, 5])))
    df.createOrReplaceTempView("clients_daily")
    return df
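
For reference, substring_index(str, delim, count) returns everything before the count-th occurrence of the delimiter when count is positive, and everything after the count-th occurrence counting from the right when count is negative; a quick sketch with a made-up id (a SparkSession named spark is assumed):

from pyspark.sql.functions import substring_index

demo = spark.createDataFrame([("abc_123-456",)], ["client_id"])
demo.select(
    substring_index("client_id", "_", -1).alias("after_underscore"),  # "123-456"
    substring_index("client_id", "-", 1).alias("before_dash"),        # "abc_123"
    substring_index("client_id", "-", -1).alias("after_dash"),        # "456"
).show()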
Example #3
def run(rucio_path, dbs_path, output, verbose):
    start = time.time()
    spark = SparkSession.builder.appName("rucio_dumps_test").getOrCreate()
    csvreader = spark.read.format("csv") \
        .option("nullValue", "null") \
        .option("mode", "FAILFAST")
    avroreader = spark.read.format("avro")
    rucio_info = avroreader.load(rucio_path) \
        .withColumn("filename", fn.input_file_name())
    logger.debug("Rucio data types")
    logger.debug(rucio_info.dtypes)
    # rucio_info.show(5, False)
    dbs_files = csvreader.schema(schemas.schema_files()) \
        .load(dbs_path) \
        .select("f_logical_file_name", "f_dataset_id")    
    # dbs_files.show(5, False)
    rucio_df = (rucio_info.withColumn("tmp1", fn.substring_index("filename", "/rucio/", -1))
                .withColumn("tally_date", fn.substring_index("tmp1", "/", 1))
                .withColumn('create_day', fn.date_format(fn.to_date((rucio_info.CREATED_AT / fn.lit(1000))
                                                                    .cast(types.LongType())
                                                                    .cast(types.TimestampType())),
                                                         'yyyyMMdd')
                            )
                .withColumn('tally_day', fn.date_format(fn.to_date("tally_date", "yyyy-MM-dd"), 'yyyyMMdd'))
                .select("RSE_ID", "BYTES", "NAME", "SCOPE", "tally_day", "create_day")
                )
    # rucio_df.show(5, False)
    rucio_df = rucio_df \
        .join(dbs_files, dbs_files.f_logical_file_name == rucio_df.NAME) \
        .groupBy("RSE_ID", "f_dataset_id", "SCOPE", "tally_day", "create_day") \
        .agg(fn.sum("BYTES").alias("rep_size"))
    # rucio_df.show(5, False)
    rucio_df.write.option("compression", "snappy").parquet(output, mode="overwrite")
    end = time.time()
    logger.info("Elapsed Time: {min} min, {sec} sec.".format(min=(end - start) // 60, sec=(end - start) % 60))
Example #4
def test_substring_index(data_gen, delim):
    assert_gpu_and_cpu_are_equal_collect(lambda spark: unary_op_df(
        spark, data_gen).select(f.substring_index(f.col('a'), delim, 1),
                                f.substring_index(f.col('a'), delim, 3),
                                f.substring_index(f.col('a'), delim, 0),
                                f.substring_index(f.col('a'), delim, -1),
                                f.substring_index(f.col('a'), delim, -4)))
Example #5
    def do(self, workflow, etl_process):
        
        from pyspark.sql.functions import substring, substring_index, split, col

        self.new_column = self.action_details.pop("name")
        self.target = self.action_details.pop("target")

        self.type = self.action_details.pop("type", "simple")
        
        if self.type == "simple":
            self.pos = self.action_details.pop("pos", 1)
            self.len = self.action_details.pop("len")
            workflow.df = workflow.df \
                .withColumn(self.new_column, substring(col(self.target), self.pos, self.len))
        
        else:
            self.delim = self.action_details.pop("delim")
            self.index = self.action_details.pop("index", 1)

            if self.type == "delim": 
                workflow.df = workflow.df \
                    .withColumn(self.new_column, substring_index(col(self.target), self.delim, self.index))

            elif self.type == "delim_index":
                workflow.df = workflow.df \
                    .withColumn(self.new_column, split(self.target, self.delim).getItem(self.index - 1))
Example #6
    def get_column_spec(
        self, source_df: Optional[DataFrame], current_column: Optional[Column]
    ) -> Column:
        column_spec = substring_index(
            self.column.get_column_spec(
                source_df=source_df, current_column=current_column
            ), self.delimiter, self.delimiter_count
        )
        return column_spec
Example #7
def get_dataset(xmlQuery):
    """
    Runs an RCSB PDB Advanced Search web service using an XML query description.
    See https://www.rcsb.org/pdb/staticHelp.do?p=help/advancedSearch.html Advanced Search
    The returned dataset contains one of the following fields, depending on the query type:
    # structureId, e.g., 1STP
    # structureChainId, e.g., 4HHB.A
    # ligandId, e.g., HEM

    :param xmlQuery: RCSB PDB advanced query xml string
    :return: dataset with matching ids
    """

    # run advanced query
    ids = post_query(xmlQuery)

    # convert list of ids to a list of lists (required for dataframe creation below)
    id_list = [[i] for i in ids]

    # convert list of lists to a dataframe
    spark = SparkSession.builder.getOrCreate()

    # distinguish 3 types of results based on length of string
    # structureId: 4 (e.g., 4HHB)
    # structureEntityId: > 4 (e.g., 4HHB:1)
    # entityId: < 4 (e.g., HEM)

    if len(ids[0]) > 4:
        ds: DataFrame = spark.createDataFrame(id_list, ['pdbEntityId'])
        # if results contain an entity id, e.g., 101M:1, then map entityId to pdbChainId
        ds = ds.withColumn("pdbId", substring_index(ds.pdbEntityId, ':', 1))
        ds = ds.withColumn("entityId", substring_index(ds.pdbEntityId, ':',
                                                       -1))
        mapping = __get_entity_to_chain_id()
        ds = ds.join(mapping, (ds.pdbId == mapping.structureId) &
                     (ds.entityId == mapping.entity_id))
        ds = ds.select(ds.pdbChainId)
    elif len(ids[0]) < 4:
        ds: DataFrame = spark.createDataFrame(id_list, ['ligandId'])
    else:
        ds: DataFrame = spark.createDataFrame(id_list, ['pdbId'])

    return ds
Example #8
def community_indicators(spark: SparkSession) -> DataFrame:
    indicators_file = os.environ["CORE_CONF_fs_defaultFS"] + "/datasets/Community_Resiliency_Indicator_System.csv"

    # Read csv file
    indicators = spark.read \
        .format("csv") \
        .option("header", "true") \
        .load(indicators_file) \
        .cache()

    indicators = indicators.select(indicators["Neighborhood"].alias("neighborhood"),
                                   indicators["Haz_Score"].alias("hazard_score"),
                                   indicators["Env_Score"].alias("environment_score"),
                                   indicators["VCrim_Rate"].alias("violent_crime_rate"),
                                   indicators["Citz_Per"].alias("citizen_density"),
                                   indicators["Com_Score"].alias("community_score"),
                                   indicators["Food_Score"].alias("food_score"),
                                   indicators["Emp_per"].alias("employment_rate"),
                                   indicators["PopDens"].alias("population_density"),
                                   indicators["DayPopDens"].alias("population_density_day"))

    # Separate neighborhoods that have been combined with a "/"
    paired_neighborhoods = indicators.where(indicators["neighborhood"].like("%/%"))
    left_of_pairs = paired_neighborhoods.withColumn("neighborhood",
                                                    substring_index(paired_neighborhoods["neighborhood"], "/", 1))
    right_of_pairs = paired_neighborhoods.withColumn("neighborhood",
                                                     substring_index(paired_neighborhoods["neighborhood"], "/", -1))
    indicators = indicators.where(~indicators["neighborhood"].like("%/%")) \
        .unionAll(left_of_pairs) \
        .unionAll(right_of_pairs)

    # Use neighborhood areas to convert population density to population count
    areas = neighborhood_boundaries(spark) \
        .withColumn("area", calculate_area("polygon")) \
        .drop("polygon")

    indicators = indicators.join(areas, "neighborhood")
    indicators = indicators.withColumn("population", indicators["population_density"] * indicators["area"]) \
        .withColumn("population_day", indicators["population_density_day"] * indicators["area"]) \
        .drop("area")

    return indicators
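
The "/" handling above can be illustrated on its own; a small sketch with a made-up combined neighborhood name (a SparkSession named spark is assumed):

from pyspark.sql.functions import substring_index

pairs = spark.createDataFrame([("North Beach/Telegraph Hill",)], ["neighborhood"])
left = pairs.withColumn("neighborhood", substring_index("neighborhood", "/", 1))    # "North Beach"
right = pairs.withColumn("neighborhood", substring_index("neighborhood", "/", -1))  # "Telegraph Hill"
left.union(right).show()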
Example #9
def main(business_file, postcode_file, ethnicity_file):
    # load files
    df_business = spark.read.parquet(business_file)
    df_postcode = spark.read.csv(postcode_file, header=True)
    df_ethnicity = spark.read.csv(ethnicity_file, header=True)

    # filter for businesses in toronto
    df_toronto = df_business.where("City like '%Toronto%'")

    # Combine the external wellbeing datasets with the yelp datasets and pre-process
    # (df_wellbeing is assumed to be loaded elsewhere; it is not shown in this excerpt)
    df_join = df_wellbeing.join(df_postcode, on=['Neighbourhood'], how='left')
    df_join = df_join.drop('Combined Indicators', 'Borough')
    new_cols_ethn = [c.strip(' ') for c in df_ethnicity.columns]
    old_cols_ethn = df_ethnicity.schema.names
    df_ethnicity = reduce(
        lambda df_ethnicity, idx: df_ethnicity.withColumnRenamed(
            old_cols_ethn[idx], new_cols_ethn[idx]), range(len(old_cols_ethn)),
        df_ethnicity)

    df_indian = df_toronto.where("Categories like '%Indian%'")
    df_ethnicity = df_ethnicity.join(df_postcode,
                                     on=['Neighbourhood'],
                                     how='left')
    df_ethnicity_small = df_ethnicity.select('Neighbourhood',
                                             'Total Population', 'South Asian',
                                             'Postcode')
    df_ethn_norm = df_ethnicity_small

    cols = [
        'Chinese', 'South Asian', 'Black', 'Filipino', 'Latin American',
        'Southeast Asian', 'Arab', 'West Asian', 'Korean', 'Japanese',
        'Not a Visible Minority'
    ]

    for field in df_ethnicity_small.columns:
        if field in cols:
            df_ethn_norm = df_ethn_norm.withColumn(
                field,
                col(field) / col("Total Population"))

    df_indian_ethn = df_indian.withColumn(
        "PostCode", functions.substring_index(col("PostalCode"), " ",
                                              1)).join(df_ethn_norm,
                                                       on='PostCode',
                                                       how='left')

    df_ind_eth_sort = df_indian_ethn.orderBy('BusinessStars', ascending=False).select('BusinessID', 'Name', 'Latitude',
                                                                                      'Longitude', \
                                                                                      'BusinessStars', 'Neighbourhood',
                                                                                      'South Asian')

    df_ind_eth_sort.coalesce(1).write.csv('df_ind_eth_sort.csv')
Example #10
    def get_column_spec(
        self,
        source_df: Optional[DataFrame],
        current_column: Optional[Column],
        parent_columns: Optional[List[Column]],
    ) -> Column:
        column_spec = substring_index(
            self.column.get_column_spec(
                source_df=source_df,
                current_column=current_column,
                parent_columns=parent_columns,
            ),
            self.delimiter,
            self.delimiter_count,
        )
        return column_spec
Example #11
def reformat_v1_0(flight, pqFolder, pqFileName):
	"""
		Read in the original v1.0 dataframe and save as a new parquet file compatible with v1.1
		@params:        
			flight		  - Required  : original v1.0 data(Spark DataFrame)        
			pqFolder      - Required  : folder to save the parquet files into (Str)        
			pqFileName    - Required  : parquet file name (Str)
	"""
	flight2 = (flight.withColumn('stayDays', correct_stay_days_UDF(col('trip'), col('stay_days')))
					 .drop('stay_days')           
					 .withColumnRenamed('start_date', 'depDate')                 
					 .withColumn('depDate', to_date('depDate'))
					 .selectExpr('*', 'date_add(depDate, stayDays) as retDate')# this is when the return trip starts, might arrive a day later
					 .withColumnRenamed('from_city_name', 'fromCity')
					 .withColumnRenamed('to_city_name', 'toCity')                 
					 .withColumnRenamed('search_date', 'searchDate')                 
					 .withColumn('searchDate', to_date('searchDate'))
					 .withColumnRenamed('company', 'airlineName')                 
					 .withColumnRenamed('dep_time', 'departureTime')                                  
					 .withColumnRenamed('arr_time', 'arrivalTime')                                                   
					 .withColumn('duration_h', split(flight.duration,'h').getItem(0))
					 .withColumn('duration_m', F.substring_index(split(flight.duration,'h').getItem(1), 'm', 1))
	#                  .withColumn('duration', F.struct(col('duration_h'), col('duration_m')))
					 .withColumn('duration_m', (col('duration_h')*60 + col('duration_m')))
					 .drop('duration', 'duration_h', 'flight_number')
					 .withColumnRenamed('price_code', 'currencyCode')                                  
					 .withColumnRenamed('stop', 'stops')
					 .withColumn('stops', col('stops').cast('byte')) 
					 .withColumn('stop_info', split(col('stop_info'), ';'))
	#                  .withColumn('stop_duration', take_all_duration_UDF(col('stop_info')))
					 .withColumn('noOfTicketsLeft', correct_tickets_left_UDF('ticket_left'))
					 .withColumn('noOfTicketsLeft', col('noOfTicketsLeft').cast('byte')) 
					.drop('ticket_left')
				   .withColumnRenamed('table_name', 'tableName')
				   .withColumn('task_id', col('task_id').cast('long')) 
				   .withColumn('span_days', col('span_days').cast('integer')) 
					.select('price', 'version', 'searchDate', 'tableName', 'task_id', 'currencyCode', 
							'fromCity', 'toCity', 'trip', 'depDate', 'retDate',
							'stayDays', 
						   'departureTime', 'arrivalTime', 
							'airlineName',  'duration_m', 
							'flight_code', 'plane', 'stops', 'noOfTicketsLeft',
						   'airline_code', 'airline_codes',
						   'stop_info', 'span_days', 'power', 'video', 'wifi')                #'stop_duration', 
			  )

	flight2.repartition(1).write.parquet(os.path.join(pqFolder, pqFileName))
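
The duration parsing above (split on 'h', then substring_index on 'm') can be checked in isolation; a minimal sketch with a made-up duration string (a SparkSession named spark is assumed):

from pyspark.sql import functions as F

durations = spark.createDataFrame([("2h 35m",)], ["duration"])
parsed = (durations
          .withColumn("duration_h", F.split("duration", "h").getItem(0))                                     # "2"
          .withColumn("duration_m", F.trim(F.substring_index(F.split("duration", "h").getItem(1), "m", 1)))  # "35"
          .withColumn("total_minutes", F.col("duration_h").cast("int") * 60 + F.col("duration_m").cast("int")))
parsed.show()  # total_minutes should be 155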
Example #12
def test_auto_mapper_substring_by_delimiter(
    spark_session: SparkSession
) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, 'Qureshi', 'Imran', "1970-01-01"),
            (2, 'Vidal', 'Michael', "1970-02-02"),
        ], ['member_id', 'last_name', 'first_name', "date_of_birth"]
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(
        my_column=A.substring_by_delimiter(A.column("last_name"), "s", 1)
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df
    )
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert str(sql_expressions["my_column"]) == str(
        substring_index(col("b.last_name"), "s", 1).alias("my_column")
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select("my_column"
                                                    ).collect()[0][0] == "Qure"
    assert result_df.where("member_id == 2"
                           ).select("my_column").collect()[0][0] == "Vidal"
Example #13
def main(business_data, labour_data, postcode_data):

    df_business = spark.read.parquet(business_data)
    df_labour = spark.read.csv(labour_data, header=True)
    df_postcode = spark.read.csv(postcode_data, header=True)

    df_toronto = df_business.where("City like '%Toronto%'")

    # Strip spaces from columns
    new_cols_lb = [c.strip(' ') for c in df_labour.columns]
    old_cols_lb = df_labour.schema.names
    df_labour = reduce(
        lambda df_labour, idx: df_labour.withColumnRenamed(
            old_cols_lb[idx], new_cols_lb[idx]), range(len(old_cols_lb)),
        df_labour)

    df_labour = df_labour.join(df_postcode, on=['Neighbourhood'], how='left')
    df_labour = df_labour.drop('CombinedIndicators', 'Borough',
                               'TotalPopulation')
    df_lb_norm = df_labour

    df_lb_norm = df_lb_norm.withColumn(
        "LabourForceCategory",
        df_lb_norm["LabourForceCategory"].cast(IntegerType()))
    df_lb_norm = df_lb_norm.withColumn(
        "InLabourForce", df_lb_norm["InLabourForce"].cast(IntegerType()))

    df_lb = df_toronto.withColumn("PostCode", functions.substring_index(col("PostalCode"), " ", 1)) \
        .join(df_lb_norm, on='PostCode', how='left')

    df_lb_pandas = df_lb.toPandas()
    df_lb_pandas = df_lb_pandas.dropna(
    )  #neighbourhoods not present in the toronto data dropped
    df_lb_pandas['ratio_emply'] = df_lb_pandas['InLabourForce'] / df_lb_pandas[
        'LabourForceCategory']
    df_lb_pandas_emply = df_lb_pandas[[
        'BusinessID', 'Neighbourhood', 'BusinessStars', 'ratio_emply'
    ]]

    df_lb_pandas_emply = df_lb_pandas_emply.astype({'BusinessStars': 'double'})
    df_group_lb = df_lb_pandas_emply.groupby('Neighbourhood').mean()

    df_group_lb.to_csv('df_lb_pandas_emply')
Example #14
def variants_from_vcf (vcf):
  """
  Given a VCF file in a data frame, extract the first 9 variant columns and give them unique identifiers.  Include genotype columns as an array parsed out
  with a pandas udf
  """

  # Get the main data and put a unique index on each variant
  maindata = vcf.filter(~vcf.data.startswith('#'))
  splitdata = maindata.select("filename",f.split(f.substring_index('data',"[\t ]+",9),"[\t ]+").alias("split_data"),maindata.lineid.alias("VAR_IDX"))
    
  # Now pull out the columns one at a time, casting non-strings to appropriate type.  Split out INFO and FORMAT here
  variant = splitdata.select("filename","VAR_IDX",\
    f.element_at(splitdata.split_data,1).alias("CHR"),\
    f.element_at(splitdata.split_data,2).cast(IntegerType()).alias("POS"),\
    f.element_at(splitdata.split_data,3).alias("ID"),\
    f.element_at(splitdata.split_data,4).alias("REF"),\
    f.element_at(splitdata.split_data,5).alias("ALT"),\
    f.element_at(splitdata.split_data,6).cast(FloatType()).alias("QUAL"),\
    f.element_at(splitdata.split_data,7).alias("FILTER"),\
    f.split(f.element_at(splitdata.split_data,8), ";").alias("INFO"),\
    f.split(f.element_at(splitdata.split_data,9), ":").alias("FORMAT"))
  return(variant)
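
Note that split interprets its pattern as a regular expression, while substring_index matches its delimiter literally, so a pattern such as "[\t ]+" only has an effect in split; a small sketch on a made-up tab-separated line (a SparkSession named spark is assumed):

from pyspark.sql import functions as f

line = spark.createDataFrame([("chr1\t123\trs1\tA\tG",)], ["data"])
line.select(
    f.split("data", "[\t ]+").getItem(0).alias("chrom"),           # "chr1" (regex delimiter)
    f.substring_index("data", "\t", 2).alias("first_two_fields"),  # "chr1\t123" (literal delimiter)
).show()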
Example #15
def user_dimension(user_df):
    # cleaning user data and dropping unnecessary columns

    user_dim = user_df.withColumn("user_friend_count", F.size(F.split(user_df.friends, ','))) \
        .withColumn('year', F.year(user_df.yelping_since)) \
        .withColumn('month', F.month(user_df.yelping_since)) \
        .withColumnRenamed("fans", "user_fans") \
        .withColumnRenamed("review_count", "user_review_count") \
        .withColumnRenamed("useful", "useful_vote_cnt") \
        .withColumnRenamed("funny", "funny_vote_cnt") \
        .withColumnRenamed("cool", "cool_vote_cnt") \
        .withColumnRenamed("useful", "useful_vote_cnt") \
        .withColumnRenamed("average_stars", "user_average_stars") \
        .withColumn("user_elite_year_cnt", F.when(F.length(user_df.elite) == 0, 0)
                    .otherwise(F.size(F.split(user_df.elite, ',')))) \
        .withColumn("is_user_elite", F.when(F.substring_index(user_df.elite, ',', -1) == currentYear - 1, True)
                    .otherwise(False)) \
        .select("user_id", "name", "user_review_count", "yelping_since", "user_friend_count",
                "useful_vote_cnt", "funny_vote_cnt", "cool_vote_cnt",
                "user_average_stars", "user_elite_year_cnt", "is_user_elite",
                "user_fans", "year", "month")
    return user_dim
Example #16
shutil.rmtree(imagePath, ignore_errors=True)
shutil.rmtree(deltaPath, ignore_errors=True)

request.urlretrieve(imageGzipUrl, imageGzipPath)
shutil.unpack_archive(imageGzipPath, imagePath)

# read the images from the flowers dataset
images = spark.read.format("binaryFile").\
  option("recursiveFileLookup", "true").\
  option("pathGlobFilter", "*.jpg").\
  load(imagePath)

# Knowing the file path, extract the flower type and filename using substring_index
# Remember, Spark DataFrames are immutable; here we simply reassign the images variable
images = images.withColumn("flowerType_filename",
                           fn.substring_index(images.path, "/", -2))
images = images.withColumn(
    "flowerType", fn.substring_index(images.flowerType_filename, "/", 1))
images = images.withColumn(
    "filename", fn.substring_index(images.flowerType_filename, "/", -1))
images = images.drop("flowerType_filename")
images.show()

# Select the columns we want to write out to
df = images.select("path", "content", "flowerType", "filename").repartition(4)
df.show()

# Write out the delta table to the given path, this will overwrite any table that is currently there
df.write.format("delta").mode("overwrite").save(deltaPath)

# Reads the delta table that was just written
dfDelta = spark.read.format("delta").load(deltaPath)
dfDelta.show()
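
The two-step substring_index extraction on the file path can be seen on a single made-up path (a SparkSession named spark is assumed):

from pyspark.sql import functions as fn

paths = spark.createDataFrame([("dbfs:/tmp/flower_photos/daisy/100080576.jpg",)], ["path"])
paths.select(
    fn.substring_index("path", "/", -2).alias("flowerType_filename"),                     # "daisy/100080576.jpg"
    fn.substring_index(fn.substring_index("path", "/", -2), "/", 1).alias("flowerType"),  # "daisy"
    fn.substring_index(fn.substring_index("path", "/", -2), "/", -1).alias("filename"),   # "100080576.jpg"
).show(truncate=False)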
Example #17
def get_spark_commits(date_str):
    # 2.1: Change the github_api_url so that it queries with the input date
    # Convert the date string into a date object
    fromDate = datetime.strptime(date_str, '%Y%m%d').date()
    toDate = fromDate + timedelta(days=1)

    # Construct the Git URL to fetch JSON Object(s)
    request = 'https://api.github.com/repos/apache/spark/commits?since=' + str(
        fromDate) + 'T00:00:00Z&until=' + str(toDate) + 'T00:00:00'
    print('Beginning file download from: ' + request)

    import urllib.request, urllib.error
    try:
        # Get the json object(s) with Git URL
        response = urllib.request.urlopen(request)
    except urllib.error.HTTPError as e:
        # Return code error (e.g. 404, 501, ...)
        print('HTTPError: {}'.format(e.code))
    except urllib.error.URLError as e:
        # Not an HTTP-specific error (e.g. connection refused)
        print('URLError: {}'.format(e.reason))
    else:
        # 200
        sourceJASON = response.read()

    import pandas as pd

    # response.read() returns a bytes object, which is just a sequence of bytes.
    # You need to decode it first, because Python doesn't know what the bytes represent.
    jsonData = json.loads(sourceJASON.decode('utf-8'))

    from pyspark import SparkContext
    # Create Spark Context Directly by passing the config parameters
    sc = SparkContext("local[*]", "PySpark Electronic Arts Test")

    # from pyspark import SparkSession
    from pyspark.sql import SparkSession
    spark = SparkSession(sc)
    # Create a Spark DataFrame from a Pandas DataFrame using Arrow
    # A pandas DataFrame is not distributed; it exists on the driver node only.
    # In order to achieve parallelism we need to distribute the data across the cluster,
    # which a Spark DataFrame does.
    source_df = spark.createDataFrame(pd.DataFrame(jsonData))

    source_df.printSchema()
    source_df.show()

    from pyspark.sql.types import DateType, IntegerType

    # Create a new DataFrame by selecting only few Key Value Pairs from the original JSON Object(s)
    jsonDF = source_df.select(source_df.sha.alias('sha') \
                              , source_df.author.login.alias('login_name') \
                              , source_df.committer.id.cast(IntegerType()).alias('commiter_id') \
                              , F.concat_ws(' ', F.map_values(source_df.commit.message)).alias('message') \
                              , source_df.commit.author.date.cast(DateType()).alias('commit_date') \
                              , source_df.commit.author.email.alias('email') \
                              , F.substring_index(source_df.commit.author.email, '@', -1).alias('email_company') \
                              , source_df.url.alias('url'))

    # Save this DataFrame in memory as it will be used multiple times in the future
    jsonDF.cache()
    jsonDF.printSchema()
    jsonDF.show()

    # Set Parameters for PostgreSQL Database Connection
    url_connect = "jdbc:postgresql://pa1postgreserver.postgres.database.azure.com:5432/postgres?"
    commitTable = "F_SPARK_COMMITS"
    authorTable = "F_SPARK_AUTHORS"
    mode = "append"
    db_properties = {
        "user": "******",
        "password": "******",
        "driver": "org.postgresql.Driver"
    }

    # Read the Authors Table from PostgreSQL DB into a Spark DataFrame Object
    readAuthorTableDF = spark.read.jdbc(url=url_connect,
                                        table=authorTable,
                                        properties=db_properties)

    # Check if the Authors table is empty or not
    # If the table in the db is empty then insert the authors dataframe directly
    # If the table is not empty, join the two author tables and filter out the authors that already exist in the DB
    # Insert only the new author records into the DB table
    if len(readAuthorTableDF.head(1)) > 0:
        authDF = jsonDF.join(readAuthorTableDF, jsonDF.login_name == readAuthorTableDF.login_name, how='left') \
            .filter(readAuthorTableDF.login_name.isNull()) \
            .select(jsonDF.login_name \
                    , jsonDF.commiter_id \
                    , jsonDF.email \
                    , jsonDF.email_company)
    else:
        authDF = jsonDF.select(jsonDF.login_name \
                               , jsonDF.commiter_id \
                               , jsonDF.email \
                               , jsonDF.email_company)
    authDF.write.jdbc(url=url_connect,
                      table=authorTable,
                      mode="append",
                      properties=db_properties)
    authDF.show()

    # Read the Authors table after inserting the new authors
    readAuthorTableDF = spark.read.jdbc(url=url_connect,
                                        table=authorTable,
                                        properties=db_properties)
    # Read the Commits Table from PostgreSQL DB into a Spark DataFrame Object before Update
    readCommitTableDF = spark.read.jdbc(url=url_connect,
                                        table=commitTable,
                                        properties=db_properties)
    # Create a DataFrame by joining the DataFrame created from the source JSON with the authors table contents
    # Do an inner join with the authors DB table DataFrame to fetch only the records that have a commiter_id in the authors table
    commitDF = jsonDF.join(readAuthorTableDF,
                           jsonDF.commiter_id == readAuthorTableDF.commiter_id,
                           how='inner')

    from pyspark.sql import Row

    # Check if the Commits table is empty or not
    # If the table in the db is empty then insert the commits dataframe directly
    # If the table is not empty then check the last load date in the commits db table
    # and drop all records when the current date equals that date (i.e., the load already ran today)
    # The above step will make sure the process is idempotent.
    # Insert only the new author records into Commits DB table
    if len(readCommitTableDF.head(1)) > 0:
        maxDate = readCommitTableDF.orderBy(
            readCommitTableDF.creation_date.desc()).head(1)[0].creation_date
        commitDF = commitDF.filter(F.current_timestamp().cast(DateType()) != maxDate).select(jsonDF.sha \
                                                                                             , jsonDF.url \
                                                                                             , jsonDF.message \
                                                                                             , jsonDF.commit_date \
                                                                                             ,
                                                                                             readAuthorTableDF.author_id \
                                                                                             ,
                                                                                             readAuthorTableDF.creation_date)
    else:
        commitDF = commitDF.select(jsonDF.sha\
                                   , jsonDF.url\
                                   , jsonDF.message
                                   , jsonDF.commit_date\
                                   , readAuthorTableDF.author_id\
                                   , readAuthorTableDF.creation_date)
    commitDF.show()
    commitDF.write.jdbc(url=url_connect,
                        table=commitTable,
                        mode="append",
                        properties=db_properties)
Example #18
    def augment(df):
        if 'addons' in df.columns:
            df = df.select(['*'] + [create_get_addon_name_udf(addon)(df['addons']).alias(addon.replace('.', '__DOT__')) for addon in all_addons] + [create_get_addon_version_udf(addon)(df['addons']).alias(addon.replace('.', '__DOT__') + '-version') for addon in all_addons])

        if 'json_dump' in df.columns:
            df = df.select(['*'] + [functions.array_contains(df['json_dump']['modules']['filename'], module_name).alias(module_id) for module_id, module_name in module_ids.items()])

        if 'plugin_version' in df.columns:
            df = df.withColumn('plugin', df['plugin_version'].isNotNull())

        if 'app_notes' in df.columns:
            df = df.select(['*'] + [(functions.instr(df['app_notes'], app_note.replace('__DOT__', '.')) != 0).alias(app_note) for app_note in all_app_notes] + [(functions.instr(df['app_notes'], 'Has dual GPUs') != 0).alias('has dual GPUs')])

        if 'graphics_critical_error' in df.columns:
            df = df.select(['*'] + [(functions.instr(df['graphics_critical_error'], error.replace('__DOT__', '.')) != 0).alias(error) for error in all_gfx_critical_errors])

        if 'total_virtual_memory' in df.columns and 'platform_version' in df.columns and 'platform' in df.columns:
            def get_arch(total_virtual_memory, platform, platform_version):
                if total_virtual_memory:
                    try:
                        if int(total_virtual_memory) < 2684354560:
                            return 'x86'
                        else:
                            return 'amd64'
                    except:
                        return 'unknown'
                elif platform == 'Mac OS X':
                    return 'amd64'
                else:
                    if 'i686' in platform_version:
                        return 'x86'
                    elif 'x86_64' in platform_version:
                        return 'amd64'

            get_arch_udf = functions.udf(get_arch, StringType())

            df = df.withColumn('os_arch', get_arch_udf(df['total_virtual_memory'], df['platform'], df['platform_version']))

        if 'adapter_driver_version' in df.columns:
            def get_driver_version(adapter_vendor_id, adapter_driver_version):
                # XXX: Sometimes we have a driver which is not actually made by the vendor,
                #      in those cases these rules are not valid (e.g. 6.1.7600.16385).
                if adapter_driver_version:
                    if adapter_vendor_id == '0x8086' or adapter_vendor_id == '8086':
                        return adapter_driver_version[adapter_driver_version.rfind('.') + 1:]
                    elif adapter_vendor_id == '0x10de' or adapter_vendor_id == '10de':
                        return adapter_driver_version[-6:-5] + adapter_driver_version[-4:-2] + '.' + adapter_driver_version[-2:]
                    # TODO: AMD?

                return adapter_driver_version

            get_driver_version_udf = functions.udf(get_driver_version, StringType())

            df = df.withColumn('adapter_driver_version_clean', get_driver_version_udf(df['adapter_vendor_id'], df['adapter_driver_version']))

        if 'cpu_info' in df.columns:
            df = df.withColumn('CPU Info', functions.substring_index(df['cpu_info'], ' | ', 1))
            df = df.withColumn('Is Multicore', functions.substring_index(df['cpu_info'], ' | ', -1) != '1')

        if 'dom_ipc_enabled' in df.columns:
            df = df.withColumnRenamed('dom_ipc_enabled', 'e10s_enabled')

        if 'memory_ghost_windows' in df.columns:
            df = df.withColumn('ghost_windows > 0', df['memory_ghost_windows'] > 0)

        if 'memory_top_none_detached' in df.columns:
            df = df.withColumn('top(none)/detached > 0', df['memory_top_none_detached'] > 0)

        return df
Example #19
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName('fifth_exercise').getOrCreate()

df = spark.read.csv("price_paid_records.csv", header=True)
df = df.select(
    F.col('`Date of Transfer`'),
    F.substring_index(F.col('`Date of Transfer`'), "-", 1).alias('year'))
df = df.select('year', F.split('`Date of Transfer`', "-")[1].alias('month'))
df = df.withColumn('month', df.month.cast('integer'))

df1 = df.groupBy('year',
                 'month').count().groupBy('year').max('count').alias('counts')
df = df.groupBy('year', 'month').count().alias('original')

df = df.join(df1, (F.col('original.count') == F.col('counts.max(count)')) &
             (F.col('original.year') == F.col('counts.year')))
df = df.select('original.year', 'original.month')

df.show()
Example #20
    def read_spark_df_from_msexchange_data_store(self, **args):
        url = args["hbase_url"]

        r = requests.get(url)

        # Convert the API response to JSON
        try:
            d = r.json()

        except:
            print("Invalid URL")

        # Checking for data availability
        if len(d) == 0:
            print(
                "There are no events to process. Please enter a different search criteria in the url."
            )

        # Converting API data into Spark Dataframe
        print("Reading the data from profiler...")
        spark = SparkSession.builder.appName(
            'mseapi').enableHiveSupport().getOrCreate()
        sc = spark.sparkContext
        tsRDD = sc.parallelize(d)
        df_mail = spark.read.option('multiline', "true").json(tsRDD)
        total_evt_count = df_mail.count()
        print("Total number of records: " + str(total_evt_count))

        if total_evt_count > 0:
            mail_len = f.udf(lambda s: len(s), LongType())
            mail_sum = f.udf(lambda s: sum(s), LongType())
            # mail_mean  = f.udf(lambda s: round(mean(s),4), FloatType())
            # mail_stdev = f.udf(lambda s: round(stdev(s),4), FloatType())


            df_mail_grp = df_mail.filter(f.length(f.trim(df_mail["mail_size"]))>0)\
                            .withColumn("check", f.when(f.instr(df_mail["mail_size"],',') == 1,f.substring_index(df_mail["mail_size"],',',-1)).otherwise(df_mail["mail_size"]))\
                            .withColumn("ext_sndrs", df_mail["ext_sndrs"].cast(LongType()))\
                            .withColumn("mail_size", f.regexp_replace('check', ' ', ''))\
                            .groupBy(["mail_id"]).agg(f.split(f.concat_ws(",", f.collect_list("mail_size")),',')
                                                            .cast(ArrayType(IntegerType())).alias("email_size"),
                                                    f.sum("ext_sndrs").alias("ext_sndrs"))\
                            .withColumn("no_of_emails", mail_len("email_size"))\
                            .withColumn("tot_email_size", mail_sum("email_size"))\
                            .withColumn("avg_email_size", f.round(f.col("tot_email_size")/ f.col("no_of_emails"),4))\
                            .drop("email_size")
            #.withColumn("email_size_mean", mail_mean("email_size"))\
            #.withColumn("email_size_stdev", f.when(mail_len("email_size") > 1,mail_stdev("email_size")))\

            # df_mail_grp = df_mail.filter(f.length(f.trim(df_mail["mail_size"]))>0)\
            #                 .withColumn("check", f.when(f.instr(df_mail["mail_size"],',') == 1,f.substring_index(df_mail["mail_size"],',',-1)).otherwise(df_mail["mail_size"]))\
            #                 .withColumn("ext_sndrs", df_mail["ext_sndrs"].cast(LongType()))\
            #                 .withColumn("mail_size", f.regexp_replace('check', ' ', ''))\
            #                 .groupBy(["mail_id"]).agg(f.split(f.concat_ws(",", f.collect_list("mail_size")),',')
            #                                                 .cast(ArrayType(IntegerType())).alias("email_size"),
            #                                         f.sum("ext_sndrs").alias("ext_sndrs"))\
            #                 .withColumn("no_of_emails", mail_len("email_size"))\
            #                 .withColumn("tot_email_size", mail_sum("email_size"))\
            #                 .withColumn("avg_email_size", f.round(f.col("tot_email_size")/ f.col("no_of_emails"),4))\
            #                 .drop("email_size")
            #                 #.withColumn("email_size_mean", mail_mean("email_size"))\
            #                 #.withColumn("email_size_stdev", f.when(mail_len("email_size") > 1,mail_stdev("email_size")))\

            # df_mail_grp = df_mail.withColumn("ext_sndrs", df_mail["ext_sndrs"].cast(LongType()))\
            #                     .withColumn("mail_size", f.regexp_replace('mail_size', ' ', ''))\
            #                     .groupBy(["mail_id"]).agg(f.split(f.concat_ws(",", f.collect_list("mail_size")),',')
            #                                                     .cast(ArrayType(IntegerType())).alias("email_size"),
            #                                             f.sum("ext_sndrs").alias("ext_sndrs"))\
            #                     .withColumn("no_of_emails", mail_len("email_size"))\
            #                     .withColumn("tot_email_size", mail_sum("email_size"))\
            #                     .withColumn("email_size_mean", mail_mean("email_size"))\
            #                     .withColumn("email_size_stdev", mail_stdev("email_size"))\
            #                     .drop("email_size")
            df_mail_grp.show(3)
            return df_mail_grp

        else:
            schema = StructType([])
            sqlContext = SQLContext(sc)
            sdf = sqlContext.createDataFrame(sc.emptyRDD(), schema)
            return sdf
Example #21
  os.remove(imageGzipPath)
shutil.rmtree(imagePath, ignore_errors=True)
shutil.rmtree(deltaPath, ignore_errors=True)

request.urlretrieve(imageGzipUrl, imageGzipPath)
shutil.unpack_archive(imageGzipPath, imagePath)

# read the images from the flowers dataset
images = spark.read.format("binaryFile").\
  option("recursiveFileLookup", "true").\
  option("pathGlobFilter", "*.jpg").\
  load(imagePath)

# Knowing the file path, extract the flower type and filename using substring_index
# Remember, Spark DataFrames are immutable; here we simply reassign the images variable
images = images.withColumn("flowerType_filename", fn.substring_index(images.path, "/", -2))
images = images.withColumn("flowerType", fn.substring_index(images.flowerType_filename, "/", 1))
images = images.withColumn("filename", fn.substring_index(images.flowerType_filename, "/", -1))
images = images.drop("flowerType_filename")
images.show()

# Select the columns we want to write out to
df = images.select("path", "content", "flowerType", "filename").repartition(4)
df.show()

# Write out the delta table to the given path, this will overwrite any table that is currently there
df.write.format("delta").mode("overwrite").save(deltaPath)

# Reads the delta table that was just written
dfDelta = spark.read.format("delta").load(deltaPath)
dfDelta.show()
Example #22
                                avg("col_2").over(spec)).withColumn(
                                    "rank_val",
                                    rank().over(spec)).withColumn(
                                        "dense_rank_val",
                                        dense_rank().over(spec)).show()

# COMMAND ----------

from pyspark.sql.functions import col, substring, substring_index, instr, split, concat_ws, repeat
from pyspark.sql.types import StringType
#substring
#orders_new_col.show()
func_df = orders_new_col.select(
    'order_status',
    substring('order_status', 1, 2).alias("sub"),
    substring_index('order_status', "E", -3).alias("sub_ind")).select(
        "*",
        instr('sub_ind', 'E').alias("instr_val"),
        split('order_status', "_")[0].alias("split_val")).select(
            "*",
            concat_ws("|", "order_status", "sub").alias("concat_val"))
func_df.withColumn("repeat_val", repeat("instr_val", 3)).select(
    "*",
    concat_ws("|", *func_df.columns).alias("conc_ws")).show(truncate=False)
#orders_new_col.select(substring_index('order_status', "_", 2)).show()
#list_1 = ["col_1", "col_2"]
#df_1 = spark.createDataFrame(list_1, StringType())
#df_1.select(substring_index("value", "_", 1)).show()

# COMMAND ----------
from pyspark.sql import *
from pyspark.sql.functions import regexp_extract, substring_index

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .master("local[3]") \
        .appName("LogFileDemo") \
        .getOrCreate()

    file_df = spark.read.text("data/apache_logs.txt")
    file_df.printSchema()

    log_reg = r'^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+) (\S+)" (\d{3}) (\S+) "(\S+)" "([^"]*)'

    logs_df = file_df.select(regexp_extract('value', log_reg, 1).alias('ip'),
                             regexp_extract('value', log_reg, 4).alias('date'),
                             regexp_extract('value', log_reg, 6).alias('request'),
                             regexp_extract('value', log_reg, 10).alias('referrer'))

    logs_df \
        .where("trim(referrer) != '-' ") \
        .withColumn("referrer", substring_index("referrer", "/", 3)) \
        .groupBy("referrer") \
        .count() \
        .show(100, truncate=False)
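
With count=3, substring_index keeps everything before the third "/", i.e. the scheme and host of the referrer URL; a quick sketch with a made-up URL (a SparkSession named spark is assumed):

from pyspark.sql.functions import substring_index

urls = spark.createDataFrame([("https://www.example.com/docs/page.html",)], ["referrer"])
urls.select(substring_index("referrer", "/", 3).alias("site")).show(truncate=False)
# site should be "https://www.example.com"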