Example #1
def table_schema_from_spark(hcat_table_name):
    #returns schema of table with this database.name in hcatalog
    #   (spark-workaround as long as hcatweb api is not available...)
    # initialize spark
    import findspark
    findspark.init()
     
    import pyspark
    from pyspark.sql import HiveContext
    
    sc_conf = pyspark.SparkConf()
    #sc_conf.set('spark.executor.extraClassPath','/opt/cloudera/parcels/CDH/lib/hive/lib/*')
    #sc_conf.set('spark.master','yarn-client')
    
    sc = pyspark.SparkContext(appName = 'ade_get_table_schema', conf=sc_conf)
    hc = HiveContext(sc)
    
    hive_schema = hc.table(hcat_table_name).schema.jsonValue()
    
    print(hive_schema)
    
    sc.stop()
    
    table_schema = {'columns':{}}
    
    col_sequence = 0
    for field in hive_schema['fields']:
        table_schema['columns'][field['name']] = {'col_sequence': col_sequence, 'type':field['type']}
        col_sequence += 1
    
    return table_schema
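
# A hedged usage sketch (not part of the original snippet): 'default.example_table' is a
# placeholder database.table name; the function prints the raw Hive schema and returns the
# column map built above.
if __name__ == '__main__':
    schema = table_schema_from_spark('default.example_table')
    for col_name, col_info in sorted(schema['columns'].items(),
                                     key=lambda item: item[1]['col_sequence']):
        print('{0}: {1}'.format(col_name, col_info['type']))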
Example #2
    def create_dataframe_from_hive(spark_session, dbConnectionParams):
        df = None
        try:
            sc = SparkSession.builder.appName("Testing").config(
                conf=SparkConf()).enableHiveSupport().getOrCreate()
            sqlContext = HiveContext(sc)
            sqlContext.setConf(
                "hive.metastore.uris",
                "thrift://{}:{}".format(dbConnectionParams.get("host"),
                                        dbConnectionParams.get("port")))

            tdf = sqlContext.sql("show databases")
            tdf.show()

            schema = DataLoader.get_db_name(dbConnectionParams)
            table_name = dbConnectionParams.get("tablename")
            df = sqlContext.table(".".join([schema, table_name]))

        except Exception as e:
            print("couldn't connect to hive")
            raise e
        return df
def filtering(alpha_thr):
    # the original snippet declares this function but leaves its body empty;
    # pass keeps the module syntactically valid
    pass


if __name__ == "__main__":
    sc = SparkContext(appName="Edges Filtering Hive v1")
    hc = HiveContext(sparkContext=sc)
    tbl = hc.table("mi2mi.edges")
    tbl.registerTempTable("edges")
    edgesDF = hc.sql("select SID1, SID2, EdgeCost from edges where MilanoDate='2013-11-01'")
    edgesDF.show()

    v = hc.sql("select distinct SID1 id from edges where MilanoDate='2013-11-01' order by SID1")
    e = hc.sql("select SID1 src, SID2 dst, EdgeCost cost from edges where MilanoDate='2013-11-01'")
    d = hc.sql("select sid1, sum(EdgeCost) node_strength from edges where sid1 != sid2 and MilanoDate='2013-11-01' group by sid1")
    g = GraphFrame(v, e)

    g.vertices.show()
    g.edges.show()

    
    d = hc.sql("select sid1, sid2, 1 - (select count(distinct sid1) - 2 from edges where MilanoDate=e.MilanoDate)*(EdgeCost/(select sum(EdgeCost) node_strength from edges where sid1 != sid2 and MilanoDate=e.MilanoDate and sid1 = e.sid1 group by sid1) + pow(1 - EdgeCost/(select sum(EdgeCost) node_strength from edges where sid1 != sid2 and MilanoDate=e.MilanoDate and sid1 = e.sid1 group by sid1), (select count(distinct sid1) - 1 from edges where MilanoDate=e.MilanoDate))) /((select count(distinct sid1) - 1 from edges where MilanoDate=e.MilanoDate) * (EdgeCost/(select sum(EdgeCost) node_strength from edges where sid1 != sid2 and MilanoDate=e.MilanoDate and sid1 = e.sid1 group by sid1) - 1)) alpha from edges e where MilanoDate='2013-11-01'")
    d.show()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--output_path',
        help='output path in s3 including bucket without file name')
    args = parser.parse_args()
    if args.output_path:
        output_path = args.output_path
    else:
        raise ValueError('missing argument - output_path')

    model_name = 'photography'
    spark_config = SparkConf().setAppName(model_name)

    dependencies = get_dependencies()
    spark_context = SparkContext(conf=spark_config, pyFiles=dependencies)
    hive_context = HiveContext(spark_context)

    is_photo_related = F.udf(
        lambda s: True if ('camera' in s) or ('video' in s) else False,
        types.BooleanType())

    get_event_score = F.udf(score_single_event, types.FloatType())

    # received notification at least as many as viewed
    fix_received = F.udf(lambda received, view: max(received, view),
                         types.FloatType())

    # TODO: switch to l1 home_events_uuid
    events = hive_context.table('l2_sprint.mixpanel_home')

    # choose photography related content interactions from notifications
    # (devicefeatures_attribute exists only for notification items, not future cards)
    # relevant content started approximately '2017-10-31'
    content_items = events \
        .filter(events['event'].isin(
            [x.lower() for x in ['ContentItem_Received', 'ContentItem_View', 'ContentItem_Click',
                                 'ContentItem_TimeOnPage', 'ContentItem_PageScroll']])
        ) \
        .filter(events['Time'.lower()] > '2017-10-31') \
        .filter(events['CarrierName'.lower()].isin('sprint', 'verizon')) \
        .filter(F.get_json_object(
            events['properties'], "$['properties']['IsTest']".lower()) == 'false') \
        .filter(F.get_json_object(
            events['properties'], "$['properties']['DeviceId']".lower()).isNotNull()) \
        .filter(F.get_json_object(
            events['properties'], "$['properties']['MessageType']".lower()).isNotNull()) \
        .filter(F.get_json_object(
            events['properties'], "$['properties']['devicefeatures_attribute']").isNotNull()) \
        .filter(is_photo_related(F.get_json_object(
            events['properties'], "$['properties']['devicefeatures_attribute']")))

    # assign a score to each interaction
    content_items = content_items \
        .withColumn(
            'score',
            get_event_score(
                events['event'], F.get_json_object(events['properties'], "$['properties']"))
        )

    # aggregate score per user, item, event, and action (the action differentiates clicks).
    # use max for the score because some events (e.g. page scroll) send intermediate states.
    # use max on the other properties in case one of the events has a null or empty string
    content_items = content_items \
        .groupBy(
            F.get_json_object(
                events['properties'], "$['properties']['DeviceId']".lower()).alias('device_id'),
            events['event'],
            F.get_json_object(
                events['properties'], "$['properties']['MessageType']".lower()).alias('topic'),
            F.get_json_object(
                events['properties'], "$['properties']['ActionId']".lower()).alias('action')
        ) \
        .agg(
            F.max(F.get_json_object(
                events['properties'], "$['properties']['AtlasUniqueUserId']".lower())).alias('user_id'),
            F.max('CarrierName'.lower()).alias('carrier_name'),
            F.max('DeviceModel'.lower()).alias('device_model'),
            F.max('DeviceModelName'.lower()).alias('device_model_name'),
            F.max('DeviceOsType'.lower()).alias('device_os_type'),
            F.max('DeviceVendor'.lower()).alias('device_vendor'),
            F.max('score').alias('score')
        )

    # FIXME fix view according action events
    received_content_items = content_items \
        .groupBy('device_id') \
        .pivot('event', ['ContentItem_Received'.lower(), 'ContentItem_View'.lower()]).sum('score') \
        .fillna(0.0) \
        .select(
            'device_id',
            fix_received(F.col('contentitem_received'), F.col('contentitem_view')).alias('receive'))

    # calculate final score for user.
    content_items = content_items \
        .filter(events['event'] != 'ContentItem_Received'.lower()) \
        .groupBy('device_id') \
        .agg(
            F.max('user_id').alias('user_id'),
            F.max('carrier_name').alias('carrier_name'),
            F.max('device_model').alias('device_model'),
            F.max('device_model_name').alias('device_model_name'),
            F.max('device_os_type').alias('device_os_type'),
            F.max('device_vendor').alias('device_vendor'),
            F.sum('score').alias('total_score')
        ) \
        .join(received_content_items, 'device_id', 'left') \
        .withColumn('score', F.round(F.col('total_score') / F.col('receive'))) \
        .drop('total_score', 'receive') \
        .withColumn('photography_interest', F.lit(None))

    # choose users who completed user interest questionnaire
    interests = events \
        .filter(events['event'] == 'Timeline_OnboardingMessage_Click'.lower()) \
        .filter(events['CarrierName'.lower()].isin('sprint', 'verizon')) \
        .filter(F.get_json_object(
            events['properties'], "$['properties']['IsTest']".lower()) == 'false') \
        .filter(F.get_json_object(
            events['properties'], "$['properties']['DeviceId']".lower()).isNotNull()) \
        .filter(F.get_json_object(
            events['properties'], "$['properties']['ActionId']".lower()) == 'done')

    # assign score for photography interest
    interests = interests \
        .withColumn(
            'score',
            get_event_score(
                events['event'], F.get_json_object(events['properties'], "$['properties']"))
        )

    # subset relevant properties and drop duplicated devices
    # (assuming each user answers the questionnaire only once)
    interests = interests \
        .select(
            F.get_json_object(
                events['properties'], "$['properties']['DeviceId']".lower()).alias('device_id'),
            F.get_json_object(
                events['properties'], "$['properties']['AtlasUniqueUserId']".lower()).alias('user_id'),
            events['CarrierName'.lower()].alias('carrier_name'),
            events['DeviceModel'.lower()].alias('device_model'),
            events['DeviceModelName'.lower()].alias('device_model_name'),
            events['DeviceOsType'.lower()].alias('device_os_type'),
            events['DeviceVendor'.lower()].alias('device_vendor'),
            'score'
        ) \
        .drop_duplicates(['device_id']) \
        .withColumn('photography_interest', F.when(F.col('score') > 0, 1.0).otherwise(0.0))

    # aggregate content and interest scores
    # use max on properties in case it's null or empty string in one of the events
    photography_user = content_items.union(interests) \
        .groupBy('device_id') \
        .agg(
            F.max('user_id').alias('user_id'),
            F.max('carrier_name').alias('carrier_name'),
            F.max('device_model').alias('device_model'),
            F.max('device_model_name').alias('device_model_name'),
            F.max('device_os_type').alias('device_os_type'),
            F.max('device_vendor').alias('device_vendor'),
            F.sum('score').alias('score'),
            F.max('photography_interest').alias('photography_interest')
        )

    dgx = hive_context.table('l2_asurion.demographics_dbo_source_dgx')
    mobileid = hive_context.table('l3_sprint.mobileid')

    # FIXME: decrypt ethnicityrollup, dob, ethnicity
    photography_user_augmented = photography_user \
        .join(mobileid.select('mobileuid', 'subid'),
              photography_user['user_id'] == mobileid['mobileuid'],
              'left') \
        .join(dgx.select('source_dfx_id', 'nameprefix', 'state', 'age_range', 'income_range_vds',
                         'gender', 'marital_status', 'dwelling_type', 'home_ownership',
                         'length_of_residence', 'presence_of_children',
                         'mail_public_responder_indicator', 'mail_responsive_buyer_indicator',
                         'home_value_range', 'networthindicator_rollup', 'wealth_decile',
                         'homeandlandvalue', 'first_mortgage_amount', 'level_of_education',
                         'head_of_household', 'professionalrollup', 'premover',
                         'active_fitness_interest', 'golf_interest', 'traveler', 'green_advocate'),
              mobileid['subid'] == dgx['source_dfx_id'],
              'left')

    apps = hive_context.read.parquet(APPS_PATH)

    photography_user_augmented = photography_user_augmented \
        .join(apps, photography_user_augmented['device_id'] == apps['deviceId'], 'left')

    photography_user_augmented.write.csv('s3://' + output_path,
                                         mode='overwrite',
                                         compression='gzip',
                                         header=True)
Example #5

# Some hospitals have too few non-NA measures.  To have a fair ranking, we want to set a minimum bar
# on the number of non-NA measures a hospital must have to participate in our evaluation.

# For each hospital, find out the number of non-NA measures it has
nonNAMeasureCount = dict(df_total_quality.map(lambda r: (r.providerid, r.normalizedscore)).
                         combineByKey( # Use combineByKey to count the # of non-NA Measure
                            lambda value: 0 if value is None else 1,
                            lambda x, value: x if value is None else x + 1,
                            lambda x, y: x + y).collect())

# Find the 25th percentile of the non-NA measure counts; this will be the minimum bar on the number of non-NA measures.
minMeasureCount = np.percentile(nonNAMeasureCount.values(), 25.)

df_hospitals = sqlContext.table("hospitals")
# For the purpose of evaluation, we keep only those hospitals which meet the bar
hospitals_qualified = df_hospitals.map(lambda r: (r.providerid, r.hospitalname, r.state,
                        bool(nonNAMeasureCount[r.providerid] >= minMeasureCount
                             if r.providerid in nonNAMeasureCount else False)))

schema = StructType([
    StructField("providerid", StringType(), True),
    StructField("hospitalname", StringType(), True),
    StructField("state", StringType(), True),
    StructField("qualified", BooleanType(), True)])

df_hospitals_qualified = sqlContext.createDataFrame(hospitals_qualified, schema)
saveAsHiveTable(df_hospitals_qualified, "hospitals_qualified")
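
# The saveAsHiveTable helper used above is not shown in this snippet. A hedged sketch of one
# way to write it, mirroring the CREATE TABLE ... AS SELECT pattern used elsewhere in these
# examples; it assumes the same global sqlContext.
def saveAsHiveTable(df, table_name):
    # register the dataframe under a temporary name and materialize it as a Hive table
    df.registerTempTable("tmp_" + table_name)
    sqlContext.sql("drop table if exists " + table_name)
    sqlContext.sql("CREATE TABLE " + table_name + " AS SELECT * FROM tmp_" + table_name)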

Example #6
if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("Error usage: CreateHive [master] [inputFile] [inputTable]")
        sys.exit(-1)
    master = sys.argv[1]
    inputFile = sys.argv[2]
    inputTable = sys.argv[3]
    sc = SparkContext(master, "CreateHive")
    hiveContext = HiveContext(sc)

    # create hive table
    hiveContext.sql(
        "CREATE TABLE IF NOT EXISTS default." +
        inputTable +
        " (a int, b string, c string)")

    # load data into the hive table
    hiveContext.sql(
        "LOAD DATA LOCAL INPATH '" + inputFile + "' INTO TABLE " + inputTable)

    # read data from the hive table
    sampleData = hiveContext.table("default." + inputTable)
    sampleData.show()

    # run sql against the hive table
    sampleData.registerTempTable("test_temp")
    hiveContext.sql("select * from test_temp").show()
Example #7
                                                        errors='coerce')
supervised_pd[keyvars] = supervised_pd[keyvars].astype(str)

#remove alerts generated after case
supervised_pd = supervised_pd.loc[~(
    supervised_pd.ALERT_CREATE_DATE > supervised_pd.CASE_CREATE_DATE)]

#use alert data to filter txn data to reduce computation/memory
supervised_pd_filter = supervised_pd[['FINAL_ACCOUNT_KEY',
                                      'ALERT_MONTH_SK']].drop_duplicates()
supervised_pd_filter.columns = ['account_key', 'alert_month_sk']
supervised_filter = hive_context.createDataFrame(supervised_pd_filter)

#generate monthly txn summary
sam_txn = hive_context.table('udm_cds_transactions1023').where(
    "month_sk >=216 and month_sk <= 226").withColumn(
        'abs_value', F.abs(col('acct_curr_amount')))
sam_acct = hive_context.table('udm_cds_account0822').where(
    "is_error_account is null").dropDuplicates()
sam_txn_acctsmry = sam_txn.where("acct_curr_amount<>0").groupBy(["account_sk","month_sk"])\
                    .agg(F.sum('abs_value').alias('total_value'),CD('transaction_key').alias('total_volume')).alias('t')\
                    .join(sam_acct.alias('a'),col('t.account_sk')==col('a.entity_sk'),'left').selectExpr('a.account_key',"t.*").alias('t2')\
                    .join(supervised_filter.alias('s'),[col('t2.account_key')==col('s.account_key'),col("t2.month_sk") +1 == col("s.alert_month_sk")], "inner")\
                    .selectExpr("t2.*","s.alert_month_sk").distinct()
sam_txn_acctsmry_pd = sam_txn_acctsmry.toPandas()

#merge alert data and txn summary
supervised_pd_common = supervised_pd.rename(index=str,columns={"ACCOUNT_KEY": "FORT_ACCOUNT_KEY","FINAL_ACCOUNT_KEY":"account_key","ALERT_MONTH_SK":"alert_month_sk"})\
                                    .merge(sam_txn_acctsmry_pd, on = ['account_key','alert_month_sk'], how='inner')
supervised_pd_common.columns = [
    colu.lower() for colu in supervised_pd_common.columns
]
Example #8
from pyspark import SparkContext
from pyspark.sql import HiveContext, DataFrame, Column, Window, DataFrameWriter
from pyspark.sql.functions import rank, col
from datetime import timedelta, datetime
import pytz
import os

sc = SparkContext()
hc = HiveContext(sc)
# table 1
table_lot_history = hc.table("prod_mti_ww_be_idl.tte_2did_lot_history_view")
table_lot_history.registerTempTable("table_lot_history")
# table 2
table_machine_attr = hc.table("prod_mti_ww_be_idl.tte_2did_machine_attr_view")
table_machine_attr.registerTempTable("table_machine_attr")
# table 3
table_lot_relation = hc.table("prod_mti_ww_be_idl.tte_2did_lot_relation_view")
table_lot_relation.registerTempTable("table_lot_relation")
# table 4
table_comp_history = hc.table("prod_mti_ww_be_idl.tte_2did_comp_history_view")
table_comp_history.registerTempTable("table_comp_history")

ctz = pytz.timezone('Singapore')
path_root = '/eng/mti/ww/be/msb/assembly_quality/twodid'

for n in range(1, 2):
    date = (datetime.now(tz=ctz) - timedelta(days=n)).strftime("%Y-%m-%d")
    for t in (1, 2):
        if t == 1:
            time_boundary_1 = date + ' ' + '00:00:00.000'
            time_boundary_2 = date + ' ' + '11:59:59.999'
        "IsSystemApp": False,
        "Name": "Uber",
        "VersionCode": 13
    }]
}

if __name__ == "__main__":
    (from_date, to_date, target_path, external_lib) = get_parameters()

    sc = SparkContext(appName="telemetries", pyFiles=external_lib)
    sql_context = HiveContext(sc)

    from telemetries.create_user_apps import load_sent_events
    from flaten_mixpanel_home_events.pyspark_schema_utils import rdd_to_df

    tele_sprint = sql_context.table("l1_sprint.telemetry_events")
    events_sprint = sql_context.table("l2_sprint.mixpanel_home")
    apps_data_rdd = tele_sprint.filter((tele_sprint.event_date >= '2018-05-01') & (tele_sprint.event_date <= '2018-05-01')) \
        .filter((tele_sprint.event_name == 'apps') | (tele_sprint.event_name == 'systemapps')) \
        .rdd.map(lambda x: (x.event_name , x.os, x.uuid, x.event_date, json.loads(x.json_data), x.agentversion)) \
        .filter(lambda x: x[4].get('IsTest', True) == False) \
        .map(lambda x: Row(carrierName=x[4]['CarrierName'],
                           deviceId=x[4]['DeviceId'],
                           deviceModel=x[4].get('DeviceModel',None),
                           deviceVendor=x[4].get('DeviceVendor',None),
                           agentTimestamp=x[4].get('AgentTimestamp',None),
                           agentVersion=x[5],
                           telemtryType=x[0],
                           os=x[1],
                           uuid=x[2],
                           date=x[3],
Example #10
# Created by Raju Kumar Mishra 
# Book PySpark Recipes
# Chapter 8
# Recipe 8-8. Reading data from Apache Hive.
# Run the following PySpark code, line by line, in the PySpark shell

#Step 8-8-1. Creating HiveContext object. 

from pyspark.sql import HiveContext
ourHiveContext = HiveContext(sc)

#Step 8-8-2. Reading table data from Hive. 

FilamentDataFrame = ourHiveContext.table('apress.filamenttable')
FilamentDataFrame.show(5)

###############################################################################################################
#
#   Spark - Execute Job Against Hive Table
#
###############################################################################################################


from pyspark.sql import HiveContext

hive_context = HiveContext(sc)

hive_context.sql('show tables').show(25,False)

sample = hive_context.table("mm_teams_sql")
sample.show(10,False)



###############################################################################################################
#
#   Spark - Execute Job Against Phoenix Table
#
###############################################################################################################

# NOTE: Just an example - Not used for IAA Module

# Launch pyspark with the Phoenix client jar on the classpath, e.g.:
#   /usr/hdp/2.5.0.0-1245/spark2/bin/pyspark --jars /usr/hdp/2.5.0.0-1245/phoenix/lib/phoenix-spark-4.7.0.2.5.0.0-1245.jar
#   /usr/hdp/2.5.0.0-1245/spark2/bin/pyspark --jars /usr/hdp/2.5.0.0-1245/phoenix/phoenix-4.7.0.2.5.0.0-1245-client.jar
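
# A hedged sketch of reading a Phoenix table through the phoenix-spark connector once one of
# the jars above is on the classpath; "EXAMPLE_TABLE" and the ZooKeeper URL are placeholders,
# not values used by the IAA module.
phoenix_df = hive_context.read \
    .format("org.apache.phoenix.spark") \
    .option("table", "EXAMPLE_TABLE") \
    .option("zkUrl", "localhost:2181") \
    .load()

phoenix_df.show(10, False)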
Example #12
#
# survey_resp.py
# Extract useful data from survey_responses table and store in a parquet file
#
from pyspark import SparkContext
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import HiveContext

sc = SparkContext("local", "Exercise1")
hiveCX = HiveContext(sc)

# bring the table into a data frame
dfSurveyResp = hiveCX.table("survey_responses")

# select out the relevant info into a new dataframe
dfSurveyRespNew = dfSurveyResp.select(
    col("provider_number").alias("providerID"),
    col("hcahps_base_score").cast(DecimalType()).alias("baseScore"),
    col("hcahps_consistency_score").cast(
        DecimalType()).alias("consistencyScore")).where(
            col("baseScore").isNotNull() & col("consistencyScore").isNotNull())

# write the dataframe out as a parquet file
dfSurveyRespNew.write.parquet("/user/w205/hospital_compare/surveyRespParquet")
Example #13
    print(correlation)

    most_common_weaknesses = sql_context.sql("select weakness, count(*) as times "
                                            "from ("
                                            "select explode(weaknesses) as weakness "
                                            "from pokemons "
                                            ") weaknesses "
                                            "group by weakness "
                                            "order by times desc "
                                            "limit 3"
                                             )

    normal_pokemons_not_in_eggs = sql_context.sql("select * from "
                                                 "pokemons "
                                                 "where array_contains(type, 'Normal') and egg='Not in Eggs'")

    print "3 most common weaknesses: {}".format(most_common_weaknesses.collect())
    print "Normal pokemons not in eggs: {}".format(normal_pokemons_not_in_eggs.collect())

    hive_context = HiveContext(spark_context)
    #hive_context.setConf("hive.warehouse.dir", "/Users/adrian/Documents/hive-warehouse")

    normal_pokemons_not_in_eggs.write.mode("overwrite").saveAsTable("default.normal_pokemons")

    # Show the content of the Hive table we've just created
    pokemons_normal_table = hive_context.table("default.normal_pokemons")
    pokemons_normal_table.show()

    hive_context.read.table("default.normal_pokemons")
    print "Normal Pokemons from Hive table: {}".format(sql_context.sql("select * from normal_pokemons").collect())
Example #14
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SparkSession
from pyspark.sql import HiveContext
from pandas import *


sc = SparkContext("local[*]", "RentalData")


ssc = StreamingContext(sc, 5)

ss = SparkSession.builder \
	.appName(sc.appName) \
	.config("spark.sql.warehouse.dir",
		"/user/hive/warehouse") \
	.config("hive.metastore.uris",
		"thrift://localhost:9083") \
	.enableHiveSupport() \
	.getOrCreate()

hive_context = HiveContext(sc)
df = hive_context.table("default.rentals")

df.toPandas().to_csv('mycsv.csv')

df.show()

ssc.start()
ssc.awaitTermination()
def tok_str(text, ngrams=1, minChars=2):
    text = re.sub(r'\s+', ' ', text) 		     # change any whitespace to regular space
    tokens = map(unicode, text.lower().split(' '))     # split into tokens and change to lower case
    tokens = filter(lambda x: len(x)>=minChars and x[0]!='@', tokens)     
                                                       # remove short words and usernames
    tokens = ["URL" if t[:4]=="http" else t for t in tokens]      
     # replace any url with the constant word "URL"
    tokens = [punct.sub('', t) for t in tokens]        # remove punctuation from tokens
    if ngrams==1:
        return tokens
    else:
        return tokens + [' '.join(tokens[i:i+ngrams]) for i in xrange(len(tokens)-ngrams+1)]
tokenize = F.udf(lambda s: tok_str(unicode(s),ngrams=2), ArrayType(StringType()))

# Load sentiment dictionary
wv = hc.table('sentiment_words').collect()
wordlist = dict([(r.word,r.score) for r in wv])

# get positive sentiment scores from words RDD using word-list
def pscore(words):
    scores = filter(lambda x: x>0, [wordlist[t] for t in words if t in wordlist])
    return 0.0 if len(scores)==0 else (float(sum(scores))/len(scores))
pos_score = F.udf(lambda w: pscore(w), FloatType())

# get negative sentiment scores from words RDD using word-list
def nscore(words):
    scores = filter(lambda x: x<0, [wordlist[t] for t in words if t in wordlist])
    return 0.0 if len(scores)==0 else (float(sum(scores))/len(scores))
neg_score = F.udf(lambda w: nscore(w), FloatType()) 
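
# A hedged usage sketch (not part of the original snippet): it assumes a Hive table named
# 'tweets' with a string column 'text'; hc, F and the UDFs defined above are reused as-is.
tweets = hc.table('tweets') \
    .withColumn('words', tokenize(F.col('text'))) \
    .withColumn('pos', pos_score(F.col('words'))) \
    .withColumn('neg', neg_score(F.col('words')))
tweets.select('text', 'pos', 'neg').show(5)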

# Create feature matrix for the model
    if to_date is None:
        to_date = get_date_n_days_ago(1)
    if to_date == from_date:
        to_date = (date_parser.parse(to_date) +
                   timedelta(days=1)).strftime('%Y-%m-%d')

    print('from_date = {} | to_date = {}'.format(from_date, to_date))

    from flaten_mixpanel_home_events.data_provider import load_sent_events, load_received_events, load_dismiss_events, \
        load_view_events, \
        load_time_on_page_events, load_page_scroll_events, load_feedbackmodule_impression_events, \
        load_feedbackmodule_click_events, load_click_events, join_all_events
    from flaten_mixpanel_home_events.events_schema import sent_record_example, flatten_data_record_example
    from flaten_mixpanel_home_events.pyspark_schema_utils import rdd_to_df

    events = sqlContext.table("l2_sprint.mixpanel_home")
    sent_events = load_sent_events(events, from_date, to_date, sent_record_example, sqlContext) \
        .drop_duplicates(subset=['message_id'])
    received_events = load_received_events(events, from_date, to_date) \
        .drop_duplicates(subset=['received_message_id'])
    dismiss_events = load_dismiss_events(events, from_date, to_date) \
        .drop_duplicates(subset=['dismiss_message_id'])
    view_events = load_view_events(events, from_date, to_date) \
        .drop_duplicates(subset=['view_message_id'])
    time_on_page_events = load_time_on_page_events(events, from_date, to_date) \
        .drop_duplicates(subset=['time_on_page_message_id'])
    page_scroll_events = load_page_scroll_events(events, from_date, to_date) \
        .drop_duplicates(subset=['page_scroll_message_id'])
    feedbackmodule_impression_events = load_feedbackmodule_impression_events(events, from_date, to_date) \
        .drop_duplicates(subset=['feedback_module_impression_message_id'])
    feedbackmodule_click_events = load_feedbackmodule_click_events(events, from_date, to_date) \
Example #17
sc = SparkContext("local", "best_hospitals")

from pyspark.sql import HiveContext
sqlContext = HiveContext(sc)

# Select the top 10 hospitals by average normalized score (avgscore)
# Please note that we filter out those hospitals not qualified for evaluation
df_top10_hospitals = sqlContext.sql("select Q.providerid as id, AVG(Q.normalizedscore) as avgscore \
from total_quality Q join hospitals_qualified H on Q.providerid = H.providerid \
where Q.normalizedscore is not null and H.qualified = true \
group by Q.providerid \
order by avgscore DESC").limit(10)

# Join with hospitals_qualified to get the hospital name and state
# Note: couldn't figure out how to do it in the above select statement (together with Group By) in one-shot! :-(
df_hospitals = sqlContext.table("hospitals_qualified")
df_top10_hospitals_full = df_top10_hospitals.join(df_hospitals, df_top10_hospitals.id == df_hospitals.providerid).\
    select(df_hospitals.providerid, df_hospitals.hospitalname, df_hospitals.state, df_top10_hospitals.avgscore)

df_top10_hospitals_full = df_top10_hospitals_full.orderBy(df_top10_hospitals_full.avgscore.desc())

# Save it as a table
df_top10_hospitals_full.registerTempTable("df")
sqlContext.sql("drop table if exists top_10_hospitals")
sqlContext.sql("CREATE TABLE top_10_hospitals AS SELECT * FROM df")

print("")
print("Top 10 hospitals")
print("")
rank = 1
for i in df_top10_hospitals_full.collect():
# importing required packages
from pyspark.sql import HiveContext
from pyspark.sql.types import *
from pyspark import SparkContext
from pyspark.sql import Row
#import numpy as np

# setting up spark context and hive context
sc = SparkContext("local", "Simple App")
sqlCtx = HiveContext(sc)

# creating a spark data frame using the hive table effective_care
df_raw = sqlCtx.table("effective_care")
print('Number of rows in the table {0}'.format(df_raw.count()))

# removing all row not containing numbers for score variable

# function to test if a string can be parsed in integer or not
def CheckValidScore(s):
    try:
        int(s)
        return True
    except ValueError:
        return False

# creating a RDD by filtering out invalid scores
df_clean_rdd = df_raw.rdd.filter(lambda row: CheckValidScore(row.score))
# creating a dataframe from the RDD
df_clean = sqlCtx.createDataFrame(df_clean_rdd)
print('Number of rows in table after cleaning {0}'.format(df_clean.count()))
Example #19
# Test the code on CLOUDERA:
# Original text

from pyspark.sql import HiveContext
from pyspark.sql.functions import *
import ConfigParser
from pyspark import SparkContext, SparkConf

conf = SparkConf().setAppName("Prueba_1")
sc = SparkContext.getOrCreate(conf=conf)
sqlContext = HiveContext(sc)

df1 = sqlContext.sql("SELECT * FROM dwhprod.cen_identificacion LIMIT 100")
df1.show()

df2 = sqlContext.table("dwhprod.cen_identificacion")
df2.show()

# Saving tables
df1.write.mode("overwrite").saveAsTable("dwhprod.base_nueva")

sc.stop()

# JOINS IN IMPALA

# --SELECT * FROM dwhprod.cenpersona LIMIT 100

#--SELECT * FROM dwhprod.cenactividad LIMIT 100

#--fiactividad
Example #20
             "/user/hive/warehouse") \
 .config("hive.metastore.uris",
                      "thrift://localhost:9083") \
 .enableHiveSupport() \
 .getOrCreate()

kafkastream = KafkaUtils.createStream(ssc, "localhost:2181", "SW", {"SW": 1})

parsed = kafkastream.map(lambda x: json.loads(x[1]))

content = parsed.map(lambda x: x.get("content")) \
 .flatMap(lambda x: x.get("properties")) \
 .map(lambda x: (x.get("id"), x.get("abbreviation"),
                     x.get("city"), x.get("conference"), x.get("division"),
                     x.get("full_name"), x.get("name")))

content.foreachRDD(Process)

## Create a new DF based on NBA teams in the Southwest Division

hive_context = HiveContext(sc)
df = hive_context.table("default.SW")

new = df.select("id", "conference" "division") \
 .withColumn("city", "name") \
 .drop(col("full_name")) \
 .show()

ssc.start()
ssc.awaitTermination()
Example #21
from pyspark.sql.types import *
from pyspark.sql import SQLContext
from pyspark.sql.functions import col
from pyspark.sql import HiveContext
import datetime

#create spark context
#conf = SparkConf().set("spark.sql.warehouse.dir","/apps/hive/warehouse/")
#sc=SparkContext(conf=conf)
sqlContext = SQLContext(sc)

#creating hive context to fetch data from hive tables
hive_context = HiveContext(sc)

#loading the data from hive tables into spark dataframes
df_retail_sales = hive_context.table("retail_project.retail_sales")
df_retail_stores = hive_context.table("retail_project.retail_stores")
df_retail_features = hive_context.table("retail_project.retail_features")

#converting data frame to temporary tables
df_retail_sales.createOrReplaceTempView("spark_tab_retail_sales")
df_retail_stores.createOrReplaceTempView("spark_tab_retail_stores")
df_retail_features.createOrReplaceTempView("spark_tab_retail_features")

#Query-01
#the department-wide sales for each store

query_01 = open(
    "/home/reallegendscorp9155/pyspark_project_retail/query_files/query_01.txt"
)
query01 = query_01.read()
Example #22
import pyspark
from pyspark.sql import HiveContext

if __name__ == '__main__':
    config = pyspark.SparkConf().setAppName("Basico")
    sc = pyspark.SparkContext(conf=config)

    hive_context = HiveContext(sc)
    log = hive_context.table("CURSOBIGDATA.apachelog")
    log.show()
Example #23
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.functions import col
from pyspark.sql.functions import array

conf = SparkConf()
sc = SparkContext(conf=conf)
hive_context = HiveContext(sc)

combined = hive_context.table("flight.flight_data_denorm")

carrier_desc = "carrier_desc"
origin_desc = "origin_desc"
dest_desc = "dest_desc"

#print combined.select(col("origin_dest_names").getItem(0)).head(3)

#combined.groupBy("ORIGIN_DESC").avg("DEP_DELAY").withColumnRenamed("avg(DEP_DELAY)", "avg_dep_delay").sort(col("avg_dep_delay").desc()).head(5)

combined = combined.groupBy(
    col("origin_dest_names").getItem(0)).avg("dep_delay").withColumnRenamed(
        "avg(dep_delay)",
        "avg_dep_delay").sort(col("avg_dep_delay").desc()).limit(5)

combined.write.format("csv").save("file:///root/flight_data/query1.csv")
Example #24
need_state = sqlContext.sql("""
SELECT A.*,B.NS_HARM FROM 
(SELECT A.*,CONCAT('_',GROUP_CODE,'_',DUNN_CAT_ENGLISH_1) AS NEW_CAT 
FROM x5_ru_analysis.ak_prod_ovr_1 A) A INNER JOIN 
X5_RU_ANALYSIS.AK_NS_MAP_OVR_SPLIT B ON A.NEW_CAT = B.KARU_CATEGORY_FINAL 
""")
need_state = need_state.withColumn("need_state", F.upper(compress("NS_HARM")))
need_state = need_state.select("product_code", "need_state")
# need_state = need_state.withColumn("format_code",F.upper(compress("format_code")))
'''
rule_mapping = sqlContext.createDataFrame(pd.read_excel("X5_lookup_micro_business_v3.xlsx"))
rule_mapping.write.saveAsTable("ak_micro_ns_x5_lookup_business_3",
                                        mode='overwrite')
'''
rule_mapping = sqlContext.table("ak_micro_ns_x5_lookup_business_3")

#rule_mapping  = sqlContext.table("ak_micro_ns_x5_lookup ")
for i in [
        "need_state", "Micro_qualifier_2", "Micro_qualifier_3_5",
        "Micro_qualifier_6_7"
]:
    rule_mapping = rule_mapping.withColumn(i, F.upper(compress(i)))

df_final = sqlContext.sql(""" 
SELECT DISTINCT BASKET_ID AS TRANSACTION_CODE,
       NS_HARM AS NEED_STATE,
       ITEM_SPEND AS NET_SPEND_AMT,
       CATEGORY_DESC_ENG AS PROD_HIER_L30_CODE,
       A.PRODUCT_CODE AS CATEGORY_NAME
FROM AK_TRANS_52weeks A
def sql_hive_context_example(spark):
    
    # create hive context object.
    hive_ctx = HiveContext(spark.sparkContext)

    # createDataFrame
    l = [('Alice', 18), ('Bob', 20), ('Charley', 22)]
    df = hive_ctx.createDataFrame(l, ('name', 'age'))
    print("createDataFrame API finished")

    # registerDataFrameAsTable 
    hive_ctx.registerDataFrameAsTable(df, "table1")
    print("registerDataFrameAsTable API finished")

    # sql
    tmp_df = hive_ctx.sql("select * from table1")
    tmp_df.show()
    print("sql API finished")

    # table
    tmp_df = hive_ctx.table("table1")
    tmp_df.show()
    print("table API finished")

    # tableNames
    table_names = hive_ctx.tableNames()
    print(table_names)
    print("tableNames API finished")

    # tables
    tables = hive_ctx.tables()
    print(tables)
    print("tables API finished")

    # range
    tmp_df = hive_ctx.range(1,10,2)
    tmp_df.show()
    print("range API finished")

    # dropTempTable
    hive_ctx.dropTempTable("table1")
    table_names = hive_ctx.tableNames()
    print(table_names)
    print("dropTempTable API finished")

    # cacheTable & uncacheTable & clearCache
    df = hive_ctx.range(1,10,2)
    hive_ctx.registerDataFrameAsTable(df, "table")
    hive_ctx.cacheTable("table")
    hive_ctx.uncacheTable("table")
    hive_ctx.clearCache()
    print("cacheTable & uncacheTable & clearCache API finished")

    # createExternalTable

    # newSession

    # registerFunction
    # Deprecated in 2.3.0. Use :func:`spark.udf.register` instead

    # registerJavaFunction
    # Deprecated in 2.3.0. Use :func:`spark.udf.registerJavaFunction` instead

    # setConf & getConf
    hive_ctx.setConf("key1", "value1")
    value = hive_ctx.getConf("key1")
    print(value)
    print("setConf & getConf API finished")

    # refreshTable
    # Exception: An error occurred while calling o26.refreshTable:
    # Method refreshTable([class java.lang.String]) does not exist
    
    print("Finish running HiveContext API")
    logException("Error in reading YAML file in TableDataAnalysisForDuplicacy.py"+" "+str(e))

# Create a 2D array for the features from the input dataframe, which has to be tested
input_features_arr = inputtablevalues()
# Create the array for the input columns from the to be tested file
columnarray=[]
columnarray.append(filecolumnstomatch)

try:
    # Read each table from the file which has been created after column matching
    readfile = open(filepath+"colmatchingtables.txt")
    while True:
        line = readfile.readline()
        tableName = line.replace(filenameContent, '')
        # Create the sql dataframe from Hive table
        data = sqlContext.table(tableName)
        # Select the columns which are not string type; remove them and create a numerical only dataframe
        columnList = [item[0] for item in data.dtypes if item[1].startswith('string') == False]
        data_number_only = data.selectExpr(columnList).cache()
        # Fill the NA values with 0 for all columns
        data_number_only = data_number_only.fillna(0)
        # Get the approx row count
        cntInterval = data_number_only.rdd.countApprox(timeout=300000, confidence=.1)
        # Create a sampled dataframe when the row count is large (check the larger threshold first,
        # otherwise the 10-million branch can never be reached)
        if cntInterval > 10000000:
            sampledData = data_number_only.sample(False, .1, None)
        elif cntInterval > 1000000:
            # consider 20% data of equal probability weight (None) with no replacement (False)
            sampledData = data_number_only.sample(False, .2, None)
        else:
            sampledData = data_number_only
Example #27
from __future__ import print_function
!echo $PYTHON_PATH
import os, sys
#import path
from pyspark.sql import *

# create spark sql session
myspark = SparkSession\
    .builder\
    .config("spark.executor.instances", 3 ) \
    .config("spark.executor.memory", "3g") \
    .config("spark.executor.cores", 2) \
    .config("spark.scheduler.listenerbus.eventqueue.size", 10000) \
    .config("spark.sql.parquet.compression.codec", "snappy") \
    .appName("Sample_07_kmeans") \
    .getOrCreate()



sc = myspark.sparkContext
print(myspark)
myspark.sql("SET spark.sql.parquet.binaryAsString=true")
from pyspark.sql import HiveContext
hive_context = HiveContext(sc)
myview = hive_context.table("default.sample_07p")

myview.show(5)
Example #28
from pyspark import SparkContext
from pyspark.sql import HiveContext
import time

if __name__ == "__main__":
    sc = SparkContext(appName="Link Filtering Hive v1")
    hc = HiveContext(sparkContext=sc)
    tbl = hc.table("mi2mi.edges")
    tbl.registerTempTable("edges")
    
    # ties = hc.sql("select MilanoDate, sid1, sid2, 1 - (select count(distinct sid1) - 2 from edges where MilanoDate=e.MilanoDate)*(EdgeCost/(select sum(EdgeCost) node_strength from edges where sid1 != sid2 and MilanoDate=e.MilanoDate and sid1 = e.sid1 group by sid1) + pow(1 - EdgeCost/(select sum(EdgeCost) node_strength from edges where sid1 != sid2 and MilanoDate=e.MilanoDate and sid1 = e.sid1 group by sid1), (select count(distinct sid1) - 1 from edges where MilanoDate=e.MilanoDate))) /((select count(distinct sid1) - 1 from edges where MilanoDate=e.MilanoDate) * (EdgeCost/(select sum(EdgeCost) node_strength from edges where sid1 != sid2 and MilanoDate=e.MilanoDate and sid1 = e.sid1 group by sid1) - 1)) alpha from edges e")
    ties = hc.sql("select MilanoDate, sid1, sid2, pow(1 - EdgeCost/(select sum(EdgeCost) node_strength from edges where sid1 != sid2 and MilanoDate=e.MilanoDate and sid1 = e.sid1 group by sid1), (select count(distinct sid1) - 1 from edges where MilanoDate=e.MilanoDate)) alpha from edges e where sid1 != sid2 order by MilanoDate, sid1, sid2")
    ties.write.format("orc").saveAsTable("mi2mi.LinkFiltering")
    alfa_value = [0.01, 0.05, 0.001]
    ties.filter(ties.alpha < 0.05).show()
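
    # A hedged sketch (not in the original snippet): the alfa_value thresholds above are otherwise
    # unused, so this loop reports how many links survive each significance level.
    for thr in alfa_value:
        kept = ties.filter(ties.alpha < thr).count()
        print("alpha < {0}: {1} links kept".format(thr, kept))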
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier

# Initialize Spark
SparkContext.setSystemProperty('spark.executor.memory', '4g')
conf = SparkConf()
conf.set('spark.executor.instances', 20)
sc = SparkContext('yarn-client', 'kdd99', conf=conf)
hc = HiveContext(sc)

kdd = hc.table("kdd99")

(trainData, testData) = kdd.randomSplit([0.7, 0.3], seed=42)
trainData.cache()
services = trainData.withColumnRenamed('service',
                                       'srvc').select('srvc').distinct()
testData = testData.join(services, testData.service == services.srvc)
# filter out any rows with a service not trained upon
testData.cache()

print "training set has " + str(trainData.count()) + " instances"
print "test set has " + str(testData.count()) + " instances"

# Build model
inx1 = StringIndexer(inputCol="protocol", outputCol="protocol-cat")
inx2 = StringIndexer(inputCol="service", outputCol="service-cat")
inx3 = StringIndexer(inputCol="flag", outputCol="flag-cat")
inx4 = StringIndexer(inputCol="is_anomaly", outputCol="label")
Example #30
from pyspark import SparkContext, SparkConf
from pyspark.sql import Row
from pyspark.sql import HiveContext
import pyspark.sql.functions as f

sc = SparkContext("local", "NBA Stats")
hive_context = HiveContext(sc)
data = hive_context.table("default.nbahive")

#1 Max Height
maxheight = data.groupBy().max('Height').withColumnRenamed(
    "max(Height)", "max_height")
#maxheight.show()
result1 = data.join(maxheight, data.height == maxheight.max_height,
                    "inner").select(data.playername, data.height).distinct()
result1.show()
result1.write.mode("overwrite").saveAsTable("default.tallest")

#2 Min Weight
minweight = data.groupBy().min('Weight').withColumnRenamed(
    "min(Weight)", "min_weight")
#minweight.show()
result2 = data.join(minweight, data.weight == minweight.min_weight,
                    "inner").select('playerName', 'Weight').distinct()
result2.show()
result2.write.mode("overwrite").saveAsTable("default.minWeight")

#3 Most Team Wins

wins = data.select('gmDate', 'teamAbbr',
                   'Result').filter(data.result == 'Win').distinct()
Example #31
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, HiveContext
hive_context = HiveContext(sc)

pubg = hive_context.table("pubg_new")

#Selecting columns of interest
pubg = pubg.select('boosts', 'damagedealt', 'dbnos', 'headshotkills', 'heals',
                   'killplace', 'killpoints', 'kills', 'killstreaks',
                   'longestkill', 'maxplace', 'numgroups', 'revives',
                   'ridedistance', 'roadkills', 'swimdistance', 'teamkills',
                   'vehicledestroys', 'walkdistance', 'weaponsacquired',
                   'winpoints', 'winplaceperc')

pubg.show(10)
pubg.printSchema()
pubg.cache()

from pyspark.ml.feature import VectorAssembler

#Creating feature vector
vectorAssembler = VectorAssembler(inputCols=[
    'boosts', 'damagedealt', 'dbnos', 'headshotkills', 'heals', 'killplace',
    'killpoints', 'kills', 'killstreaks', 'longestkill', 'maxplace',
    'numgroups', 'revives', 'ridedistance', 'roadkills', 'swimdistance',
    'teamkills', 'vehicledestroys', 'walkdistance', 'weaponsacquired',
    'winpoints', 'winplaceperc'
],
                                  outputCol='features')

#Transforming the dataframe
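
# (a hedged completion; the original snippet is cut off at this point)
pubg_features = vectorAssembler.transform(pubg)
pubg_features.select('features').show(5, truncate=False)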
                        help="train|inference",
                        default="train")
    parser.add_argument("-c",
                        "--rdma",
                        help="use rdma connection",
                        default=False)

    args = parser.parse_args()
    print("args:", args)

    # read data
    # input_data = sc.textFile(args.input).map(lambda ln: [float(x) for x in ln.split(',')])
    # change the input data to hive table
    # input_data = hive_context.table(args.input).map(lambda row: [float(x) for x in row])
    # import hiveContext.sql
    # import hiveContext.implicits._
    # input_data = hive_context.sql("select * from "+args.input).map(lambda row: [float(x) for x in row])
    input_data = hive_context.table(
        args.input).map(lambda row: [float(x) for x in row])

    # input_data = sqlContext.sql("select * from "+args.input).map(lambda row: [float(x) for x in row])
    cluster = TFCluster.run(sc, map_fun, args, num_executors, num_ps,
                            args.tensorboard, TFCluster.InputMode.SPARK)

    labelRDD = cluster.inference(input_data).map(lambda x: (x, ))

    # infer the schema
    schema_rdd = hive_context.createDataFrame(labelRDD, ['predict'])
    # save
    schema_rdd.saveAsTable(args.output)
    cluster.shutdown()
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier

# Initialize Spark
SparkContext.setSystemProperty("spark.executor.memory", "4g")
conf = SparkConf()
conf.set("spark.executor.instances", 20)
sc = SparkContext("yarn-client", "kdd99", conf=conf)
hc = HiveContext(sc)

kdd = hc.table("kdd99")

(trainData, testData) = kdd.randomSplit([0.7, 0.3], seed=42)
trainData.cache()
services = trainData.withColumnRenamed("service", "srvc").select("srvc").distinct()
testData = testData.join(services, testData.service == services.srvc)
# filter out any rows with a service not trained upon
testData.cache()

print "training set has " + str(trainData.count()) + " instances"
print "test set has " + str(testData.count()) + " instances"

# Build model
inx1 = StringIndexer(inputCol="protocol", outputCol="protocol-cat")
inx2 = StringIndexer(inputCol="service", outputCol="service-cat")
inx3 = StringIndexer(inputCol="flag", outputCol="flag-cat")
inx4 = StringIndexer(inputCol="is_anomaly", outputCol="label")
ohe2 = OneHotEncoder(inputCol="service-cat", outputCol="service-ohe")
Example #34
#
# procedures.py
# Extract useful data from effective_care and readmissions tables and store in a parquet file
#
from pyspark import SparkContext
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import HiveContext

sc = SparkContext("local", "Exercise1")
hiveCX = HiveContext(sc)

# bring the table into a data frame
dfEffCare = hiveCX.table("effective_care")
dfReadmissions = hiveCX.table("readmissions")

# select out the relevant info into a new dataframe
dfEffCareNew = dfEffCare.withColumn(
    "score", regexp_replace("score", "Low.*", "1")).withColumn(
        "score", regexp_replace("score", "Medium.*", "2")).withColumn(
            "score", regexp_replace("score", "High.*", "3")).withColumn(
                "score", regexp_replace("score", "Very High.*", "4")).select(
                    col("provider_id").alias("providerID"),
                    col("measure_id").alias("measureID"),
                    col("score").cast(DecimalType()).alias("score")).where(
                        col("score").isNotNull())

dfReadmissionsNew = dfReadmissions.select(
    col("provider_id").alias("providerID"),
    col("measure_id").alias("measureID"),
    col("score").cast(DecimalType()).alias("score")).where(
Example #35
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.functions import col
from pyspark.sql.functions import array
from pyspark.sql.functions import collect_list
from pyspark.sql.functions import concat_ws

conf = SparkConf()
sc = SparkContext(conf=conf)
hive_context = HiveContext(sc)

airlines = hive_context.table("flight.flight_data_orc2")
airports = hive_context.table("flight.airport_lookup")
carriers = hive_context.table("flight.carrier_lookup")

carrier_desc = "carrier_desc"
origin_desc = "origin_desc"
dest_desc = "dest_desc"

#we do not observe any duplicates on airport lookup
airports = airports.dropDuplicates(['code'])

#we group the carriers over code and create a list of all possible descriptions. Then, we concat the descriptions as a string with '||' as a separator
carriers = carriers.groupBy("code").agg(
    collect_list(carriers.description).alias('new_desc')).select(
        [col("code"),
         concat_ws(" || ", col("new_desc")).alias("description")])

air_car = airlines.join(carriers, airlines.carrier == carriers.code).select(
    [a for a in airlines.columns] + [carriers.description.alias(carrier_desc)])
# importing required packages
from pyspark.sql import HiveContext
from pyspark.sql.types import *
from pyspark import SparkContext
from pyspark.sql import Row

# setting up spark context and hive context
sc = SparkContext("local", "Simple App")
sqlCtx = HiveContext(sc)

# creating a spark data frame using the hive table effective_care
df_raw = sqlCtx.table("effective_care")
print('Number of rows in the table {0}'.format(df_raw.count()))

# removing all row not containing numbers for score variable

# function to test if a string can be parsed in integer or not
def CheckValidScore(s):
    try:
        int(s)
        return True
    except ValueError:
        return False

# creating a RDD by filtering out invalid scores
df_clean_rdd = df_raw.rdd.filter(lambda row: CheckValidScore(row.score))
# creating a dataframe from the RDD
df_clean = sqlCtx.createDataFrame(df_clean_rdd)
print('Number of rows in table after cleaning {0}'.format(df_clean.count()))

# converting the data types for score column
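# (a hedged completion; the original snippet is cut off here)
df_clean = df_clean.withColumn('score', df_clean['score'].cast('int'))
df_clean.printSchema()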
Example #37
# By default, sparkcontext (sc) is initialised, hence, line number 13 is commented out.

# Make necessary imports
import datetime
from itertools import chain
from pyspark import SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.functions import create_map, lit, max

# Initialise sparkcontext and hivecontext
# sc = SparkContext("local", "First App") # Check NOTE 👆
hive_context = HiveContext(sc)
current_date = datetime.datetime.now().strftime('%Y-%m-%d')

# Create a data frame with the raw data available in Hive
clickstream_funnel_df = hive_context.table("default.funnel_clickstreams")

# Assign a code to the various pages. This code will also act as the 'likeliness' of the user getting converted.
# A higher numbered code implies that the user is more likely to buy the ticket.
page_code = {'listing': 1, 'review': 2, 'payments': 3, 'thankyou': 4}
mapping_expr = create_map([lit(x) for x in chain(*page_code.items())])

# Add a new column 'conversion_likeliness' to the dataframe and filter out past searches
conversion_likeliness_data_df = clickstream_funnel_df.withColumn(
    'conversion_likeliness', mapping_expr[clickstream_funnel_df['page_name']])
conversion_likeliness_data_df = conversion_likeliness_data_df.filter(
    conversion_likeliness_data_df.departure_date > current_date).alias('df')

# Select unique searches by the user and fetch its max 'likeliness' to be converted.
unique_searches_df = conversion_likeliness_data_df. \
  groupBy('userid', 'origin', 'destination', 'departure_date'). \
# importing required packages
from pyspark.sql import HiveContext
from pyspark.sql.types import *
from pyspark import SparkContext
from pyspark.sql import Row
from pyspark.mllib.stat import Statistics


# setting up spark context and hive context
sc = SparkContext("local", "Simple App")
sqlCtx = HiveContext(sc)

# creating a spark data frame using the hive tables survey_response and effective_care
df_survey_raw = sqlCtx.table("survey_response")
df_care_raw = sqlCtx.table("effective_care")
print('Number of rows in the survey table {0}'.format(df_survey_raw.count()))
print('Number of rows in the effective_care table {0}'.format(df_care_raw.count()))

# removing all row not containing numbers for score variable

# function to test if a string can be parsed in integer or not
def CheckValidScore(s):
    try:
        int(s)
        return True
    except ValueError:
        return False

# creating a RDD by filtering out invalid scores
df_survey_clean_rdd = df_survey_raw.rdd.filter(lambda row: CheckValidScore(row.hcahps_base_score))
df_care_clean_rdd = df_care_raw.rdd.filter(lambda row: CheckValidScore(row.score))