Example #1
def clacForFile(features, file, otherFile):
    print('running on file {0}'.format(file))
    # getOrCreate() returns a SparkSession; the SQLContext is built from its SparkContext
    sc = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

    spark = SQLContext(sc.sparkContext)

    data = spark.read.csv(file, header=True).coalesce(100)
    #otherData = spark.read.csv(otherFile, header=True).coalesce(100)

    data.createTempView('clicks')
    #otherData.createTempView('otherDataclicks')
    #spark.cacheTable('clicks')

    fieldsCsv = ','.join(ALL_COLUMNS)

    query = 'select case clicked when 1 then 1 else 0 end as clicked, case clicked when 1 then 0 else 1 end as nonclicked, rowIndex, ' \
            '{0} from clicks '.format(fieldsCsv)

    d1 = spark.sql(query)

    d1.createTempView('countClicks')
    spark.cacheTable('countClicks')

    numOfRows = spark.sql('select count(*) as numOfRows from countClicks'
                          ).take(1)[0]['numOfRows']

    print('running on file {0} and numOfRows{1}'.format(file, numOfRows))

    for column in ALL_COLUMNS:
        features[column][
            'totalNumOfRows'] = features[column]['totalNumOfRows'] + numOfRows
        features[column]['numOfRowsPerFile'].append(numOfRows)

    print('running on file {0} and noNullColumns'.format(file))

    noNullColumns(spark, ALL_COLUMNS, features)

    print('running on file {0} and distinctValuesFilter'.format(file))
    distinctValuesFilter(spark, ALL_COLUMNS, features)

    # spark.cacheTable('countClicks')

    print('running on file {0} and adsAndClicksPerUserFilter'.format(file))

    adsAndClicksPerUserFilter(spark, numOfRows, ALL_COLUMNS, features)

    print('running on file {0} finished adsAndClicksPerUserFilter'.format(file))

    sc.stop()

    print('done')
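The helpers noNullColumns, distinctValuesFilter and adsAndClicksPerUserFilter are defined elsewhere in the original script. A minimal sketch of what noNullColumns might look like, assuming it merely counts non-null values per column against the cached countClicks view (the nonNullCount key is hypothetical):

def noNullColumns(spark, columns, features):
    # Hypothetical sketch: count(col) ignores NULLs, so this yields the
    # non-null row count for each column from the cached view.
    for column in columns:
        cnt = spark.sql(
            'select count({0}) as cnt from countClicks'.format(column)
        ).take(1)[0]['cnt']
        features[column].setdefault('nonNullCount', 0)
        features[column]['nonNullCount'] += cnt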
Example #2
def main(sc):
    sqlContext = SQLContext(sc)
    taxiFile = sc.textFile("taxizip/taxizipaa.csv")
    header = taxiFile.first()

    taxiHeader = taxiFile.filter(lambda l: "vendor_id" in l)
    taxiNoHeader = taxiFile.subtract(taxiHeader)

    taxi_temp = taxiNoHeader.map(lambda k: k.split(","))

    taxi_rdd = taxi_temp.map(lambda p: Row(vendor_id=p[0],
    pickup_datetime=datetime.strptime(p[1], "%Y-%m-%d %H:%M:%S"),
    dropoff_datetime=datetime.strptime(p[2], "%Y-%m-%d %H:%M:%S"),
    passenger_count=int(p[3] if p[3]!="" else 0),
    trip_distance=float(p[4] if p[4]!="" else 0),
    pickup_longitude=float(p[5] if p[5]!="" else 0) ,
    pickup_latitude=float(p[6] if p[6]!="" else 0),
    rate_code=p[7],
    store_and_fwd_flag=p[8],
    dropoff_longitude=float(p[9] if p[9]!="" else 0),
    dropoff_latitude=float(p[10] if p[10]!="" else 0),
    payment_type=p[11],
    fare_amount=float(p[12] if p[12]!="" else 0),
    surcharge=float(p[13] if p[13]!="" else 0),
    mta_tax=float(p[14] if p[14]!="" else 0),
    tip_amount=float(p[15] if p[15]!="" else 0),
    tolls_amount=float(p[16] if p[16]!="" else 0),
    total_amount=float(p[17] if p[17]!="" else 0),
    zipcode=p[18]))

    taxi_df = sqlContext.createDataFrame(taxi_rdd)

    taxi_df.registerTempTable("taxi")

    sqlContext.registerFunction("to_hour", lambda x: x.hour)
    sqlContext.registerFunction("str_date", lambda x: str(x.month) + "-" + str(x.day) + "-" + str(x.year))
 
    th = sqlContext.sql("SELECT to_hour(dropoff_datetime) as hour, dropoff_datetime as trip_date, dropoff_longitude as lng,dropoff_latitude as lat,zipcode FROM taxi where dropoff_longitude!=0 and dropoff_latitude!=0")

    th.registerTempTable("taxi_hr")
    sqlContext.cacheTable("taxi_hr")

    grouped_taxi = sqlContext.sql("select hour, zipcode,str_date(trip_date), count(*) as c from taxi_hr group by hour,zipcode,str_date(trip_date) order by c desc")
    grouped_taxi.show(100)
 
    #save this intermediate result to a file as csv
    grouped_csv = grouped_taxi.map(toCSV)
    grouped_csv.saveAsTextFile('results')

    grouped_taxi.registerTempTable("taxi_grouped")
    sqlContext.cacheTable("taxi_grouped")
Example #3
    return x.upper()


if __name__ == "__main__":
    # read json file
    inputFile = sys.argv[1]
    conf = SparkConf().setAppName("SparkSQLTwitter")
    sc = SparkContext(conf=conf)
    sqlCtx = SQLContext(sc)
    input = sqlCtx.read.json(inputFile)
    # print data schema
    # print input.printSchema()
    # get partitions number
    print(input.rdd.getNumPartitions())
    input.createOrReplaceTempView("tb_data")
    sqlCtx.cacheTable("tb_data")
    data = sqlCtx.sql("select * from tb_data")
    print "----read json file and save as parquet file----"
    data.show()
    data.write.save("./parquetFile", mode="overwrite", format="parquet")
    # Read parquet file
    print "----read parquet file and show----"
    rows = sqlCtx.read.parquet("./parquetFile")
    rows.show()
    # Use Row to create DataFrame base on RDD
    happyPeopleRDD = sc.parallelize(
        [Row(name="holden", favouriteBeverage="coffee")])
    happyPeopleSchemaRDD = sqlCtx.createDataFrame(happyPeopleRDD)
    happyPeopleSchemaRDD.createOrReplaceTempView("happy_people")
    sqlCtx.sql("select * from happy_people").show()
    # UDF
def main():
    """
    Main function to execute the HW assignments
    """

    print "Adv DB HW"

    conf = (SparkConf().setMaster("local").setAppName("Adv DB HW app").set(
        "spark.executor.memory", "1g"))
    sc = SparkContext(conf=conf)

    # Use a non-interactive matplotlib backend when no DISPLAY is defined
    matplotlib.use('Agg')

    # Import weather dataset
    weatherDataset = sc.textFile("/datasets/2016weather.csv")

    # Import user dataset
    userDataset = sc.textFile("/datasets/userdata.csv")

    # Parse datasets
    weatherParse = weatherDataset.map(lambda line: line.split(","))
    userParse = userDataset.map(lambda line: line.split(","))

    # Work on datasets
    weatherSnow = weatherParse.filter(lambda x: x[2] == "SNOW")

    # Just testing
    firstWeather = weatherParse.first()
    print "firstWeather is ", firstWeather

    # instantiate SQLContext object
    sqlContext = SQLContext(sc)

    # Convert each line of snowWeather RDD into a Row object
    snowRows = weatherSnow.map(
        lambda p: Row(station=p[0],
                      month=datetime.strptime(p[1], '%Y%m%d').month,
                      date=datetime.strptime(p[1], '%Y%m%d').day,
                      metric=p[2],
                      value=int(p[3])))

    # Apply Row schema
    snowSchema = sqlContext.createDataFrame(snowRows)

    # Register 'snow2016' table with 5 columns: station, month, date, metric, and value
    snowSchema.registerTempTable("snow2016")
    sqlContext.cacheTable("snow2016")

    snow_US10chey021 = sqlContext.sql(
        "SELECT month, COUNT(*) AS snowdays FROM snow2016 WHERE station='US10chey021' GROUP BY month ORDER BY month"
    ).collect()

    snow_US10chey021[0]

    # list of 12 zeros, one per month
    US10chey021_snowdays_y = [0] * 12
    for row in snow_US10chey021:
        US10chey021_snowdays_y[row.month - 1] = row.snowdays

    print(US10chey021_snowdays_y)

    snow_USW00094985 = sqlContext.sql(
        "SELECT  month, COUNT(*) AS snowdays FROM snow2016 WHERE station='USW00094985' GROUP BY month ORDER BY month"
    ).collect()

    USW00094985_snowdays_y = [0] * 12
    for row in snow_USW00094985:
        USW00094985_snowdays_y[row.month - 1] = row.snowdays

    print(USW00094985_snowdays_y)
    """
        Print figures
    """

    N = 12
    ind = np.arange(N)
    width = 0.35
    pUS10chey021 = plt.bar(ind,
                           US10chey021_snowdays_y,
                           width,
                           color='g',
                           label='US10chey021')
    pUSW00094985 = plt.bar(ind + width,
                           USW00094985_snowdays_y,
                           width,
                           color='y',
                           label='USW00094985')

    plt.ylabel('SNOW DAYS')
    plt.xlabel('MONTH')
    plt.title('Snow Days in 2016 at Stations US10chey021 vs. USW00094985')
    plt.xticks(ind + width, ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul',
                             'Aug', 'Sep', 'Oct', 'Nov', 'Dec'))
    plt.legend()

    plt.savefig('/python-scripts/examples/weatherfig.png')

    plt.show()

    snowStations = sqlContext.sql(
        "SELECT  station, COUNT(*) AS snowdays FROM snow2016 WHERE station LIKE 'US%' GROUP BY station ORDER BY station LIMIT 100"
    )

    snowStations.head(5)

    sqlContext.registerDataFrameAsTable(snowStations, "snowdays_2016")

    snowStations_new = sqlContext.sql(
        "SELECT station, snowdays FROM snowdays_2016 ORDER BY snowdays DESC LIMIT 5"
    ).collect()

    for row in snowStations_new:
        print(row)
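A small optional check (not in the original) that the temp table registered above is indeed marked as cached before the per-station queries run, using the same sqlContext:

# isCached reports whether 'snow2016' has been marked for caching
print(sqlContext.isCached("snow2016"))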
Example #5
sc = SparkContext("local[*]", "Simple App")
#sc = SparkContext("spark://url:7077", "Simple App")
sqlContext = SQLContext(sc)
sqlContext.setConf("spark.sql.shuffle.partitions", "5")

# issue movies query
conf = {"es.resource" : "movies2/logs", "es.query" : "?q=name:picture"}
movies = sc.newAPIHadoopRDD("org.elasticsearch.hadoop.mr.EsInputFormat",\
    "org.apache.hadoop.io.NullWritable", "org.elasticsearch.hadoop.mr.LinkedMapWritable", conf=conf)

# place results in table
moviesRows = movies.map(lambda p: Row(id=int(p[1]['id']), name=p[1]['name']))
moviesRowsList = moviesRows.collect()
schemaMovies = sqlContext.createDataFrame(moviesRowsList)
schemaMovies.registerTempTable("movies")
sqlContext.cacheTable("movies")

# get ids in order to form acted_in query
ids = []
for moviesRow in moviesRowsList:
    ids.append(moviesRow['id'])
movieIdSnippets = []
for id in ids:
    movieIdSnippets.append("movie_id:" + str(id))

# partition acted_in query
actedInRowsTotalList = []
movieIdSnippetsChunks = list(chunks(movieIdSnippets, 1000))
for chunk in movieIdSnippetsChunks:
    movieIdQuery = " OR ".join(chunk)
    conf = {"es.resource" : "acted_in2/logs", "es.query" : "?q=" + movieIdQuery, "es.size" : "10000"}
Sentiment = sqlContext.read.parquet("swift://notebooks.spark/sentiment.parquet")


# In[62]:

print(Gender)


# In[37]:

followers.registerTempTable("followers");


# In[38]:

sqlContext.cacheTable("followers")
plt1 = sqlContext.sql("SELECT * FROM followers where followers not in (2870776) order by userDisplayName desc")


# In[39]:

df1 = plt1.toPandas()
df1=df1.set_index('userDisplayName')


# In[75]:

from pylab import rcParams
rcParams['figure.figsize'] = 20,10
Line=df1.plot(kind='line',title='Count by followers of users',stacked=False)
Line.set_ylabel("No.of follwers") 
Example #7
        counter=0
        ff=kmeansDF.rdd.map(lambda x:Kmeans(x))
        new=ff.groupByKey().map(computecentroid)
        dd=new.collect()
        for itt in dd:
            index=itt[0]
            val=itt[1]
            centroidlist[index-1][0]=val[0]
            centroidlist[index-1][1]=val[1]

    classrow=ff.map(lambda x: (Row(Zone=x[0],ids=x[1][2])))
    finaldf=sqlContext.createDataFrame(classrow)
    naivedf=kmeansDF.join(finaldf,(kmeansDF.id==finaldf.ids))
    '''------------------------------------------------------Naive Bayes Distributed--------------------------------------------'''
    classification = []
    classfinal = {}

    naivedf.registerTempTable('bayesDataTable')
    sqlContext.cacheTable("bayesDataTable")

    timevocab=sqlContext.sql("SELECT count(distinct(Time_Zone)) from bayesDataTable").collect()[0][0]
    weekdayvocab=sqlContext.sql("SELECT count(distinct(Weekday)) from bayesDataTable").collect()[0][0]
    # zoneList = sqlContext.sql("SELECT distinct(Zone) from bayesDataTable").collect()
    min=0.0
    timelist=sqlContext.sql("SELECT Time_Zone FROM bayesDataTable").collect()
    weeklist=sqlContext.sql("SELECT Weekday FROM bayesDataTable").collect()

    classpredicted=PredictNaiveBayes(giventimeZone, givenWeekday)
    print "Area with highest demand given time and day is ",classpredicted
    print "Top three highest demand areas are ", sorted(classfinal,key=classfinal.get,reverse=True)[:3]
Example #8
    file_list = cols.split(',')
    #print("file list: ", file_list)

    file_id = 0
    path = '/user/hm74/NYCColumns/'
    
    labels = dict()
    # Files excluded from processing
    skip_files = {
        '4d7f-74pe.Address.txt.gz', '64gx-bycn.EMPCITY.txt.gz',
        'pq5i-thsu.DVC_MAKE.txt.gz',
        'jz4z-kudi.Respondent_Address__Zip_Code_.txt.gz',
        'tukx-dsca.Address_1.txt.gz',
        'h9gi-nx95.VEHICLE_TYPE_CODE_1.txt.gz',
        'mdcw-n682.First_Name.txt.gz', 'sxx4-xhzg.Park_Site_Name.txt.gz'
    }
    for file in file_list:
        if file_id < 135 and file not in skip_files:
            inFile = path + file
            print("File Name: ", file)
            file_name = file.split('.')[1]

            if file in labels:
                labels[file].append(file_name)
            else:
                labels[file] = [file_name]

            spark = SparkSession(sc)
            col = sqlContext.read.format('csv').options(inferschema='true', delimiter='\t').load(inFile)
            col.createOrReplaceTempView("col")
            sqlContext.cacheTable("col")
            spark.sql("select * from col").show(col.count(), False)
            file_id += 1
        else:
            file_id += 1

    with open('135_label.json', 'w', encoding='utf-8') as f:
        json.dump(labels, f, ensure_ascii=False)
	
    sc.stop()
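Since every iteration re-registers and re-caches the same view name, it may be worth releasing the cache once a file has been processed; a small optional addition (not in the original) at the end of the if branch, assuming the same sqlContext:

            # Optional: release the cached data for this file before moving on
            sqlContext.uncacheTable("col")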
Example #9
            zonenum = eachh[0]
            newcen = eachh[1]
            #Adding new list of Centroids to the ListofCentroids
            ListOfCentroids[zonenum - 1][0] = newcen[0]
            ListOfCentroids[zonenum - 1][1] = newcen[1]

    classrow = TempRdd.map(lambda x: (Row(Zone=x[0], idxx=x[1][2])))
    finaldf = sqlContext.createDataFrame(classrow)
    #Adding a column called "Zone" to each record/row in the RDD
    NBDF = KMDF.join(finaldf, (KMDF.id == finaldf.idxx))

    Result = {}

    NBDF.registerTempTable('NBtable')
    #Caching the table so to reduce the execution time
    sqlContext.cacheTable("NBtable")

    #To get the total vocabulary of Periods in the file
    PeriodVocabulary = sqlContext.sql(
        "SELECT count(distinct(Period)) from NBtable").collect()[0][0]
    #To get the total vocabulary of days in the file
    DayVocabulary = sqlContext.sql(
        "SELECT count(distinct(Weekday)) from NBtable").collect()[0][0]

    WeatherVocabulary = sqlContext.sql(
        "SELECT count(distinct(weather)) from NBtable").collect()[0][0]
    totaltimes = sqlContext.sql("SELECT Period FROM NBtable").collect()
    weeklist = sqlContext.sql("SELECT Weekday FROM NBtable").collect()
    weatherlist = sqlContext.sql("SELECT weather FROM NBtable").collect()
    #Sending the given user arguments of day and time to NaiveBayes to predict zone
    PredictedZone = NaiveBayes(InputPeriod, InputDay, InputWeather)
Example #10
jsonFile = sqlContext.read.json("swift://notebooks.spark/lobbying.json")


# In[3]:

print(jsonFile)


# In[4]:

jsonFile.registerTempTable("lobbyings");


# In[5]:

sqlContext.cacheTable("lobbyings")


# In[6]:

lobbyings = sqlContext.sql("SELECT * FROM lobbyings")


# In[7]:

lobbyings.cache()


# In[8]:

lobbyings.printSchema()
Example #11
def main(argv):
    conf = SparkConf().setAppName("checkpoint-2")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    partition = int(argv[0])
    input_file = argv[1]

    #Print info to the console
    print('\nCreating dataframe:\n')
    print('No. of partitions : ', partition)
    print('input_file : ', input_file)

    ## create rdd from input file and apply map operation to split strings
    rdd = sc.textFile(input_file, partition)
    mapped_rdd = rdd.map(lambda l: l.split(","))

    ## Converting fields into appropriate data types
    aadhaar = mapped_rdd.map(lambda l: \
    Row(date=(datetime.strptime(l[0].strip(), '%Y%m%d').date()), \
            registrar=(l[1].strip()), \
            private_agency=(l[2].strip()), \
            state=(l[3].strip()), \
            district=(l[4].strip()), \
            sub_district=(l[5].strip()), \
            pincode=(l[6].strip()), \
            gender=(l[7].strip()), \
            age=int(l[8].strip()), \
            aadhaar_generated=int(l[9].strip()), \
            rejected=int(l[10].strip()), \
            mobile_number=int(l[11].strip()), \
            email_id=int(l[12].strip())))

    # Applying schema to RDD and create Dataframe
    aadhaar_df = sqlContext.createDataFrame(aadhaar)
    aadhaar_df.persist()

    aadhaar_df.registerTempTable('aadhaar_table')
    sqlContext.cacheTable('aadhaar_table')

    # 7. Describe the schema
    print('7 - Schema : ')
    print('\n')
    aadhaar_df.printSchema()
    print('\n\n')

    # 8. count and names of registrars in the table
    print('8 - the count and names of registrars in the table ')
    print('\n')
    count_registrars = aadhaar_df.select("registrar").distinct().count()
    print("8.1 - count of registrars : \n", count_registrars)

    print("names of registrars: \n")
    print('\n')
    if aadhaar_df.isLocal():
        aadhaar_df.select("registrar").distinct().show(count_registrars, False)
    else:
        aadhaar_df.select("registrar").distinct().show(False)
    print('\n\n')

    # 9. number of states, districts in each state and sub-districts in each district
    count_states = aadhaar_df.select("state").distinct().count()
    print(" 9 - number of states : ", count_states)
    print('\n')
    query = 'select state, count(distinct district) as district_count from aadhaar_table group by state order by state'
    district_per_state = sqlContext.sql(query)
    print('\n\n')

    print(" 9 - number of districts in each state : ")
    print('\n')
    district_per_state.show(district_per_state.count(), False)

    query = 'select state, district, count(distinct sub_district) as sub_district_count from aadhaar_table group by state, district order by state, district'
    subdist_per_dist = sqlContext.sql(query)

    print(" 9 -number of sub_district in each district : ")
    print('\n')
    subdist_per_dist.show(subdist_per_dist.count(), False)
    print('\n\n')

    # 10. number of males and females in each state from the table and display a suitable plot
    query = 'select state, \
	 sum(case when gender = \'M\' then 1 else 0 end) as male,\
	 sum(case when gender = \'F\' then 1 else 0 end) as female \
	 from aadhaar_table \
	 group by state \
	 order by state'

    mf_each_state = sqlContext.sql(query)
    print(" 10 - number of male and female in each state are: ")
    print('\n')
    mf_each_state.show(mf_each_state.count(), False)
    print('\n\n')

    # plot the graph using Matplotlib
    df = mf_each_state.toPandas()
    df = df.set_index('state')
    gf = df[['male', 'female']]
    gf.plot.barh(stacked=True)
    plt.title('Number of males and females in each state')
    plt.xlabel('Number of males and females')
    plt.ylabel('States')
    plt.show()

    # 11. names of private agencies for each state
    query = 'select distinct state, private_agency from aadhaar_table group by state, private_agency'
    prvt_agencies_each_state = sqlContext.sql(query)
    print(" 11 - names of private_agencies in each state are : ")
    print('\n')
    prvt_agencies_each_state.show(prvt_agencies_each_state.count(), False)
    print('\n\n')

    # 12. number of private agencies for each state.
    query = 'select state, count(distinct private_agency) as private_agencies from aadhaar_table group by state order by state'
    prvt_agencies_each_state = sqlContext.sql(query)
    print(" 12 - Number of private_agencies in each state as follow: ")
    print('\n')
    prvt_agencies_each_state.show(prvt_agencies_each_state.count(), False)
    print('\n\n')

    # plot the graph using Matplotlib
    df = prvt_agencies_each_state.toPandas()
    df = df.set_index('state')
    gf = df['private_agencies']
    gf.plot.barh(stacked=True)
    plt.title('Number of private agencies for each state')
    plt.xlabel('Number of private agencies')
    plt.ylabel('States')
    plt.show()

    # remove cached elements from memory
    sqlContext.uncacheTable('aadhaar_table')
    aadhaar_df.unpersist()
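The excerpt does not show how main is invoked. A hypothetical entry point, assuming the partition count and input path arrive as command-line arguments (e.g. spark-submit checkpoint2.py 4 /path/to/aadhaar.csv):

if __name__ == '__main__':
    import sys
    main(sys.argv[1:])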
Example #12
class SparkDriver:
    def __init__(self, config):
        self.config = config
        log.debug('SPARK_CONFIG: {0}'.format(config))

        # Build the list of files that need to be processed
        path = self.config['data_path']
        file_list = []
        import os
        dir_list = os.listdir(path)
        for d in dir_list:
            d2 = os.path.join(path,d)
            if not os.path.isfile(d2):
                continue

            n, e = os.path.splitext(d)
            if e == '.csv':
                file_list.append(d2)

        file_list = sorted(file_list,reverse = True)

        if 'file_count' in self.config and self.config['file_count'] != -1 and len(file_list) > self.config['file_count']:
            file_list = file_list[:self.config['file_count']]

        # Configure the Spark context
        spark_conf = SparkConf().setMaster(
                self.config['master']
            ).setAppName(
                datetime.now().strftime('%Y%m%d%H%M%S')
            ).set(
                'spark.executor.memory', self.config['executor_memory']
            )

        self.sc = SparkContext(conf=spark_conf)
        self.sqlContext = SQLContext(self.sc)

        # Read the files in the list
        parts_list = []
        for name in file_list:
            log.debug('+++++ADD FILE:  '+name)
            lines = self.sc.textFile(name)
            parts = lines.map(lambda l: l.split(','))
            parts_list.append(parts)

        log.debug('@@@@@DATA FILE COUNT({0})'.format(len(parts_list)))
        parts = self.sc.union(parts_list)
        reportInfo = parts.map(lambda p: Row(order_count=1, order_id=p[0],
                downstream=p[1],
                face_value=float(p[2])*10000,
                mobile=p[3],
                start_time=p[4] and p[4] != '' and p[4] != 'None' and int(p[4]) or 0,
                end_time= p[5] and p[5] != '' and p[5] != 'None' and int(p[5]) or 0,
                order_status=[p[6], '9'][p[6] not in ('0', '1')],
                downstream_price= p[7] and p[7] !='' and p[7] != 'None' and int(p[7]) or 0,
                carrier=p[8],
                area=p[9],
                product_type=p[10],
                upstream=p[11],
                upstream_price=p[12] and p[12] !='' and p[12] != 'None' and int(p[12]) or 0,
                profit=[0, p[12] and (p[7] and p[7] !='' and p[7] != 'None' and int(p[7]) or 0) - (p[12] and p[12] !='' and p[12] != 'None' and int(p[12]) or 0) ][ p[6] == '1'],
            )
        )

        self.sales_temp_table = self.sqlContext.createDataFrame(reportInfo)
        self.sales_temp_table.registerTempTable('sales_temp_table')
        execSql='''
            SELECT
            SUM(order_count) AS total_order_count,
            downstream,
            SUM(face_value) AS total_face_value,
            start_time,
            order_status,
            SUM(downstream_price) AS total_downstream_price,
            carrier,
            area,
            upstream,
            SUM(upstream_price) AS total_upstream_price,
            SUM(profit) AS total_profit
            FROM sales_temp_table
            GROUP BY
            downstream,
            start_time,
            end_time,
            order_status,
            carrier,
            area,
            upstream
        '''
        log.info('\033[32m*********creating sales_cache_table sql=%s\033[0m' % execSql)
        sqlResult = self.sqlContext.sql(execSql)
        # Turn the raw table into the table used for queries and keep it in memory;
        # an action (such as collect) must run before the cached table is actually loaded
        sqlResult.registerTempTable('sales_cache_table')
        # persist(MEMORY_AND_DISK)  # -> memory-plus-disk mode
        self.sqlContext.cacheTable('sales_cache_table')
        self.sqlContext.sql('SELECT * FROM sales_cache_table').collect()
        # The target table used for queries is now loaded into memory

        # Test code
        if True:
            import os
            path = os.path.join('./', 'test.txt')
            with open(path, 'w') as testFile:
                testFile.write('100')
            self.sc.addFile(path)
            result = self.sc.parallelize([1, 2, 3, 4]).mapPartitions(test_func).collect()
            log.info('SPARK TEST RESULT: {0}'.format(result) )

    # Detailed report of sales data
    def report_sales(self, argu_list):
        # Pagination parameters are ignored for now
        argu_list.pop('page_index')
        argu_list.pop('page_size')

        query_start_time = argu_list['query_start_time']
        query_end_time = argu_list['query_end_time']

        argu_list['query_start_time'] = datetime.strptime(argu_list['query_start_time'], '%Y/%m/%d')
        argu_list['query_end_time'] = datetime.strptime(argu_list['query_end_time'], '%Y/%m/%d') + timedelta(days=1)

        # Drop filter conditions whose value is empty
        # argu_list = {k: argu_list[k] for k in argu_list if argu_list[k] and argu_list[k] != '' }

        if 'query_start_time' in argu_list:
            argu_list['query_start_time'] = int( argu_list['query_start_time'].timestamp() )

        if 'query_end_time' in argu_list:
            argu_list['query_end_time'] = int( argu_list['query_end_time'].timestamp() )

        log.info('SparkDriver report REQU: {0}'.format(argu_list) )

        # Assemble the SQL statement
        # Columns to display
        select_sql = '''
            SELECT
            SUM(total_face_value) as total_face_value,
            SUM(total_downstream_price) as total_downstream_price,
            SUM(total_upstream_price) as total_upstream_price,
            SUM(total_profit) as total_profit,
            SUM(total_order_count) as total_order_count,'''
        where_sql = 'WHERE (start_time>={query_start_time}) AND (start_time<{query_end_time})'.format(
            query_start_time=argu_list['query_start_time'],
            query_end_time=argu_list['query_end_time'],
            )
        argu_list.pop('query_start_time')
        argu_list.pop('query_end_time')

        groupby_sql = 'GROUP BY'
        for key in argu_list:
            if argu_list[key] == '':  # '' means all, no filter on this key
                continue

            select_sql += ' {0},'.format(key)
            groupby_sql += ' {0},'.format(key)

            if argu_list[key] == '@':  # '@' means select all values
                continue
            elif isinstance(argu_list[key], list):  # list of values
                value_list = ''
                for value in argu_list[key]:
                    value_list += '"{0}",'.format(value)
                if value_list != '':
                    value_list = value_list[:-1]
                where_sql += ' AND {key} IN ({value_list})'.format(key=key, value_list=value_list)
            else:
                where_sql += ' AND {key}="{value}"'.format(key=key, value=argu_list[key])

        default_upstream = '未知'
        if 'upstream' in argu_list and argu_list['upstream'] == '':
            default_upstream = '全部'

        default_downstream = '未知'
        if 'downstream' in argu_list and argu_list['downstream'] == '':
            default_downstream = '全部'

        select_sql = select_sql[:-1]
        if groupby_sql != 'GROUP BY':
            groupby_sql = groupby_sql[:-1]
        else:
            groupby_sql = ''

        query_sql = '{select_sql} FROM sales_cache_table {where_sql} {groupby_sql}'.format(
            select_sql=select_sql,
            where_sql=where_sql,
            groupby_sql=groupby_sql
        )

        # query_sql = ''
        # with open('./test.sql', 'r') as f:
        #     query_sql = f.read()

        log.info('\033[32m*************query_sql=%s\033[0m ' % query_sql)
        query_result = self.sqlContext.sql(query_sql)
        query_output = query_result.collect()
        data_list=[]
        for data in query_output:
            data = data.asDict()
            log.debug(data)

            total_face_value = data['total_face_value'] != None and data['total_face_value'] or 0
            total_downstream_price = data['total_downstream_price'] != None and data['total_downstream_price'] or 0
            total_upstream_price = data['total_upstream_price'] != None and data['total_upstream_price'] or 0
            total_order_count = data['total_order_count'] != None and data['total_order_count'] or 0
            total_profit = data['total_profit'] != None and data['total_profit'] or 0

            one_line = {
                'date_range': query_start_time + '-' + query_end_time,
                'order_status': 'order_status' in data and data['order_status'] or '',
                'upstream': 'upstream' in data and data['upstream'] or default_upstream,
                'downstream': 'downstream' in data and data['downstream'] or default_downstream,
                'carrier': 'carrier' in data and data['carrier'] or '',
                'area': 'area' in data and data['area'] or '',
                'total_face_value': total_face_value,
                'total_downstream_price': total_downstream_price,
                'total_upstream_price': total_upstream_price,
                'total_order_count': total_order_count,
                'total_profit': total_profit
            }
            data_list.append(one_line)

        return data_list

    # Summary report of sales data
    def report_sales_brief(self, argu_list):
        # Pagination parameters are ignored for now
        argu_list.pop('page_index')
        argu_list.pop('page_size')
        query_start_time = argu_list['query_start_time']
        query_end_time = argu_list['query_end_time']

        argu_list['query_start_time'] = datetime.strptime(argu_list['query_start_time'], '%Y/%m/%d')
        argu_list['query_end_time'] = datetime.strptime(argu_list['query_end_time'], '%Y/%m/%d') + timedelta(days=1)

        if 'query_start_time' in argu_list:
            argu_list['query_start_time'] = int( argu_list['query_start_time'].timestamp() )

        if 'query_end_time' in argu_list:
            argu_list['query_end_time'] = int( argu_list['query_end_time'].timestamp() )

        log.info('SparkDriver report REQU: {0}'.format(argu_list) )

        # Assemble the SQL statement
        # Columns to display
        select_sql = '''
            SELECT
            SUM(total_downstream_price) as total_downstream_price,
            SUM(total_order_count) as total_order_count,'''
        where_sql = 'WHERE (start_time>={query_start_time}) AND (start_time<{query_end_time})'.format(
            query_start_time=argu_list['query_start_time'],
            query_end_time=argu_list['query_end_time'],
            )
        argu_list.pop('query_start_time')
        argu_list.pop('query_end_time')

        groupby_sql = 'GROUP BY'
        for key in argu_list:
            if argu_list[key] == '':  # '' means all, no filter on this key
                continue

            select_sql += ' {0},'.format(key)
            groupby_sql += ' {0},'.format(key)

            if argu_list[key] == '@':  # '@' means select all values
                continue
            elif isinstance(argu_list[key], list):  # list of values
                value_list = ''
                for value in argu_list[key]:
                    value_list += '"{0}",'.format(value)
                if value_list != '':
                    value_list = value_list[:-1]
                where_sql += ' AND {key} IN ({value_list})'.format(key=key, value_list=value_list)
            else:
                where_sql += ' AND {key}="{value}"'.format(key=key, value=argu_list[key])

        default_downstream = '未知'
        if 'downstream' in argu_list and argu_list['downstream'] == '':
            default_downstream = '全部'

        select_sql = select_sql[:-1]
        if groupby_sql != 'GROUP BY':
            groupby_sql = groupby_sql[:-1]
        else:
            groupby_sql = ''

        query_sql = '{select_sql} FROM sales_cache_table {where_sql} {groupby_sql}'.format(
            select_sql=select_sql,
            where_sql=where_sql,
            groupby_sql=groupby_sql
        )

        log.info('\033[32m*************query_sql=%s\033[0m ' % query_sql)
        query_result = self.sqlContext.sql(query_sql)
        query_output = query_result.collect()

        # Assemble the returned data format
        all_total_downstream_price = 0
        for data in query_output:
            all_total_downstream_price += data['total_downstream_price']

        data_list=[]
        for data in query_output:
            data = data.asDict()
            log.debug(data)
            total_downstream_price = data['total_downstream_price'] != None and data['total_downstream_price'] or 0
            total_order_count = data['total_order_count'] != None and data['total_order_count'] or 0
            one_line = {
                'date_range': query_start_time + '-' + query_end_time,
                'downstream': 'downstream' in data and data['downstream'] or default_downstream,
                'carrier': 'carrier' in data and data['carrier'] or '',
                'total_downstream_price': total_downstream_price,
                'total_order_count': total_order_count,
                'percentage':("%.2f%%" % (total_downstream_price*100/all_total_downstream_price))
            }
            data_list.append(one_line)
        return data_list
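test_func, used by the self-test at the end of __init__, is not part of this excerpt. A minimal sketch, assuming it reads back the value written to test.txt (shipped with sc.addFile) and scales each element of the partition by it:

def test_func(iterator):
    # Hypothetical sketch: fetch the file distributed via sc.addFile and use its
    # content as a multiplier for every element of the partition.
    from pyspark import SparkFiles
    with open(SparkFiles.get('test.txt')) as f:
        factor = int(f.read())
    return [x * factor for x in iterator]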
Example #13
    for file in file_list:
        print("---" + str(file_id) + "th file---")
        fileName = file[0]
        print(fileName)
        file_id += 1

        temp_JSON = dict()
        temp_JSON['dataset_name'] = fileName
        path = inFile + "/" + fileName

        spark = SparkSession(sc)
        test = sqlContext.read.format('csv').options(header='true',
                                                     inferschema='true',
                                                     delimiter='\t').load(path)
        test.createOrReplaceTempView("test")
        sqlContext.cacheTable("test")
        #spark.sql("select * from test").show()
        #spark.sql("select count(*) from test").show()
        attrs = test.schema.names
        columns = []
        candidate_keys = []

        for attr in attrs:
            temp_column = dict()
            #print("column name:", attr)
            temp_column['column_name'] = attr
            temp = spark.sql("SELECT `" + attr + "` as attr FROM test")
            #temp.show()
            temp.createOrReplaceTempView("temp")
            temp_count = temp.count()
Example #14
        lambda x: tuple(x.split('\t')))

    print('Cities loaded')

    print('Creating SQL Context for cities')

    cities_df = cities_file.map(lambda c: Row(name=c[0],
                                              lat=float(c[1]),
                                              lon=float(c[2]),
                                              country_code=c[3],
                                              country_name=c[4],
                                              type=c[5]))

    cities_df = sqlContext.createDataFrame(cities_df)
    cities_df.registerTempTable('cities')
    sqlContext.cacheTable('cities')

    print('SQL context for cities created')

    # See timestamp function
    print('Task 1.2 - Calculate local time for each checkin')
    checkin_df = foursqr.map(lambda l: Row(checkin_id=int(l[0]),
                                           user_id=int(l[1]),
                                           session_id=l[2],
                                           time=timestamp(l[3], l[4]),
                                           lat=float(l[5]),
                                           lon=float(l[6]),
                                           category=l[7],
                                           subcategory=l[8]))

    print('Local time for each checkin calculated')
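The timestamp helper referenced by the comment above is not shown. A minimal sketch, assuming each raw checkin carries a UTC time string plus a timezone offset in minutes (both assumptions about the input format):

def timestamp(utc_time, tz_offset_minutes):
    # Hypothetical sketch: shift the UTC checkin time by the per-checkin
    # timezone offset (in minutes) to obtain local time.
    from datetime import datetime, timedelta
    utc = datetime.strptime(utc_time, '%Y-%m-%d %H:%M:%S')
    return utc + timedelta(minutes=int(tz_offset_minutes))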
calculate_forest_bound_udf = func.udf(calculate_forest_bound,
                                      ArrayType(FloatType()))
df2 = df2.withColumn('forest_bound', calculate_forest_bound_udf('polygon'))

join_condition_udf = func.udf(join_condition, BooleanType())

# join suburb dataframe with forest dataframe based on the condition of intersection
df1p1 = df1.repartition(1)
df2p1 = df2.repartition(10)
joined = df1p1.crossJoin(df2p1).where(
    join_condition_udf(df1p1.suburb_bound, df2p1.forest_bound))

joined = joined.drop('suburb_bound').drop('area').drop('forest_bound')

# group and aggregate joined table by suburb code(name)
joined.registerTempTable('joined_table')
sql_context.cacheTable("joined_table")
result = sql_context.sql(
    'SELECT FIRST(suburb_name) AS suburb_name, suburb_code, FIRST(multipolygon) AS multipolygon, FIRST(suburb_area) '
    'AS suburb_area, collect_list(polygon) AS forest_multipolygon FROM joined_table GROUP BY suburb_code'
)

result = result.withColumn(
    'percentage_%',
    func.udf(calculate_forest_rate,
             FloatType())('multipolygon', 'forest_multipolygon',
                          'suburb_area'))
result = result.drop('multipolygon').drop('suburb_area').drop(
    'forest_multipolygon')
result.orderBy('percentage_%', ascending=False).show()
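calculate_forest_bound, join_condition and calculate_forest_rate are defined elsewhere. A minimal sketch of join_condition, assuming each bound is a [min_x, min_y, max_x, max_y] bounding box and the cross join keeps pairs whose boxes overlap:

def join_condition(suburb_bound, forest_bound):
    # Hypothetical sketch: True when the two bounding boxes intersect.
    return not (suburb_bound[2] < forest_bound[0] or
                forest_bound[2] < suburb_bound[0] or
                suburb_bound[3] < forest_bound[1] or
                forest_bound[3] < suburb_bound[1])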
Example #16
from pyspark.sql import SQLContext, Row

conf = SparkConf().setAppName('TriWordCount')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

trigrams = sc.textFile(
    'hdfs:///users/rocks1/12307130174/spark_probabilities_smoothed01/*')

trigrams = trigrams.map(lambda line: eval(line)) \
                        .map(lambda t: Row(word0 = t[0][0], word1=t[0][1], word2=t[0][2], prob=t[1]))

schemaTrigram = sqlContext.createDataFrame(trigrams)
schemaTrigram.registerTempTable("trigram")

sqlContext.cacheTable("trigram")
#schemaTrigram.cache()

import socket
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

s.bind(("", 54899))
s.listen(5)

while True:
    #word0, word1 = raw_input(">").split()
    print "in loop"
    client, _ = s.accept()
    print "acccpeted"
    recved = client.recv(1024)
    print "recived"
def timeslot(f):
    time24 = datetime.datetime.strptime(f, '%I:%M:%S %p').strftime('%X')
    timesl = int(time24[0:2]) // 3
    timesl = timesl + 1  # divide the day into 8 three-hour slots
    return timesl
   
schemaString="day timeslot block crimetype latitude longitude" 


reformattedCrime=crimeData.map(lambda line: [date2dayofweek(line[1].split(' ',1)[0]),timeslot(line[1].split(' ',1)[1]),line[2].split(' ',1)[1],line[3],line[4],line[5]])


schemaCrime = sqlContext.createDataFrame(reformattedCrime, ['day','timeslot','block','crimetype','latitude','longitude'])
schemaCrime.registerTempTable("chicagocrimedata")
sqlContext.cacheTable("chicagocrimedata")

timeMatrix=sqlContext.sql("SELECT crimetype,timeslot,count(*) AS countPerTime FROM chicagocrimedata group by crimetype,timeslot order by crimetype")


#Extract all classes. Here, distinct crime types 
CrimeTypes = sqlContext.sql("SELECT distinct(crimetype) AS crimetypes FROM chicagocrimedata order by crimetypes").collect()
allCrimeTypes = list()
for index in range(len(CrimeTypes)):
    allCrimeTypes.append(CrimeTypes[index][0])
    
  
    
#Extracting statistics of crimes top 10 
crimeCounts=sqlContext.sql("SELECT crimetype,count(*) as crimeCount FROM chicagocrimedata GROUP BY crimetype order by crimeCount desc LIMIT 10").collect()
countByCrimeType = {}
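date2dayofweek is not shown in this excerpt. A minimal sketch, assuming the crime dates come in as MM/DD/YYYY strings and a weekday index is what gets stored in the table:

def date2dayofweek(datestr):
    # Hypothetical sketch: map 'MM/DD/YYYY' to a weekday number (0 = Monday ... 6 = Sunday).
    import datetime
    return datetime.datetime.strptime(datestr, '%m/%d/%Y').weekday()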
         writer = csv.writer(f)
         writer.writerows([["BA", "TimeAndDate", "Demand"]])
         f.flush()
         os.fsync(f)
 else:
     # create the union of our new rows and our old latest records and create a table and cache it.
     demand_table.unionAll(
         spark.read.csv(
             "data/latestAndEarliest.csv",
             header=True,
             inferSchema=True,
             timestampFormat="yyyy-MM-dd HH:mm:ss+00:00").
         select("BA", "TimeAndDate",
                "Demand")).write.partitionBy("BA").saveAsTable(
                    "demands")
     spark.cacheTable('demands')
     # read the old latest dates and get the mean and sd of each ba and cache it.
     latestDates = spark.read.csv(
         "data/latestAndEarliest.csv",
         header=True,
         inferSchema=True,
         timestampFormat="yyyy-MM-dd HH:mm:ss+00:00")
     latestDates.write.saveAsTable("latestAndEarliest")
     spark.cacheTable('latestAndEarliest')
     # get the clean data and write it to a table
     cleanData = spark.read.csv(
         "data/elec_demand_hourlyClean.csv",
         header=True,
         inferSchema=True,
         timestampFormat="yyyy-MM-dd HH:mm:ss+00:00")
     cleanData.write.saveAsTable("demandsClean")
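In the fragment above, spark.cacheTable(...) only works if spark is an SQLContext (or HiveContext). If spark is a SparkSession, as the read.csv and saveAsTable usage suggests, the equivalent calls go through the catalog:

# Equivalent caching calls when `spark` is a SparkSession
spark.catalog.cacheTable('demands')
spark.catalog.cacheTable('latestAndEarliest')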
Example #19
                               CAST(SQLDATE AS INTEGER), CAST(MonthYear AS INTEGER), CAST(Year AS INTEGER),
                               CASE WHEN Actor1Type1Code = '' AND Actor2Type1Code <> '' THEN Actor2Type1Code
				    ELSE Actor1Type1Code END AS Actor1Type1Code,
                               CAST(NumArticles AS INTEGER),
                               CAST(GoldsteinScale AS INTEGER),
                               CAST(AvgTone AS INTEGER)
                          FROM temp
                         WHERE ActionGeo_CountryCode <> '' AND ActionGeo_CountryCode IS NOT NULL
                            AND Actor1Type1Code <> '' AND Actor1Type1Code IS NOT NULL
                            AND NumArticles <> '' AND NumArticles IS NOT NULL
                            AND GoldsteinScale <> '' AND GoldsteinScale IS NOT NULL
                            AND AvgTone <> '' AND AvgTone IS NOT NULL""")

sqlContext.dropTempTable('temp')
sqlContext.registerDataFrameAsTable(df2, 'temp3')
sqlContext.cacheTable('temp3')

dfdaily = sqlContext.sql("""SELECT ActionGeo_CountryCode,
				   SQLDATE,
				   Actor1Type1Code,
				   SUM(NumArticles) AS NumArticles,
                                   ROUND(AVG(GoldsteinScale),0) AS GoldsteinScale,
			           ROUND(AVG(AvgTone),0) AS AvgTone
			      FROM temp3
			     GROUP BY ActionGeo_CountryCode,
				      SQLDATE,
				      Actor1Type1Code""")

dfmonthly = sqlContext.sql("""SELECT ActionGeo_CountryCode,
				     MonthYear,
				     Actor1Type1Code,
Example #20
# In[85]:

Sentiment = sqlContext.read.parquet(
    "swift://notebooks.spark/sentiment.parquet")

# In[62]:

print(Gender)

# In[37]:

followers.registerTempTable("followers")

# In[38]:

sqlContext.cacheTable("followers")
plt1 = sqlContext.sql(
    "SELECT * FROM followers where followers not in (2870776) order by userDisplayName desc"
)

# In[39]:

df1 = plt1.toPandas()
df1 = df1.set_index('userDisplayName')

# In[75]:

from pylab import rcParams
rcParams['figure.figsize'] = 20, 10
Line = df1.plot(kind='line',
                title='Count by followers of users',
sqlContext = SQLContext(sc)


# In[24]:

gender = sqlContext.read.parquet("gender.parquet")


# In[25]:

gender.registerTempTable("gender");


# In[33]:

sqlContext.cacheTable("gender")
plt1 = sqlContext.sql("SELECT * FROM gender order by 2")


# In[34]:

df1 = plt1.toPandas()
df1=df1.set_index('USER_GENDER')


# In[63]:

from pylab import rcParams
rcParams['figure.figsize'] = 15,5
barh=df1.plot(kind='barh',title='Count of  gender speaking about Dance',stacked=False,color='b')
barh.set_ylabel("Gender") 
Example #22
schemaString = "day timeslot block crimetype latitude longitude"
#fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]
#schema = StructType(fields)

reformattedCrime = crimeData.map(lambda line: [
    date2dayofweek(line[1].split(' ', 1)[0]),
    timeslot(line[1].split(' ', 1)[1]), line[2].split(' ', 1)[1], line[3],
    line[4], line[5]
])

#Creating table from the input file.
schemaCrime = sqlContext.createDataFrame(
    reformattedCrime,
    ['day', 'timeslot', 'block', 'crimetype', 'latitude', 'longitude'])
schemaCrime.registerTempTable("chicagocrimedata")
sqlContext.cacheTable("chicagocrimedata")

#Caching helps in the retrieving the data faster
locationVocabulary = sqlContext.sql(
    "SELECT count(distinct(block)) from chicagocrimedata").collect()[0][0]
timeVocabulary = sqlContext.sql(
    "SELECT count(distinct(timeslot)) from chicagocrimedata").collect()[0][0]
dayVocabulary = sqlContext.sql(
    "SELECT count(distinct(day)) from chicagocrimedata").collect()[0][0]
TOTALCRIMES = sqlContext.sql(
    "SELECT count(distinct(day)) from chicagocrimedata").collect()[0][0]

#Creating matrix for NaiveBayes calculations.
locationsMatrix = sqlContext.sql(
    "SELECT crimetype,block,count(*) AS countPerBlock FROM chicagocrimedata group by crimetype,block order by countPerBlock desc"
)
    timesl = int(time24[0:2]) // 3
    timesl = timesl + 1  # divide the day into 8 three-hour slots
    return timesl

#Schema for creating the table.  
schemaString="day timeslot block crimetype latitude longitude" 
#fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]
#schema = StructType(fields)


reformattedCrime=crimeData.map(lambda line: [date2dayofweek(line[1].split(' ',1)[0]),timeslot(line[1].split(' ',1)[1]),line[2].split(' ',1)[1],line[3],line[4],line[5]])

#Creating table from the input file.
schemaCrime = sqlContext.createDataFrame(reformattedCrime, ['day','timeslot','block','crimetype','latitude','longitude'])
schemaCrime.registerTempTable("chicagocrimedata")
sqlContext.cacheTable("chicagocrimedata")

#Caching helps in the retrieving the data faster
locationVocabulary = sqlContext.sql("SELECT count(distinct(block)) from chicagocrimedata").collect()[0][0]
timeVocabulary = sqlContext.sql("SELECT count(distinct(timeslot)) from chicagocrimedata").collect()[0][0]
dayVocabulary = sqlContext.sql("SELECT count(distinct(day)) from chicagocrimedata").collect()[0][0]
TOTALCRIMES = sqlContext.sql("SELECT count(distinct(day)) from chicagocrimedata").collect()[0][0]

#Creating matrix for NaiveBayes calculations.
locationsMatrix=sqlContext.sql("SELECT crimetype,block,count(*) AS countPerBlock FROM chicagocrimedata group by crimetype,block order by countPerBlock desc")
timeMatrix=sqlContext.sql("SELECT crimetype,timeslot,count(*) AS countPerTime FROM chicagocrimedata group by crimetype,timeslot order by crimetype")
dayMatrix=sqlContext.sql("SELECT crimetype,day,count(*) AS countPerDay FROM chicagocrimedata group by crimetype,day order by countPerDay desc")

#Extract all classes. Here, distinct crime types 
CrimeTypes = sqlContext.sql("SELECT distinct(crimetype) AS crimetypes FROM chicagocrimedata order by crimetypes").collect()
allCrimeTypes = list()
def sql_context_api(spark):

    sc = spark.sparkContext
    sqlContext = SQLContext(sc)

    print("Start running SQL context API")

    # createDataFrame
    l = [('Alice', 1)]
    sqlContext.createDataFrame(l).collect()
    res = sqlContext.createDataFrame(l, ['name', 'age']).collect()
    print(res)
    rdd = sc.parallelize(l)
    sqlContext.createDataFrame(rdd).collect()
    df = sqlContext.createDataFrame(rdd, ['name', 'age'])
    res = df.collect()
    print(res)
    print("createDataFrame API finished")

    # table and cache
    df = spark.createDataFrame([('Alice', 5, 80), ('Alice', 5, 80),
                                ('Alice', 10, 80)], ["name", "age", "height"])
    sqlContext.registerDataFrameAsTable(df, "table1")
    sqlContext.cacheTable("table1")
    sqlContext.uncacheTable("table1")
    sqlContext.cacheTable("table1")
    sqlContext.clearCache()
    # sqlContext.createExternalTable("table1", schema = df2)
    sqlContext.dropTempTable("table1")
    # res = df2.collect()
    # print(res)
    print("External, TempTable and cache API finished")

    # getConf
    res = sqlContext.getConf("spark.sql.shuffle.partitions")
    print(res)
    res = sqlContext.getConf("spark.sql.shuffle.partitions", u"10")
    print(res)
    sqlContext.setConf("spark.sql.shuffle.partitions", u"50")
    res = sqlContext.getConf("spark.sql.shuffle.partitions", u"10")
    print(res)
    print("getConf API finished")

    # newSession
    newspark = sqlContext.newSession()
    print("newSession API finished")

    # range
    res = sqlContext.range(1, 7, 2).collect()
    print(res)
    res = sqlContext.range(3).collect()
    print(res)
    print("range API finished")

    # read
    res = sqlContext.read
    text_sdf = sqlContext.readStream.text(
        "/ppml/trusted-big-data-ml/work/examples/helloworld.py")
    res = text_sdf.isStreaming
    print(res)
    print("read and readStream API finished")

    # sql
    df = spark.createDataFrame([('Alice', 5, 80), ('Alice', 5, 80),
                                ('Alice', 10, 80)], ["name", "age", "height"])
    sqlContext.registerDataFrameAsTable(df, "table1")
    df2 = sqlContext.sql("SELECT name AS f1, age as f2 from table1")
    res = df2.collect()
    print(res)
    print("sql API finished")

    # table
    df = spark.createDataFrame([('Alice', 5, 80), ('Alice', 5, 80),
                                ('Alice', 10, 80)], ["name", "age", "height"])
    sqlContext.registerDataFrameAsTable(df, "table1")
    df2 = sqlContext.table("table1")
    res = (sorted(df.collect()) == sorted(df2.collect()))
    print(res)
    print("table API finished")

    # tableNames
    df = spark.createDataFrame([('Alice', 5, 80), ('Alice', 5, 80),
                                ('Alice', 10, 80)], ["name", "age", "height"])
    sqlContext.registerDataFrameAsTable(df, "table1")
    res = ("table1" in sqlContext.tableNames())
    print(res)
    res = ("table1" in sqlContext.tableNames("default"))
    print(res)
    print("tableNames API finished")

    # tables
    sqlContext.registerDataFrameAsTable(df, "table1")
    df2 = sqlContext.tables()
    res = df2.filter("tableName = 'table1'").first()
    print(res)
    print("tables API finished")

    # register
    # strlen = sqlContext.registerFunction("stringLengthString", lambda x: len(x))
    # res = spark.sql("SELECT stringLengthString('test')").collect()
    # print(res)
    spark.udf.registerJavaFunction("javaStringLength3",
                                   "org.apache.spark.sql.JavaStringLength",
                                   "integer")
    res = spark.sql("SELECT javaStringLength3('test')").collect()
    print(res)
    print("register API finished")

    print("Finish running SQL context API")
Example #25
def save_by_spark_streaming():
    run_async = True  # renamed: 'async' is a reserved word in Python 3.7+
    root_path = os.path.dirname(os.path.realpath(__file__))
    record_path = os.path.join(root_path, "offset.txt")
    from_offsets = {}

    if os.path.exists(record_path):

        datas = []
        with open(record_path, 'r') as f:
            for line in f:
                datas.append(line)

        for ii in datas:
            offset_data = json.loads(ii)
            topic_partion = TopicAndPartition(offset_data["topic"],
                                              offset_data["partition"])
            if topic_partion not in from_offsets:
                from_offsets[topic_partion] = int(offset_data["untilOffset"])

    sc = SparkContext(appName="Rec2champzee")
    global sqlContext
    sqlContext = SQLContext(sc)

    ssc = StreamingContext(sc, int(timer))

    kvs = KafkaUtils.createDirectStream(
        ssc=ssc,
        topics=[topic_name],
        fromOffsets=from_offsets,
        kafkaParams={"metadata.broker.list": broker_list})

    sql1 = '''
           SELECT
      sku.id AS skuid ,
      sku.SPUID ,
      sku.SKCID , 
      tm1.AttributeStatus AS spu_status ,
      tm2.ContentStatus AS skc_status ,
      sku.SizeStatus AS sku_status ,
      sku.BachName AS NAME ,
      tm1.Cate1 ,
      tm1.cate2 ,
      tm1.Cate3 ,
      tm1.cate3name,
      tm1.Brand ,
      tm1.Description ,
      tm1.SerialNumber ,
      tm2.pcolour AS pcolor ,
      tm2.color AS color ,
      GROUP_CONCAT(
        tm1.typename ,
        '-' ,
        tm1. NAME SEPARATOR ' '
      ) label ,
      tm2.Material ,
      tm1.Introduction ,
      tm2.VendorColor ,
      sku.PreSizeID ,
      cast(sku.MsrPrice as signed) MsrPrice ,
      cast(sku.PurchPrice as signed) PurchPrice ,
      cast(sku.Price as signed)  Price,
      sku.Type ,
      sku.IsSpecial AS is_special ,
      tm2.Version AS img_version ,
      tm2.ImgType AS img_type ,
      psp.Position ,
      sku.Note ,
      sku.LatestUPCTime ,
      sku.CreatedDate ,
      sku.UpdatedDate,
      sku.Quantity
    FROM
      product sku
    LEFT JOIN(
      SELECT
        spu.id AS id1 ,
        spu.AttributeStatus AS AttributeStatus ,
        spu.Cate1 AS cate1 ,
        spu.cate2 AS cate2 ,
        spu.Cate3 AS cate3 ,
        prc.name as cate3name,
        spu.Brand AS brand ,
        spu.Description AS Description ,
        spu.SerialNumber AS SerialNumber ,
        AT .TypeName AS typename ,
        pa. NAME AS NAME ,
        spu.Introduction AS Introduction
      FROM
        spu
      LEFT JOIN spuattribute sa ON spu.id = sa.SPUID
      LEFT JOIN productattribute pa ON sa.AttributeID = pa.id
      LEFT JOIN attributetype AT ON pa.Type = AT .typeID
      LEFT JOIN productcate prc ON spu.cate3 = prc .id
    ) tm1 ON sku.SPUID = tm1.id1
    LEFT JOIN(
      SELECT
        skc.id AS id2 ,
        skc.ContentStatus AS ContentStatus ,
        pc.ParentID AS pcolour ,
        skc.Color AS color ,
        skc.Material AS Material ,
        skc.VendorColor AS VendorColor ,
        skc.Version AS Version ,
        skc.ImgType AS ImgType
      FROM
        productcolor pc
      LEFT JOIN skc ON skc.Color = pc.id
    ) tm2 ON sku.skcid = tm2.id2
    LEFT JOIN productskuposition psp ON sku.id = psp.ProdID
    GROUP BY
      sku.id
        '''
    sql2 = '''

        SELECT
        	t1.id AS cate1 ,
        	t1. NAME AS cate1name ,
        	t2.id AS cate2 ,
        	t2.name as cate2name,
        	t3.id AS cate3,
        	t3.name as cate3name
        FROM
        	(
        		productcate AS t1
        		INNER JOIN productcate AS t2 ON t1.id = t2.parentid
        	)
        INNER JOIN productcate AS t3 ON t2.id = t3.parentid
        WHERE
        	t1.DELETEd = 0
        AND t2.deleted = 0
        AND t3.deleted = 0
        AND floor(t3.id / 100) > 0


            '''

    prod_detail = mysql.get(sql1)
    cate_detail = mysql.get(sql2)
    sqlContext.createDataFrame(
        sc.parallelize(prod_detail)).createOrReplaceTempView('pd')
    sqlContext.createDataFrame(
        sc.parallelize(cate_detail)).createOrReplaceTempView('cd')
    sqlContext.cacheTable('pd')
    sqlContext.cacheTable('cd')
    kvs.transform(store_offset_ranges).flatMap(reco).foreachRDD(sync_data2es)

    ssc.start()
    ssc.awaitTermination()
    ssc.stop()
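store_offset_ranges, reco and sync_data2es are not included in this excerpt. A minimal sketch of store_offset_ranges, following the offset-tracking pattern from the Spark Streaming Kafka direct-stream documentation (the module-level offset_ranges name is an assumption):

offset_ranges = []

def store_offset_ranges(rdd):
    # Hypothetical sketch: remember this batch's Kafka offset ranges so they can
    # later be written back to offset.txt once the batch has been processed.
    global offset_ranges
    offset_ranges = rdd.offsetRanges()
    return rdd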
from pyspark import SparkContext
from pyspark.sql import SQLContext, Row

sc = SparkContext("local", "barcos")
sq = SQLContext(sc)

df = sq.read.load("data/containers_tiny.parquet")
df.registerTempTable("container")
sq.cacheTable("container")

df.select("ship_imo", "ship_name",
          "country").filter(df['country'] == 'DK').show()
sq.sql("SELECT ship_imo, ship_name FROM container WHERE country = 'DK'").show()
sq.sql(
    "SELECT ship_imo, count(container_id) number, sum(net_weight) total_weight FROM container GROUP BY ship_imo"
).show()

input("Press Enter to continue... http://localhost:4040/storage")
if demand_table.count() != 0:
    # check if the demand column contains a "None" entry
    if demand_table.dtypes[2][1] == "string":
        demand_table = demand_table.select("BA", "TimeAndDate",
                                           "Demand").filter("Demand!='None'")
    else:
        demand_table = demand_table.select("BA", "TimeAndDate", "Demand")
        # check that there are still new rows.
    if (demand_table.count() == 0):
        with open('data/newRows.csv', 'w', newline="") as f:
            pass
    else:
        # create a table of the demand data
        demand_table.write.saveAsTable("demands")
        # cache it
        spark.cacheTable('demands')
        # get the earliest and latest date within the new records
        dates = spark.sql(
            "select min(TimeAndDate), max(TimeAndDate) from demands").collect(
            )[0]
        # create a pandas dataframe for the calender
        dataframe = spark.createDataFrame(
            pd.DataFrame(pd.date_range(dates[0], dates[1], freq="H",
                                       tz='UTC')),
            ["TimeAndDate"]).repartition(8).registerTempTable("dates")
        # cache the table of dates
        spark.cacheTable('dates')
        # work out mean and sd values
        getOutliersImproved2().write.saveAsTable("demandsOut3")
        # wrtie it out to file
        spark.sql("select * from demandsOut3").coalesce(1).write.option(
def sql_context_api(spark):

    sc = spark.sparkContext
    sqlContext = SQLContext(sc)

    print("Start running SQL context API")

    # createDataFrame

    l = [('Alice', 1)]
    sqlContext.createDataFrame(l).collect()
    res = sqlContext.createDataFrame(l, ['name', 'age']).collect()
    print(res)
    rdd = sc.parallelize(l)
    sqlContext.createDataFrame(rdd).collect()
    df = sqlContext.createDataFrame(rdd, ['name', 'age'])
    res = df.collect()
    print(res)
    print("createDataFrame API finished")

    # table and cache
    df = spark.createDataFrame([('Alice', 5, 80), ('Alice', 5, 80),
                                ('Alice', 10, 80)], ["name", "age", "height"])
    sqlContext.registerDataFrameAsTable(df, "table1")
    sqlContext.cacheTable("table1")
    sqlContext.uncacheTable("table1")
    sqlContext.cacheTable("table1")
    sqlContext.clearCache()
    # sqlContext.createExternalTable("table1", schema = df2)
    sqlContext.dropTempTable("table1")
    # res = df2.collect()
    # print(res)
    print("External, TempTable and cache API finished")

    # getConf
    res = sqlContext.getConf("spark.sql.shuffle.partitions")
    print(res)
    res = sqlContext.getConf("spark.sql.shuffle.partitions", u"10")
    print(res)
    sqlContext.setConf("spark.sql.shuffle.partitions", u"50")
    res = sqlContext.getConf("spark.sql.shuffle.partitions", u"10")
    print(res)
    print("getConf API finished")

    # newSession
    newspark = sqlContext.newSession()
    print("newSession API finished")

    # range
    res = sqlContext.range(1, 7, 2).collect()
    print(res)
    res = sqlContext.range(3).collect()
    print(res)
    print("range API finished")

    # read
    res = sqlContext.read
    # text_sdf = sqlContext.readStream.text(tempfile.mkdtemp())
    # res = text_sdf.isStreaming
    # print(res)
    print("read and readStream API finished")

    # register

    # sql
    df = spark.createDataFrame([('Alice', 5, 80), ('Alice', 5, 80),
                                ('Alice', 10, 80)], ["name", "age", "height"])
    sqlContext.registerDataFrameAsTable(df, "table1")
    df2 = sqlContext.sql("SELECT name AS f1, age as f2 from table1")
    res = df2.collect()
    print(res)
    print("sql API finished")

    # table
    df = spark.createDataFrame([('Alice', 5, 80), ('Alice', 5, 80),
                                ('Alice', 10, 80)], ["name", "age", "height"])
    sqlContext.registerDataFrameAsTable(df, "table1")
    df2 = sqlContext.table("table1")
    res = (sorted(df.collect()) == sorted(df2.collect()))
    print(res)
    print("table API finished")

    # tableNames
    df = spark.createDataFrame([('Alice', 5, 80), ('Alice', 5, 80),
                                ('Alice', 10, 80)], ["name", "age", "height"])
    sqlContext.registerDataFrameAsTable(df, "table1")
    res = ("table1" in sqlContext.tableNames())
    print(res)
    res = ("table1" in sqlContext.tableNames("default"))
    print(res)
    print("tableNames API finished")

    # tables
    sqlContext.registerDataFrameAsTable(df, "table1")
    df2 = sqlContext.tables()
    res = df2.filter("tableName = 'table1'").first()
    print(res)
    print("tables API finished")

    print("Finish running SQL context API")
Exemple #29
0
def main(argv):
    conf = SparkConf().setAppName("checkpoint-5")
    sc = SparkContext(conf=conf)

    sqlContext = SQLContext(sc)

    partition = int(argv[0])
    input_file = argv[1]

    # Print info to the console
    print '\nCreating dataframe:\n'
    print 'No. of partitions : ', partition
    print 'input_file : ', input_file

    ## Create rdd from input file and apply map operation to split strings
    rdd = sc.textFile(input_file, partition)
    mapped_rdd = rdd.map(lambda l: l.split(","))

    ## Converting fields into appropriate data types
    aadhaar = mapped_rdd.map(lambda l: \
     Row(date=(datetime.strptime(l[0].strip(), '%Y%m%d').date()), \
     registrar=(l[1].strip()), \
     private_agency=(l[2].strip()), \
     state=(l[3].strip()), \
     district=(l[4].strip()), \
     sub_district=(l[5].strip()), \
     pincode=(l[6].strip()), \
     gender=(l[7].strip()), \
     age=int(l[8].strip()), \
     aadhaar_generated=int(l[9].strip()), \
     rejected=int(l[10].strip()), \
     mobile_number=int(l[11].strip()), \
     email_id=int(l[12].strip())))

    # Apply schema to RDD and create Dataframe
    aadhaar_df = sqlContext.createDataFrame(aadhaar)
    aadhaar_df.registerTempTable('aadhaar_card_table')
    sqlContext.cacheTable('aadhaar_card_table')

    # male and female percentage in each state query
    aadhaar_male_female_query = 'select state,\
					sum(case when gender = \'F\' then aadhaar_generated ELSE 0 END) as aadhaar_generated_female, \
					sum(case when gender = \'M\' then aadhaar_generated ELSE 0 END) as aadhaar_generated_male, \
					sum(aadhaar_generated) as aadhaar_generated from aadhaar_card_table group by state'

    aadhaar_male_female_dataframe = sqlContext.sql(aadhaar_male_female_query)
    aadhaar_male_female_dataframe.registerTempTable(
        'aadhaar_male_female_table')
    sqlContext.cacheTable('aadhaar_male_female_table')

    aadhaar_male_female_prctnge_query = 'select state, \
			(aadhaar_generated_male * 100.0)/aadhaar_generated as aadhaar_generated_male_percentage, \
			(aadhaar_generated_female * 100.0)/aadhaar_generated as aadhaar_generated_female_percentage \
			from aadhaar_male_female_table'

    aadhaar_male_female_prctnge_dataframe = sqlContext.sql(
        aadhaar_male_female_prctnge_query)
    aadhaar_male_female_prctnge_dataframe.registerTempTable(
        'aadhaar_male_female_prctnge_table')
    sqlContext.cacheTable('aadhaar_male_female_prctnge_table')

    # 22. The top 3 states where the percentage of Aadhaar cards being generated for males is the highest.
    print('\n')
    print(
        'The top 3 states where the percentage of Aadhaar cards being generated for males is the highest : '
    )
    query = 'select state from ( select * from aadhaar_male_female_prctnge_table \
	order by aadhaar_generated_male_percentage DESC limit 3)temp'

    top_3_males_dataframe = sqlContext.sql(query)
    top_3_males_dataframe.persist()
    top_3_males_dataframe.registerTempTable('top_3_males_table')
    top_3_males_dataframe.show(top_3_males_dataframe.count(), truncate=False)
    print('\n\n')

    # 23. The top 3 districts where the percentage of Aadhaar cards being rejected for females is the highest.
    print('\n')
    print(
        'The top 3 districts where the percentage of Aadhaar cards being rejected for females is the highest'
    )
    query = 'select top_3_males_table.state, act.district, act.gender, act.rejected \
					from top_3_males_table LEFT OUTER JOIN aadhaar_card_table act\
					ON(top_3_males_table.state = act.state)'

    top_3_males_info_dataframe = sqlContext.sql(query)
    top_3_males_info_dataframe.registerTempTable('top_male_states_info_table')

    query = 'select district from ( select district, \
				(case when rejected = 0 then 0 ELSE (aadhaar_rejected_female * 100.0) / rejected END) as aadhaar_rejected_female_percentage \
				from ( select district, \
						sum(case when gender = \'F\' then rejected ELSE 0 END) as aadhaar_rejected_female, \
						sum(rejected) as rejected \
						from top_male_states_info_table \
						group by district)temp1 \
					order by aadhaar_rejected_female_percentage DESC limit 3)temp2'

    top_3_female_rejected_aadhar_df = sqlContext.sql(query)
    top_3_female_rejected_aadhar_df.show(
        top_3_female_rejected_aadhar_df.count(), truncate=False)
    print('\n\n')

    # 24. The top 3 states where the percentage of Aadhaar cards being generated for females is the highest.
    print('\n')
    print(
        'The top 3 states where the percentage of Aadhaar cards being generated for females is the highest.'
    )
    query = 'select state from ( select * from aadhaar_male_female_prctnge_table order by aadhaar_generated_female_percentage DESC limit 3)A'
    top_3_females_dataframe = sqlContext.sql(query)
    top_3_females_dataframe.persist()
    top_3_females_dataframe.registerTempTable('top_3_females_table')
    top_3_females_dataframe.show(top_3_females_dataframe.count(),
                                 truncate=False)
    print('\n\n')

    # 25. The top 3 districts where the percentage of Aadhaar cards being rejected for males is the highest.
    print('\n')
    print(
        'The top 3 districts where the percentage of Aadhaar cards being rejected for males is the highest'
    )
    query = 'select top_3_females_table.state, act.district, act.gender, act.rejected \
					from top_3_females_table LEFT OUTER JOIN aadhaar_card_table act \
					ON(top_3_females_table.state = act.state)'

    top_female_states_info_dataframe = sqlContext.sql(query)
    top_female_states_info_dataframe.registerTempTable(
        'top_female_states_info_table')

    query = 'select district from ( select district, \
					(case when rejected = 0 then 0 \
						ELSE (aadhaar_rejected_male * 100.0)/rejected END) as aadhaar_rejected_male_percentage \
				from ( select district, \
						sum(case when gender = \'M\' then rejected ELSE 0 END) as aadhaar_rejected_male, \
						sum(rejected) as rejected \
					from top_female_states_info_table \
					group by district)temp1 \
					order by aadhaar_rejected_male_percentage DESC limit 3)temp2'

    top_3_male_rejected_aadhar_df = sqlContext.sql(query)
    top_3_male_rejected_aadhar_df.show(
        top_3_male_rejected_aadhar_df.count(), truncate=False)
    print('\n\n')

    # 26. The summary of the acceptance percentage of all the Aadhaar cards applications by bucketing the age group into 10 buckets.
    age_df = aadhaar_df.select("age")

    upper_limit = age_df.rdd.max()[0]
    lower_limit = age_df.rdd.min()[0]

    diff_max_min = float(upper_limit - lower_limit)
    bucket_size = int(math.ceil(diff_max_min / 10))

    query1 = 'select (case ' \
       + 'when age <= ' + str(lower_limit + 1*bucket_size) + \
       ' then \'' + str(lower_limit) + '_' + str(lower_limit + bucket_size) + '\'' \
       + 'when age > ' + str(lower_limit + 1*bucket_size) + ' and age <= ' + str(lower_limit + 2*bucket_size) + \
       ' then \'' + str(lower_limit + 1*bucket_size) + '_' + str(lower_limit + 2*bucket_size) + '\' '  \
       + 'when age > ' + str(lower_limit + 2*bucket_size) + ' and age <= ' + str(lower_limit + 3*bucket_size) + \
       ' then \'' + str(lower_limit + 2*bucket_size) + '_' + str(lower_limit + 3*bucket_size) + '\' '  \
       + 'when age > ' + str(lower_limit + 3*bucket_size) + ' and age <= ' + str(lower_limit + 4*bucket_size) + \
       ' then \'' + str(lower_limit + 3*bucket_size) + '_' + str(lower_limit + 4*bucket_size) + '\' '  \
       + 'when age > ' + str(lower_limit + 4*bucket_size) + ' and age <= ' + str(lower_limit + 5*bucket_size) + \
       ' then \'' + str(lower_limit + 4*bucket_size) + '_' + str(lower_limit + 5*bucket_size) + '\' '  \
       + 'when age > ' + str(lower_limit + 5*bucket_size) + ' and age <= ' + str(lower_limit + 6*bucket_size) + \
       ' then \'' + str(lower_limit + 5*bucket_size) + '_' + str(lower_limit + 6*bucket_size) + '\' '  \
       + 'when age > ' + str(lower_limit + 6*bucket_size) + ' and age <= ' + str(lower_limit + 7*bucket_size) + \
       ' then \'' + str(lower_limit + 6*bucket_size) + '_' + str(lower_limit + 7*bucket_size) + '\' '  \
       + 'when age > ' + str(lower_limit + 7*bucket_size) + ' and age <= ' + str(lower_limit + 8*bucket_size) + \
       ' then \'' + str(lower_limit + 7*bucket_size) + '_' + str(lower_limit + 8*bucket_size) + '\' '  \
       + 'when age > ' + str(lower_limit + 8*bucket_size) + ' and age <= ' + str(lower_limit + 9*bucket_size) + \
       ' then \'' + str(lower_limit + 8*bucket_size) + '_' + str(lower_limit + 9*bucket_size) + '\' '  \
       + 'when age > ' + str(lower_limit + 9*bucket_size) + ' and age <= ' + str(lower_limit + 10*bucket_size) + \
       ' then \'' + str(lower_limit +9*bucket_size) + '_' + str(lower_limit + 10*bucket_size) + '\' '  \
       + 'ELSE \'more_than_' + str(upper_limit) + '\' end) as age_group, ' + 'aadhaar_generated as selected, ' \
       + 'rejected ' + 'from aadhaar_card_table'

    query_final = 'select age_group, (100 * sum(selected))/(sum(selected) + sum(rejected)) as percentage_summary ' \
       + 'from ('+ query1 +')A group by age_group order by length(age_group), age_group'

    summary_dataframe = sqlContext.sql(query_final)
    summary_dataframe.show(summary_dataframe.count())
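    # A hedged alternative to the long CASE expression above: derive the same ten age
    # buckets arithmetically with the DataFrame API. Boundary handling differs slightly
    # at the exact bucket edges (the CASE uses <=), so treat this as a sketch rather
    # than a drop-in replacement.
    from pyspark.sql.functions import floor, col, least, lit, sum as sum_
    bucketed_df = aadhaar_df.withColumn(
        'age_bucket',
        least(floor((col('age') - lower_limit) / bucket_size), lit(9).cast('bigint')))
    bucketed_df.groupBy('age_bucket') \
        .agg((100.0 * sum_('aadhaar_generated') /
              (sum_('aadhaar_generated') + sum_('rejected'))).alias('percentage_summary')) \
        .orderBy('age_bucket') \
        .show()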

    ## unpersist elements from memory (optional)
    top_3_females_dataframe.unpersist()
    top_3_males_dataframe.unpersist()
    sqlContext.uncacheTable('aadhaar_card_table')
    sqlContext.uncacheTable('aadhaar_male_female_table')
    sqlContext.uncacheTable('aadhaar_male_female_prctnge_table')
    # Importing dataframe from HDFS with datasetnames
    #datasets = sqlContext.read.format("csv").option("header", "false").option("delimiter", "\t").load(inputDirectory + "datasets.tsv")
    # List of dataset file names
    #dataList = [str(row._c0) for row in datasets.select('_c0').collect()]
    # Iteration over dataframes begins bu using dataframe file names
    #for filename in dataList[0:4]:
    df = sqlContext.read.format("csv").option("header",
        "true").option("inferSchema", "true").option("delimiter",
        "\t").load("dummy_set.tsv")

    #col_names = df.columns
    #print(col_names)

    df.createOrReplaceTempView("df")
    sqlContext.cacheTable("df")
    df_result = sqlContext.sql("select * from df")
    df_result.show()

    # Copy of the jsonSchema for current iteration 
    outJSON = jsonSchema.copy()

    for col, dtype in df.dtypes: # change to map function later
      
      if dtype == 'integer':
        intJSON = intSchema.copy()
        intJSON["column_name"] = col
        intJSON = fill_numeric_json(name, df, dtype, intJSON)

        outJSON["columns"].append(intJSON)
Exemple #31
0
sqlContext = SQLContext(sc)


# In[4]:

FOLLCOUNT = sqlContext.read.parquet("FOLLOWERS.PARQUET")


# In[5]:

FOLLCOUNT.registerTempTable("FOLLCOUNT");


# In[20]:

sqlContext.cacheTable("FOLLCOUNT")
Q1 = sqlContext.sql("SELECT * FROM FOLLCOUNT WHERE F_COUNT > 1000000")


# In[21]:

F1 = Q1.toPandas()
F1=F1.set_index('USER_DISPLAY_NAME')


# In[22]:

from pylab import rcParams
import pylab
rcParams['figure.figsize'] = 20,10
Line=F1.plot(kind='line',title='Count by followers of users',stacked=False)
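toPandas() pulls every matching row onto the driver, so for a large FOLLCOUNT table it can help to sort and cap the result before converting. A small hedged variant, reusing the column names from the query above:

Q1_top = sqlContext.sql(
    "SELECT USER_DISPLAY_NAME, F_COUNT FROM FOLLCOUNT "
    "WHERE F_COUNT > 1000000 ORDER BY F_COUNT DESC LIMIT 50")
F1_top = Q1_top.toPandas().set_index('USER_DISPLAY_NAME')
F1_top.plot(kind='bar', title='Top accounts by follower count')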
  td.timezone AS dest_timezone,
  to.timezone AS origin_timezone,
  TO_UTC_TIMESTAMP(DATE_TRUNC('hour', TO_TIMESTAMP(CONCAT(fl_date, ' ', LPAD(crs_dep_time, 4, '0')), 'yyyy-MM-dd HHmm')), to.timezone) AS truncated_crs_dep_time_utc,
  TO_UTC_TIMESTAMP(DATE_TRUNC('hour', TO_TIMESTAMP(CONCAT(fl_date, ' ', LPAD(crs_dep_time, 4, '0')), 'yyyy-MM-dd HHmm')), to.timezone) - INTERVAL 3 HOURS AS truncated_crs_dep_minus_three_utc,
  TO_UTC_TIMESTAMP(TO_TIMESTAMP(CONCAT(fl_date, ' ', LPAD(crs_dep_time, 4, '0')), 'yyyy-MM-dd HHmm'), to.timezone) AS crs_dep_time_utc,
  TO_UTC_TIMESTAMP(TO_TIMESTAMP(CONCAT(fl_date, ' ', LPAD(crs_dep_time, 4, '0')), 'yyyy-MM-dd HHmm'), to.timezone) - INTERVAL 2 HOURS 15 MINUTES AS crs_dep_minus_two_fifteen_utc,
  TO_UTC_TIMESTAMP(TO_TIMESTAMP(CONCAT(fl_date, ' ', LPAD(crs_arr_time, 4, '0')), 'yyyy-MM-dd HHmm'), to.timezone) AS crs_arr_time_utc
FROM airlines_temp AS f
LEFT JOIN city_state_timezone AS td ON
  f.short_dest_city_name = td.city_state
LEFT JOIN city_state_timezone AS to ON
  f.short_orig_city_name = to.city_state
""")

# make sure view is cached for subsequent operations
sqlContext.cacheTable("airlines")

# reassign airlines df to derived view and cache it
airlines = sqlContext.sql("SELECT * FROM airlines").cache()

# COMMAND ----------

# MAGIC %md
# MAGIC #### Weather

# COMMAND ----------

# add airport code to weather data to facilitate join
# filter to only records with valid airport code
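The code for that join is cut off in this excerpt. A minimal sketch of the idea, where weather_temp, station_to_airport, station_id and airport_code are all assumed names rather than ones taken from the original notebook:

# hedged sketch only -- the table and column names below are assumptions
weather_with_airport = sqlContext.sql("""
SELECT w.*, m.airport_code
FROM weather_temp AS w
JOIN station_to_airport AS m
  ON w.station_id = m.station_id
WHERE m.airport_code IS NOT NULL
""")
weather_with_airport.createOrReplaceTempView("weather")
sqlContext.cacheTable("weather")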

Exemple #33
0
    # get command-line arguments
    inFile = sys.argv[1]
    supp = sys.argv[2]
    conf = sys.argv[3]
    prot = sys.argv[4]

    print ("Executing HW2SQL with input from " + inFile + ", support=" +supp + ", confidence=" + conf + ", protection=" + prot)

    pp_schema = StructType([
            StructField("uid", IntegerType(), True),
            StructField("attr", StringType(), True),
            StructField("val", IntegerType(), True)])

    Pro_Publica = sqlContext.read.format('csv').options(header=False).schema(pp_schema).load(inFile)
    Pro_Publica.createOrReplaceTempView("Pro_Publica")
    sqlContext.cacheTable("Pro_Publica")

    # compute frequent itemsets of size 1, store in F1(attr, val)
    query = "select attr, val, count(*) as supp \
              from Pro_Publica \
              group by attr, val \
             having count(*) >= "  + str(supp)
    F1 = spark.sql(query)
    F1.createOrReplaceTempView("F1")
    #F1.show()
    
    query = "select * from Pro_Publica where attr != 'vdecile' "
    Pro_Publica_Attr = spark.sql(query)
    Pro_Publica_Attr.createOrReplaceTempView("Pro_Publica_Attr")

    query = "select * from Pro_Publica where attr = 'vdecile'"
Exemple #34
0
                                        airline=p[1], 
                                        GeoSummary=p[5], 
                                        GeoRegion=p[6],
                                        PriceCategory=p[8],
                                        passengercount=int(p[11]),
                                        Year=int(p[14]),
                                        Month=p[15],
                                    )
                         )

# Apply Row schema
euroSchema = sqlContext.createDataFrame(euroRows)

# Register 'euroRegion' table with 7 columns: airline, summary, region, price category, passenger count, year, month
euroSchema.registerTempTable("euroRegion")
sqlContext.cacheTable("euroRegion")

# average passenger count of airline in the month of june in euro region
passengerCountbyEachMonth = sqlContext.sql("SELECT airline, AVG(passengercount) FROM euroRegion WHERE Month = 'June' GROUP BY airline").collect()

# print the results
print passengerCountbyEachMonth

# average passenger count of virgin atlantic airline in the every year
avgPassengerCountVirginAirline = sqlContext.sql("SELECT Year, AVG(passengercount) FROM euroRegion WHERE airline='Virgin Atlantic' GROUP BY Year").collect()

print avgPassengerCountVirginAirline

# average passenger count of united airlines airline in the every year
avgPassengerCountUnitedAirline = sqlContext.sql("SELECT Year, AVG(passengercount) FROM euroRegion WHERE airline='United Airlines' GROUP BY Year").collect()
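The same per-year averages can also be expressed once through the DataFrame API instead of one SQL string per airline; a hedged sketch against the euroSchema frame defined above:

from pyspark.sql.functions import avg

euroSchema.filter(euroSchema.airline.isin('Virgin Atlantic', 'United Airlines')) \
    .groupBy('airline', 'Year') \
    .agg(avg('passengercount').alias('avg_passenger_count')) \
    .orderBy('airline', 'Year') \
    .show()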
Exemple #35
0
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, Row


conf = SparkConf().setAppName('TriWordCount')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

trigrams = sc.textFile('hdfs:///users/rocks1/12307130174/spark_probabilities_smoothed01/*')

trigrams = trigrams.map(lambda line: eval(line)) \
                        .map(lambda t: Row(word0 = t[0][0], word1=t[0][1], word2=t[0][2], prob=t[1]))

schemaTrigram= sqlContext.createDataFrame(trigrams)
schemaTrigram.registerTempTable("trigram")

sqlContext.cacheTable("trigram")
#schemaTrigram.cache()

import socket
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

s.bind(("",54899))
s.listen(5)

while True:
    #word0, word1 = raw_input(">").split()
    print "in loop"
    client, _ = s.accept()
    print "acccpeted"
    recved = client.recv(1024)
    print "recived"