Code Example #1
"""SimpleApp1.py"""
from pyspark import SparkContext

logFile = "/home/tibil/Downloads/spark-2.4.4-bin-hadoop2.7/README.md"  # Should be some file on your system
sc = SparkContext("local", "ps")  # master URL and application name
logData = sc.textFile(logFile).cache()
numAs = logData.filter(lambda s: 'a' in s).count()
numBs = logData.filter(lambda s: 'b' in s).count()
print("Lines with a: %i, lines with b: %i" % (numAs, numBs))
print("Lines with a: %i, lines with b: %i" % (numAs, numBs))
Code Example #2
    image = np.concatenate(images, axis=0)
    return image


# Number of blocks per side; each image is split into Blocks * Blocks tiles
Blocks = 8

# Threshold of the edge map
T = 50

# Size of the filter and the number of border pixels to extend on each side
filterSize = 3
numExt = (filterSize - 1) // 2

# Get an instance of SparkContext
sc = SparkContext()

# Obtain an RDD of the image files stored in HDFS
hdfsDirectory = 'hdfs://localhost:9000/SampleImages/'
rdd = sc.binaryFiles(hdfsDirectory + '*')

# Decoding the images -- file_params (fileName, binary)
rdd = rdd.map(lambda file_params: (
    file_params[0],
    cv2.imdecode(np.asarray(bytearray(file_params[1]), dtype=np.uint8), 1)))

# file_params (fileName, img) -> file_params (i, (fileName, img))
rdd = rdd.flatMap(lambda file_params: extendVertical(file_params))

# file_params (i, (fileName, img)) -> file_params ((i,j),(fileName, img))
rdd = rdd.flatMap(lambda file_params: extendHorizontal(file_params))
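The helpers extendVertical and extendHorizontal are not included in the snippet. A minimal sketch of what extendVertical might look like, assuming it splits each image into Blocks overlapping horizontal strips keyed by the strip index (extendHorizontal would do the same along the width, producing the ((i, j), ...) keys); the implementation below is an assumption, not code from the original file:

# Hypothetical helper (assumed, not from the snippet): split an image into
# `Blocks` horizontal strips, each padded by `numExt` extra rows so a filter
# of size `filterSize` can run across strip borders.
def extendVertical(file_params):
    fileName, img = file_params
    height = img.shape[0]
    step = height // Blocks
    pieces = []
    for i in range(Blocks):
        top = max(i * step - numExt, 0)
        bottom = min((i + 1) * step + numExt, height)
        pieces.append((i, (fileName, img[top:bottom])))
    return pieces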
Code Example #3
    #

    # for row in result.collect():
    #     print(row)
    #     #t, vector = row
    #     #print("Text: [%s] => \nVector: %s\n" % (", ".join(text), str(vector)))
    #
    #vec = model.transform("batman")
    #model


if __name__ == "__main__":
    os.chdir("D:\\trentsemester2\\bigData\\the-movies-dataset")
    sparkconf = SparkConf().setAppName("movie").setMaster("local[*]")
    sparkcont = SparkContext(conf=sparkconf)
    sparkcont.setLogLevel("ERROR")
    sqlContext = SQLContext(sparkcont)
    pandas_df = getMovieData()
    df = sqlContext.createDataFrame(pandas_df)
    movie_wordcloud(df)
    #story_df = pd.read_csv("movies_metadata.csv")
    getStory(pandas_df)
    top_25(df)
    print("Top 15 romance movies")
    build_chart(df=df, genre='Romance').limit(15).show()
    #word2vec_model()
    top10 = als_model(8, df)
    ls_sentiment_model = sentiment_model()
    predict_sentiment_df(top10, ls_sentiment_model)
    print("End of Project")
Code Example #4
import findspark
findspark.init()
import time
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SparkSession as ss
from pyspark.sql.types import *
conf = SparkConf()
conf.setMaster("spark://Sarthaks-MBP:7077").setAppName('IPL Analytics Job').set("spark.executor.memory", "512m")
spark = SparkContext(conf=conf)
a = (spark.textFile("Dataset/*.csv")
          .map(lambda line: line.split(","))
          .filter(lambda line: line[0].strip() == "ball")
          .collect())
player_vs_player={}
for line in a:
	details = line
	if details[0].strip()=='ball':
		players1=(details[4],details[6])
		players2=(details[5],details[6])

		if players1 in player_vs_player.keys():
			player_vs_player[players1]['total']+=int(details[7])
			player_vs_player[players1]['runs'][int(details[7])]+=1
			player_vs_player[players1]['balls']+=1
			if details[9]!='""' and details[9]!='run out' and players1[0].strip()==details[10].strip():
				player_vs_player[players1]['wickets']+=1
		else:
			player_vs_player[players1]={}
			player_vs_player[players1]['total']=int(details[7])
			player_vs_player[players1]['runs']=[0,0,0,0,0,0,0,0]
			player_vs_player[players1]['wickets']=0
			player_vs_player[players1]['balls']=1
			if details[9]!='""' and details[9]!='run out' and players1[0].strip()==details[10].strip():
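Collecting every delivery to the driver and aggregating in a plain Python loop works for small datasets, but the same per-pair totals can be computed with RDD operations and stay distributed. A sketch of that alternative (not the original author's approach), reusing the spark context and the column indices from the code above:

# Sum the runs (column 7) for each (column 4, column 6) player pair,
# matching the players1 key built in the loop above.
pair_runs = (spark.textFile("Dataset/*.csv")
                  .map(lambda line: line.split(","))
                  .filter(lambda cols: cols[0].strip() == "ball")
                  .map(lambda cols: ((cols[4], cols[6]), int(cols[7])))
                  .reduceByKey(lambda a, b: a + b))
print(pair_runs.take(5))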
Code Example #5
import os
import sys
assert sys.version_info >= (3, 5)
from pyspark import SparkContext
from pyspark.sql import SparkSession, SQLContext, functions, types


def main(input):
    business = spark.read.json(input).repartition(80)
    split_col = functions.split(business['categories'], ',')
    business = business.withColumn("categories",split_col).filter(business["city"] != "").dropna()
    business.createOrReplaceTempView("business")

    b_etl = spark.sql("SELECT business_id, name, city, state, latitude, longitude, stars, review_count, is_open, categories, attributes FROM business").cache()
    b_etl.createOrReplaceTempView("b_etl")
    outlier = spark.sql("SELECT b1.business_id, SQRT(POWER(b1.latitude - b2.avg_lat, 2) + POWER(b1.longitude - b2.avg_long, 2)) as dist FROM b_etl b1 INNER JOIN (SELECT state, AVG(latitude) as avg_lat, AVG(longitude) as avg_long FROM b_etl GROUP BY state) b2 ON b1.state = b2.state ORDER BY dist DESC")
    outlier.createOrReplaceTempView("outlier")
    joined = spark.sql("SELECT b.* FROM b_etl b INNER JOIN outlier o ON b.business_id = o.business_id WHERE o.dist<10")
    joined.write.parquet("yelp-etl/business_etl", mode = "overwrite")




if __name__ == '__main__':
    data_path = os.getcwd()+"/yelp-dataset/"
    Business_filepath = data_path + 'yelp_academic_dataset_business.json'
    sc = SparkContext(appName="Yelp")
    sqlContext = SQLContext(sc)
    spark = SparkSession.builder.appName('reddit average').getOrCreate()
    assert spark.version >= '2.3'
    main(Business_filepath)
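The driver above builds three separate entry points (a SparkContext, an SQLContext and a SparkSession). Assuming nothing outside the shown fragment needs the standalone SQLContext, a single SparkSession covers all of them; a minimal sketch of that alternative, not the author's code:

spark = SparkSession.builder.appName("Yelp").getOrCreate()
sc = spark.sparkContext  # the underlying SparkContext, still available if needed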