def convert_to_line(json_list):
    """Serialize each element of *json_list* as one JSON document per line.

    Parameters
    ----------
    json_list : iterable of JSON-serializable objects (typically dicts).

    Returns
    -------
    str
        Newline-terminated JSON-lines string ("" for empty input).
    """
    # join() builds the result in a single pass; the original appended with
    # += in a loop (quadratic) and also print()-ed the entire payload — a
    # debug leftover removed here.
    return "".join(json.dumps(item) + "\n" for item in json_list)


def parse_json(json_data, sc):
    """Convert a list of JSON objects into a Spark DataFrame.

    Parameters
    ----------
    json_data : list of JSON-serializable objects (e.g. dicts from an API).
    sc : SparkContext used to parallelize the JSON lines.

    Returns
    -------
    A DataFrame with schema inferred from the JSON lines.

    NOTE(review): this reads the module-level ``sqlContext`` created in the
    __main__ block below — confirm this global coupling is intended.
    """
    # splitlines() already returns a list; the original copied it element by
    # element into a second list before parallelizing, which added nothing.
    lines = convert_to_line(json_data).splitlines()
    rdd = sc.parallelize(lines, 8)
    return sqlContext.read.json(rdd)


if __name__ == '__main__':
    sprk = Spark_Session()
    conn = sprk.Spark_Context()
    sql_conn = sprk.Spark_Connect()
    sqlContext = SQLContext(conn)
    # Fetch users with id > 100 from the public GitHub API:
    # https://api.github.com/users?since=100
    with urllib.request.urlopen("https://api.github.com/users?since=100") as url:
        # NOTE(review): ``parse`` is not defined in this file — presumably
        # json.loads (or an equivalent alias); verify the missing import.
        data = parse_json(parse(url.read().decode("utf-8")), conn)
        data.show()
from pyspark import Row
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.sql.context import SQLContext
import pandas as pd
import re
from pyspark.sql.functions import *
from operator import add
from Pycrypto import Spark_Session


def sum_cal(v):
    """Return the sum of the numbers in *v* as a float (0.0 for empty input).

    Parameters
    ----------
    v : iterable of numbers.

    Returns
    -------
    float
        Left-fold sum starting from 0.0, exactly like the original loop.
    """
    # The original hand-rolled the accumulation and shadowed the builtin
    # ``sum`` with a local variable; the builtin performs the identical
    # left fold (0.0 start preserves the float result and empty-input 0.0).
    return sum(v, 0.0)


if __name__ == '__main__':
    sprk = Spark_Session()
    conn = sprk.Spark_Context()
    # Each line of emp.txt is CSV; keep (column 5, column 4 as float) pairs.
    # NOTE(review): column meanings are assumed from the indices — confirm
    # against the actual emp.txt schema.
    lines = conn.textFile("/Users/shuvamoymondal/Downloads/emp.txt").map(
        lambda v: v.split(",")).map(lambda g: (g[5], float(g[4])))
    print(lines.collect())
    # Maximum of the float column across all rows.
    v = lines.map(lambda h: h[1]).max()
    print(v)