def main(input_dir, result_path):
    conf = SparkConf().setMaster("yarn-client").setAppName("avg") \
        .set('spark.executor.memory', '4G') \
        .set('spark.driver.memory', '4G') \
        .set('spark.driver.maxResultSize', '4G')
    sc = SparkContext(conf=conf)

    sqlContext = sql.SQLContext(sc)
    with open(result_path, "a") as f:

        for file in listdir(input_dir):
            # Sum the numeric second column, count the rows, and append
            # "<file> <sum> <count> <average>" to the result file.
            with open(input_dir + "/" + file) as in_f:
                lines = in_f.read().splitlines()

            rdd = sc.parallelize(lines)
            row_rdd = rdd.map(lambda line: line.split(",")).filter(
                lambda line: len(line) == 2)
            total = row_rdd.map(lambda line: float(line[1])).sum()
            count = row_rdd.count()

            f.write(file + " " + str(total) + " " + str(count) + " " +
                    str(total / count) + "\n")
Example #2
def consumer():
	conf = SparkConf().set("spark.jars", "/home/tammy/Downloads/spark-streaming-kafka-0-8-assembly_2.11-2.4.4.jar")
	sc = SparkContext(conf=conf)
	ssc = StreamingContext(sc, 5)
	print("PROGRAM STARTING!!!!!!!!!")
	print("PROGRAM STARTING!!!!!!!!!")

	sqlContext = sql.SQLContext(sc)
	directKafkaStream = KafkaUtils.createDirectStream(ssc, ["sparky"], {"metadata.broker.list":
"localhost:9091"})
	lines = directKafkaStream.map(lambda x: x[1])
	line_list = []

	def makeIterable(rdd):
		# Collect each micro-batch to the driver, strip whitespace and newlines,
		# parse the first collected JSON payload and flatten it into a pandas DataFrame.
		for x in rdd.collect():
			print(x)
			line_list.append(x)
			strippedlist = [sub.replace('\n', '').replace('\r', '').replace(' ', '') for sub in line_list]
			dic = json.loads(strippedlist[0])
			flattened_list = [flatten(dic)]
			df = pd.DataFrame(flattened_list)
			print(df)


	lines.foreachRDD(makeIterable)


	ssc.start()
	ssc.awaitTermination()
Example #3
def get_sqlcontext_instance(spark_context):
    """
    :type spark_context: pyspark.SparkContext
    :param spark_context: The currently active Spark Context
    :return: Returns the SQLContext
    :rtype: sql.SQLContext
    """
    if 'sqlContextSingletonInstance' not in globals():
        globals()['sqlContextSingletonInstance'] = sql.SQLContext(
            spark_context)
    return globals()['sqlContextSingletonInstance']
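# A brief usage sketch for the singleton helper above (a hypothetical foreachRDD
# callback, not part of the original example): reuse a single SQLContext on the
# driver instead of creating a new one for every batch.
def example_process_batch(time, rdd):
    sql_context = get_sqlcontext_instance(rdd.context)
    return sql_context.createDataFrame(rdd)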
def Spark_read_write_csv_to_hdfs(inputType, fileList, outDirectory):
    sc = SparkContext(appName="DATA-local-to-HDFS")
    # Set output replication factor to 1
    sc._jsc.hadoopConfiguration().set("dfs.replication", "1")
    sqlContext = sql.SQLContext(sc)
    for filename in fileList:
        print 'Reading ' + 'file://' + filename
        rddFrame1 = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true') \
        .load('file://' + filename)
        #rddFrame1.coalesce(1).write.format('com.databricks.spark.csv').save(outDirectory+filename[len(filename)-33:])
        rddFrame1.write.format('com.databricks.spark.csv').save(
            outDirectory + filename[len(filename) - 33:])
        print 'Writing ' + outDirectory + filename[len(filename) -
                                                   33:] + ' done!'
    sc.stop()
def main(data_paths: list):
	print(data_paths)

	print('Creating spark configs.')
	config = SparkConf().setAppName("Spark configuration").setMaster(cluster_address)

	print('Setting spark context and sql context.')
	spark_context = SparkContext(conf=config)
	spark_sql_context = sql.SQLContext(spark_context)

	print('Loading data into Spark.')
	dataframe = spark_sql_context.read.option('header', 'true').csv(data_paths)

	print('Saving data in Parquet format.')
	dataframe.write.mode("overwrite").parquet("/data/compressed.parquet")
Example #6
def main():
    input_file, output_file = sys.argv[1], sys.argv[2]
    sc = SparkContext('local[*]', 'task1')
    sc.setLogLevel("ERROR")
    rdd = sc.textFile(input_file).map(lambda x: x.split()).cache()

    vertices_list = list(set(rdd.flatMap(set).collect()))
    edges_list = rdd.map(tuple).distinct().collect()
    edges_list_undirected = set()

    for i in range(len(vertices_list)):
        vertices_list[i] = tuple([vertices_list[i]])

    for edge in edges_list:
        if edge not in edges_list_undirected:
            edges_list_undirected.add(edge)
        edge2 = (edge[1], edge[0])
        if edge2 not in edges_list_undirected:
            edges_list_undirected.add(edge2)

    sqlContext = sql.SQLContext(sc)
    vertices = sqlContext.createDataFrame(vertices_list, ["id"])
    edges = sqlContext.createDataFrame(edges_list_undirected, ["src", "dst"])
    g = GraphFrame(vertices, edges)
    result = g.labelPropagation(maxIter=5)
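    # labelPropagation above iteratively reassigns each vertex the most frequent
    # label among its neighbours; vertices sharing a final label form a community.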

    community_list = result.select("id", "label").collect()
    communities = {}
    for community in community_list:
        if community.label not in communities:
            communities[community.label] = []
        communities[community.label].append(community.id)

    communities_res = {}
    for c, ids in communities.items():
        if len(ids) not in communities_res:
            communities_res[len(ids)] = []
        ids_str = "', '".join(sorted(ids))
        ids_str = "'{}'".format(ids_str)
        communities_res[len(ids)].append(ids_str)

    with open(output_file, "w") as f:
        for k in sorted(communities_res):
            for id in sorted(communities_res[k]):
                f.write(id + "\n")
Example #7
def consumer():
    conf = SparkConf().set(
        "spark.jars",
        "/home/fielemployee/spark-streaming-kafka-0-8-assembly_2.11-2.4.4.jar")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 5)
    print("PROGRAM STARTING!!!!!!!!!")
    print("PROGRAM STARTING!!!!!!!!!")

    sqlContext = sql.SQLContext(sc)
    directKafkaStream = KafkaUtils.createDirectStream(
        ssc, ["kafka_spark"], {"metadata.broker.list": "localhost:9094"})
    lines = directKafkaStream.map(lambda x: x[1])

    print("LINES START!!!")
    print("LINES START!!!")
    print("LINES START!!!")
    print("LINES START!!!")
def main(tweet_path, dataset_path):
    conf = SparkConf().setMaster("local").setAppName("Test")
    sc = SparkContext(conf=conf)

    sqlContext = sql.SQLContext(sc)

    # join
    #senti_score = sc.textFile("/user/ja3802/geo-data/*").map(lambda line: (line.split(",")[0], float(line.split(",")[1])))
    senti_score = sc.textFile(dataset_path).map(lambda line: line.split(","))\
        .filter(lambda line: len(line) == 2).map(lambda line: (line[0], float(line[1])))

    #$senti_score.saveAsTextFile("err")
    geodata = sc.textFile(tweet_path).map(lambda line: toKV(line))
    fulldata = geodata.join(senti_score)
    #fulldata.saveAsTextFile("fulldata")

    timeKey = fulldata.map(lambda json: getTime(json[1]))
    time_df = timeKey.toDF(["time", "geo", "like", "rt", "score"])
    time_df.write.partitionBy("time").json("heatmapdata")
Example #9
def main():
	# parser = argparse.ArgumentParser(description="Read file contents from S3")
	# parser.add_argument("bucket", type=str, help="S3 Bucket name")
	# parser.add_argument("key", type=str, help="S3 Key path and name")
	# args = parser.parse_args()

	# config = ConfigParser.ConfigParser()
	# config.read(os.environ['HOME'] + '/.aws/credentials')
	# access_key = config.get('default', 'aws_access_key_id')
	# secret_key = config.get('default', 'aws_secret_access_key')

	# conn = boto.connect_s3(
	#         aws_access_key_id = access_key,
	#         aws_secret_access_key = secret_key,
	#         #is_secure=False,               # uncomment if you are not using ssl
	#         )   
	# bucket = conn.get_bucket(args.bucket)

	# #key = public/growth/staging/silver/third_party/facebook/ad_set/2017-04-12.avro
	# key = Key(bucket, args.key)
	# print(key.get_contents_as_string())

	# df = SQLContext.read.format("com.databricks.spark.avro").load("src/test/resources/episodes.avro")

	# #  Saves the subset of the Avro records read in
	# subset = df.where("doctor > 5")
	# subset.write.format("com.databricks.spark.avro").save("/tmp/output")

	conf = SparkConf()
	conf.setMaster('local')
	conf.setAppName('SQLApiDemo')
	sc = SparkContext(conf = conf)
	print sc.version

	sqlContext = sql.SQLContext(sc)

	sqlContext.sql("CREATE TEMPORARY TABLE table_name USING com.databricks.spark.avro OPTIONS (path '/Users/ridshakeel/Downloads/2017-04-24.avro')")
	df = sqlContext.sql("SELECT COUNT(*) FROM table_name")
	df.collect()
	df = sqlContext.read.format("com.databricks.spark.avro").load("/Users/tariq/avro_data/browser.avro/")
Example #10
def main():


    spark = SparkSession.builder.appName("TRAFFIC").config("spark.executor.cores", "4").config("spark.executor.memory", "4g").getOrCreate()
    sc = spark.sparkContext
    mapping = sc.textFile("s3a://insighttraffic/ML_model/mappings").collect()[0]
    mapping = ast.literal_eval(str(mapping))

    models=[]
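    # Load one pre-trained linear regression model per hour of the day (0-23).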
    for hour in range(0, 24):
        model = LinearRegressionModel.load(sc, "s3a://insighttraffic/ML_model/linear_model_log_"+str(hour))
        models.append(model)

    category_len = 154

    sqlContext = sql.SQLContext(sc)


    hadoop_conf=sc._jsc.hadoopConfiguration()
    hadoop_conf.set("fs.s3n.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    hadoop_conf.set("fs.s3n.awsAccessKeyId", 'awsAccessKeyId')
    hadoop_conf.set("fs.s3n.awsSecretAccessKey", 'awsSecretAccessKey')

    # set microbatch interval as 10 seconds, this can be customized according to the project
    ssc = StreamingContext(sc,10)
    # directly receive the data under a certain topic
    kafkaStream = KafkaUtils.createDirectStream(ssc, ['data'], {"metadata.broker.list": 'Kafka-DNS:9092'})


    connection = psycopg2.connect(host = 'postgres-ip-address', database = 'postgres', user = '******', password = '******')
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE IF NOT EXISTS realtimetraffic (sid text, location text, latitude double precision, longitude double precision,\
        direction text, lanes integer, roadtype text, highway text, current integer, historical double precision, level text, PRIMARY KEY (sid));')
    cursor.execute('SELECT AddGeometryColumn (%s,%s,%s,4326,%s,2);', ('public', 'realtimetraffic', 'geom', 'POINT'))


    #The inbound stream is a DStream
    dstream = kafkaStream.map(lambda kv: json.loads(kv[1]))
    dstream.foreachRDD(lambda rdd: update(rdd, models, mapping))
Example #11
def test_spark_transformation(spark_context, mocker):
    """ test that a single event is categorized correctly
    Args:
        spark_context: test fixture SparkContext
        sql_context: test fixture SqlContext
    """

    sqlContext = sql.SQLContext(spark_context)

    # Mocking the message coming from Kafka
    mocker.patch(
        'processor.spark_processor_refactored.read_from_kafka',
        return_value=spark_context.parallelize([
            Row(value='{"event_id": "141b3ff2a92111ebbfae367ddad5b1fa", '
                '"account_id": "684", "event_type": "other", '
                '"device": "ANDROID", "location_country": "FR", '
                '"event_timestamp": "1619724510"}')
        ]).toDF())

    # Mocking the connection with MySQL
    mocker.patch('processor.spark_processor_refactored.read_from_mysql',
                 return_value=spark_context.parallelize(
                     [Row(account_no='684', user_device='ANDROID')]).toDF())

    # Spark transformation result dataframe
    result = s.transform().collect()

    # Expected result
    expected_result = [
        Row(event_id='141b3ff2a92111ebbfae367ddad5b1fa',
            account_id=684,
            event_type='other',
            device='ANDROID',
            location_country='FR',
            event_timestamp=datetime.datetime(2021, 4, 29, 12, 28, 30),
            status='good')
    ]

    assert result == expected_result
Example #12
from pyspark import sql, SparkConf, SparkContext

conf = SparkConf().setAppName("task_1")
sc = SparkContext(conf=conf)
sqlContext = sql.SQLContext(sc)

df_albums = sqlContext.read.csv("albums.csv")\
  .withColumnRenamed("_c0", "id")\
  .withColumnRenamed("_c1", "artist_id")\
  .withColumnRenamed("_c2", "album_title")\
  .withColumnRenamed("_c3", "genre")\
  .withColumnRenamed("_c4", "year_of_pub")\
  .withColumnRenamed("_c5", "num_of_tracks")\
  .withColumnRenamed("_c6", "num_of_sales")\
  .withColumnRenamed("_c7", "rolling_stone_critic")\
  .withColumnRenamed("_c8", "mtv_critic")\
  .withColumnRenamed("_c9", "music_maniac_critic")

df_artists = sqlContext.read.csv("artists.csv")\
  .withColumnRenamed("_c0", "id")\
  .withColumnRenamed("_c1", "real_name")\
  .withColumnRenamed("_c2", "art_name")\
  .withColumnRenamed("_c3", "role")\
  .withColumnRenamed("_c4", "year_of_birth")\
  .withColumnRenamed("_c5", "country")\
  .withColumnRenamed("_c6", "city")\
  .withColumnRenamed("_c7", "email")\
  .withColumnRenamed("_c8", "zip_code")

a = df_artists.select("id").distinct().count()
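# An equivalent, more compact way to attach the column names (a sketch using the
# same file and column order as above) is DataFrame.toDF:
df_albums_named = sqlContext.read.csv("albums.csv").toDF(
    "id", "artist_id", "album_title", "genre", "year_of_pub", "num_of_tracks",
    "num_of_sales", "rolling_stone_critic", "mtv_critic", "music_maniac_critic")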
Example #13
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, SparkSession
from pyspark import sql

# Create the spark session
spark = SparkSession \
    .builder \
    .appName("Vertica Connector Pyspark Example") \
    .getOrCreate()
spark_context = spark.sparkContext
sql_context = sql.SQLContext(spark_context)

# The name of our connector for Spark to look up
format = "com.vertica.spark.datasource.VerticaSource"

# Set connector options based on our Docker setup
host="vertica"
user="******"
password=""
db="docker"
staging_fs_url="webhdfs://hdfs:50070/data/"
table="pysparktest"

# Define data to write to Vertica
columns = ["language","users_count"]
data = [("Java", "20000"), ("Python", "100000"), ("Scala", "3000")]
# Create an RDD from the data
rdd = spark_context.parallelize(data)
# Convert the RDD to a DataFrame
df = rdd.toDF(columns)
# Write the DataFrame to the Vertica table pysparktest
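# The original example stops at the comment above; a minimal sketch of that write,
# assuming the connector format and the options defined earlier in this snippet
# (host, user, password, db, staging_fs_url, table), might look like this:
df.write.mode("overwrite") \
    .format(format) \
    .option("host", host) \
    .option("user", user) \
    .option("password", password) \
    .option("db", db) \
    .option("staging_fs_url", staging_fs_url) \
    .option("table", table) \
    .save()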
Example #14
from pyspark import SparkConf, SparkContext, sql

if __name__ == '__main__':
    conf = SparkConf().setAppName("app")
    sc = SparkContext(conf=conf)

    spark = sql.SparkSession \
        .builder \
        .appName("TEST") \
        .getOrCreate()

    sql_context = sql.SQLContext(sc, spark)
    filename = 'admitware/test.parquet4'
    s3_uri = 's3a://nu-data-lake-test/{}'.format(filename)
    print(s3_uri)
    df = sql_context.createDataFrame([('1', '4'), ('2', '5'), ('3', '6')],
                                     ["A", "B"])
    df.write.parquet(s3_uri)
    # df.write.parquet("s3a://nu-data-lake-test/admitware/test.parquet",mode="overwrite")

# spark.stop()
Example #15
def initializeSQLContext(sc):
    """
        Creates and returns a SQL context from the Spark context.
    """
    return sql.SQLContext(sc)
Example #16
from pyspark import sql, SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext, Row
from pyspark.sql import HiveContext
from pyspark.sql import types
import json
import csv
from json import loads
from time import sleep

sc = SparkContext()
hc = HiveContext(sc)
SparkContext.setSystemProperty("hive.metastore.uris", "thrift://nn1:9083")
ssc = StreamingContext(sc, 5)
sqlc = sql.SQLContext(sc)
directKafkaStream = KafkaUtils.createDirectStream(
    ssc, ["kafka_spark"],
    {"metadata.broker.list": "sandbox-hdp.hortonworks.com:6667"})
lines = directKafkaStream.map(lambda x: x[1])

#Create Spark session with Hive supported.
appName = "PySpark Hive Example"
master = "sandbox-hdp.hortonworks.com"
ss = SparkSession.builder \
    .appName(appName) \
    .config("spark.sql.warehouse.dir", "/warehouse/tablespace/managed/hive") \
    .getOrCreate()

print("LINES START!!!")
print("LINES START!!!")
Example #17
#for value in dfPostComment:
#    print(value)
#print("---------")
#output=dataFile.collect()
#for value in dfPostView:
#    print(value)
#print("-----------")


dataFile2=sc.textFile("/home/aisenur/Datasets/Tagsnew")
header2=dataFile2.first()

dataFile2=dataFile2.filter(lambda x: x!=header2)
dataFile2=dataFile2.map(lambda x: x.split(" "))

sqlContext1 = sql.SQLContext(sc)
dfPostFav = sqlContext1.createDataFrame(dfPostFav, ["postTag1", "FavoriteCount"])
#dfPostFav.show()
tf=dfPostFav.alias('tf')

sqlContext1 = sql.SQLContext(sc)
dfPostComment = sqlContext1.createDataFrame(dfPostComment, ["postTag2", "CommentCount"])
#dfPostComment.show()
tc=dfPostComment.alias('tc')

sqlContext1 = sql.SQLContext(sc)
dfPostView = sqlContext1.createDataFrame(dfPostView, ["postTag3", "ViewCount"])
#dfPostView.show()
tv=dfPostView.alias('tv')

join_post = tf.join(tc, tf.postTag1 == tc.postTag2, how='left').select([col('tc.' + xx) for xx in tc.columns])
Example #18
    def run(self, inputType, fileList, outDirectory):
        sc = SparkContext(appName="ALU Application")
        sqlContext = sql.SQLContext(sc)
        outputName = outDirectory + "result_group_by_" + self.groupby + "_ALU_2017_spark_" + inputType + ".csv"
        start = dt.datetime.now()
        dataframe = None

        for filename in fileList:
            date = LTE_MAPPING.x_date(filename[len(filename) -
                                               12:len(filename) - 4])
            if inputType == 'hdfs':
                filename = "hdfs://hdfs1:8020/user/ec2-user/sample-data/" + filename
            print "reading " + filename
            rddFrame1 = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true')\
                .load(filename)
            #ENODEB_CELLNAME	ENODEB	DATA_DATE	MARKET_CLUSTER	VERSION	REGION	MARKET	DL_CH_BANDWIDTH	EARFCN_DL	DRBPDCPSDUKBYTESDL_NONGBR	DLPRBUSEDWITHDSPUC_FDUSERS	DLPRBUSEDWITHDSPUC_FSUSERS	EUCELL_DL_TPUT_NUM_KBITS	EUCELL_DL_TPUT_DEN_SECS	EUCELL_DL_DRB_TPUT_NUM_KBITS	EUCELL_DL_DRB_TPUT_DEN_SECS
            #rddFrame1 = rddFrame1.drop('ENODEB','DATA_DATE','VERSION').dropna()
            rddFrame1 = rddFrame1.dropna()
            rddFrame1 = rddFrame1.withColumn('DATE', sql.functions.lit(date))
            if dataframe is None:
                dataframe = rddFrame1
            else:
                dataframe = dataframe.unionAll(rddFrame1)
        print "reading finished!"
        self.printDfPartitions(dataframe)

        #cast Type
        dataframe = dataframe.withColumn(
            'EUCELL_DL_TPUT_NUM_KBITS',
            dataframe['EUCELL_DL_TPUT_NUM_KBITS'].cast(sql.types.DoubleType()))
        dataframe = dataframe.withColumn(
            'EUCELL_DL_TPUT_DEN_SECS',
            dataframe['EUCELL_DL_TPUT_DEN_SECS'].cast(sql.types.DoubleType()))
        dataframe = dataframe.withColumn(
            'EUCELL_DL_DRB_TPUT_NUM_KBITS',
            dataframe['EUCELL_DL_DRB_TPUT_NUM_KBITS'].cast(
                sql.types.DoubleType()))
        dataframe = dataframe.withColumn(
            'EUCELL_DL_DRB_TPUT_DEN_SECS',
            dataframe['EUCELL_DL_DRB_TPUT_DEN_SECS'].cast(
                sql.types.DoubleType()))
        dataframe = dataframe.withColumn(
            'DRBPDCPSDUKBYTESDL_NONGBR',
            dataframe['DRBPDCPSDUKBYTESDL_NONGBR'].cast(
                sql.types.DoubleType()))
        dataframe = dataframe.withColumn(
            'DLPRBUSEDWITHDSPUC_FDUSERS',
            dataframe['DLPRBUSEDWITHDSPUC_FDUSERS'].cast(
                sql.types.DoubleType()))
        dataframe = dataframe.withColumn(
            'DLPRBUSEDWITHDSPUC_FSUSERS',
            dataframe['DLPRBUSEDWITHDSPUC_FSUSERS'].cast(
                sql.types.DoubleType()))
        #add columns
        dataframe = dataframe.withColumn('Total cell count',
                                         sql.functions.lit(1))
        BandMapping = sql.functions.udf(
            lambda x: LTE_MAPPING.EARFCN_DL_mapping(x), sql.types.StringType())
        dataframe = dataframe.withColumn('BAND', BandMapping('EARFCN_DL'))
        BandWidthMapping = sql.functions.udf(
            lambda x: LTE_MAPPING.bandwidth(x), sql.types.IntegerType())
        dataframe = dataframe.withColumn('Total Spectrum in MHz',
                                         BandWidthMapping('DL_CH_BANDWIDTH'))

        dataframeoutput = dataframe.groupBy(['DATE', self.groupby,
                                             'BAND']).sum()
        dataframeoutput = dataframeoutput.withColumn(
            'UE Tput (kbps)',
            dataframeoutput['sum(EUCELL_DL_TPUT_NUM_KBITS)'] /
            dataframeoutput['sum(EUCELL_DL_TPUT_DEN_SECS)'])
        dataframeoutput = dataframeoutput.withColumn(
            'DRB Tput (kbps)',
            dataframeoutput['sum(EUCELL_DL_DRB_TPUT_NUM_KBITS)'] /
            dataframeoutput['sum(EUCELL_DL_DRB_TPUT_DEN_SECS)'])
        dataframeoutput = dataframeoutput.withColumn(
            'Cell Spectral Efficiency (bps/Hz)',
            8 * dataframeoutput['sum(DRBPDCPSDUKBYTESDL_NONGBR)'] /
            (dataframeoutput['sum(DLPRBUSEDWITHDSPUC_FDUSERS)'] +
             dataframeoutput['sum(DLPRBUSEDWITHDSPUC_FSUSERS)']) / 1.024 /
            0.18)
        dataframeoutput = dataframeoutput.withColumn('VENDOR',
                                                     sql.functions.lit('ALU'))
        dataframeoutput = dataframeoutput.withColumn(
            'UE Traffic (kbytes)',
            dataframeoutput['sum(EUCELL_DL_TPUT_NUM_KBITS)'] / 8)
        dataframeoutput = dataframeoutput.withColumn(
            'Cell Used PRB',
            (dataframeoutput['sum(DLPRBUSEDWITHDSPUC_FDUSERS)'] +
             dataframeoutput['sum(DLPRBUSEDWITHDSPUC_FSUSERS)']) * 1.024)
        #rename colname
        dataframeoutput = dataframeoutput.withColumnRenamed(
            "sum(DRBPDCPSDUKBYTESDL_NONGBR)", "Cell Traffic (kbytes)")
        dataframeoutput = dataframeoutput.withColumnRenamed(
            "sum(EUCELL_DL_TPUT_DEN_SECS)", "UE Active Time (s)")
        dataframeoutput = dataframeoutput.withColumnRenamed(
            "sum(Total cell count)", "Total cell count")
        dataframeoutput = dataframeoutput.withColumnRenamed(
            "sum(Total Spectrum in MHz)", "Total Spectrum in MHz")
        #dataframeoutput = dataframeoutput.drop('sum(EUCELL_DL_TPUT_NUM_KBITS)').drop('sum(DLPRBUSEDWITHDSPUC_FDUSERS)').drop('sum(DLPRBUSEDWITHDSPUC_FSUSERS)').drop('sum(EUCELL_DL_DRB_TPUT_NUM_KBITS)').drop('sum(EUCELL_DL_DRB_TPUT_DEN_SECS)')
        dataframeoutput = dataframeoutput.select(
            'DATE', 'MARKET', 'VENDOR', 'BAND', 'Cell Traffic (kbytes)',
            'Cell Used PRB', 'Cell Spectral Efficiency (bps/Hz)',
            'UE Traffic (kbytes)', 'UE Active Time (s)', 'UE Tput (kbps)',
            'Total cell count', 'Total Spectrum in MHz')
        dataframeoutput = dataframeoutput.coalesce(1)
        #take action here
        dataframeoutput.write.format('com.databricks.spark.csv').save(
            outputName)
        difference = dt.datetime.now() - start
        dataframeoutput.unpersist()
        sc.stop()
        return difference
Example #19
def setup_spark():
    config = SparkConf().setAppName(APP_NAME)
    context = SparkContext.getOrCreate(conf=config)
    sql_context = sql.SQLContext(context)

    return {'config': config, 'context': context, 'sql_context': sql_context}
import etl.EtlAnuncio as EtlAnun
import etl.EtlEstadisticaAnuncio as EtlEstAnun
import etl.EtlAccionDeAnuncio as EtlAccion
import etl.EtlAnunciosReporte as Etlrep
import sys
import pyspark as pspk
import pyspark.sql as pysql
import util.LoggerImpl as Log
import findspark

reload(sys)
sys.setdefaultencoding('utf-8')
findspark.init("/home/arturo/Software/spark-2.2.3-bin-hadoop2.7")

context = pspk.SparkContext.getOrCreate()
sql_context = pysql.SQLContext(context)
dto_logger = Log.Logger('', '', 'Script_Campanias', '', '')

dto_credenciales = Dto.DtoCredenciales(
    id_cuenta='act_804059193122922',
    token_de_acceso=
    'EAAFqYKPZBGTwBAJSHoktCxD1IHAn0tsl9I3iATCrLWb0aol1cUmq5Bfg1TKqWW'
    'SIJccxb2kxtN7HubCQ32rLCN50nzddGPbh1rtJmsbdFgGcD6n4jHWb1IqSINZC'
    'GGFgZBRJYGJAjqUfQpAXkmtd4dZCwZCEGDHicZBpj5dZCMgYgZDZD',
    id_usuario='',
    id_app='',
    id_pagina='',
    app_secreta='')

etl_campania = EtlCamp.EtlCampania(dto_credenciales, sql_context)
etl_campania.extrae()
    df.to_csv(DF_CSV_PATH, index=False)

df.head()

#%%
df = df.astype({'class': 'int32'})

# %%
trainDF = df.sample(frac=0.8, random_state=80)
testDF = df.drop(trainDF.index)

#%%
trainDF['class'].isnull().sum()

# %%
sql_ctx = sql.SQLContext(spark.sparkContext)
trainDF, testDF = sql_ctx.createDataFrame(trainDF), sql_ctx.createDataFrame(testDF)
# trainDF.show()
# testDF.show()

#%%
print(trainDF.count())
print(trainDF.filter(F.col('class') == 1).count())
print(trainDF.filter(F.col('class') == 0).count())

# %%
stopWords = list(set(nltk.corpus.stopwords.words('english'))) + ['']

tokenizer = Tokenizer(inputCol='review', outputCol='tokens')
stopWordRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='stoppedWords').setStopWords(stopWords)
countVector = CountVectorizer(inputCol=stopWordRemover.getOutputCol(), outputCol='vectors')
idf = IDF(inputCol=countVector.getOutputCol(), outputCol='idf')
Example #22
def sql_context(spark_context):
    sql_context = sql.SQLContext(spark_context)
    return sql_context
#Data loading
data = sc.textFile("./ml-100k/u.data")

# Loaded data will be a Spark RDD; run the command below to find out the data type of the data object.
#print(type(data),data.count(),data.first())
print(data.take(5))

# total length of the data loaded is given by:
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
ratings = data.map(lambda l: l.split('\t'))\
	    .map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))
print(ratings.take(5))

# Create a DataFrame from the ratings RDD
sql_ctx = sql.SQLContext(sc)
df = sql_ctx.createDataFrame(ratings, ['UserID', 'product', 'Rating'])
#df.select('user').distinct().show(100)

user_count = df.groupBy("UserID" ).count()
print(type(user_count))

# Pretty histogram
#plt_show(df)
#

df.stat.crosstab("UserID", "Rating").show()

# Split into training and test sets
(training, test) = ratings.randomSplit([0.8, 0.2])
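# The example is cut off after the split; a minimal sketch of the step it leads
# up to (hyperparameters here are illustrative, not from the original) would be
# training an ALS model and scoring the held-out ratings:
model = ALS.train(training, rank=10, iterations=10)
predictions = model.predictAll(test.map(lambda r: (r.user, r.product)))
print(predictions.take(5))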
Example #24
from pyspark import sql, SparkConf, SparkContext
import pyspark
import os

conf = SparkConf().setAppName("Read_CSV")
sc = SparkContext(conf=conf)
spark = sql.SQLContext(sc)

df1 = spark.read.option("delimiter", ";").option("header", "true").option(
    "encoding", "ISO-8859-1"
).csv(
    "/home/data/Documents/dadosCvm/Cias Abertas Documentos Formulário DFP - Balanço Patrimonial Passivo (BPP)/Cias Abertas Documentos Formulário DFP - Balanço Patrimonial Passivo (BPP)/bpp_cia_aberta_con_2013.csv"
)

for i in range(0, 3):
    df2 = spark.read.option("delimiter", ";").option("header", "true").option(
        "encoding", "ISO-8859-1"
    ).csv(
        "/home/data/Documents/dadosCvm/Cias Abertas Documentos Formulário DFP - Balanço Patrimonial Passivo (BPP)/Cias Abertas Documentos Formulário DFP - Balanço Patrimonial Passivo (BPP)/bpp_cia_aberta_con_2013.csv"
    )
    df1 = df1.union(df2)

df1.show()

print(df1.count())