def Spark_MapReduce(level, wordsatthislevel, graphcache):
	freqterms1_local=wordsatthislevel
	md5hash = hashlib.md5(",".join(wordsatthislevel)).hexdigest()
	#md5hash = ",".join(wordsatthislevel)
	cachevalue=graphcache.get(md5hash)
	if cachevalue: 
		print "Spark_MapReduce(): hash = ", md5hash, "; returning from cache"
		return cachevalue 
	else:	
		spcon=SparkContext("local[2]","Spark_MapReduce")
		print "Spark_MapReduce(): wordsatthislevel:",wordsatthislevel
		paralleldata=spcon.parallelize(wordsatthislevel).cache()
		#k=paralleldata.map(lambda wordsatthislevel: mapFunction(wordsatthislevel)).reduceByKey(reduceFunction)
		k=paralleldata.map(mapFunction2).reduceByKey(reduceFunction)
		#k=paralleldata.map(mapFunction).reduceByKey(reduceFunction)

		#dict_k=k.collect()
		#s = sorted(dict_k.items(),key=operator.itemgetter(1), reverse=True)
		#print "Spark MapReduce results:"
		#print s
		############################
		sqlContext=SQLContext(spcon)
		recursiveglossoverlap_schema=sqlContext.createDataFrame(k.collect())
		recursiveglossoverlap_schema.registerTempTable("Interview_RecursiveGlossOverlap")
		query_results=sqlContext.sql("SELECT * FROM Interview_RecursiveGlossOverlap")
		dict_query_results=dict(query_results.collect())
		#print "Spark_MapReduce() - SparkSQL DataFrame query results:"
		#print dict_query_results[1]
		graphcache.set(md5hash, dict_query_results[1])
		print "graphcache_mapreduce updated:", graphcache
		spcon.stop()
		return dict_query_results[1]
Example #2
 def test_logistic_regression_summary(self):
     from pyspark.mllib.linalg import Vectors
     sqlContext = SQLContext(self.sc)
     df = sqlContext.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                      (0.0, 2.0, Vectors.sparse(1, [], []))],
                                     ["label", "weight", "features"])
     lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False)
     model = lr.fit(df)
     self.assertTrue(model.hasSummary)
     s = model.summary
     # test that api is callable and returns expected types
     self.assertTrue(isinstance(s.predictions, DataFrame))
     self.assertEqual(s.probabilityCol, "probability")
     self.assertEqual(s.labelCol, "label")
     self.assertEqual(s.featuresCol, "features")
     objHist = s.objectiveHistory
     self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
     self.assertGreater(s.totalIterations, 0)
     self.assertTrue(isinstance(s.roc, DataFrame))
     self.assertAlmostEqual(s.areaUnderROC, 1.0, 2)
     self.assertTrue(isinstance(s.pr, DataFrame))
     self.assertTrue(isinstance(s.fMeasureByThreshold, DataFrame))
     self.assertTrue(isinstance(s.precisionByThreshold, DataFrame))
     self.assertTrue(isinstance(s.recallByThreshold, DataFrame))
     # test evaluation (with training dataset) produces a summary with same values
     # one check is enough to verify a summary is returned, Scala version runs full test
     sameSummary = model.evaluate(df)
     self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC)
Example #3
 def test_save_load(self):
     temp_path = tempfile.mkdtemp()
     sqlContext = SQLContext(self.sc)
     dataset = sqlContext.createDataFrame(
         [(Vectors.dense([0.0]), 0.0),
          (Vectors.dense([0.4]), 1.0),
          (Vectors.dense([0.5]), 0.0),
          (Vectors.dense([0.6]), 1.0),
          (Vectors.dense([1.0]), 1.0)] * 10,
         ["features", "label"])
     lr = LogisticRegression()
     grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
     evaluator = BinaryClassificationEvaluator()
     tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
     tvsModel = tvs.fit(dataset)
     tvsPath = temp_path + "/tvs"
     tvs.save(tvsPath)
     loadedTvs = TrainValidationSplit.load(tvsPath)
     self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid)
     self.assertEqual(loadedTvs.getEvaluator().uid, tvs.getEvaluator().uid)
     self.assertEqual(loadedTvs.getEstimatorParamMaps(), tvs.getEstimatorParamMaps())
     tvsModelPath = temp_path + "/tvsModel"
     tvsModel.save(tvsModelPath)
     loadedModel = TrainValidationSplitModel.load(tvsModelPath)
     self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
def Spark_MapReduce_Parents(keyword, tokensofprevlevel, graphcache):
	#tokensofprevlevelkeyword=tokensofprevlevel
	#tokensofprevlevelkeyword.append(keyword)
	md5hashparents = hashlib.md5(keyword).hexdigest()

	#md5hashparents = keyword
	md5hashparents = md5hashparents + "$parents"

	picklef_keyword=open("RecursiveGlossOverlap_MapReduce_Parents_Persisted.txt","w")
	asfer_pickle_string_dump(keyword,picklef_keyword)
	picklef_keyword.close()
	cachevalue=graphcache.get(md5hashparents)
	if cachevalue:
		print "Spark_MapReduce_Parents(): hash = ", md5hashparents, "; returning from cache"
		return cachevalue 
	else:	
		#picklelock.acquire()
		spcon = SparkContext("local[2]","Spark_MapReduce_Parents")
		#picklef_keyword=open("RecursiveGlossOverlap_MapReduce_Parents_Persisted.txt","w")
		#asfer_pickle_string_dump(keyword,picklef_keyword)
		#picklef_keyword.close()
		paralleldata = spcon.parallelize(tokensofprevlevel).cache()
		#k=paralleldata.map(lambda keyword: mapFunction_Parents(keyword,tokensofprevlevel)).reduceByKey(reduceFunction_Parents)
		k=paralleldata.map(mapFunction_Parents).reduceByKey(reduceFunction_Parents)
		sqlContext=SQLContext(spcon)
		parents_schema=sqlContext.createDataFrame(k.collect())
		parents_schema.registerTempTable("Interview_RecursiveGlossOverlap_Parents")
		query_results=sqlContext.sql("SELECT * FROM Interview_RecursiveGlossOverlap_Parents")
		dict_query_results=dict(query_results.collect())
		#print "Spark_MapReduce_Parents() - SparkSQL DataFrame query results:"
		#picklelock.release()
		graphcache.set(md5hashparents,dict_query_results[1])
		spcon.stop()
		print "graphcache_mapreduce_parents updated:", graphcache
		return dict_query_results[1]
Example #5
    def test_nested_pipeline_persistence(self):
        """
        Pipeline[HashingTF, Pipeline[PCA]]
        """
        sqlContext = SQLContext(self.sc)
        temp_path = tempfile.mkdtemp()

        try:
            df = sqlContext.createDataFrame([(["a", "b", "c"],), (["c", "d", "e"],)], ["words"])
            tf = HashingTF(numFeatures=10, inputCol="words", outputCol="features")
            pca = PCA(k=2, inputCol="features", outputCol="pca_features")
            p0 = Pipeline(stages=[pca])
            pl = Pipeline(stages=[tf, p0])
            model = pl.fit(df)

            pipeline_path = temp_path + "/pipeline"
            pl.save(pipeline_path)
            loaded_pipeline = Pipeline.load(pipeline_path)
            self._compare_pipelines(pl, loaded_pipeline)

            model_path = temp_path + "/pipeline-model"
            model.save(model_path)
            loaded_model = PipelineModel.load(model_path)
            self._compare_pipelines(model, loaded_model)
        finally:
            try:
                rmtree(temp_path)
            except OSError:
                pass
Example #6
def main(argv):
    #STEP1: data ingestion
    sc = SparkContext(appName="PythonWordCount")
    sqlContext = SQLContext(sc)

    #read data into DataFrames
    input_schema_rdd = sqlContext.read.json("file:///scratch/network/alexeys/KaggleDato/Preprocessed/0_1/part-*")
    #input_schema_rdd.show() 
    #input_schema_rdd.printSchema()
    #input_schema_rdd.select("id").show()

    train_label_rdd = sqlContext.read.json("file://"+PATH_TO_TRAIN_LABELS)
    sub_label_rdd = sqlContext.read.json("file://"+PATH_TO_SUB_LABELS)

    input_schema_rdd.registerTempTable("input")
    train_label_rdd.registerTempTable("train_label")
    sub_label_rdd.registerTempTable("sub_label")

    # SQL can be run over DataFrames that have been registered as a table.
    train_wlabels_0 = sqlContext.sql("SELECT title,text,images,links,label FROM input JOIN train_label WHERE input.id = train_label.id AND label = 0")
    train_wlabels_1 = sqlContext.sql("SELECT title,text,images,links,label FROM input JOIN train_label WHERE input.id = train_label.id AND label = 1")

    text_only_0 = train_wlabels_0.map(lambda p: p.text)
    text_only_1 = train_wlabels_1.map(lambda p: p.text)

    counts0 = text_only_0.flatMap(lambda line: tokenize(line))\
          .map(lambda x: (x, 1)) \
          .reduceByKey(add)

    counts1 = text_only_1.flatMap(lambda line: tokenize(line))\
          .map(lambda x: (x, 1)) \
          .reduceByKey(add)

    relevance = counts0.subtractByKey(counts1).map(lambda (x,y): (y,x)).sortByKey(False, 1)
    relevance.saveAsTextFile("/user/alexeys/KaggleDato/WordCount")
Example #7
 def test_save_load(self):
     temp_path = tempfile.mkdtemp()
     sqlContext = SQLContext(self.sc)
     dataset = sqlContext.createDataFrame(
         [(Vectors.dense([0.0]), 0.0),
          (Vectors.dense([0.4]), 1.0),
          (Vectors.dense([0.5]), 0.0),
          (Vectors.dense([0.6]), 1.0),
          (Vectors.dense([1.0]), 1.0)] * 10,
         ["features", "label"])
     lr = LogisticRegression()
     grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
     evaluator = BinaryClassificationEvaluator()
     cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
     cvModel = cv.fit(dataset)
     cvPath = temp_path + "/cv"
     cv.save(cvPath)
     loadedCV = CrossValidator.load(cvPath)
     self.assertEqual(loadedCV.getEstimator().uid, cv.getEstimator().uid)
     self.assertEqual(loadedCV.getEvaluator().uid, cv.getEvaluator().uid)
     self.assertEqual(loadedCV.getEstimatorParamMaps(), cv.getEstimatorParamMaps())
     cvModelPath = temp_path + "/cvModel"
     cvModel.save(cvModelPath)
     loadedModel = CrossValidatorModel.load(cvModelPath)
     self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid)
Example #8
 def run(self):
     sc = SparkContext("local", "gender")
     sqlContext = SQLContext(sc)
     #StringType =(str, unicode)
     _out = self.output().open('w')
     #lines = sc.textFile("myUser.csv")
     #fobj = self.input().open("r")
     #lines = sc.textFile(fobj.name)
     print(type(self.required_tasks['insert_source'].output()))
     print(self.required_tasks['insert_source'])
     #print(self.input()['insert_source'].input())
     lines = sc.textFile("myUser.csv")
     parts = lines.map(lambda l: l.split(","))
     users = parts.map(lambda p: (p[0], p[1],p[2],p[3],p[4],p[5],p[6],p[7],
         p[8],p[9],p[10],p[11],p[12],p[13],p[14],p[15],p[16],p[17],p[18],p[19]))
     schemaString = "userId lmsUserId lmsName orgName name gender registrationDate emailId mothertounge highestEduDegree goals city state active firstAccesDate lastAccessDate allowCert yearOfBirth pincode aadharId"
     print(schemaString)
     _out.write(schemaString )
     fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]
     schema = StructType(fields)
     #schemaUser = sqlContext.createDataFrame(users, schema)
     schemaUser = sqlContext.applySchema(users, schema)
     schemaUser.registerTempTable("users")
     results = sqlContext.sql("SELECT gender FROM users")
     genders = results.map(lambda p : (p,1))
     counts = genders.reduceByKey(lambda a, b: a + b) #.map(lambda t : ("Gender " + t(0) + " No " + t(1))).collect()
     for name in counts.collect():
         _out.write(str(name))
     _out.close()
Example #9
def main():
    log = logging.getLogger(prog)
    log.setLevel(logging.INFO)
    # bit hackish and hard to keep aligned with docstring changes, not using this
    # usage = '\r\b\r\b\r' + __doc__ + "usage: %prog -j file.json -p directory.parquet"
    # parser = OptionParser(usage=usage, version='%prog ' + __version__)
    parser = OptionParser(version='%prog ' + __version__)
    parser.add_option('-j', '--json', dest='jsonFile', help='JSON input file/dir', metavar='<file/dir>')
    parser.add_option('-p', '--parquetDir', dest='parquetDir', help='Parquet output dir', metavar='<dir>')

    (options, args) = parser.parse_args()

    jsonFile   = options.jsonFile
    parquetDir = options.parquetDir

    if args or not jsonFile or not parquetDir:
        usage(parser)

    conf = SparkConf().setAppName('HS PySpark JSON => Parquet')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    spark_version = sc.version
    log.info('Spark version detected as %s' % spark_version)
    if not isVersionLax(spark_version):
        die("Spark version couldn't be determined. " + support_msg('pytools'))
    if isMinVersion(spark_version, 1.4):
        json = sqlContext.read.json(jsonFile)
        json.write.parquet(parquetDir)
    else:
        log.warn('running legacy code for Spark <= 1.3')
        json = sqlContext.jsonFile(jsonFile)
        json.saveAsParquetFile(parquetDir)
def index(request):
    string = u'template displays a string variable'
    list = ['first', 'second', 'third']
    tuple = ('q','w','e','r','t')
    dict = {'a':1,'b':2,'c':3,'d':4}
    conf = SparkConf().setAppName("djangotest").setMaster("spark://HP-Pavilion:7077")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    url='jdbc:mysql://127.0.0.1:3306?user=root&password=raymon'
    dbtable='networkPublicOpinionAnalysisSystem.test'
    df = sqlContext.read.format('jdbc').options(url=url,dbtable=dbtable).load()
    lines = sc.textFile(settings.BASE_DIR+'/system/data/roll_news_sina_com_cn.csv')
    parts = lines.map(lambda l:l.split(','))
    schemaNews = parts.map(lambda p : Row(category=p[0],title=p[1],url=p[2],time=p[3]))
    news = sqlContext.createDataFrame(schemaNews)
    # news.registerTempTable('test')
    # dbtable = 'networkPublicOpinionAnalysisSystem.test'
    # news.write.format('jdbc').options(url=url).insertInto(tableName=dbtable)
    # string = news.count()
    row = news.first()
    a = Row()
    print(type(news))
    print(type(row))
    # print(type(a))
    # dict = row.asDict()
    # string = dict['title']

    # news.write.jdbc(url,table=dbtable)
    return render(request,'index.html',{'string':string,'list':list,'tuple':tuple,'dict':dict})
Example #11
class TestSQL(PySparkTestCase):

    def setUp(self):
        PySparkTestCase.setUp(self)
        self.sqlCtx = SQLContext(self.sc)

    def test_basic_functions(self):
        rdd = self.sc.parallelize(['{"foo":"bar"}', '{"foo":"baz"}'])
        srdd = self.sqlCtx.jsonRDD(rdd)
        srdd.count()
        srdd.collect()
        srdd.schemaString()
        srdd.schema()

        # cache and checkpoint
        self.assertFalse(srdd.is_cached)
        srdd.persist(StorageLevel.MEMORY_ONLY_SER)
        srdd.unpersist()
        srdd.cache()
        self.assertTrue(srdd.is_cached)
        self.assertFalse(srdd.isCheckpointed())
        self.assertEqual(None, srdd.getCheckpointFile())

        srdd = srdd.coalesce(2, True)
        srdd = srdd.repartition(3)
        srdd = srdd.distinct()
        srdd.intersection(srdd)
        self.assertEqual(2, srdd.count())

        srdd.registerTempTable("temp")
        srdd = self.sqlCtx.sql("select foo from temp")
        srdd.count()
        srdd.collect()
Example #12
 def test_persistence(self):
     # Test save/load for LDA, LocalLDAModel, DistributedLDAModel.
     sqlContext = SQLContext(self.sc)
     df = sqlContext.createDataFrame([
         [1, Vectors.dense([0.0, 1.0])],
         [2, Vectors.sparse(2, {0: 1.0})],
     ], ["id", "features"])
     # Fit model
     lda = LDA(k=2, seed=1, optimizer="em")
     distributedModel = lda.fit(df)
     self.assertTrue(distributedModel.isDistributed())
     localModel = distributedModel.toLocal()
     self.assertFalse(localModel.isDistributed())
     # Define paths
     path = tempfile.mkdtemp()
     lda_path = path + "/lda"
     dist_model_path = path + "/distLDAModel"
     local_model_path = path + "/localLDAModel"
     # Test LDA
     lda.save(lda_path)
     lda2 = LDA.load(lda_path)
     self._compare(lda, lda2)
     # Test DistributedLDAModel
     distributedModel.save(dist_model_path)
     distributedModel2 = DistributedLDAModel.load(dist_model_path)
     self._compare(distributedModel, distributedModel2)
     # Test LocalLDAModel
     localModel.save(local_model_path)
     localModel2 = LocalLDAModel.load(local_model_path)
     self._compare(localModel, localModel2)
     # Clean up
     try:
         rmtree(path)
     except OSError:
         pass
Example #13
 def __init__(self, predictionAndLabels):
     sc = predictionAndLabels.ctx
     sql_ctx = SQLContext(sc)
     df = sql_ctx.createDataFrame(predictionAndLabels,
                                  schema=sql_ctx._inferSchema(predictionAndLabels))
     java_model = callMLlibFunc("newRankingMetrics", df._jdf)
     super(RankingMetrics, self).__init__(java_model)
Example #14
    def run(self):
        jsonFile   = self.options.jsonFile
        parquetDir = self.options.parquetDir

        if not jsonFile:
            self.usage('--json not defined')
        if not parquetDir:
            self.usage('--parquetDir not defined')
        if self.args:
            self.usage()

        conf = SparkConf().setAppName('HS PySpark JSON => Parquet')
        sc = SparkContext(conf=conf)
        sqlContext = SQLContext(sc)
        spark_version = sc.version
        log.info('Spark version detected as %s' % spark_version)
        if not isVersionLax(spark_version):
            die("Spark version couldn't be determined. " + support_msg('pytools'))
        if isMinVersion(spark_version, 1.4):
            json = sqlContext.read.json(jsonFile)
            json.write.parquet(parquetDir)
        else:
            log.warn('running legacy code for Spark <= 1.3')
            json = sqlContext.jsonFile(jsonFile)
            json.saveAsParquetFile(parquetDir)
def main(argv):

    Conf = (SparkConf().setAppName("recommendation"))
    sc = SparkContext(conf=Conf)
    sqlContext = SQLContext(sc)

    dirPath = "hdfs://ec2-52-71-113-80.compute-1.amazonaws.com:9000/reddit/recommend/data/sr_userCount.parquet"
    rawDF = sqlContext.read.parquet(dirPath).persist(StorageLevel.MEMORY_AND_DISK_SER)
    # argv[1] is the dump of training data in hdfs
    # argv[2] is the user preferences

    # User Hash Lookup stored into cassandra
    user_hash = rawDF.map(lambda (a,b,c): (a,hashFunction(a)))
    distinctUser = user_hash.distinct()
    userHashDF = sqlContext.createDataFrame(distinctUser,["user","hash"])
    userHashDF.write.format("org.apache.spark.sql.cassandra").options(table ="userhash", keyspace =  keyspace).save(mode="append")
    

    # Product Hash Lookup stored into cassandra
    product_hash = rawDF.map(lambda (a,b,c): (b, hashFunction(b)))
    distinctProduct = product_hash.distinct()
    productHashDF = sqlContext.createDataFrame(distinctProduct,["product","hash"])
    productHashDF.write.format("org.apache.spark.sql.cassandra").options(table ="producthash", keyspace =  keyspace).save(mode="append")

    # Ratings for training
    # ALS requires a java hash of string. This function does that and stores it as Rating Object
    # for the algorithm to consume
    ratings = rawDF.map(lambda (a,b,c) : Rating(hashFunction(a),hashFunction(b),float(c)))

    
    model = ALS.trainImplicit(ratings,10,10,alpha=0.01,seed=5)
    model.save(sc, "hdfs://ec2-52-71-113-80.compute-1.amazonaws.com:9000/reddit/recommend/model")

    sc.stop()
Example #16
def writeLumbarReadings(time, rdd):
	try:
		# Convert RDDs of the words DStream to DataFrame and run SQL query
		connectionProperties = MySQLConnection.getDBConnectionProps('/home/erik/mysql_credentials.txt')
		sqlContext = SQLContext(rdd.context)
		if rdd.isEmpty() == False:
			lumbarReadings = sqlContext.jsonRDD(rdd)
			lumbarReadingsIntermediate = lumbarReadings.selectExpr("readingID","readingTime","deviceID","metricTypeID","uomID","actual.y AS actualYaw","actual.p AS actualPitch","actual.r AS actualRoll","setPoints.y AS setPointYaw","setPoints.p AS setPointPitch","setPoints.r AS setPointRoll")
			assembler = VectorAssembler(
						inputCols=["actualPitch"], # Must be in same order as what was used to train the model.  Testing using only pitch since model has limited dataset.
						outputCol="features")
			lumbarReadingsIntermediate = assembler.transform(lumbarReadingsIntermediate)

			
			predictions = loadedModel.predict(lumbarReadingsIntermediate.map(lambda x: x.features))
			predictionsDF = lumbarReadingsIntermediate.map(lambda x: x.readingID).zip(predictions).toDF(["readingID","positionID"])
			combinedDF = lumbarReadingsIntermediate.join(predictionsDF, lumbarReadingsIntermediate.readingID == predictionsDF.readingID).drop(predictionsDF.readingID)
			
			combinedDF = combinedDF.drop("features")
			
			combinedDF.show()


			combinedDF.write.jdbc("jdbc:mysql://localhost/biosensor", "SensorReadings", properties=connectionProperties)
	except:
		pass
def main(sc):

    sqlContext = SQLContext(sc)
    tasteProfileRdd = sc.textFile("userTaste/*")
    songRdd = sc.textFile("songsDict/*")
    # Load a text file and convert each line to a Row.
    tasteProfile = tasteProfileRdd.filter(lambda l: len(l) > 0)
    parsedSplits = tasteProfile.map(lambda l: l.split('\t'))
    userTaste = parsedSplits.map(lambda p: Row(userId=p[0], songId=p[1], playCount=p[2]))

    individualSong = songRdd.map(lambda l: l.split('|'))
    songData = individualSong.map(lambda s: Row(songId=s[0], featureSet=s[1]))

    # Infer the schema, and register the DataFrame as a table.
    schemaUserTaste = sqlContext.inferSchema(userTaste)
    schemaUserTaste.registerTempTable("userTaste")

    schemaSongData = sqlContext.inferSchema(songData)
    schemaSongData.registerTempTable("songData")

    test2 = sqlContext.sql("select * from songData limit 5")
    songIds = test2.map(lambda p: "songIds: " + p.songId)
    #test1 = sqlContext.sql("SELECT distinct * FROM userTaste limit 5")

    #songIds = test1.map(lambda p: "songIds: " + p.songId)
    for i in songIds.collect():
        print i
Example #18
    def main(self, sc, *args):
        from pyspark.sql.types import BooleanType, StringType
        from pyspark.sql.types import FloatType, StructField, StructType
        from pyspark.sql import SQLContext

        fields = []
        for field in header_avro["fields"] + self.extra_fields:
            if field["type"] == "float":
                field_type = FloatType()
            elif field["type"] == "bool":
                field_type = BooleanType()
            else:
                field_type = StringType()
            fields.append(StructField(field["name"], field_type))
        schema = StructType(fields)

        sqlContext = SQLContext(sc)
        logger.info("Reading %s from %s" % (self.test_name, self.input().path))
        df = sqlContext.jsonFile(self.input().path, schema)
        df.registerTempTable("reports")

        entries = df.filter("({test_names}) AND"
                            " record_type = 'entry'".format(
                                test_names=' OR '.join([
                                    "test_name = '{test_name}'".format(
                                        test_name=tn)
                                    for tn in self.test_names])))
        interestings = self.find_interesting(entries)

        out_file = self.output().open('w')
        for interesting in interestings.toJSON().collect():
            out_file.write(interesting)
            out_file.write("\n")
        out_file.close()
Example #19
def main():
    # Setting the cluster configuration parameters
    conf = SparkConf()
    conf.setMaster("spark://localhost:7077")
    conf.setAppName("Tweet App")
    conf.set("spark.executor.memory", "3g")
    conf.set("spark.driver.memory", "4g")

    # Creating a Spark Context with conf file
    sc = SparkContext(conf=conf)

    # Creating an SQL context to perform SQL queries
    sqlContext = SQLContext(sc)

    # Define the data path
    curr_path = os.path.dirname(os.path.abspath(__file__))
    json_name = "out.json"

    json_file_path = os.path.join(curr_path +
                                  "/../Spark_Jobs/data/",
                                  json_name)

    parquet_file_path = createSQLContext(json_file_path, sqlContext)
    print(parquet_file_path)

    # Read from parquet file
    parquetFile = sqlContext.read.parquet(parquet_file_path)
    parquetFile.registerTempTable("tweets")
    counter = sqlContext.sql("SELECT count(*) as cnt FROM tweets")
    print("============= Count =================")
    print("Count:: " + str(counter.collect()[0].cnt))
Example #20
    def __init__(self, sparkContext):
        """Create a new HbaseContext.

    @param sparkContext: The SparkContext to wrap.
    """
        SQLContext.__init__(self, sparkContext)
        self._scala_HBaseSQLContext = self._get_hbase_ctx()
def main():
  reviews_parquet = sys.argv[1]
  metadata_parquet = sys.argv[2]
  users_ascores_file = sys.argv[3]
  products_ascores_file = sys.argv[4]

  conf = SparkConf().setAppName('Amazon Cassandra Injector').setMaster("local").set("spark.cassandra.connection.host", "localhost")
  sc = SparkContext(conf=conf)
  sqlContext = SQLContext(sc)

  sqlContext.read.parquet(reviews_parquet).registerTempTable('amazon_reviews')
  reviews = sqlContext.sql("""SELECT * FROM amazon_reviews""").rdd.cache()
  reviews_by_reviewer = reviews.map(process_review).map(lambda j: (j["reviewerid"], j))
  users_ascores = sc.textFile(users_ascores_file).map(ast.literal_eval).map(lambda (r_id, score, histo): (r_id, (score, histo)))
  reviews_joined = reviews_by_reviewer.join(users_ascores).map(lambda (reviewerid, (j, (score, histo))): fillin_review(j, score))
  # join with meth2_users_ascores. join on reviewerid -> ascore is reviewer ascore
  reviews_joined.saveToCassandra("amzdb", "reviews")

  # reviewers need their alternative score
  reviewers = reviews.map(process_reviewer).map(lambda j: (j["reviewerid"], j))
  # join with meth2_user_ascores. Get ascore and overall_histogram
  reviewers_joined = reviewers.join(users_ascores).map(lambda (reviewerid, (j, (score, histo))): fillin_reviewer(j, score, histo))
  reviewers_joined.saveToCassandra("amzdb", "reviewers")

  # products need their overall score/histogram, and adjusted score/histogram
  sqlContext.read.parquet(metadata_parquet).registerTempTable('amazon_metadata')
  products = sqlContext.sql("""SELECT * FROM amazon_metadata""").rdd.map(process_product).map(lambda j: (j["asin"], j))
  # join with meth2_product_ascores
  products_ascores = sc.textFile(products_ascores_file).map(ast.literal_eval).map(lambda (asin, o_s, a_s, o_h, a_h, n): (asin, (o_s, o_h, a_s, a_h)))
  products_joined = products.join(products_ascores).map(lambda (asin, (j, (o_s, o_h, a_s, a_h))): fillin_product(j, o_s, o_h, a_s, a_h))
  products_joined.saveToCassandra("amzdb", "products")
Example #22
class RecommendationEngine:
    """A travel recommendation engine
    """
    def get_recommendations(self, user_id):
        """Recommends travel for user
        """
        data = (1,2,3,4,5)
        even_rdd = self.sc.parallelize(data)
        #ratings = even_rdd.collect()
        reco = self.sqlContext.sql("SELECT c.contact_id, o.prod_id  FROM contacts c , offres o WHERE  o.continent_offre = c.continent and o.envie_offre = c.envie and o.moyen_offre = c.moyen").collect()
        return reco

    def __init__(self, sc):
        """Init the recommendation engine given a Spark context and a dataset path
        """
        logger.info("Starting up the Recommendation Engine: ")
        self.sc = sc
        self.sqlContext = SQLContext(sc)

        path_contacts = "data_v3/contacts/attempt_contactV3_perfect_match.json"
        df_contacts = self.sqlContext.jsonFile(path_contacts)

        df_contacts.registerTempTable("contacts")

        path_offres = "data_v3/offres/attempt_productV3_perfect_match.json"
        df_offres = self.sqlContext.jsonFile(path_offres)
        df_offres.registerTempTable("offres")
def main(n_part, hdfs_path):
    print "********************\n*"
    print "* Start main\n*"
    print "********************"
    conf = SparkConf().setAppName("Benchmark Spark SQL")
    sc = SparkContext(conf = conf)
    sqlContext = SQLContext(sc)
    rowsRDD = sc.textFile(hdfs_path).repartition(n_part).map(lambda x: recordToRows(x)).cache()
    df = sqlContext.createDataFrame(rowsRDD).cache()
    df.count()
    df.registerTempTable("msd_table")
    print "********************\n*"
    print "* Start querres\n*"
    print "********************"
    [ave_t1, std1, dt1, n1] = time_querry("SELECT * FROM msd_table WHERE msd_table.artist_name = 'Taylor Swift'", sqlContext)
    [ave_t2, std2, dt2, n2] = time_querry("SELECT COUNT(*) FROM msd_table WHERE msd_table.artist_name = 'Taylor Swift'", sqlContext, method=1)
    [ave_t3, std3, dt3, n3] = time_querry("SELECT * FROM msd_table WHERE msd_table.artist_hotness > 0.75", sqlContext)
    [ave_t4, std4, dt4, n4] = time_querry("SELECT COUNT(*) FROM msd_table WHERE msd_table.artist_hotness > 0.75", sqlContext, method=1)
    if n1 != n2:
        print "\t!!!!Error, counts disagree for the number of T.S. songs!"
    if n3 != n4:
        print "\t!!!!Error, counts disagree for the number of high paced songs!"
    print "********************\n*"
    print "* Results"
    print "\t".join(map(lambda x: str(x), [ave_t1, std1, dt1, ave_t2, std2, dt2, ave_t3, std3, dt3, ave_t4, std4, dt4]))
    print "********************"
Example #24
 def mock_data(self):
     """Mock data to imitate read from database."""
     sqlContext = SQLContext(self.sc)
     mock_data_rdd = self.sc.parallelize([("A", 1, 1), ("B", 1, 0), ("C", 0, 2), ("D", 2, 4), ("E", 3, 5) ])
     schema = ["id", "x", "y"]
     mock_data_df = sqlContext.createDataFrame(mock_data_rdd, schema)
     return mock_data_df
def log_mapreducer(logfilename, pattern, filt="None"):
    spcon = SparkContext()
    if filt == "None":
        input = open(logfilename, 'r')
        paralleldata = spcon.parallelize(input.readlines())
        patternlines = paralleldata.filter(lambda patternline: pattern in patternline)
        print "pattern lines", patternlines.collect()
        matches = patternlines.map(mapFunction).reduceByKey(reduceFunction)
    else:
        input = spcon.textFile(logfilename)
        matches = input.flatMap(lambda line: line.split()).filter(lambda line: filt in line).map(mapFunction).reduceByKey(reduceFunction)
    matches_collected = matches.collect()
    print "matches_collected:", matches_collected
    if len(matches_collected) > 0:
        sqlContext = SQLContext(spcon)
        bytes_stream_schema = sqlContext.createDataFrame(matches_collected)
        bytes_stream_schema.registerTempTable("USBWWAN_bytes_stream")
        query_results = sqlContext.sql("SELECT * FROM USBWWAN_bytes_stream")
        dict_query_results = dict(query_results.collect())
        print "----------------------------------------------------------------------------------"
        print "log_mapreducer(): pattern [", pattern, "] in [", logfilename, "] for filter [", filt, "]"
        print "----------------------------------------------------------------------------------"
        dict_matches = dict(matches_collected)
        sorted_dict_matches = sorted(dict_matches.items(), key=operator.itemgetter(1), reverse=True)
        print "pattern matching lines:", sorted_dict_matches
        print "----------------------------------------------------------------------------------"
        print "SparkSQL DataFrame query results:"
        print "----------------------------------------------------------------------------------"
        pprint.pprint(dict_query_results)
        print "----------------------------------------------------------------------------------"
        print "Cardinality of Stream Dataset:"
        print "----------------------------------------------------------------------------------"
        print len(dict_query_results)
        spcon.stop()
        return sorted_dict_matches
def main(argv):
    Conf = (SparkConf().setAppName("SimpleGraph"))
    sc = SparkContext(conf=Conf)
    sqlContext = SQLContext(sc)


    dirPath = "hdfs://ec2-52-71-113-80.compute-1.amazonaws.com:9000/reddit/data/"+argv[1]+".parquet"

    rawDF = sqlContext.read.parquet(dirPath).registerTempTable("comments")
    
    
    # This is where the magic happens
    # SQL self join to join users who have interacted with one another
    df = sqlContext.sql("""
    SELECT t1.subreddit as Subreddit,
       
       t1.id as OrigId ,                t2.id as RespId,
       t1.author AS OrigAuth,              t2.author AS RespAuth,
       t1.score  AS OrigScore,             t2.score  AS RespScore,
       t1.ups    AS OrigUps,               t2.ups    AS RespUps,
       t1.downs  AS OrigDowns,             t2.downs  AS RespDowns,
       t1.controversiality AS OrigControv, t2.controversiality AS RespControv
FROM comments t1 INNER JOIN comments t2 ON CONCAT("t1_",t1.id) = t2.parent_id where t1.author!='[deleted]' and t2.author!='[deleted]'
""")

    # Write the result out as Parquet: it compresses the data and is very fast to read back.
    df.write.parquet("hdfs://ec2-52-71-113-80.compute-1.amazonaws.com:9000/reddit/data/"+argv[1]+"-selfjoin.parquet")
Example #27
def main(sc):
    sql_context = SQLContext(sc)
    all_data = get_all_data()

    # Input data: Each row is a bag of words from a sentence or document.
    training_data = [(id_gen.next(), text.split(" ")) for text in all_data]
    documentdf = sql_context.createDataFrame(training_data, ["id", "text"])

    remover = StopWordsRemover(inputCol="text", outputCol="text_filtered")
    cleaned_document = remover.transform(documentdf)

    # Learn a mapping from words to Vectors.
    word2vec = Word2Vec(vectorSize=len(training_data),
                        inputCol="text_filtered",
                        outputCol="result")
    model = word2vec.fit(cleaned_document)
    matrix = column_similarities(model.transform(cleaned_document))

    # Use the size of the target data to keep only products between
    # target data and the rest of the data, and to avoid products of
    # target data with itself
    values = matrix.entries.filter(
        lambda x: x.j >= TARGET_DATA_SIZE and x.i < TARGET_DATA_SIZE).sortBy(
        keyfunc=lambda x: x.value, ascending=False).map(
        lambda x: x.j).distinct().take(100)

    training_data_index = dict(training_data)
    for position, item in enumerate(values):
        line = " ".join(training_data_index[int(item)])
        print('%d -> %s' % (position, line.encode('utf-8')))
Example #28
def RunRandomForest(tf, ctx):
	sqlContext = SQLContext(ctx)
	rdd = tf.map(parseForRandomForest)
	# The schema is encoded in a string.
	schema = ['genre', 'track_id', 'features']
	# Apply the schema to the RDD.
	songDF = sqlContext.createDataFrame(rdd, schema)

	# Register the DataFrame as a table.
	songDF.registerTempTable("genclass")
	labelIndexer = StringIndexer().setInputCol("genre").setOutputCol("indexedLabel").fit(songDF)

	trainingData, testData = songDF.randomSplit([0.8, 0.2])

	labelConverter = IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels)

	rfc = RandomForestClassifier().setMaxDepth(10).setNumTrees(2).setLabelCol("indexedLabel").setFeaturesCol("features")
	#rfc = SVMModel([.5, 10, 20], 5)
	#rfc = LogisticRegression(maxIter=10, regParam=0.01).setLabelCol("indexedLabel").setFeaturesCol("features")

	pipeline = Pipeline(stages=[labelIndexer, rfc, labelConverter])
	model = pipeline.fit(trainingData)

	predictions = model.transform(testData)
	predictions.show()

	evaluator = MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction").setMetricName("precision")
	accuracy = evaluator.evaluate(predictions)
	print 'Accuracy of RandomForest = ', accuracy * 100
	print "Test Error = ", (1.0 - accuracy) * 100
Example #29
def main(dataFile, outputPath):

    conf = SparkConf().setAppName("S3 Example").set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    raw_text = sc.textFile(dataFile).persist(StorageLevel.MEMORY_AND_DISK)

    csv_data = raw_text.map(lambda l: l.split(","))
    row_data = csv_data.map(lambda p: dataIO.dataStruc(p))

    interaction_df = sqlContext.createDataFrame(row_data)

    # features.save_hdfs_parquet(interaction_df, outputPath)
    dataIO.save_hdfs_parquet(interaction_df, outputPath)

    interaction_df.registerTempTable("interactions")

    tcp_interactions = sqlContext.sql( """
        SELECT duration, dst_bytes, protocol_type FROM interactions WHERE protocol_type = 'tcp' AND duration > 1000 AND dst_bytes=0
    """)

    tcp_interactions.show()

    features.print_tcp_interactions(tcp_interactions)
    dataIO.print_from_dataio()
    features.print_from_feature()

    sc.stop()
Example #30
    def get_latest_data(self):
        from pyspark.sql import SparkSession
        import config
        import pandas as pd
        # initialise sparkContext
        spark1 = SparkSession.builder \
            .master(config.sp_master) \
            .appName(config.sp_appname) \
            .config('spark.executor.memory', config.sp_memory) \
            .config("spark.cores.max", config.sp_cores) \
            .getOrCreate()

        sc = spark1.sparkContext

        # using SQLContext to read parquet file
        from pyspark.sql import SQLContext
        sqlContext = SQLContext(sc)

        from datetime import datetime
        t1 = datetime.now()
        df = sqlContext.read.parquet(config.proj_path+'/datas/appid_datapoint_parquet1')
        # creating and querying from the temporary table
        df1 = df.registerTempTable('dummy')
        df1 = sqlContext.sql('select count(distinct application) as app_count, time_stamp, source from dummy group by source, time_stamp')

        # data cleaning
        self.p2_df = df1.toPandas()
        
        dates_outlook = pd.to_datetime(pd.Series(self.p2_df.time_stamp),unit='ms')
        self.p2_df.index = dates_outlook   
        self.p2_df['date'] = self.p2_df.index.date
        self.p2_df = self.p2_df.sort_values(by='time_stamp')
    
        t2 =datetime.now()
        time_to_fetch = str(t2-t1)
Example #31
def read_file_spark(file_path, file_type, **kwargs):
    sc = init_nncontext()
    node_num, core_num = get_node_and_core_number()

    if ZooContext.orca_pandas_read_backend == "pandas":
        file_url_splits = file_path.split("://")
        prefix = file_url_splits[0]

        file_paths = []
        if isinstance(file_path, list):
            [
                file_paths.extend(extract_one_path(path, os.environ))
                for path in file_path
            ]
        else:
            file_paths = extract_one_path(file_path, os.environ)

        if not file_paths:
            raise Exception(
                "The file path is invalid or empty, please check your data")

        num_files = len(file_paths)
        total_cores = node_num * core_num
        num_partitions = num_files if num_files < total_cores else total_cores
        rdd = sc.parallelize(file_paths, num_partitions)

        if prefix == "hdfs":
            pd_rdd = rdd.mapPartitions(
                lambda iter: read_pd_hdfs_file_list(iter, file_type, **kwargs))
        elif prefix == "s3":
            pd_rdd = rdd.mapPartitions(
                lambda iter: read_pd_s3_file_list(iter, file_type, **kwargs))
        else:

            def loadFile(iterator):
                for x in iterator:
                    df = read_pd_file(x, file_type, **kwargs)
                    yield df

            pd_rdd = rdd.mapPartitions(loadFile)
    else:  # Spark backend; spark.read.csv/json accepts a folder path as input
        assert file_type == "json" or file_type == "csv", \
            "Unsupported file type: %s. Only csv and json files are supported for now" % file_type
        from pyspark.sql import SQLContext
        sqlContext = SQLContext.getOrCreate(sc)
        spark = sqlContext.sparkSession
        # TODO: add S3 confidentials

        # The following implementation is adapted from
        # https://github.com/databricks/koalas/blob/master/databricks/koalas/namespace.py
        # with some modifications.

        if "mangle_dupe_cols" in kwargs:
            assert kwargs[
                "mangle_dupe_cols"], "mangle_dupe_cols can only be True"
            kwargs.pop("mangle_dupe_cols")
        if "parse_dates" in kwargs:
            assert not kwargs["parse_dates"], "parse_dates can only be False"
            kwargs.pop("parse_dates")

        names = kwargs.get("names", None)
        if "names" in kwargs:
            kwargs.pop("names")
        usecols = kwargs.get("usecols", None)
        if "usecols" in kwargs:
            kwargs.pop("usecols")
        dtype = kwargs.get("dtype", None)
        if "dtype" in kwargs:
            kwargs.pop("dtype")
        squeeze = kwargs.get("squeeze", False)
        if "squeeze" in kwargs:
            kwargs.pop("squeeze")
        index_col = kwargs.get("index_col", None)
        if "index_col" in kwargs:
            kwargs.pop("index_col")

        if file_type == "csv":
            # Handle pandas-compatible keyword arguments
            kwargs["inferSchema"] = True
            header = kwargs.get("header", "infer")
            if isinstance(names, str):
                kwargs["schema"] = names
            if header == "infer":
                header = 0 if names is None else None
            if header == 0:
                kwargs["header"] = True
            elif header is None:
                kwargs["header"] = False
            else:
                raise ValueError("Unknown header argument {}".format(header))
            if "quotechar" in kwargs:
                quotechar = kwargs["quotechar"]
                kwargs.pop("quotechar")
                kwargs["quote"] = quotechar
            if "escapechar" in kwargs:
                escapechar = kwargs["escapechar"]
                kwargs.pop("escapechar")
                kwargs["escape"] = escapechar
            # sep and comment are the same as pandas
            if "comment" in kwargs:
                comment = kwargs["comment"]
                if not isinstance(comment, str) or len(comment) != 1:
                    raise ValueError(
                        "Only length-1 comment characters supported")
            df = spark.read.csv(file_path, **kwargs)
            if header is None:
                df = df.selectExpr(*[
                    "`%s` as `%s`" % (field.name, i)
                    for i, field in enumerate(df.schema)
                ])
        else:
            df = spark.read.json(file_path, **kwargs)

        # Handle pandas-compatible postprocessing arguments
        if usecols is not None and not callable(usecols):
            usecols = list(usecols)
        renamed = False
        if isinstance(names, list):
            if len(set(names)) != len(names):
                raise ValueError(
                    "Found duplicate names, please check your names input")
            if usecols is not None:
                if not callable(usecols):
                    # usecols is list
                    if len(names) != len(usecols) and len(names) != len(
                            df.schema):
                        raise ValueError("Passed names did not match usecols")
                if len(names) == len(df.schema):
                    df = df.selectExpr(*[
                        "`%s` as `%s`" % (field.name, name)
                        for field, name in zip(df.schema, names)
                    ])
                    renamed = True

            else:
                if len(names) != len(df.schema):
                    raise ValueError(
                        "The number of names [%s] does not match the number "
                        "of columns [%d]. Try names by a Spark SQL DDL-formatted "
                        "string." % (len(names), len(df.schema)))
                df = df.selectExpr(*[
                    "`%s` as `%s`" % (field.name, name)
                    for field, name in zip(df.schema, names)
                ])
                renamed = True
        index_map = dict([(i, field.name)
                          for i, field in enumerate(df.schema)])
        if usecols is not None:
            if callable(usecols):
                cols = [
                    field.name for field in df.schema if usecols(field.name)
                ]
                missing = []
            elif all(isinstance(col, int) for col in usecols):
                cols = [
                    field.name for i, field in enumerate(df.schema)
                    if i in usecols
                ]
                missing = [
                    col for col in usecols
                    if col >= len(df.schema) or df.schema[col].name not in cols
                ]
            elif all(isinstance(col, str) for col in usecols):
                cols = [
                    field.name for field in df.schema if field.name in usecols
                ]
                if isinstance(names, list):
                    missing = [c for c in usecols if c not in names]
                else:
                    missing = [col for col in usecols if col not in cols]
            else:
                raise ValueError(
                    "usecols must only be list-like of all strings, "
                    "all unicode, all integers or a callable.")
            if len(missing) > 0:
                raise ValueError(
                    "usecols do not match columns, columns expected but not found: %s"
                    % missing)
            if len(cols) > 0:
                df = df.select(cols)
                if isinstance(names, list):
                    if not renamed:
                        df = df.selectExpr(*[
                            "`%s` as `%s`" % (col, name)
                            for col, name in zip(cols, names)
                        ])
                        # update index map after rename
                        for index, col in index_map.items():
                            if col in cols:
                                index_map[index] = names[cols.index(col)]

        if df.rdd.getNumPartitions() < node_num:
            df = df.repartition(node_num)

        def to_pandas(columns, squeeze=False, index_col=None):
            def f(iter):
                import pandas as pd
                data = list(iter)
                pd_df = pd.DataFrame(data, columns=columns)
                if dtype is not None:
                    if isinstance(dtype, dict):
                        for col, type in dtype.items():
                            if isinstance(col, str):
                                if col not in pd_df.columns:
                                    raise ValueError(
                                        "column to be set type is not"
                                        " in current dataframe")
                                pd_df[col] = pd_df[col].astype(type)
                            elif isinstance(col, int):
                                if index_map[col] not in pd_df.columns:
                                    raise ValueError(
                                        "column index to be set type is not"
                                        " in current dataframe")
                                pd_df[index_map[col]] = pd_df[
                                    index_map[col]].astype(type)
                    else:
                        pd_df = pd_df.astype(dtype)
                if squeeze and len(pd_df.columns) == 1:
                    pd_df = pd_df.iloc[:, 0]
                if index_col:
                    pd_df = pd_df.set_index(index_col)

                return [pd_df]

            return f

        pd_rdd = df.rdd.mapPartitions(to_pandas(df.columns, squeeze,
                                                index_col))

    data_shards = SparkXShards(pd_rdd)
    return data_shards
Example #32
from pyspark import SparkContext
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.sql import SQLContext
from pyspark.sql import DataFrameWriter

# Module Constants
APP_NAME = "reddit-comment-controversiality-regression"
REDDIT_AUG = "swift://reddit3.sjc01/RC_2010-08"
REDDIT_SEPT = "swift://reddit3.sjc01/RC_2010-09"

if __name__ == "__main__":
    
    # Configure Spark
    sc = SparkContext(appName=APP_NAME)
    sqlContext = SQLContext(sc)

    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

    # prepare Reddit json files as sql Dataframes for pyspark.ml
    aug_comments =  sqlContext.read.json(REDDIT_AUG)
    sep_comments = sqlContext.read.json(REDDIT_SEPT)

    training = aug_comments.select('id', 'body', (aug_comments.controversiality).cast("double").alias('label'))
    test = sep_comments.select('id', 'body')
    test_actual = sep_comments.select('id', (sep_comments.controversiality).alias('actual'))
Example #33
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.types import IntegerType, FloatType
from pyspark.sql import functions as F

# Created a spark session
spark = SparkSession.builder \
           .master('local[*]') \
           .appName('My App') \
           .getOrCreate()
print(spark)

# Read a parquet file

sparkContext = spark.sparkContext
sc = SQLContext(sparkContext)

df = sc.read.parquet('../data/userdata1.parquet')
print(df)

# Mean
mean_df = df.agg({'salary': 'mean'})
print(mean_df.collect()[0][0])

# Using describe
described_df = df.select(['salary']).describe()
print(described_df.collect()[1][1])
Example #34
{"name":"Michael"}
{"name":"Andy", "age":30}
{"name":"Justin", "age":19}


Overview
 Spark SQL is a Spark module for structured data processing. It provides a
 programming abstraction called DataFrames and can also act as a distributed
 SQL query engine. Spark SQL can also be used to read data from an existing
 Hive installation.
 
--> DataFrames
 -->--> Starting Point: SQLContext
  
  #python
  >from pyspark.sql import SQLContext
  >sqlContext = SQLContext(sc)
  
 -->--> Creating DataFrames
  #python
  >from pyspark.sql import SQLContext
  >sqlContext = SQLContext(sc)
  >df = sqlContext.read.json("examples/src/main/resources/people.json")
  # Displays the content of the DataFrame to stdout
  >df.show()
  
 -->--> DataFrame Operations
 
  ------- python
  from pyspark.sql import SQLContext
  sqlContext = SQLContext(sc)
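
  A minimal sketch of the DataFrame operations this subsection introduces,
  assuming the same sc, sqlContext and people.json sample shown above:

  df = sqlContext.read.json("examples/src/main/resources/people.json")
  # Print the schema in a tree format
  df.printSchema()
  # Select only the "name" column
  df.select("name").show()
  # Select everybody, but increment the age by 1
  df.select(df['name'], df['age'] + 1).show()
  # Select people older than 21
  df.filter(df['age'] > 21).show()
  # Count people by age
  df.groupBy("age").count().show()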
  
Example #35
from pyspark.sql import SQLContext
from pyspark.sql import HiveContext
from pyspark.sql.types import *
import steel_thread
from pyspark import SparkContext
import forecast_data_v3
import numpy as np
import pandas as pd

sc = SparkContext()
hive_context = HiveContext(sc)
sqlContext = SQLContext(sc)

outageData = sc.textFile("file:///home/w205/steel_thread/outage_history.csv")
weatherData = sc.textFile("file:///home/w205/steel_thread/weather_history.csv")

riOutages = outageData.filter(lambda x: "Rhode Island" in x)
riOutageRecords = riOutages.map(lambda r: r.split(","))
weatherRecords = weatherData.map(lambda r: r.split(","))

RI_Outages = riOutageRecords.map(lambda p: (p[2], p[4], p[5], p[8], p[
    12]))  # I could not figure out how to properly parse this...
RI_Weather = weatherRecords.map(lambda p: (p[5], p[6], p[26], p[27], p[28], p[
    30], p[37], p[38], p[39], p[40], p[41], p[42], p[43], p[44], p[46]))

outageSchemaString = 'DATETIME HR MIN AREA NUMCUSTOMERS'  # If the above gets updated, this would too (of course)
weatherSchemaString = 'DTS ReportType maxTemp minTemp aveTemp aveHumidity WeatherCodes Precip Snowfall SnowDepth aveStationPressure aveSeaLevelPressure aveWindSpeed maxWindSpeed SustainedWindSpeed'

outageFields = [
    StructField(field_name, StringType(), True)
    for field_name in outageSchemaString.split()
Example #36
# See the License for the specific language governing permissions and
# limitations under the License.
#******************************************************************************/
import pprint
from pyspark.sql import SQLContext
from pyspark import SparkContext, SparkConf

conf = SparkConf().setAppName(
    "Cloudant Spark SQL External Datasource in Python")
# define Cloudant-related configuration
conf.set("cloudant.host", "yanglei.cloudant.com")
conf.set("cloudant.username", "ntledesewstarkalkedirsee")
conf.set("cloudant.password", "b0VbcAS7davOYC0f4umPC2BR")

sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

print 'About to test com.cloudant.spark.CloudantRP for airportcodemapping'
sqlContext.sql(
    "CREATE TEMPORARY TABLE airportTable USING com.cloudant.spark.CloudantRP OPTIONS ( database 'airportcodemapping')"
)

airportData = sqlContext.sql(
    "SELECT airportCode, airportName FROM airportTable WHERE airportCode >= 'CAA' ORDER BY airportCode"
)
airportData.printSchema()
for code in airportData.collect():
    print code.airportCode

print 'About to test com.cloudant.spark.CloudantRP for booking'
sqlContext.sql(
from pyspark.sql.types import IntegerType, StringType
import pyspark.sql.functions as F
from pyspark.ml.regression import LinearRegression, RandomForestRegressor
from pyspark.ml.clustering import KMeans
import json
import socket
import pandas as pd
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.ml.linalg import Vectors
import pandas as pd

conf = SparkConf().setAppName('MyFirstStandaloneApp')
conf.set("spark.network.timeout", "5601s")
conf.set("spark.executor.heartbeatInterval", "5600s")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)


def transData(data):
    return data.rdd.map(lambda r: [Vectors.dense(r[:-1]), r[-1]]).toDF(
        ['features', 'label'])


#---------- CHANGE THE PATH TO THE TRAINING DATA FILE!!!! ---------------
lines = sc.textFile('Structured_data2')
data = lines.map(lambda line: line.split(";"))
df_all = data.toDF(['Scrap_date','Scrap_time','Country_from','Country_to','Flight_id','Days','Journey_time','Airline1_There',\
                'Airline1_Back','Airline2_There','Airline2_Back','Price1_There','Price1_Back','Price2_There','Price2_Back',\
                'Depart_hour1_There','Depart_hour1_Back','Depart_hour2_There','Depart_hour2_Back','Depart_from1_There',\
                'Depart_from1_Back','Depart_from2_There','Depart_from2_Back','Arrival_hour1_There','Arrival_hour1_Back',\
                'Arrival_hour2_There','Arrival_hour2_Back','Arrive_to1_There','Arrive_to1_Back','Arrive_to2_There',\
Example #38
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import SQLContext

from graphframes.examples import Graphs

spark = SparkSession.builder.appName("GraphX").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

sqlContext = SQLContext(spark.sparkContext)

g = Graphs(sqlContext).friends()

g.vertices.show()
g.edges.show()







Example #39
# Even though columns are named differently, the column indices of the ones
# we're interested in are consistent across years
COLUMN_INDEX_TO_NAME = {
  1 : "Pickup_Time",
  2 : "Dropoff_Time",
  5 : "Start_Lon",
  6 : "Start_Lat",
  9 : "End_Lon",
  10 : "End_Lat",
}

# Setup Spark
conf = (SparkConf().setAppName('taxi-preprocessing'))
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
sql_context = SQLContext(sc)

# Read & Parse file list.
# From lines like "2009/yellow_tripdata_2009-03.csv", it extracts the file name.
with open(input_file_list_file) as filelist_file:
    filelist = [line.strip().split("/")[1] for line in filelist_file.readlines()]

# Read in all CSVs, project the relation to the columns we need & concatenate
df = None
for csv_file in filelist:
    new_df = sql_context.read.format('com.databricks.spark.csv')\
             .options(header='true', inferschema='true')\
             .load(file_location_base + csv_file)

    for column_index, column_name in COLUMN_INDEX_TO_NAME.iteritems():
        new_df = new_df.withColumnRenamed(new_df.columns[column_index], column_name)
Example #40
def getSqlContextInstance(sparkContext):
    if ('sqlContextSingletonInstance' not in globals()):
        globals()['sqlContextSingletonInstance'] = SQLContext(sc)
    return globals()['sqlContextSingletonInstance']
Example #41
import re
# other required imports here

if __name__ == "__main__":
    # create Spark context with necessary configuration
    spark = SparkContext("local", "Stock Returns")

    # read json data from the newdata directory
    # df = SQLContext(spark).read.option("multiLine", True) \
    # .option("mode", "PERMISSIVE").json("./newsdata")
    schema = (
        'date STRING, open FLOAT, high FLOAT, low FLOAT, close FLOAT, volume INT, ticker STRING'
    )

    df = SQLContext(spark).read.csv('stock_prices.csv',
                                    schema=schema,
                                    header=False)
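    # The DDL string above is shorthand for an explicit StructType; a sketch of
    # the equivalent programmatic schema (assuming the same names and types):
    # from pyspark.sql.types import (StructType, StructField, StringType,
    #                                FloatType, IntegerType)
    # schema_struct = StructType([
    #     StructField('date', StringType()), StructField('open', FloatType()),
    #     StructField('high', FloatType()), StructField('low', FloatType()),
    #     StructField('close', FloatType()), StructField('volume', IntegerType()),
    #     StructField('ticker', StringType()),
    # ])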
    # df.show(2)
    # lines = df.select("date","open","close")
    # sim = df.withColumn("percent", (df("close") - df("open"))*100/df("open"))
    sim = df.withColumn("return",
                        (df["close"] - df["open"]) * 100 / df["open"])
    # sim.groupBy('date').avg('return').show()
    # sim.select("date","return").groupBy("date").avg()
    x = sim.groupBy("date").avg("return")
    x.collect()
    # sim=sim.select('date','return')
    # df.groupBy(df.date).avg(df.close - df.open).show()
    # vals = lines.map(lambda row: row[2]-row[1])
    # to take avg on key
Ejemplo n.º 42
0
    else:
        k = 10
        w = 0.5
        alpha = 6
        b_update = True
        debug = True
        loss_type = 0
        dataset = 'slicing/datasets/parallel_data/salaries/rows1000.csv'
        enumerator = "join"

    conf = SparkConf().setAppName("salary_test").setMaster('local[4]')
    num_partitions = 8
    model_type = "regression"
    label = 'salary'
    sparkContext = SparkContext(conf=conf)
    sqlContext = SQLContext(sparkContext)
    fileRDD = sparkContext.textFile(dataset, num_partitions)
    header = fileRDD.first()
    head_split = header.split(",")
    fileRDD = fileRDD.filter(lambda line: line != header)
    data = fileRDD.map(lambda row: row.split(","))
    dataset_df = sqlContext.createDataFrame(data, head_split)

    cat_features = ["rank", "discipline", "sincephd_bin", "service_bin", "sex"]
    # initializing stages of main transformation pipeline
    stages = []
    dataset_df = dataset_df.withColumn("id", sf.monotonically_increasing_id())
    # binning numeric features with a local binner udf function (specific to the current dataset if needed)
    dataset_df = dataset_df.withColumn('sincephd_bin',
                                       binner(dataset_df['sincephd']))
    dataset_df = dataset_df.withColumn('service_bin',
Ejemplo n.º 43
0
# schema defined should exactly match the table created in cassandra
class userrepo2014_2(Model):
    username = columns.Text(primary_key=True)
    repo = columns.List(columns.Text)

    def __repr__(self):
        return '%s %s' % (self.username, self.repo)
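
# Hedged sketch (assuming the model above uses cassandra-driver's cqlengine):
# the usual pattern is to open a cqlengine connection and sync_table() so the
# Cassandra table matches this schema before anything is written to it.
# The host list and keyspace name below are placeholders.
# from cassandra.cqlengine import connection
# from cassandra.cqlengine.management import sync_table
# connection.setup(['127.0.0.1'], 'gitdata')
# sync_table(userrepo2014_2)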


# getting master node's IP and public DNS to run Spark job and read from HDFS
master_ip = os.environ['master_ip']
master_public_dns = os.environ['master_public_dns']

# setting SparkContext and SQLContext
sc = SparkContext("spark://" + master_ip + ":7077", "2014_events")
sqlContext = SQLContext(sc)

# reading events data for 2014 from HDFS
df14 = sqlContext.jsonFile("hdfs://" + master_public_dns +
                           ":9000/data2014_2/2014-*.*")

# filtering rows with just the three relevant events
df14_watch = df14.filter("type='WatchEvent'")
df14_commit = df14.filter("type='CommitCommentEvent'")
df14_fork = df14.filter("type='ForkEvent'")

# registering  dataframes as tables to be able to select just the three relevant columns
sqlContext.registerDataFrameAsTable(df14_watch, "df14_watch_table")
sqlContext.registerDataFrameAsTable(df14_commit, "df14_commit_table")
sqlContext.registerDataFrameAsTable(df14_fork, "df14_fork_table")
Ejemplo n.º 44
0
    def __init__(self, sparkContext, magellanContext=None):
        SQLContext.__init__(self, sparkContext)
        if magellanContext:
            self._scala_MagellanContext = magellanContext
# -*- coding: utf-8 -*-

from  pyspark import SparkContext,SparkConf
from pyspark.sql import SQLContext
from pyspark.sql.types import *
import os
import time

import re
if __name__ == "__main__":

    conf = SparkConf()
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # PYSPARK_PYTHON = "C:\\Python27\\python.exe"    # with multiple Python versions installed, set this to pick the interpreter PySpark should use
    # os.environ["PYSPARK_PYTHON"] = PYSPARK_PYTHON

    def p(x):
        print type(x),type(x[0]),type(x[1])
        # print type(x)
        # print type(x[0]),type(x[1])
        print x[0],x[1][0],x[1][1]


    # judgment_new: about 5.56 million rows, 3030306	3392975
    df = sqlContext.read.jdbc(url='jdbc:mysql://cdh-slave1:3306/laws_doc_zhangye_etl',
                               table='(select * from judgment_zhangye_etl01  ) tmp',
                               column='id', lowerBound=1, upperBound=4816521, numPartitions=28,
                               properties={"user": "******", "password": "******"})
    # court: 4,778 rows
Ejemplo n.º 46
0
# Spark Hands On Training
# Databricks CE Cloud Practice
# Raul Arrabales / Conscious-Robots.com

# Getting the Spark SQL context and imports
from pyspark.sql import SQLContext, Row
sqlContext = SQLContext.getOrCreate(sc.getOrCreate())

# Creating a simple DataFrame programmatically
array = [
    Row(key="a", group="vowels", value=1),
    Row(key="b", group="consonants", value=2),
    Row(key="c", group="consonants", value=3),
    Row(key="d", group="consonants", value=4),
    Row(key="e", group="vowels", value=5)
]
dataframe = sqlContext.createDataFrame(sc.parallelize(array))
dataframe.registerTempTable("PythonTestTable")

# Visualize (in Databricks cloud - Display() )
display(dataframe)

# Creating more sample DataFrames:

# Sample age data:
datosEdad = [('Raul', 22), ('Ana', 32), ('Juan', 46)]
df1 = sqlContext.createDataFrame(datosEdad, ['nombre', 'edad'])

# Apply filter to age data:
filtroEdad = df1.filter(df1.edad >= 30).collect()
print filtroEdad
Ejemplo n.º 47
0
# -*- coding:utf-8 -*-
# author :seed
# date :20170522
# Based on the official examples in the Spark documentation
from pyspark import SparkContext,SparkConf
from pyspark.sql import HiveContext
from pyspark.sql import SQLContext
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql import functions as F
from pyspark.sql.functions import *
#from pyspark.sql.functions import col

conf = SparkConf().setAppName("the apache sparksql")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
sc.setLogLevel("WARN")

l = [("zhangfei",1),("guanyu",33)]
row = sqlContext.createDataFrame(l).collect()
row = sqlContext.createDataFrame(l, ['name', 'age']).collect()
print(row)
d = [{"name":"zhangfei","age":33},{"name":"guanyu","age":44}]
row = sqlContext.createDataFrame(d).collect()
print("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx ")
print(row)
rdd = sc.parallelize(l)
row = sqlContext.createDataFrame(rdd).collect()
print("xxxxxxxxxxxxxxxxxxxx the 2nd xxxxxxxxxxxxxxxxxxxxxxxxxxx")
print(row)
Ejemplo n.º 48
0
# Project Crime/Living Index - Dhivya Sivaramakrishnan, Mangesh Bhangare

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, DataFrame, Row

import sys

conf = SparkConf().setAppName('K-Means test')
sc = SparkContext(conf=conf)
assert sc.version >= '1.5.1'
sqlContext = SQLContext(sc)

input_cluster = sys.argv[1]
output = sys.argv[2]

# Read the parquet data (output of K-means) and convert to RDD
parquet_cluster = sqlContext.read.parquet(input_cluster)
parquet_cluster.registerTempTable("cluster_data")
cluster_output = sqlContext.sql("SELECT * FROM cluster_data")

# Save the result as a text file containing tuples
cluster_tuple = cluster_output.rdd.map(tuple)
cluster_output = cluster_tuple.saveAsTextFile(output)
from pyspark import SparkContext
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
import pyspark.sql.functions as F
from pyspark.sql.functions import split
from pyspark.sql.functions import lit
from pyspark.sql.functions import format_string
from pyspark.sql.functions import monotonically_increasing_id
import functools

sc = SparkContext()

sqlc=SQLContext(sc)

def unionAll(dfs):
    return functools.reduce(lambda df1,df2: df1.union(df2.select(df1.columns)), dfs) 
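
# Hedged usage sketch: the select(df1.columns) inside unionAll() realigns each
# frame to the first frame's column order, so frames built with the same
# columns in a different order still union cleanly. part_a/part_b/combined are
# illustrative names only.
part_a = sqlc.createDataFrame([(1, 'x')], ['id', 'val'])
part_b = sqlc.createDataFrame([('y', 2)], ['val', 'id'])
combined = unionAll([part_a, part_b])   # columns come out as id, val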

original_df = sqlc.read \
      .format("jdbc") \
      .option("url", "jdbc:oracle:thin:@150.136.138.197:1521/BIASDB_PDB1.subnet12011439.vcn12011439.oraclevcn.com") \
      .option("dbtable", "POC.social_media_dwh") \
      .option("user", "poc") \
      .option("password", "WElcome##123") \
      .option("driver", "oracle.jdbc.driver.OracleDriver") \
      .load()
	  
selectDF=original_df.select(original_df['social_media_id'], original_df['friends_list'])
Ejemplo n.º 50
0
def predict(sc, borough, speed, weather):
    sqlContext = SQLContext(sc)
    isSpeeding = 0
    if int(speed) > 70:
        isSpeeding = 1
    model = PipelineModel.load("data/treeModelNew2")
    time = str(datetime.now()).split(' ')
    month = float(time[0].split('-')[1])
    hour = float(time[1].split(':')[0])
    city2id = {
        "Brooklyn": 0,
        "Queens": 1,
        "Staten_Island": 2,
        "Bronx": 3,
        "Manhattan": 4
    }
    if hour == 0.0:
        hour = 24.0
    data = [(float(city2id[borough]), float(speed), month, hour,
             float(weather['wind']), float(weather['rain']),
             float(weather['snow']), float(weather['snwd']),
             float(weather['temp'] * 9.0 / 5.0 + 32))]

    rdd = sc.parallelize(data)
    test = rdd.map(lambda x: Row(BOROUGH_1=x[0],
                                 MEAN_SPEED=x[1],
                                 MONTH=x[2],
                                 HOUR=x[3],
                                 AWND=x[4],
                                 PRCP=x[5],
                                 SNOW=x[6],
                                 SNWD=x[7],
                                 TAVG=x[8]))

    df = sqlContext.createDataFrame(test)
    df.show()

    assembler = VectorAssembler(inputCols=[
        "BOROUGH_1", "MEAN_SPEED", "MONTH", "HOUR", "AWND", "PRCP", "SNOW",
        "SNWD", "TAVG"
    ],
                                outputCol="features")

    df2 = assembler.transform(df)

    df3 = VectorIndexer(inputCol="features",
                        outputCol="indexedFeatures",
                        maxCategories=25).fit(df2).transform(df2)

    predictions = model.transform(df3)

    predictions.show()

    predictions = predictions.toPandas()

    predictedLevel = predictions["prediction"][0] + isSpeeding

    print("======================")
    print("======================")
    print("predictedLevel:", predictedLevel)
    print("======================")
    print("======================")

    return predictedLevel
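
# Hedged usage sketch (commented out; it needs a live SparkContext and the
# saved PipelineModel at data/treeModelNew2). The weather keys mirror exactly
# what predict() reads: wind, rain, snow, snwd and temp in Celsius; the values
# are placeholders.
# sample_weather = {'wind': 5.0, 'rain': 0.0, 'snow': 0.0, 'snwd': 0.0,
#                   'temp': 21.0}
# level = predict(sc, 'Brooklyn', 45, sample_weather)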
Ejemplo n.º 51
0
import findspark
import nltk
from nltk.corpus import stopwords
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

findspark.init()

conf = SparkConf().setAppName("TF-IDF").set("spark.dynamicAllocation.enabled",
                                            "true")  # Set Spark configuration
try:
    sc = SparkContext(conf=conf)
except:
    sc.stop()
    sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")
sql = SQLContext(sc)

docs_path = "C:/stories/test"  # Path to the data
textFiles = sc.wholeTextFiles(docs_path)  # (path_doc_name, content)
num_docs = textFiles.count()

# Get the list of stop word.
try:
    stops = set(stopwords.words('english'))
except:
    nltk.download('popular')
    stops = set(stopwords.words('english'))


def delete_stop_word(word: str):
    global stops
Ejemplo n.º 52
0
def get_sql_context_instance(spark_context):
    if ('sqlContextSingletonInstance' not in globals()):
        globals()['sqlContextSingletonInstance'] = SQLContext(spark_context)
    return globals()['sqlContextSingletonInstance']
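
# Hedged usage sketch: this singleton helper is normally called from inside a
# Spark Streaming foreachRDD callback so every micro-batch reuses one
# SQLContext instead of creating a new one. The stream and Row field names
# below are illustrative.
# def process(time, rdd):
#     sql_ctx = get_sql_context_instance(rdd.context)
#     row_rdd = rdd.map(lambda w: Row(word=w))
#     sql_ctx.createDataFrame(row_rdd).registerTempTable('words')
# word_stream.foreachRDD(process)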
Ejemplo n.º 53
0

from pyspark import SparkContext,SparkConf
import pyspark
from pyspark.sql.session import SparkSession
from pyspark.sql.types import StructField,StringType,DoubleType,IntegerType
from pyspark.sql import functions as f
from pyspark.sql.functions import lit,trim,concat,coalesce,udf,struct
from pyspark.sql import SQLContext


# In[4]:


sc = SparkContext('local','similarity')
sqlContext = SQLContext(sc)


# In[150]:


#reading the input
input_1 = sc.textFile('./data1.txt')


# In[151]:


input_f=input_1.map(lambda x: x+'~ ') #delimiting every document by ~

Ejemplo n.º 54
0
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql.functions import udf
from pyspark.sql.functions import col
from pyspark.sql.types import StringType, DoubleType, IntegerType
from abbreviations_dict import tofullname, toevent
from operator import itemgetter
from pyspark import StorageLevel
import pyspark_cassandra

sc = SparkContext()
sqlContext = SQLContext(sc)

customSchema =  StructType([
        StructField('GLOBALEVENTID',StringType(),True),
        StructField('SQLDATE',StringType(),True),
        StructField('MonthYear',StringType(),True),
        StructField('Year',StringType(),True),
        StructField('FractionDate',StringType(),True),
        StructField('Actor1Code',StringType(),True),
        StructField('Actor1Name',StringType(),True),
        StructField('Actor1CountryCode',StringType(),True),
        StructField('Actor1KnownGroupCode',StringType(),True),
        StructField('Actor1EthnicCode',StringType(),True),
        StructField('Actor1Religion1Code',StringType(),True),
        StructField('Actor1Religion2Code',StringType(),True),
        StructField('Actor1Type1Code',StringType(),True),
        StructField('Actor1Type2Code',StringType(),True),
        StructField('Actor1Type3Code',StringType(),True),
        StructField('Actor2Code',StringType(),True),
Ejemplo n.º 55
0
#
# Copyright 2021,  SenX S.A.S.
#

from pyspark.sql import SparkSession
from pyspark.sql import SQLContext

spark = SparkSession.builder.appName("02").getOrCreate()
sc = spark.sparkContext

sqlContext = SQLContext(sc)

##
## Configuration used to fetch data from a Warp 10 instance
##

conf = {}
conf['warp10.fetcher.fallbacks'] = '127.0.0.1'
conf['warp10.fetcher.fallbacksonly'] = 'true'
conf['warp10.fetcher.protocol'] = 'http'
conf['http.header.now'] = 'X-Warp10-Now'
conf['http.header.timespan'] = 'X-Warp10-Timespan'
conf['warp10.fetcher.port'] = '8080'
conf['warp10.fetcher.path'] = '/api/v0/sfetch'
conf['warp10.splits.endpoint'] = 'http://127.0.0.1:8080/api/v0/splits'

# We fetch a single data point from the GTS, this could be an actual timespan if it were a positive value
conf['warp10.fetch.timespan'] = '-1'

conf['warp10.http.connect.timeout'] = '60000'
conf['warp10.http.read.timeout'] = '60000'
def sim_matrix(pair):
    similarity_mat = []
    pair = pair[1]
    for i in range(len(pair)):
        for j in range(i + 1, len(pair)):
            if (len(pair) > 1):
                wt1 = pair[i][1]
                wt2 = pair[j][1]
                sim = ((pair[i][0], pair[j][0]), wt1 * wt2)
                similarity_mat.append(sim)
    return similarity_mat
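
# Illustration (hedged, not from the original file): for an inverted-index
# entry shaped like ('term', [(doc_i, w_i), ...]), sim_matrix() emits one
# ((doc_i, doc_j), w_i * w_j) partial product per pair of documents sharing
# the term, e.g.
#   sim_matrix(('term', [('d1', 2), ('d2', 3)]))  ->  [(('d1', 'd2'), 6)]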


conf = SparkConf().setAppName("HW3_Part3_Avro_Uncompressed")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
sqlContext.setConf("spark.sql.avro.compression.codec", "uncompressed")

#Read the inverted index as dataframe and convert it into rdd
inv_index_data_avro = sqlContext.read.format("com.databricks.spark.avro").load(
    sys.argv[1])
inv_index_rdd = inv_index_data_avro.rdd.map(list)

#Find similarity matrix
inverted_file = inv_index_rdd.map(lambda pr: pr).filter(
    lambda pr: len(pr) > 0).map(lambda pair: pair)
sim_cal = inverted_file.map(sim_matrix).flatMap(lambda pr: pr)
similarity_matrix = sim_cal.reduceByKey(lambda c1, c2: c1 + c2).sortBy(
    lambda x: x[1], ascending=False)

#Create dataframe and write as avro file
Ejemplo n.º 57
0
!pip install pyspark
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import Row
from pyspark.sql import SQLContext
conf = SparkConf().setAll([('spark.executor.memory', '1g'),('spark.driver.memory','1g')])
sc =  SparkContext(conf=conf)
# creating an SQLContext fixes: AttributeError: 'PipelinedRDD' object has no attribute 'toDF'
sqlContext = SQLContext(sc)
rdd = sc.parallelize([1,2,3,4])
df = rdd.map(lambda l: Row(l)).toDF()
with open ('/bigdata/xiaoma/spark/data/people.csv') as f:
    for l in f:
        print(l)
myDF = sqlContext.read.format("com.databricks.spark.csv").option("header", "true").load("file:///bigdata/xiaoma/spark/data/people.csv")
df.registerTempTable("tasks")
results = sqlContext.sql("select * from tasks")
results.show()
lines = sc.textFile('file:///bigdata/xiaoma/spark/data/people.csv')\
.map(lambda x:x.split(','))\
.map(lambda l:Row(ID=l[0],name=l[1],age=l[2],sex=l[3],val=l[4]))
for i in lines.collect(): 
    print(i)
    
myDF = sc.textFile('file:///bigdata/xiaoma/spark/data/people.csv')\
.map(lambda x:x.split(','))\
.map(lambda l:Row(ID=l[0],name=l[1],age=l[2],sex=l[3],val=l[4]))\
.toDF()
myDF.show(20)  
myDF.select('name').show()
myDF.registerTempTable("tmp_df")
Ejemplo n.º 58
0
f.filter(lambda x: "LMKBRUKER" in x).count()

errors = f.filter(lambda line: line.startswith("139.116.15.37,POSTEN"))
messages = errors.map(
    lambda s: s.split(',')[2])  # Get the third element in the tuplet
messages.cache()
messages.filter(lambda s: "7/28" in s).count()

messages = errors.map(lambda s: s.split(',')[2]).collect()

# --------------------------------------------------------------------------------------------
# Spark SQL:

# from pyspark.sql import SQLContext, Row
from pyspark.sql import *
sqlContext = SQLContext(sc)

messages = errors.map(
    lambda s: s.split(',')).collect()  # Split each line into its fields
for m in messages[0][0:4]:
    print m  # Print fields 0-3 of row 0

# Ex 1
lines = sc.textFile("file:///" +
                    "C:/coding/Hadoop/pig/MapReduceInputData/iis3.log")
messages = lines.map(lambda l: l.split(","))
messages_subset = messages.map(
    lambda p: Row(ip=p[0], user=p[1], date=p[2], time=p[3]))

# Ex 2
lines = sc.textFile("file:///" +
Ejemplo n.º 59
0
# May cause deprecation warnings, safe to ignore, they aren't errors
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import desc

# In[2]:

# Can only run this once. restart your kernel for any errors.
sc = SparkContext()

# In[3]:

ssc = StreamingContext(sc, 10)
sqlContext = SQLContext(sc)

# In[4]:

socket_stream = ssc.socketTextStream("127.0.0.1", 5555)

# In[5]:

lines = socket_stream.window(20)

# In[6]:

from collections import namedtuple
fields = ("tag", "count")
Tweet = namedtuple('Tweet', fields)
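
# Hedged sketch of a typical continuation (the original snippet is cut off
# here): count hashtags inside each window, keep the top ten, and expose them
# as a temp table for plotting. The "tweets" table name is an assumption.
# (lines.flatMap(lambda text: text.split(" "))
#       .filter(lambda word: word.lower().startswith("#"))
#       .map(lambda word: (word.lower(), 1))
#       .reduceByKey(lambda a, b: a + b)
#       .map(lambda rec: Tweet(rec[0], rec[1]))
#       .foreachRDD(lambda rdd: rdd.toDF().sort(desc("count"))
#                      .limit(10).registerTempTable("tweets")))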
Ejemplo n.º 60
0
def main(argv):
    #STEP1: data ingestion
    sc = SparkContext(appName="KaggleDato_Step2")
    sqlContext = SQLContext(sc)

    #read data into a DataFrame
    input_schema_rdd = sqlContext.read.json(
        "file:///scratch/network/alexeys/KaggleDato/Preprocessed/0_1/part-00000"
    )
    #input_schema_rdd.show()
    #input_schema_rdd.printSchema()
    #input_schema_rdd.select("id").show()

    train_label_rdd = sqlContext.read.json(PATH_TO_TRAIN_LABELS)
    sub_label_rdd = sqlContext.read.json(PATH_TO_SUB_LABELS)

    input_schema_rdd.registerTempTable("input")
    train_label_rdd.registerTempTable("train_label")
    sub_label_rdd.registerTempTable("sub_label")

    # SQL can be run over DataFrames that have been registered as a table.
    train_wlabels_0 = sqlContext.sql(
        "SELECT title,text,images,links,label FROM input JOIN train_label WHERE input.id = train_label.id AND label = 0"
    )
    train_wlabels_1 = sqlContext.sql(
        "SELECT title,text,images,links,label FROM input JOIN train_label WHERE input.id = train_label.id AND label = 1"
    )

    sub_wlabels = sqlContext.sql(
        "SELECT title,text,images,links,label FROM input JOIN sub_label WHERE input.id = sub_label.id"
    )

    text_only_0 = train_wlabels_0.map(lambda p: p.text)
    text_only_1 = train_wlabels_1.map(lambda p: p.text)
    image_only_0 = train_wlabels_0.map(lambda p: p.images)
    image_only_1 = train_wlabels_1.map(lambda p: p.images)
    links_only_0 = train_wlabels_0.map(lambda p: p.links)
    links_only_1 = train_wlabels_1.map(lambda p: p.links)
    title_only_0 = train_wlabels_0.map(lambda p: p.title)
    title_only_1 = train_wlabels_1.map(lambda p: p.title)

    tf = HashingTF(numFeatures=10)
    #preprocess text features
    text_documents_0 = text_only_0.map(lambda line: tokenize(line)).map(
        lambda word: tf.transform(word))
    text_documents_1 = text_only_1.map(lambda line: tokenize(line)).map(
        lambda word: tf.transform(word))

    #append the ad hoc non-text features
    documents_0 = text_documents_0.zip(image_only_0).zip(links_only_0).zip(
        title_only_0)
    documents_1 = text_documents_1.zip(image_only_1).zip(links_only_1).zip(
        title_only_1)

    #turn into a format expected by MLlib classifiers
    labeled_tfidf_0 = documents_0.map(lambda row: parsePoint(0, row))
    labeled_tfidf_1 = documents_1.map(lambda row: parsePoint(1, row))
    #print labeled_tfidf_0.take(2)

    labeled_tfidf = labeled_tfidf_0.union(labeled_tfidf_1)
    #print labeled_tfidf.count()
    #print labeled_tfidf.collect()
    labeled_tfidf.cache()

    #CV split
    (trainData, cvData) = labeled_tfidf.randomSplit([0.7, 0.3])
    trainData.cache()
    cvData.cache()

    #Try various classifiers
    #With logistic regression only use training data
    #model = LogisticRegressionWithLBFGS.train(trainData)
    #Logistic regression works a lot better
    #model = NaiveBayes.train(trainData)
    #random forest
    model = RandomForest.trainClassifier(trainData,
                                         numClasses=2,
                                         categoricalFeaturesInfo={},
                                         numTrees=3,
                                         featureSubsetStrategy="auto",
                                         impurity='gini',
                                         maxDepth=4,
                                         maxBins=32)

    ## Evaluating the model on the CV split
    #labelsAndPreds = cvData.map(lambda p: (p.label, model.predict(p.features)))
    ##print labelsAndPreds.collect()
    #trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(cvData.count())
    #print("CV Error = " + str(trainErr))

    # Evaluate model on test instances and compute test error
    predictions = model.predict(cvData.map(lambda x: x.features))
    labelsAndPredictions = cvData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(
        lambda (v, p): v != p).count() / float(cvData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification forest model:')
    print(model.toDebugString())