from pyspark.sql import SQLContext
from pyspark.sql.types import StructField, StructType, StringType

if __name__ == '__main__':
    sprk = Spark_Session()            # custom wrapper, assumed defined elsewhere
    conn = sprk.Spark_Context()       # SparkContext
    sql_conn = sprk.Spark_Connect()   # session/context used for DataFrame creation

    ## Create an RDD from each file; each line of Emp.txt is split into fields.
    dept_rdd = conn.textFile("/Users/shuvamoymondal/Downloads/Dept.txt")
    emp_rdd = conn.textFile("/Users/shuvamoymondal/Downloads/Emp.txt").map(lambda s: s.split(","))
    print(emp_rdd.take(3))

    # Each line is converted to a tuple.
    emp = emp_rdd.map(lambda s: (s[0], s[1].strip(), s[2].strip(), s[3].strip(), s[4], s[5]))
    print(emp.take(2))

    # The schema is encoded in a string.
    schemaString = "Id Name Desgn Age Sal Dept_ID"

    ## Read every field defined above as StringType and create a StructField
    ## for each, then bind all the fields into a StructType schema.
    fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]
    schema = StructType(fields)

    # Apply the schema to the RDD.
    df = sql_conn.createDataFrame(emp, schema)
    df.show()
    df.createOrReplaceTempView("emp")

    # The SQLContext must be created before it is used (the original created
    # it after the query). The original query was also truncated at
    # "select sum("; the completion below is an assumption: total salary
    # per department against the "emp" view registered above.
    sqlContext = SQLContext(conn)
    sqlContext.sql("select Dept_ID, sum(Sal) from emp group by Dept_ID").show()

    # Load a CSV file through the spark-csv package (Spark 1.x style API).
    data = sqlContext.load(source="com.databricks.spark.csv",
                           path='/Users/shuvamoymondal/Downloads/SalesJan2009',
                           header=True, inferSchema=True)
    data.show(2)
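# --- Hedged aside (not part of the original snippet) ---
# On Spark 2.x+ the external com.databricks.spark.csv package is no longer
# needed: the built-in CSV reader performs the same load. A minimal,
# self-contained sketch; the app name is illustrative only.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("csv-load-sketch").getOrCreate()
sales = spark.read.csv("/Users/shuvamoymondal/Downloads/SalesJan2009",
                       header=True, inferSchema=True)
sales.show(2)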
import sys
from pyspark import SparkContext
from pyspark.sql import SQLContext

# Run inside SPSS Modeler when the analytic runtime is available; fall back
# to a local test setup otherwise. The bare "except:" in the original
# fragment implies the try block reconstructed here.
try:
    import spss.pyspark.runtime
    ascontext = spss.pyspark.runtime.getContext()
    sc = ascontext.getSparkContext()
    df = ascontext.getSparkInputData()
    model_type = '%%model_type%%'
    target = '%%target_field%%'
    lambda_param = float('%%lambda%%')
    predictors = [x.strip() for x in "%%predictor_fields%%".split(",")]
    modelpath = ascontext.createTemporaryFolder()
except:
    import os
    sc = SparkContext('local')
    sqlCtx = SQLContext(sc)
    # Get an input dataframe with sample data by looking in the working
    # directory for the file DRUG1N.json.
    wd = os.getcwd()
    df = sqlCtx.load("file://" + wd + "/DRUG1N.json", "json").repartition(4)
    # Specify predictors and target.
    predictors = ["Drug", "BP", "Sex", "Age"]
    target = "Cholesterol"
    lambda_param = 1.0
    modelpath_base = "/tmp/model1234"
    import shutil
    try:
        shutil.rmtree(modelpath_base)
    except:
        pass
    modelpath = "file://" + modelpath_base + "/model"
    metadatapath = modelpath_base + "/metadata"

mbr = ModelBuildReporter(sc)  # reporting helper, assumed defined elsewhere
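# --- Hedged aside (not part of the original snippet) ---
# The fragment ends before showing what gets written to metadatapath. One
# plausible sketch, assuming the local test branch ran and that the metadata
# is a small JSON document; the field names below are assumptions made for
# illustration, not the source's actual format.
import json
import os

if not os.path.exists(modelpath_base):
    os.makedirs(modelpath_base)
metadata = {"target": target,
            "predictors": list(predictors),
            "lambda": lambda_param}
with open(metadatapath, "w") as f:
    json.dump(metadata, f)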
import sys
import os
import json
import time
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import DenseVector
from sklearn import preprocessing, svm

ascontext = None
if len(sys.argv) > 1 and sys.argv[1] == "-test":
    # Local test mode: build a SparkContext and get an input dataframe with
    # sample data by looking in the working directory for file DRUG1N.json.
    sc = SparkContext('local')
    sqlCtx = SQLContext(sc)
    wd = os.getcwd()
    df = sqlCtx.load("file://" + wd + "/DRUG1N.json", "json").repartition(4)
    # Specify predictors, target, model paths, and SVM hyperparameters.
    fields = ["Age", "K", "Na"]
    target = "BP"
    modelpath = "/tmp/svm.model"
    modelmetadata_path = "/tmp/svm.metadata"
    kernel = "linear"
    tol = 0.001
    coef0 = 0.0
    gamma = 'auto'
    degree = 3
    shrinking = True
else:
    # Inside SPSS Modeler: pull the Spark context from the analytic runtime.
    import spss.pyspark.runtime
    ascontext = spss.pyspark.runtime.getContext()
    sc = ascontext.getSparkContext()
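# --- Hedged aside (not part of the original snippet) ---
# How the hyperparameters above could plausibly reach scikit-learn once the
# rows are collected on the driver: label-encode the target and fit an SVC.
# This sketch assumes the test branch ran (so df, fields, and target exist)
# and that the data fits in driver memory; it is not the source's method.
pdf = df.select(fields + [target]).toPandas()
y = preprocessing.LabelEncoder().fit_transform(pdf[target])
X = pdf[fields].values
clf = svm.SVC(kernel=kernel, tol=tol, coef0=coef0, gamma=gamma,
              degree=degree, shrinking=shrinking)
clf.fit(X, y)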