def loadDF(filename):
    """Load and parse filename as a pyspark.sql.DataFrame using pyspark_csv.py."""
    path = os.path.join(DATADIR, filename)
    plain = sc.textFile(path)
    df = pycsv.csvToDataFrame(sqlCtx, plain, sep=',')
    return df
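# Usage sketch: a minimal call to loadDF, assuming DATADIR, sc, sqlCtx, and
# pycsv are already defined by the surrounding module; 'sample.csv' is a
# hypothetical file name used only for illustration.
df = loadDF('sample.csv')
df.printSchema()
df.show(5)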
# requires: from pyspark.sql.functions import lit
def processRdd(rdd):
    try:
        print('processRDD')
        # convert the rdd to a dataframe
        printOnConsole('Started Processing the streams')
        #desiredCol = ['c-ip','cs-uri-stem','c-user-agent','customer-id','x-ec_custom-1']
        if rdd.count() > 0:
            df = pycsv.csvToDataFrame(sqlContext, rdd, columns=COLUMNS, colTypes=COLUMN_TYPES)
            #df = df.select(desiredCol)

            #startTime
            endTime = getCurrentTimeStamp()
            startTime = endTime - SPARK_STREAM_BATCH
            endTime = getDateTimeFormat(endTime)
            startTime = getDateTimeFormat(startTime)
            df = df.withColumn(COL_STARTTIME, lit(startTime))

            #endTime
            df = df.withColumn(COL_ENDTIME, lit(endTime))

            df.registerTempTable("tempTable")
            query = ('select' +
                     ' startTime,' +                                            #startTime
                     ' endTime,' +                                              #endTime
                     ' \'\' as ' + COL_CUSTOMERID + ',' +                       #customerid
                     ' setProjectId(`projectid`) as ' + COL_PROJECTID + ',' +   #projectid
                     ' \'\' as ' + COL_FONTTYPE + ',' +                         #FontType
                     ' \'\' as ' + COL_FONTID + ',' +                           #FontId
                     ' getDomainName(`referrer`) as ' + COL_DOMAINNAME + ',' +  #DomainName
                     ' getBrowser(`useragent`) as ' + COL_USERAGENT + ',' +     #UserAgent
                     ' setIpaddress(`ip`) as ' + COL_IPADDRESS +                #customer ipaddress
                     ' from tempTable')
            df = sqlContext.sql(query)

            record_type = PAGEVIEW_TYPE
            processForTable(df, record_type)
        else:
            printOnConsole('Nothing to process')
    except Exception as ex:
        printOnConsole('There was an error...')
        print(ex)
# Variant of processRdd that tags records for both tables.
def processRdd(rdd):
    try:
        print('processRDD')
        # convert the rdd to a dataframe
        printOnConsole('Started Processing the streams')
        #desiredCol = ['c-ip','cs-uri-stem','c-user-agent','customer-id','x-ec_custom-1']
        if rdd.count() > 0:
            df = pycsv.csvToDataFrame(sqlContext, rdd, columns=COLUMNS, colTypes=COLUMN_TYPES)
            #df = df.select(desiredCol)

            #startTime
            endTime = getCurrentTimeStamp()
            startTime = endTime - SPARK_STREAM_BATCH
            endTime = getDateTimeFormat(endTime)
            startTime = getDateTimeFormat(startTime)
            df = df.withColumn(COL_STARTTIME, lit(startTime))

            #endTime
            df = df.withColumn(COL_ENDTIME, lit(endTime))

            df.registerTempTable("tempTable")
            query = ('select' +
                     ' startTime,' +                                            #startTime
                     ' endTime,' +                                              #endTime
                     ' \'\' as ' + COL_CUSTOMERID + ',' +                       #customerid
                     ' setProjectId(`projectid`) as ' + COL_PROJECTID + ',' +   #projectid
                     ' \'\' as ' + COL_FONTTYPE + ',' +                         #FontType
                     ' \'\' as ' + COL_FONTID + ',' +                           #FontId
                     ' getDomainName(`referrer`) as ' + COL_DOMAINNAME + ',' +  #DomainName
                     ' getBrowser(`useragent`) as ' + COL_USERAGENT + ',' +     #UserAgent
                     ' setIpaddress(`ip`) as ' + COL_IPADDRESS +                #customer ipaddress
                     ' from tempTable')
            df = sqlContext.sql(query)

            record_type = PAGEVIEW_TYPE | PAGEVIEWGEO_TYPE
            processForTable(df, record_type)
        else:
            printOnConsole('Nothing to process')
    except Exception as ex:
        printOnConsole('There was an error...')
        print(ex)
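# Wiring sketch (assumed, not from the original source): processRdd is the kind
# of callback attached to a Spark Streaming DStream via foreachRDD. STREAM_DIR
# is a hypothetical input directory; SPARK_STREAM_BATCH is the batch interval
# already used above.
from pyspark.streaming import StreamingContext

ssc = StreamingContext(sc, SPARK_STREAM_BATCH)
stream = ssc.textFileStream(STREAM_DIR)
stream.foreachRDD(processRdd)
ssc.start()
ssc.awaitTermination()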
def main():
    global data
    global week
    p = optparse.OptionParser()
    # take inputs
    # take training data set
    p.add_option('--train_dataset', '-i', default='/afs/cern.ch/user/s/sganju/private/2013.csv')
    # specify target column
    p.add_option('--target', '-y', default="target")
    # add different algos (e.g. random forest)
    p.add_option('--algo', '-a', default="sgd")
    # parse inputs
    options, arguments = p.parse_args()
    a = options.algo
    # hdfs path is the new path
    path = sys.argv[1]  # e.g. "hdfs://samrouch-mesos-01:54310/user/root/test/"
    # all of the 2014 data files; use glob to expand this list as needed
    filenames_2014 = [path + 'dataframe-20140101-20140107.csv']
    #data = sc.textFile(path + "2013.csv")
    #data = sc.textFile(path + "2013.csv").map(lambda line: line.split(",")).filter(lambda line: len(line) > 1).map(lambda line: (line[0], line[1])).collect()
    # read csv file using pycsv
    plaintext_rdd = sc.textFile(path + '2013.csv')
    from pyspark.sql import SQLContext
    sqlCtx = SQLContext(sc)
    data = pycsv.csvToDataFrame(sqlCtx, plaintext_rdd)
    for each_file in filenames_2014:
        week = sc.textFile(each_file)
        print(' ==== ', each_file)
        transform_csv()
        algo(a)
        merge_csv()
        break
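# Entry-point sketch (assumed; the original snippet does not show how main()
# is invoked):
if __name__ == '__main__':
    main()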
def permSave(csv_location, table_name, sc, sqlContext):
    csv_location = downloadHTTP(csv_location)
    rdd = sc.textFile(csv_location)
    df = pycsv.csvToDataFrame(sqlContext, rdd)
    df.write.saveAsTable(table_name)
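# Usage sketch: persist a remote CSV as a permanent table, assuming sc and
# sqlContext exist and downloadHTTP fetches the URL to a local path; the URL
# and table name below are hypothetical.
permSave('http://example.com/data.csv', 'my_table', sc, sqlContext)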
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

conf = SparkConf().setMaster('local').setAppName('InternationalStudentsByCountry')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

import pyspark_csv as pycsv
sc.addPyFile('pyspark_csv.py')

def extract_row(row):
    country = row[2]
    year = row[13]
    internationalStudents = 0
    if (row[9] != None and row[11] != None):
        numStudents = float(str(row[9]).replace(',', ''))
        internationalPercentage = float(str(row[11])[:-1])
        internationalStudents = int(numStudents * internationalPercentage / 100.0)
    return ((year, country), internationalStudents)

plaintext_rdd = sc.textFile('file:///Users/Wik/Documents/Kuliah/BigData/Tugas-2/WorldRankUniversity-Mining/data/timesData.csv')
rdd = pycsv.csvToDataFrame(sqlContext, plaintext_rdd).rdd
mapped = rdd.map(extract_row)
reduced = mapped.reduceByKey(lambda a, b: a + b)
sorted_rdd = reduced.sortByKey()  # renamed to avoid shadowing the built-in sorted()
result = sorted_rdd.collect()
for item in result:
    print(str(item[0][0]) + ' - ' + str(item[0][1]) + ': ' + str(item[1]))
# Download pyspark_csv.py from https://github.com/seahboonsiew/pyspark-csv
sys.path.append('/Users/eyalbenivri/Developer/libs/pyspark_libs')  # replace as necessary
import pyspark_csv
sc.addFile('/Users/eyalbenivri/Developer/libs/pyspark_libs/pyspark_csv.py')  # ditto
sqlContext = SQLContext(sc)

# Task 1: load the prop-prices.csv file as an RDD, and use the csvToDataFrame function
# from the pyspark_csv module to create a DataFrame and register it as a temporary
# table so that you can run SQL queries:
print("------- ******* Task 1 ******* -------")
columns = ['id', 'price', 'date', 'zip', 'type', 'new', 'duration', 'PAON', 'SAON',
           'street', 'locality', 'town', 'district', 'county', 'ppd', 'status']
rdd = sc.textFile(datadir + "prop-prices.csv")
df = pyspark_csv.csvToDataFrame(sqlContext, rdd, columns=columns)
df.registerTempTable("properties")
df.persist()

# Task 2: let's do some basic analysis on the data.
# Find how many records we have per year, and print them out sorted by year.
print("------- ******* Task 2 ******* -------")
year_count = sqlContext.sql(
    """select year(date) as year, count(*) as count
       from properties
       group by year(date)
       order by year(date)""").collect()
print(year_count)

# Task 3: Everyone knows that properties in London are expensive.
# Find the average property price by county.
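# A possible solution sketch for Task 3 (not from the original source; the
# exact expected output format is an assumption):
print("------- ******* Task 3 ******* -------")
county_prices = sqlContext.sql(
    """select county, avg(price) as avg_price
       from properties
       group by county
       order by avg_price desc""").collect()
print(county_prices)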
dataLines.take(5)

# RDD to dense vector
vectorsUSD = dataLines.map(transformationDT.transformToNumeric)
vectorsUSD.take(5)

# Perform statistical analysis
statsUSD = Statistics.colStats(vectorsUSD)
statsUSD.mean()
statsUSD.variance()
statsUSD.min()
statsUSD.max()
Statistics.corr(vectorsUSD)

# SPARK SQL
dataframe = pycsv.csvToDataFrame(sqlContext, rddUSD, sep=",")
dataframe.registerTempTable("dataUSDuprv")
dff1 = sqlContext.sql("SELECT closeJPY FROM dataUSDuprv").show()
dataframe.show()

# LabeledPoint
lpUSD = vectorsUSD.map(transformationDT.transformToLabeledPoint)
lpUSD.take(5)
dfUSD = sqlContext.createDataFrame(lpUSD, ["label", "features"])
dfUSD.select("label", "features").show(10)

# String indexer
stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
si_model = stringIndexer.fit(dfUSD)
td = si_model.transform(dfUSD)
td.collect()
import pyspark_csv as pycsv
from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext()
sqlCtx = SQLContext(sc)
sc.addPyFile('pyspark_csv.py')

plaintext_rdd = sc.textFile('accidents.csv')
df = pycsv.csvToDataFrame(sqlCtx, plaintext_rdd)
df.registerTempTable("accidents")
new_df = sqlCtx.sql("SELECT id, latitude, longitude, datetime_of_accident, visibility, "
                    "precipitation, conditions, weather_id, "
                    "date_format(datetime_of_accident, 'EEEE') AS day FROM accidents")
mysql_url = "jdbc:mysql://localhost?user=root"
new_df.groupBy("latitude", "longitude", "conditions", "day").count().write.jdbc(
    url=mysql_url, table="accident_prediction.aggregated_data", mode="append")
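# Verification sketch (an assumption, not in the original): read the aggregated
# table back from MySQL to confirm the append succeeded.
check_df = sqlCtx.read.jdbc(url=mysql_url, table="accident_prediction.aggregated_data")
check_df.show(10)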
def skip_header(idx, iterator):
    if (idx == 0):
        next(iterator)
    return iterator

license_header = license.first()
license_header_list = license_header.split(",")
license_body = license.mapPartitionsWithIndex(skip_header)

# filter out invalid rows
license_body = license_body.filter(lambda line: len(line.split(",")) > 24)

# create Spark DataFrame using pyspark-csv
license_df = pycsv.csvToDataFrame(sqlContext, license_body, sep=",", columns=license_header_list)
#license_df.cache()

# In[44]:
license_df.printSchema()

# In[45]:
license_df.take(1)

# In[46]:
license_df.count()
    return encoded_cpt

sc = SparkContext()
sc.addPyFile('pyspark_csv.py')

all_patients = sc.textFile("data/PATIENTS.csv").cache()
icu_visits = sc.textFile("data/ICUSTAYS.csv").cache()
cpt_events = sc.textFile("data/CPTEVENTS.csv").cache()
note_events = sc.textFile("data/NOTEEVENTS.csv").cache()
lda_events = sc.textFile("data/LDAEVENTS.csv").cache()
lab_events = sc.textFile("data/LABEVENTS.csv").cache()

sqlCtx = SQLContext(sc)  # separate name instead of rebinding (shadowing) sc
all_patients = pycsv.csvToDataFrame(sqlCtx, all_patients, parseDate=False).rdd
icu_visits = pycsv.csvToDataFrame(sqlCtx, icu_visits, parseDate=False).rdd
cpt_events = pycsv.csvToDataFrame(sqlCtx, cpt_events, parseDate=False).rdd
note_events = pycsv.csvToDataFrame(sqlCtx, note_events, parseDate=False).rdd
lda_events = pycsv.csvToDataFrame(sqlCtx, lda_events, parseDate=False).rdd
lab_events = pycsv.csvToDataFrame(sqlCtx, lab_events, parseDate=False).rdd

icu_patients = all_patients.keyBy(lambda p: p.SUBJECT_ID).join(
    icu_visits.keyBy(lambda v: v.SUBJECT_ID))
children_data = icu_patients.filter(
    lambda ip: is_infant(ip[1][0].DOB, ip[1][1].INTIME))
children_ids = children_data.map(lambda c: c[0]).collect()
visit_ids = children_data.map(lambda c: c[1][1].ICUSTAY_ID)

# AGE AND GENDER FEATURES
visits_age_gender = children_data.map(lambda c: (
def anom_with_lr():
    try:
        plaintext_rdd = sc.textFile("file:///Users/blahiri/healthcare/data/cloudera_challenge/pat_proc_larger.csv")  # 69.2 MB
        pat_proc = pycsv.csvToDataFrame(sqlContext, plaintext_rdd, sep=",")
        anom = pat_proc.filter(pat_proc.is_anomalous == 1)
        benign = pat_proc.filter(pat_proc.is_anomalous == 0)
        n_benign = benign.count()

        # Take a random sample of 50K from the unlabeled 100K
        # (the UDF returns a uniform random number regardless of its input)
        sqlContext.registerFunction("my_random", lambda x: x - x + random())
        sqlContext.registerDataFrameAsTable(benign, "benign")
        benign = sqlContext.sql("SELECT *, my_random(is_anomalous) as random_number FROM benign")
        threshold = 50000.0 / n_benign  # float division; 50000/n_benign is 0 in Python 2
        into_model = benign.filter(benign.random_number <= threshold)
        for_finding_more = benign.filter(benign.random_number > threshold)
        for_modeling = anom.unionAll(into_model.drop(into_model.random_number))
        for_finding_more = for_finding_more.drop(for_finding_more.random_number)
        # Try to pull this from a much larger sample, or the entire data, because the ones
        # with the lowest probabilities, among the selected 10,000, have probabilities around 0.05

        print("anom.count() = " + str(anom.count()) + ", benign.count() = " + str(benign.count()) +
              ", into_model.count() = " + str(into_model.count()) + ", for_modeling.count() = " +
              str(for_modeling.count()) + ", for_finding_more.count() = " + str(for_finding_more.count()))

        all_columns = for_modeling.columns
        features = [x for x in all_columns if (x not in ["patient_id", "is_anomalous"])]
        # We list these 3 as categorical features only, as the procedure features have 0-1 values anyway
        categorical_features = ["age_group", "gender", "income_range"]
        procedure_features = [x for x in features if (x not in categorical_features)]

        # Unlike decision tree, logistic regression does not need the map categoricalFeaturesInfo,
        # just an RDD of LabeledPoint objects.
        # Create a dictionary where the key-value pairs are as follows: key is the name of the
        # categorical feature, and value is a list with the following entries:
        # 1) an id of the feature that is incremented sequentially, 2) no. of distinct values of
        # the feature, 3) a list of the distinct values of the feature.
        cat_feature_number = 0
        dict_cat_features = {}

        for feature in categorical_features:
            agvalues = pat_proc.select(pat_proc[feature].cast("string").alias("feature")).distinct().collect()
            distinct_values = map(lambda row: row.asDict().values()[0], agvalues)
            distinct_values = sorted(map(lambda unicode_val: unicode_val.encode('ascii', 'ignore'), distinct_values))
            dict_cat_features[feature] = [cat_feature_number, len(distinct_values), distinct_values]
            cat_feature_number += 1

        for_modeling = for_modeling.rdd
        print("for_modeling.getNumPartitions() = " + str(for_modeling.getNumPartitions()))  # 4 partitions: the default should be the number of logical cores, which is 8

        (train, test) = for_modeling.randomSplit([0.5, 0.5])
        test_data_size = test.count()
        print("train.count() = " + str(train.count()) + ", test.count() = " + str(test_data_size))

        training_data = train.map(lambda x: create_labeled_point(x, features, categorical_features, dict_cat_features, procedure_features))
        print("training_data.count() = " + str(training_data.count()))

        t0 = time()
        #model = LogisticRegressionWithLBFGS.train(training_data)  # LBFGS took 66.766 seconds
        model = LogisticRegressionWithSGD.train(training_data)  # SGD took 69.261 seconds
        tt = time() - t0
        print("Classifier trained in {} seconds".format(round(tt, 3)))

        test_data = test.map(lambda x: create_labeled_point(x, features, categorical_features, dict_cat_features, procedure_features))

        t0 = time()
        predictions = model.predict(test_data.map(lambda p: p.features))
        tt = time() - t0
        print("Prediction made in {} seconds".format(round(tt, 3)))  # Reports as 0.0 seconds

        labelsAndPreds = test_data.map(lambda p: (p.label, model.predict(p.features)))
        test_accuracy = labelsAndPreds.filter(lambda (v, p): v == p).count() / float(test_data_size)
        # float() guards against Python 2 integer division returning 0
        fpr = labelsAndPreds.filter(lambda (v, p): (v == 0 and p == 1)).count() / float(labelsAndPreds.filter(lambda (v, p): v == 0).count())
        fnr = labelsAndPreds.filter(lambda (v, p): (v == 1 and p == 0)).count() / float(labelsAndPreds.filter(lambda (v, p): v == 1).count())
        print("Test accuracy is {}, fpr is {}, fnr is {}".format(round(test_accuracy, 4), round(fpr, 4), round(fnr, 4)))
        # Test accuracy is 0.9057, fpr is 0.1634, fnr is 0.0282

        model.clearThreshold()
        for_finding_more = for_finding_more.map(lambda x: create_labeled_point(x, features, categorical_features, dict_cat_features, procedure_features))
        for_finding_more = for_finding_more.map(lambda p: (p.features, model.predict(p.features), p.label))

        try:
            # We perform an action here because otherwise the output will be a PipelinedRDD.
            for_finding_more.first()
            # Reverse-sort the additional patients by their predicted probabilities of being
            # anomalous and take the top 10,000
            #for_finding_more.take(5)
        except EOFError:
            print("EOF handled")

        df = sqlContext.createDataFrame(for_finding_more.collect(), ['features', 'predicted_prob', 'is_anom'])
        # The orderBy is not actually executed until an action such as collect() or take() is
        # called; here it is triggered by the write in the next statement.
        df = df.orderBy(df.predicted_prob.desc())
        df.select('is_anom', 'predicted_prob').limit(10000).write.format('com.databricks.spark.csv').save('file:///Users/blahiri/healthcare/data/cloudera_challenge/additional_10000_from_spark.csv')
        # Top one has probability of 0.86818, last one has probability 0.5928958
    except Exception:
        print("Exception in user code:")
        traceback.print_exc(file=sys.stdout)
    return for_finding_more
def loadData(sc, sqlContext, path):
    plain = sc.textFile(path)
    df = pycsv.csvToDataFrame(sqlContext, plain, sep=',')
    return df
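# Usage sketch: assumes an active SparkContext/SQLContext; the path below is
# hypothetical.
df = loadData(sc, sqlContext, 'data/input.csv')
df.show(5)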
    return (r[0], (r[1], r[2]))

def merge_labels_name(t):
    a = []
    labels = t[1][1].split(" ")
    if len(labels) > 0:
        tokens = nltk.wordpunct_tokenize(t[1][0][0])
        for i, tok in enumerate(tokens):
            a.append((tok, labels[i]))
    return (t[0], a)

gbif = sc.textFile("backbone/taxon.txt")
gb_names = gbif.map(split_gbif).flatMap(get_gbif_word_type).union(named_tokens).aggregateByKey(
    [], lambda a, s: list(set(a + [s])), lambda a1, a2: list(set(a1 + a2)))

idigbio = sc.textFile("uniquenames.csv")
idigbio_names = pycsv.csvToDataFrame(sqlCtx, idigbio)
idigbio_names.cache()
idigbio_words = idigbio_names.flatMap(get_idigbio_words_extended)
idigbio_tokens = idigbio_words.leftOuterJoin(gb_names)
# idigbio_tokens.cache()
# idigbio_tokens.saveAsTextFile("sn_parsed_intermediate")

idigbio_labeled = idigbio_tokens.map(un_tuple).aggregateByKey(
    [], lambda a, s: a + [s], lambda a1, a2: a1 + a2).map(parts_in_order)
idigbio_labeled.cache()
idigbio_labeled.saveAsTextFile("sn_parsed_format_by_key")
idigbio_labeled.map(prep_count).reduceByKey(count).saveAsTextFile("sn_parsed")

idigbio_labeled_rev = idigbio_labeled.map(reverse_labels)
idigbio_parsed = idigbio_names.map(names_as_tuples).join(idigbio_labeled_rev).map(merge_labels_name)
idigbio_parsed.saveAsTextFile("uniquenames_parsed")
num_cols = len(first_rec)

from pandas import read_csv
from numpy import float64 as np_float64

X = read_csv(s_input_path + 'clicks_X_valid_4-1_para_spark.csv', dtype=np_float64, header=None)
X2 = read_csv(s_input_path + 'clicks_X_valid_4-2_para_spark.csv', dtype=np_float64, header=None)
y = read_csv(s_input_path + 'clicks_y_valid_4-1_para_spark.csv', dtype=np_float64, header=None)
y2 = read_csv(s_input_path + 'clicks_y_valid_4-2_para_spark.csv', dtype=np_float64, header=None)

from numpy import concatenate as np_concat

# Concatenate several files into one (leer_y_reshape)
X = np_concat((X, X2), axis=0)
y = np_concat((y, y2), axis=0)
X.shape, y.shape
num_cols = X.shape[1]

# NOTE: careful, this gets sorted (by the first column...)
txt_rdd = sc.textFile(s_spark_inputpath + 'clicks_X_valid_4.csv')  # assumed: the original omits this line, but dfX needs an X RDD analogous to the y one below
dfX = pycsv.csvToDataFrame(sqlCtx, txt_rdd, columns=['Col_' + str(i) for i in range(0, num_cols)])

txt_rdd = sc.textFile(s_spark_inputpath + 'clicks_y_valid_4.csv')
# NOTE: careful, this gets sorted (by the first column...)
dfy = pycsv.csvToDataFrame(sqlCtx, txt_rdd, columns=['Clicked'])

dfX.select(['Col_' + str(i) for i in range(0, 4)]).show(10)
dfy.select('Clicked').show(10)

# Now these DataFrames have to be combined into one, as [rdd = to_simple_rdd(sc, X_train, y_train)] does. PENDING *****
from elephas.utils.rdd_utils import to_simple_rdd
rdd = to_simple_rdd(sc, X_train, Y_train)  # [?]

sc.statusTracker().getActiveJobsIds()
sc.statusTracker().getActiveStageIds()
import pyspark_csv as pycsv

expenditures = sc.textFile("swift://notebooks." + credentials_1['name'] + "/health_expenditures.csv")

def skip_header(idx, iterator):
    if (idx == 0):
        next(iterator)
    return iterator

expenditures_header = expenditures.first()
expenditures_header_list = expenditures_header.split(",")
expenditures_body = expenditures.mapPartitionsWithIndex(skip_header)

# create Spark DataFrame using pyspark-csv
expenditures_df = pycsv.csvToDataFrame(sqlContext, expenditures_body, sep=",", columns=expenditures_header_list)

# In[6]:

life_expectancy = sc.textFile("swift://notebooks." + credentials_1['name'] + "/life_expectancy_at_birth.csv")
life_expectancy_header = life_expectancy.first()
life_expectancy_header_list = life_expectancy_header.split(",")
life_expectancy_body = life_expectancy.mapPartitionsWithIndex(skip_header)

# create Spark DataFrame using pyspark-csv
life_expectancy_df = pycsv.csvToDataFrame(sqlContext, life_expectancy_body, sep=",", columns=life_expectancy_header_list)
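# Sketch (an assumption, not in the original): with both DataFrames registered
# as temp tables they can be joined with SQL; the join column name "Country"
# is hypothetical and depends on the actual CSV headers.
expenditures_df.registerTempTable("expenditures")
life_expectancy_df.registerTempTable("life_expectancy")
joined = sqlContext.sql("SELECT e.*, l.* FROM expenditures e "
                        "JOIN life_expectancy l ON e.Country = l.Country")
joined.show(5)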
# Using PySpark_CSV, read in Titanic data and build classification models for it.
#
# #### (5a) Logistic Regression

# In[67]:

from pyspark.sql import SQLContext, Row
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithSGD

sqlCtx = SQLContext(sc)
fileName = os.path.join(baseDir, 'titanic3.csv')
plaintext_rdd = sc.textFile(fileName)
titanicRawRDD = pycsv.csvToDataFrame(sqlCtx, plaintext_rdd).rdd

# remove blank rows
titanicRDD = titanicRawRDD.filter(lambda r: (r[2] != None))

# pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home_dest
def parseRow(r):
    pclass = r[0]
    sex = 0 if r[3] == 'female' else 1
    age = r[4] if r[4] != None else -1  # flag invalid ages for filtering
    sibsp = r[5]
    parch = r[6]
    fare = r[8] if r[8] != None else -1  # flag missing fares for filtering
    try:
        lp = LabeledPoint(r[1], [pclass, sex, age, sibsp, parch, fare])
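    # Assumed completion (the original snippet is truncated inside the try
    # above): fall back to None when a row cannot be converted.
    except Exception:
        return None
    return lp

# Continuation sketch (assumed, not from the original): parse rows, drop rows
# that failed or carry the -1 sentinel values, and train the classifier.
titanicLP = titanicRDD.map(parseRow).filter(lambda lp: lp is not None)
titanicLP = titanicLP.filter(lambda lp: lp.features[2] != -1 and lp.features[5] != -1)
lrModel = LogisticRegressionWithSGD.train(titanicLP, iterations=100)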
def train_validate_test_rpart():
    try:
        plaintext_rdd = sc.textFile("file:///Users/blahiri/healthcare/data/cloudera_challenge/pat_proc_larger.csv")  # 69.2 MB
        pat_proc = pycsv.csvToDataFrame(sqlContext, plaintext_rdd, sep=",")
        anom = pat_proc.filter(pat_proc.is_anomalous == 1)
        benign = pat_proc.filter(pat_proc.is_anomalous == 0)
        n_benign = benign.count()
        print("anom.count() = " + str(anom.count()) + ", benign.count() = " + str(benign.count()))  # anom.count() = 49542, benign.count() = 197406

        sample_from_benign = benign.sample(False, 50000.0 / n_benign)  # float division; 50000/n_benign is 0 in Python 2
        pat_proc = anom.unionAll(sample_from_benign)
        print("pat_proc.count() = " + str(pat_proc.count()))  # 99,227

        all_columns = pat_proc.columns
        features = [x for x in all_columns if (x not in ["patient_id", "is_anomalous"])]
        # We list these 3 as categorical features only, as the procedure features have 0-1 values anyway
        categorical_features = ["age_group", "gender", "income_range"]
        procedure_features = [x for x in features if (x not in categorical_features)]

        # Construct the map categoricalFeaturesInfo, which specifies which features are categorical
        # and how many categorical values each of those features can take.
        # Create a dictionary where the key-value pairs are as follows: key is the name of the
        # categorical feature, and value is a list with the following entries:
        # 1) an id of the feature that is incremented sequentially, 2) no. of distinct values of
        # the feature, 3) a list of the distinct values of the feature.
        cat_feature_number = 0
        dict_cat_features = {}

        for feature in categorical_features:
            # collect() is an action that returns all the elements of the dataset as an array at
            # the driver program. Calls to collect() imply communication between the executors
            # and the driver, so use it with discretion.
            agvalues = pat_proc.select(pat_proc[feature].cast("string").alias("feature")).distinct().collect()
            distinct_values = map(lambda row: row.asDict().values()[0], agvalues)
            distinct_values = sorted(map(lambda unicode_val: unicode_val.encode('ascii', 'ignore'), distinct_values))
            dict_cat_features[feature] = [cat_feature_number, len(distinct_values), distinct_values]
            cat_feature_number += 1

        pat_proc = pat_proc.rdd
        print("pat_proc.getNumPartitions() = " + str(pat_proc.getNumPartitions()))  # 4 partitions: the default should be the number of logical cores, which is 8

        (train, test) = pat_proc.randomSplit([0.5, 0.5])
        test_data_size = test.count()
        print("train.count() = " + str(train.count()) + ", test.count() = " + str(test_data_size))

        training_data = train.map(lambda x: create_labeled_point(x, features, categorical_features, dict_cat_features, procedure_features))
        print("training_data.count() = " + str(training_data.count()))

        # Populate the actual categoricalFeaturesInfo dictionary
        cat_features_info = dict([(value[0], value[1]) for (key, value) in dict_cat_features.iteritems()])
        procedure_features_info = dict([(feature_id, 2) for feature_id in range(3, 2 + len(procedure_features))])
        cat_features_info = dict(cat_features_info.items() + procedure_features_info.items())

        t0 = time()
        # Under the hood in DecisionTree.scala, RandomForest is called with numTrees = 1 and
        # featureSubsetStrategy = "all".
        model = DecisionTree.trainClassifier(training_data, numClasses=2, categoricalFeaturesInfo=cat_features_info,
                                             impurity='gini', maxDepth=2, maxBins=32)
        tt = time() - t0
        print("Classifier trained in {} seconds".format(round(tt, 3)))
        # 63.355 seconds (5.5 times compared to standalone R). Even when maxDepth was reduced
        # from 5 to 2, time to train was 61.942 seconds.
        print(model)

        test_data = test.map(lambda x: create_labeled_point(x, features, categorical_features, dict_cat_features, procedure_features))

        t0 = time()
        predictions = model.predict(test_data.map(lambda p: p.features))
        tt = time() - t0
        print("Prediction made in {} seconds".format(round(tt, 3)))  # 0.014 seconds

        # Create a list of tuples with each tuple having the actual and the predicted label
        labels_and_preds = test_data.map(lambda p: p.label).zip(predictions)
        test_accuracy = labels_and_preds.filter(lambda (v, p): v == p).count() / float(test_data_size)
        # float() guards against Python 2 integer division returning 0
        fpr = labels_and_preds.filter(lambda (v, p): (v == 0 and p == 1)).count() / float(labels_and_preds.filter(lambda (v, p): v == 0).count())
        fnr = labels_and_preds.filter(lambda (v, p): (v == 1 and p == 0)).count() / float(labels_and_preds.filter(lambda (v, p): v == 1).count())
        print("Test accuracy is {}, fpr is {}, fnr is {}".format(round(test_accuracy, 4), round(fpr, 4), round(fnr, 4)))
        # With maxDepth = 5, test accuracy is 0.9084, fpr is 0.1555, fnr is 0.0272.
        # With maxDepth = 2, test accuracy is 0.861, fpr is 0.2591, fnr is 0.018.

        print(model.toDebugString())
    except Exception:
        print("Exception in user code:")
        traceback.print_exc(file=sys.stdout)
    return model
def tempSave(csv_location, table_name, sc, sqlContext):
    csv_location = downloadHTTP(csv_location)
    rdd = sc.textFile(csv_location)
    df = pycsv.csvToDataFrame(sqlContext, rdd)
    df.registerTempTable(table_name)
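# Usage sketch: tempSave mirrors permSave above but only registers a temporary
# table for SQL queries; the URL and table name below are hypothetical.
tempSave('http://example.com/data.csv', 'my_temp_table', sc, sqlContext)
results = sqlContext.sql("SELECT COUNT(*) FROM my_temp_table")
results.show()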