def loadDF(filename):
    """
    Load and parse filename as pyspark.sql.DataFrame
    using pyspark_csv.py
    """
    path = os.path.join(DATADIR, filename)
    plain = sc.textFile(path)
    df = pycsv.csvToDataFrame(sqlCtx, plain, sep=',')
    return df
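
# Hypothetical usage sketch (an addition, not from the original; DATADIR, sc and
# sqlCtx are assumed to be defined elsewhere in the source module):
#   df = loadDF('prices.csv')
#   df.printSchema()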
Example #3
def processRdd(rdd):

    try:
        print 'processRDD'
        #convert the rdd to a dataframe

        printOnConsole('Started Processing the streams')

        #desiredCol = ['c-ip','cs-uri-stem','c-user-agent','customer-id','x-ec_custom-1']
        if rdd.count() > 0:
            df = pycsv.csvToDataFrame(sqlContext,
                                      rdd,
                                      columns=COLUMNS,
                                      colTypes=COLUMN_TYPES)
            #df = df.select(desiredCol)

            #startTime
            endTime = getCurrentTimeStamp()
            startTime = endTime - SPARK_STREAM_BATCH

            endTime = getDateTimeFormat(endTime)
            startTime = getDateTimeFormat(startTime)
            df = df.withColumn(COL_STARTTIME, lit(startTime))

            #endTime
            df = df.withColumn(COL_ENDTIME, lit(endTime))

            df.registerTempTable("tempTable")
            query = (
                'select' + ' startTime,' +  #startTime
                ' endTime,' +  #endTime				
                ' \'\' as ' + COL_CUSTOMERID + ',' +  #customerid				
                ' setProjectId(`projectid`) as ' + COL_PROJECTID +
                ',' +  #projectid					 	
                ' \'\' as ' + COL_FONTTYPE + ',' +  #FontType
                ' \'\' as ' + COL_FONTID + ',' +  #FontId
                ' getDomainName(`referrer`) as ' + COL_DOMAINNAME +
                ',' +  #DomainName
                ' getBrowser(`useragent`) as ' + COL_USERAGENT +
                ',' +  #UserAgent
                ' setIpaddress(`ip`) as ' +
                COL_IPADDRESS +  #customer ipaddress   
                ' from tempTable')

            df = sqlContext.sql(query)

            type = PAGEVIEW_TYPE
            processForTable(df, type)
        else:
            printOnConsole('Nothing to process')

    except Exception, ex:
        printOnConsole('There was an error...')
        print ex
Example #4
def processRdd(rdd):
	
	try:
		print 'processRDD'
		#convert the rdd to a dataframe
		
		printOnConsole('Started Processing the streams')

		#desiredCol = ['c-ip','cs-uri-stem','c-user-agent','customer-id','x-ec_custom-1']
		if rdd.count() > 0:
			df = pycsv.csvToDataFrame(sqlContext, rdd, columns=COLUMNS, colTypes=COLUMN_TYPES)
			#df = df.select(desiredCol)
			
			#startTime
			endTime = getCurrentTimeStamp()
			startTime = endTime - SPARK_STREAM_BATCH
			
			endTime = getDateTimeFormat(endTime)
			startTime = getDateTimeFormat(startTime)
			df = df.withColumn(COL_STARTTIME, lit(startTime))
			
			#endTime
			df = df.withColumn(COL_ENDTIME, lit(endTime))

			df.registerTempTable("tempTable")
			query = ('select' + 
					' startTime,' +                                   #startTime
					' endTime,' +                                     #endTime
					' \'\' as ' + COL_CUSTOMERID + ',' +              #customerid
					' setProjectId(`projectid`) as ' + COL_PROJECTID + ',' +    #projectid
					' \'\' as ' + COL_FONTTYPE + ',' +                #FontType
					' \'\' as ' + COL_FONTID + ',' +                  #FontId
					' getDomainName(`referrer`) as ' + COL_DOMAINNAME + ',' +   #DomainName
					' getBrowser(`useragent`) as ' + COL_USERAGENT + ',' +      #UserAgent
					' setIpaddress(`ip`) as ' + COL_IPADDRESS +       #customer ipaddress
					' from tempTable')

			df = sqlContext.sql(query)
			
			type =  PAGEVIEW_TYPE | PAGEVIEWGEO_TYPE
			processForTable(df, type)
		else:
			printOnConsole('Nothing to process')
	
	except Exception, ex:
		printOnConsole('There was an error...')
		print ex			
Example #5
def main():
    global data        
    global week
    p = optparse.OptionParser()
    #take inputs 
    #take training data set 
    p.add_option('--train_dataset', '-i', default='/afs/cern.ch/user/s/sganju/private/2013.csv')
    #specify target column
    p.add_option('--target', '-y', default="target")
    #add different algos 
    #random forest 
    p.add_option('--algo', '-a',default = "sgd")
    #parse inputs
    #read options
    options, arguments = p.parse_args()
    a = options.algo 
    #hdfs path is the new path 
    path = sys.argv[1]  #hdfs://samrouch-mesos-01:54310/user/root/test/
    #know all of 2014 data files 
    #use glob 
    filename_2014 = [path + 'dataframe-20140101-20140107.csv']
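    # A sketch of the "use glob" idea above (an addition, not in the original):
    # sc.textFile() accepts glob patterns, so all 2014 weekly files could be
    # matched in a single call, e.g. sc.textFile(path + 'dataframe-2014*.csv').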
        
    #data = sc.textFile(path + "2013.csv")
    #data=sc.textFile(path+"2013.csv").map(lambda line: line.split(",")).filter(lambda line: len(line)>1).map(lambda line: (line[0],line[1])).collect()
    #read csv file using pycsv
    plaintext_rdd = sc.textFile(path+'2013.csv')
    from pyspark.sql import SQLContext
    sqlCtx = SQLContext(sc)    
    data = pycsv.csvToDataFrame(sqlCtx, plaintext_rdd)

    for each_file in filename_2014:
        week = sc.textFile(each_file)
        print (' ==== ', each_file )
        transform_csv()
        algo(a)
        merge_csv()
        break
Example #6
def permSave(csv_location, table_name, sc, sqlContext):
    csv_location = downloadHTTP(csv_location)
    rdd = sc.textFile(csv_location)
    df = pycsv.csvToDataFrame(sqlContext, rdd)
    df.write.saveAsTable(table_name)
Example #7
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

conf = SparkConf().setMaster('local').setAppName('InternationalStudentsByCountry')
sc = SparkContext(conf = conf)
sqlContext = SQLContext(sc)

import pyspark_csv as pycsv
sc.addPyFile('pyspark_csv.py')

def extract_row(row):
   country = row[2]
   year = row[13]
   internationalStudents = 0
   if (row[9] != None and row[11] != None):
      numStudents = float(str(row[9]).replace(',',''))
      internationalPercentage = float(str(row[11])[:-1])
      internationalStudents = int(numStudents * internationalPercentage / 100.0)
   return ((year, country), internationalStudents)

plaintext_rdd = sc.textFile('file:///Users/Wik/Documents/Kuliah/BigData/Tugas-2/WorldRankUniversity-Mining/data/timesData.csv')
rdd = pycsv.csvToDataFrame(sqlContext, plaintext_rdd).rdd
mapped = rdd.map(extract_row)
reduced = mapped.reduceByKey(lambda a, b : a + b)
sorted = reduced.sortByKey()
result = sorted.collect()
for item in result:
   print str(item[0][0]) + ' - ' + str(item[0][1]) + ': ' + str(item[1])
   
Example #8
    # Download pyspark_csv.py from https://github.com/seahboonsiew/pyspark-csv
    sys.path.append('/Users/eyalbenivri/Developer/libs/pyspark_libs')  # replace as necessary
    import pyspark_csv

    sc.addFile('/Users/eyalbenivri/Developer/libs/pyspark_libs/pyspark_csv.py')  # ditto
    sqlContext = SQLContext(sc)

    # Task 1: load the prop-prices.csv file as an RDD, and use the csvToDataFrame function from the pyspark_csv module
    # to create a DataFrame and register it as a temporary table so that you can run SQL queries:
    print("------- ******* Task 1 ******* -------")
    columns = ['id', 'price', 'date', 'zip', 'type', 'new', 'duration', 'PAON',
               'SAON', 'street', 'locality', 'town', 'district', 'county', 'ppd',
               'status']

    rdd = sc.textFile(datadir + "prop-prices.csv")
    df = pyspark_csv.csvToDataFrame(sqlContext, rdd, columns=columns)
    df.registerTempTable("properties")
    df.persist()

    # Task 2: let's do some basic analysis on the data.
    # Find how many records we have per year, and print them out sorted by year.
    print("------- ******* Task 2 ******* -------")
    year_count = sqlContext.sql(
        """select   year(date) as year, count(*) as count
        from     properties
        group by year(date)
        order by year(date)""").collect()
    print(year_count)

    # Task 3: Everyone knows that properties in London are expensive.
    # Find the average property price by county,
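    # (A minimal sketch of one possible answer; this query is an assumption,
    # not part of the original notebook. It reuses the "properties" temp table
    # registered in Task 1.)
    print("------- ******* Task 3 ******* -------")
    county_avg = sqlContext.sql(
        """select   county, avg(price) as avg_price
        from     properties
        group by county
        order by avg_price desc""").collect()
    print(county_avg)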
Example #9
dataLines.take(5)

#RDD to Dense vector
vectorsUSD = dataLines.map(transformationDT.transformToNumeric)
vectorsUSD.take(5)

#Perform statistical Analysis
statsUSD = Statistics.colStats(vectorsUSD)
statsUSD.mean()
statsUSD.variance()
statsUSD.min()
statsUSD.max()
Statistics.corr(vectorsUSD)

#SPARK SQL
dataframe = pycsv.csvToDataFrame(sqlContext, rddUSD, sep=",")
dataframe.registerTempTable("dataUSDuprv")
dff1 = sqlContext.sql("SELECT closeJPY FROM dataUSDuprv").show()
dataframe.show()

#LabeledPoint
lpUSD = vectorsUSD.map(transformationDT.transformToLabeledPoint)
lpUSD.take(5)
dfUSD = sqlContext.createDataFrame(lpUSD, ["label", "features"])
dfUSD.select("label", "features").show(10)

#String Indexer
stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
si_model = stringIndexer.fit(dfUSD)
td = si_model.transform(dfUSD)
td.collect()
Example #10
import pyspark_csv as pycsv
from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext()
sqlCtx = SQLContext(sc)
sc.addPyFile('pyspark_csv.py')

plaintext_rdd = sc.textFile('accidents.csv')
df = pycsv.csvToDataFrame(sqlCtx, plaintext_rdd)
df.registerTempTable("accidents")

new_df = sqlCtx.sql("SELECT id, latitude, longitude, datetime_of_accident, visibility, precipitation, conditions, weather_id, date_format(datetime_of_accident, 'EEEE') AS day FROM accidents")


mysql_url="jdbc:mysql://localhost?user=root"
new_df.groupBy("latitude","longitude","conditions", "day").count().write.jdbc(url=mysql_url, table="accident_prediction.aggregated_data", mode="append")
Example #11
def skip_header(idx, iterator):
    if (idx == 0):
        next(iterator)
    return iterator


license_header = collisions.first()

license_header_list = license_header.split(",")
license_body = license.mapPartitionsWithIndex(skip_header)

# filter not valid rows
license_body = license_body.filter(lambda line: len(line.split(",")) > 24)

# create Spark DataFrame using pyspark-csv
license_df = pycsv.csvToDataFrame(sqlContext,
                                  license_body,
                                  sep=",",
                                  columns=license_header_list)
#license_df.cache()

# In[44]:

license_df.printSchema()

# In[45]:

license_df.take(1)

# In[46]:

license_df.count()
Example #12
    return encoded_cpt


sc = SparkContext()
sc.addPyFile('pyspark_csv.py')

all_patients = sc.textFile("data/PATIENTS.csv").cache()
icu_visits = sc.textFile("data/ICUSTAYS.csv").cache()
cpt_events = sc.textFile("data/CPTEVENTS.csv").cache()
note_events = sc.textFile("data/NOTEEVENTS.csv").cache()
lda_events = sc.textFile("data/LDAEVENTS.csv").cache()
lab_events = sc.textFile("data/LABEVENTS.csv").cache()

sc = SQLContext(sc)

all_patients = pycsv.csvToDataFrame(sc, all_patients, parseDate=False).rdd
icu_visits = pycsv.csvToDataFrame(sc, icu_visits, parseDate=False).rdd
cpt_events = pycsv.csvToDataFrame(sc, cpt_events, parseDate=False).rdd
note_events = pycsv.csvToDataFrame(sc, note_events, parseDate=False).rdd
lda_events = pycsv.csvToDataFrame(sc, lda_events, parseDate=False).rdd
lab_events = pycsv.csvToDataFrame(sc, lab_events, parseDate=False).rdd

icu_patients = all_patients.keyBy(lambda p: p.SUBJECT_ID).join(
    icu_visits.keyBy(lambda v: v.SUBJECT_ID))
children_data = icu_patients.filter(
    lambda ip: is_infant(ip[1][0].DOB, ip[1][1].INTIME))
children_ids = children_data.map(lambda c: c[0]).collect()
visit_ids = children_data.map(lambda c: c[1][1].ICUSTAY_ID)

# AGE AND GENDER FEATURES
visits_age_gender = children_data.map(lambda c: (
Example #13
def anom_with_lr():
  try:
    plaintext_rdd = sc.textFile("file:///Users/blahiri/healthcare/data/cloudera_challenge/pat_proc_larger.csv") #69.2 MB
    pat_proc = pycsv.csvToDataFrame(sqlContext, plaintext_rdd, sep = ",")
    anom = pat_proc.filter(pat_proc.is_anomalous == 1)
    benign = pat_proc.filter(pat_proc.is_anomalous == 0)
    n_benign = benign.count()
    
    #Take a random sample of 50K from the unlabeled 100K
    sqlContext.registerFunction("my_random", lambda x: x - x + random())
    sqlContext.registerDataFrameAsTable(benign, "benign")
    benign = sqlContext.sql("SELECT *, my_random(is_anomalous) as random_number FROM benign")
    
    threshold = 50000/n_benign
    into_model = benign.filter(benign.random_number <= threshold)
    for_finding_more = benign.filter(benign.random_number > threshold)
    
    for_modeling = anom.unionAll(into_model.drop(into_model.random_number))
    for_finding_more = for_finding_more.drop(for_finding_more.random_number)
    #Try to pull this from a much larger sample, or, the entire data, because the ones with lowest probabilities, among
    #the selected 10,000, have probabilities around 0.05
    
    print("anom.count() = " + str(anom.count()) + ", benign.count() = " + str(benign.count()) + ", into_model.count() = " + str(into_model.count()) 
            + ", for_modeling.count() = " + str(for_modeling.count()) + ", for_finding_more.count() = " + str(for_finding_more.count()))
    
    all_columns = for_modeling.columns
    features = [x for x in all_columns if (x not in ["patient_id", "is_anomalous"])]
    categorical_features = ["age_group", "gender", "income_range"] #We are listing these 3 as categorical features only as the procedure features have 0-1 values anyway 
    procedure_features = [x for x in features if (x not in categorical_features)]

    #Unlike decision tree, logistic regression does not need the map categoricalFeaturesInfo, just an RDD of LabeledPoint objects.
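    #For illustration (an aside, not from the original source): an MLlib
    #LabeledPoint is simply a label plus a feature vector, e.g.
    #   LabeledPoint(1.0, [2.0, 0.0, 1.0, 0.0])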
    
    #Create a dictionary where the key-value pairs are as follows: key is the name of the categorical feature, and value is a list with the following entries:
    #1) an id of the feature that is incremented sequentially, 2) no. of distinct values of the feature, 3) a list of the distinct values of the feature.
    cat_feature_number = 0
    dict_cat_features = {}
    
    for feature in categorical_features:
       agvalues = pat_proc.select(pat_proc[feature].cast("string").alias("feature")).distinct().collect()
       distinct_values = map(lambda row: row.asDict().values()[0], agvalues)
       distinct_values = sorted(map(lambda unicode_val: unicode_val.encode('ascii','ignore'), distinct_values))
       dict_cat_features[feature] = [cat_feature_number, len(distinct_values), distinct_values]
       cat_feature_number += 1
       
    for_modeling = for_modeling.rdd
    print("for_modeling.getNumPartitions() = " + str(for_modeling.getNumPartitions())) #4 partitions: the default should be the number of logical cores, which is 8
    
    (train, test) = for_modeling.randomSplit([0.5, 0.5])
    test_data_size = test.count()
    print("train.count() = " + str(train.count()) + ", test.count() = " + str(test_data_size))
    training_data = train.map(lambda x: create_labeled_point(x, features, categorical_features, dict_cat_features, procedure_features))
    print("training_data.count() = " + str(training_data.count()))
    
    t0 = time()
    #model = LogisticRegressionWithLBFGS.train(training_data) #LBFGS took 66.766 seconds
    model = LogisticRegressionWithSGD.train(training_data) #SGCD took 69.261 seconds
    tt = time() - t0
    print "Classifier trained in {} seconds".format(round(tt,3)) 
    
    test_data = test.map(lambda x: create_labeled_point(x, features, categorical_features, dict_cat_features, procedure_features))
    
    t0 = time()
    predictions = model.predict(test_data.map(lambda p: p.features))
    tt = time() - t0
    print "Prediction made in {} seconds".format(round(tt,3)) #Reports as 0.0 seconds
    
    labelsAndPreds = test_data.map(lambda p: (p.label, model.predict(p.features)))
    test_accuracy = labelsAndPreds.filter(lambda (v, p): v == p).count()/float(test_data_size)

    fpr = labelsAndPreds.filter(lambda (v, p): (v == 0 and p == 1)).count()/labelsAndPreds.filter(lambda (v, p): v == 0).count() 
    fnr = labelsAndPreds.filter(lambda (v, p): (v == 1 and p == 0)).count()/labelsAndPreds.filter(lambda (v, p): v == 1).count()
    print "Test accuracy is {}, fpr is {}, fnr is {}".format(round(test_accuracy, 4), round(fpr, 4), round(fnr, 4)) #Test accuracy is 0.9057, fpr is 0.1634, fnr is 0.0282
    
    model.clearThreshold()
    for_finding_more = for_finding_more.map(lambda x: create_labeled_point(x, features, categorical_features, dict_cat_features, procedure_features)) #OK
    for_finding_more = for_finding_more.map(lambda p: (p.features, model.predict(p.features), p.label)) #OK
    
    try:
      for_finding_more.first() #We perform an action here because otherwise the output will be a PipelinedRDD.
      #Reverse-sort the additional patients by their predicted probabilities of being anomalous and take the top 10,000
      #for_finding_more.take(5)
    except EOFError:
      print("EOF handled")
      
    df = sqlContext.createDataFrame(for_finding_more.collect(), ['features', 'predicted_prob', 'is_anom'])
    df = df.orderBy(df.predicted_prob.desc()) #The orderBy is not actually called if collect() is not called. Can be also triggered by calling take(). We are triggering it by the writing in the next statement.
    df.select('is_anom', 'predicted_prob').limit(10000).write.format('com.databricks.spark.csv').save('file:///Users/blahiri/healthcare/data/cloudera_challenge/additional_10000_from_spark.csv') #Top one has 
    #probability of 0.86818, last one has probability 0.5928958
    
  except Exception:
    print("Exception in user code:")
    traceback.print_exc(file = sys.stdout)
  return for_finding_more
Example #14
def loadData(sc, sqlContext, path):
    plain = sc.textFile(path)
    df = pycsv.csvToDataFrame(sqlContext, plain, sep=',')
    return df
Example #15
    return (r[0], (r[1], r[2]))

def merge_labels_name(t):
    a = []
    labels = t[1][1].split(" ")
    if len(labels) > 0:
        tokens = nltk.wordpunct_tokenize(t[1][0][0])
        for i, tok in enumerate(tokens):
            a.append((tok, labels[i]))

    return (t[0], a)

gbif = sc.textFile("backbone/taxon.txt")
gb_names = gbif.map(split_gbif).flatMap(get_gbif_word_type).union(named_tokens).aggregateByKey([],lambda a, s: list(set(a + [s])),lambda a1,a2: list(set(a1 + a2)))

idigbio = sc.textFile("uniquenames.csv")
idigbio_names = pycsv.csvToDataFrame(sqlCtx,idigbio)
idigbio_names.cache()
idigbio_words = idigbio_names.flatMap(get_idigbio_words_extended)
idigbio_tokens = idigbio_words.leftOuterJoin(gb_names)
# idigbio_tokens.cache()
# idigbio_tokens.saveAsTextFile("sn_parsed_intermediate")
idigbio_labeled = idigbio_tokens.map(un_tuple).aggregateByKey([],lambda a, s: a + [s], lambda a1, a2: a1 + a2).map(parts_in_order)
idigbio_labeled.cache()

idigbio_labeled.saveAsTextFile("sn_parsed_format_by_key")
idigbio_labeled.map(prep_count).reduceByKey(count).saveAsTextFile("sn_parsed")

idigbio_labeled_rev = idigbio_labeled.map(reverse_labels)
idigbio_parsed = idigbio_names.map(names_as_tuples).join(idigbio_labeled_rev).map(merge_labels_name)
idigbio_parsed.saveAsTextFile("uniquenames_parsed")
Example #16
num_cols = len(first_rec)

from pandas import read_csv
from numpy import float64 as np_float64
X = read_csv(s_input_path + 'clicks_X_valid_4-1_para_spark.csv', dtype=np_float64, header = None)
X2 = read_csv(s_input_path + 'clicks_X_valid_4-2_para_spark.csv', dtype=np_float64, header = None)
y = read_csv(s_input_path + 'clicks_y_valid_4-1_para_spark.csv', dtype=np_float64, header = None)
y2 = read_csv(s_input_path + 'clicks_y_valid_4-2_para_spark.csv', dtype=np_float64, header = None)
from numpy import concatenate as np_concat  # To concatenate several files into one (leer_y_reshape)
X = np_concat((X, X2), axis=0)
y = np_concat((y, y2), axis=0)
X.shape, y.shape
num_cols = X.shape[1]

# NOTE: careful, the result gets sorted (by the first column...)
dfX = pycsv.csvToDataFrame(sqlCtx, txt_rdd, columns=['Col_' + str(i) for i in range(0,num_cols)])

txt_rdd = sc.textFile(s_spark_inputpath + 'clicks_y_valid_4.csv')
# NOTE: careful, the result gets sorted (by the first column...)
dfy = pycsv.csvToDataFrame(sqlCtx, txt_rdd, columns=['Clicked'])

dfX.select(['Col_' + str(i) for i in range(0,4)]).show(10)
dfy.select('Clicked').show(10)
# Now these DataFrames have to be combined into a single one, as [rdd = to_simple_rdd(sc, X_train, y_train)] does
# PENDING *****
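# A possible sketch for the pending step (an assumption, not the original code):
# if both DataFrames come out of pycsv with their rows in the same order and the
# same partitioning, they can be zipped into (features, label) pairs. Note that
# RDD.zip() requires identical partition counts and per-partition sizes, so a
# repartition or an explicit join key may be needed in practice.
rdd_Xy = dfX.rdd.zip(dfy.rdd).map(lambda (x, y): (list(x), y.Clicked))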
from elephas.utils.rdd_utils import to_simple_rdd
rdd = to_simple_rdd(sc, X_train, Y_train)

sc.statusTracker().getActiveJobsIds()
sc.statusTracker().getActiveStageIds()
Example #17
dataLines.take(5)

#RDD to Dense vector
vectorsUSD = dataLines.map(transformationDT.transformToNumeric)
vectorsUSD.take(5)

#Perform statistical Analysis
statsUSD=Statistics.colStats(vectorsUSD)
statsUSD.mean()
statsUSD.variance()
statsUSD.min()
statsUSD.max()
Statistics.corr(vectorsUSD)

#SPARK SQL
dataframe = pycsv.csvToDataFrame(sqlContext, rddUSD, sep=",")
dataframe.registerTempTable("dataUSDuprv")
dff1=sqlContext.sql("SELECT closeJPY FROM dataUSDuprv").show()
dataframe.show()


#LabeledPoint
lpUSD = vectorsUSD.map(transformationDT.transformToLabeledPoint)
lpUSD.take(5)
dfUSD = sqlContext.createDataFrame(lpUSD, ["label", "features"])
dfUSD.select("label", "features").show(10)

#String Indexer
stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
si_model = stringIndexer.fit(dfUSD)
td = si_model.transform(dfUSD)
import pyspark_csv as pycsv

expenditures = sc.textFile("swift://notebooks." + credentials_1['name'] + "/health_expenditures.csv")

def skip_header(idx, iterator):
    if (idx == 0):
        next(iterator)
    return iterator

expenditures_header = expenditures.first()

expenditures_header_list = expenditures_header.split(",")
expenditures_body = expenditures.mapPartitionsWithIndex(skip_header)

# create Spark DataFrame using pyspark-csv
expenditures_df = pycsv.csvToDataFrame(sqlContext, expenditures_body, sep = ",", columns = expenditures_header_list)


# In[6]:

life_expectancy = sc.textFile("swift://notebooks." + credentials_1['name'] + "/life_expectancy_at_birth.csv")

life_expectancy_header = life_expectancy.first()

life_expectancy_header_list = life_expectancy_header.split(",")
life_expectancy_body = life_expectancy.mapPartitionsWithIndex(skip_header)

# create Spark DataFrame using pyspark-csv
life_expectancy_df = pycsv.csvToDataFrame(sqlContext, life_expectancy_body, sep = ",", columns = life_expectancy_header_list)

# Using PySpark_CSV, read in Titanic data and build classification models for it.
# 
# #### (5a) Logistic Regression

# In[67]:

from pyspark.sql import SQLContext, Row
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint


    
sqlCtx = SQLContext(sc)
fileName = os.path.join(baseDir, 'titanic3.csv')
plaintext_rdd = sc.textFile(fileName)
titanicRawRDD = pycsv.csvToDataFrame(sqlCtx, plaintext_rdd).rdd

#remove blank rows
titanicRDD = titanicRawRDD.filter(lambda r : (r[2] != None) )

#pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home_dest

def parseRow(r):
    pclass = r[0]
    sex = 0 if r[3] == 'female' else 1
    age = r[4] if r[4] != None else -1 #flag invalid ages for filtering
    sibsp = r[5]
    parch = r[6]
    fare = r[8] if r[8] != None else -1 #flag missing fares for filtering
    try:
        lp = LabeledPoint(r[1], [pclass, sex, age, sibsp, parch, fare])
        return lp
    except:
        return None  # rows that cannot be turned into a LabeledPoint are dropped later
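
# A minimal sketch (an assumption, not the original continuation) of how the
# parsed rows could feed the (5a) logistic regression: build LabeledPoints,
# drop rows flagged with -1 for age or fare, and train with SGD.
titanicLP = (titanicRDD.map(parseRow)
                       .filter(lambda lp: lp is not None)
                       .filter(lambda lp: lp.features[2] >= 0 and lp.features[5] >= 0))
lrModel = LogisticRegressionWithSGD.train(titanicLP, iterations=100)
print 'sample prediction:', lrModel.predict(titanicLP.first().features)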
def train_validate_test_rpart():
  try:
    plaintext_rdd = sc.textFile("file:///Users/blahiri/healthcare/data/cloudera_challenge/pat_proc_larger.csv") #69.2 MB
    pat_proc = pycsv.csvToDataFrame(sqlContext, plaintext_rdd, sep = ",")
    
    anom = pat_proc.filter(pat_proc.is_anomalous == 1)
    benign = pat_proc.filter(pat_proc.is_anomalous == 0)
    n_benign = benign.count()
    print("anom.count() = " + str(anom.count()) + ", benign.count() = " + str(benign.count())) #anom.count() = 49542, benign.count() = 197406
    
    sample_from_benign = benign.sample(False, 50000/n_benign)
    pat_proc = anom.unionAll(sample_from_benign)
    print("pat_proc.count() = " + str(pat_proc.count())) #99,227
    
    all_columns = pat_proc.columns
    features = [x for x in all_columns if (x not in ["patient_id", "is_anomalous"])]
    categorical_features = ["age_group", "gender", "income_range"] #We are listing these 3 as categorical features only as the procedure features have 0-1 values anyway 
    procedure_features = [x for x in features if (x not in categorical_features)]
    
    #Construct the map categoricalFeaturesInfo, which specifies which features are categorical and how many categorical values each of those features can take.
    
    #Create a dictionary where the key-value pairs are as follows: key is the name of the categorical feature, and value is a list with the following entries:
    #1) an id of the feature that is incremented sequentially, 2) no. of distinct values of the feature, 3) a list of the distinct values of the feature.
    cat_feature_number = 0
    dict_cat_features = {}
    
    for feature in categorical_features:
       agvalues = pat_proc.select(pat_proc[feature].cast("string").alias("feature")).distinct().collect() #collect() is an action that returns all the elements of the dataset as an array at the driver program. 
       #Calls to collect() imply there would be communication between the executors and the driver, so use it with discretion. 
       distinct_values = map(lambda row: row.asDict().values()[0], agvalues)
       distinct_values = sorted(map(lambda unicode_val: unicode_val.encode('ascii','ignore'), distinct_values))
       dict_cat_features[feature] = [cat_feature_number, len(distinct_values), distinct_values]
       cat_feature_number += 1
       
    pat_proc = pat_proc.rdd
    print("pat_proc.getNumPartitions() = " + str(pat_proc.getNumPartitions())) #4 partitions: the default should be the number of logical cores, which is 8
    
    (train, test) = pat_proc.randomSplit([0.5, 0.5])
    test_data_size = test.count()
    print("train.count() = " + str(train.count()) + ", test.count() = " + str(test_data_size))
    training_data = train.map(lambda x: create_labeled_point(x, features, categorical_features, dict_cat_features, procedure_features))
    print("training_data.count() = " + str(training_data.count()))
    
    #Populate the actual categoricalFeaturesInfo dictionary
    cat_features_info = dict([(value[0], value[1]) for (key, value) in dict_cat_features.iteritems()])
    procedure_features_info = dict([(feature_id, 2) for feature_id in range(3, 2 + len(procedure_features))])
    cat_features_info = dict(cat_features_info.items() + procedure_features_info.items())
    
    t0 = time()
    model = DecisionTree.trainClassifier(training_data, numClasses = 2, categoricalFeaturesInfo = cat_features_info, impurity = 'gini', maxDepth = 2, maxBins = 32) 
    #Under the hood in DecisionTree.scala, RandomForest is called with numTrees = 1 and featureSubsetStrategy = "all".
    tt = time() - t0
    print "Classifier trained in {} seconds".format(round(tt,3)) #63.355 seconds (5.5 times compared to standalone R). Even when maxDepth was reduced from 5 to 2, time to train was 61.942 seconds.
    print(model)
    
    test_data = test.map(lambda x: create_labeled_point(x, features, categorical_features, dict_cat_features, procedure_features))
    
    t0 = time()
    predictions = model.predict(test_data.map(lambda p: p.features))
    tt = time() - t0
    print "Prediction made in {} seconds".format(round(tt,3)) #0.014 seconds
    
    labels_and_preds = test_data.map(lambda p: p.label).zip(predictions) #Create a list of tuples with each tuple having the actual and the predicted label
    test_accuracy = labels_and_preds.filter(lambda (v, p): v == p).count() / float(test_data_size)
    fpr = labels_and_preds.filter(lambda (v, p): (v == 0 and p == 1)).count()/labels_and_preds.filter(lambda (v, p): v == 0).count() 
    fnr = labels_and_preds.filter(lambda (v, p): (v == 1 and p == 0)).count()/labels_and_preds.filter(lambda (v, p): v == 1).count()
    print "Test accuracy is {}, fpr is {}, fnr is {}".format(round(test_accuracy, 4), round(fpr, 4), round(fnr, 4)) #With maxDepth = 5, test accuracy is 0.9084, fpr is 0.1555, fnr is 0.0272.
    #With maxDepth = 2, test accuracy is 0.861, fpr is 0.2591, fnr is 0.018
    print model.toDebugString()
    
  except Exception:
    print("Exception in user code:")
    traceback.print_exc(file = sys.stdout)
  return model 
Example #21
def tempSave(csv_location, table_name, sc, sqlContext):
    csv_location = downloadHTTP(csv_location)
    rdd = sc.textFile(csv_location)
    df = pycsv.csvToDataFrame(sqlContext, rdd)
    df.registerTempTable(table_name)