def create_vector_file(pitch_outcome, path, folder):
    from pyspark.mllib.regression import LabeledPoint
    from pyspark.mllib.util import MLUtils

    # Convert the DataFrame to an RDD of LabeledPoint, using column 13 as the
    # label and the remaining columns as features.
    pitch_o_RDD = pitch_outcome.rdd
    x = pitch_o_RDD.map(lambda data: LabeledPoint(data[13],
                                                  [data[0], data[1], data[2], data[3],
                                                   data[4], data[5], data[6], data[7],
                                                   data[8], data[9], data[10], data[11],
                                                   data[12], data[14], data[15], data[16],
                                                   data[17], data[18], data[19], data[20]]))
    MLUtils.saveAsLibSVMFile(x, path + folder)
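# A minimal usage sketch (assumption, not from the original source): build a tiny
# 21-column DataFrame so that column 13 is the label, then write it out in LibSVM
# format. The SparkSession, row values, and output path here are hypothetical.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("create_vector_file_demo").getOrCreate()
demo_row = tuple(float(i) for i in range(21))   # 21 numeric columns
pitch_df = spark.createDataFrame([demo_row])    # column 13 becomes the label
create_vector_file(pitch_df, "/tmp/pitch_outcome/", "libsvm_demo")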
def write_rdd(rdd, path, out_type='pickleFile', db_path=None, db_fields={}):
    """
    Write an RDD to disk with a given output type, optionally adding an entry
    to an RDD metadata database.

    **Input**

    *rdd* : the RDD to write to disk

    *path* : absolute path for the output

    **Optional Keywords**

    *out_type* : which type of Spark output to create

    *db_path* : if you want to add the entries to a database, give the path here

    *db_fields* : if you specify a *db_path* you must also specify a dictionary
    of database fields and their values
    """
    if out_type == 'pickleFile':
        rdd.saveAsPickleFile(path)
    elif out_type == 'textFile':
        rdd.saveAsTextFile(path)
    elif out_type == 'libsvm':
        from pyspark.mllib.util import MLUtils
        MLUtils.saveAsLibSVMFile(rdd, path)
    else:
        raise RuntimeError("out_type must be 'pickleFile', 'textFile', or 'libsvm'")

    if db_path is not None:
        # open the database connection -- if the database doesn't exist it will
        # automatically be created
        import sqlite3
        import time

        conn = sqlite3.connect(db_path)

        with conn:
            c = conn.cursor()

            # create the RDDs table if it doesn't exist
            c.execute('CREATE TABLE IF NOT EXISTS RDDs '
                      '(path text, date_time text, filter text, description text, '
                      'script text, year_start INTEGER, year_end INTEGER)')

            filter_text = db_fields.get('filter', '')
            description_text = db_fields.get('description', '')
            script_text = db_fields.get('script', '')
            year_start = db_fields.get('year_start', 0)
            year_end = db_fields.get('year_end', 0)

            # form the data tuple
            date = time.localtime()
            date_string = '%s-%02d-%02d_%02d:%02d:%02d' % (date.tm_year, date.tm_mon, date.tm_mday,
                                                           date.tm_hour, date.tm_min, date.tm_sec)
            data = (path, date_string, filter_text, description_text, script_text,
                    year_start, year_end)

            c.execute('INSERT INTO RDDs VALUES (?,?,?,?,?,?,?)', data)
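# A minimal usage sketch (assumption, not from the original source): write an RDD
# as a pickle file and record it in a SQLite metadata database. The SparkContext,
# paths, and db_fields values below are hypothetical.
from pyspark import SparkContext

sc = SparkContext("local[1]", "write_rdd_demo")
demo_rdd = sc.parallelize(range(100))
write_rdd(demo_rdd, "/tmp/demo_rdd_pickle",
          out_type="pickleFile",
          db_path="/tmp/rdd_metadata.db",
          db_fields={"filter": "none",
                     "description": "demo RDD of integers",
                     "script": "demo.py",
                     "year_start": 2015,
                     "year_end": 2016})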
def saveaslibSVMfile(self):
    """
    Save the key-frame features in libsvm format.
    :return:
    """
    sc = SparkContext(master="local[2]",
                      appName="SaveAsLibSVMFile" + os.path.basename(self.__savepath))
    features = sc.textFile(self.__featurespath)
    TOTALFEATUREANDLABEL = sc.accumulator([], ListParamForLabeledPoint())

    def codechange(line):
        """
        Split the class name out of the file name on "_v" and map it to its class number.
        :param line: key-frame features
        :return: (class number, features)
        """
        classname = os.path.basename(line[0]).split("_v")[0]
        classnum = self.__classmap[classname]
        # ResultIterable = list(line[1])
        # features = ResultIterable[0] + ResultIterable[1] + ResultIterable[2]
        # print(len(features))
        return (classnum, list(line[1]))

    def getfeaturesandlabel(line):
        """
        Combine the label and features into a LabeledPoint.
        :param line: (class number, features)
        :return: a LabeledPoint holding the label and feature vector
        """
        # global TOTALFEATUREANDLABEL
        return LabeledPoint(line[0], Vectors.dense(line[1]))
        # TOTALFEATUREANDLABEL += [LabeledPoint(line[0], Vectors.dense(line[1]))]

    featuresandlabel = features.map(lambda x: x.split(" ")).map(
        lambda x: (x[1], x[2:])).map(codechange).map(
        getfeaturesandlabel).repartition(1)
    featuresandlabel.count()
    print(featuresandlabel.count())
    # totalfeatureandlabel = TOTALFEATUREANDLABEL.value
    MLUtils.saveAsLibSVMFile(featuresandlabel, self.__savepath)
    sc.stop()
d2.take(2)


# In[21]:

from pyspark.mllib.util import MLUtils
import os.path
import shutil

dataOutput = "libsvm_data.txt"
# remove any previous output directory before saving
if os.path.exists(dataOutput):
    shutil.rmtree(dataOutput)  # os.rmdir(dataOutput)
print(dataOutput)
MLUtils.saveAsLibSVMFile(d2, dataOutput)


# In[22]:

for i, x in enumerate(features):
    print(i, x)


# In[23]:

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = d2.randomSplit([0.7, 0.3])

# Train a DecisionTree model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
df_train.write.options(header="true").csv(
    "hdfs://node1:9000/user/root/exp4/procd_train_real.csv")
df_train.write.parquet(
    "hdfs://node1:9000/user/root/exp4/procd_train_real.parquet")

# %%
# Fill missing values.
# The first strategy is to fill every null in the last 8 features with 0.
df_train_filled = df_train.fillna(0)
df_train_filled.show()

# %%
# Convert the data into a suitable format.
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vectors

# Convert to an RDD first.
df_train_rdd = df_train_filled.rdd
# Reshape into (label, features).
df_train_rdd = df_train_rdd.map(
    lambda line: LabeledPoint(line[2], Vectors.dense(line[3:])))

# %%
# Save in LibSVM format so it can be reused for training later.
from pyspark.mllib.util import MLUtils
MLUtils.saveAsLibSVMFile(df_train_rdd,
                         "hdfs://node1:9000/user/root/exp4/procd_train_real")

# %%
# Don't forget to stop the session.
spark.stop()
            for v in values:
                if v in vocab[col]:
                    word_indices.append(start_index + vocab[col].index(v))
            for k, v in sorted(six.iteritems(Counter(word_indices))):
                feature_indices.append(k)
                feature_values.append(float(v))
            start_index += len(vocab[col])
            if col == target_col:
                label = vocab[col].index(col_value) if classification else col_value
        return {"label": label, "indices": feature_indices, "values": feature_values}

    return process_rows


process_row_fn = make_process_rows_fn(
    classification, args.target, text_columns, category_columns, number_columns,
    vocab, stats)

dfs = []
if args.train:
    dfTrain = spark.read.schema(schema).csv(args.train)
    dfs.append(("train", dfTrain))
if args.eval:
    dfEval = spark.read.schema(schema).csv(args.eval)
    dfs.append(("eval", dfEval))

for name, df in dfs:
    rdd = df.rdd.map(process_row_fn).map(
        lambda row: LabeledPoint(row["label"],
                                 SparseVector(feature_size, row["indices"], row["values"])))
    MLUtils.saveAsLibSVMFile(rdd, os.path.join(args.output, name))
import sys

# Try to import the PySpark classes
try:
    from pyspark import SparkContext
    from pyspark import SparkConf
    from pyspark.mllib.classification import LogisticRegressionWithSGD
    from pyspark.mllib.regression import LabeledPoint
    from pyspark.mllib.util import MLUtils
    print("Successfully loaded Spark and MLlib classes...")
except ImportError as e:
    print("Error importing spark modules", e)
    sys.exit(1)

from numpy import array

conf = SparkConf().setAppName("RecessionPredictionModel").setMaster("local")
sc = SparkContext(conf=conf)

data = sc.textFile("/Users/agaram/development/DataScienceExperiments/econometricsPoc/EconometricsDataSlope.csv/Sheet1-Table1.csv")

# Use the last of the seven parsed columns as the label and all seven as features.
parsedData = data.map(lambda line: LabeledPoint(
    [float(x) for x in line.split(',')[1:8]][6],
    array([float(x) for x in line.split(',')[1:8]])))

MLUtils.saveAsLibSVMFile(parsedData, "/Users/agaram/development/DataScienceExperiments/econometricsPoc/svmDataSlope")
from pyspark.mllib.feature import HashingTF, IDF
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.util import MLUtils
from pyspark.ml.classification import LogisticRegression, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

documents = sc.textFile("training_data.txt").map(lambda line: line.split("::"))
features = documents.map(lambda line: line[1])
labels = documents.map(lambda line: line[3])

# TF-IDF features
hashingTF = HashingTF(numFeatures=300)
tf = hashingTF.transform(features)
tf.cache()
idf = IDF().fit(tf)
tfidf = idf.transform(tf)

sparse_vectors = tfidf.collect()
_labels = labels.collect()

labelpoint_data = []
for i, j in zip(_labels, sparse_vectors):
    labelpoint_data.append(LabeledPoint(i, j))

# model = SVMWithSGD.train(sc.parallelize(labelpoint_data), iterations=10)
MLUtils.saveAsLibSVMFile(sc.parallelize(labelpoint_data), 'tempFile')

# Read the saved part files back as a DataFrame in libsvm format.
result = spark.read.format("libsvm").load("tempFile/part-00003")
for i in range(3):
    inputData = spark.read.format("libsvm").load("tempFile/part-0000" + str(i))
    result = result.unionAll(inputData)

# (train, test) = result.randomSplit([0.8, 0.2])
(train, test1) = result.randomSplit([0.8, 0.2])
(train1, test) = train.randomSplit([0.8, 0.2])

lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True)
ovr = OneVsRest(classifier=lr)
ovrModel = ovr.fit(train)
predictions = ovrModel.transform(train)

evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
from pyspark.mllib.util import MLUtils
# >>> examples = [LabeledPoint(1.1, Vectors.sparse(3, [(0, 1.23), (2, 4.56)])),
# ...             LabeledPoint(0.0, Vectors.dense([1.01, 2.02, 3.03]))]
# >>> tempFile = NamedTemporaryFile(delete=True)
# >>> tempFile.close()
# >>> MLUtils.saveAsLibSVMFile(sc.parallelize(examples), tempFile.name)
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF, IDF
from pyspark.mllib.tree import RandomForest
from pyspark import SparkContext

sc = SparkContext("local", "dd")

train = sc.parallelize(
    open("/home/madhura/ML_Spring16/MLProject/data/OriginalTraining.txt").read().splitlines()
).map(lambda x: x.split(","))

trainlabels = train.map(lambda ab: int(ab[1]))
traintf = HashingTF().transform(train.map(lambda ab: ab[0].split()))
trainidf = IDF().fit(traintf)
traintfidf = trainidf.transform(traintf)

# densetrain = traintfidf.map(lambda x: pyspark.mllib.linalg.DenseVector(x.toArray()))
# zippeddata = trainlabels.zip(densetrain)
# new = zippeddata.map(lambda a_vec: (a_vec[0], a_vec[1].toArray()))

training = trainlabels.zip(traintfidf).map(lambda x: LabeledPoint(x[0], x[1]))
MLUtils.saveAsLibSVMFile(training.coalesce(1), "/home/madhura/ML_Spring16/MLProject/data/libsvmfile")

data = MLUtils.loadLibSVMFile(sc, "/home/madhura/ML_Spring16/MLProject/data/libsvmfile/part-00000")
(trainingData, testData) = data.randomSplit([0.7, 0.3])

model = RandomForest.trainClassifier(data, numClasses=2, categoricalFeaturesInfo={},
                                     numTrees=3, featureSubsetStrategy="auto",
                                     impurity='gini', maxDepth=4, maxBins=32)
model.save(sc, "/home/madhura/ML_Spring16/MLProject/SentimentAnalysis_NLTK_NB/src/myRandomForestClassificationModel")
StructField("channel", IntegerType(), True), StructField("click_time", TimestampType(), True), StructField("attributed_time", TimestampType(), True), StructField("is_attributed", IntegerType(), True)]) df = sqlContext.read.format("com.databricks.spark.csv").option("header", True).schema(trainschema).load(path) rdd1 = df.rdd print (rdd1.take(3)) #(ip=87540, app=12, device=1, os=13, channel=497, click_time=datetime.datetime(2017, 11, 7, 9, 30, 38), attributed_time=None, is_attributed=0) rdd2 = rdd1.map(lambda line: LabeledPoint(line[7],[line[0],line[1],line[2],line[3],line[4]])) print (rdd2.take(3)) MLUtils.saveAsLibSVMFile(rdd2, "gs://kb-advanced-bracketology/talkingdata-adtracking-fraud-detection/processed_files/training_files_libsvm/train_sample") ===============REF: https://stackoverflow.com/questions/43920111/convert-dataframe-to-libsvm-format ============= from pyspark.mllib.util import MLUtils from pyspark.mllib.regression import LabeledPoint # A DATAFRAME >>> df.show() +---+---+---+ | _1| _2| _3| +---+---+---+ | 1| 3| 6| | 4| 5| 20|
# configure env. variables. Refer to step 1
sc = config.config_env(version)
sess = SparkSession(sc)

# read the necessary metrics from 'AMOS' and 'aircraftutilization' and create
# the response variable. Refer to steps 2 and 4
ACuti_Mevents = format_data_from_sources(sc)

# read sensor information from HDFS or local csv's. Refer to step 3
averages = data_from_csvs(sc, sess, loadfrom, csv_path)

# create enriched aircraft utilization metrics (join sensor data).
# Refer to step 5
matrix = join_csvs_dwinfo(sc, averages, ACuti_Mevents)

# Model saving procedure (to local). Refer to step 6
# format the previous RDD as 'labeled points'
labeledpoints = matrix.map(lambda t: LabeledPoint(t[4], t[:3]))

# get the (local) saving path
matrix_path = os.getcwd() + '/data_matrix/'

# remove the previous matrix version, if there is one
shutil.rmtree(matrix_path, onerror=lambda f, path, exinfo: ())

# save the matrix
MLUtils.saveAsLibSVMFile(labeledpoints, matrix_path)
print(f'Data matrix saved in {matrix_path}')
data_svm_sql = sqlContext.read.format("libsvm").load("/user/wrt/credit/allexample.libsvm")
data_svm = data_svm_sql.map(lambda row: LabeledPoint(int(row.label), row.features))

features = data_svm.map(lambda x: x.features)
stat = Statistics.colStats(features)
coverage = (stat.numNonzeros() / stat.count()).tolist()
std = numpy.sqrt(stat.variance()).tolist()

features_nums = data_svm.map(lambda x: x.features.size).take(1)[0]
features_arr = range(0, features_nums)
re = zip(zip(coverage, std), features_arr)
# keep only features whose non-zero coverage is at least 0.5%
filteredIndexes = map(lambda m: m[1], filter(lambda a: a[0][0] >= 0.005, re))

slicer = VectorSlicer(inputCol="features", outputCol="featuresFiltered", indices=filteredIndexes)
output_df = slicer.transform(data_svm_sql)
data_svm_filtered = output_df.select("label", "featuresFiltered")
data_svm_labelpoint = data_svm_filtered.map(
    lambda row: LabeledPoint(int(row.label), row.featuresFiltered))
MLUtils.saveAsLibSVMFile(data_svm_labelpoint, "/user/wrt/credit/allexample_filter.libsvm")

rdd_r = sc.textFile("/user/wrt/credit/allexample_filter.libsvm") \
    .map(lambda x: x.split()[0].split('.')[0] + '\001' + ' '.join(x.split()[1:]))
rdd_r.saveAsTextFile("/user/wrt/credit/allexample_filter_telindex_features")

feature_raw = sc.textFile("/hive/warehouse/wlcredit.db/t_wrt_credit_all_features_name/ds=" + today + "_cms1234_anf") \
    .map(lambda x: valid_jsontxt(x.split("\t")[0])).collect()

fea_all_index = []
j = 1
for i in filteredIndexes:
    fea_all_index.append(feature_raw[i] + "\t" + str(j))
    j += 1

sc.parallelize(fea_all_index).saveAsTextFile('/user/wrt/temp/filter_feature_name')
hiveContext.sql("load data inpath '/user/wrt/temp/filter_feature_name' overwrite into table "
                "wlcredit.t_wrt_credit_all_features_name PARTITION (ds = '" + today + "_cms1234_anf_filter')")
'''
scores = df.where("score IS NOT NULL") \
           .where("type='story'") \
           .where("title IS NOT NULL") \
           .map(lambda row: row.score)

def loadVecs(score_pairs):
    import numpy as np
    docvecs = np.load("/data/_hndata/hn.docvecs.doctag_syn0.npy")
    return [(s, docvecs[i]) for (s, i) in score_pairs]

vecs = scores.zipWithIndex().mapPartitions(loadVecs)
# regress on log(score + 1)
data = vecs.map(lambda pair: LabeledPoint(log(float(pair[0]) + 1.0), pair[1]))
MLUtils.saveAsLibSVMFile(data, "hdfs:///hndata/docvecs")

# Split the data into training and test sets
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a RandomForest model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
# Note: Use larger numTrees in practice.
# Setting featureSubsetStrategy="auto" lets the algorithm choose.
rr = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                 numTrees=5, featureSubsetStrategy="auto",
                                 impurity='variance', maxDepth=4, maxBins=32)

predictions = rr.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: log(lp.label + 1.0)).zip(predictions)
testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(testData.count())
output.select("mimic_features").show(truncate=False) # Load list of admissions # sample = pd.read_csv("sepsis_and_not_sepsis_admissions.csv") # sample['HADM_ID'] = sample['HADM_ID'].str.extract('([0-9]+)') # sample_admissions = np.array(sample['HADM_ID']).tolist() # sample = sample[["HADM_ID", "label"]] # sample = sqlCtx.createDataFrame(sample) # sample.write.csv("labels.csv", header=True, mode="overwrite") sample = spark.read.csv("labels.csv", header=True, inferSchema=True) # Get ready to save as libsvm # https://stackoverflow.com/questions/43920111/convert-dataframe-to-libsvm-format from pyspark.mllib.util import MLUtils from pyspark.mllib.regression import LabeledPoint from pyspark.mllib.linalg import DenseVector, VectorUDT features_libsvm = output.select("HADM_ID", "mimic_features").join(sample, on="HADM_ID", how="left") features_libsvm = features_libsvm.select("label", "mimic_features") features_libsvm_rdd = features_libsvm.rdd features_libsvm_rdd.take(1) features_libsvm_format = features_libsvm_rdd.map( lambda line: LabeledPoint(line[0], DenseVector(line[1]))) features_libsvm_format.take(3) MLUtils.saveAsLibSVMFile(features_libsvm_format, "features_combined.libsvm")
    .withColumn('dayofweek', dayofweek("CallDateTime")) \
    .withColumn('dayofyear', dayofyear("CallDateTime")) \
    .withColumn('hour', hour("CallDateTime")) \
    .withColumn('minute', minute("CallDateTime")) \
    .withColumn('weekofyear', weekofyear("CallDateTime"))

# check to see if latitude is null
df.filter(df.latitude.isNull()).count()
df.filter(df.longitude.isNull()).count()

# consider only data which has a geo location specified
df = df.filter(df.latitude.isNotNull())

df_rolledup = df.groupBy("latitude", "longitude", "month", "dayofmonth").count()
df_rolledup = df_rolledup.select("count", "latitude", "longitude", "month", "dayofmonth")

# convert to RDD first
df_rdd = df_rolledup.rdd

# Code for running a linear regression model on the same data
from pyspark.mllib.util import MLUtils
from pyspark.mllib.regression import LabeledPoint

# from an RDD of tuples to an RDD of LabeledPoint for training and testing
df_libsvm = df_rdd.map(lambda line: LabeledPoint(line[0], line[1:]))

# save as libsvm
MLUtils.saveAsLibSVMFile(df_libsvm, "hdfs://worker2.hdp-internal:8020/user/sdeshpa1/data_final_test.txt")
sc = spark.sparkContext

# -------------------------------------------------------------------------------
# Read the training data and build the model
# -------------------------------------------------------------------------------

# read the training dataframe
trainingDF = spark.read.load("../data/train_small.parquet")

# convert every row to a LabeledPoint (shift labels to start at 0)
transformedTrainingRDD = (trainingDF.rdd.map(
    lambda row: LabeledPoint(int(row.label) - 1, row.features)))
# print(transformedTrainingRDD.show())

# Save the RDD in LibSVM format, since Naive Bayes reads the same format
MLUtils.saveAsLibSVMFile(transformedTrainingRDD, "trainingLibsvmfile")
training = MLUtils.loadLibSVMFile(sc, "trainingLibsvmfile/*")
print("trainingLibsvmfile created!!")

# Train a RandomForest model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
# Note: Use larger numTrees in practice.
# Setting featureSubsetStrategy="auto" lets the algorithm choose.
model = RandomForest.trainClassifier(training, numClasses=10, categoricalFeaturesInfo={},
                                     numTrees=24, featureSubsetStrategy="auto",
                                     impurity='gini', maxDepth=4, maxBins=32)
from pyspark import SparkContext, SparkConf
from pyspark.ml.classification import LogisticRegression
from pyspark.sql import SparkSession
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.util import MLUtils

if __name__ == "__main__":

    def parsePoint(line):
        values = [float(x) for x in line.split(' ')]
        return LabeledPoint(values[0], values[1:])

    conf = SparkConf().setAppName("LG")
    sc = SparkContext(conf=conf)

    data = sc.textFile("hdfs://student83-x1:9000/sample_svm.txt")
    parsedData = data.map(parsePoint)

    MLUtils.saveAsLibSVMFile(parsedData, "hdfs://student83-x1:9000/sample_libsvm")