def create_vector_file(pitch_outcome, path, folder):
    from pyspark.mllib.regression import LabeledPoint
    from pyspark.mllib.util import MLUtils

    pitch_o_RDD = pitch_outcome.rdd
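    # Column 13 is used as the label; the remaining columns (0-12 and 14-20) form the feature vector.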
    x = pitch_o_RDD.map(lambda data: LabeledPoint(data[13],[data[0],data[1],data[2],data[3],\
            data[4],data[5],data[6],data[7],data[8],data[9],data[10],data[11],data[12],data[14],\
            data[15],data[16],data[17],data[18],data[19],data[20]]))
    MLUtils.saveAsLibSVMFile(x, path + folder)
Example No. 2
    def write_rdd(rdd, path, out_type = 'pickleFile', db_path = None, db_fields = {}) : 
        """
        Write an RDD to disk with a given output type and optionally adding an entry to an RDD metadata database. 
        
        **Input**

        *rdd* : the rdd to write to disk

        *path* : absolute path for the output

        **Optional Keywords**

        *out_type* : which type of Spark output to create 

        *db_path* : if you want to add the entries to a database, add the path here

        *db_fields* : if you specify a *db_path* you must also specify a dictionary of database fields and their values
        """
        
        if out_type == 'pickleFile' : 
            rdd.saveAsPickleFile(path)        
        elif out_type == 'textFile' : 
            rdd.saveAsTextFile(path)
        elif out_type == 'libsvm' : 
            from pyspark.mllib.util import MLUtils
            MLUtils.saveAsLibSVMFile(rdd, path)
            
        else : 
            raise RuntimeError("out_type must be either 'pickleFile' or 'textFile'")

        if db_path is not None : 
            # open the database connection -- if the database doesn't exist it will automatically be created
            import sqlite3
            import time
            conn = sqlite3.connect(db_path)

            with conn :
                c = conn.cursor()

                # create the RDDs table if it doesn't exist
                c.execute('CREATE TABLE IF NOT EXISTS RDDs (path TEXT, date_time TEXT, filter TEXT, description TEXT, script TEXT, year_start INTEGER, year_end INTEGER)')

                filter_text = db_fields.get('filter', '')
                description_text = db_fields.get('description', '')
                script_text = db_fields.get('script', '')
                year_start = db_fields.get('year_start', 0)
                year_end = db_fields.get('year_end', 0)

                # form data tuple
                date_string = time.strftime('%Y-%m-%d_%H:%M:%S')
                data = (path, date_string, filter_text, description_text, script_text, year_start, year_end)
                c.execute('INSERT INTO RDDs VALUES (?,?,?,?,?,?,?)', data)
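
    # A minimal usage sketch (not part of the original snippet): it assumes an
    # active SparkContext `sc`; the output path and db_fields values are placeholders.
    example_rdd = sc.parallelize(range(100))
    write_rdd(example_rdd, '/tmp/example_rdd_output', out_type='pickleFile',
              db_path='/tmp/rdd_metadata.db',
              db_fields={'description': 'example output', 'script': 'example.py',
                         'year_start': 2000, 'year_end': 2010})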
Example No. 3
    def saveaslibSVMfile(self):
        """
        Save the data to a libsvm-format file.
        :return:
        """
        sc = SparkContext(master="local[2]",
                          appName="SaveAsLibSVMFile" +
                          os.path.basename(self.__savepath))
        features = sc.textFile(self.__featurespath)
        TOTALFEATUREANDLABEL = sc.accumulator([], ListParamForLabeledPoint())

        def codechange(line):
            """
            Split the class information out of the key-frame path using "_v".
            :param line: the key-frame's features
            :return: (class id, features)
            """
            classname = os.path.basename(line[0]).split("_v")[0]
            classnum = self.__classmap[classname]
            # ResultIterable = list(line[1])
            # features = ResultIterable[0] + ResultIterable[1] + ResultIterable[2]
            # print(len(features))
            return (classnum, list(line[1]))

        def getfeaturesandlabel(line):
            """
            Combine the label and features into a LabeledPoint.
            :param line: (class id, features)
            :return: the label and features combined as a LabeledPoint
            """
            #global TOTALFEATUREANDLABEL
            return LabeledPoint(line[0], Vectors.dense(line[1]))
            #TOTALFEATUREANDLABEL += [LabeledPoint(line[0], Vectors.dense(line[1]))]

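        # Assumed layout of each line in the features file (after split(" ")):
        # field 0 is an id-like prefix, field 1 is the key-frame path, fields 2+ are the feature values.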
        featuresandlabel = features.map(lambda x: x.split(" ")).map(
            lambda x: (x[1], x[2:])).map(codechange).map(
                getfeaturesandlabel).repartition(1)
        featuresandlabel.count()
        print(featuresandlabel.count())
        #totalfeatureandlabel = TOTALFEATUREANDLABEL.value
        MLUtils.saveAsLibSVMFile(featuresandlabel, self.__savepath)
        sc.stop()
Example No. 4
d2.take(2)


# In[21]:

from pyspark.mllib.util import MLUtils

dataOutput="libsvm_data.txt"
import os.path
import shutil
if os.path.exists(dataOutput):
    shutil.rmtree(dataOutput)#os.rmdir(dataOutput)
    print(dataOutput)

MLUtils.saveAsLibSVMFile(d2, dataOutput)


# In[22]:

for i, x in enumerate(features): print(i, x)


# In[23]:


# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = d2.randomSplit([0.7, 0.3])

# Train a DecisionTree model.
#  Empty categoricalFeaturesInfo indicates all features are continuous.
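
# The notebook cell is cut off here; a minimal sketch of the training step the two
# comments above describe (numClasses and the hyperparameters below are placeholders,
# not taken from the original notebook):
from pyspark.mllib.tree import DecisionTree

model = DecisionTree.trainClassifier(trainingData, numClasses=2,
                                     categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)
predictions = model.predict(testData.map(lambda x: x.features))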
Example No. 5
df_train.write.options(
    header="true").csv("hdfs://node1:9000/user/root/exp4/procd_train_real.csv")
df_train.write.parquet(
    "hdfs://node1:9000/user/root/exp4/procd_train_real.parquet")

# %%
# Fill in missing values.
# The first strategy is to fill every null in the last 8 features with 0.
df_train_filled = df_train.fillna(0)
df_train_filled.show()

# %%
# Convert the data into a suitable format.
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vectors
# Convert to an RDD first.
df_train_rdd = df_train_filled.rdd
# Reshape into (label, features) form.
df_train_rdd = df_train_rdd.map(
    lambda line: LabeledPoint(line[2], Vectors.dense(line[3:])))

# %%
# Save in LibSVM format so it is easy to use for training later.
from pyspark.mllib.util import MLUtils
MLUtils.saveAsLibSVMFile(df_train_rdd,
                         "hdfs://node1:9000/user/root/exp4/procd_train_real")

# %%
# Don't forget to stop the session.
spark.stop()
Example No. 6
          for v in values:
            if v in vocab[col]:
              word_indices.append(start_index + vocab[col].index(v))
          for k, v in sorted(six.iteritems(Counter(word_indices))):
            feature_indices.append(k)
            feature_values.append(float(v))
        start_index += len(vocab[col])
      if col == target_col:
        label = vocab[col].index(col_value) if classification else col_value
    return {"label": label, "indices": feature_indices, "values": feature_values}
  
  return process_rows


process_row_fn = make_process_rows_fn(
    classification, args.target, text_columns, category_columns, number_columns, vocab, stats)

dfs = []
if args.train:
  dfTrain = spark.read.schema(schema).csv(args.train)
  dfs.append(("train", dfTrain))
if args.eval:
  dfEval = spark.read.schema(schema).csv(args.eval)
  dfs.append(("eval", dfEval))

for name, df in dfs:
  rdd = df.rdd.map(process_row_fn).map(
      lambda row: LabeledPoint(row["label"],
                               SparseVector(feature_size, row["indices"], row["values"])))
  MLUtils.saveAsLibSVMFile(rdd, os.path.join(args.output, name))

Example No. 7
# Try and import the PySpark classes
try:
    from pyspark import SparkContext
    from pyspark import SparkConf
    from pyspark.mllib.classification import LogisticRegressionWithSGD
    from pyspark.mllib.regression import LabeledPoint
    from pyspark.mllib.util import MLUtils

    print("Successfully loaded Spark and MLlib classes...")

except ImportError as e:
    print("Error importing spark modules", e)
    sys.exit(1)


from numpy import array

conf = SparkConf().setAppName("RecessionPredictionModel").setMaster("local")

sc = SparkContext(conf=conf)

data = sc.textFile("/Users/agaram/development/DataScienceExperiments/econometricsPoc/EconometricsDataSlope.csv/Sheet1-Table1.csv")

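# Note: the label is the last element of the same slice of columns used to build the
# feature vector, so the label value is also included as a feature in this example.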
parsedData = data.map(lambda line: LabeledPoint([float(x) for x in line.split(',')[1:8]][6],
                                                array([float(x) for x in line.split(',')[1:8]])))

MLUtils.saveAsLibSVMFile(parsedData, "/Users/agaram/development/DataScienceExperiments/econometricsPoc/svmDataSlope")

Example No. 8
# (this snippet assumes an existing SparkContext `sc` and SparkSession `spark`)
from pyspark.mllib.feature import HashingTF, IDF
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.util import MLUtils
from pyspark.ml.classification import LogisticRegression, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

documents = sc.textFile("training_data.txt").map(lambda line: line.split("::"))
features = documents.map(lambda line: line[1])
labels = documents.map(lambda line: line[3])

hashingTF = HashingTF(numFeatures=300)
tf = hashingTF.transform(features)
tf.cache()
idf = IDF().fit(tf)
tfidf = idf.transform(tf)
sparse_vectors = tfidf.collect()
_labels = labels.collect()
labelpoint_data = []
for i, j in zip(_labels, sparse_vectors):
    labelpoint_data.append(LabeledPoint(i, j))
#model = SVMWithSGD.train(sc.parallelize(labelpoint_data), iterations=10)
MLUtils.saveAsLibSVMFile(sc.parallelize(labelpoint_data), 'tempFile')

result = spark.read.format("libsvm").load("tempFile/part-00003")
for i in range(3):
    inputData = spark.read.format("libsvm").load("tempFile/part-0000" + str(i))
    result = result.unionAll(inputData)

#(train, test) = result.randomSplit([0.8, 0.2])
(train, test1) = result.randomSplit([0.8, 0.2])
(train1, test) = train.randomSplit([0.8, 0.2])
lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True)
ovr = OneVsRest(classifier=lr)
ovrModel = ovr.fit(train)
predictions = ovrModel.transform(train)
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
Example No. 9
from pyspark.mllib.util import MLUtils
# Canonical usage from the MLUtils doctest:
# >>> examples = [LabeledPoint(1.1, Vectors.sparse(3, [(0, 1.23), (2, 4.56)])),
# ...             LabeledPoint(0.0, Vectors.dense([1.01, 2.02, 3.03]))]
# >>> tempFile = NamedTemporaryFile(delete=True)
# >>> tempFile.close()
# >>> MLUtils.saveAsLibSVMFile(sc.parallelize(examples), tempFile.name)



from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF, IDF
from pyspark.mllib.tree import RandomForest
from pyspark import SparkContext

sc=SparkContext("local","dd")
train = sc.parallelize(open("/home/madhura/ML_Spring16/MLProject/data/OriginalTraining.txt").read().splitlines()).map(lambda x: x.split(","))
trainlabels = train.map(lambda ab: int(ab[1]))
traintf = HashingTF().transform(train.map(lambda ab: ab[0].split()))
trainidf = IDF().fit(traintf)
traintfidf = trainidf.transform(traintf)
#densetrain = traintfidf.map(lambda x: pyspark.mllib.linalg.DenseVector(x.toArray()))
#zippeddata = trainlabels.zip(densetrain)
#new = zippeddata.map(lambda (a,vec) : (a,vec.toArray()))
training = trainlabels.zip(traintfidf).map(lambda x : LabeledPoint(x[0], x[1]))
MLUtils.saveAsLibSVMFile(training.coalesce(1),"/home/madhura/ML_Spring16/MLProject/data/libsvmfile")
data = MLUtils.loadLibSVMFile(sc, "/home/madhura/ML_Spring16/MLProject/data/libsvmfile/part-00000")
(trainingData, testData) = data.randomSplit([0.7, 0.3])
model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                     numTrees=3, featureSubsetStrategy="auto",
                                     impurity='gini', maxDepth=4, maxBins=32)
model.save(sc, "/home/madhura/ML_Spring16/MLProject/SentimentAnalysis_NLTK_NB/src/myRandomForestClassificationModel")
Example No. 10
    StructField("channel", IntegerType(), True),
    StructField("click_time", TimestampType(), True),
    StructField("attributed_time", TimestampType(), True),
    StructField("is_attributed", IntegerType(), True)])

df = sqlContext.read.format("com.databricks.spark.csv").option("header", True).schema(trainschema).load(path)

rdd1 = df.rdd
print (rdd1.take(3))
#(ip=87540, app=12, device=1, os=13, channel=497, click_time=datetime.datetime(2017, 11, 7, 9, 30, 38), attributed_time=None, is_attributed=0)


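# label = is_attributed (index 7); features = ip, app, device, os, channel (indices 0-4)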
rdd2 = rdd1.map(lambda line: LabeledPoint(line[7],[line[0],line[1],line[2],line[3],line[4]]))
print (rdd2.take(3))

MLUtils.saveAsLibSVMFile(rdd2, "gs://kb-advanced-bracketology/talkingdata-adtracking-fraud-detection/processed_files/training_files_libsvm/train_sample")



===============REF: https://stackoverflow.com/questions/43920111/convert-dataframe-to-libsvm-format =============

from pyspark.mllib.util import MLUtils
from pyspark.mllib.regression import LabeledPoint

# A DATAFRAME
>>> df.show()
+---+---+---+
| _1| _2| _3|
+---+---+---+
|  1|  3|  6|  
|  4|  5| 20|
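
# The pasted answer is truncated above; a minimal sketch of the conversion it describes
# (assuming column _1 is the label, _2 and _3 are features; the output path is a placeholder):
labeled_rdd = df.rdd.map(lambda row: LabeledPoint(row[0], row[1:]))
MLUtils.saveAsLibSVMFile(labeled_rdd, "df_as_libsvm_output")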
Example No. 11
    # configure env. variables. Refer to step 1
    sc = config.config_env(version)
    sess = SparkSession(sc)

    # read from 'AMOS' and 'aircraftutilization' necessary metrics and create
    # response variable. Refer to steps 2 and 4
    ACuti_Mevents = format_data_from_sources(sc)

    # read sensor information from HDFS or local csv's. Refer to step 3
    averages = data_from_csvs(sc, sess, loadfrom, csv_path)

    # create enriched aircraft utilization metrics (join sensor data).
    # Refer to step 5
    matrix = join_csvs_dwinfo(sc, averages, ACuti_Mevents)

    # Model saving procedure (to local). Refer to step 6

    # format previous rdd to 'labeled points'
    labeledpoints = matrix.map(lambda t: LabeledPoint(t[4], t[:3]))

    # get (local) saving path
    matrix_path = os.getcwd() + '/data_matrix/'

    # remove previous matrix version, if one
    shutil.rmtree(matrix_path, onerror=lambda f, path, exinfo: ())

    # save matrix
    MLUtils.saveAsLibSVMFile(labeledpoints, matrix_path)
    print(f'Data matrix saved in {matrix_path}')
Example No. 12
data_svm_sql = sqlContext.read.format("libsvm").load("/user/wrt/credit/allexample.libsvm")
data_svm = data_svm_sql.map(lambda row:LabeledPoint(int(row.label),row.features))
features = data_svm.map(lambda x: x.features)
stat = Statistics.colStats(features)
coverage = (stat.numNonzeros()/stat.count()).tolist()
std = numpy.sqrt(stat.variance()).tolist()
features_nums = data_svm.map(lambda x: x.features.size).take(1)[0]
features_arr = range(0, features_nums)
re = zip(zip(coverage, std), features_arr)
filteredIndexes = [m[1] for m in re if m[0][0] >= 0.005]
slicer = VectorSlicer(inputCol="features", outputCol="featuresFiltered", indices=filteredIndexes)
output_df = slicer.transform(data_svm_sql)
data_svm_filtered = output_df.select("label","featuresFiltered")
data_svm_labelpoint = data_svm_filtered.map(lambda row:LabeledPoint(int(row.label),row.featuresFiltered))
MLUtils.saveAsLibSVMFile(data_svm_labelpoint,"/user/wrt/credit/allexample_filter.libsvm")
rdd_r = sc.textFile("/user/wrt/credit/allexample_filter.libsvm")\
    .map(lambda x :x.split()[0].split('.')[0] + '\001' + ' '.join(x.split()[1:]))
rdd_r.saveAsTextFile("/user/wrt/credit/allexample_filter_telindex_features")
feature_raw = sc.textFile("/hive/warehouse/wlcredit.db/t_wrt_credit_all_features_name/ds=" + today + "_cms1234_anf")\
    .map(lambda x:valid_jsontxt(x.split("\t")[0])).collect()
fea_all_index = []
j = 1
for i in filteredIndexes:
    fea_all_index.append(feature_raw[i] + "\t" + str(j))
    j += 1
sc.parallelize(fea_all_index).saveAsTextFile('/user/wrt/temp/filter_feature_name')


hiveContext.sql("load data inpath '/user/wrt/temp/filter_feature_name' overwrite into table \
wlcredit.t_wrt_credit_all_features_name PARTITION (ds = '" + today + "_cms1234_anf_filter')")
Example No. 13

scores = df.where("score IS NOT NULL") \
         .where("type='story'") \
         .where("title IS NOT NULL") \
         .map(lambda row: row.score)

def loadVecs(score_pairs):
    import numpy as np
    docvecs = np.load("/data/_hndata/hn.docvecs.doctag_syn0.npy")
    return [(s, docvecs[i]) for (s,i) in score_pairs]

vecs = scores.zipWithIndex().mapPartitions(loadVecs)
data = vecs.map(lambda pair: LabeledPoint(log(float(pair[0])+1.0), pair[1]))

MLUtils.saveAsLibSVMFile(data, "hdfs:///hndata/docvecs")

# Split the data into training and test sets
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a RandomForest model.
#  Empty categoricalFeaturesInfo indicates all features are continuous.
#  Note: Use larger numTrees in practice.
#  Setting featureSubsetStrategy="auto" lets the algorithm choose.
rr = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                    numTrees=5, featureSubsetStrategy="auto",
                                    impurity='variance', maxDepth=4, maxBins=32)

predictions = rr.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() / float(testData.count())
Example No. 14
output.select("mimic_features").show(truncate=False)

# Load list of admissions
# sample = pd.read_csv("sepsis_and_not_sepsis_admissions.csv")
# sample['HADM_ID'] = sample['HADM_ID'].str.extract('([0-9]+)')
# sample_admissions = np.array(sample['HADM_ID']).tolist()
# sample = sample[["HADM_ID", "label"]]
# sample = sqlCtx.createDataFrame(sample)
# sample.write.csv("labels.csv", header=True, mode="overwrite")
sample = spark.read.csv("labels.csv", header=True, inferSchema=True)

# Get ready to save as libsvm
# https://stackoverflow.com/questions/43920111/convert-dataframe-to-libsvm-format
from pyspark.mllib.util import MLUtils
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import DenseVector, VectorUDT

features_libsvm = output.select("HADM_ID", "mimic_features").join(sample,
                                                                  on="HADM_ID",
                                                                  how="left")
features_libsvm = features_libsvm.select("label", "mimic_features")

features_libsvm_rdd = features_libsvm.rdd
features_libsvm_rdd.take(1)

features_libsvm_format = features_libsvm_rdd.map(
    lambda line: LabeledPoint(line[0], DenseVector(line[1])))
features_libsvm_format.take(3)

MLUtils.saveAsLibSVMFile(features_libsvm_format, "features_combined.libsvm")
Example No. 15
    .withColumn('dayofweek', dayofweek("CallDateTime")) \
    .withColumn('dayofyear', dayofyear("CallDateTime")) \
    .withColumn('hour', hour("CallDateTime")) \
    .withColumn('minute', minute("CallDateTime")) \
    .withColumn('weekofyear', weekofyear("CallDateTime"))

# check to see if latitude is null
df.filter(df.latitude.isNull()).count()
df.filter(df.longitude.isNull()).count()

# consider only data which has a geo location specified
df = df.filter(df.latitude.isNotNull())

df_rolledup = df.groupBy("latitude", "longitude", "month", "dayofmonth").count()
df_rolledup = df_rolledup.select("count","latitude", "longitude", "month", "dayofmonth")


# convert to RDD first
df_rdd = df_rolledup.rdd

# Code for running a liner regression model on the same
from pyspark.mllib.util import MLUtils
from pyspark.mllib.regression import LabeledPoint

# FROM RDD OF TUPLE TO A RDD OF LABELEDPOINT for training and testing
df_libsvm = df_rdd.map(lambda line: LabeledPoint(line[0],line[1:]))

# SAVE AS LIBSVM
MLUtils.saveAsLibSVMFile(df_libsvm, "hdfs://worker2.hdp-internal:8020/user/sdeshpa1/data_final_test.txt")
Example No. 16
    sc = spark.sparkContext

    #-------------------------------------------------------------------------------
    # Read the training data and build the model
    #-------------------------------------------------------------------------------

    #reading the train dataframes
    trainingDF = spark.read.load("../data/train_small.parquet")

    #convert every row to LabeledPoint
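    # (labels are shifted from 1-based to 0-based, since MLlib classifiers expect labels in [0, numClasses))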
    transformedTrainingRDD = (trainingDF.rdd.map(
        lambda row: LabeledPoint(int(row.label) - 1, row.features)))
    #print transformedTrainingRDD.show()

    #Save the RDD in LibSVM format, as Naive Bayes reads in the same format
    MLUtils.saveAsLibSVMFile(transformedTrainingRDD, "trainingLibsvmfile")
    training = MLUtils.loadLibSVMFile(sc, "trainingLibsvmfile/*")
    print "trainingLibsvmfile created!!"

    # Train a RandomForest model.
    #  Empty categoricalFeaturesInfo indicates all features are continuous.
    #  Note: Use larger numTrees in practice.
    #  Setting featureSubsetStrategy="auto" lets the algorithm choose.
    model = RandomForest.trainClassifier(training,
                                         numClasses=10,
                                         categoricalFeaturesInfo={},
                                         numTrees=24,
                                         featureSubsetStrategy="auto",
                                         impurity='gini',
                                         maxDepth=4,
                                         maxBins=32)
Example No. 17
from pyspark import SparkContext, SparkConf
from pyspark.ml.classification import LogisticRegression
from pyspark.sql import SparkSession
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.util import MLUtils
if __name__ == "__main__":
  def parsePoint(line):
    values = [float(x) for x in line.split(' ')]
    return LabeledPoint(values[0], values[1:])
  conf = SparkConf().setAppName("LG")
  sc = SparkContext(conf=conf)
  data = sc.textFile("hdfs://student83-x1:9000/sample_svm.txt")
  parsedData = data.map(parsePoint)
  MLUtils.saveAsLibSVMFile(
      parsedData, "hdfs://student83-x1:9000/sample_libsvm")