# Example #1
# Build bigram + unigram bag-of-words features, index labels, persist the
# training set, and fit a random forest.
# Assumes `data` (with columns 'did', 'doc', 'words') and `labels` (keyed by
# 'did' with a 'label' column) are already in scope — confirm against the
# loading code earlier in the file.
grammer = NGram(n=2, inputCol="words", outputCol="grams")
data = grammer.transform(data).drop('doc')
print("grammed\ncvetirizing1")
#data = data.rdd.map(lambda x: Row(x['did'],x['words']+x['grams'])).toDF(['did','features'])
# Term-frequency vector over unigrams, capped at the 1000 most frequent terms.
data = CountVectorizer(inputCol="words",
                       outputCol="wordCounts",
                       vocabSize=1000).fit(data).transform(data)
print("cvectorized1\ncvectorizing2")
# Term-frequency vector over bigrams, same 1000-term cap.
data = CountVectorizer(inputCol="grams",
                       outputCol="gramCounts",
                       vocabSize=1000).fit(data).transform(data)
print("cvectorized2\nvectorassembling")
# Concatenate the two count vectors into a single 'features' column, then
# drop every intermediate column to keep the DataFrame small.
data = VectorAssembler(inputCols = ['wordCounts','gramCounts'],outputCol = 'features')\
                                   .transform(data)\
                                   .drop('words','grams','wordCounts','gramCounts','doc')
# Attach ground-truth labels by document id.
data = data.join(labels, ['did'])
print('joined\nindexing')
# Map string labels to numeric indices, as Spark ML classifiers require.
indexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
data = indexer.fit(data).transform(data)
print('indexed')
# Persist the prepared training set to the bucket.
data.write.parquet("gs://elinor/NBTrainData")
#------------------------------FIT RANDOM FOREST-----------------------------#
print("fittingRF")
rf = RandomForestClassifier(labelCol="indexedLabel",
                            featuresCol="features",
                            numTrees=10)
rfmodel = rf.fit(data)
print("fitted")
#------------------------------FIT NB ---------------------------------------#
print("fitting NB")
# Example #2
# Tokenize, build unigram + bigram bag-of-words features (no vocabSize cap
# here, unlike Example #1), join labels, and index them.
# Assumes `data` has columns 'did' and 'doc', and `labels` is keyed by 'did'
# with a 'label' column — confirm against the loading code.
toker = Tokenizer(inputCol = "doc",outputCol = "words")
data = toker.transform(data)
print("tokenized\ngramming")
grammer = NGram(n=2,inputCol="words",outputCol="grams")
data = grammer.transform(data).drop('doc')
print("grammed\ncvetirizing1")
#data = data.rdd.map(lambda x: Row(x['did'],x['words']+x['grams'])).toDF(['did','features'])
data = CountVectorizer(inputCol="words", outputCol="wordCounts").fit(data).transform(data)
print("cvectorized1\ncvectorizing2")
data = CountVectorizer(inputCol="grams", outputCol="gramCounts").fit(data).transform(data)
print("cvectorized2\nvectorassembling")
# BUG FIX: 'gramCounts' was misspelled as 'gramCouns'; DataFrame.drop
# silently ignores unknown columns, so the intermediate gramCounts column
# was being carried along in the output.
data = VectorAssembler(inputCols = ['wordCounts','gramCounts'],outputCol = 'features')\
                                   .transform(data)\
                                   .drop('words','grams','wordCounts','gramCounts','doc')
data = data.join(labels,['did'])
print('joined\nindexing')
# BUG FIX(review): was inputCol="lab"; Example #1 indexes the same joined
# labels with inputCol="label", so "lab" looks like a typo that would raise
# at fit() time — confirm the label column name against the labels loader.
indexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
data = indexer.fit(data).transform(data)
print('indexed')
#------------------------------DEAL WITH TEST DATA-----------------------------#
print("loading training data")
# Build one comma-separated string of .bytes file paths for the test set.
# TEST_DATA is a text file with one document hash per line — TODO confirm.
if DEBUG :
    wtf = sc.textFile(TEST_DATA)\
            .map(lambda x: "data/bytes/" + x + ".bytes")\
            .reduce(lambda accum,x: accum + "," + x)

else:
    # BUG FIX: the bucket prefix was missing the '/' separator before the
    # file name (compare the DEBUG branch above), which produced malformed
    # paths like "gs://uga-dsp/project2/data/bytesXYZ.bytes".
    wtf = sc.textFile(TEST_DATA)\
            .map(lambda x: "gs://uga-dsp/project2/data/bytes/" + x + ".bytes")\
            .reduce(lambda accum,x: accum + "," + x)