Code Example #1
dfValidIndexed = string_indexer_model.transform(dfValidSelect).cache()
df_valid_pred = lrModel.transform(dfValidIndexed).cache()
res=evaluator.evaluate(df_valid_pred)
print res

tt = time() - t0
print "Done in {} second".format(round(tt,3))


# In[12]:

import loadFiles as lf
print "Start loading  and preprocessing test data "
t0 = time()

test,names=lf.loadUknown('./data/test')
text_name=zip(test,names) #pair each review text with its file name; the name rides in the 'label' column
dfTest = sc.parallelize(text_name).toDF(['review','label']).cache()

dfTestPre=dfTest.map(preProcess).toDF(['words','label']).cache()
bigram = NGram(inputCol="words", outputCol="bigrams")
dfTestBi = bigram.transform(dfTestPre).cache()
finalDfSelect = dfTestBi.map(partial(vectorizeBi,dico=dictSel_broad.value)).toDF(['selectedFeatures','label']).cache()

tt = time() - t0
print "Done in {} second".format(round(tt,3))


# In[14]:

print "Classifying test data"
Code Example #2
File: main.py Project: GuillaumeCarbajal/AdvBigData
dictionary={}
for i,word in enumerate(dict):	#'dict' is the word list built earlier in main.py; note it shadows the builtin
	dictionary[word]=i
#we need the dictionary to be available AS A WHOLE throughout the cluster
dict_broad=sc.broadcast(dictionary)
#build labelled Points from data
data_class=zip(data,Y)#if a=[1,2,3] & b=['a','b','c'] then zip(a,b)=[(1,'a'),(2, 'b'), (3, 'c')]
dcRDD=sc.parallelize(data_class,numSlices=16)
#get the labelled points
labeledRDD=dcRDD.map(partial(createBinaryLabeledPoint,dictionary=dict_broad.value))
#Train NaiveBayes
model=NaiveBayes.train(labeledRDD)
#broadcast the model
mb=sc.broadcast(model)

test,names=lf.loadUknown('./data/test')
name_text=zip(names,test)
#for each doc :(name,text):
#apply the model on the vector representation of the text
#return the name and the class
predictions=sc.parallelize(name_text).map(partial(Predict,dictionary=dict_broad.value,model=mb.value)).collect()

output=open('./classifications.txt','w')
for x in predictions:
	output.write('%s\t%d\n'%x)
output.close()
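
For reference, the helpers createBinaryLabeledPoint and Predict are defined elsewhere in main.py; a minimal sketch of what they might look like, assuming a binary bag-of-words SparseVector representation and whitespace tokenization:

from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint

def createBinaryLabeledPoint(doc_class, dictionary):
	#doc_class is a (text, label) pair; flag every dictionary word present in the text
	vector_dict = {}
	for w in doc_class[0].split(' '):
		if w in dictionary:
			vector_dict[dictionary[w]] = 1
	return LabeledPoint(doc_class[1], SparseVector(len(dictionary), vector_dict))

def Predict(name_text, dictionary, model):
	#vectorize one (name, text) pair the same way and return (name, predicted class)
	vector_dict = {}
	for w in name_text[1].split(' '):
		if w in dictionary:
			vector_dict[dictionary[w]] = 1
	return (name_text[0], model.predict(SparseVector(len(dictionary), vector_dict)))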




Code Example #3

# 5) GradientBoost
model = GradientBoostedTrees.trainClassifier(tmpLB, categoricalFeaturesInfo={},
                                                 numIterations=30, maxDepth=4)
                                                 
##### TODO
# This could be done in parallel to test the different models (see the sketch below)


##### TODO
# Perhaps combine ensembles, such as AdaBoost & GradientBoost
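
A minimal sketch toward the TODOs above: train a second MLlib ensemble on the same labelled RDD and compare training errors (trained sequentially here; MLlib has no AdaBoost, so RandomForest stands in, and numClasses=2 assumes the binary sentiment task):

from pyspark.mllib.tree import RandomForest, GradientBoostedTrees

candidates = {
	'gbt': GradientBoostedTrees.trainClassifier(tmpLB, categoricalFeaturesInfo={}, numIterations=30, maxDepth=4),
	'rf': RandomForest.trainClassifier(tmpLB, numClasses=2, categoricalFeaturesInfo={}, numTrees=30, maxDepth=4),
}
#compare training error to pick a model (a held-out validation split would be better)
for cname, cmodel in candidates.items():
	preds = cmodel.predict(tmpLB.map(lambda p: p.features))
	err = tmpLB.map(lambda p: p.label).zip(preds).filter(lambda (l, p): l != p).count() / float(tmpLB.count())
	print "%s training error: %f" % (cname, err)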


print "loading unlabeled data"
test,names=lf.loadUknown(testF) #load the unlabelled data: test holds the text of each document, names the respective file names
namesb=sc.broadcast(names) # broadcast the file names as we will need them for predictions

md=sc.broadcast(m) # broadcast the fitted model of the vectorizer
datadt=sc.broadcast(test) # broadcast the unlabeled data so that we may call the vectorizer in the same manner

#apply the vectorization on a worker
print "transforming unlabelled data"
test_data=sc.parallelize(ex,numSlices=16).filter(lambda x: x!=0).map(partial(computeTest, model=md,data=datadt)).collect()


datadt.unpersist()
print "convert data to non-compressed vector and predict the class"
#Steps:
#distribute the coordinate-format tf-idf entries of the transformed unlabelled data
#organize the coordinates by row