Example 1
import time
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# `data`, `labels`, `sliding_window`, `idfs`, and `createGraphFeatures` are assumed to be
# defined elsewhere in the project.
num_documents = len(data)
print("number of documents %s" % num_documents)




tfidf_vect = TfidfVectorizer(binary=False, analyzer="word", lowercase=True, norm=None)
features = tfidf_vect.fit_transform(data)
unique_words = list(set(tfidf_vect.vocabulary_.keys()))
print("Unique words:"+str(len(unique_words)))


# TW-IDF features on the training data
start = time.time()
features, idfs_learned, nodes= createGraphFeatures(num_documents,data,tfidf_vect.vocabulary_,sliding_window,True,idfs)
end = time.time()
print "it took %d" %start-end
print("Total time to build features:\t"+str(end - start))
data_train, data_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.4, random_state=42)
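
# `createGraphFeatures` is project-specific code that is not shown on this page. Purely as
# an illustration of the general idea (an assumption, not the actual implementation), a
# sliding-window graph-of-words weight for the terms of one document could be computed
# with networkx along these lines:
import networkx as nx

def graph_of_words_weights(tokens, window=3):
    """Weight each term by its degree centrality in a sliding-window co-occurrence graph."""
    G = nx.Graph()
    G.add_nodes_from(tokens)
    for i, word in enumerate(tokens):
        # link the word to every other word that falls inside the sliding window
        for other in tokens[i + 1:i + window]:
            if other != word:
                G.add_edge(word, other)
    return nx.degree_centrality(G)

# e.g. graph_of_words_weights("graph of words representation of text".split(), window=3)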

model_names = []
# Placeholder column of zeros; each model's predictions are appended as an extra column below.
labels_predicted = np.expand_dims(np.zeros(len(labels_test)), axis=1)


from models import naiveBayes
model_names.append("MultinomialNB")
prediction = np.expand_dims(naiveBayes(data_train, labels_train, data_test, labels_test, show_infos=True), axis=1)
labels_predicted = np.append(labels_predicted, prediction, axis=1)

from models import svc
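
# The `naiveBayes` (and `svc`) helpers imported from `models` belong to this project and are
# not shown on this page. A minimal sketch of what such a wrapper might look like, assuming
# it fits a classifier, optionally prints its accuracy, and returns the predictions:
def naiveBayes(data_train, labels_train, data_test, labels_test, show_infos=False):
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.metrics import accuracy_score

    clf = MultinomialNB()
    clf.fit(data_train, labels_train)
    predicted = clf.predict(data_test)
    if show_infos:
        print("MultinomialNB accuracy: %.4f" % accuracy_score(labels_test, predicted))
    return predicted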
Example 2
            #     features = np.loadtxt("reuters_gow_train.txt")
            #     print("\t Done!")

            myfile = open("results_"+kcore_par+".txt","w")

            for kcore_par_int in range(1,2):

                idfs = {}
                icws = {}
                dGcol_nodes = {}
                max_core_col = []
                max_core_feat = []
                feature_reduction = 0.0
                avglen = 0.0

                (features, idfs_learned, icws_learned, collection_count_nodes, collection_count_edges,
                 dGcol_nodes, max_core_col, feature_reduction, max_core_feat, avglen) = createGraphFeatures(
                    num_documents, clean_train_documents, unique_words, bigrams, sliding_window, b,
                    idf_par, centrality_par, centrality_col_par, True, idfs, icws, kcore_par, dGcol_nodes,
                    max_core_col, kcore_par_int, max_core_feat, feature_reduction, avglen)

                print "Training the classifier..."
                start = time.time()

                # Initialize a Random Forest classifier with 100 trees
                #clf = RandomForestClassifier(n_estimators = 100) 
                # clf = svm.SVC(kernel="linear",probability=True)
                clf = svm.LinearSVC(loss="hinge")
                #clf = AdaBoostClassifier(n_estimators=100)

                X = features
                rowsX,colsX = X.shape
                Y = train['class']

                classLabels = np.unique(Y)  # the distinct class labels in the dataset
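
                # The snippet ends here. As an illustrative continuation only (not the original
                # code), the classifier could be fitted and evaluated on a held-out split:
                from sklearn.model_selection import train_test_split
                from sklearn.metrics import accuracy_score

                X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.4, random_state=42)
                clf.fit(X_train, Y_train)
                predicted = clf.predict(X_test)
                end = time.time()
                print("Training and evaluation took " + str(end - start) + " seconds")
                print("Accuracy: %.4f" % accuracy_score(Y_test, predicted))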