import time
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

num_documents = len(data)
print("Number of documents: %s" % num_documents)

tfidf_vect = TfidfVectorizer(binary=False, analyzer="word", lowercase=True, norm=None)
features = tfidf_vect.fit_transform(data)
unique_words = list(set(tfidf_vect.vocabulary_.keys()))
print("Unique words: " + str(len(unique_words)))

# tw-idf features on the training data
idfs = {}  # empty dict: the idf weights are learned from the data
start = time.time()
features, idfs_learned, nodes = createGraphFeatures(num_documents, data, tfidf_vect.vocabulary_, sliding_window, True, idfs)
end = time.time()
print("Total time to build features:\t" + str(end - start))

data_train, data_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.4, random_state=42)

model_names = []
labels_predicted = np.expand_dims(np.zeros(len(labels_test)), axis=1)

from models import naiveBayes
model_names.append("MultinomialNB")
prediction = np.expand_dims(naiveBayes(data_train, labels_train, data_test, labels_test, show_infos=True), axis=1)
labels_predicted = np.append(labels_predicted, prediction, axis=1)

from models import svc
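
# The `models` module is not shown in this snippet. The sketch below is a
# guess at what its naiveBayes helper might look like -- assuming it wraps
# scikit-learn's MultinomialNB and returns the test-set predictions, with
# show_infos toggling an accuracy printout. naive_bayes_sketch is a
# hypothetical name; the project's real implementation may differ.
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

def naive_bayes_sketch(data_train, labels_train, data_test, labels_test, show_infos=False):
    # Fit a multinomial Naive Bayes model on the training features.
    clf = MultinomialNB()
    clf.fit(data_train, labels_train)
    # Predict on the held-out split and optionally report accuracy.
    predicted = clf.predict(data_test)
    if show_infos:
        print("MultinomialNB accuracy: %.4f" % accuracy_score(labels_test, predicted))
    return predicted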
from sklearn import svm

# features = np.loadtxt("reuters_gow_train.txt")

myfile = open("results_" + kcore_par + ".txt", "w")

for kcore_par_int in range(1, 2):
    idfs = {}
    icws = {}
    dGcol_nodes = {}
    max_core_col = []
    max_core_feat = []
    feature_reduction = 0.0
    avglen = 0.0

    features, idfs_learned, icws_learned, collection_count_nodes, collection_count_edges, \
        dGcol_nodes, max_core_col, feature_reduction, max_core_feat, avglen = createGraphFeatures(
            num_documents, clean_train_documents, unique_words, bigrams, sliding_window, b,
            idf_par, centrality_par, centrality_col_par, True, idfs, icws, kcore_par,
            dGcol_nodes, max_core_col, kcore_par_int, max_core_feat, feature_reduction, avglen)

    print("Training the classifier...")
    start = time.time()

    # Linear SVM with hinge loss; alternative classifiers kept for reference:
    # clf = RandomForestClassifier(n_estimators=100)  # Random Forest with 100 trees
    # clf = svm.SVC(kernel="linear", probability=True)
    # clf = AdaBoostClassifier(n_estimators=100)
    clf = svm.LinearSVC(loss="hinge")

    X = features
    rowsX, colsX = X.shape
    Y = train['class']
    classLabels = np.unique(Y)  # distinct class labels in the dataset
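
    # The script is truncated at this point. A minimal sketch of how the
    # classifier above might be trained and scored -- assuming a 60/40
    # train/test split of the graph features, mirroring the split used
    # earlier in this file; the accuracy metric and the results-file
    # format are illustrative assumptions, not the original code:
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score

    X_tr, X_te, y_tr, y_te = train_test_split(X, Y, test_size=0.4, random_state=42)
    clf.fit(X_tr, y_tr)
    print("Training took %.2f s" % (time.time() - start))
    acc = accuracy_score(y_te, clf.predict(X_te))
    print("LinearSVC accuracy (k-core %d): %.4f" % (kcore_par_int, acc))
    myfile.write("kcore=%d\taccuracy=%.4f\n" % (kcore_par_int, acc))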