# --- Feature filtering + k-fold cross-validation (pattern.vector) ---
# Restored from a collapsed one-line fragment; print statements normalized
# to Python 3 print() calls.  Assumes avoidList, refinedfeatures and model
# are defined earlier in the file (not visible in this chunk).

# How many features were in the avoid list
print(len(avoidList))
# How many features were there before
print(len(refinedfeatures))
# Drop every feature that appears in the avoid list.
# Build a set once so membership tests are O(1) instead of O(n) per feature.
_avoid = set(avoidList)
cleanFeatures = [i for i in refinedfeatures if i not in _avoid]
# How many features after filtering out avoid list
print(len(cleanFeatures))
# model redefined to use only the cleaned feature set
model = model.filter(features=cleanFeatures)
# This will give k fold cross validation results;
# instead of Naive Bayes you can try SVM, SLP, KNN etc.
print(kfoldcv(NB, model))
# Writing the features and their weights to a csv file
listofFeatures = []
v3 = parsetree(review1, lemmata=True)[0] v4 = [w.lemma for w in v3 if w.tag.startswith(('JJ', 'NN', 'VB', '!'))] v5 = count(v4) return v5 data = csv('data/input/reviews.csv') data = [(v(review), int(rating) >= 3) for review, rating in data] print kfoldcv(NB, data) data = csv('data/input/reviews.csv') data = [(review, int(rating) >= 3) for review, rating in data] data = [ Document(review, type=rating, stopwords=True) for review, rating in data ] model = Model(documents=data, weight=TF) model = model.filter(features=model.feature_selection(top=1000)) print kfoldcv(NB, model) # gridsearch data = csv('data/input/reviews.csv') data = [(count(review), int(rating) >= 3) for review, rating in data] for (A, P, R, F, o), p in gridsearch(SVM, data, kernel=[RADIAL], gamma=[0.1, 1, 10]): print(A, P, R, F, o), p print kfoldcv(SVM, data, folds=10) # genetic algorithm def chseq(length=4, chars='abcdefghijklmnopqrstuvwxyz'): # returns a string of random characters.
# --- Build documents from scraped comments and train a classifier ---
# NOTE(review): the original indentation was lost in this collapsed chunk.
# The first two statements read per-entry variables (entry_comments, author,
# url), so they almost certainly sat inside an enclosing per-entry loop whose
# "for" header is not visible here — confirm against the original script.

# Append one sentinel token per comment so comment count survives as a
# feature after the text is bagged.
all_entry_comment_text_filtered += len(entry_comments) * \
    " xxludumscrapecommentcounterxx "
#print(all_entry_comment_text_filtered)

# A 'document' is a bag of words from all comments for one game
# entry (seems to work better grouping all comments), associated with
# its rating or classification (eg type=output_vector).
documents.append(Document(all_entry_comment_text_filtered,
                          name="%s\t%s" % (author, url),
                          type=output_vector,
                          stopwords=True))

vectors = []
if use_feature_selection:
    # TF-IDF model reduced to the top-N most informative features.
    vectors = Model(documents=documents, weight=pattern.vector.TFIDF)
    vectors = vectors.filter(
        features=vectors.feature_selection(top=select_top_n_features))
    #print(vectors.vectors)
else:
    vectors = documents

if options["train"]:
    if classifier_type == "SVM":
        classifier = SVM(train=vectors, type=svm_type, kernel=svm_kernel)
    else:
        # Look the classifier class up by name (e.g. "NB", "KNN", "SLP").
        classifier = getattr(pattern.vector, classifier_type)(train=vectors)
    print("Classes: " + repr(classifier.classes))
    #performance = kfoldcv(NB, vectors, folds=n_fold)