# NOTE(review): Python 2 script fragment; avoidList, refinedfeatures, model,
# kfoldcv and NB appear to be defined in a part of the original file that is
# not visible here — confirm before running standalone.

# Size of the list of features to be excluded.
print len(avoidList)



#How many features were there before
print len(refinedfeatures)


# Keep only the features that are not on the avoid list (original order kept).
cleanFeatures = [i for i in refinedfeatures if i not in avoidList]


#How many features after filtering out avoid list
print len(cleanFeatures)

#model redefines
# Rebuild the model restricted to the surviving features.
model = model.filter(features=cleanFeatures)

#This will give k fold cross validation results; Instead of Naive Bayes you can try SVM, SLP, KNN etc

# k-fold cross-validation scores for a Naive Bayes classifier on the model.
print kfoldcv(NB, model)







#Writing the features and their weights to a csv file

# Accumulator for (feature, weight) rows; the CSV-writing code is cut off
# below — the rest of this section was lost in the scrape.
listofFeatures = []
# Ejemplo n.º 2
# NOTE(review): the `def` header of this feature extractor was destroyed in
# the scrape (the "Ejemplo n.º 2" / "0" lines are pagination junk). The
# header is reconstructed from the surviving body and from the call site
# below, which invokes it as v(review) with a single argument — confirm
# against the original source.
def v(review1):
    """Return a bag-of-lemmas count for the review text `review1`.

    Only words tagged as adjectives (JJ*), nouns (NN*), verbs (VB*) or
    exclamation tokens are kept.
    """
    # Parse with lemmatisation enabled; [0] takes the first sentence of
    # the parse tree (presumably reviews are short — TODO confirm).
    v3 = parsetree(review1, lemmata=True)[0]
    v4 = [w.lemma for w in v3 if w.tag.startswith(('JJ', 'NN', 'VB', '!'))]
    v5 = count(v4)
    return v5


# Recipe 1: load (review, rating) pairs and learn to predict "positive"
# (rating >= 3) with Naive Bayes, using v() — the partially visible feature
# extractor above — to turn each review into features.
data = csv('data/input/reviews.csv')
data = [(v(review), int(rating) >= 3) for review, rating in data]
print kfoldcv(NB, data)
# Recipe 2: same task, but via Document/Model vectors weighted by term
# frequency (TF), stopwords removed, reduced to the top 1000 features.
data = csv('data/input/reviews.csv')
data = [(review, int(rating) >= 3) for review, rating in data]
data = [
    Document(review, type=rating, stopwords=True) for review, rating in data
]
model = Model(documents=data, weight=TF)
model = model.filter(features=model.feature_selection(top=1000))
print kfoldcv(NB, model)
# gridsearch
# Recipe 3: hyper-parameter sweep — evaluate a radial-kernel SVM for each
# gamma value, printing per-combination scores (A, P, R, F, o — presumably
# accuracy, precision, recall, F1 and stdev; verify against pattern docs)
# followed by the parameter combo p, then a final 10-fold CV on the SVM.
data = csv('data/input/reviews.csv')
data = [(count(review), int(rating) >= 3) for review, rating in data]
for (A, P, R, F, o), p in gridsearch(SVM,
                                     data,
                                     kernel=[RADIAL],
                                     gamma=[0.1, 1, 10]):
    print(A, P, R, F, o), p
print kfoldcv(SVM, data, folds=10)
# genetic algorithm


def chseq(length=4, chars='abcdefghijklmnopqrstuvwxyz'):
    """Return a string of `length` random characters drawn from `chars`.

    NOTE(review): the original body was lost in the scrape — only the
    descriptive comment survived; this implementation is reconstructed
    from that comment.
    """
    # Local import keeps the function self-contained; the visible part of
    # this file has no top-level import block to extend.
    import random
    # returns a string of random characters.
    return ''.join(random.choice(chars) for _ in range(length))
        # NOTE(review): orphaned fragment — the enclosing loop header and the
        # definitions of entry_comments, all_entry_comment_text_filtered,
        # author, url, output_vector and documents were lost in the scrape;
        # this cannot run as-is.
        # Appends one sentinel token per comment, presumably so the number
        # of comments itself becomes a model feature — TODO confirm intent.
        all_entry_comment_text_filtered += len(entry_comments) * \
                                           " xxludumscrapecommentcounterxx "

    #print(all_entry_comment_text_filtered)
    # A 'document' is a bag of words from all comments for one game
    # entry (seems to work better grouping all comments), associated with
    # it's rating or classification (eg type=output_vector).
    documents.append(Document(all_entry_comment_text_filtered,
                              name="%s\t%s" % (author, url),
                              type=output_vector,
                              stopwords=True))

# Build the training vectors: either a TF-IDF model trimmed down to the
# most informative features, or the raw documents when selection is off.
vectors = []
if use_feature_selection:
    vectors = Model(documents=documents, weight=pattern.vector.TFIDF)
    # Rank features and keep only the top N before filtering the model.
    top_features = vectors.feature_selection(top=select_top_n_features)
    vectors = vectors.filter(features=top_features)
    #print(vectors.vectors)
else:
    # No selection: train directly on the document list.
    vectors = documents

# Train the configured classifier on the prepared vectors.
# NOTE(review): the body of this `if` may continue past the end of the
# visible chunk — do not assume it ends at the commented kfoldcv line.
if options["train"]:
    if classifier_type == "SVM":
        # SVM takes explicit hyper-parameters from the (out-of-view) config.
        classifier = SVM(train=vectors,
                         type=svm_type,
                         kernel=svm_kernel)
    else:
        # Any other classifier name (e.g. "NB", "KNN") is resolved
        # dynamically from pattern.vector and trained the same way.
        classifier = getattr(pattern.vector, classifier_type)(train=vectors)

    print("Classes: " + repr(classifier.classes))

    #performance = kfoldcv(NB, vectors, folds=n_fold)