def __init__(self, trainset=None):
        """Create a linear SVM classifier and train it on ``trainset``.

        ``trainset`` is passed on to ``self.train``. When it is omitted a
        fresh empty list is used, so no list object is shared between
        instances through the default argument.
        """
        if trainset is None:
            trainset = []

        # initializes a SVM classifier
        self.classifier = SVM(type=CLASSIFICATION, kernel=LINEAR)

        self.bag_of_words = []
        self.classifier.probability = True
        self.train(self.classifier, trainset)
Ejemplo n.º 2
0
def main():
    """Train the hybrid classifier, then serve polarity predictions over TCP.

    The server socket is bound to port 9999 on all interfaces. The
    dictionaries and the training data (path read from ``sys.argv[1]``)
    are then loaded and a linear SVM is trained. Afterwards every chunk of
    data received from a client is passed to ``hybrid_classify`` and the
    result is sent back followed by CRLF. Entering a line on standard
    input stops the server.
    """
    global modifiers, booster_map, negator_map, bag_of_words, svm_classifier
    # Creating the socket
    host = ''
    port = 9999
    backlog = 5
    size = 1024
    server = socket.socket(socket.AF_INET, type=socket.SOCK_STREAM, proto=0)
    # Everything select() should watch: the listening socket, stdin and,
    # later on, every accepted client connection.
    watched = [server, sys.stdin]
    try:
        server.bind((host, port))
        server.listen(backlog)

        # Loading the dictionaries and training data
        get_pol_map()
        booster_map = get_booster_map()
        negator_map = get_negator_map()
        modifiers = get_mod_map(booster_map, negator_map)
        bag_of_words = []
        train_data = load_data_from_file(sys.argv[1])

        # Initializing and training the classifier
        svm_classifier = SVM(type=CLASSIFICATION, kernel=LINEAR)
        train_svm(svm_classifier, train_data)
        print("Training Completed......")

        running = True
        while running:
            inputready, _, _ = select.select(watched, [], [])

            for s in inputready:

                if s is server:
                    # handle the server socket: a new client is connecting
                    client, _ = server.accept()
                    watched.append(client)

                elif s is sys.stdin:
                    # handle standard input: any entered line stops the server
                    sys.stdin.readline()
                    running = False

                else:
                    # handle all other sockets (connected clients)
                    try:
                        data = s.recv(size)
                        if data:
                            # sendall() retries until the whole reply has
                            # been written; send() may write only part of it.
                            s.sendall(
                                hybrid_classify(str(data).strip("\n")) + '\r\n')
                    except socket.error:
                        # A reset or broken connection is treated like a
                        # normal disconnect instead of killing the server.
                        data = ''
                    if not data:
                        s.close()
                        watched.remove(s)
    finally:
        # Release the listening socket and any client still connected,
        # even when start-up or the serving loop raised.
        for sock in watched:
            if sock is not sys.stdin:
                sock.close()
    def __init__(self, trainset=None):
        """Create a linear SVM classifier and train it on ``trainset``.

        ``trainset`` is passed on to ``self.train``. When it is omitted a
        fresh empty list is used, so no list object is shared between
        instances through the default argument.
        """
        if trainset is None:
            trainset = []

        # initializes a SVM classifier
        self.classifier = SVM(type=CLASSIFICATION, kernel=LINEAR)

        self.bag_of_words = []
        self.classifier.probability = True
        self.train(self.classifier, trainset)
Ejemplo n.º 4
0
def main():
    """Train the hybrid classifier, then answer tweets with their polarity.

    A TCP server socket is opened on port 9999 so that clients can send a
    tweet and receive the resulting polarity. The dictionaries and the
    training data (path read from ``sys.argv[1]``) are loaded and a linear
    SVM is trained before the serving loop starts. Entering a line on the
    console stops the server.
    """
    global modifiers, booster_map, negator_map, bag_of_words, svm_classifier
    # Creating the socket so that it can receive a tweet and send the
    # resulting polarity
    host = ''
    port = 9999
    backlog = 5
    size = 1024
    server = socket.socket(socket.AF_INET, type=socket.SOCK_STREAM, proto=0)
    # Sources watched by select(): the listening socket, the console and
    # every client accepted later on.
    sources = [server, sys.stdin]
    try:
        server.bind((host, port))
        server.listen(backlog)

        # Loading the dictionaries and training data
        get_pol_map()
        booster_map = get_booster_map()
        negator_map = get_negator_map()
        modifiers = get_mod_map(booster_map, negator_map)
        bag_of_words = []
        train_data = load_data_from_file(sys.argv[1])

        # Initializing and training the classifier
        svm_classifier = SVM(type=CLASSIFICATION, kernel=LINEAR)
        train_svm(svm_classifier, train_data)

        print("Training Completed......")

        # Loop that keeps the classifier running
        running = True
        while running:
            inputready, _, _ = select.select(sources, [], [])
            # An input is detected
            for s in inputready:

                if s is server:
                    # A client is sending a connection request
                    client, _ = server.accept()
                    sources.append(client)

                elif s is sys.stdin:
                    # Text entered on the console stops the server
                    sys.stdin.readline()
                    running = False

                else:
                    # All other sockets are clients sending tweets
                    try:
                        data = s.recv(size)
                        if data:
                            # Classify the tweet and send the resulting
                            # polarity; sendall() writes the whole reply,
                            # whereas send() may stop part-way.
                            s.sendall(
                                hybrid_classify(str(data).strip("\n")) + '\r\n')
                    except socket.error:
                        # A reset or broken connection counts as a
                        # disconnect rather than crashing the server.
                        data = ''
                    if not data:
                        # The client disconnected: close its socket and
                        # stop watching it
                        s.close()
                        sources.remove(s)
    finally:
        # Always release the listening socket and any remaining clients.
        for sock in sources:
            if sock is not sys.stdin:
                sock.close()
Ejemplo n.º 5
0
class Classifications():
    """Pre-trained category, rating and sentiment classifiers.

    The four models are loaded from the ``classifiers`` directory next to
    this module when the class body is executed, and are shared by the
    static methods below.
    """

    #static variables
    _category_path = os.path.join(os.path.dirname(__file__),
                                  "classifiers/category.slp")
    _rating_path = os.path.join(os.path.dirname(__file__),
                                "classifiers/rating.slp")
    _rating_nlp_path = os.path.join(os.path.dirname(__file__),
                                    "classifiers/rating_nlp.svm")
    _sentiment_path = os.path.join(os.path.dirname(__file__),
                                   "classifiers/sentiment.nb")

    _category = SLP.load(_category_path)
    _rating = SLP.load(_rating_path)
    _rating_nlp = SVM.load(_rating_nlp_path)
    _sentiment = NB.load(_sentiment_path)

    # Labels of the sentiment model that count as a positive review.
    _positive_labels = ('True', '3.0', '4.0', '5.0')

    @staticmethod
    def selectWords(review):
        '''
        Lemmatize a review and count its adjectives, nouns, verbs and
        exclamation marks.

        Returns the result of count() over the selected lemmata
        (a dictionary of (word, count)).
        '''
        # NOTE(review): [0] keeps only the first element of the parse tree
        # (presumably the first sentence) -- confirm this matches how the
        # models were trained before changing it.
        first = parsetree(review, lemmata=True)[0]  #lemmatize the review
        #select adjectives (JJ), nouns (NN), verbs (VB) and exclamation marks
        lemmata = [
            w.lemma for w in first
            if w.tag.startswith(('JJ', 'NN', 'VB', '!'))
        ]
        return count(lemmata)

    @staticmethod
    def classify(text):
        '''
        Run all four classifiers on ``text``.

        Returns a dictionary with the keys 'text', 'rate', 'category',
        'rate_nlp' and 'positivity'; 'positivity' is True when the most
        probable sentiment label is one of the positive labels.
        '''
        predicted_category = Classifications._category.classify(Document(text),
                                                                discrete=True)
        predicted_rate = Classifications._rating.classify(Document(text),
                                                          discrete=True)
        # Parse the text once and reuse the word counts for both models.
        words = Classifications.selectWords(text)
        predicted_rate_nlp = Classifications._rating_nlp.classify(
            words, discrete=True)
        predicted_sentiment_dict = Classifications._sentiment.classify(
            words, discrete=False)

        # Pick the label with the highest score. The previous code indexed
        # [1] into the descending sort, which selected the runner-up label
        # and raised IndexError when only one label was returned.
        if predicted_sentiment_dict:
            top_label = max(predicted_sentiment_dict.items(),
                            key=operator.itemgetter(1))[0]
            predicted_sentiment = (
                str(top_label) in Classifications._positive_labels)
        else:
            predicted_sentiment = False

        return {
            'text': text,
            'rate': predicted_rate,
            'category': predicted_category,
            'rate_nlp': predicted_rate_nlp,
            'positivity': predicted_sentiment
        }
Ejemplo n.º 6
0
def normal_test(data, type):
    """Train several classifiers on the first half of ``data`` and test them.

    The first half of ``data`` is used for training and an equally sized
    slice right after it for testing. For every classifier the accuracy,
    precision, recall, F1-score and the time spent in ``test`` are printed.
    ``type`` is only used as a label in the printed messages.
    """
    print('----------------------------------------------------')
    print('TEST FUNCTION STARTED FOR ' + type + '!')
    total_data_size = len(data)
    training_size = int(round(total_data_size/2))
    test_size = training_size
    print('Total Size: ' + str(total_data_size))
    print('Training Size: ' + str(training_size))
    print('Test Size: ' + str(test_size))

    print('Training Started for ' + type + '!')
    train_part = data[:training_size]
    classification_methods = {
      # each entry is trained right here, while the dictionary is built
      'NB' :  NB(train=train_part, baseline=MAJORITY, method=MULTINOMIAL),
      'KNN2' : KNN(train=train_part, baseline=MAJORITY, k=2, distance=COSINE),
      'KNN3' : KNN(train=train_part, baseline=MAJORITY, k=3, distance=COSINE),
      'KNN4' : KNN(train=train_part, baseline=MAJORITY, k=4, distance=COSINE),
      'KNN5' : KNN(train=train_part, baseline=MAJORITY, k=5, distance=COSINE),
      'KNN6' : KNN(train=train_part, baseline=MAJORITY, k=6, distance=COSINE),
      'KNN7' : KNN(train=train_part, baseline=MAJORITY, k=7, distance=COSINE),
      'KNN8' : KNN(train=train_part, baseline=MAJORITY, k=8, distance=COSINE),
      'KNN9' : KNN(train=train_part, baseline=MAJORITY, k=9, distance=COSINE),
      'KNN10' : KNN(train=train_part, baseline=MAJORITY, k=10, distance=COSINE),
      'SLP1' : SLP(train=train_part, baseline=MAJORITY, iterations=1),
      'SLP2' : SLP(train=train_part, baseline=MAJORITY, iterations=2),
      'SLP3' : SLP(train=train_part, baseline=MAJORITY, iterations=3),
      'SVM' : SVM(train=train_part, type=CLASSIFICATION, kernel=POLYNOMIAL),
    }

    print('Normal Testing Started!')
    test_part = data[training_size:training_size+test_size]
    for label, method in classification_methods.items():
        # measure only the time spent inside test()
        started = timeit.default_timer()
        accuracy, precision, recall, f1 = method.test(test_part)
        elapsed = timeit.default_timer() - started
        print('*' + label + '*')
        print('Accuracy: ' + str(accuracy))
        print('Precision: ' + str(precision))
        print('Recall: ' + str(recall))
        print('F1-score: ' + str(f1))
        print('Time: ' + str(elapsed))
        print('')
Ejemplo n.º 7
0
from pattern.db     import Datasheet
from pattern.en     import tag, predicative
from pattern.vector import SVM, KNN, NB, count, shuffled
import os
# import argparse

classifier = SVM()


print "loading data..."
data = os.path.join(os.path.dirname(__file__), "polarity2.csv")
data = Datasheet.load(data)
data = shuffled(data)

def instance(review):                     # "Great book!"
    """Turn a review string into a word-count feature dictionary.

    The review is tagged with ``tag``; words whose part-of-speech tag is in
    ``kept_tags`` and literal exclamation marks are kept, passed through
    ``predicative`` and finally counted with ``count``.
    """
    kept_tags = ("JJ", "RB", "VB", "VBZ", "NN", "NNS", "NNP", "NNPS")
    tagged = tag(review)                  # [("Great", "JJ"), ("book", "NN"), ("!", "!")]
    # `word in ("!")` was a substring test against the string "!" (the
    # parentheses do not make a tuple); compare for equality instead.
    words = [word for (word, pos) in tagged if pos in kept_tags or word == "!"]
    words = [predicative(word) for word in words]
    return count(words)                   # dictionary of word -> count


# parser = argparse.ArgumentParser(description='This trains polarity data and then tests it on Reuters news!')
# parser.add_argument('-f', '--savefile', dest='savefile', default='checkpoint.p', help='file to save to: must have .p extension')
# args = parser.parse_args()
print "training..."
for score, review in data[:1000]:
    classifier.train(instance(review), type=int(score) > 0)
classifier.save("sentiment.p")

print "testing..."
class MachineLearningClassifier(object):
    """Linear SVM tweet classifier.

    Features are the presence of the most frequent corpus words, of a fixed
    set of part-of-speech tag prefixes, and of a negation word.
    """

    def __init__(self, trainset=None):
        """Create the SVM and train it on ``trainset``.

        ``trainset`` has the format described on ``train``; when omitted an
        empty training set is used (no shared mutable default).
        """
        if trainset is None:
            trainset = []

        # initializes a SVM classifier
        self.classifier = SVM(type=CLASSIFICATION, kernel=LINEAR)

        self.bag_of_words = []
        self.classifier.probability = True
        self.train(self.classifier, trainset)

    # Extract features for ML process
    def extract_features(self, tweet_message):
        """Build the feature dictionary for one pre-processed tweet.

        ``tweet_message`` is an iterable of ``(word, tag)`` pairs. Returns
        a dict of ``'has_<x>'`` flags, or ``None`` (after printing a
        warning) when the bag-of-words is empty.
        """
        if not self.bag_of_words:
            # This used to call the undefined name `printf`, which raised
            # NameError instead of reporting the problem.
            print('Bag-of-Words empty!')
            return None

        # Drop stopwords and pure numbers once, then derive both views.
        kept = [(word, tag) for word, tag in tweet_message
                if word not in stopwords and not word.isdigit()]
        # Sets make the membership tests below constant-time.
        tweet_words = set(word.lower() for word, tag in kept)
        tweet_tags = set(tag[:2] for word, tag in kept)

        feature_set = {}

        # 1st set of features: bag-of-words
        for word in self.bag_of_words:
            feature_set['has_' + word] = (word in tweet_words)

        # 2nd set of features: the tags present in the message
        # NOTE(review): tags are cut to two characters above, so 'VG' can
        # only match a tag starting with "VG" -- possibly 'VB' was meant.
        # Confirm before changing; it alters the trained feature space.
        for tag in ['NN', 'VG', 'CD', 'JJ', 'CC', 'RB']:
            feature_set['has_' + tag] = (tag in tweet_tags)

        # 3rd feature: negation is present? (only set when one is found)
        negators = set([
            'not', 'none', 'nobody', 'never', 'nothing', 'lack', 't', 'n\'t',
            'dont', 'no'
        ])
        if negators.intersection(tweet_words):
            feature_set['has_negator'] = True

        return feature_set

    # train the classifier
    # Tweets argument must be a list of dictionaries. Each dictionary must
    # have the keys ['MESSAGE'] and ['SENTIMENT'] with the message string and
    # the classification class, respectively.
    def train(self, classifier, tweets):
        """Build the bag-of-words from ``tweets`` and train ``classifier``."""
        # build the bag-of-words list using the 1k most frequent words in
        # the corpus
        bag_of_words = {}
        for tweet in tweets:
            words = [
                w.lower() for w, t in pre_process(tweet['MESSAGE'])
                if w not in stopwords and not w.isdigit()
            ]
            for word in words:
                bag_of_words[word] = bag_of_words.get(word, 0) + 1

        # get the 1000 most frequent words
        self.bag_of_words = [
            w for w, freq in sorted(
                bag_of_words.items(), key=itemgetter(1), reverse=True)[:1000]
        ]

        # perform the training step
        for tweet in tweets:
            classifier.train(self.extract_features(
                pre_process(tweet['MESSAGE'])),
                             type=tweet['SENTIMENT'])

    # classify a new message. Return the scores (probabilities) for each
    # classification class
    def classify(self, tweet_message):
        """Return the classifier's result for a pre-processed tweet."""
        scores = self.classifier.classify(self.extract_features(tweet_message))
        return scores
Ejemplo n.º 9
0
# The separation is going to be a rough approximation, obviously.
#
# Now imagine the following game:
# - The room is filled with static, floating red and blue marbles.
# - It is your task to separate them by inserting a glass panel between them.
#
# The 3-D space gives a lot more options. Adding more dimensions adds even more options.
# This is roughly what a SVM does, using kernel functions to push the separation
# to a higher dimension.

# Pattern includes precompiled C binaries of libsvm.
# If these do not work on your system you have to compile libsvm manually.
# You can also change the "SVM()" statement below with "KNN()",
# so you can still follow the rest of the example.

classifier = SVM()

# We'll build a classifier to predict sentiment in Dutch movie reviews.
# For example, "geweldige film!" (great movie) indicates a positive sentiment.
# The CSV file at pattern/test/corpora/polarity-nl-bol.com.csv
# contains 1,500 positive and 1,500 negative reviews.

# The pattern.vector module has a shuffled() function
# which we use to randomly arrange the reviews in the list:

print "loading data..."
data = Datasheet.load(
    os.path.join("..", "..", "test", "corpora", "polarity-nl-bol.com.csv"))
data = shuffled(data)

# We do not necessarily need Document objects as in the previous examples.
Ejemplo n.º 10
0
 def classify(self, name):
     """Classify ``name``: convert it with ``self.vector`` and delegate to ``SVM.classify``."""
     return SVM.classify(self, self.vector(name))
Ejemplo n.º 11
0
from pattern.vector import SVM, KNN, NB, count, shuffled
from pattern.en import tag, predicative


classifier = SVM()

classifier = SVM.load("sentiment.p")


def instance(review):  # "Great book!"
    """Turn a review string into a word-count feature dictionary.

    The review is tagged with ``tag``; words whose part-of-speech tag is in
    ``kept_tags`` and literal exclamation marks are kept, passed through
    ``predicative`` and finally counted with ``count``.
    """
    kept_tags = ("JJ", "RB", "VB", "VBZ", "NN", "NNS", "NNP", "NNPS")
    tagged = tag(review)  # [("Great", "JJ"), ("book", "NN"), ("!", "!")]
    # `word in ("!")` was a substring test against the string "!" (the
    # parentheses do not make a tuple); compare for equality instead.
    words = [word for (word, pos) in tagged if pos in kept_tags or word == "!"]
    words = [predicative(word) for word in words]
    return count(words)  # dictionary of word -> count


score = classifier.classify(instance("you little bitch"))

print(score)
Ejemplo n.º 12
0
 def _learner(self):
     """Return a new ``SVM`` instance constructed with ``extension='libsvm'``."""
     return SVM(extension='libsvm')
Ejemplo n.º 13
0
from pattern.web import Twitter
from pattern.text.en import tag
from pattern.vector import KNN, count, NaiveBayes, SVM
import os, random
import file_io as fio
corp_dir = 'essays/original'
twitter, knn, nbayes, svm = Twitter(), KNN(), NaiveBayes(), SVM()
from nltk.corpus import stopwords
import lsa
cachedStopWords = stopwords.words("english")
testSet = []


def naive():
    trainingSet = []
    l = lsa.getMod()
    dirs = [x[0] for x in os.walk(os.path.abspath(corp_dir))]
    for dir in dirs:
        label = 0
        if 'low' in dir:
            label = -1
        elif 'high' in dir:
            label = 1
        tfiles = []
        tfiles = fio.getTopLevelFiles(dir, extension='txt')
        train_smpl = []
        if len(tfiles) > 0:
            train_smpl = [
                tfiles[i] for i in random.sample(xrange(len(tfiles)), 13)
            ]
        for file in tfiles:
Ejemplo n.º 14
0
from pattern.vector import SVM
#from Jseg import jieba
from os.path import realpath, dirname, join

CUR_PATH = dirname(realpath(__file__))
sentipol_cls = SVM.load(join(CUR_PATH, 'svm_mod.gpk'))
execfile(join(CUR_PATH, 'Sentipol.py'))


def sentipol_tmp(text):
    """Segment ``text`` and return ``(polarity, details)``.

    The text is segmented with ``jieba.seg`` and split into tokens, which
    are classified by ``sentipol_cls`` (polarity) and passed to
    ``sentipol`` (details).
    """
    # Imported here rather than at module level, as in the original.
    from Jseg import jieba
    tokens = jieba.seg(text).nopos().split()
    polarity = sentipol_cls.classify(tokens)
    return polarity, sentipol(tokens)
Ejemplo n.º 15
0
data = Datasheet.load("books-fr.test.csv")
data.columns[1].map(lambda v: v == "True")

# Machine learning broadly uses two statistical techniques:
# - unsupervised machine learning (= clustering), and
# - supervised machine learning (= classification).
# Supervised machine learning requires human-tailored training examples.
# Human-tailored means that someone has tagged each training example with a class / label / type.
# In our case, the label is True or False (positive review or not?)
# A classifier will then attempt to predict the class for unknown (=unlabeled) examples.
# A fast and robust classification algorithm is included in Pattern: the support vector machine.
# http://www.clips.ua.ac.be/pages/pattern-vector#classification

# Training an SVM is very easy, 
# just give it strings or lists of words and a label as training material:
classifier = SVM()
for review, positive in data[:50]: # Note: 50 training examples is very little data!
    classifier.train(review, type=positive)

# The idea is that similar strings will contain similar words.
# For an unknown example, the SVM examines the words it contains,
# and looks for trained examples with similar words.
# The labels of these trained examples are then used to predict
# the label of the unknown example.
# See: Chapter 6 in "Modeling Creativity: Case Studies in Python".
print "Review:", data[51][0]
print "Positive:", data[51][1]
print "Prediction:", classifier.classify(data[51][0])
print

# We can then evaluate how well the classifier performs,
Ejemplo n.º 16
0
data = Datasheet.load("books-fr.test.csv")
data.columns[1].map(lambda v: v == "True")

# Machine learning broadly uses two statistical techniques:
# - unsupervised machine learning (= clustering), and
# - supervised machine learning (= classification).
# Supervised machine learning requires human-tailored training examples.
# Human-tailored means that someone has tagged each training example with a class / label / type.
# In our case, the label is True or False (positive review or not?)
# A classifier will then attempt to predict the class for unknown (=unlabeled) examples.
# A fast and robust classification algorithm is included in Pattern: the support vector machine.
# http://www.clips.ua.ac.be/pages/pattern-vector#classification

# Training an SVM is very easy,
# just give it strings or lists of words and a label as training material:
classifier = SVM()
for review, positive in data[:
                             50]:  # Note: 50 training examples is very little data!
    classifier.train(review, type=positive)

# The idea is that similar strings will contain similar words.
# For an unknown example, the SVM examines the words it contains,
# and looks for trained examples with similar words.
# The labels of these trained examples are then used to predict
# the label of the unknown example.
# See: Chapter 6 in "Modeling Creativity: Case Studies in Python".
print "Review:", data[51][0]
print "Positive:", data[51][1]
print "Prediction:", classifier.classify(data[51][0])
print
def main():
    global modifiers, booster_map, negator_map, bag_of_words
    get_pol_map()
    booster_map = get_booster_map()
    negator_map = get_negator_map()
    modifiers = get_mod_map(booster_map, negator_map)

    bag_of_words = []
    train_data, test_data, crowd_data = load_data_from_file(
        sys.argv[1], sys.argv[2], sys.argv[3])

    global svm_classifier
    svm_classifier = SVM(type=CLASSIFICATION, kernel=LINEAR)
    train_svm(svm_classifier, train_data)
    print "Training Completed......"
    hits = 0.0
    misses = 0.0
    counter = 0.0
    confussion = {}
    global EMO, BOW, AI
    EMO = 0
    BOW = 0
    AI = 0
    for n, tweet in enumerate(test_data):
        class1 = hybrid_classify(tweet['message'])
        counter += 1
        if (class1 == tweet['sentiment']):
            hits += 1
        else:
            misses += 1
        confussion[(class1, tweet['sentiment'])] = confussion.get(
            (class1, tweet['sentiment']), 0) + 1
    Accuracy = hits / (hits + misses)
    Recall = (hits + misses) / counter
    F1 = (2 * Accuracy * Recall) / (Accuracy + Recall)
    print ""
    print "TASS Test Results......"
    print "Accuracy: ", str(Accuracy)
    print "Recall: ", str(Recall)
    print "F1-Score: ", str(F1)
    print "Layer Summary:"
    print "Emoticon Layer: ", str(EMO)
    print "BOW Layer: ", str(BOW)
    print "SVM Layer: ", str(AI)
    print "Confussion Matrix:"
    for elem in confussion.items():
        print elem[0], "\t", str(elem[1])

    hits = 0.0
    misses = 0.0
    counter = 0.0
    EMO = 0
    BOW = 0
    AI = 0
    confussion = {}
    for n, tweet in enumerate(crowd_data):
        class1 = hybrid_classify(tweet['message'])
        counter += 1
        if class1 == tweet['sentiment']:
            hits += 1
        else:
            misses += 1
        confussion[(class1, tweet['sentiment'])] = confussion.get(
            (class1, tweet['sentiment']), 0) + 1
    Accuracy = hits / (hits + misses)
    Recall = (hits + misses) / counter
    F1 = (2 * Accuracy * Recall) / (Accuracy + Recall)
    print ""
    print "Crowd Test Results......"
    print "Accuracy: ", str(Accuracy)
    print "Recall: ", str(Recall)
    print "F1-Score: ", str(F1)
    print "Layer Summary:"
    print "Emoticon Layer: ", str(EMO)
    print "BOW Layer: ", str(BOW)
    print "SVM Layer: ", str(AI)
    print "Confussion Matrix:"
    for elem in confussion.items():
        print elem[0], "\t", str(elem[1])
Ejemplo n.º 18
0
print datadocs

#naive Bayes
#training set
nb = NB(train=datadocs[:500])
print 'nb distribution = ', nb.distribution
print 'nb confusion matrix = ', nb.confusion_matrix(datadocs[500:])
print 'nb confusion matrix for each class = ', nb.confusion_matrix(datadocs[500:])(True) # (TP, TN, FP, FN)
print 'nb features = ', nb.features
#test set
accuracy, precision, recall, f1 = nb.test(datadocs[500:])
print 'nb accuracy = ', accuracy, 'nb precision =', precision, 'nbrecall = ', \
    recall

#test SVM
testsvm = SVM(train=datadocs[:500])
print 'svm features = ', testsvm.features
saccuracy, sprecision, srecall, sf1 = testsvm.test(datadocs[500:])
print 'svm accuracy =', saccuracy


#classifier training example with test classification
nb2 = NB()
for review, rating in data:
    v = Document(review, type=int(rating))
    #print v.vector
    nb2.train(v)

print 'nb2 classes', nb2.classes
print 'test classification', nb2.classify(Document('A poor movie!'))
Ejemplo n.º 19
0
 def train(self, name, gender=None):
     """Train on ``name``: convert it with ``self.vector`` and pass it, with ``gender``, to ``SVM.train``."""
     SVM.train(self, self.vector(name), gender)
class MachineLearningClassifier(object):
    """Linear SVM tweet classifier.

    Features are the presence of the most frequent corpus words, of a fixed
    set of part-of-speech tag prefixes, and of a negation word.
    """

    def __init__(self, trainset=None):
        """Create the SVM and train it on ``trainset``.

        ``trainset`` has the format described on ``train``; when omitted an
        empty training set is used (no shared mutable default).
        """
        if trainset is None:
            trainset = []

        # initializes a SVM classifier
        self.classifier = SVM(type=CLASSIFICATION, kernel=LINEAR)

        self.bag_of_words = []
        self.classifier.probability = True
        self.train(self.classifier, trainset)


    # Extract features for ML process
    def extract_features(self, tweet_message):
        """Build the feature dictionary for one pre-processed tweet.

        ``tweet_message`` is an iterable of ``(word, tag)`` pairs. Returns
        a dict of ``'has_<x>'`` flags, or ``None`` (after printing a
        warning) when the bag-of-words is empty.
        """
        if not self.bag_of_words:
            # This used to call the undefined name `printf`, which raised
            # NameError instead of reporting the problem.
            print('Bag-of-Words empty!')
            return None

        # Drop stopwords and pure numbers once, then derive both views.
        kept = [(word, tag) for word, tag in tweet_message
                if word not in stopwords and not word.isdigit()]
        # Sets make the membership tests below constant-time.
        tweet_words = set(word.lower() for word, tag in kept)
        tweet_tags = set(tag[:2] for word, tag in kept)

        feature_set = {}

        # 1st set of features: bag-of-words
        for word in self.bag_of_words:
            feature_set['has_'+word] = (word in tweet_words)

        # 2nd set of features: the tags present in the message
        # NOTE(review): tags are cut to two characters above, so 'VG' can
        # only match a tag starting with "VG" -- possibly 'VB' was meant.
        # Confirm before changing; it alters the trained feature space.
        for tag in ['NN','VG','CD','JJ','CC','RB']:
            feature_set['has_'+tag] = (tag in tweet_tags)

        # 3rd feature: negation is present? (only set when one is found)
        negators = set(['not', 'none', 'nobody', 'never', 'nothing', 'lack', 't','n\'t','dont', 'no'])
        if negators.intersection(tweet_words):
            feature_set['has_negator'] = True

        return feature_set


    # train the classifier
    # Tweets argument must be a list of dictionaries. Each dictionary must
    # have the keys ['MESSAGE'] and ['SENTIMENT'] with the message string and
    # the classification class, respectively.
    def train(self,classifier,tweets):
        """Build the bag-of-words from ``tweets`` and train ``classifier``."""
        # build the bag-of-words list using the 1k most frequent words in
        # the corpus
        bag_of_words = {}
        for tweet in tweets:
            words = [w.lower() for w,t in pre_process(tweet['MESSAGE']) if w not in stopwords and not w.isdigit()]
            for word in words:
                bag_of_words[word] = bag_of_words.get(word,0) + 1

        # get the 1000 most frequent words
        self.bag_of_words = [w for w,freq in sorted(bag_of_words.items(),key=itemgetter(1),reverse=True)[:1000]]

        # perform the training step
        for tweet in tweets:
            classifier.train(self.extract_features(pre_process(tweet['MESSAGE'])),type=tweet['SENTIMENT'])


    # classify a new message. Return the scores (probabilities) for each
    # classification class
    def classify(self, tweet_message):
        """Return the classifier's result for a pre-processed tweet."""
        scores = self.classifier.classify(self.extract_features(tweet_message))
        return scores
Ejemplo n.º 21
0
 def classify(self, name):
     """Classify ``name``: convert it with ``self.vector`` and delegate to ``SVM.classify``."""
     return SVM.classify(self, self.vector(name))
documents = []
for line in neg_lines:
    document = Document(line,stopword=True,stemmer=PORTER,type=0)
    documents.append(document)

for line in pos_lines:
    document = Document(line,stopword=True,stemmer=PORTER,type=1)
    documents.append(document)

corpus = Corpus(documents,weight=TFIDF)
print "number of documents:", len(corpus)
print "number of words:", len(corpus.vector)
print "number of words (average):", sum(len(d.terms) for d in corpus.documents) / float(len(corpus))
print

classifier=SVM(type=CLASSIFICATION,kernel=LINEAR)
for document in corpus:
    classifier.train(document,type=document.type)
print 'Training Done'
# To test the accuracy of a classifier, Using 10-fold crossvalidation
# This yields 4 scores: Accuracy, Precision, Recall and F-score.
print 'SVM Classifier'
print  '-------------------------'
print  '(Accuracy, Precision,REcall,F-Measure)'
print SVM.test(corpus,folds=10,type=CLASSIFICATION,kernel=LINEAR)

# Testing on sample Data file in which top 10 are negative and the next 10 are positive
ft=open('test_20','r')
test_lines=ft.readlines()
for line in test_lines:
	t=(Document(line))
Ejemplo n.º 23
0
# The separation is going to be a rough approximation, obviously.
#
# Now imagine the following game:
# - The room is filled with static, floating red and blue marbles.
# - It is your task to separate them by inserting a glass panel between them.
#
# The 3-D space gives a lot more options. Adding more dimensions adds even more options.
# This is roughly what a SVM does, using kernel functions to push the separation
# to a higher dimension.

# Pattern includes precompiled C binaries of libsvm.
# If these do not work on your system you have to compile libsvm manually.
# You can also change the "SVM()" statement below with "KNN()",
# so you can still follow the rest of the example.

classifier = SVM()

# We'll build a classifier to predict sentiment in Dutch movie reviews.
# For example, "geweldige film!" (great movie) indicates a positive sentiment.
# The CSV file at pattern/test/corpora/polarity-nl-bol.com.csv
# contains 1,500 positive and 1,500 negative reviews.

# The pattern.vector module has a shuffled() function
# which we use to randomly arrange the reviews in the list:

print("loading data...")
data = os.path.join(os.path.dirname(__file__), "..", "..", "test", "corpora", "polarity-nl-bol.com.csv")
data = Datasheet.load(data)
data = shuffled(data)

# We do not necessarily need Document objects as in the previous examples.
Ejemplo n.º 24
0
 def train(self, name, gender=None):
     """Train on ``name``: convert it with ``self.vector`` and pass it, with ``gender``, to ``SVM.train``."""
     SVM.train(self, self.vector(name), gender)
Ejemplo n.º 25
0
    """ Returns a bag-of-words vector for the given string.
    """
    v = {}
    v.update(count(words(s)))
    return v


train = (("cat", "A cat has whiskers"), ("cat", "A cat says meow"),
         ("cat", "the animal was purring softly"),
         ("dog", "A dog is an animal that says woof"),
         ("dog", "Why is that dog still barking?"),
         ("dog", "He happily wagged his tail"))

# A robust, all-round classification algorithm is SVM.
# If SVM doesn't work on your machine, use SLP (= simple neural net).
classifier = SVM()
for name, s in train:
    classifier.train(v(s), type=name)

print classifier.classify(v("the animal is purring and meowing"))
print classifier.classify(v("woof!"))
print

# ------------------------------------------------------------------------------------

# Vectors can be constructed in many different ways;
# what features you include will influence how accurate the classifier is.
# For example, in the example above there is no way to match "barking" to "bark"
# (for the classifier they are different words).
# A good strategy is to use character n-grams as features:
# sequences of n successive characters (usually n=3).
Ejemplo n.º 26
0
    v = {}
    v.update(count(words(s)))
    return v
    
train = (
    ("cat", "A cat has whiskers"),
    ("cat", "A cat says meow"),
    ("cat", "the animal was purring softly"),
    ("dog", "A dog is an animal that says woof"),
    ("dog", "Why is that dog still barking?"),
    ("dog", "He happily wagged his tail")
)

# A robust, all-round classification algorithm is SVM.
# If SVM doesn't work on your machine, use SLP (= simple neural net).
classifier = SVM() 
for name, s in train:
    classifier.train(v(s), type=name)
    
print classifier.classify(v("the animal is purring and meowing"))
print classifier.classify(v("woof!"))
print

# ------------------------------------------------------------------------------------

# Vectors can be constructed in many different ways;
# what features you include will influence how accurate the classifier is.
# For example, in the example above there is no way to match "barking" to "bark"
# (for the classifier they are different words).
# A good strategy is to use character n-grams as features:
# sequences of n successive characters (usually n=3).
Ejemplo n.º 27
0
# #train for category
# slp = SLP(train=data[:len(data)], baseline=MAJORITY, iterations=3)
# slp.finalize()
# #save
# slp.save(f, True)

# print '--------------------'
# #training for rating rating_nlp
f = os.path.join(os.path.dirname(__file__), "classifiers/rating_nlp.svm")
data = []
data.extend(asDocumentReviewNLP(classification_data['musics']['reviews']))
data.extend(asDocumentReviewNLP(classification_data['movies']['reviews']))
data.extend(asDocumentReviewNLP(classification_data['games']['reviews']))
shuffle(data)
svm = SVM(train=data[:len(data)], type=CLASSIFICATION, kernel=POLYNOMIAL)
svm.finalize()
#save
svm.save(f, True)

# print '--------------------'
# #training for sentiment 
# f = os.path.join(os.path.dirname(__file__), "classifiers/sentiment.nb")
# data = []
# data.extend(asDocumentReviewNLP(classification_data['musics']['reviews']))
# data.extend(asDocumentReviewNLP(classification_data['movies']['reviews']))
# data.extend(asSentiment(classification_data['games']['reviews']))
# shuffle(data)
# nb = NB(train=data[:len(data)], baseline=MAJORITY, method=MULTINOMIAL, alpha=0.0001)
# nb.finalize()
# #save
                              type=output_vector,
                              stopwords=True))

vectors = []
if use_feature_selection:
    vectors = Model(documents=documents, weight=pattern.vector.TFIDF)
    vectors = vectors.filter(
        features=vectors.feature_selection(top=select_top_n_features))
    #print(vectors.vectors)
else:
    vectors = documents

if options["train"]:
    if classifier_type == "SVM":
        classifier = SVM(train=vectors,
                         type=svm_type,
                         kernel=svm_kernel)
    else:
        classifier = getattr(pattern.vector, classifier_type)(train=vectors)

    print("Classes: " + repr(classifier.classes))

    #performance = kfoldcv(NB, vectors, folds=n_fold)
    performance = kfoldcv(type(classifier), vectors, folds=n_fold)
    print("Accuracy: %.3f\n" \
          "Precision: %.3f\n" \
          "Recall: %.3f\n" \
          "F1: %.3f\n" \
          "Stddev:%.3f" % performance)
    print()
    print("Confusion matrx:")