Code Example #1
File: 06-svm.py  Project: clips/pattern
# average sentence length, a score for word diversity, etc.

# Use 1,000 random instances as training material.

print("training...")
for score, review in data[:1000]:
    classifier.train(instance(review), type=int(score) > 0)
#classifier.save("sentiment-nl-svm.p")
#classifier = SVM.load("sentiment-nl-svm.p")

# Use 500 random instances as test.

print("testing...")
i = n = 0
for score, review in data[1000:1500]:
    if classifier.classify(instance(review)) == (int(score) > 0):
        i += 1
    n += 1

# The overall accuracy is around 82%.
# A Naive Bayes classifier has about 78% accuracy.
# A KNN classifier has about 80% accuracy.
# Careful: to get a reliable score you need to calculate precision and recall,
# study the documentation at:
# http://www.clips.ua.ac.be/pages/pattern-metrics#accuracy

print(float(i) / n)
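
# A minimal sketch (not part of the original example) that also counts true and
# false positives on the same held-out slice, so precision and recall for the
# positive class can be reported alongside the accuracy above:
tp = fp = fn = 0
for score, review in data[1000:1500]:
    predicted = classifier.classify(instance(review))
    actual = int(score) > 0
    if predicted and actual:
        tp += 1
    elif predicted and not actual:
        fp += 1
    elif actual:
        fn += 1
print("precision: %.3f" % (float(tp) / max(tp + fp, 1)))
print("recall: %.3f" % (float(tp) / max(tp + fn, 1)))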

# The work is not done here.
# Low accuracy is disappointing, but high accuracy is often suspicious.
# Things to look out for:
class MachineLearningClassifier(object):

    def __init__(self, trainset=[]):

        # initializes a SVM classifier
        self.classifier = SVM(type=CLASSIFICATION, kernel=LINEAR)

        self.bag_of_words = []
        self.classifier.probability = True
        self.train(self.classifier, trainset)


    # Extract features for ML process
    def extract_features(self, tweet_message):

        if len(self.bag_of_words) == 0:
            print('Bag-of-Words empty!')
            return None

        tweet_words = [word.lower() for word, tag in tweet_message if word not in stopwords and not word.isdigit()]
        tweet_tags = [tag[:2] for word, tag in tweet_message if word not in stopwords and not word.isdigit()]

        feature_set = {}

        # 1st set of features: bag-of-words
        for word in self.bag_of_words:
            feature_set['has_'+word] = (word in tweet_words)

        # 2nd set of features: the tags present in the message
        for tag in ['NN','VG','CD','JJ','CC','RB']:
            feature_set['has_'+tag] = (tag in tweet_tags)

        # 3rd feature: negation is present?
        negators = set(['not', 'none', 'nobody', 'never', 'nothing', 'lack', 't','n\'t','dont', 'no'])
        if len(negators.intersection(set(tweet_words))) > 0:
            feature_set['has_negator'] = True

        return feature_set


    # train the classifier
    # Tweets argument must be a list of dictionaries. Each dictionary must
    # have the keys ['MESSAGE'] and ['SENTIMENT'] with the message string and
    # the classification class, respectively.
    def train(self,classifier,tweets):

        # build the bag-of-words list using the 1k most frequent words in
        # the corpus
        bag_of_words = {}
        for tweet in tweets:
            words = [w.lower() for w,t in pre_process(tweet['MESSAGE']) if w not in stopwords and not w.isdigit()]
            for word in words:
                bag_of_words[word] = bag_of_words.get(word,0) + 1

        # get the 1000 most frequent words
        self.bag_of_words = [w for w,freq in sorted(bag_of_words.items(),key=itemgetter(1),reverse=True)[:1000]]

        # perform the training step
        for tweet in tweets:
            classifier.train(self.extract_features(pre_process(tweet['MESSAGE'])),type=tweet['SENTIMENT'])


    # classify a new message. Return the scores (probabilities) for each
    # classification class
    def classify(self, tweet_message):
        scores = self.classifier.classify(self.extract_features(tweet_message))
        return scores
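
# A hypothetical usage sketch (not part of the original file). The trainset is a
# list of dicts with 'MESSAGE' and 'SENTIMENT' keys, as the comments above
# train() describe; pre_process() and stopwords are assumed to be defined
# elsewhere in the surrounding project, as in the class above.
#
#   trainset = [{'MESSAGE': 'I love this phone', 'SENTIMENT': 'positive'},
#               {'MESSAGE': 'worst purchase ever', 'SENTIMENT': 'negative'}]
#   mlc = MachineLearningClassifier(trainset)
#   print(mlc.classify(pre_process('not bad at all')))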
class MachineLearningClassifier(object):
    def __init__(self, trainset=[]):

        # initializes a SVM classifier
        self.classifier = SVM(type=CLASSIFICATION, kernel=LINEAR)

        self.bag_of_words = []
        self.classifier.probability = True
        self.train(self.classifier, trainset)

    # Extract features for ML process
    def extract_features(self, tweet_message):

        if len(self.bag_of_words) == 0:
            print('Bag-of-Words empty!')
            return None

        tweet_words = [
            word.lower() for word, tag in tweet_message
            if word not in stopwords and not word.isdigit()
        ]
        tweet_tags = [
            tag[:2] for word, tag in tweet_message
            if word not in stopwords and not word.isdigit()
        ]

        feature_set = {}

        # 1st set of features: bag-of-words
        for word in self.bag_of_words:
            feature_set['has_' + word] = (word in tweet_words)

        # 2nd set of features: the tags present in the message
        for tag in ['NN', 'VG', 'CD', 'JJ', 'CC', 'RB']:
            feature_set['has_' + tag] = (tag in tweet_tags)

        # 3rd feature: negation is present?
        negators = set([
            'not', 'none', 'nobody', 'never', 'nothing', 'lack', 't', 'n\'t',
            'dont', 'no'
        ])
        if len(negators.intersection(set(tweet_words))) > 0:
            feature_set['has_negator'] = True

        return feature_set

    # train the classifier
    # Tweets argument must be a list of dictionaries. Each dictionary must
    # have the keys ['MESSAGE'] and ['SENTIMENT'] with the message string and
    # the classification class, respectively.
    def train(self, classifier, tweets):

        # build the bag-of-words list using the 1k most frequent words in
        # the corpus
        bag_of_words = {}
        for tweet in tweets:
            words = [
                w.lower() for w, t in pre_process(tweet['MESSAGE'])
                if w not in stopwords and not w.isdigit()
            ]
            for word in words:
                bag_of_words[word] = bag_of_words.get(word, 0) + 1

        # get the 1000 most frequent words
        self.bag_of_words = [
            w for w, freq in sorted(
                bag_of_words.items(), key=itemgetter(1), reverse=True)[:1000]
        ]

        # perform the training step
        for tweet in tweets:
            classifier.train(self.extract_features(
                pre_process(tweet['MESSAGE'])),
                             type=tweet['SENTIMENT'])

    # classify a new message. Return the scores (probabilities) for each
    # classification class
    def classify(self, tweet_message):
        scores = self.classifier.classify(self.extract_features(tweet_message))
        return scores
Code Example #4
File: 08-wiktionary.py  Project: DevKhokhar/pattern
def classify(self, name):
    return SVM.classify(self, self.vector(name))
Code Example #5
def classify(self, name):
    return SVM.classify(self, self.vector(name))
Code Example #6
# average sentence length, a score for word diversity, etc.

# Use 1,000 random instances as training material.

print "training..."
for score, review in data[:1000]:
    classifier.train(instance(review), type=int(score) > 0)
#classifier.save("sentiment-nl-svm.p")
#classifier = SVM.load("sentiment-nl-svm.p")

# Use 500 random instances as test.

print "testing..."
i = n = 0
for score, review in data[1000:1500]:
    if classifier.classify(instance(review)) == (int(score) > 0):
        i += 1
    n += 1

# The overall accuracy is around 82%.
# A Naive Bayes classifier has about 79% accuracy.
# A KNN classifier has about 80% accuracy.
# Careful: to get a reliable score you need to calculate precision and recall,
# study the documentation at:
# http://www.clips.ua.ac.be/pages/pattern-metrics#accuracy

print float(i) / n

# The work is not done here.
# Low accuracy is disappointing, but high accuracy is often suspicious.
# Things to look out for:
Code Example #7
# Training an SVM is very easy, 
# just give it strings or lists of words and a label as training material:
classifier = SVM()
for review, positive in data[:50]: # Note: 50 training examples is very little data!
    classifier.train(review, type=positive)

# The idea is that similar strings will contain similar words.
# For an unknown example, the SVM examines the words it contains
# and looks for trained examples with similar words.
# The labels of these trained examples are then used to predict
# the label of the unknown example.
# See: Chapter 6 in "Modeling Creativity: Case Studies in Python".
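
# A small aside (not part of the original example) illustrating the "similar
# words" idea: count() from pattern.vector turns a list of words into a
# word-count vector, and two similar strings end up sharing features.
from pattern.vector import count
v1 = count("A cat says meow".lower().split())
v2 = count("that cat says hello".lower().split())
print set(v1) & set(v2)  # features shared by both vectors, e.g. "cat"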
print "Review:", data[51][0]
print "Positive:", data[51][1]
print "Prediction:", classifier.classify(data[51][0])
print

# We can then evaluate how well the classifier performs,
# by comparing the predicted labels to the hand-tailored labels.
# Important! Examples used for training may not be used for testing.

# A "binary classifier" is a classifier that only has two possible labels (e.g., True or False).
# For binary classification, we can calculate precision & recall.
# This is more reliable than a simple accuracy:
# - recall = the percentage of positive reviews recognized,
# - precision = the percentage of predicted positive reviews that *really are* positive.

# Example 1: P 0.50 R 1.00 
# This means that all positive reviews are discovered.
# But also that 50% of the reviews labeled as positive are in reality negative.
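
# A quick numeric check (not part of the original example) of what "P 0.50 R 1.00"
# means, using precision = TP / (TP + FP) and recall = TP / (TP + FN).
# Suppose the test set holds 50 truly positive reviews and the classifier labels
# 100 reviews as positive, including all 50 real ones:
tp, fp, fn = 50, 50, 0
precision = float(tp) / (tp + fp)  # 0.50: half of the "positive" predictions are wrong
recall = float(tp) / (tp + fn)     # 1.00: every positive review was found
f1 = 2 * precision * recall / (precision + recall)  # ~0.67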
Code Example #8
    return v


train = (("cat", "A cat has whiskers"), ("cat", "A cat says meow"),
         ("cat", "the animal was purring softly"),
         ("dog", "A dog is an animal that says woof"),
         ("dog", "Why is that dog still barking?"),
         ("dog", "He happily wagged his tail"))

# A robust, all-round classification algorithm is SVM.
# If SVM doesn't work on your machine, use SLP (= simple neural net).
classifier = SVM()
for name, s in train:
    classifier.train(v(s), type=name)

print classifier.classify(v("the animal is purring and meowing"))
print classifier.classify(v("woof!"))
print

# ------------------------------------------------------------------------------------

# Vectors can be constructed in many different ways;
# what features you include will influence how accurate the classifier is.
# For example, in the example above there is no way to match "barking" to "bark"
# (for the classifier they are different words).
# A good strategy is to use character n-grams as features:
# sequences of n successive characters (usually n=3).
# The vector for the word "bark" then becomes: {"bar":1, "ark":1}
# The vector for the word "barking" becomes: {"bar":1, "ark":1, "rki":1, "kin":1, "ing":1}
# The two vectors now have overlap on 2 features ("bar" and "ark").
# This way, we can capture a lot of morphology, use of prepositions,
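
# A plain-Python sketch (not part of the original example) of the character
# trigram featurizer described above:
def char_ngrams(s, n=3):
    v = {}
    for i in range(len(s) - n + 1):
        v[s[i:i+n]] = v.get(s[i:i+n], 0) + 1
    return v

print char_ngrams("bark")     # {"bar": 1, "ark": 1}
print char_ngrams("barking")  # {"bar": 1, "ark": 1, "rki": 1, "kin": 1, "ing": 1}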
Code Example #9
File: 4-svm.py  Project: wathek/tomdesmedt
# Training an SVM is very easy,
# just give it strings or lists of words and a label as training material:
classifier = SVM()
for review, positive in data[:50]:  # Note: 50 training examples is very little data!
    classifier.train(review, type=positive)

# The idea is that similar strings will contain similar words.
# For an unknown example, the SVM examines the words it contains
# and looks for trained examples with similar words.
# The labels of these trained examples are then used to predict
# the label of the unknown example.
# See: Chapter 6 in "Modeling Creativity: Case Studies in Python".
print "Review:", data[51][0]
print "Positive:", data[51][1]
print "Prediction:", classifier.classify(data[51][0])
print

# We can then evaluate how well the classifier performs,
# by comparing the predicted labels to the hand-tailored labels.
# Important! Examples used for training may not be used for testing.

# A "binary classifier" is a classifier that only has two possible labels (e.g., True or False).
# For binary classification, we can calculate precision & recall.
# This is more reliable than a simple accuracy:
# - recall = the percentage of positive reviews recognized,
# - precision = the percentage of predicted positive reviews that *really are* positive.

# Example 1: P 0.50 R 1.00
# This means that all positive reviews are discovered.
# But also that 50% of the reviews labeled as positive are in reality negative.
Code Example #10
File: sample.py  Project: EricZeiberg/MachineLearning
from pattern.vector import SVM, KNN, NB, count, shuffled
from pattern.en import tag, predicative


classifier = SVM()

classifier = SVM.load("sentiment.p")


def instance(review):  # "Great book!"
    v = tag(review)  # [("Great", "JJ"), ("book", "NN"), ("!", "!")]
    v = [word for (word, pos) in v if pos in ("JJ", "RB", "VB", "VBZ", "NN", "NNS", "NNP", "NNPS") or word in ("!",)]
    v = [predicative(word) for word in v]  # ["great", "book", "!"]
    v = count(v)  # {"great": 1, "book": 1, "!": 1}
    return v


score = classifier.classify(instance("you little bitch"))

print(score)
Code Example #11
train = (
    ("cat", "A cat has whiskers"),
    ("cat", "A cat says meow"),
    ("cat", "the animal was purring softly"),
    ("dog", "A dog is an animal that says woof"),
    ("dog", "Why is that dog still barking?"),
    ("dog", "He happily wagged his tail")
)

# A robust, all-round classification algorithm is SVM.
# If SVM doesn't work on your machine, use SLP (= simple neural net).
classifier = SVM() 
for name, s in train:
    classifier.train(v(s), type=name)
    
print classifier.classify(v("the animal is purring and meowing"))
print classifier.classify(v("woof!"))
print

# ------------------------------------------------------------------------------------

# Vectors can be constructed in many different ways;
# what features you include will influence how accurate the classifier is.
# For example, in the example above there is no way to match "barking" to "bark"
# (for the classifier they are different words).
# A good strategy is to use character n-grams as features:
# sequences of n successive characters (usually n=3).
# The vector for the word "bark" then becomes: {"bar":1, "ark":1}
# The vector for the word "barking" becomes: {"bar":1, "ark":1, "rki":1, "kin":1, "ing":1}
# The two vectors now have overlap on 2 features ("bar" and "ark").
# This way, we can capture a lot of morphology, use of prepositions,
Code Example #12
corpus = Corpus(documents, weight=TFIDF)
print "number of documents:", len(corpus)
print "number of words:", len(corpus.vector)
print "number of words (average):", sum(len(d.terms) for d in corpus.documents) / float(len(corpus))
print

classifier = SVM(type=CLASSIFICATION, kernel=LINEAR)
for document in corpus:
    classifier.train(document, type=document.type)
print 'Training Done'
# To test the accuracy of the classifier, use 10-fold cross-validation.
# This yields 4 scores: Accuracy, Precision, Recall and F-score.
print 'SVM Classifier'
print '-------------------------'
print '(Accuracy, Precision, Recall, F-Measure)'
print SVM.test(corpus, folds=10, type=CLASSIFICATION, kernel=LINEAR)

# Test on a sample data file in which the top 10 lines are negative and the next 10 are positive.
ft = open('test_20', 'r')
test_lines = ft.readlines()
for line in test_lines:
    t = Document(line)
    corpus.append(t)
    print line.strip() + ' ' + classifier.classify(t)
ft.close()

f_neg.close()
f_pos.close()

Code Example #13
          "Precision: %.3f\n" \
          "Recall: %.3f\n" \
          "F1: %.3f\n" \
          "Stddev:%.3f" % performance)
    print()
    print("Confusion matrx:")
    print(classifier.confusion_matrix(vectors).table)

    classifier.save(trained_filename)
elif options["predict"]:
    classifier = Classifier.load(trained_filename)

    print("#Author\tURL\tPrediction\tActual")
    for v in vectors:
        print("%s\t%s\t%s" % (v.name.encode('utf-8'),
                              repr(classifier.classify(v)),
                              repr(v.type)))

    # Remove any individual documents classified as 'None' prior to
    # calculating performance unless the entire set has no classifications,
    # in which case we assume we are doing a blind prediction but won't
    # calculate performance metrics (eg no rating shown on website,
    # so no classification to predict)
    pre_filter_n = len(vectors)
    fvectors = list(filter(lambda x: x.type is not None, vectors))
    post_filter_n = len(fvectors)
    # If every document in the set is labelled as type "None" we are probably
    # working on raw comments without known ratings (eg during judging
    # period), so treat this as a blind prediction and don't do performance
    # metrics
    # Really the user should have added the --no-performance-metrics