Example #1
from pattern.web    import Twitter
from pattern.en     import Sentence, parse
from pattern.search import search
from pattern.vector import Document, Corpus, KNN

corpus = Corpus()

# Search 14 pages of 100 tweets each (up to 1400 tweets) for the
# #win and #fail hashtags.
for i in range(1, 15):
    for tweet in Twitter().search('#win OR #fail', start=i, count=100):
        # Label the tweet 'WIN' if it contains #win, 'FAIL' otherwise.
        p = 'WIN' if '#win' in tweet.description.lower() else 'FAIL'
        s = tweet.description.lower()
        # parse() annotates each word with its part-of-speech tag.
        s = Sentence(parse(s))
        # Keep only the adjectives in the tweet (JJ = adjective).
        s = search('JJ', s)
        s = [match[0].string for match in s]
        s = ' '.join(s)
        if len(s) > 0:
            corpus.append(Document(s, type=p))

classifier = KNN()  # k-nearest neighbor classifier (k-NN)

# Each document is an unordered bag of adjectives from one tweet;
# training on them teaches the classifier which adjectives co-occur
# with WIN and which with FAIL.
for document in corpus:
    classifier.train(document)

objects = []
objects.append(classifier.classify('awesome'))  # predicts 'awesome' as WIN
objects.append(classifier.classify('cool'))     # predicts 'cool' as WIN
objects.append(classifier.classify('damn'))     # predicts 'damn' as FAIL
objects.append(classifier.classify('sucks'))    # predicts 'sucks' as FAIL

print objects
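
# A quick sanity check (a sketch, not part of the original example):
# pattern's classifiers expose a test() class method that runs k-fold
# cross-validation, as Example #2 below does with Bayes. It yields
# (accuracy, precision, recall, F1-score).
print KNN.test(corpus.documents, folds=10)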
Example #2
import os
import glob

from pattern.vector import Document, Corpus, Bayes, TFIDF, Vector, kdtree


#from pattern.web import PDF
##pdf = PDF(open("/users/tom/downloads/10-1.1.1.61.7217.pdf", "rb").read())
#pdf = PDF(open("/users/tom/downloads/10-1.1.1.14.8422.pdf", "rb").read())
#print Document(unicode(pdf), threshold=1).keywords(30)
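
# A self-contained sketch of the same keywords() call on a plain string
# (the sample text here is made up; threshold=1 keeps only words that
# occur more than once):
print Document(u"The quick brown fox jumps over the lazy dog. "
               u"The fox is quick and the dog is lazy.", threshold=1).keywords(5)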

corpus = Corpus()
for product in glob.glob(os.path.join("reviews", "*")):
    for review in glob.glob(os.path.join(product, "*.txt")):
        # The label is derived from the file name: reviews with "yes"
        # in their name are positive.
        polarity = "yes" in review
        s = open(review).read()
        corpus.append(Document(s, type=polarity, top=50, threshold=2))

#print "testtree"
#V = lambda x: Vector(dict(enumerate(x)))
#v = [(2,3), (5,4), (9,6), (4,7), (8,1), (7,2)]
#v = [V(x) for x in v]
#t = kdtree(v)
#print t.nn(V((9,5)))
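
# A live sketch of the commented-out kd-tree test above: index a handful
# of 2-D points as sparse Vectors and query the nearest neighbor of (9, 5).
V = lambda x: Vector(dict(enumerate(x)))
points = [V(p) for p in [(2, 3), (5, 4), (9, 6), (4, 7), (8, 1), (7, 2)]]
print kdtree(points).nn(V((9, 5)))  # nearest point should be (9, 6)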

# Re-weight the documents with tf-idf.
corpus = Corpus(corpus.documents, weight=TFIDF)
print "number of documents:", len(corpus)
print "number of words:", len(corpus.vector)
print "number of words (average):", sum(len(d.terms) for d in corpus.documents) / float(len(corpus))
print

classifier = Bayes(aligned=True)
for document in corpus:
    classifier.train(document, type=document.type)
print 'Done training'

# To test the accuracy of a classifier, use 10-fold cross-validation.
# This yields four scores: accuracy, precision, recall and F-measure.
print 'Bayes Classifier'
print '-------------------------'
print '(Accuracy, Precision, Recall, F-Measure)'
print Bayes.test(corpus.documents, folds=10)
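
# The same call can be unpacked into the four scores (a sketch; the
# variable names are ours, the tuple order follows the comment above):
accuracy, precision, recall, f1 = Bayes.test(corpus.documents, folds=10)
print 'accuracy: %.2f' % accuracy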

# Test on the sample dataset of 10 negative and 10 positive tweets.
ft = open('test_20', 'r')
for line in ft.readlines():
    t = Document(line)
    corpus.append(t)
    print line.strip(), ' ', str(classifier.classify(t))
ft.close()

Example #4
from pattern.web    import Twitter
from pattern.en     import Sentence, parse
from pattern.search import search
from pattern.vector import Document, Corpus, KNN

corpus = Corpus()

# First, we mine a corpus of tweets: 9 pages of 100 results each.
# We'll use the hashtags as type.
for page in range(1, 10):
    for tweet in Twitter().search('#win OR #fail', start=page, count=100, cached=True):
        # If the tweet contains the #win hashtag, we'll set its type to 'WIN':
        p = 'WIN' if '#win' in tweet.description.lower() else 'FAIL'
        s = tweet.description.lower()        # tweet in lowercase
        s = Sentence(parse(s))               # parse tree with part-of-speech tags
        s = search('JJ', s)                  # adjectives in the tweet
        s = [match[0].string for match in s] # adjectives as a list of strings
        s = " ".join(s)                      # adjectives as string
        if len(s) > 0:
            corpus.append(Document(s, type=p, stemmer=None))

# Train k-nearest neighbor on the corpus.
# Note that this is only a simple example: to build a robust classifier
# you would need a lot more training data (e.g., tens of thousands of tweets).
# The more training data, the more statistically reliable the classifier becomes.
# The only way to really know if your classifier is working correctly
# is to test it with testing data; see the documentation for Classifier.test().
classifier = KNN()
for document in corpus:
    classifier.train(document)

# These are the adjectives the classifier has learned:
print sorted(classifier.terms)
print
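
# As a follow-up sketch (mirroring Example #1), the trained classifier
# can now predict a label for unseen adjectives:
print classifier.classify('awesome')  # presumably 'WIN'
print classifier.classify('sucks')    # presumably 'FAIL'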
Example #5
from pattern.web    import Twitter
from pattern.en     import Sentence, parse
from pattern.search import search
from pattern.vector import Document, Corpus, KNN

corpus = Corpus()

# First, we mine a corpus of tweets: 5 pages of 100 results each.
# We'll use the hashtags as type.
for page in range(1, 6):
    for tweet in Twitter().search('#win OR #fail', start=page, count=100, cached=False):
        # If the tweet contains the #win hashtag, we'll set its type to 'WIN':
        p = 'WIN' if '#win' in tweet.description.lower() else 'FAIL'
        s = tweet.description.lower()        # tweet in lowercase
        s = Sentence(parse(s))               # parse tree with part-of-speech tags
        s = search('JJ', s)                  # adjectives in the tweet
        s = [match[0].string for match in s] # adjectives as a list of strings
        s = " ".join(s)                      # adjectives as string
        if len(s) > 0:
            corpus.append(Document(s, type=p, threshold=0, stemmer=None))

# Train k-nearest neighbor on the corpus.
# Note that this is only a simple example: to build a robust classifier
# you would need a lot more training data (e.g., tens of thousands of tweets).
# The more training data, the more statistically reliable the classifier becomes.
# The only way to really know if your classifier is working correctly
# is to test it with testing data; see the documentation for Classifier.test().
classifier = KNN()
for document in corpus:
    classifier.train(document)

# These are the words the classifier has learned:
print classifier.terms
print
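
# A sketch of classifying a new, unseen message with the same pipeline
# (the sample sentence is made up):
s = Sentence(parse('this game is awesome and cool'))
s = ' '.join(match[0].string for match in search('JJ', s))
print classifier.classify(s)  # presumably 'WIN'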