# Mine tweets tagged #win or #fail and train a k-NN classifier on the
# adjectives they contain.
# NOTE(review): `corpus` is not created in this excerpt; presumably it is a
# pattern.vector Corpus built earlier in the file -- confirm.

# Pages 1..15 at 100 tweets per page give the intended 15*100 = 1500 tweets
# (range(1, 15) stopped one page short).
for i in range(1, 16):
    # The query must be one string containing OR: the Python expression
    # '#win' or '#fail' evaluates to just '#win', so #fail was never searched.
    for tweet in Twitter().search('#win OR #fail', start=i, count=100):
        text = tweet.description.lower()
        # Exactly one label per tweet: 'WIN' when #win is present, otherwise
        # 'FAIL'. The previous code stored every tweet twice, once per label,
        # with the second label inverted, which gave the classifier
        # contradictory training data.
        label = 'WIN' if '#win' in text else 'FAIL'
        s = Sentence(parse(text))   # parse annotates the text with part-of-speech tags
        s = search('JJ', s)         # JJ = adjective
        s = ' '.join([match[0].string for match in s])
        if s:
            # Only tweets that contain at least one adjective are kept.
            corpus.append(Document(s, type=label))

# Train a k-nearest-neighbour classifier on every document in the corpus.
classifier = KNN()
for document in corpus:
    classifier.train(document)

# Classify a few probe adjectives. The outcome depends on the tweets mined;
# the author's expectation is awesome/cool -> WIN and damn/sucks -> FAIL.
objects = [classifier.classify(word)
           for word in ('awesome', 'cool', 'damn', 'sucks')]
# Single-argument print(...) prints the same on Python 2 and Python 3.
print(objects)
import glob
from pattern.vector import Document, Corpus, Bayes, KNN, features, distance, Vector, _distance, COSINE, kdtree

# Disabled experiment: keyword extraction from a local PDF.
#from pattern.web import PDF
##pdf = PDF(open("/users/tom/downloads/10-1.1.1.61.7217.pdf", "rb").read())
#pdf = PDF(open("/users/tom/downloads/10-1.1.1.14.8422.pdf", "rb").read())
#print Document(unicode(pdf), threshold=1).keywords(30)
#print xxx

# Build a corpus with one Document per *.txt file found in each
# sub-directory of "reviews".
# NOTE(review): os.path.join is used below but no `import os` is visible in
# this excerpt -- confirm that os is imported elsewhere, otherwise this
# raises NameError.
corpus = Corpus()
for product in glob.glob(os.path.join("reviews", "*")):
    for review in glob.glob(os.path.join(product, "*.txt")):
        # The document type is a boolean: True when the file path contains
        # the substring "yes".
        polarity = "yes" in review
        # NOTE(review): the file object is never closed explicitly.
        s = open(review).read()
        corpus.append(Document(s, type=polarity, top=50, threshold=2))

# Disabled experiment: nearest-neighbour lookup in a kd-tree.
#print "testtree"
#V = lambda x: Vector(dict(enumerate(x)))
#v = [(2,3), (5,4), (9,6), (4,7), (8,1), (7,2)]
#v = [V(x) for x in v]
#t = kdtree(v)
#print t.nn(V((9,5)))
#print xxx

# n is the number of iterations of the loop below; x, t1 and t2 start at
# zero. How they are used is not visible in this excerpt.
n = 10
x = 0
t1 = 0
t2 = 0
# NOTE(review): the body of this loop lies beyond the visible excerpt.
for j in range(n):
# Build a TF-IDF weighted corpus, train a Bayes classifier on it, report
# cross-validation scores and classify the sample tweets in 'test_20'.
# NOTE(review): `documents`, `TFIDF`, `f_neg` and `f_pos` are not defined in
# this excerpt; presumably they are set up earlier in the file -- confirm.
# Every print below passes a single argument, so the output is the same
# under Python 2's print statement and Python 3's print function.
corpus = Corpus(documents, weight=TFIDF)
print("%s %s" % ("number of documents:", len(corpus)))
print("%s %s" % ("number of words:", len(corpus.vector)))
print("%s %s" % ("number of words (average):",
                 sum(len(d.terms) for d in corpus.documents) / float(len(corpus))))
print("")

# Train on every document, using the type stored on the document as label.
classifier = Bayes(aligned=True)
for document in corpus:
    classifier.train(document, type=document.type)
print('Done training')

# To test the accuracy of a classifier, use 10-fold cross-validation.
# This yields 4 scores: accuracy, precision, recall and F-score.
print('Bayes Classifier')
print('-------------------------')
print('(Accuracy, Precision,REcall,F-Measure)')
print(Bayes.test(corpus.documents, folds=10))

# Classify the sample dataset of 10 negative and 10 positive tweets.
ft = open('test_20', 'r')
try:
    # Iterate the file directly instead of loading every line up front.
    for line in ft:
        t = Document(line)
        corpus.append(t)
        print("%s %s %s" % (line.strip(), ' ', str(classifier.classify(t))))
finally:
    # Close the test file even if building or classifying a document fails.
    ft.close()
f_neg.close()
f_pos.close()
corpus = Corpus()

# Mine tweets for the training corpus: result pages 1 through 9, requesting
# 100 tweets per page (cached). The hashtag in the tweet decides its type.
for page_number in range(1, 10):
    results = Twitter().search('#win OR #fail', start=page_number, count=100, cached=True)
    for tweet in results:
        lowered = tweet.description.lower()
        # A tweet containing the #win hashtag is typed 'WIN', any other 'FAIL'.
        label = 'WIN' if '#win' in lowered else 'FAIL'
        # Parse into a part-of-speech tagged sentence and keep the adjectives.
        matches = search('JJ', Sentence(parse(lowered)))
        adjectives = " ".join([m[0].string for m in matches])
        if not adjectives:
            # Nothing to learn from a tweet without adjectives.
            continue
        corpus.append(Document(adjectives, type=label, stemmer=None))

# Train k-nearest neighbor on the corpus.
# This is only a simple example: a robust classifier needs far more training
# data (e.g., tens of thousands of tweets); the more data, the more
# statistically reliable the classifier becomes. The only way to really know
# whether a classifier works is to run it on testing data, see the
# documentation for Classifier.test().
classifier = KNN()
for doc in corpus:
    classifier.train(doc)

# Show the adjectives the classifier has learned, followed by an empty line.
# (Single-argument print(...) prints the same on Python 2 and Python 3.)
print(sorted(classifier.terms))
print("")
corpus = Corpus()

# Collect training tweets from result pages 1 through 5, requesting 100
# uncached tweets per page. The hashtag in the tweet decides its type.
for page in range(1, 6):
    for tweet in Twitter().search('#win OR #fail', start=page, count=100, cached=False):
        text = tweet.description.lower()
        # Tweets with the #win hashtag are typed 'WIN', all others 'FAIL'.
        if '#win' in text:
            kind = 'WIN'
        else:
            kind = 'FAIL'
        # Gather the adjectives (JJ) from the part-of-speech tagged sentence.
        words = []
        for match in search('JJ', Sentence(parse(text))):
            words.append(match[0].string)
        joined = " ".join(words)
        if joined:
            corpus.append(Document(joined, type=kind, threshold=0, stemmer=None))

# Train k-nearest neighbor on the corpus.
# This is only a simple example: a robust classifier needs far more training
# data (e.g., tens of thousands of tweets); the more data, the more
# statistically reliable the classifier becomes. The only way to really know
# whether a classifier works is to run it on testing data, see the
# documentation for Classifier.test().
classifier = KNN()
for doc in corpus:
    classifier.train(doc)

# Show the words the classifier has learned, followed by an empty line.
# (Single-argument print(...) prints the same on Python 2 and Python 3.)
print(classifier.terms)
print("")