Example #1
# Imports this snippet relies on (classifiers and constants from pattern.vector, plus timeit):
import timeit

from pattern.vector import (NB, KNN, SLP, SVM, MAJORITY, MULTINOMIAL, COSINE,
                            CLASSIFICATION, POLYNOMIAL)


def normal_test(data, type):
    print '----------------------------------------------------'
    print 'TEST FUNCTION STARTED FOR ' + type + '!'
    total_data_size = len(data)
    training_size = int(round(total_data_size/2))
    test_size = training_size
    print 'Total Size: ' + str(total_data_size)
    print 'Training Size: ' + str(training_size)
    print 'Test Size: ' + str(test_size)

    print 'Training Started for ' + type + '!'
    classification_methods = {
      # comment/uncomment entries depending on which classification algorithms you want to test
      'NB' :  NB(train=data[:training_size], baseline=MAJORITY, method=MULTINOMIAL),
      'KNN2' : KNN(train=data[:training_size], baseline=MAJORITY, k=2, distance=COSINE),
      'KNN3' : KNN(train=data[:training_size], baseline=MAJORITY, k=3, distance=COSINE),
      'KNN4' : KNN(train=data[:training_size], baseline=MAJORITY, k=4, distance=COSINE),
      'KNN5' : KNN(train=data[:training_size], baseline=MAJORITY, k=5, distance=COSINE),
      'KNN6' : KNN(train=data[:training_size], baseline=MAJORITY, k=6, distance=COSINE),
      'KNN7' : KNN(train=data[:training_size], baseline=MAJORITY, k=7, distance=COSINE),
      'KNN8' : KNN(train=data[:training_size], baseline=MAJORITY, k=8, distance=COSINE),
      'KNN9' : KNN(train=data[:training_size], baseline=MAJORITY, k=9, distance=COSINE),
      'KNN10' : KNN(train=data[:training_size], baseline=MAJORITY, k=10, distance=COSINE),
      'SLP1' : SLP(train=data[:training_size], baseline=MAJORITY, iterations=1),
      'SLP2' : SLP(train=data[:training_size], baseline=MAJORITY, iterations=2),
      'SLP3' : SLP(train=data[:training_size], baseline=MAJORITY, iterations=3),
      'SVM' : SVM(train=data[:training_size], type=CLASSIFICATION, kernel=POLYNOMIAL),
    }

    print 'Normal Testing Started!'
    # run the normal test for each classifier
    for classification in classification_methods.keys():
      #measure the time it takes to classify!
      start = timeit.default_timer()
      #normal test
      accuracy, precision, recall, f1 = classification_methods[classification].test(data[training_size:training_size+test_size])
      stop = timeit.default_timer()
      print '*' + classification + '*'
      print 'Accuracy: ' + str(accuracy)
      print 'Precision: ' + str(precision)
      print 'Recall: ' + str(recall)
      print 'F1-score: ' + str(f1)
      print 'Time: ' + str(stop - start)
      print
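
# Usage sketch (the sample texts and labels below are made up; normal_test()
# is the function defined above, and Document comes from pattern.vector):
from pattern.vector import Document

samples = [
    ('the movie was sweet and fun', 'WIN'),
    ('stupid plot and awful acting', 'FAIL'),
    # ...in practice you would load many more labeled texts here
]
data = [Document(text, type=label, stemmer=None) for text, label in samples]
normal_test(data, 'Movie Reviews')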
Example #2
def report(model_file, text_file):
    pl = Pipeline()
    model = KNN.load(model_file)
    nlp = en_core_web_sm.load()
    make_doc = lambda text: nlp(unidecode(text).strip())

    text = [line for line in open(text_file) if line.strip()]
    docs = [make_doc(line) for line in text]
    sentences = [sent.text for doc in docs for sent in doc.sents]

    print('\n'.join(text))

    predictions = pl.predict(model, sentences, print_pred=False)
    print('\n \n ###############  Soft Skills  ############\n')
    print(
        *[sent for sent, pred in zip(sentences, predictions) if 'yes' == pred],
        sep='\n')
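
# Usage sketch: both file names are hypothetical, and Pipeline/KNN.load()
# assume a model that was saved earlier with KNN.save():
report('soft_skills.knn', 'resume.txt')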
Example #3
    def train(cls, train_file, model_file):
        sents_dic = (json.loads(jsonl)
                     for jsonl in SoftSkills.load(train_file))
        model = KNN()

        for sent in sents_dic:
            text = sent['text']
            v = count([word for word, pos in tag(text)])  # {'sweet': 1}
            if v:
                model.train(v, type=sent['soft skill'])
        model.save(model_file)
        return model
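
# Sketch of the training data this method expects, based on the keys it reads
# (file names are hypothetical; train() is assumed to be a classmethod of the
# SoftSkills class referenced in the snippet). Each line of train.jsonl is a
# JSON object such as:
#   {"text": "I enjoy collaborating with cross-functional teams", "soft skill": "yes"}
#   {"text": "Implemented a REST API in Flask", "soft skill": "no"}
# Training then reduces each sentence to a {word: count} vector via tag() + count():
model = SoftSkills.train('train.jsonl', 'soft_skills.knn')
print(model.classes)  # the labels the trained KNN has seen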
Example #4
# Let's test how our model performs as a classifier.
# A document can have a label (or type, or class).
# For example, in the movie reviews corpus,
# there are positive reviews (score > 0) and negative reviews (score < 0).
# A classifier uses a model as "training" data
# to predict the label (type/class) of unlabeled documents.
# In this case, it can predict whether a new movie review is positive or
# negative.

# The details are not that important right now, just observe the accuracy.
# Naturally, we want accuracy to stay the same after LSA reduction,
# and hopefully decrease the time needed to run.

t = time.time()
print("accuracy:", KNN.test(m, folds=10)[-1])
print("time:", time.time() - t)
print()

# Reduce the documents to vectors of 10 concepts (= 1/4 of 40 features).
print("LSA reduction...")
print()
m.reduce(10)

t = time.time()
print("accuracy:", KNN.test(m, folds=10)[-1])
print("time:", time.time() - t)
print()

# Accuracy is about the same, but the performance is better: 2x-3x faster,
# because each document is now a "10-word summary" of the original review.
Example #5
        s = tweet.text.lower()               # tweet in lowercase
        p = '#win' in s and 'WIN' or 'FAIL'  # document labels      
        s = Sentence(parse(s))               # parse tree with part-of-speech tags
        s = search('JJ', s)                  # adjectives in the tweet
        s = [match[0].string for match in s] # adjectives as a list of strings
        s = " ".join(s)                      # adjectives as string
        if len(s) > 0:
            m.append(Document(s, type=p, stemmer=None))

# Train k-Nearest Neighbor on the model.
# Note that this is only a simple example: to build a robust classifier
# you would need a lot more training data (e.g., tens of thousands of tweets).
# The more training data, the more statistically reliable the classifier becomes.
# The only way to really know if your classifier is working correctly
# is to test it with testing data, see the documentation for Classifier.test().
classifier = KNN(baseline=None) # By default, baseline=MAJORITY
for document in m:              # (classify unknown documents with the most frequent type).
    classifier.train(document)

# These are the adjectives the classifier has learned:
print sorted(classifier.features)
print

# We can now ask it to classify documents containing these words.
# Note that you may get different results than the ones below,
# since you will be mining other (more recent) tweets.
# Again, a robust classifier needs lots and lots of training data.
# If None is returned, the word was not recognized,
# and the classifier returned the default value (see above).
print classifier.classify('sweet')  # yields 'WIN'
print classifier.classify('stupid') # yields 'FAIL'
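
# The comments above point to Classifier.test(); a minimal sketch of that
# evaluation, assuming m is the list of labeled Documents built in the loop
# and KNN is the same pattern.vector class used above.
# 10-fold cross-validation; test() yields an (Accuracy, Precision, Recall, F1) tuple.
print KNN.test(m, folds=10)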
Example #6
# -*- coding: utf-8 -*-
"""
Module with methods for retrieving tweets containing certain
search words from Twitter.
"""
from pattern.web import Twitter
from pattern.vector import KNN
import urllib2

twitter = Twitter(license=None, throttle=0.5)
knn = KNN()


def getSearchWords():
    """Function that returns list of search words"""
    # Test data: we will use synonyms for 'war' (found in NLTK WordNet)
    # words = ['war', 'conflict', 'jihad', ]
    words = []
    conflict = ['feud',
                'vendetta',
                'class struggle',
                'strife',
                'countercurrent',
                'discord',
                'trench warfare',
                'fight',
                'hassle',
                'beating',
                'in fighting',
                'single combat',
                'gunfight',
Example #7
 def _learner(self):
     return KNN()
Example #8
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 25 19:37:34 2019

@author: alternatif
"""

from pattern.web import Twitter
from pattern.en import tag
from pattern.vector import KNN, count

twitter, knn = Twitter(), KNN()

for i in range(1, 3):
    for tweet in twitter.search('#win OR #fail', start=i, count=100):
        s = tweet.text.lower()
        p = '#win' in s and 'WIN' or 'FAIL'
        v = tag(s)
        v = [word for word, pos in v if pos == 'JJ']  # JJ = adjective
        v = count(v)  # {'sweet': 1}
        if v:
            knn.train(v, type=p)

print(knn.classify('sweet potato burger'))
print(knn.classify('stupid autocorrect'))
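
# Note: the "'#win' in s and 'WIN' or 'FAIL'" line is the old and/or ternary
# idiom (safe here because 'WIN' is truthy); the equivalent conditional
# expression reads more clearly:
p = 'WIN' if '#win' in s else 'FAIL'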
Example #9
from pattern.web import Twitter
from pattern.text.en import tag
from pattern.vector import KNN, count, NaiveBayes, SVM
import os, random
import file_io as fio
corp_dir = 'essays/original'
twitter, knn, nbayes, svm = Twitter(), KNN(), NaiveBayes(), SVM()
from nltk.corpus import stopwords
import lsa
cachedStopWords = stopwords.words("english")
testSet = []


def naive():
    trainingSet = []
    l = lsa.getMod()
    dirs = [x[0] for x in os.walk(os.path.abspath(corp_dir))]
    for dir in dirs:
        label = 0
        if 'low' in dir:
            label = -1
        elif 'high' in dir:
            label = 1
        tfiles = []
        tfiles = fio.getTopLevelFiles(dir, extension='txt')
        train_smpl = []
        if len(tfiles) > 0:
            train_smpl = [
                tfiles[i] for i in random.sample(xrange(len(tfiles)), 13)
            ]
        for file in tfiles:
Example #10
        s = tweet.text.lower()  # tweet in lowercase
        p = '#win' in s and 'WIN' or 'FAIL'  # document labels
        s = Sentence(parse(s))  # parse tree with part-of-speech tags
        s = search('JJ', s)  # adjectives in the tweet
        s = [match[0].string for match in s]  # adjectives as a list of strings
        s = " ".join(s)  # adjectives as string
        if len(s) > 0:
            m.append(Document(s, type=p, stemmer=None))

# Train k-Nearest Neighbor on the model.
# Note that this is only a simple example: to build a robust classifier
# you would need a lot more training data (e.g., tens of thousands of tweets).
# The more training data, the more statistically reliable the classifier becomes.
# The only way to really know if your classifier is working correctly
# is to test it with testing data, see the documentation for Classifier.test().
classifier = KNN(baseline=None)  # By default, baseline=MAJORITY
for document in m:  # (classify unknown documents with the most frequent type).
    classifier.train(document)

# These are the adjectives the classifier has learned:
print sorted(classifier.features)
print

# We can now ask it to classify documents containing these words.
# Note that you may get different results than the ones below,
# since you will be mining other (more recent) tweets.
# Again, a robust classifier needs lots and lots of training data.
# If None is returned, the word was not recognized,
# and the classifier returned the default value (see above).
print classifier.classify('sweet')  # yields 'WIN'
print classifier.classify('stupid')  # yields 'FAIL'
Example #11
        s = tweet.text.lower()  # tweet in lowercase
        p = "#win" in s and "WIN" or "FAIL"  # document labels
        s = Sentence(parse(s))  # parse tree with part-of-speech tags
        s = search("JJ", s)  # adjectives in the tweet
        s = [match[0].string for match in s]  # adjectives as a list of strings
        s = " ".join(s)  # adjectives as string
        if len(s) > 0:
            m.append(Document(s, type=p, stemmer=None))

# Train k-Nearest Neighbor on the model.
# Note that this is only a simple example: to build a robust classifier
# you would need a lot more training data (e.g., tens of thousands of tweets).
# The more training data, the more statistically reliable the classifier becomes.
# The only way to really know if your classifier is working correctly
# is to test it with testing data, see the documentation for Classifier.test().
classifier = KNN(baseline=None)  # By default, baseline=MAJORITY
for document in m:  # (classify unknown documents with the most frequent type).
    classifier.train(document)

# These are the adjectives the classifier has learned:
print sorted(classifier.features)
print

# We can now ask it to classify documents containing these words.
# Note that you may get different results than the ones below,
# since you will be mining other (more recent) tweets.
# Again, a robust classifier needs lots and lots of training data.
# If None is returned, the word was not recognized,
# and the classifier returned the default value (see above).
print classifier.classify("sweet potato burger")  # yields 'WIN'
print classifier.classify("stupid autocorrect")  # yields 'FAIL'
Example #12
def setup():
    global pages
    global urlalias
    global revurlalias
    global knn
    pages = dict()
    urlalias = dict()
    revurlalias = dict()
    knn = KNN()
    db = MySQLdb.connect(host="192.168.200.26",
                         user="******",
                         passwd="xxxsecretxxx",
                         db="pla")
    cur = db.cursor()
    cur.execute("select source, alias from url_alias")
    for row in cur.fetchall():
        urlalias[row[1]] = row[0]
        revurlalias[row[0]] = row[1]
    cur.execute("select tid, name, description, vid from taxonomy_term_data;")
    for row in cur.fetchall():
        url = 'taxonomy/term/' + str(row[0])
        pages[url] = row[1]
        if url in revurlalias:
            pages[revurlalias[url]] = row[1]
            url = revurlalias[url]
        if row[3] == 3:
            soup = bs4.BeautifulSoup(row[2])
            the_text = re.sub(r'[\n\r]+', r'  ', soup.get_text(' ')).lower()
            knn.train(Document(the_text, stemmer=PORTER), url)
            knn.train(Document(row[1].lower()), url)
    cur.execute(
        "select a.tid, c.body_value, d.title from taxonomy_term_data as a inner join field_data_field_practice_areas as b on (a.tid=b.field_practice_areas_tid and b.entity_type='node' and b.bundle != 'professionals' and b.deleted=0) inner join field_data_body as c on (b.entity_id=c.entity_id and b.entity_type=c.entity_type) inner join node as d on (c.entity_id=d.nid);"
    )
    for row in cur.fetchall():
        url = 'taxonomy/term/' + str(row[0])
        if url in revurlalias:
            url = revurlalias[url]
        soup = bs4.BeautifulSoup(row[1])
        the_text = re.sub(r'[\n\r]+', r'  ', soup.get_text(' ')).lower()
        knn.train(Document(the_text, stemmer=PORTER), url)
        knn.train(Document(row[2].lower()), url)
    cur.execute("select nid, title from node where status=1;")
    for row in cur.fetchall():
        url = 'node/' + str(row[0])
        pages[url] = row[1]
        if url in revurlalias:
            pages[revurlalias[url]] = row[1]
    db.close()
    pgcur = conn.cursor()
    pgcur.execute(
        "select query, target from website_queries where target is not null group by query, target"
    )
    for row in pgcur.fetchall():
        words = re.split(r'[\n\r,;]+ *', row[1])
        for word in words:
            print("training on " + row[0].lower() + " for " + word)
            knn.train(Document(row[0].lower()), word)
    conn.commit()
    pgcur.close()
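
# Hedged usage sketch: after setup() has run, the trained knn can answer a
# free-text query (the query string below is made up; Document, PORTER and
# knn are the module-level names used in the snippet above).
query = Document('child custody lawyer', stemmer=PORTER)
print(knn.classify(query))  # best-matching page/target, or None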
Example #13
print "number of documents:", len(corpus)
print "number of words:", len(corpus.vector)
print "number of words (average):", sum(
    len(d.terms) for d in corpus.documents) / float(len(corpus))
print

# This may be too many words for some clustering algorithms (e.g., hierarchical).
# We'll reduce the documents to vectors of 4 concepts.

# First, let's test how the corpus would perform as a classifier.
# The details of KNN are not that important right now, just observe the numbers.
# Naturally, we want accuracy to stay the same after LSA reduction,
# and hopefully decrease the time needed to run.
t = time.time()
print "accuracy:", KNN.test(corpus, folds=10)[-1]
print "time:", time.time() - t
print

# Reduce the documents to vectors of 4 concepts (= 1/7 of 30 words).
print "LSA reduction..."
print
corpus.reduce(4)

t = time.time()
print "accuracy:", KNN.test(corpus, folds=10)[-1]
print "time:", time.time() - t
print

# Not bad, accuracy is about the same but performance is 3x faster,
# because each document is now a "4-word summary" of the original review.
Example #14
        s = tweet.text.lower()  # tweet in lowercase
        p = "#win" in s and "WIN" or "FAIL"  # document labels
        s = Sentence(parse(s))  # parse tree with part-of-speech tags
        s = search("JJ", s)  # adjectives in the tweet
        s = [match[0].string for match in s]  # adjectives as a list of strings
        s = " ".join(s)  # adjectives as string
        if len(s) > 0:
            corpus.append(Document(s, type=p, stemmer=None))

# Train k-nearest neighbor on the corpus.
# Note that this is only a simple example: to build a robust classifier
# you would need a lot more training data (e.g., tens of thousands of tweets).
# The more training data, the more statistically reliable the classifier becomes.
# The only way to really know if your classifier is working correctly
# is to test it with testing data, see the documentation for Classifier.test().
classifier = KNN()
for document in corpus:
    classifier.train(document)

# These are the adjectives the classifier has learned:
print sorted(classifier.terms)
print

# We can ask it to classify texts containing those words.
# Note that you may get different results than the ones indicated below,
# since you will be mining other (more recent) tweets.
# Again, a robust classifier needs lots and lots of training data.
print classifier.classify("sweet")  # yields 'WIN'
print classifier.classify("stupid")  # yields 'FAIL'

# "What can I do with it?"
Example #15
        p = '#win' in s and 'WIN' or 'FAIL'  # document labels
        # parse tree with part-of-speech tags
        s = Sentence(parse(s))
        s = search('JJ', s)                  # adjectives in the tweet
        s = [match[0].string for match in s]  # adjectives as a list of strings
        s = " ".join(s)                      # adjectives as string
        if len(s) > 0:
            m.append(Document(s, type=p, stemmer=None))

# Train k-Nearest Neighbor on the model.
# Note that this is only a simple example: to build a robust classifier
# you would need a lot more training data (e.g., tens of thousands of tweets).
# The more training data, the more statistically reliable the classifier becomes.
# The only way to really know if your classifier is working correctly
# is to test it with testing data, see the documentation for Classifier.test().
classifier = KNN(baseline=None)  # By default, baseline=MAJORITY
# (classify unknown documents with the most frequent type).
for document in m:
    classifier.train(document)

# These are the adjectives the classifier has learned:
print(sorted(classifier.features))
print()

# We can now ask it to classify documents containing these words.
# Note that you may get different results than the ones below,
# since you will be mining other (more recent) tweets.
# Again, a robust classifier needs lots and lots of training data.
# If None is returned, the word was not recognized,
# and the classifier returned the default value (see above).
print(classifier.classify('sweet potato burger'))  # yields 'WIN'
Example #16
    ):  #searches 15*100=1500 tweets for these classes of hashtags
        p = '#win' in tweet.description.lower() and 'WIN' or 'FAIL'
        m = '#fail' in tweet.description.lower() and 'WIN' or 'FAIL'
        s = tweet.description.lower()
        s = Sentence(parse(s))  # parse() analyzes & gives strings annotated with part-of-speech tags
        s = search('JJ', s)     # searches for adjectives in tweets (JJ = adjective)
        s = [match[0].string for match in s]
        s = ' '.join(s)
        if len(s) > 0:
            corpus.append(Document(s, type=p))
            corpus.append(Document(s, type=m))

classifier = KNN()  # k-nearest neighbor (k-NN) classifier
objects = []

for document in corpus:  # each document is an unordered bag of words (here, the tweet's adjectives)

    classifier.train(document)  # the adjective vectors in the corpus train the classifier
    objects.append(classifier.classify('awesome'))  #predicts awesome as win
    objects.append(classifier.classify('cool'))  #predicts cool as win
    objects.append(classifier.classify('damn'))  #predicts damn as fail
    objects.append(classifier.classify('sucks'))  #predicts sucks as fail

print objects
wincounter = 0
failcounter = 0
for thing in objects:
Example #17
        p = '#win' in tweet.description.lower() and 'WIN' or 'FAIL'
        s = tweet.description.lower()        # tweet in lowercase
        s = Sentence(parse(s))               # parse tree with part-of-speech tags
        s = search('JJ', s)                  # adjectives in the tweet
        s = [match[0].string for match in s] # adjectives as a list of strings
        s = " ".join(s)                      # adjectives as string
        if len(s) > 0:
            corpus.append(Document(s, type=p, stemmer=None))

# Train k-nearest neighbor on the corpus.
# Note that this is only a simple example: to build a robust classifier
# you would need a lot more training data (e.g., tens of thousands of tweets).
# The more training data, the more statistically reliable the classifier becomes.
# The only way to really know if your classifier is working correctly
# is to test it with testing data, see the documentation for Classifier.test().
classifier = KNN()
for document in corpus:
    classifier.train(document)

# These are the adjectives the classifier has learned:
print sorted(classifier.terms)
print

# We can ask it to classify texts containing those words.
# Note that you may get different results than the ones indicated below,
# since you will be mining other (more recent) tweets.
# Again, a robust classifier needs lots and lots of training data.
print classifier.classify('sweet')  # yields 'WIN'
print classifier.classify('stupid') # yields 'FAIL'

# "What can I do with it?"
Example #18
# We'll reduce the document vectors to 10 concepts.

# Let's test how our model performs as a classifier.
# A document can have a label (or type, or class).
# For example, in the movie reviews corpus,
# there are positive reviews (score > 0) and negative reviews (score < 0).
# A classifier uses a model as "training" data
# to predict the label (type/class) of unlabeled documents.
# In this case, it can predict whether a new movie review is positive or negative.

# The details are not that important right now, just observe the accuracy.
# Naturally, we want accuracy to stay the same after LSA reduction,
# and hopefully decrease the time needed to run.

t = time.time()
print "accuracy:", KNN.test(m, folds=10)[-1]
print "time:", time.time() - t
print

# Reduce the documents to vectors of 10 concepts (= 1/4 of 40 features).
print "LSA reduction..."
print
m.reduce(10)

t = time.time()
print "accuracy:", KNN.test(m, folds=10)[-1]
print "time:", time.time() - t
print

# Accuracy is about the same, but the performance is better: 2x-3x faster,
# because each document is now a "10-word summary" of the original review.
Example #19
corpus = Corpus(documents)

print "number of documents:", len(corpus)
print "number of words:", len(corpus.vector)
print "number of words (average):", sum(len(d.terms) for d in corpus.documents) / float(len(corpus))
print

# This may be too many words for some clustering algorithms (e.g., hierarchical).
# We'll reduce the documents to vectors of 4 concepts.

# First, let's test how the corpus would perform as a classifier.
# The details of KNN are not that important right now, just observe the numbers.
# Naturally, we want accuracy to stay the same after LSA reduction,
# and hopefully decrease the time needed to run.
t = time.time()
print "accuracy:", KNN.test(corpus, folds=10)[-1]
print "time:", time.time() - t
print

# Reduce the documents to vectors of 4 concepts (= 1/7 of 30 words).
print "LSA reduction..."
print
corpus.reduce(4)

t = time.time()
print "accuracy:", KNN.test(corpus, folds=10)[-1]
print "time:", time.time() - t
print

# Not bad, accuracy is about the same but performance is 3x faster,
# because each document is now a "4-word summary" of the original review.
Example #20
# We'll reduce the document vectors to 10 concepts.

# Let's test how our model performs as a classifier.
# A document can have a label (or type, or class).
# For example, in the movie reviews corpus,
# there are positive reviews (score > 0) and negative reviews (score < 0).
# A classifier uses a model as "training" data
# to predict the label (type/class) of unlabeled documents.
# In this case, it can predict whether a new movie review is positive or negative.

# The details are not that important right now, just observe the accuracy.
# Naturally, we want accuracy to stay the same after LSA reduction,
# and hopefully decrease the time needed to run.

t = time.time()
print("accuracy:", KNN.test(m, folds=10)[-1])
print("time:", time.time() - t)
print()

# Reduce the documents to vectors of 10 concepts (= 1/4 of 40 features).
print("LSA reduction...")
print()
m.reduce(10)

t = time.time()
print("accuracy:", KNN.test(m, folds=10)[-1])
print("time:", time.time() - t)
print()

# Accuracy is about the same, but the performance is better: 2x-3x faster,
# because each document is now a "10-word summary" of the original review.
Example #21
        x += 1

print "ERROR"
print x / n
print t1
print t2

#print xxx


print len(corpus)
print len(corpus.features)
print len(corpus.documents[0].vector)
from time import time
t = time()
print KNN.test(corpus, folds=10)
print time()-t

print "filter..."

from time import time
t = time()
f = corpus.feature_selection(150, verbose=False)
print f
print time()-t
corpus = corpus.filter(f)

#corpus.reduce(300)
#print len(corpus.lsa.vectors[corpus.documents[0].id])
#print corpus.lsa.vectors[corpus.documents[0].id]
#print len(corpus)
Example #22
        s = tweet.text.lower()  # tweet in lowercase
        p = '#win' in s and 'WIN' or 'FAIL'  # document labels
        s = Sentence(parse(s))  # parse tree with part-of-speech tags
        s = search('JJ', s)  # adjectives in the tweet
        s = [match[0].string for match in s]  # adjectives as a list of strings
        s = " ".join(s)  # adjectives as string
        if len(s) > 0:
            m.append(Document(s, type=p, stemmer=None))

# Train k-Nearest Neighbor on the model.
# Note that this is only a simple example: to build a robust classifier
# you would need a lot more training data (e.g., tens of thousands of tweets).
# The more training data, the more statistically reliable the classifier becomes.
# The only way to really know if your classifier is working correctly
# is to test it with testing data, see the documentation for Classifier.test().
classifier = KNN(baseline=None)  # By default, baseline=MAJORITY
for document in m:  # (classify unknown documents with the most frequent type).
    classifier.train(document)

# These are the adjectives the classifier has learned:
print sorted(classifier.features)
print

# We can now ask it to classify documents containing these words.
# Note that you may get different results than the ones below,
# since you will be mining other (more recent) tweets.
# Again, a robust classifier needs lots and lots of training data.
# If None is returned, the word was not recognized,
# and the classifier returned the default value (see above).
print classifier.classify('sweet potato burger')  # yields 'WIN'
print classifier.classify('stupid autocorrect')  # yields 'FAIL'

print 'Number of Negative Tweets:', len(neg_lines)
print 'Number of Positive Tweets:', len(pos_lines)

documents = []
for line in neg_lines:
    document = Document(line, stopword=True, stemmer=PORTER, type='0')
    documents.append(document)
for line in pos_lines:
    document = Document(line, stopword=True, stemmer=PORTER, type='1')
    documents.append(document)

corpus = Corpus(documents, weight=TFIDF)
print "number of documents:", len(corpus)
print "number of words:", len(corpus.vector)
print "number of words (average):", sum(len(d.terms) for d in corpus.documents) / float(len(corpus))
print

# Filter the top 1000 features using the Information Gain criterion.
corpus = corpus.filter(features=corpus.feature_selection(top=1000, method=IG))

# To test the accuracy of the classifier, use 10-fold cross-validation.
# This yields 4 scores: Accuracy, Precision, Recall and F-score.
print 'classifying using KNN'
print '-------------------------'
print '(Accuracy, Precision, Recall, F-Measure)'
print KNN.test(corpus,k=100,folds=10,distance=COSINE)

f_neg.close()
f_pos.close()
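
# For readability, the 4-tuple printed above can be unpacked into named scores
# (a sketch assuming the same corpus and pattern.vector imports as the snippet):
accuracy, precision, recall, f1 = KNN.test(corpus, k=100, folds=10, distance=COSINE)
print 'Accuracy : %.3f' % accuracy
print 'Precision: %.3f' % precision
print 'Recall   : %.3f' % recall
print 'F-Measure: %.3f' % f1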