Python PositiveNaiveBayesClassifierの例、nltk.classify.PositiveNaiveBayesClassifier Pythonの例

コード例 #1

0

ファイルを表示

def train():
    """Train the clasifier"""
    words = {}

    for user in User.objects.all():
        print 'Training for User ' + str(user.id),
        for subscription in user.subscriptions:
            interested_titles_list = []
            unlabled_titles_list = []
            for article in Article.objects(interested_users=user.id,
                                           feed_id=subscription.feed_id):
                interested_titles_list.append(article.features.title)
                unlabled_titles_list.append(article.features.title)
            for article in Article.objects(uninterested_users=user.id,
                                           feed_id=subscription.feed_id):
                unlabled_titles_list.append(article.features.title)
            words = map(get_words_in_title, interested_titles_list)
            print interested_titles_list
            classifier = PositiveNaiveBayesClassifier.train(
                words, map(get_words_in_title, unlabled_titles_list))
            subscription.classifier_object = pickle.dumps(classifier)
        try:
            user.save()
        except Exception as e:
            print 'Failed: %s' % e
        print 'Classifier Saved'

コード例 #2

0

ファイルを表示

ファイル: autoclassifier.py プロジェクト: juchiyama/bigdata_fall2015

	def test_classifier(self):
		"""Train a positive-only classifier on UkrainianConflict features and print sample predictions.

		Positive examples are the featuresets returned by
		``self.source.find_ft`` for the UkrainianConflict subreddit, restricted
		to the bigrams stored for that subreddit in ``self.ft_db``. The first
		6000 featuresets from ``find_ft`` serve as the unlabeled pool, and the
		next 10 are classified and printed.
		"""
		bgram_doc = list(self.ft_db.find({"subreddit" : "UkrainianConflict"},to_tuples=True,field="bigrams"))[0]
		allbgram_doc = list(self.ft_db.find({"subreddit" : "all"}, to_tuples=True, field='bigrams'))[0]

		pos_fts = { d[0]:True for d in bgram_doc["bigrams"] }
		# NOTE(review): neu_fts is built but not used below — confirm whether
		# the unlabeled pool was meant to be filtered the same way.
		neu_fts = { d[0]:True for d in allbgram_doc["bigrams"] }

		ukr = []
		neu = []

		for doc, fts in self.source.find_ft({"subreddit" : "UkrainianConflict"}):
			# Collect the keys to drop first; deleting from a dict while
			# iterating over it is unsafe.
			nomore = [key for key in list(fts.keys()) if key not in pos_fts]
			for n in nomore:
				del fts[n]
			# Skip documents left with no positive-vocabulary features.
			if len(fts.keys()) > 0:
				ukr.append(fts)

		for doc, fts in self.source.find_ft(limit=6000):
			neu.append(fts)

		nvb = PositiveNaiveBayesClassifier.train(ukr,neu)
		for _, fts in self.source.find_ft(skip=6000,limit=10):
			print(nvb.classify(fts))
		nvb.show_most_informative_features()

		"""ukr = []

コード例 #3

0

ファイルを表示

ファイル: bayes.py プロジェクト: StevenLOL/detie

def train(spam_words, unlabeled_words):
    """Fit a positive naive Bayes model on spam examples and persist it.

    Each item of *spam_words* and *unlabeled_words* is converted with
    ``features``. The trained model is written through
    ``PickleData('bayesmodel.pickle')`` and then returned.
    """
    spam_featuresets = [features(item) for item in spam_words]
    unlabeled_featuresets = [features(item) for item in unlabeled_words]

    # 0.5 is the third positional argument; NLTK documents it as the prior
    # probability of the positive label.
    model = PositiveNaiveBayesClassifier.train(
        spam_featuresets, unlabeled_featuresets, 0.5)

    PickleData('bayesmodel.pickle').write(model)
    return model

コード例 #4

0

ファイルを表示

ファイル: classifier.py プロジェクト: juchiyama/bigdata_fall2015

def main():
	"""Classify every sentence of every document in ``cursor`` and dump the hits.

	A classifier is trained with ``matches`` as positive examples and
	``nomatches`` as the unlabeled pool. Each document body is split into
	sentences. For every sentence classified as True, a hand-built record
	holding the document's ``subreddit_id`` and the sentence's
	``commonfeatures`` is collected. The records are written to
	``finaloutput``.
	"""
	positive_featuresets = list(map(features, matches))
	unlabeled_featuresets = list(map(features, nomatches))
	classifier = PositiveNaiveBayesClassifier.train(positive_featuresets, unlabeled_featuresets)
	cursor.rewind()
	events = []
	for doc in cursor:
		for sent in sent_tokenize(doc['body']):
			blob = TextBlob(sent)
			if (classifier.classify(features(blob))) is True:
				events.append('{ "subreddit_id":' + json.dumps(doc['subreddit_id']) + ',"features":' + json.dumps(commonfeatures(str(blob))) + '}')

	# NOTE(review): the file content is '{' + a JSON array of strings + '}',
	# which is not valid JSON. The byte layout is kept unchanged here because
	# downstream readers may depend on it — confirm before changing.
	# The context manager guarantees the file is closed if a write fails.
	with open(finaloutput, 'w') as f:
		f.write('{')
		f.write(json.dumps(events))
		f.write('}')
		f.write('\n')

コード例 #5

0

ファイルを表示

ファイル: train.py プロジェクト: AkarshES/Readless

def train():
    """Train the clasifier"""
    words = {}

    for user in User.objects.all():
        print "Training for User " + str(user.id),
        for subscription in user.subscriptions:
            interested_titles_list = []
            unlabled_titles_list = []
            for article in Article.objects(interested_users=user.id, feed_id=subscription.feed_id):
                interested_titles_list.append(article.features.title)
                unlabled_titles_list.append(article.features.title)
            for article in Article.objects(uninterested_users=user.id, feed_id=subscription.feed_id):
                unlabled_titles_list.append(article.features.title)
            words = map(get_words_in_title, interested_titles_list)
            print interested_titles_list
            classifier = PositiveNaiveBayesClassifier.train(words, map(get_words_in_title, unlabled_titles_list))
            subscription.classifier_object = pickle.dumps(classifier)
        try:
            user.save()
        except Exception as e:
            print "Failed: %s" % e
        print "Classifier Saved"

コード例 #6

0

ファイルを表示

ファイル: strip_html.py プロジェクト: AkarshES/Readless

       Function: Gets the article only version of the URL using Instapaper.
       Extracts the text in the artcile and removes any non AlphaNumeric characters in the text
       Returns a list of words in the article present in the URL.'''
    html_data = BeautifulSoup(urllib.urlopen(
                     "http://www.instapaper.com/m?%s" % urllib.urlencode({'u':url})).read()) #URLencoding the url to pass it to Instapaper
    html_data = html_data.find("body") 		#Using only the contents in HTML <body> tag, avoides Javascript from being treated as text.
    pattern = re.compile('[\W_ ]+')    		#Compile regex for alphanumeric characters and spaces(for multiword strings).
    words = html_data.findAll(text=True)	#setting text to True to extract only the text in the <body>
    word_list = []				            #Stores the list of words
    for word in words[30:]:			        #Removing redundant content from Instapaper Mobilizer headers
	for w in word.split(" "):		        #splitting on spcae for multiword strings
	    wd = (pattern.sub('',w.lower()))	#substituing non alphanumeric characters with ''
	    if len(wd) > 1 : word_list.append(wd)#exclude strings of less than 2 characters
    filtered_words = [w for w in word_list if not w in nltk.corpus.stopwords.words('english')]
    return dict((word,True) for word in word_list)

if __name__ == '__main__':
    # Script entry point: train on a fixed set of article URLs, then classify
    # the URL given with --url and print the most informative features.
    print(get_article_snippet("sduhfuihsejdsddsfsdfsdf<p>njksnn</p><a>snjkksfbksdbf</a>ksjdfn",15))
    parser = argparse.ArgumentParser(description = "Accepts a URL")
    parser.add_argument("--url",dest = "url") #Extracts url from command line, if available
    urls = parser.parse_args()
    if urls.url is None:
        print ("No URL Specified")
        sys.exit()

    # NOTE(review): the htc-desire-x-review URL appears twice in the positive
    # list — confirm whether that is intentional.
    positive_urls = [
        'http://www.engadget.com/2012/11/16/htc-droid-dna-review/',
        'http://www.engadget.com/2012/10/08/samsung-galaxy-note-ii-review/',
        'http://www.engadget.com/2012/11/16/htc-desire-x-review/',
        'http://www.engadget.com/2012/11/16/htc-desire-x-review/',
    ]
    misc_urls = [
        'http://www.engadget.com/2012/11/16/sharp-aquos-sh930w-reviewed-early-in-russia-with-1080p-display/',
        'http://www.engadget.com/2012/11/15/nexus-4-backordered/',
        'http://www.engadget.com/2012/11/16/htc-windows-phone-8x-t-mobile-review/',
        'http://www.engadget.com/2012/11/16/distro-issue-66-holiday-gift-guide/',
        'http://www.engadget.com/2012/10/29/apple-macbook-pro-with-retina-display-review-13-inch/',
        'http://www.engadget.com/2012/11/17/skydrive-sdk-net-windows-phone-8/',
    ]
    # Lists (not lazy map objects) so the data can be traversed more than once.
    positive_examples = [get_words_in_article(link) for link in positive_urls]
    misc_examples = [get_words_in_article(link) for link in misc_urls]
    classifier = PositiveNaiveBayesClassifier.train(positive_examples,misc_examples)

    print(classifier.classify(get_words_in_article(urls.url)))
    classifier.show_most_informative_features()

コード例 #7

0

ファイルを表示

ファイル: TweetMatch.py プロジェクト: jschlitt84/ChatterGrabber

def getClassifier(tweetfile,cfg):
    """Train, or load from the pickle cache, the classifier selected by *cfg*.

    Reads ``cfg['NLPnGrams']`` (n-gram lengths) and ``cfg['NLPMode']``
    (classifier name; '-' and '_' are treated as spaces). Unrecognised modes
    fall back to naive Bayes. Optional keys: 'NLPTEST' (when present,
    *tweetfile* is handed to ``prepText`` directly and the pickle cache is
    neither read nor written) and 'SVMOrder' (category priority for SVM mode).

    A freshly trained result is a dict holding the trained object under
    'class', a short tag under 'mode' and, for SVM mode, the category order
    under 'priority'. A cached result is whatever the pickle file contains.
    """
    degreesToUse = cfg['NLPnGrams']
    print("DEBOOOOO %s %s" % (degreesToUse, type(degreesToUse)))
    classMode = cfg['NLPMode'].replace('-',' ').replace('_',' ')
    shortClass = classMode.replace(' ','').lower()
    isTest = 'NLPTEST' in cfg
    loadNeeded = True

    if not isTest:
        degreeString = '-'.join([str(degree) for degree in degreesToUse])
        pickleFile = 'nlpTrainers/'+tweetfile.replace('.csv','.'+shortClass+degreeString+'.pickle')
        if isfile(pickleFile):
            print("Loading pickled %s classifier" % shortClass)
            # Binary mode matches the 'wb' used when the cache is written.
            # NOTE: unpickling can execute code stored in the file; only load
            # cache files this program wrote itself.
            with open(pickleFile, 'rb') as fileIn:
                classifier = cPickle.load(fileIn)
            loadNeeded = False

    if loadNeeded:
        if isTest:
            content = prepText(tweetfile)
            categorized = prepClassifications(content)
            NGrammized = collectNGrams(categorized,degreesToUse,cfg)
        else:
            print("Loading content & preparing text")
            content = prepText(loadFile(tweetfile))
            print("Categorizing contents")
            categorized = prepClassifications(content)
            print("Deriving NGrams of length(s) %s" % (degreesToUse,))
            NGrammized = collectNGrams(categorized,degreesToUse,cfg)
            print("Compiling Results")

        # Flatten the per-category n-gram lists into one training list.
        readyToSend = []
        allCats = [str(key) for key in NGrammized.keys()]
        for category in allCats:
            readyToSend += NGrammized[category]

        print("Attempting Classification by mode %s %s" % (classMode, degreesToUse))
        if classMode == 'positive naive bayes':
            from nltk.classify import PositiveNaiveBayesClassifier
            # NOTE(review): NLTK documents PositiveNaiveBayesClassifier.train
            # as taking positive and unlabeled featuresets as two separate
            # arguments; this single-argument call looks like it would raise
            # TypeError — confirm and supply the unlabeled set.
            classifier = {'class':PositiveNaiveBayesClassifier.train(readyToSend),'mode':'pnb'}
        elif classMode == 'max ent':
            from nltk.classify import MaxentClassifier
            classifier = {'class':MaxentClassifier.train(readyToSend,algorithm='iis'),'mode':'me'}
        elif classMode == 'decision tree':
            from nltk.classify import DecisionTreeClassifier
            classifier = {'class':DecisionTreeClassifier.train(readyToSend),'mode':'dt'}
        elif classMode == 'svm':
            if 'SVMOrder' in cfg:
                priority = cfg['SVMOrder']
            else:
                priority = "ABCDEFGHIJKLMNOPQRSTUVWXYZ9876543210"
            if isinstance(priority, str):
                priority = list(priority)
            # Keep only priorities that name a category actually present.
            priority = [entry for entry in priority if entry in allCats]
            preppedSVM = prepSVMAll(readyToSend,priority,allCats,cfg)
            classifier = {'class':preppedSVM,'mode':'svm','priority':priority}
        else:
            # 'naive bayes' and any unrecognised mode both use naive Bayes.
            from nltk.classify import NaiveBayesClassifier
            classifier = {'class':NaiveBayesClassifier.train(readyToSend),'mode':'nb'}

        if not isTest:
            print("Pickling Classifier")
            with open(pickleFile, 'wb') as fileOut:
                cPickle.dump(classifier, fileOut)

    if not isTest and classMode != 'svm':
        classifier['class'].show_most_informative_features(n=150)
    # Per-key feature listing for SVM mode was disabled in the original:
    #   for key in classifier['class'].keys():
    #       print classifier
    #       print classifier.keys()
    #       classifier['class'][key].show_most_informative_features(n=150/len(classifier['class'].keys()))

    return classifier

コード例 #8

0

ファイルを表示

ファイル: PositiveNBClassifier.py プロジェクト: fayimora/FinalYearProject

def get_data(path, label):
    """Return one featureset per file matching the glob pattern *path*.

    Each matching file is read in full and converted with ``features``.
    *label* is accepted for interface compatibility but is not used.
    """
    featuresets = []
    for fname in glob.glob(path):
        # Close each file promptly instead of leaving it to the garbage
        # collector; many matching files could otherwise exhaust handles.
        with open(fname) as handle:
            featuresets.append(features(handle.read()))
    return featuresets

def features(sentence):
    """Return a dict mapping each lower-cased whitespace token of *sentence* to True."""
    return {token: True for token in sentence.lower().split()}

# Load both example sets from disk.
print("Extracting relevant and irrelevant examples...")
relevant_examples = get_data("data/relevant/*", "relevant")
irrelevant_examples = get_data("data/irrelevant/*", "irrelevant")

# Relevant examples come first in the combined list.
print("Creating training set...")
featuresets = relevant_examples + irrelevant_examples
print("Featuresets: %d" % len(featuresets))

# The first N featuresets are the test split, the rest the train split.
N = 65000
train_set, test_set = featuresets[N:], featuresets[:N]
print("Train set: %d" % len(train_set))
print("Test set: %d" % len(test_set))

# NOTE(review): the *irrelevant* examples are passed as the positive set and
# the splits above are not used for training — confirm this is intended.
print("Training in progress...")
classifier = PositiveNaiveBayesClassifier.train(irrelevant_examples, featuresets)
print("Finished training!")

classifier.show_most_informative_features()
# accuracy = nltk.classify.util.accuracy(classifier, test_set)
# print "Accuracy: " + str(accuracy)

コード例 #9

0

ファイルを表示

ファイル: indix_tc.py プロジェクト: akdeepak/Indix

def features(sentence):
    """Return ``{'contains(<token>)': True}`` for each lower-cased whitespace token."""
    featureset = {}
    for token in sentence.lower().split():
        featureset['contains(%s)' % token] = True
    return featureset

# Positive examples come from topicList; various_sentences is the unlabeled pool.
data_425_sentences = topicList
various_sentences = [ 'The President did not comment',
                       'I lost the keys',
                       'The team won the game',
                       'Sara has two kids',
                       'The ball went off the court',
                       'They had the ball for the whole game',
                       'The show is over' ]

data_425_featuresets = list(map(features, data_425_sentences))
# Build the unlabeled featuresets here so the train() call below does not
# depend on a name defined elsewhere.
unlabeled_featuresets = list(map(features, various_sentences))
classifier = PositiveNaiveBayesClassifier.train(data_425_featuresets,
                                                 unlabeled_featuresets)

# Smoke checks; the results are not stored or printed.
classifier.classify(features('The cat is on the table'))
classifier.classify(features('sata cable'))

#############################################################

c_filename =  "C:\\DEEPAK\\INDIX\\classification_blind_set_corrected\\classification_blind_set_corrected.tsv"

c_df = pd.read_csv(c_filename ,sep="\t",low_memory = False)
c_data = defaultdict(list)


# itertuples() rows start with the index, so [1] and [2] are the first two
# columns; values from the first column are grouped by the second.
for c_row in c_df.itertuples():
    c_data[c_row[2]].append(c_row [1])

コード例 #10

0

ファイルを表示

ファイル: posBayesDemo.py プロジェクト: wilmtang/StatisticalNLP

positive_featuresets = map(features, sports_sentences)

print '\n ' 'positive_featuresets' ' full list: \n', positive_featuresets
print '\n positive_featuresets:'
for ii in positive_featuresets:
    print 'answer:', ii

# unlabeled_featuresets - A list of featuresets whose label is unknownself.
unlabeled_featuresets = map(features, various_sentences)

print '\n unlabeled_featuresets:'
for ii in unlabeled_featuresets:
    print 'answer:', ii

# To train, pass in a list of 'true' dictionaries for POS and for NEG
classifier = PositiveNaiveBayesClassifier.train(positive_featuresets,
                                                unlabeled_featuresets)

# Is the following sentence about sports?
print '\n', 'The cat is on the table --', classifier.classify(
    features('The cat is on the table'))

# What about this one?
print 'My team lost the game --', classifier.classify(
    features('My team lost the game'))

# Output

#  positive_featuresets full list:
# [{'the': True, 'dominated': True, 'game': True, 'team': True}, {'the': True, 'ball': True, 'lost': True, 'they': True}, {'the': True, 'was': True, 'game': True, 'intense': True}, {'the': True, 'ball': True, 'goalkeeper': True, 'catched': True}, {'the': True, 'other': True, 'controlled': True, 'ball': True, 'team': True}]

#  positive_featuresets:

コード例 #11

0

ファイルを表示

ファイル: pcorp.py プロジェクト: pgeiss/Mouse-Disambiguation

import os
import nltk
from nltk.classify.naivebayes import NaiveBayesClassifier
from nltk.classify import PositiveNaiveBayesClassifier
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

def features(sentence):
	"""Return ``{'contains(<token>)': True}`` for each lower-cased whitespace token."""
	tokens = sentence.lower().split()
	return {'contains(%s)' % token: True for token in tokens}

corpusdir = './text'
newcorpus = PlaintextCorpusReader(corpusdir, '.*')

# raw() returns a whole file as a single string, so mapping features() over
# it directly produces one featureset per *character*. Split into non-empty
# lines so each featureset describes a line of text.
# NOTE(review): assumes one example per line in comp.txt / animal.txt — confirm.
positive_featuresets = [
	features(line)
	for line in newcorpus.raw('comp.txt').splitlines() if line.strip()
]
unlabeled_featuresets = [
	features(line)
	for line in newcorpus.raw('animal.txt').splitlines() if line.strip()
]
# .3 is the third positional argument; NLTK documents it as the prior
# probability of the positive label.
classifier = PositiveNaiveBayesClassifier.train(positive_featuresets,
	unlabeled_featuresets, .3)
print(classifier.classify(features('.')))

コード例 #12

0

ファイルを表示

ファイル: getRandomWords.py プロジェクト: navd/python

                     'I lost the keys',
                     'The team won the game',
                     'Sara has two kids',
                     'The ball went off the court',
                     'They had the ball for the whole game',
                     'The show is over'
                     ]
                    
def features(sentence):
    """Return ``{'contains(<token>)': True}`` for each non-stop-word token.

    Tokens are split on whitespace. A token is dropped when its lower-cased
    form is in ``cachedStopWords``; the remaining tokens are lower-cased.
    """
    kept = [token.lower() for token in sentence.split()
            if token.lower() not in cachedStopWords]
    featureset = {}
    for token in kept:
        featureset['contains(%s)' % token] = True
    return featureset
    
# Sports-vs-other classifier: sports sentences are the positive examples.
positive_featuresets = [features(sentence) for sentence in sports_sentences]
unlabeled_featuresets = [features(sentence) for sentence in various_sentences]
classifier = PositiveNaiveBayesClassifier.train(
    positive_featuresets, unlabeled_featuresets)

# Sentiment classifier; the negative examples are passed in the position
# that PositiveNaiveBayesClassifier.train treats as the unlabeled set.
positive_sentiments = [features(sentence) for sentence in posTrainFeatures]
negative_sentiments = [features(sentence) for sentence in negTrainFeatures]
sentimentClassifier = PositiveNaiveBayesClassifier.train(
    positive_sentiments, negative_sentiments)

#print (classifier.classify(features('The cat is on the table')))
#print (classifier.classify(features('My team lost the game')))

# Containers and counters initialised for the evaluation that follows.
referenceSets = collections.defaultdict(set)
testSets = collections.defaultdict(set)
positives = negatives = 0
tp = tn = fp = 0

コード例 #13

0

ファイルを表示

ファイル: classify.py プロジェクト: sushantwason/WatchOrNot

def main():
        """Train on the sports/various sentences and print the label of one test sentence."""
        positive_featuresets = [features(sentence) for sentence in sports_sentences]
        unlabeled_featuresets = [features(sentence) for sentence in various_sentences]
        classifier = PositiveNaiveBayesClassifier.train(
                positive_featuresets, unlabeled_featuresets)
        print(classifier.classify(features('My team lost the game')))

コード例 #14

0

ファイルを表示

ファイル: classifier.py プロジェクト: juchiyama/bigdata_fall2015

def positive_naive_bayes(pos_cursor,unlabeled_cursor):
	"""Train a PositiveNaiveBayesClassifier from two iterables of documents.

	Every item of *pos_cursor* and *unlabeled_cursor* is converted with
	``feature``; whole documents are expected, per the original note.
	"""
	positive_featuresets = [feature(doc) for doc in pos_cursor]
	unlabeled_featuresets = [feature(doc) for doc in unlabeled_cursor]
	return PositiveNaiveBayesClassifier.train(positive_featuresets, unlabeled_featuresets)

コード例 #15

0

ファイルを表示

def getClassifier(tweetfile, cfg):
    """Train, or load from the pickle cache, the classifier selected by *cfg*.

    Reads ``cfg['NLPnGrams']`` (n-gram lengths) and ``cfg['NLPMode']``
    (classifier name; '-' and '_' are treated as spaces). Unrecognised modes
    fall back to naive Bayes. Optional keys: 'NLPTEST' (when present,
    *tweetfile* is handed to ``prepText`` directly and the pickle cache is
    neither read nor written) and 'SVMOrder' (category priority for SVM mode).

    A freshly trained result is a dict holding the trained object under
    'class', a short tag under 'mode' and, for SVM mode, the category order
    under 'priority'. A cached result is whatever the pickle file contains.
    """
    degreesToUse = cfg['NLPnGrams']
    print("DEBOOOOO %s %s" % (degreesToUse, type(degreesToUse)))
    classMode = cfg['NLPMode'].replace('-', ' ').replace('_', ' ')
    shortClass = classMode.replace(' ', '').lower()
    isTest = 'NLPTEST' in cfg
    loadNeeded = True

    if not isTest:
        degreeString = '-'.join([str(degree) for degree in degreesToUse])
        pickleFile = 'nlpTrainers/' + tweetfile.replace(
            '.csv', '.' + shortClass + degreeString + '.pickle')
        if isfile(pickleFile):
            print("Loading pickled %s classifier" % shortClass)
            # Binary mode matches the 'wb' used when the cache is written.
            # NOTE: unpickling can execute code stored in the file; only load
            # cache files this program wrote itself.
            with open(pickleFile, 'rb') as fileIn:
                classifier = cPickle.load(fileIn)
            loadNeeded = False

    if loadNeeded:
        if isTest:
            content = prepText(tweetfile)
            categorized = prepClassifications(content)
            NGrammized = collectNGrams(categorized, degreesToUse, cfg)
        else:
            print("Loading content & preparing text")
            content = prepText(loadFile(tweetfile))
            print("Categorizing contents")
            categorized = prepClassifications(content)
            print("Deriving NGrams of length(s) %s" % (degreesToUse,))
            NGrammized = collectNGrams(categorized, degreesToUse, cfg)
            print("Compiling Results")

        # Flatten the per-category n-gram lists into one training list.
        readyToSend = []
        allCats = [str(key) for key in NGrammized.keys()]
        for category in allCats:
            readyToSend += NGrammized[category]

        print("Attempting Classification by mode %s %s" %
              (classMode, degreesToUse))
        if classMode == 'positive naive bayes':
            from nltk.classify import PositiveNaiveBayesClassifier
            # NOTE(review): NLTK documents PositiveNaiveBayesClassifier.train
            # as taking positive and unlabeled featuresets as two separate
            # arguments; this single-argument call looks like it would raise
            # TypeError — confirm and supply the unlabeled set.
            classifier = {
                'class': PositiveNaiveBayesClassifier.train(readyToSend),
                'mode': 'pnb'
            }
        elif classMode == 'max ent':
            from nltk.classify import MaxentClassifier
            classifier = {
                'class': MaxentClassifier.train(readyToSend, algorithm='iis'),
                'mode': 'me'
            }
        elif classMode == 'decision tree':
            from nltk.classify import DecisionTreeClassifier
            classifier = {
                'class': DecisionTreeClassifier.train(readyToSend),
                'mode': 'dt'
            }
        elif classMode == 'svm':
            if 'SVMOrder' in cfg:
                priority = cfg['SVMOrder']
            else:
                priority = "ABCDEFGHIJKLMNOPQRSTUVWXYZ9876543210"
            if isinstance(priority, str):
                priority = list(priority)
            # Keep only priorities that name a category actually present.
            priority = [entry for entry in priority if entry in allCats]
            preppedSVM = prepSVMAll(readyToSend, priority, allCats, cfg)
            classifier = {
                'class': preppedSVM,
                'mode': 'svm',
                'priority': priority
            }
        else:
            # 'naive bayes' and any unrecognised mode both use naive Bayes.
            from nltk.classify import NaiveBayesClassifier
            classifier = {
                'class': NaiveBayesClassifier.train(readyToSend),
                'mode': 'nb'
            }

        if not isTest:
            print("Pickling Classifier")
            with open(pickleFile, 'wb') as fileOut:
                cPickle.dump(classifier, fileOut)

    if not isTest and classMode != 'svm':
        classifier['class'].show_most_informative_features(n=150)
    # Per-key feature listing for SVM mode was disabled in the original:
    #   for key in classifier['class'].keys():
    #       print classifier
    #       print classifier.keys()
    #       classifier['class'][key].show_most_informative_features(n=150/len(classifier['class'].keys()))

    return classifier

コード例 #16

0

ファイルを表示

ファイル: indix_problem1.py プロジェクト: akdeepak/Indix

    words = sentence.lower().split()
    return dict(('contains(%s)' % w, True) for w in words)


# Positive examples come from topicList; various_sentences is the unlabeled pool.
data_425_sentences = topicList
various_sentences = [
    'The President did not comment',
    'I lost the keys',
    'The team won the game',
    'Sara has two kids',
    'The ball went off the court',
    'They had the ball for the whole game',
    'The show is over',
]

data_425_featuresets = [features(sentence) for sentence in data_425_sentences]
unlabeled_featuresets = [features(sentence) for sentence in various_sentences]

classifier = PositiveNaiveBayesClassifier.train(
    data_425_featuresets, unlabeled_featuresets)

# Smoke checks; the results are not stored or printed.
classifier.classify(features('The cat is on the table'))
classifier.classify(features('sata cable'))

#############################################################


def c_read_fileData(dataFormatter):
    """Group first-column values by second-column value.

    *dataFormatter* must provide ``itertuples()``; element [1] of each row is
    appended to the list stored under element [2]. The entry for key 425 is
    printed, and the grouping (a ``defaultdict(list)``) is returned.
    """
    grouped = defaultdict(list)
    for record in dataFormatter.itertuples():
        bucket = grouped[record[2]]
        bucket.append(record[1])
    # Plain indexing is kept on purpose: on a defaultdict it also creates an
    # empty entry for 425 when that key has no rows.
    print(grouped[425])
    return grouped

コード例 #17

0

ファイルを表示

# Keep only the '2_x' column of the unknown responses, as a one-column frame.
train_unknown_responses = unknown_responses.loc[:, ['2_x']]
# Plain Python lists of the '2_x' values for the positive and unknown sets.
positive = train_pos_responses.loc[:, '2_x'].tolist()
unlabelled = train_unknown_responses.loc[:, '2_x'].tolist()


def create_features(text):
    """Return ``{'contains(<word>)': True}`` for each word longer than two characters.

    ASCII punctuation (``string.punctuation``) is deleted first; the text is
    then lower-cased and split on whitespace.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    tokens = text.translate(strip_punctuation).lower().split()
    return {'contains(%s)' % token: True for token in tokens if len(token) > 2}


pos_features = list(map(create_features, positive))
unknown_features = list(map(create_features, unlabelled))

# Learn the model from positive and unlabeled examples only.
classifier = PositiveNaiveBayesClassifier.train(pos_features, unknown_features)


def _classify_tip(tip):
    """Return the classifier's label for one tip, or None when the tip is not text."""
    try:
        return classifier.classify(create_features(tip))
    except AttributeError:
        # Non-string cells have no .translate(); leave them unlabeled.
        return None


# Assign one label per row. Assigning a scalar to tips["class"] inside a loop
# would overwrite the whole column each time, leaving only the last label.
tips["class"] = [_classify_tip(tip) for tip in tips.iloc[:, 2].tolist()]
print(tips)
# Unlabeled rows are excluded so np.unique does not have to order None
# against booleans.
print(np.unique(tips.loc[:, 'class'].dropna()))

コード例 #18

0

ファイルを表示

if __name__ == '__main__':
    # Script entry point: train on a fixed set of article URLs, then classify
    # the URL given with --url and print the most informative features.
    print(get_article_snippet(
        "sduhfuihsejdsddsfsdfsdf<p>njksnn</p><a>snjkksfbksdbf</a>ksjdfn", 15))
    parser = argparse.ArgumentParser(description="Accepts a URL")
    parser.add_argument(
        "--url", dest="url")  #Extracts url from command line, if available
    urls = parser.parse_args()
    if urls.url is None:
        print("No URL Specified")
        sys.exit()

    # NOTE(review): the htc-desire-x-review URL appears twice in the positive
    # list — confirm whether that is intentional.
    positive_urls = [
        'http://www.engadget.com/2012/11/16/htc-droid-dna-review/',
        'http://www.engadget.com/2012/10/08/samsung-galaxy-note-ii-review/',
        'http://www.engadget.com/2012/11/16/htc-desire-x-review/',
        'http://www.engadget.com/2012/11/16/htc-desire-x-review/',
    ]
    misc_urls = [
        'http://www.engadget.com/2012/11/16/sharp-aquos-sh930w-reviewed-early-in-russia-with-1080p-display/',
        'http://www.engadget.com/2012/11/15/nexus-4-backordered/',
        'http://www.engadget.com/2012/11/16/htc-windows-phone-8x-t-mobile-review/',
        'http://www.engadget.com/2012/11/16/distro-issue-66-holiday-gift-guide/',
        'http://www.engadget.com/2012/10/29/apple-macbook-pro-with-retina-display-review-13-inch/',
        'http://www.engadget.com/2012/11/17/skydrive-sdk-net-windows-phone-8/',
    ]
    # Lists (not lazy map objects) so the data can be traversed more than once.
    positive_examples = [get_words_in_article(link) for link in positive_urls]
    misc_examples = [get_words_in_article(link) for link in misc_urls]
    classifier = PositiveNaiveBayesClassifier.train(positive_examples,
                                                    misc_examples)

    print(classifier.classify(get_words_in_article(urls.url)))
    classifier.show_most_informative_features()