# Example #1  (scraped snippet separator; "0" is the source page's vote count)
# 0
def train():
    """Train the clasifier"""
    words = {}

    for user in User.objects.all():
        print 'Training for User ' + str(user.id),
        for subscription in user.subscriptions:
            interested_titles_list = []
            unlabled_titles_list = []
            for article in Article.objects(interested_users=user.id,
                                           feed_id=subscription.feed_id):
                interested_titles_list.append(article.features.title)
                unlabled_titles_list.append(article.features.title)
            for article in Article.objects(uninterested_users=user.id,
                                           feed_id=subscription.feed_id):
                unlabled_titles_list.append(article.features.title)
            words = map(get_words_in_title, interested_titles_list)
            print interested_titles_list
            classifier = PositiveNaiveBayesClassifier.train(
                words, map(get_words_in_title, unlabled_titles_list))
            subscription.classifier_object = pickle.dumps(classifier)
        try:
            user.save()
        except Exception as e:
            print 'Failed: %s' % e
        print 'Classifier Saved'
	def test_classifier(self):
		"""Train a PositiveNaiveBayesClassifier on UkrainianConflict posts
		against an unlabeled sample, classify a few held-out posts, and show
		the most informative features.
		"""
		bgram_doc = list(self.ft_db.find({"subreddit" : "UkrainianConflict"},to_tuples=True,field="bigrams"))[0]
		allbgram_doc = list(self.ft_db.find({"subreddit" : "all"}, to_tuples=True, field='bigrams'))[0]

		# Bigram vocabularies for the positive subreddit and for "all".
		# NOTE(review): neu_fts is computed but never used below -- dead?
		pos_fts = { d[0]:True for d in bgram_doc["bigrams"] }
		neu_fts = { d[0]:True for d in allbgram_doc["bigrams"] }

		ukr = []
		neu = []

		for doc, fts in self.source.find_ft({"subreddit" : "UkrainianConflict"}):
			# Prune features outside the positive vocabulary.  The original
			# reset `nomore = []` instead of appending the key (so nothing
			# was ever pruned) and deleted from fts while iterating its
			# keys, which raises RuntimeError on Python 3.  Collect the
			# stale keys first, then delete after the scan.
			nomore = [key for key in fts.keys() if key not in pos_fts]
			for n in nomore:
				del fts[n]
			if len(fts.keys()) > 0:
				ukr.append(fts)

		# Unlabeled sample: the first 6000 posts regardless of subreddit.
		for doc, fts in self.source.find_ft(limit=6000):
			neu.append(fts)

		nvb = PositiveNaiveBayesClassifier.train(ukr,neu)
		for do, fts in self.source.find_ft(skip=6000,limit=10):
			print(nvb.classify(fts))
		nvb.show_most_informative_features()

		"""ukr = []
# Example #3  (scraped snippet separator)
# 0
# Keep only the '2_x' text column from the unlabelled responses.
train_unknown_responses = unknown_responses.loc[:, ['2_x']]
# Plain Python lists of response strings for featurization below.
positive = train_pos_responses['2_x'].tolist()
unlabelled = train_unknown_responses['2_x'].tolist()


def create_features(text):
    """Turn *text* into a bag-of-words feature dict.

    Punctuation is stripped, the text lower-cased, and every word longer
    than two characters becomes a ``'contains(word)' -> True`` entry.
    """
    # One translation table mapping each punctuation character to None.
    strip_punct = str.maketrans({ch: None for ch in string.punctuation})
    tokens = text.translate(strip_punct).lower().split()
    # Bag of words, ignoring short tokens.
    return {'contains(%s)' % token: True for token in tokens if len(token) > 2}


# Featurize the positive and unlabelled response texts.
pos_features = list(map(create_features, positive))
unknown_features = list(map(create_features, unlabelled))

# Learn the model just based on positive Naive Bayes Classifier.
classifier = PositiveNaiveBayesClassifier.train(pos_features, unknown_features)
#print(classifier.classify(create_features()))

for tip in tips.iloc[:, 2].tolist():
    try:
        # NOTE(review): this assigns the SAME label to the entire "class"
        # column on every iteration, so only the last tip's classification
        # survives.  A per-row assignment was probably intended -- confirm.
        tips["class"] = classifier.classify(create_features(tip))
    except AttributeError:
        pass
print(tips)
print(np.unique(tips.loc[:, 'class']))
# Example #4  (scraped snippet separator)
# 0
    words = sentence.lower().split()
    return dict(('contains(%s)' % w, True) for w in words)


# Positive examples: the topic-425 sentence list prepared elsewhere.
data_425_sentences = topicList
# Unlabeled sample: generic sentences with no topic label.
various_sentences = [
    'The President did not comment', 'I lost the keys',
    'The team won the game', 'Sara has two kids',
    'The ball went off the court', 'They had the ball for the whole game',
    'The show is over'
]

data_425_featuresets = list(map(features, data_425_sentences))
unlabeled_featuresets = list(map(features, various_sentences))

classifier = PositiveNaiveBayesClassifier.train(data_425_featuresets,
                                                unlabeled_featuresets)

# Smoke-check two classifications (return values are discarded).
classifier.classify(features('The cat is on the table'))
classifier.classify(features('sata cable'))

#############################################################


def c_read_fileData(dataFormatter):
    """Group column-1 values by their column-2 key from *dataFormatter* rows.

    Iterates the frame's rows (index at position 0) and returns a
    defaultdict mapping each second-column key to the list of first-column
    values; the rows grouped under key 425 are printed for inspection.
    """
    grouped = defaultdict(list)
    for record in dataFormatter.itertuples():
        key, value = record[2], record[1]
        grouped[key].append(value)
    # Debug aid: show what landed under topic 425.
    print(grouped[425])
    return grouped
# Example #5  (scraped snippet separator)
# 0
# positive_featuresets: featurized sports sentences (the positive class).
# NOTE(review): under Python 3 map() returns a one-shot iterator, so the
# print loops below would exhaust it before train(); this snippet relies on
# Python 2's list-returning map().
positive_featuresets = map(features, sports_sentences)

print '\n ' 'positive_featuresets' ' full list: \n', positive_featuresets
print '\n positive_featuresets:'
for ii in positive_featuresets:
    print 'answer:', ii

# unlabeled_featuresets - A list of featuresets whose label is unknownself.
unlabeled_featuresets = map(features, various_sentences)

print '\n unlabeled_featuresets:'
for ii in unlabeled_featuresets:
    print 'answer:', ii

# To train, pass in a list of 'true' dictionaries for POS and for NEG
classifier = PositiveNaiveBayesClassifier.train(positive_featuresets,
                                                unlabeled_featuresets)

# Is the following sentence about sports?
print '\n', 'The cat is on the table --', classifier.classify(
    features('The cat is on the table'))

# What about this one?
print 'My team lost the game --', classifier.classify(
    features('My team lost the game'))

# Output

#  positive_featuresets full list:
# [{'the': True, 'dominated': True, 'game': True, 'team': True}, {'the': True, 'ball': True, 'lost': True, 'they': True}, {'the': True, 'was': True, 'game': True, 'intense': True}, {'the': True, 'ball': True, 'goalkeeper': True, 'catched': True}, {'the': True, 'other': True, 'controlled': True, 'ball': True, 'team': True}]

#  positive_featuresets:
# Example #6  (scraped snippet separator)
# 0
def getClassifier(tweetfile, cfg):
    """Train (or load a pickled) NLTK classifier over n-grams of tweetfile.

    cfg keys read: 'NLPnGrams' (n-gram degrees), 'NLPMode' (classifier
    choice), optional 'NLPTEST' (test mode: no pickle cache, tweetfile is
    already raw content), optional 'SVMOrder' (category priority for SVM).

    Returns a dict with 'class' (the trained classifier) and 'mode'
    ('nb', 'pnb', 'me', 'dt' or 'svm'); SVM mode additionally has
    'priority'.
    """
    degreesToUse = cfg['NLPnGrams']
    print "DEBOOOOO", degreesToUse, type(degreesToUse)
    # Normalize mode string: dashes/underscores become spaces.
    classMode = cfg['NLPMode'].replace('-', ' ').replace('_', ' ')
    shortClass = classMode.replace(' ', '').lower()
    loadNeeded = True

    # Outside test mode, look for a previously pickled classifier on disk.
    if 'NLPTEST' not in cfg.keys():
        degreeString = '-'.join([str(degree) for degree in degreesToUse])
        pickleFile = 'nlpTrainers/' + tweetfile.replace(
            '.csv', '.' + shortClass + degreeString + '.pickle')
        if isfile(pickleFile):
            print "Loading pickled", shortClass, "classifier"
            # NOTE(review): opened in text mode while the writer below uses
            # 'wb' -- this should probably be open(pickleFile, 'rb'); confirm.
            fileIn = open(pickleFile)
            classifier = cPickle.load(fileIn)
            fileIn.close()
            loadNeeded = False

    if loadNeeded:
        if 'NLPTEST' in cfg.keys():
            # Test mode: tweetfile already holds prepared content.
            content = prepText(tweetfile)
            categorized = prepClassifications(content)
            NGrammized = collectNGrams(categorized, degreesToUse, cfg)
        else:
            print "Loading content & preparing text"
            content = prepText(loadFile(tweetfile))
            print "Categorizing contents"
            categorized = prepClassifications(content)
            print "Deriving NGrams of length(s)", degreesToUse
            NGrammized = collectNGrams(categorized, degreesToUse, cfg)
            print "Compiling Results"
        # Flatten the per-category n-gram dict into one training list.
        readyToSend = []
        allCats = [str(key) for key in NGrammized.keys()]
        for category in allCats:
            readyToSend += NGrammized[category]

        print "Attempting Classification by mode", classMode, degreesToUse
        if classMode == 'naive bayes':
            from nltk.classify import NaiveBayesClassifier
            classifier = {
                'class': NaiveBayesClassifier.train(readyToSend),
                'mode': 'nb'
            }
        elif classMode == 'positive naive bayes':
            from nltk.classify import PositiveNaiveBayesClassifier
            classifier = {
                'class': PositiveNaiveBayesClassifier.train(readyToSend),
                'mode': 'pnb'
            }
        elif classMode == 'max ent':
            #import nltk.classify
            #from sklearn.linear_model import LogisticRegression
            #from nltk.classify import SklearnClassifier
            #classifier = {'class':LogisticRegression.train(readyToSend),'mode':'me'}
            from nltk.classify import MaxentClassifier
            classifier = {
                'class': MaxentClassifier.train(readyToSend, algorithm='iis'),
                'mode': 'me'
            }
        elif classMode == 'decision tree':
            from nltk.classify import DecisionTreeClassifier
            classifier = {
                'class': DecisionTreeClassifier.train(readyToSend),
                'mode': 'dt'
            }
        elif classMode == 'svm':
            # Category priority: explicit cfg['SVMOrder'] or a default order.
            if "SVMOrder" in cfg.keys():
                priority = cfg['SVMOrder']
            else:
                priority = "ABCDEFGHIJKLMNOPQRSTUVWXYZ9876543210"
            if type(priority) is str:
                priority = list(priority)
            # Keep only priority entries that are real categories.
            priority = [entry for entry in priority if entry in allCats]
            preppedSVM = prepSVMAll(readyToSend, priority, allCats, cfg)
            classifier = {
                'class': preppedSVM,
                'mode': 'svm',
                'priority': priority
            }
        else:
            # Unrecognized mode: fall back to plain naive Bayes.
            from nltk.classify import NaiveBayesClassifier
            classifier = {
                'class': NaiveBayesClassifier.train(readyToSend),
                'mode': 'nb'
            }

        # Cache the freshly trained classifier for the next run.
        if 'NLPTEST' not in cfg.keys():
            print "Pickling Classifier"
            fileOut = open(pickleFile, 'wb')
            cPickle.dump(classifier, fileOut)
            fileOut.close()

    if 'NLPTEST' not in cfg.keys():
        if classMode != 'svm':
            classifier['class'].show_most_informative_features(n=150)
        """else:
		for key in classifier['class'].keys():
			print classifier		
			print classifier.keys()
			classifier['class'][key].show_most_informative_features(n=150/len(classifier['class'].keys()))"""

    return classifier
# Example #7  (scraped snippet separator)
# 0
if __name__ == '__main__':
    print get_article_snippet(
        "sduhfuihsejdsddsfsdfsdf<p>njksnn</p><a>snjkksfbksdbf</a>ksjdfn", 15)
    parser = argparse.ArgumentParser(description="Accepts a URL")
    parser.add_argument(
        "--url", dest="url")  #Extracts url from command line, if available
    urls = parser.parse_args()
    if urls.url == None:
        print("No URL Specified")
        sys.exit()
    positive_examples = map(get_words_in_article, [
        'http://www.engadget.com/2012/11/16/htc-droid-dna-review/',
        'http://www.engadget.com/2012/10/08/samsung-galaxy-note-ii-review/',
        'http://www.engadget.com/2012/11/16/htc-desire-x-review/',
        'http://www.engadget.com/2012/11/16/htc-desire-x-review/'
    ])
    misc_examples = map(get_words_in_article, [
        'http://www.engadget.com/2012/11/16/sharp-aquos-sh930w-reviewed-early-in-russia-with-1080p-display/',
        'http://www.engadget.com/2012/11/15/nexus-4-backordered/',
        'http://www.engadget.com/2012/11/16/htc-windows-phone-8x-t-mobile-review/',
        'http://www.engadget.com/2012/11/16/distro-issue-66-holiday-gift-guide/',
        'http://www.engadget.com/2012/10/29/apple-macbook-pro-with-retina-display-review-13-inch/',
        'http://www.engadget.com/2012/11/17/skydrive-sdk-net-windows-phone-8/'
    ])
    classifier = PositiveNaiveBayesClassifier.train(positive_examples,
                                                    misc_examples)

    print classifier.classify(get_words_in_article(urls.url))
    classifier.show_most_informative_features()