def train(): """Train the clasifier""" words = {} for user in User.objects.all(): print 'Training for User ' + str(user.id), for subscription in user.subscriptions: interested_titles_list = [] unlabled_titles_list = [] for article in Article.objects(interested_users=user.id, feed_id=subscription.feed_id): interested_titles_list.append(article.features.title) unlabled_titles_list.append(article.features.title) for article in Article.objects(uninterested_users=user.id, feed_id=subscription.feed_id): unlabled_titles_list.append(article.features.title) words = map(get_words_in_title, interested_titles_list) print interested_titles_list classifier = PositiveNaiveBayesClassifier.train( words, map(get_words_in_title, unlabled_titles_list)) subscription.classifier_object = pickle.dumps(classifier) try: user.save() except Exception as e: print 'Failed: %s' % e print 'Classifier Saved'
def test_classifier(self): bgram_doc = list(self.ft_db.find({"subreddit" : "UkrainianConflict"},to_tuples=True,field="bigrams"))[0] allbgram_doc = list(self.ft_db.find({"subreddit" : "all"}, to_tuples=True, field='bigrams'))[0] pos_fts = { d[0]:True for d in bgram_doc["bigrams"] } neu_fts = { d[0]:True for d in allbgram_doc["bigrams"] } ukr = [] neu = [] for doc, fts in self.source.find_ft({"subreddit" : "UkrainianConflict"}): nomore = [] for key in fts.keys(): if key not in pos_fts: nomore = [] for n in nomore: del fts[n] if len(fts.keys()) > 0: ukr.append(fts) for doc, fts in self.source.find_ft(limit=6000): neu.append(fts) nvb = PositiveNaiveBayesClassifier.train(ukr,neu) for do, fts in self.source.find_ft(skip=6000,limit=10): print(nvb.classify(fts)) nvb.show_most_informative_features() """ukr = []
# Positive examples carry a known positive label; "unknown" responses form
# the unlabeled pool for positive/unlabeled training.
train_unknown_responses = unknown_responses.loc[:, ['2_x']]
positive = train_pos_responses['2_x'].tolist()
unlabelled = train_unknown_responses['2_x'].tolist()


def create_features(text):
    """Turn *text* into an NLTK bag-of-words feature dict.

    Punctuation is stripped in one ``str.translate`` pass, the text is
    lower-cased, and every word longer than two characters becomes a
    ``'contains(word)' -> True`` feature.
    """
    table = str.maketrans({key: None for key in string.punctuation})
    text = text.translate(table)
    words = text.lower().split()
    dictionary_words = dict(
        ('contains(%s)' % w, True) for w in words if len(w) > 2)
    return dictionary_words


pos_features = list(map(create_features, positive))
unknown_features = list(map(create_features, unlabelled))

# Learn the model just based on positive Naive Bayes Classifier.
classifier = PositiveNaiveBayesClassifier.train(pos_features,
                                                unknown_features)

# BUG FIX: the original assigned the scalar classification to the whole
# "class" column on every iteration, so the column ended up holding only
# the last tip's label.  Collect one label per row instead.
labels = []
for tip in tips.iloc[:, 2].tolist():
    try:
        labels.append(classifier.classify(create_features(tip)))
    except AttributeError:
        # Non-string tips (e.g. NaN) cannot be featurized; leave unlabeled.
        labels.append(None)
tips["class"] = labels
print(tips)
print(np.unique(tips.loc[:, 'class']))
def features(sentence):
    """Bag-of-words featurizer: each lower-cased word becomes a True feature.

    NOTE(review): this chunk began mid-function; the ``def`` header has been
    reconstructed from the call sites below — confirm against the original file.
    """
    words = sentence.lower().split()
    return dict(('contains(%s)' % w, True) for w in words)


data_425_sentences = topicList

# Sentences with no known label, used as the unlabeled pool.
various_sentences = [
    'The President did not comment',
    'I lost the keys',
    'The team won the game',
    'Sara has two kids',
    'The ball went off the court',
    'They had the ball for the whole game',
    'The show is over',
]

data_425_featuresets = list(map(features, data_425_sentences))
unlabeled_featuresets = list(map(features, various_sentences))
classifier = PositiveNaiveBayesClassifier.train(data_425_featuresets,
                                                unlabeled_featuresets)
classifier.classify(features('The cat is on the table'))
classifier.classify(features('sata cable'))


#############################################################
def c_read_fileData(dataFormatter):
    """Group column-1 texts by the key in column 2 of *dataFormatter* rows.

    Returns a ``defaultdict(list)`` mapping key -> list of texts; prints the
    texts stored under key 425 for debugging.
    """
    c_data = defaultdict(list)
    for row in dataFormatter.itertuples():
        # row[0] is the index; row[1] the text, row[2] the grouping key.
        c_data[row[2]].append(row[1])
    text = c_data[425]
    print(text)
    return c_data
positive_featuresets = map(features, sports_sentences) print '\n ' 'positive_featuresets' ' full list: \n', positive_featuresets print '\n positive_featuresets:' for ii in positive_featuresets: print 'answer:', ii # unlabeled_featuresets - A list of featuresets whose label is unknownself. unlabeled_featuresets = map(features, various_sentences) print '\n unlabeled_featuresets:' for ii in unlabeled_featuresets: print 'answer:', ii # To train, pass in a list of 'true' dictionaries for POS and for NEG classifier = PositiveNaiveBayesClassifier.train(positive_featuresets, unlabeled_featuresets) # Is the following sentence about sports? print '\n', 'The cat is on the table --', classifier.classify( features('The cat is on the table')) # What about this one? print 'My team lost the game --', classifier.classify( features('My team lost the game')) # Output # positive_featuresets full list: # [{'the': True, 'dominated': True, 'game': True, 'team': True}, {'the': True, 'ball': True, 'lost': True, 'they': True}, {'the': True, 'was': True, 'game': True, 'intense': True}, {'the': True, 'ball': True, 'goalkeeper': True, 'catched': True}, {'the': True, 'other': True, 'controlled': True, 'ball': True, 'team': True}] # positive_featuresets:
def getClassifier(tweetfile, cfg): degreesToUse = cfg['NLPnGrams'] print "DEBOOOOO", degreesToUse, type(degreesToUse) classMode = cfg['NLPMode'].replace('-', ' ').replace('_', ' ') shortClass = classMode.replace(' ', '').lower() loadNeeded = True if 'NLPTEST' not in cfg.keys(): degreeString = '-'.join([str(degree) for degree in degreesToUse]) pickleFile = 'nlpTrainers/' + tweetfile.replace( '.csv', '.' + shortClass + degreeString + '.pickle') if isfile(pickleFile): print "Loading pickled", shortClass, "classifier" fileIn = open(pickleFile) classifier = cPickle.load(fileIn) fileIn.close() loadNeeded = False if loadNeeded: if 'NLPTEST' in cfg.keys(): content = prepText(tweetfile) categorized = prepClassifications(content) NGrammized = collectNGrams(categorized, degreesToUse, cfg) else: print "Loading content & preparing text" content = prepText(loadFile(tweetfile)) print "Categorizing contents" categorized = prepClassifications(content) print "Deriving NGrams of length(s)", degreesToUse NGrammized = collectNGrams(categorized, degreesToUse, cfg) print "Compiling Results" readyToSend = [] allCats = [str(key) for key in NGrammized.keys()] for category in allCats: readyToSend += NGrammized[category] print "Attempting Classification by mode", classMode, degreesToUse if classMode == 'naive bayes': from nltk.classify import NaiveBayesClassifier classifier = { 'class': NaiveBayesClassifier.train(readyToSend), 'mode': 'nb' } elif classMode == 'positive naive bayes': from nltk.classify import PositiveNaiveBayesClassifier classifier = { 'class': PositiveNaiveBayesClassifier.train(readyToSend), 'mode': 'pnb' } elif classMode == 'max ent': #import nltk.classify #from sklearn.linear_model import LogisticRegression #from nltk.classify import SklearnClassifier #classifier = {'class':LogisticRegression.train(readyToSend),'mode':'me'} from nltk.classify import MaxentClassifier classifier = { 'class': MaxentClassifier.train(readyToSend, algorithm='iis'), 'mode': 'me' } elif classMode 
== 'decision tree': from nltk.classify import DecisionTreeClassifier classifier = { 'class': DecisionTreeClassifier.train(readyToSend), 'mode': 'dt' } elif classMode == 'svm': if "SVMOrder" in cfg.keys(): priority = cfg['SVMOrder'] else: priority = "ABCDEFGHIJKLMNOPQRSTUVWXYZ9876543210" if type(priority) is str: priority = list(priority) priority = [entry for entry in priority if entry in allCats] preppedSVM = prepSVMAll(readyToSend, priority, allCats, cfg) classifier = { 'class': preppedSVM, 'mode': 'svm', 'priority': priority } else: from nltk.classify import NaiveBayesClassifier classifier = { 'class': NaiveBayesClassifier.train(readyToSend), 'mode': 'nb' } if 'NLPTEST' not in cfg.keys(): print "Pickling Classifier" fileOut = open(pickleFile, 'wb') cPickle.dump(classifier, fileOut) fileOut.close() if 'NLPTEST' not in cfg.keys(): if classMode != 'svm': classifier['class'].show_most_informative_features(n=150) """else: for key in classifier['class'].keys(): print classifier print classifier.keys() classifier['class'][key].show_most_informative_features(n=150/len(classifier['class'].keys()))""" return classifier
if __name__ == '__main__': print get_article_snippet( "sduhfuihsejdsddsfsdfsdf<p>njksnn</p><a>snjkksfbksdbf</a>ksjdfn", 15) parser = argparse.ArgumentParser(description="Accepts a URL") parser.add_argument( "--url", dest="url") #Extracts url from command line, if available urls = parser.parse_args() if urls.url == None: print("No URL Specified") sys.exit() positive_examples = map(get_words_in_article, [ 'http://www.engadget.com/2012/11/16/htc-droid-dna-review/', 'http://www.engadget.com/2012/10/08/samsung-galaxy-note-ii-review/', 'http://www.engadget.com/2012/11/16/htc-desire-x-review/', 'http://www.engadget.com/2012/11/16/htc-desire-x-review/' ]) misc_examples = map(get_words_in_article, [ 'http://www.engadget.com/2012/11/16/sharp-aquos-sh930w-reviewed-early-in-russia-with-1080p-display/', 'http://www.engadget.com/2012/11/15/nexus-4-backordered/', 'http://www.engadget.com/2012/11/16/htc-windows-phone-8x-t-mobile-review/', 'http://www.engadget.com/2012/11/16/distro-issue-66-holiday-gift-guide/', 'http://www.engadget.com/2012/10/29/apple-macbook-pro-with-retina-display-review-13-inch/', 'http://www.engadget.com/2012/11/17/skydrive-sdk-net-windows-phone-8/' ]) classifier = PositiveNaiveBayesClassifier.train(positive_examples, misc_examples) print classifier.classify(get_words_in_article(urls.url)) classifier.show_most_informative_features()