def train(): """Train the clasifier""" words = {} for user in User.objects.all(): print 'Training for User ' + str(user.id), for subscription in user.subscriptions: interested_titles_list = [] unlabled_titles_list = [] for article in Article.objects(interested_users=user.id, feed_id=subscription.feed_id): interested_titles_list.append(article.features.title) unlabled_titles_list.append(article.features.title) for article in Article.objects(uninterested_users=user.id, feed_id=subscription.feed_id): unlabled_titles_list.append(article.features.title) words = map(get_words_in_title, interested_titles_list) print interested_titles_list classifier = PositiveNaiveBayesClassifier.train( words, map(get_words_in_title, unlabled_titles_list)) subscription.classifier_object = pickle.dumps(classifier) try: user.save() except Exception as e: print 'Failed: %s' % e print 'Classifier Saved'
def test_classifier(self): bgram_doc = list(self.ft_db.find({"subreddit" : "UkrainianConflict"},to_tuples=True,field="bigrams"))[0] allbgram_doc = list(self.ft_db.find({"subreddit" : "all"}, to_tuples=True, field='bigrams'))[0] pos_fts = { d[0]:True for d in bgram_doc["bigrams"] } neu_fts = { d[0]:True for d in allbgram_doc["bigrams"] } ukr = [] neu = [] for doc, fts in self.source.find_ft({"subreddit" : "UkrainianConflict"}): nomore = [] for key in fts.keys(): if key not in pos_fts: nomore = [] for n in nomore: del fts[n] if len(fts.keys()) > 0: ukr.append(fts) for doc, fts in self.source.find_ft(limit=6000): neu.append(fts) nvb = PositiveNaiveBayesClassifier.train(ukr,neu) for do, fts in self.source.find_ft(skip=6000,limit=10): print(nvb.classify(fts)) nvb.show_most_informative_features() """ukr = []
def train(spam_words, unlabeled_words):
    """Fit a positive-only Naive Bayes model and persist it to disk.

    Each entry of *spam_words* (positive examples) and *unlabeled_words* is
    converted with ``features``.  The classifier is trained with 0.5 as the
    third positional argument, which is the positive-class prior in NLTK's
    API.  The model is written through ``PickleData('bayesmodel.pickle')``
    and then returned.
    """
    positive_sets = [features(entry) for entry in spam_words]
    unlabeled_sets = [features(entry) for entry in unlabeled_words]
    model = PositiveNaiveBayesClassifier.train(
        positive_sets, unlabeled_sets, 0.5)
    PickleData('bayesmodel.pickle').write(model)
    return model
def main():
    """Classify every sentence in the cursor and dump the positive hits.

    A classifier is trained from the module-level ``matches`` (positive) and
    ``nomatches`` (unlabeled) collections.  Each document body from
    ``cursor`` is split into sentences; every sentence the classifier labels
    positive is recorded as a pre-serialised string holding the document's
    ``subreddit_id`` and the sentence's ``commonfeatures``.  The collected
    records are written to the path in ``finaloutput``.
    """
    positive_featuresets = list(map(features, matches))
    unlabeled_featuresets = list(map(features, nomatches))
    classifier = PositiveNaiveBayesClassifier.train(positive_featuresets,
                                                    unlabeled_featuresets)

    # The cursor may already have been consumed; start from the beginning.
    cursor.rewind()
    events = []
    for doc in cursor:
        for sent in sent_tokenize(doc['body']):
            blob = TextBlob(sent)
            if classifier.classify(features(blob)):
                events.append(
                    '{ "subreddit_id":' + json.dumps(doc['subreddit_id']) +
                    ',"features":' + json.dumps(commonfeatures(str(blob))) +
                    '}')

    # NOTE(review): the file content is a JSON list of strings wrapped in
    # braces, which is not itself valid JSON.  The layout is kept unchanged
    # because downstream readers may depend on it - confirm before changing.
    with open(finaloutput, 'w') as f:
        f.write('{')
        f.write(json.dumps(events))
        f.write('}')
        f.write('\n')
def train(): """Train the clasifier""" words = {} for user in User.objects.all(): print "Training for User " + str(user.id), for subscription in user.subscriptions: interested_titles_list = [] unlabled_titles_list = [] for article in Article.objects(interested_users=user.id, feed_id=subscription.feed_id): interested_titles_list.append(article.features.title) unlabled_titles_list.append(article.features.title) for article in Article.objects(uninterested_users=user.id, feed_id=subscription.feed_id): unlabled_titles_list.append(article.features.title) words = map(get_words_in_title, interested_titles_list) print interested_titles_list classifier = PositiveNaiveBayesClassifier.train(words, map(get_words_in_title, unlabled_titles_list)) subscription.classifier_object = pickle.dumps(classifier) try: user.save() except Exception as e: print "Failed: %s" % e print "Classifier Saved"
Function: Gets the article only version of the URL using Instapaper. Extracts the text in the artcile and removes any non AlphaNumeric characters in the text Returns a list of words in the article present in the URL.''' html_data = BeautifulSoup(urllib.urlopen( "http://www.instapaper.com/m?%s" % urllib.urlencode({'u':url})).read()) #URLencoding the url to pass it to Instapaper html_data = html_data.find("body") #Using only the contents in HTML <body> tag, avoides Javascript from being treated as text. pattern = re.compile('[\W_ ]+') #Compile regex for alphanumeric characters and spaces(for multiword strings). words = html_data.findAll(text=True) #setting text to True to extract only the text in the <body> word_list = [] #Stores the list of words for word in words[30:]: #Removing redundant content from Instapaper Mobilizer headers for w in word.split(" "): #splitting on spcae for multiword strings wd = (pattern.sub('',w.lower())) #substituing non alphanumeric characters with '' if len(wd) > 1 : word_list.append(wd)#exclude strings of less than 2 characters filtered_words = [w for w in word_list if not w in nltk.corpus.stopwords.words('english')] return dict((word,True) for word in word_list) if __name__ == '__main__': print get_article_snippet("sduhfuihsejdsddsfsdfsdf<p>njksnn</p><a>snjkksfbksdbf</a>ksjdfn",15) parser = argparse.ArgumentParser(description = "Accepts a URL") parser.add_argument("--url",dest = "url") #Extracts url from command line, if available urls = parser.parse_args() if urls.url == None: print ("No URL Specified") sys.exit() positive_examples = map(get_words_in_article, ['http://www.engadget.com/2012/11/16/htc-droid-dna-review/', 'http://www.engadget.com/2012/10/08/samsung-galaxy-note-ii-review/', 'http://www.engadget.com/2012/11/16/htc-desire-x-review/', 'http://www.engadget.com/2012/11/16/htc-desire-x-review/']) misc_examples = map(get_words_in_article, 
['http://www.engadget.com/2012/11/16/sharp-aquos-sh930w-reviewed-early-in-russia-with-1080p-display/', 'http://www.engadget.com/2012/11/15/nexus-4-backordered/', 'http://www.engadget.com/2012/11/16/htc-windows-phone-8x-t-mobile-review/', 'http://www.engadget.com/2012/11/16/distro-issue-66-holiday-gift-guide/', 'http://www.engadget.com/2012/10/29/apple-macbook-pro-with-retina-display-review-13-inch/', 'http://www.engadget.com/2012/11/17/skydrive-sdk-net-windows-phone-8/']) classifier = PositiveNaiveBayesClassifier.train(positive_examples,misc_examples) print classifier.classify(get_words_in_article(urls.url)) classifier.show_most_informative_features()
def getClassifier(tweetfile,cfg): degreesToUse = cfg['NLPnGrams'] print "DEBOOOOO", degreesToUse, type(degreesToUse) classMode = cfg['NLPMode'].replace('-',' ').replace('_',' ') shortClass = classMode.replace(' ','').lower() loadNeeded = True if 'NLPTEST' not in cfg.keys(): degreeString = '-'.join([str(degree) for degree in degreesToUse]) pickleFile = 'nlpTrainers/'+tweetfile.replace('.csv','.'+shortClass+degreeString+'.pickle') if isfile(pickleFile): print "Loading pickled", shortClass, "classifier" fileIn = open(pickleFile) classifier = cPickle.load(fileIn) fileIn.close() loadNeeded = False if loadNeeded: if 'NLPTEST'in cfg.keys(): content = prepText(tweetfile) categorized = prepClassifications(content) NGrammized = collectNGrams(categorized,degreesToUse,cfg) else: print "Loading content & preparing text" content = prepText(loadFile(tweetfile)) print "Categorizing contents" categorized = prepClassifications(content) print "Deriving NGrams of length(s)", degreesToUse NGrammized = collectNGrams(categorized,degreesToUse,cfg) print "Compiling Results" readyToSend = [] allCats = [str(key) for key in NGrammized.keys()] for category in allCats: readyToSend += NGrammized[category] print "Attempting Classification by mode", classMode, degreesToUse if classMode == 'naive bayes': from nltk.classify import NaiveBayesClassifier classifier = {'class':NaiveBayesClassifier.train(readyToSend),'mode':'nb'} elif classMode == 'positive naive bayes': from nltk.classify import PositiveNaiveBayesClassifier classifier = {'class':PositiveNaiveBayesClassifier.train(readyToSend),'mode':'pnb'} elif classMode == 'max ent': #import nltk.classify #from sklearn.linear_model import LogisticRegression #from nltk.classify import SklearnClassifier #classifier = {'class':LogisticRegression.train(readyToSend),'mode':'me'} from nltk.classify import MaxentClassifier classifier = {'class':MaxentClassifier.train(readyToSend,algorithm='iis'),'mode':'me'} elif classMode == 'decision tree': from 
nltk.classify import DecisionTreeClassifier classifier = {'class':DecisionTreeClassifier.train(readyToSend),'mode':'dt'} elif classMode == 'svm': if "SVMOrder" in cfg.keys(): priority = cfg['SVMOrder'] else: priority = "ABCDEFGHIJKLMNOPQRSTUVWXYZ9876543210" if type(priority) is str: priority = list(priority) priority = [entry for entry in priority if entry in allCats] preppedSVM = prepSVMAll(readyToSend,priority,allCats,cfg) classifier = {'class':preppedSVM,'mode':'svm','priority':priority} else: from nltk.classify import NaiveBayesClassifier classifier = {'class':NaiveBayesClassifier.train(readyToSend),'mode':'nb'} if 'NLPTEST' not in cfg.keys(): print "Pickling Classifier" fileOut = open(pickleFile, 'wb') cPickle.dump(classifier, fileOut) fileOut.close() if 'NLPTEST' not in cfg.keys(): if classMode != 'svm': classifier['class'].show_most_informative_features(n=150) """else: for key in classifier['class'].keys(): print classifier print classifier.keys() classifier['class'][key].show_most_informative_features(n=150/len(classifier['class'].keys()))""" return classifier
def get_data(path, label):
    """Return one featureset per file matching the glob pattern *path*.

    *label* is accepted for readability at the call site but is not used.
    """
    featuresets = []
    for fname in glob.glob(path):
        # Close each file promptly; the directory may hold many examples.
        with open(fname) as handle:
            featuresets.append(features(handle.read()))
    return featuresets


def features(sentence):
    """Map *sentence* to a bag-of-words dict ``{lowercased_word: True}``."""
    return dict((w, True) for w in sentence.lower().split())


print("Extracting relevant and irrelevant examples...")
relevant_examples = get_data("data/relevant/*", "relevant")
irrelevant_examples = get_data("data/irrelevant/*", "irrelevant")

print("Creating training set...")
featuresets = relevant_examples + irrelevant_examples
print("Featuresets: " + str(len(featuresets)))

N = 65000
# The first N featuresets form the test set, the remainder the train set.
# NOTE(review): train_set is reported below but training uses the full
# featuresets list - confirm which was intended.
train_set, test_set = featuresets[N:], featuresets[:N]
print("Train set: " + str(len(train_set)))
print("Test set: " + str(len(test_set)))

print("Training in progress...")
# NOTE(review): the irrelevant examples are passed as the positive set here;
# confirm this is deliberate.
classifier = PositiveNaiveBayesClassifier.train(irrelevant_examples,
                                                featuresets)
print("Finished training!")
classifier.show_most_informative_features()
# accuracy = nltk.classify.util.accuracy(classifier, test_set)
# print "Accuracy: " + str(accuracy)
def features(sentence):
    """Map *sentence* to ``{'contains(<word>)': True}`` for each word.

    Words are obtained by lower-casing and splitting on whitespace.
    """
    words = sentence.lower().split()
    return dict(('contains(%s)' % w, True) for w in words)


data_425_sentences = topicList
various_sentences = [
    'The President did not comment',
    'I lost the keys',
    'The team won the game',
    'Sara has two kids',
    'The ball went off the court',
    'They had the ball for the whole game',
    'The show is over',
]
data_425_featuresets = list(map(features, data_425_sentences))
# The unlabeled set was referenced below without ever being built, which
# raised NameError; it is derived from various_sentences.
unlabeled_featuresets = list(map(features, various_sentences))
classifier = PositiveNaiveBayesClassifier.train(data_425_featuresets,
                                                unlabeled_featuresets)
classifier.classify(features('The cat is on the table'))
classifier.classify(features('sata cable'))

#############################################################
c_filename = "C:\\DEEPAK\\INDIX\\classification_blind_set_corrected\\classification_blind_set_corrected.tsv"
c_df = pd.read_csv(c_filename, sep="\t", low_memory=False)
# Group first-column values by second-column value.  itertuples() puts the
# index at position 0, so [1] and [2] are the first and second columns.
c_data = defaultdict(list)
for c_row in c_df.itertuples():
    c_data[c_row[2]].append(c_row[1])
positive_featuresets = map(features, sports_sentences) print '\n ' 'positive_featuresets' ' full list: \n', positive_featuresets print '\n positive_featuresets:' for ii in positive_featuresets: print 'answer:', ii # unlabeled_featuresets - A list of featuresets whose label is unknownself. unlabeled_featuresets = map(features, various_sentences) print '\n unlabeled_featuresets:' for ii in unlabeled_featuresets: print 'answer:', ii # To train, pass in a list of 'true' dictionaries for POS and for NEG classifier = PositiveNaiveBayesClassifier.train(positive_featuresets, unlabeled_featuresets) # Is the following sentence about sports? print '\n', 'The cat is on the table --', classifier.classify( features('The cat is on the table')) # What about this one? print 'My team lost the game --', classifier.classify( features('My team lost the game')) # Output # positive_featuresets full list: # [{'the': True, 'dominated': True, 'game': True, 'team': True}, {'the': True, 'ball': True, 'lost': True, 'they': True}, {'the': True, 'was': True, 'game': True, 'intense': True}, {'the': True, 'ball': True, 'goalkeeper': True, 'catched': True}, {'the': True, 'other': True, 'controlled': True, 'ball': True, 'team': True}] # positive_featuresets:
import os
import nltk
from nltk.classify.naivebayes import NaiveBayesClassifier
from nltk.classify import PositiveNaiveBayesClassifier
from nltk.corpus.reader.plaintext import PlaintextCorpusReader


def features(sentence):
    """Map *sentence* to ``{'contains(<word>)': True}`` for each word.

    Words are obtained by lower-casing and splitting on whitespace.
    """
    words = sentence.lower().split()
    return dict(('contains(%s)' % w, True) for w in words)


def _line_featuresets(corpus, fileid):
    """Return one featureset per non-blank line of *fileid* in *corpus*.

    ``corpus.raw()`` returns a single string, so mapping ``features`` over
    it directly would yield one featureset per character.
    """
    # Assumes one example sentence per line - TODO confirm corpus layout.
    return [
        features(line)
        for line in corpus.raw(fileid).splitlines()
        if line.strip()
    ]


corpusdir = './text'
newcorpus = PlaintextCorpusReader(corpusdir, '.*')
positive_featuresets = _line_featuresets(newcorpus, 'comp.txt')
unlabeled_featuresets = _line_featuresets(newcorpus, 'animal.txt')
# .3 is the prior probability assigned to the positive class.
classifier = PositiveNaiveBayesClassifier.train(positive_featuresets,
                                                unlabeled_featuresets, .3)
print(classifier.classify(features('.')))
'I lost the keys', 'The team won the game', 'Sara has two kids', 'The ball went off the court', 'They had the ball for the whole game', 'The show is over' ] def features(sentence): sentence = ' '.join([word for word in sentence.split() if word.lower() not in cachedStopWords]) words = sentence.lower().split() return dict(('contains(%s)' % w, True) for w in words) positive_featuresets = list(map(features, sports_sentences)) unlabeled_featuresets = list(map(features, various_sentences)) classifier = PositiveNaiveBayesClassifier.train(positive_featuresets, unlabeled_featuresets) positive_sentiments = list(map(features, posTrainFeatures)) negative_sentiments = list(map(features, negTrainFeatures)) sentimentClassifier = PositiveNaiveBayesClassifier.train(positive_sentiments, negative_sentiments) #print (classifier.classify(features('The cat is on the table'))) #print (classifier.classify(features('My team lost the game'))) referenceSets = collections.defaultdict(set) testSets = collections.defaultdict(set) positives = 0 negatives = 0 tp = 0 tn = 0 fp = 0
def main():
    """Train on the sports/various sentence sets and print one prediction.

    ``sports_sentences`` supplies the positive examples and
    ``various_sentences`` the unlabeled ones.  The label predicted for
    'My team lost the game' is printed.
    """
    sports_sets = [features(sentence) for sentence in sports_sentences]
    other_sets = [features(sentence) for sentence in various_sentences]
    model = PositiveNaiveBayesClassifier.train(sports_sets, other_sets)
    verdict = model.classify(features('My team lost the game'))
    print(verdict)
def positive_naive_bayes(pos_cursor, unlabeled_cursor):
    """Train a positive-only Naive Bayes classifier from two cursors.

    Every item of *pos_cursor* (positive examples) and *unlabeled_cursor*
    is passed whole to ``feature``.  Both cursors are fully consumed, the
    positive one first.
    """
    positive_sets = [feature(document) for document in pos_cursor]
    unlabeled_sets = [feature(document) for document in unlabeled_cursor]
    return PositiveNaiveBayesClassifier.train(positive_sets, unlabeled_sets)
def getClassifier(tweetfile, cfg): degreesToUse = cfg['NLPnGrams'] print "DEBOOOOO", degreesToUse, type(degreesToUse) classMode = cfg['NLPMode'].replace('-', ' ').replace('_', ' ') shortClass = classMode.replace(' ', '').lower() loadNeeded = True if 'NLPTEST' not in cfg.keys(): degreeString = '-'.join([str(degree) for degree in degreesToUse]) pickleFile = 'nlpTrainers/' + tweetfile.replace( '.csv', '.' + shortClass + degreeString + '.pickle') if isfile(pickleFile): print "Loading pickled", shortClass, "classifier" fileIn = open(pickleFile) classifier = cPickle.load(fileIn) fileIn.close() loadNeeded = False if loadNeeded: if 'NLPTEST' in cfg.keys(): content = prepText(tweetfile) categorized = prepClassifications(content) NGrammized = collectNGrams(categorized, degreesToUse, cfg) else: print "Loading content & preparing text" content = prepText(loadFile(tweetfile)) print "Categorizing contents" categorized = prepClassifications(content) print "Deriving NGrams of length(s)", degreesToUse NGrammized = collectNGrams(categorized, degreesToUse, cfg) print "Compiling Results" readyToSend = [] allCats = [str(key) for key in NGrammized.keys()] for category in allCats: readyToSend += NGrammized[category] print "Attempting Classification by mode", classMode, degreesToUse if classMode == 'naive bayes': from nltk.classify import NaiveBayesClassifier classifier = { 'class': NaiveBayesClassifier.train(readyToSend), 'mode': 'nb' } elif classMode == 'positive naive bayes': from nltk.classify import PositiveNaiveBayesClassifier classifier = { 'class': PositiveNaiveBayesClassifier.train(readyToSend), 'mode': 'pnb' } elif classMode == 'max ent': #import nltk.classify #from sklearn.linear_model import LogisticRegression #from nltk.classify import SklearnClassifier #classifier = {'class':LogisticRegression.train(readyToSend),'mode':'me'} from nltk.classify import MaxentClassifier classifier = { 'class': MaxentClassifier.train(readyToSend, algorithm='iis'), 'mode': 'me' } elif classMode 
== 'decision tree': from nltk.classify import DecisionTreeClassifier classifier = { 'class': DecisionTreeClassifier.train(readyToSend), 'mode': 'dt' } elif classMode == 'svm': if "SVMOrder" in cfg.keys(): priority = cfg['SVMOrder'] else: priority = "ABCDEFGHIJKLMNOPQRSTUVWXYZ9876543210" if type(priority) is str: priority = list(priority) priority = [entry for entry in priority if entry in allCats] preppedSVM = prepSVMAll(readyToSend, priority, allCats, cfg) classifier = { 'class': preppedSVM, 'mode': 'svm', 'priority': priority } else: from nltk.classify import NaiveBayesClassifier classifier = { 'class': NaiveBayesClassifier.train(readyToSend), 'mode': 'nb' } if 'NLPTEST' not in cfg.keys(): print "Pickling Classifier" fileOut = open(pickleFile, 'wb') cPickle.dump(classifier, fileOut) fileOut.close() if 'NLPTEST' not in cfg.keys(): if classMode != 'svm': classifier['class'].show_most_informative_features(n=150) """else: for key in classifier['class'].keys(): print classifier print classifier.keys() classifier['class'][key].show_most_informative_features(n=150/len(classifier['class'].keys()))""" return classifier
words = sentence.lower().split() return dict(('contains(%s)' % w, True) for w in words) data_425_sentences = topicList various_sentences = [ 'The President did not comment', 'I lost the keys', 'The team won the game', 'Sara has two kids', 'The ball went off the court', 'They had the ball for the whole game', 'The show is over' ] data_425_featuresets = list(map(features, data_425_sentences)) unlabeled_featuresets = list(map(features, various_sentences)) classifier = PositiveNaiveBayesClassifier.train(data_425_featuresets, unlabeled_featuresets) classifier.classify(features('The cat is on the table')) classifier.classify(features('sata cable')) ############################################################# def c_read_fileData(dataFormatter): c_data = defaultdict(list) for row in dataFormatter.itertuples(): c_data[row[2]].append(row[1]) text = c_data[425] print(text) return c_data
train_unknown_responses = unknown_responses.loc[:, ['2_x']]
positive = train_pos_responses['2_x'].tolist()
unlabelled = train_unknown_responses['2_x'].tolist()

# Built once: maps every punctuation character to None (i.e. deletes it).
_PUNCTUATION_TABLE = str.maketrans({key: None for key in string.punctuation})


def create_features(text):
    """Return a bag-of-words featureset for *text*.

    Punctuation is stripped, the text is lower-cased and split on
    whitespace, and every word longer than two characters becomes a
    ``'contains(<word>)': True`` entry.
    """
    # Remove all the punctuations.
    text = text.translate(_PUNCTUATION_TABLE)
    words = text.lower().split()
    # Create Bag of words.
    dictionary_words = dict(
        ('contains(%s)' % w, True) for w in words if len(w) > 2)
    return dictionary_words


def _classify_tip(tip):
    """Return the predicted label for *tip*, or None when it is not text."""
    try:
        return classifier.classify(create_features(tip))
    except AttributeError:
        # Presumably a missing value (non-string cell); leave it unlabelled.
        return None


pos_features = list(map(create_features, positive))
unknown_features = list(map(create_features, unlabelled))
# Learn the model just based on positive Naive Bayes Classifier.
classifier = PositiveNaiveBayesClassifier.train(pos_features,
                                                unknown_features)
#print(classifier.classify(create_features()))

# One label per row.  Assigning a scalar to tips["class"] inside a loop
# overwrote the whole column each time, leaving only the last tip's label.
tips["class"] = [_classify_tip(tip) for tip in tips.iloc[:, 2].tolist()]
print(tips)
# Unlabelled rows (None) are dropped so the remaining labels can be sorted.
print(np.unique(tips.loc[:, 'class'].dropna()))
if __name__ == '__main__':
    # Script entry point: train on a fixed set of review URLs (positive)
    # and miscellaneous article URLs (unlabeled), then classify the article
    # at --url and print the most informative features.
    print(get_article_snippet(
        "sduhfuihsejdsddsfsdfsdf<p>njksnn</p><a>snjkksfbksdbf</a>ksjdfn", 15))

    parser = argparse.ArgumentParser(description="Accepts a URL")
    # Extracts url from command line, if available
    parser.add_argument("--url", dest="url")
    urls = parser.parse_args()
    if urls.url is None:
        print("No URL Specified")
        sys.exit()

    # Materialise the featuresets so training receives real lists on every
    # Python version (map is lazy on Python 3).
    # NOTE(review): the htc-desire-x review URL is listed twice - confirm
    # whether the duplicate is intentional.
    positive_examples = list(map(get_words_in_article, [
        'http://www.engadget.com/2012/11/16/htc-droid-dna-review/',
        'http://www.engadget.com/2012/10/08/samsung-galaxy-note-ii-review/',
        'http://www.engadget.com/2012/11/16/htc-desire-x-review/',
        'http://www.engadget.com/2012/11/16/htc-desire-x-review/',
    ]))
    misc_examples = list(map(get_words_in_article, [
        'http://www.engadget.com/2012/11/16/sharp-aquos-sh930w-reviewed-early-in-russia-with-1080p-display/',
        'http://www.engadget.com/2012/11/15/nexus-4-backordered/',
        'http://www.engadget.com/2012/11/16/htc-windows-phone-8x-t-mobile-review/',
        'http://www.engadget.com/2012/11/16/distro-issue-66-holiday-gift-guide/',
        'http://www.engadget.com/2012/10/29/apple-macbook-pro-with-retina-display-review-13-inch/',
        'http://www.engadget.com/2012/11/17/skydrive-sdk-net-windows-phone-8/',
    ]))
    classifier = PositiveNaiveBayesClassifier.train(positive_examples,
                                                    misc_examples)
    print(classifier.classify(get_words_in_article(urls.url)))
    classifier.show_most_informative_features()