def create_classifier():
    sample_matched = [
        "What's the difference between an ocean and a sea?",
        "What is the difference between weather and climate?",
        "What would you say about the difference between atoms and elements?",
        "How would you compare lacrosse to football?",
        "How would you compare NetSuite vs Intacct?",
        "How many km are between Iasi and Hawaii?",
        "Tell me the difference between Obama and Trump",
        "What is the difference between me and you?",
        "Elaborate on why Trump is better than Obama"
    ]
    # nchat is assumed to be the corpus alias: from nltk.corpus import nps_chat as nchat
    sample_unmatched = nchat.xml_posts()[:800]
    features_matched = []
    features_unmatched = []
    for text in sample_matched:
        # feature_select is the project's feature extractor, defined elsewhere
        features = (feature_select(text), 'matched')
        features_matched.append(features)
    for item in sample_unmatched:
        text = item.text
        features = (feature_select(text), 'unmatched')
        features_unmatched.append(features)
    train_features = features_matched + features_unmatched
    return nltk.NaiveBayesClassifier.train(train_features)
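# Usage sketch (not from the original source): assumes feature_select is the
# same extractor used at training time; labels come back as 'matched'/'unmatched'.
clf = create_classifier()
print(clf.classify(feature_select("What is the difference between tea and coffee?")))
# expected: 'matched' for comparison-style questions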
def build_model(h):
    global model
    model = Model(h)
    global history
    history = h

    # build a lower-cased unigram vocabulary over the whole history
    # (vocab is a module-level dict)
    global vocab
    for w in word_tokenize('\n'.join(h)):
        if w.lower() in vocab:
            vocab[w.lower()] += 1
        else:
            vocab[w.lower()] = 1
    # print(vocab)

    # load cached named entities, or tag the history and merge runs of
    # adjacent tokens that share a tag into multi-word entities
    global named_entities
    try:
        with open('named.pickle', 'rb') as f:
            named_entities = pickle.load(f)
    except FileNotFoundError:
        h_tokens = [word_tokenize(s) for s in h]
        tagged = tagger.tag_sents(h_tokens)
        named_entities = [tagged[0][0]]
        for n_e in tagged:
            for i in range(1, len(n_e)):
                if n_e[i][1] == n_e[i - 1][1]:
                    named_entities[-1] = (named_entities[-1][0] + ' ' + n_e[i][0], n_e[i][1])
                else:
                    named_entities.append(n_e[i])
        # print(named_entities)
        with open('named.pickle', 'wb') as f:
            pickle.dump(named_entities, f)

    generate_greeting_classifier_nps()
    # print('finding greetings')
    # greeting_classified = {s: classify_greeting(s) for s in h[:100]}
    # print('found greetings')
    global hellos, byes
    # hellos = {s: greeting_classified[s] for s in greeting_classified if greeting_classified[s] == 'Greet'}
    # byes = {s: greeting_classified[s] for s in greeting_classified if greeting_classified[s] == 'Bye'}
    hellos = {
        s.text: s.get('class')
        for s in nps_chat.xml_posts() if s.get('class') == 'Greet'
    }
    byes = {
        s.text: s.get('class')
        for s in nps_chat.xml_posts() if s.get('class') == 'Bye'
    }
    print('ready')
def _setQuestionWorld(self):
    self.dictionary = PyDictionary()
    # nchat is assumed to be the corpus alias: from nltk.corpus import nps_chat as nchat
    posts = nchat.xml_posts()[:10000]
    featuresets = [(self.dialogue_act_features(post.text), post.get('class'))
                   for post in posts]
    # hold out the first 10% for testing, as in the NLTK book's dialogue-act example
    size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[size:], featuresets[:size]
    self.classifier = nltk.NaiveBayesClassifier.train(train_set)
    self.classifier.labels()  # return value unused in the original
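# Hedged sketch (not in the original method): what could be done with the
# held-out test_set right after training, inside _setQuestionWorld:
#     print(nltk.classify.accuracy(self.classifier, test_set))  # held-out accuracy
#     print(self.classifier.labels())  # dialogue-act classes the model can emit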
def get_nltk_nps_corpus():
    posts = []
    for xml_post in nps_chat.xml_posts():
        post_one = []
        # each post element wraps a single <terminals> child whose <t> children
        # carry the token attributes; iterate the tokens, not the wrapper
        # (the original indexed post[0] and so kept only the first token)
        for t in xml_post.findall('./terminals/t'):
            at = t.attrib
            post_one.append([at['word'], at['pos']])
        posts.append(post_one)
    return posts
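# For comparison (standard NLTK API, not part of the original snippet): the
# corpus reader exposes the same token/tag pairs directly, without walking XML.
from nltk.corpus import nps_chat
tagged = nps_chat.tagged_posts()  # list of posts as (word, POS) tuples
print(tagged[0][:3])              # first few pairs of the first post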
def generate_greeting_classifier_nps():
    global greeting_classifier
    try:
        # reuse a cached classifier when one exists
        with open('greet_classifier.pickle', 'rb') as f:
            greeting_classifier = pickle.load(f)
    except FileNotFoundError:
        v = set([w.lower() for w in nps_chat.words()])
        posts = nps_chat.xml_posts()[:5000]
        # collapse every class other than Greet/Bye into 'Other'
        h = [(sentence_features(s.text.lower(), v=v),
              s.get('class') if s.get('class') in ['Greet', 'Bye'] else 'Other')
             for s in posts]
        generate_greeting_classifier(h)
        with open('greet_classifier.pickle', 'wb') as f:
            pickle.dump(greeting_classifier, f)
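# Usage sketch under the same assumptions as above (sentence_features and
# generate_greeting_classifier are the project's own helpers):
generate_greeting_classifier_nps()
v = set(w.lower() for w in nps_chat.words())
print(greeting_classifier.classify(sentence_features("see you later", v=v)))
# expected: 'Bye', or 'Other' when the model is unsure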
def build_informal_set(self):
    minlines = 6
    maxlines = 25
    labeled_sets = []
    xml_posts = nps_chat.xml_posts()
    lines = 0
    goal = random.randint(minlines, maxlines)
    builder = ""
    for msg in xml_posts:
        if ".ACTION" not in msg.text:
            builder = builder + " " + msg.text.strip()
            lines += 1
            if lines > goal:
                labeled_sets.append((self.extract_features(builder), self.informal_label))
                goal = random.randint(minlines, maxlines)
                builder = ""
                lines = 0
    return labeled_sets
def calculate_confidence_index():
    # count ynQuestion posts per file, keyed by the date prefix of the file id
    cfd = nltk.ConditionalFreqDist(
        (target, fileid[:10])
        for fileid in nps.fileids()
        for posts in nps.xml_posts(fileid)
        for target in ['ynQuestion']
        if posts.get('class') == 'ynQuestion')
    cfd.plot()
    # if(flagCount != 0 && timeElapsed != 0)
    # {
    # }else{
    # }
    print("Printing confidence index as a function "
          "of flagCount and timeElapsed")
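# cfd.plot() needs matplotlib and a display; NLTK's text fallback prints the
# same conditional counts as a table (sketch, to be called where cfd is in scope):
cfd.tabulate()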
def classify_text():
    posts = nps_chat.xml_posts()[:10000]
    featuresets = [(dialogue_act_features(post.text), post.get('class'))
                   for post in posts]
    size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[size:], featuresets[:size]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    # print(nltk.classify.accuracy(classifier, test_set))
    print(classifier.classify(dialogue_act_features("how are you, my sweet cat?")))
    print(classifier.classify(dialogue_act_features("Shut up and get out")))
    print(classifier.classify(dialogue_act_features("You are the most wonderful thing in my life")))
    print(classifier.classify(dialogue_act_features("I loved the movie")))
    print(classifier.classify(dialogue_act_features("How many suns does Jupiter have")))
    print(classifier.classify(dialogue_act_features("Do you love me?")))
def load_data():
    global N, words, labels
    posts = corpus.xml_posts()[:10000]
    # NOTE: FreqDist over a raw string counts characters; tokenize first
    # (e.g. FreqDist(word_tokenize(post.text))) if word frequencies are intended
    freqs = [FreqDist(post.text) for post in posts]
    words = list(set(word for dist in freqs for word in dist.keys()
                     if word not in ENGLISH_STOP_WORDS and word not in punctuation))
    labels = list(set(post.get('class') for post in posts))
    data = []
    N = len(words)
    for post, dist in zip(posts, freqs):
        # one fixed-length frequency vector per post
        V = Vol(1, 1, N, 0.0)
        for i, word in enumerate(words):
            V.w[i] = dist.freq(word)
        data.append((V, labels.index(post.get('class'))))
    return data
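# Sketch of the returned structure (assumes Vol is a ConvNetJS-style volume
# with a flat weight array w, as used above):
data = load_data()
V, label_idx = data[0]
print(N, len(labels))     # feature-vector length and number of dialogue-act classes
print(labels[label_idx])  # class name of the first post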
import sys

import numpy
from nltk.cluster import KMeansClusterer, GAAClusterer, euclidean_distance
import nltk.corpus
from nltk import decorators
import nltk.stem
from nltk.corpus import nps_chat

posts = nps_chat.xml_posts()
stemmer_func = nltk.stem.snowball.EnglishStemmer().stem
stopwords = set(nltk.corpus.stopwords.words('english'))


@decorators.memoize
def normalize_word(word):
    return stemmer_func(word.lower())


def get_words(titles):
    words = set()
    for title in titles:  # was job_titles in the original, which is not in scope here
        for word in title.split():
            words.add(normalize_word(word))
    return list(words)


@decorators.memoize
def vectorspaced(title):
    title_components = [normalize_word(word) for word in title.split()]
    # the excerpt was truncated here; a typical completion (assumption) builds a
    # boolean presence vector over a global word list, e.g. words = get_words(...)
    return numpy.array(
        [word in title_components and word not in stopwords for word in words],
        numpy.short)
import nltk
import random
from nltk.corpus import nps_chat
from nltk.corpus import stopwords
import pickle

stop_words = set(stopwords.words('english'))
stop_words.remove('no')
stop_words.add('...')

xml_posts_0 = nps_chat.xml_posts()
posts_0 = nps_chat.posts()
categorized_posts = []
index = 0

# label 'yAnswer' posts as 'Yes' and 'nAnswer' posts as 'No'
for el in xml_posts_0:
    if el.attrib.get('class') == 'yAnswer':
        categorized_posts.append((posts_0[index], 'Yes'))
    elif el.attrib.get('class') == 'nAnswer':
        categorized_posts.append((posts_0[index], 'No'))
    index += 1

all_words = []
for (post, category) in categorized_posts:
def make_sentence(self, augment=None, threshold=0.4):
    # avoid a mutable default argument
    augment = augment or {}
    sent = ['__BEGIN__', '__BEGIN__']
    while sent[-1] != '__END__':
        state = sent[-self.state_size:]
        counts = [a[1] for a in self.model if a[0] == state][0]
        if not counts:
            # the original fallback repeats the same lookup verbatim
            counts = [a[1] for a in self.model if a[0] == state][0]
        # zero out augment weights below the threshold
        augments = {
            a: augment[a] if augment[a] >= threshold else 0
            for a in augment
        }
        # scale transition counts by the surviving augment weights
        weights = {
            a: counts[a] * augments[a] if a in augments else counts[a]
            for a in counts
        }
        total = sum(weights.values())
        if total == 0:
            weights = {a: 1 for a in weights}
            total = sum(weights.values())
        probs = [a / total for a in weights.values()]
        draw = numpy.random.choice(list(counts.keys()), 1, p=probs)[0]
        sent.append(draw)
    detokenizer = MosesDetokenizer()
    return detokenizer.detokenize(sent[2:-1])


if __name__ == '__main__':
    chain = AugmentedChain([a.text for a in nps_chat.xml_posts()][:5000])
    print(chain.model)
    print(chain.make_sentence())
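# Hedged example of the augment parameter: keys are vocabulary tokens, values
# are multiplicative weights; anything below threshold is zeroed out.
print(chain.make_sentence(augment={'lol': 1.5}))  # bias generation toward 'lol'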
import os
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
import nltk
from nltk.corpus import nps_chat
import numpy as np
from nltk.data import load
from nltk.corpus import stopwords

posts = nltk.corpus.nps_chat.xml_posts()
label_names = np.array(
    sorted(nltk.FreqDist(p.attrib['class'] for p in posts).keys()))

chatroom = nps_chat.xml_posts()
features = []
stop_words = set(stopwords.words('english'))
stop_words.update(
    ['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}'])

for item in chatroom:
    text = nltk.word_tokenize(item.text)
    sent = nltk.pos_tag(text)
    if item.get('class') == 'whQuestion' or item.get('class') == 'ynQuestion':
        for x in sent:
            # x is a (word, tag) pair, so compare the word, not the tuple
            # (the original tested the tuple and filtered nothing)
            if x[0].lower() not in stop_words:
                features.append(x)

features = nltk.FreqDist(features)
print(features.most_common(15))
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import nps_chat as nps
import os

# twitterSamples = nltk.corpus.twitter_samples
# negTweets = twitter_samples.strings('negative_tweets.json')

teenChat = nps.xml_posts("11-08-teens_706posts.xml")
chatWords = nps.words("11-08-teens_706posts.xml")
chatBigrams = nltk.bigrams(chatWords)
cfd = nltk.ConditionalFreqDist(chatBigrams)
maxConfidence = 100

flagFile = open('flagList.txt')
flagList = flagFile.read()


def calculate_flags():
    flagNumber = 0
    tokens = nltk.word_tokenize(flagList)
    # TODO: using a list of flags to be determined,
    # iterate through posts to find instances of any flags
    cfd = nltk.ConditionalFreqDist(
        (tokens, fileid[:10])
        for fileid in nps.fileids()
        for posts in nps.words(fileid)
        for target in [tokens])  # you need a check if len(samples) < 1
except FileNotFoundError:
    categorized_sentences = []

# load up sentence clusters if found
try:
    f = open('sentence_clusters.pickle', 'rb')
    sentence_clusters = pickle.load(f)
    f.close()
except FileNotFoundError:
    sentence_clusters = []

# preprocessing nps chat corpus for sentence classification
all_words = nltk.FreqDist(w.lower() for w in nps_chat.words())
word_features = [a[0] for a in all_words.most_common()[:2000]]
sentences = [(nltk.word_tokenize(a.text.lower()), a.attrib['class'])
             for a in nps_chat.xml_posts()]

# logical response types for each input sentence type
response_types = {
    'Accept': ['Statement', 'Emotion', 'Emphasis'],
    'Bye': ['Bye'],
    'Clarify': ['Accept', 'Reject', 'Statement', 'Emphasis'],
    'Emotion': ['Accept', 'Reject', 'Statement', 'Emotion', 'Emphasis'],
    'Continuer': ['Accept', 'Reject', 'Statement', 'Emphasis'],
    'Emphasis': ['Accept', 'Reject', 'Statement', 'Emotion', 'Emphasis'],
    'Greet': ['Greet'],
    'Other': ['Statement'],
    'Reject': ['Statement', 'Emotion', 'Emphasis'],
    'Statement': ['Accept', 'Reject', 'Statement', 'Emotion', 'Emphasis'],
    'System': ['Statement'],
    'nAnswer': ['Statement', 'Emotion', 'Emphasis'],
@author: Kamalakanta
"""
import db_operations as dbo
from nltk.corpus import nps_chat as nc
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, LinearRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import train_test_split as tst
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
import matplotlib.pyplot as plt
import pickle

posts = nc.xml_posts()
categories = [
    'accept', 'statement', 'yanswer', 'clarify', 'nanswer', 'reject', 'bye',
    'greet', 'whquestion', 'ynquestion', 'command'
]

# group the dialogue-act categories into coarse numeric labels
categories_dict = {}
for i in range(len(categories)):
    if categories[i] in {'accept', 'statement', 'yanswer', 'clarify'}:
        categories_dict[categories[i]] = 0
    elif categories[i] in {'nanswer', 'reject'}:
        categories_dict[categories[i]] = 1
    elif categories[i] == 'bye':
        categories_dict[categories[i]] = 2
    elif categories[i] == 'greet':