def quicktree(sentence): """Parse a sentence and return a visual representation in IPython""" import os from nltk import Tree from nltk.draw.util import CanvasFrame from nltk.draw import TreeWidget from stat_parser import Parser try: from IPython.display import display from IPython.display import Image except: pass try: get_ipython().getoutput() except TypeError: have_ipython = True except NameError: import subprocess have_ipython = False parser = Parser() parsed = parser.parse(sentence) cf = CanvasFrame() tc = TreeWidget(cf.canvas(),parsed) cf.add_widget(tc,10,10) # (10,10) offsets cf.print_to_file('tree.ps') cf.destroy() if have_ipython: tregex_command = 'convert tree.ps tree.png' result = get_ipython().getoutput(tregex_command) else: tregex_command = ["convert", "tree.ps", "tree.png"] result = subprocess.check_output(tregex_command) os.remove("tree.ps") return Image(filename='tree.png') os.remove("tree.png")
def quicktree(sentence): """Parse a sentence and return a visual representation in IPython""" import os from nltk import Tree from nltk.draw.util import CanvasFrame from nltk.draw import TreeWidget from stat_parser import Parser try: from IPython.display import display from IPython.display import Image except: pass try: get_ipython().getoutput() except TypeError: have_ipython = True except NameError: import subprocess have_ipython = False parser = Parser() parsed = parser.parse(sentence) cf = CanvasFrame() tc = TreeWidget(cf.canvas(), parsed) cf.add_widget(tc, 10, 10) # (10,10) offsets cf.print_to_file('tree.ps') cf.destroy() if have_ipython: tregex_command = 'convert tree.ps tree.png' result = get_ipython().getoutput(tregex_command) else: tregex_command = ["convert", "tree.ps", "tree.png"] result = subprocess.check_output(tregex_command) os.remove("tree.ps") return Image(filename='tree.png') os.remove("tree.png")
def parse_sentence(my_sentence):
    '''
    Generate nonterminal rules using a stochastic sentence parser

    Parameters
    ----------
    my_sentence : str
        A single sentence (str)
    '''
    parser = Parser()
    parsee = parser.parse(my_sentence)
    rules = ""
    # possibly add: brackets, double quotes
    for production in parsee.productions():
        if not is_terminal(production.rhs()[0]):
            rules += str(production) + '\n'
    # now re-tag special characters
    swappairs = zip(to_replace, replacements)
    for member in swappairs:
        rules = rules.replace(member[0], member[1])
    return rules
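# parse_sentence relies on three names it does not define. A plausible
# sketch, assuming NLTK's grammar helper and Penn Treebank bracket tags as
# the swap table (the exact pairs here are an assumption, not the original
# author's):
from nltk.grammar import is_terminal

to_replace = ['-LRB-', '-RRB-']  # assumed: Penn Treebank escapes for ( and )
replacements = ['(', ')']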
def best_candidate(Sentence, Question):
    #Sentence = 'Notre Dame\'s most recent when?'
    #Sentence = Sentence.replace('[', ' ')
    #Sentence = Sentence.replace(']', ' ')
    #print "Sentence: " + Sentence
    #print Question
    key = get_md5(Sentence)
    if key in parse_cache:
        print "hit"
        tree = parse_cache[key]
    else:
        try:
            parser = Parser()
            tree = parser.parse(Sentence)
            parse_cache[key] = tree
        except:
            return " "
    list1 = []
    list2 = []
    traverseTree(tree, list1, list2, Question.split())
    min_overlap = min(list2)
    num = [[i, len(list1[i])] for i in range(len(list1)) if list2[i] == min_overlap]
    s = sorted(num, key=lambda x: -x[1])
    return " ".join(list1[s[0][0]])
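# `get_md5` and `parse_cache` are used but not defined above; a minimal
# sketch of what they plausibly look like (an assumption, not the original
# helpers):
import hashlib

parse_cache = {}

def get_md5(text):
    # hash the sentence so identical sentences reuse a cached parse tree
    return hashlib.md5(text.encode('utf-8')).hexdigest()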
def question_noun_phrase(query):
    if len(query.split()) <= 1:
        return query
    query = oclock_remover(query)
    query = benedict_remover(query)
    parser = Parser()
    tree = parser.parse(query)
    noun_labels = ('NP', 'NP+NP', 'NX+NX', 'NX+NP', 'NP+NX', 'FRAG', 'NX')
    if tree.label() in noun_labels:
        # the whole query is the noun phrase
        noun_phrase = tree.leaves()
        # remove the article from the beginning
        if noun_phrase and noun_phrase[0] in ('a', 'an', 'the'):
            del noun_phrase[0]
        #print noun_phrase
        return ' '.join(noun_phrase)
    # otherwise look for the longest NP inside an SBAR clause
    for element in [tree] + [e for e in tree]:  # include the root element
        if "SBAR" in element.label():
            for subtree in element.subtrees():
                if "W" in subtree.label():
                    noun_phrase = []
                    for noun_subtree in element.subtrees():
                        if "SBAR" not in noun_subtree.label() \
                                and "W" not in noun_subtree.label() \
                                and "NP" in noun_subtree.label() \
                                and len(noun_subtree.leaves()) > len(noun_phrase):
                            noun_phrase = noun_subtree.leaves()
                    # remove the article from the beginning
                    if noun_phrase and noun_phrase[0] in ('a', 'an', 'the'):
                        del noun_phrase[0]
                    return ' '.join(noun_phrase)
    return ""
def ret_tree(sentence, rep, model):
    parser = Parser()
    tree_list = []
    tree = parser.parse(sentence)
    #tree.draw()

    def compute_tree_list(t, root_ptr1, rep, model):
        if len(t.leaves()) == 1:
            # a single leaf becomes a terminal Node
            l = t.leaves()
            return Node(l[0])
        else:
            subts = list(t)
            left_id = root_ptr1 + 1
            right_id = root_ptr1 * 2
            left_tree = compute_tree_list(subts[0], left_id, rep, model)
            right_tree = compute_tree_list(subts[1], right_id, rep, model)
            if isinstance(left_tree, Node):
                left_id = left_tree.left
                # store the word embedding as a 300x1 column vector
                rep[left_id] = np.transpose(model[left_id]).reshape([300, 1])
            if isinstance(right_tree, Node):
                right_id = right_tree.left
                w = model.most_similar(positive=right_id, topn=1)
                rep[right_id] = np.transpose(model[right_id]).reshape([300, 1])
            tree_list.append({"ip1": left_id, "ip2": right_id, "op": root_ptr1})

    compute_tree_list(tree, 10000, rep, model)
    print "Tree List", tree_list
    return tree_list, rep
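# `Node` is an external class ret_tree assumes; a minimal shape that would
# satisfy the attribute access above (pure guesswork as to the original
# definition):
class Node(object):
    def __init__(self, left, right=None, is_pair=False):
        self.left = left      # for a leaf node, the word itself
        self.right = right
        self.is_pair = is_pair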
def chat_with_robo():
    parser = Parser()
    flag = True
    print("The instructions for talking with me: \n",
          "If you want to finish the conversation, please type thanks or bye.\n")
    print("ROBO: Hi, my name is Robo.")
    while flag:
        message = input()
        message = message.lower()
        if message != 'bye':
            # Analyzing the input
            print('\nvocabulary: ', nltk.word_tokenize(message))
            print('\nword frequency: ',
                  nltk.FreqDist(nltk.word_tokenize(message)).most_common(10))
            # -----------
            # add part-of-speech tags to text
            # -----------
            # Tagging message with basic nltk tokenizer.
            # Note: it has trouble identifying the pronoun 'I'; it tags it as a noun.
            print(nltk.pos_tag(nltk.word_tokenize(message)))
            # Parsing the message. With trace=1, a parser such as
            # nltk.RecursiveDescentParser would report each step it takes.
            # rd_parser = nltk.RecursiveDescentParser(grammar, trace=1)
            rd_parser = parser.parse(message)
            i = 1
            wrong_syntax = 1
            for tree_struc in rd_parser:
                print(str(i) + ' tree_struc: ', tree_struc)
                wrong_syntax = 0
                print("\n Correct Grammar")
                i += 1
            if wrong_syntax == 1:
                print("\n Wrong Grammar")
            # write_output_file(...
        else:
            flag = False
            print("ROBO: Bye! take care..")
def quicktree(sentence):
    """Parse a sentence and return a visual representation"""
    from nltk import Tree
    from nltk.draw.util import CanvasFrame
    from nltk.draw import TreeWidget
    from stat_parser import Parser
    from IPython.display import display
    from IPython.display import Image
    parser = Parser()
    parsed = parser.parse(sentence)
    cf = CanvasFrame()
    tc = TreeWidget(cf.canvas(), parsed)
    cf.add_widget(tc, 10, 10)  # (10,10) offsets
    cf.print_to_file('tree.ps')
    cf.destroy()
def main():
    text = "Smoking Mothers May Alter the DNA of Their Children."
    parser = Parser()
    tree = parser.parse(text)
    print "Parse Tree:\n" + str(tree) + "\n"
    phrasesTree = extractTaggedPhrases(tree, 'NP')
    print "Extracted Phrases:\n" + str(phrasesTree) + "\n"
    phrases = []
    for phrase in phrasesTree:
        phrases.append(" ".join(phrase.leaves()))
    imagesDict = buildImagesDict(phrases)
    for phrase, images in imagesDict.iteritems():
        print phrase + ":"
        print "\n".join([image for image in images])
        print
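# `extractTaggedPhrases` is not shown; a plausible sketch using NLTK's
# Tree.subtrees filter (an assumption about the original helper):
def extractTaggedPhrases(tree, tag):
    # collect every subtree whose label matches the requested tag, e.g. 'NP'
    return [subtree for subtree in tree.subtrees(lambda t: t.label() == tag)]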
def delegate(task_queue, completed_queue):
    graph = Graph()
    parser = Parser()
    while True:
        try:
            sentence = task_queue.get(False)
        except:
            completed_queue.put(graph)
            print "My work here is done"
            return True
        print "Parsing sentence"
        parsed = parser.parse(sentence)
        print "Adding sentence to graph"
        # graph.update(parsed)
        print "Added"
def generate(filename, word_limit=None):
    global syntaxes
    parser = Parser()
    if not os.path.exists(SYNTAXES_FILE):
        # sents = nltk.corpus.gutenberg.sents('results.txt')
        # NOTE: results.txt is a big file of raw text not included in
        # source control; provide your own corpus.
        with codecs.open(filename, encoding='utf-8') as corpus:
            sents = nltk.sent_tokenize(corpus.read())
        if word_limit:
            sents = [sent for sent in sents if len(sent) < word_limit]
        sent_limit = min(1500, len(sents))
        sents = sents[0:sent_limit]
        for sent in tqdm(sents):
            try:
                parsed = parser.parse(sent)
            except TypeError:
                pass
            syntax_signature(parsed, save=True)
        with open(SYNTAXES_FILE, 'wb+') as pickle_file:
            pickle.dump(syntaxes, pickle_file)
    else:
        with open(SYNTAXES_FILE, 'rb+') as pickle_file:
            syntaxes = pickle.load(pickle_file)
    if not os.path.exists(CFDS_FILE):
        with codecs.open(filename, encoding='utf-8') as corpus:
            cfds = [make_cfd(corpus.read(), i, exclude_punctuation=False,
                             case_insensitive=True) for i in range(2, 5)]
        with open(CFDS_FILE, 'wb+') as pickle_file:
            pickle.dump(cfds, pickle_file)
    else:
        with open(CFDS_FILE, 'rb+') as pickle_file:
            cfds = pickle.load(pickle_file)
    sents = nltk.corpus.gutenberg.sents('austen-emma.txt')
    if word_limit:
        sents = [sent for sent in sents if len(sent) < word_limit]
    sent = random.choice(sents)
    parsed = parser.parse(' '.join(sent))
    print(parsed)
    print(' '.join(parsed.leaves()))
    replaced_tree = tree_replace(parsed, cfds, [])
    print('=' * 30)
    print(' '.join(replaced_tree.leaves()))
    print(replaced_tree)
def generate():
    global syntaxes
    parser = Parser()
    if not os.path.exists(SYNTAXES_FILE):
        # sents = nltk.corpus.gutenberg.sents('results.txt')
        # NOTE: results.txt is a big file of raw text not included in
        # source control; provide your own corpus.
        with codecs.open('results.txt', encoding='utf-8') as corpus:
            sents = nltk.sent_tokenize(corpus.read())
        sents = [sent for sent in sents if len(sent) < 150][0:1500]
        for sent in tqdm(sents):
            try:
                parsed = parser.parse(sent)
            except TypeError:
                pass
            syntax_signature(parsed, save=True)
        with open(SYNTAXES_FILE, 'wb+') as pickle_file:
            pickle.dump(syntaxes, pickle_file)
    else:
        with open(SYNTAXES_FILE, 'rb+') as pickle_file:
            syntaxes = pickle.load(pickle_file)
    if not os.path.exists(CFDS_FILE):
        # corpus = nltk.corpus.gutenberg.raw('results.txt')
        with codecs.open('results.txt', encoding='utf-8') as corpus:
            cfds = [make_cfd(corpus.read(), i, exclude_punctuation=False,
                             case_insensitive=True) for i in range(2, 5)]
        with open(CFDS_FILE, 'wb+') as pickle_file:
            pickle.dump(cfds, pickle_file)
    else:
        with open(CFDS_FILE, 'rb+') as pickle_file:
            cfds = pickle.load(pickle_file)
    sents = nltk.corpus.gutenberg.sents('austen-emma.txt')
    sents = [sent for sent in sents if len(sent) < 50]
    sent = random.choice(sents)
    parsed = parser.parse(' '.join(sent))
    print(parsed)
    print(' '.join(parsed.leaves()))
    replaced_tree = tree_replace(parsed, cfds, [])
    print('=' * 30)
    print(' '.join(replaced_tree.leaves()))
    print(replaced_tree)
def sentsSelector(self, sents):
    texts = []
    new_indices = []
    index = 0
    parser = Parser()
    for sent in sents:
        # Extract triplets & store in neo4j database
        tripletExtractor(parser.parse(sent))
        # Process sent
        tokens = sent.split()
        if len(self.removeSingleOccurWords(self.removeStopWords(tokens))) > 0:
            texts.append(self.removeStopWords(tokens))
            new_indices.append(index)
            #print(self.removeSingleOccurWords(self.removeStopWords(tokens)))
        index += 1
    self.makeDictAndCorpus(texts)
    new_sents = []
    for index in self.DocumentSIMQuery():
        new_sents.append(sents[index])
    return new_sents
def shortestPath(sentence, word1, word2):
    parser = Parser()
    tree = parser.parse(sentence)
    print(tree)
    #print(type(tree))
    path1 = findword(tree, word1.lower())
    path2 = findword(tree, word2.lower())
    #print(path1)
    #print(path2)
    # compare both paths -> find the first differing element
    j = 0
    for i in range(1, min(len(path1), len(path2))):
        if path1[i] != path2[i]:
            j = i - 1
            break
    # Now join both lists from the jth element. We need to take into account
    # the "order" of appearance in the tree (left or right): the path on the
    # left of the other is reversed, e.g.
    #   S VP NP Mary
    #   S VP NP Bob  <->  Bob NP VP S
    # so the reversed list always goes first.
    sublist1 = path1[j:]
    #print("sublist1", sublist1)
    if j < len(path2) - 1:
        j = j + 1
    sublist2 = path2[j:]
    sublist2.reverse()
    #print("sublist2", sublist2)
    shortestpath = sublist2 + sublist1
    return shortestpath
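# `findword` is assumed to return the list of labels from the root down to
# the given leaf word; a recursive sketch (an assumption, not the original
# helper):
def findword(tree, word, path=None):
    # accumulate this node's label on the way down
    path = (path or []) + [tree.label()]
    for child in tree:
        if isinstance(child, str):
            if child.lower() == word:
                return path + [child]
        else:
            result = findword(child, word, path)
            if result:
                return result
    return None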
def pipeline(records):
    wordnet_lemmatizer = WordNetLemmatizer()
    parser = Parser()
    for record in records:
        sentences = nltk.sent_tokenize(record)
        for sentence in sentences:
            words = nltk.word_tokenize(sentence)
            print(sentence)
            print(nltk.pos_tag(words))
            print(parser.parse(sentence))
            for word in words:
                print("Word: ", word)
                print("Lemma: ", wordnet_lemmatizer.lemmatize(word))
                for synset in wordnet.synsets(word):
                    print(synset, synset.hypernyms())
                    print(synset, synset.hyponyms())
                    print(synset, synset.part_meronyms())
                    print(synset, synset.substance_meronyms())
                    print(synset, synset.part_holonyms())
                    print(synset, synset.substance_holonyms())
            print("*********")
def string_to_query(string):
    parser = Parser()
    parsed = parser.parse(string)
    # question words (who/what/where/when/why/how)
    question = [word for word, pos in parsed.pos()
                if pos == 'WP' or pos == 'WRB']
    # verbs of any form
    action = [word for word, pos in parsed.pos() if 'VB' in pos]
    # nouns and gerunds
    affected_entity = [word for word, pos in parsed.pos()
                       if 'NN' in pos or pos == "VBG"]
    particle = list(parsed.subtrees(filter=lambda x: x.label() == 'PRT'))
    tree = parsed
    verb_phrases_list = list(parsed.subtrees(filter=lambda x: "V" in x.label()))[0]
    try:
        entity = [x for x in parsed.pos() if x[0] == affected_entity[0]][0]
    except:
        entity = None
    print(question)
    print(action)
    print(affected_entity)
    print(parsed.pos())
    try:
        print(parsed.pos()[parsed.pos().index(entity) + 1:])
    except:
        print("entity is None")
    output = {
        "question": question,
        "action": action,
        "affected_entity": affected_entity,
        "rest": None,  # parsed.pos()[parsed.pos().index(entity)+1:],
        "sentence": string
    }
    return output
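# A quick usage sketch of string_to_query (example outputs are indicative
# only; they depend on the parse):
#
#   query = string_to_query("Where did the dog bury the bone")
#   print(query["question"])         # e.g. ['Where']
#   print(query["action"])           # e.g. ['did', 'bury']
#   print(query["affected_entity"])  # e.g. ['dog', 'bone']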
def best_candidate_token(Sentence, Question, token):
    #parser = Parser()
    #tree = parser.parse(Sentence)
    print "Sentence: " + Sentence
    key = get_md5(Sentence)
    if key in parse_cache:
        print "hit"
        tree = parse_cache[key]
    else:
        try:
            parser = Parser()
            tree = parser.parse(Sentence)
            parse_cache[key] = tree
        except:
            return " "
    list1 = []
    list2 = []
    traverseTree_token(tree, list1, list2, Question.split(), token)
    min_overlap = min(list2)
    num = [[i, len(list1[i])] for i in range(len(list1)) if list2[i] == min_overlap]
    s = sorted(num, key=lambda x: -x[1])
    return " ".join(list1[s[0][0]])
class Solution(object):
    def __init__(self):
        print("Initializing Data Loader...")
        # Load Parser and process it
        self.parser = Parser()
        self.count = 0

    def is_question(self, sentence):
        """
        Class function for the boolean tagging
        :param sentence:
        :return:
        """
        self.count += 1
        # To check which line gives an error
        print(self.count)
        result = str(self.parser.parse(sentence)).split()
        if '(SBARQ' in result[0]:
            return "QUESTION_CODE"
        else:
            return "n/a"

    def question_classify(self):
        """
        This function classifies each sentence in the input file and
        outputs a tsv into a result.txt file
        :return: result.txt file
        """
        with open(DATA_FILE, 'r') as doc:
            with open("result.txt", 'w') as target:
                dat = doc.read()
                lines = dat.splitlines()
                for line in lines[1:]:
                    query, freq = line.strip().split("\t")
                    try:
                        # Some sentences may not have any valid parse
                        is_q = self.is_question(query)
                        bin_val = [0, 1][is_q == "QUESTION_CODE"]
                    except TypeError:
                        # It is highly likely that it won't be a question
                        is_q = "n/a"
                        bin_val = 0
                    target.write("%s\t%s\t%s\n" % (query, bin_val, is_q))
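# Usage sketch (DATA_FILE is assumed to be a TSV of "sentence<TAB>frequency"
# rows with one header line, as the loop above expects; the path is
# hypothetical):
#
#   DATA_FILE = "queries.tsv"
#   Solution().question_classify()  # writes result.txt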
def __init__(self, text):
    parser = Parser()
    self.math_ops = ('add', 'multiply', 'divide', 'subtract', 'power')
    self.ops = ('summarize', 'translate')
    custom_sent_tokenizer = PunktSentenceTokenizer(text)
    tokenized = custom_sent_tokenizer.tokenize(text)
    words = nltk.word_tokenize(tokenized[0])
    self.parsed_text = nltk.pos_tag(words)
    self.verb = []
    self.verb_ranges = []
    self.language = ''
    self.keywords = ['odd', 'even', 'prime', 'composite', 'squares',
                     'square', 'cubes', 'cube']
    self.special = sum(
        [word[0] in self.keywords for word in self.parsed_text]) > 0
    self.special_keywords = set()
    for word in self.parsed_text:
        if word[0] in self.keywords:
            self.special_keywords.add(word[0])
    self.special_keywords = list(self.special_keywords)
    # predicates used to test numbers against each keyword
    self.d = d = {
        "odd": lambda x: x % 2 == 1,
        "even": lambda x: x % 2 == 0,
        "square": lambda x: math.sqrt(x).is_integer(),
        "cube": lambda x: (x ** (1. / 3.)).is_integer(),
        "squares": lambda x: math.sqrt(x).is_integer(),
        "cubes": lambda x: (x ** (1. / 3.)).is_integer(),
        "prime": lambda x: x > 1 and all(
            x % i for i in islice(count(2), int(sqrt(x) - 1))),
        "composite": lambda x: x > 1 and not all(
            x % i for i in islice(count(2), int(sqrt(x) - 1)))
    }
def batch_parse_multiprocessing(sentences_list, num_processes=1):
    sentences_queue = JoinableQueue()
    parsed_queue = JoinableQueue()
    parser = Parser()
    # start the worker processes
    for _ in range(num_processes):
        proc = Process(target=batch_parse,
                       args=(parser, sentences_queue, parsed_queue))
        proc.daemon = True
        proc.start()
    sr = SaveResults(parsed_queue)
    for sent in sentences_list:
        sentences_queue.put(sent)
    sentences_queue.join()
    parsed_queue.join()
    return sr.get_ordered_results()
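# `batch_parse` is the worker each process runs; a minimal sketch of the
# expected shape (an assumption about the original target function):
def batch_parse(parser, sentences_queue, parsed_queue):
    while True:
        sentence = sentences_queue.get()
        try:
            parsed_queue.put((sentence, parser.parse(sentence)))
        finally:
            # mark the task done so sentences_queue.join() can return
            sentences_queue.task_done()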
def __init__(self, pronouncer=Pronouncer(), phoneme_to_digit_dict=None,
             max_vocab_size=10000, parser=Parser(),
             evaluator=NgramEvaluator(2)):
    '''
    Initializes the ParserEncoder.
    '''
    super(ParserEncoder, self).__init__(
        pronouncer=pronouncer,
        phoneme_to_digit_dict=phoneme_to_digit_dict)
    # set up our size-limited vocab
    if max_vocab_size is not None:
        vocabulary = self._get_vocab(max_vocab_size)
        self.phonemes_to_words_dict = self._get_phonemes_to_words_dict(
            vocabulary)
    else:
        self.phonemes_to_words_dict = self._get_phonemes_to_words_dict()
    self.parser = parser
    self.evaluator = evaluator
import nltk
from parsedatetime import Calendar
from nltk.tag import pos_tag, map_tag
from stat_parser import Parser, display_tree
from time import mktime
from datetime import datetime, timedelta
from dateutil.relativedelta import *

parser = Parser()  # Build this outside the fn. so it doesn't rebuild each time
cal = Calendar()

schedule_verbs = ['add', 'set', 'make', 'create', 'get', 'schedule', 'appoint',
                  'slate', 'arrange', 'organize', 'construct', 'coordinate',
                  'establish', 'form', 'formulate', 'run', 'compose', 'have',
                  'meet', 'reschedule', 'find']  # 'find' is for schedule-suggesting; be careful
schedule_suggest_verbs = ['suggest', 'recommend', 'propose', 'show']
schedule_nouns = ['appointment', 'meeting', 'meetup', 'reservation', 'session',
                  'talk', 'call', 'powwow', 'meet', 'rendezvous', 'event',
                  'conference', 'time']
doc_verbs = ['open', 'open up', 'view', 'launch', 'look', 'display', 'check',
             'start', 'begin', 'create', 'make', 'get', 'have', 'set',
             'generate', 'show', 'pull']
avail_words = ['free', 'available', 'works', 'potential', 'options']
time_words = ['tomorrow', 'today', 'Monday', 'Tuesday', 'Wednesday', 'Thursday',
              'Friday', 'Saturday', 'Sunday', 'a.m.', 'am', 'p.m.', 'pm',
              'week', 'month', 'day', 'time', 'year', 'date']
doc_nouns = ['doc', 'dog', 'dock', 'document', 'script', 'record', 'report',
             'page', 'notepad']
from contractions import contractions
from nltk import Nonterminal, induce_pcfg
from nltk.parse.generate import generate
from nltk.tokenize.punkt import PunktSentenceTokenizer
from stat_parser import Parser

sent_tokenizer = PunktSentenceTokenizer()
with open("<source of text>", "r") as f:
    text = f.read()

# expand contractions so the tokenizer sees full word forms
for k, v in contractions.items():
    text = text.replace(k, v)

sents = []
for paragraph in text.split('\n'):
    sents += sent_tokenizer.tokenize(paragraph)

parser = Parser()
productions = []
for sent in sents[:25]:
    try:
        tree = parser.parse(sent)
        productions += tree.productions()
    except:
        pass

# induce a PCFG from the collected productions and generate from it
S = Nonterminal('S')
grammar = induce_pcfg(S, productions)
for sentence in generate(grammar, depth=5):
    print " ".join(sentence) + "\n"
from flask import Flask, send_file, request, jsonify
import nltk
from nltk import Tree
from stat_parser import Parser
import sys
import json

app = Flask(__name__)

@app.route("/")
def index():
    return send_file("static/index.html")

parser = Parser()

# converts an nltk Tree to a dictionary
def tree2dict(tree, parent=None):
    return {
        "parent": parent,
        "name": tree.label(),
        "children": [
            tree2dict(t, tree.label()) if isinstance(t, Tree)
            else {"name": t, "parent": tree.label(), "children": None}
            for t in tree
        ]
    }
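# A sketch of how tree2dict would plausibly be exposed as an endpoint (the
# route name and query parameter are assumptions, not the original app's):
@app.route("/parse")
def parse():
    sentence = request.args.get("sentence", "")
    tree = parser.parse(sentence)
    return jsonify(tree2dict(tree))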
class SySE:
    ####Supervised Training.
    #trainingSentences: sentences on which to train (must already be parsed)
    #labels: corresponding binary (1, 0) labels.
    #alpha: Laplace/additive smoothing parameter
    def train(self, trainingSentences, labels, alpha = 0.1, beta = 0.1, debug = 0):
        if debug > -1:
            print
            print '*********************************************************'
            print '                       SySE V 0.1                        '
            print 'Beginning Training Sequence with ' + \
                str(len(trainingSentences)) + ' training sentences...'
            print '*********************************************************'
        if debug > 0:
            print
            print 'Initializing... '
        if type(trainingSentences[0]) != list:
            print 'These sentences do not appear to have been parsed.'
            print 'They will be parsed now.'
            if len(trainingSentences) > 10:
                print 'Given their volume, this will take some time.'
            try:
                self.parser = Parser()
            except:
                print 'This environment should have pystatparser loaded ' + \
                      'in order to train on unparsed sentences.'
                print 'Exiting...'
                return
            trainingSentences = [self.parser.parse(x) for x in trainingSentences]

        ####Initialization
        #Save hyperparameters
        self.alpha = alpha
        self.beta = beta
        #See what tags are in the training data.
        tags = []
        for sentence in trainingSentences:
            flat = self.recursiveFlatten(sentence)
            for el in flat:
                if type(el) == unicode and el not in tags:
                    tags.append(el)
        self.tags = set(tags)
        #What kind of root tags are there?
        self.sentenceTypes = set([x[0] for x in trainingSentences])
        #Which tags may contain other tags?
        self.phraseTags = []
        for sentence in trainingSentences:
            flat = self.recursiveFlatten(sentence)
            for i in range(0, len(flat)):
                try:
                    if type(flat[i]) == unicode and type(flat[i+1]) == unicode \
                            and flat[i] not in self.phraseTags:
                        self.phraseTags.append(flat[i])
                except IndexError:
                    print 'We\'ve reached the end of this sentence'
        self.phraseTags = set(self.phraseTags) - self.sentenceTypes
        #Robustness
        labels = list(labels)
        #Split training sentences into Important (I) and Regular (R) (Unimportant)
        importantSentences = filter(lambda x: labels[trainingSentences.index(x)] == 1, trainingSentences)
        regularSentences = filter(lambda x: not labels[trainingSentences.index(x)] == 1, trainingSentences)
        self.classPriors = []

        ###Test inputs
        #Make sure labels are the right length for the sentences.
        if len(labels) != len(trainingSentences):
            print 'Labels and trainingSentences must be the same length!'
            return
        #Make sure labels are valid
        for label in labels:
            if label != 0 and label != 1:
                print 'Labels should be either 0 or 1.'
                print 'exiting...'
                return

        ###Train Class Priors
        self.classPriors.append(float(labels.count(0)) / float(len(labels)))
        self.classPriors.append(float(labels.count(1)) / float(len(labels)))
        if debug > 0:
            print '*********************************************************'
            print 'These are the class priors'
            print '*********************************************************'
            print self.classPriors
            print
            print

        ###Train Sentence Type
        self.importantRootProbabilities = dict(zip(list(self.sentenceTypes), np.zeros(len(list(self.sentenceTypes)))))
        self.regularRootProbabilities = dict(zip(list(self.sentenceTypes), np.zeros(len(list(self.sentenceTypes)))))
        #Get the count of each sentence type in I
        for sentence in importantSentences:
            #Make sure we get what we expect
            if type(sentence[0]) != unicode:
                print "We are looking for a non-unicode sentence type. exiting..."
                break
            self.importantRootProbabilities[sentence[0]] += 1
        #Additive smoothing to turn the counts into probabilities
        for param in self.importantRootProbabilities:
            self.importantRootProbabilities[param] = (float(self.importantRootProbabilities[param]) + alpha) / \
                (float(len(trainingSentences)) + alpha * (len(self.importantRootProbabilities) + 1))
        #Get the count of each sentence type in R
        for sentence in regularSentences:
            if type(sentence[0]) != unicode:
                print "We are looking for a non-unicode sentence type. exiting..."
                break
            self.regularRootProbabilities[sentence[0]] += 1
        for param in self.regularRootProbabilities:
            self.regularRootProbabilities[param] = float(self.regularRootProbabilities[param]) / \
                float(len(trainingSentences))
        if debug > 0:
            print '*********************************************************'
            print 'These are the sentence type parameters'
            print '*********************************************************'
            print ' --------------------------------------------------------'
            print ' For Important Sentences:'
            print self.importantRootProbabilities
            print ' --------------------------------------------------------'
            print ' For Regular Sentences:'
            print self.regularRootProbabilities
            print
            print

        ###Train Phrases
        ##Primitive Inference on Multiplicity Parameter
        #Dictionaries to store how many times a tag was included in a level
        tagInclusionI = dict(zip(list(self.tags), np.zeros(len(list(self.tags)))))
        tagInclusionR = dict(zip(list(self.tags), np.zeros(len(list(self.tags)))))
        #Dictionaries to store how many total times a tag appears
        tagCountI = dict(zip(list(self.tags), np.zeros(len(list(self.tags)))))
        tagCountR = dict(zip(list(self.tags), np.zeros(len(list(self.tags)))))
        #To store dumb Poisson inference (parameter estimates)
        self.importantMultiplictyParameter = dict(zip(list(self.tags), np.zeros(len(list(self.tags)))))
        self.regularMultiplictyParameter = dict(zip(list(self.tags), np.zeros(len(list(self.tags)))))
        #Get Inclusion for I
        for sentence in importantSentences:
            self.getInclusions([sentence, tagInclusionI, debug >= 2])
        #Get Inclusion for R
        for sentence in regularSentences:
            self.getInclusions([sentence, tagInclusionR, debug >= 2])
        #Get Counts for I
        for sentence in importantSentences:
            flat = self.recursiveFlatten(sentence)
            currentTags = filter(lambda x: type(x) == unicode, flat)
            for tag in currentTags[1:]:
                tagCountI[tag] += 1
        #Get Counts for R
        for sentence in regularSentences:
            flat = self.recursiveFlatten(sentence)
            currentTags = filter(lambda x: type(x) == unicode, flat)
            for tag in currentTags[1:]:
                tagCountR[tag] += 1
        #Estimate Parameters for I
        for tag in tagInclusionI.keys():
            if tagCountI[tag] > 1:
                self.importantMultiplictyParameter[tag] = (tagCountI[tag] - 1) / tagInclusionI[tag]
        #Estimate Parameters for R
        for tag in tagInclusionR.keys():
            if tagCountR[tag] > 1:
                self.regularMultiplictyParameter[tag] = (tagCountR[tag] - 1) / tagInclusionR[tag]
        if debug > 0:
            print '*********************************************************'
            print '         Estimation for Multiplicity Parameters          '
            print '*********************************************************'
            print
            print ' ------------------------------------------------------------------'
            print 'Tag Counts for Important Sentences:'
            print tagCountI
            print 'Tag Counts for Regular Sentences:'
            print tagCountR
            print ' ------------------------------------------------------------------'
            print 'Tag Inclusion for Important Sentences:'
            print tagInclusionI
            print 'Tag Inclusion for Regular Sentences:'
            print tagInclusionR
            print ' ------------------------------------------------------------------'
            print 'Dumb Parameter Estimates for Important Sentences:'
            print self.importantMultiplictyParameter
            print 'Dumb Parameter Estimates for Regular Sentences:'
            print self.regularMultiplictyParameter
            print
            print

        ##Primitive Inference on Presence Parameters
        #We need to find inclusions given the parent; the conditional presence
        #probabilities can almost be thought of as transition probabilities.
        #This is the uninformed probability of a particular presence.
        ui = self.alpha / (self.alpha * (len(self.regularRootProbabilities) + 1))
        #For important phrases
        self.importantCondPresenceProbs = np.zeros([len(self.tags), len(self.phraseTags) + len(self.sentenceTypes)])
        self.importantCondPresenceProbs = pd.DataFrame(self.importantCondPresenceProbs).applymap(lambda x: x + ui)
        self.importantCondPresenceProbs.columns = list(self.sentenceTypes) + list(self.phraseTags)
        self.importantCondPresenceProbs.index = list(self.tags)
        #For regular phrases
        self.regularCondPresenceProbs = np.zeros([len(self.tags), len(self.phraseTags) + len(self.sentenceTypes)])
        self.regularCondPresenceProbs = pd.DataFrame(self.regularCondPresenceProbs).applymap(lambda x: x + ui)
        self.regularCondPresenceProbs.columns = list(self.sentenceTypes) + list(self.phraseTags)
        self.regularCondPresenceProbs.index = list(self.tags)
        #Dictionaries to store how many times a tag was used at all. This time
        #we care about root/sentence tags as well, unlike above.
        tagCountI = dict(zip(list(self.tags), np.zeros(len(list(self.tags)))))
        tagCountR = dict(zip(list(self.tags), np.zeros(len(list(self.tags)))))
        #Count Conditional Inclusions for Important Sentences
        for sentence in importantSentences:
            self.getInclusionsGivenParent([sentence, self.importantCondPresenceProbs, sentence[0], debug >= 2])
        #Count Conditional Inclusions for Regular Sentences
        for sentence in regularSentences:
            self.getInclusionsGivenParent([sentence, self.regularCondPresenceProbs, sentence[0], debug >= 2])
        #Get Counts for I
        for sentence in importantSentences:
            flat = self.recursiveFlatten(sentence)
            currentTags = filter(lambda x: type(x) == unicode, flat)
            for tag in currentTags:
                tagCountI[tag] += 1
        #Get Counts for R
        for sentence in regularSentences:
            flat = self.recursiveFlatten(sentence)
            currentTags = filter(lambda x: type(x) == unicode, flat)
            for tag in currentTags:
                tagCountR[tag] += 1
        #Calculate Conditional Presence Parameter for Important Sentences
        for column in self.importantCondPresenceProbs.columns:
            if tagCountI[column] > 0:
                num = self.importantCondPresenceProbs.loc[:, column] + alpha
                denom = tagCountI[column] + (len(self.importantCondPresenceProbs.columns) + 1) * alpha
                self.importantCondPresenceProbs.loc[:, column] = num / denom
        #Calculate Conditional Presence Parameter for Regular Sentences
        for column in self.regularCondPresenceProbs.columns:
            if tagCountR[column] > 0:
                #Additive smoothing
                num = self.regularCondPresenceProbs.loc[:, column] + alpha
                denom = tagCountR[column] + (len(self.regularCondPresenceProbs.columns) + 1) * alpha
                self.regularCondPresenceProbs.loc[:, column] = num / denom
        if debug > 1:
            print '*********************************************************'
            print 'Presence Parameter Estimation'
            print '*********************************************************'
            print
            print ' ------------------------------------------------------------------'
            print ' Conditional Parameters for Important Sentences'
            print self.importantCondPresenceProbs
            print
            print ' ------------------------------------------------------------------'
            print ' Conditional Parameters for Regular Sentences'
            print self.regularCondPresenceProbs
            print ' ------------------------------------------------------------------'
        if debug > -1:
            print
            print
            print '...Finished'

    ####Classification
    def classify(self, sentence, debug = 0):
        #If the sentence hasn't been parsed, we must parse it.
        plaintext = False
        if type(sentence) != list:
            plaintext = True
            original = sentence
            try:
                sentence = self.parser.parse(sentence)
            except:
                try:
                    self.parser = Parser()
                    sentence = self.parser.parse(sentence)
                except:
                    print 'Couldn\'t create a parsing object.'
                    print 'Perhaps pystatparser is not loaded?'
                    print 'type "from stat_parser import Parser"'
        #Deal with new root types
        if sentence[0] not in self.importantRootProbabilities:
            self.importantRootProbabilities[sentence[0]] = self.alpha / (self.alpha * (len(self.importantRootProbabilities) + 1))
        if sentence[0] not in self.regularRootProbabilities:
            self.regularRootProbabilities[sentence[0]] = self.alpha / (self.alpha * (len(self.regularRootProbabilities) + 1))
        #Deal with new non-root tag types
        flat = self.recursiveFlatten(sentence)
        flat = filter(lambda x: type(x) == unicode, flat)
        for i, tag in enumerate(flat):
            if tag not in self.tags:
                #Set a priori beliefs for multiplicity parameters
                self.importantMultiplictyParameter[tag] = self.beta
                self.regularMultiplictyParameter[tag] = self.beta
                #Set a priori beliefs for conditional presence parameters
                self.importantCondPresenceProbs.loc[tag] = np.repeat(
                    self.alpha / (self.alpha * (len(self.regularRootProbabilities) + 1)),
                    len(self.importantCondPresenceProbs.columns))
                self.regularCondPresenceProbs.loc[tag] = np.repeat(
                    self.alpha / (self.alpha * (len(self.regularRootProbabilities) + 1)),
                    len(self.regularCondPresenceProbs.columns))
                if type(flat[i+1]) == unicode:
                    #Set a priori beliefs for conditional presence parameters
                    self.importantCondPresenceProbs[tag] = np.repeat(
                        self.alpha / (self.alpha * (len(self.regularRootProbabilities) + 1)),
                        len(self.importantCondPresenceProbs.index))
                    self.regularCondPresenceProbs[tag] = np.repeat(
                        self.alpha / (self.alpha * (len(self.regularRootProbabilities) + 1)),
                        len(self.regularCondPresenceProbs.index))
        ##Get P(x|y = IMPORTANT)
        PxGy1 = math.log(self.importantRootProbabilities[sentence[0]]) + \
            self.getConditionalLevelProbability([sentence, self.importantCondPresenceProbs,
                                                 self.importantMultiplictyParameter, sentence[0], debug >= 2])
        ##Get P(x|y = REGULAR)
        PxGy0 = math.log(self.regularRootProbabilities[sentence[0]]) + \
            self.getConditionalLevelProbability([sentence, self.regularCondPresenceProbs,
                                                 self.regularMultiplictyParameter, sentence[0], debug >= 2])
        #Get priors in a log form:
        Py1 = math.log(self.classPriors[1])
        Py0 = math.log(self.classPriors[0])
        #Get log probabilities of each class through Bayes' Rule
        Py1Gx = PxGy1 + Py1
        Py0Gx = PxGy0 + Py0
        #SoftMax probabilities
        denom = math.log(math.e**Py1Gx + math.e**Py0Gx)
        sPy1Gx = Py1Gx - denom
        sPy0Gx = Py0Gx - denom
        #Turn back into probabilities for output
        sPy1Gx = math.e**sPy1Gx
        sPy0Gx = math.e**sPy0Gx
        if debug > -1:
            print 'Estimating Class for sentence:'
            if plaintext:
                print '"' + original + '"'
            else:
                print sentence
        if debug > 0:
            print ' ------------------------------------------------------------------'
            print 'Class Priors (log probability):'
            print 'P(important) = ' + str(Py1)
            print 'P(unimportant) = ' + str(Py0)
            print ' ------------------------------------------------------------------'
            print 'Conditional Sentence Log Probabilities:'
            print 'P(sentence | important) = ' + str(PxGy1)
            print 'P(sentence | unimportant) = ' + str(PxGy0)
            print ' ------------------------------------------------------------------'
            print 'Unnormalized Conditional Class Log Probabilities'
            print 'P(important | sentence) = ' + str(Py1Gx)
            print 'P(unimportant | sentence) = ' + str(Py0Gx)
        if debug > -1:
            print ' ------------------------------------------------------------------'
            print 'Softmaxed Conditional Class Probabilities'
            print 'P(important | sentence) = ' + str(sPy1Gx)
            print 'P(unimportant | sentence) = ' + str(sPy0Gx)
        return sPy1Gx

    def summarize(self, article, verbosity = 0.5, debug = 0):
        sentences = self.split_into_sentences(article)
        keepers = []
        i = 0
        for sentence in sentences:
            i += 1
            try:
                if self.classify(sentence, debug = debug) > verbosity:
                    keepers.append(sentence)
            except:
                print 'Error classifying sentence ' + str(i)
                print 'FullText: '
                print sentence
        if len(keepers) == 0:
            print 'No sentences found important'
            return ''
        reduced = reduce(lambda x, y: x + ' ' + y, keepers)
        return reduced

    ####Function Definitions
    #Returns the log probability of a level occurring, using recursion to
    #handle the levels contained therein. May be passed an entire sentence.
    def getConditionalLevelProbability(self, inputs):
        level = inputs[0]
        tagDF = inputs[1]
        mult = inputs[2]
        parent = inputs[3]
        debug = inputs[4]
        ret = 0
        if debug == 1:
            print 'Beginning Level...........'
            print level
        inTags = [x[0] for x in level[1:]]
        if u'' in inTags:
            inTags.remove(u'')
        #Do some recursion
        for i, tag in enumerate(inTags):
            if tag in self.phraseTags or tag in self.sentenceTypes:
                if debug == 1:
                    print 'beginning recursion due to:'
                    print tag
                ret = ret + self.getConditionalLevelProbability([level[i+1], tagDF, mult, tag, debug])
        #Do multiplicity for this level
        for tag in inTags:
            x = inTags.count(tag)
            mu = mult[tag]
            ret = ret + 0  #math.log((math.exp(-mu) * mu**x / math.factorial(x)))
        #Do presence for this level
        inTags = list(set(inTags))
        for tag in inTags:
            if type(tag) != unicode:
                #Some sentences contain only a word; nothing to add in that case.
                print 'Breaking due to non-unicode tag in getConditionalLevelProbability!'
                print tag
                break
            if debug == 1:
                print 'Probability of ' + tag + ' given ' + parent + ' is ' + str(tagDF.loc[tag, parent])
            ret = ret + math.log(tagDF.loc[tag, parent])
        return ret

    #To get the inclusions conditioned on the parent tag
    def getInclusionsGivenParent(self, inputs):
        level = inputs[0]
        tagDF = inputs[1]
        parent = inputs[2]
        debug = inputs[3]
        if debug == 1:
            print 'Beginning Level...........'
            print level
        inTags = [x[0] for x in level[1:]]
        if u'' in inTags:
            inTags.remove(u'')
        #Do some recursion
        for i, tag in enumerate(inTags):
            if tag in self.phraseTags or tag in self.sentenceTypes:
                if debug == 1:
                    print 'beginning recursion due to:'
                    print tag
                self.getInclusionsGivenParent([level[i+1], tagDF, tag, debug])
        #Add count for this level
        inTags = list(set(inTags))
        for tag in inTags:
            if type(tag) != unicode:
                #Some sentences contain only a word; nothing to add in that case.
                break
            if debug == 1:
                print 'incrementing: ' + tag + ' when conditioned on ' + parent
            tagDF.loc[tag, parent] += 1

    #To get the inclusions in a level recursively.
    def getInclusions(self, inputs):
        level = inputs[0]
        tagDict = inputs[1]
        debug = inputs[2]
        if debug == 1:
            print 'Beginning Level...........'
            print level
        inTags = [x[0] for x in level[1:]]
        if u'' in inTags:
            inTags.remove(u'')
        #Do some recursion
        for i, tag in enumerate(inTags):
            if tag in self.phraseTags or tag in self.sentenceTypes:
                if debug == 1:
                    print 'beginning recursion due to:'
                    print tag
                self.getInclusions([level[i+1], tagDict, debug])
        #Add count for this level
        inTags = list(set(inTags))
        for tag in inTags:
            if type(tag) != unicode:
                #Some sentences contain only a word; nothing to add in that case.
                break
            if debug == 1:
                print 'incrementing: ' + tag
            tagDict[tag] += 1

    #To find all PoS tags (pystatparser's documentation is literally non-existent)
    def getTagsRecursively(self, ss, knownTags = [], debug = 0):
        ret = knownTags
        for sentence in ss:
            for phrase in sentence:
                for element in phrase:
                    if type(element) == unicode:
                        if element not in ret:
                            ret.append(element)
                    if type(element) == list:
                        ret.extend(self.getTagsRecursively(element))
        return ret

    #Flatten an n-dimensional list into a 1D list
    def recursiveFlatten(self, myList):
        ret = []
        for element in myList:
            if type(element) == list:
                element = self.recursiveFlatten(element)
            if type(element) == str or type(element) == unicode:
                ret.append(element)
            else:
                ret.extend(list(element))
        return ret

    #From http://stackoverflow.com/questions/4576077/python-split-text-on-sentences
    def split_into_sentences(self, text):
        if type(text) == unicode:
            text = unicode(text.encode('utf-8'), errors = 'ignore')
            text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
        caps = "([A-Z])"
        prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
        suffixes = "(Inc|Ltd|Jr|Sr|Co)"
        starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
        acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
        websites = "[.](com|net|org|io|gov)"
        text = " " + text + " "
        text = text.replace("\n", " ")
        text = re.sub(prefixes, "\\1<prd>", text)
        text = re.sub(websites, "<prd>\\1", text)
        if "Ph.D" in text: text = text.replace("Ph.D.", "Ph<prd>D<prd>")
        if 'a.m.' in text: text = text.replace('a.m.', 'a<prd>m<prd>')
        if 'p.m.' in text: text = text.replace('p.m.', 'p<prd>m<prd>')
        if '...' in text: text = text.replace('...', '<prd><prd><prd>')
        text = re.sub("\s" + caps + "[.] ", " \\1<prd> ", text)
        text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)
        text = re.sub(caps + "[.]" + caps + "[.]" + caps + "[.]", "\\1<prd>\\2<prd>\\3<prd>", text)
        text = re.sub(caps + "[.]" + caps + "[.]", "\\1<prd>\\2<prd>", text)
        text = re.sub(" " + suffixes + "[.] " + starters, " \\1<stop> \\2", text)
        text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)
        text = re.sub(" " + caps + "[.]", " \\1<prd>", text)
        if "”" in text: text = text.replace(".”", "”.")
        if "\"" in text: text = text.replace(".\"", "\".")
        if "!" in text: text = text.replace("!\"", "\"!")
        if "?" in text: text = text.replace("?\"", "\"?")
        text = text.replace(".", ".<stop>")
        text = text.replace("?", "?<stop>")
        text = text.replace("!", "!<stop>")
        text = text.replace("<prd>", ".")
        sentences = text.split("<stop>")
        sentences = sentences[:-1]
        sentences = [s.strip() for s in sentences]
        return sentences
def main():
    time1 = time()
    parser = Parser()
    inFile = sys.argv[1]
    outFile = sys.argv[2]
    f = open(outFile, 'w+')
    for line in open(inFile):
        if config.print_line:
            print line
        global code, errorCode, nodeList, nodeNum, firstVBNN, firstNNVB, \
            found, sqFlag, gqFlag, NNNode, VBNode
        code = 0
        nodeNum = 0
        errorCode = []
        nodeList = []
        firstVBNN = []
        firstNNVB = []
        found = 0
        sqFlag = 0
        gqFlag = 0
        NNNode = []
        VBNode = []
        wordNum = len(line.split())
        if config.print_word_num:
            print 'Word Num: ', wordNum
        if config.ignore_long_sentence and wordNum > config.max_word_num:
            # print 'Long Sentence'
            f.write(str(-1) + ' ' + line)
            continue
        try:
            start = time()
            indentTree = parser.raw_parse(line)  # raw parse for indent tree list
            end = time()
        except:
            # print 'Parsing Error'
            f.write(str(-1) + ' ' + line)
            continue
        if config.print_parse_time:
            print 'Raw Parse Time: ', end - start, 's'
        if config.print_indent_tree:
            pprint(indentTree)  # unit test
        root = trans(indentTree)  # recursively transform list to "tree of node"
        assignCode(root)
        assignDesNum(root)
        if config.dfs_indent:
            dfsIndent(root, 0)  # unit test
        if config.print_node_list:  # unit test
            for node in nodeList:
                print node.getData(), '\t',
            print
        if config.show_nltk_tree:
            start = time()
            nlktTree = parser.parse(line)  # nltk Tree, could be drawn into a graph
            end = time()
            print 'NLKT Tree Parse Time: ', end - start, 's'
            display_tree(nlktTree)  # unit test

        """ Now the check begins! """
        preCheck()
        totalCheck(root)

        """ If there is some NP+VP left """
        for i in range(len(nodeList) - 1):
            node1 = nodeList[i]
            node2 = nodeList[i+1]
            if node2.getParent().getData() == u'RB' and i + 2 < len(nodeList):
                node2 = nodeList[i+2]
            if node1.getParent().getData() in NNList and node2.getParent().getData() in VBList:
                if (node1 in NNNode and node2 in VBNode) or \
                        (firstNNVB and (node1 == firstNNVB[0] and node2 == firstNNVB[1])):
                    pass
                else:
                    NNNode.append(node1)
                    VBNode.append(node2)

        """ Be Check """
        for i in range(len(nodeList) - 1):
            node1 = nodeList[i]
            node2 = nodeList[i+1]
            if node2.getParent().getData() == u'RB' and i + 2 < len(nodeList):
                node2 = nodeList[i+2]
            if node1.getData().lower() == 'i':
                if node2.getData() in BEList and node2.getData() not in ['am', 'was', "'m"]:
                    errorCode.append(node2.getCode())
            elif node1.getData().lower() in PRPSecondList:
                if node2.getData() in BEList and node2.getData() not in ['are', 'were', "'re"]:
                    errorCode.append(node2.getCode())
            elif node1.getData().lower() in PRPThirdList:
                if node1.getData().lower() == 'that':
                    if i > 0 and nodeList[i-1].getParent().getData() in NNList:
                        continue
                if node2.getData() in BEList and node2.getData() not in ['is', 'was', "'s"]:
                    errorCode.append(node2.getCode())
            else:
                pass

        """ del duplicates """
        if firstNNVB:
            n = firstNNVB[0]
            v = firstNNVB[1]
            if n in NNNode and v in VBNode:
                NNNode.remove(n)
                VBNode.remove(v)

        """ replace RB with the word after (maybe VB) """
        if firstNNVB:
            v = firstNNVB[1]
            if v.getParent().getData() == u'RB':
                code = v.getCode()
                new = nodeList[code+1]
                if new.getParent().getData() in VBList:
                    firstNNVB[1] = new
                else:
                    pass
        for v in VBNode:
            if v.getParent().getData() == u'RB':
                code = v.getCode()
                new = nodeList[code+1]
                if new.getParent().getData() in VBList:
                    VBNode[VBNode.index(v)] = new
                else:
                    pass

        """ print dependencies """
        if config.print_npvp:
            print 'FROM ERRORCODE:'
            for i in errorCode:
                print nodeList[i].getData(), '\t',
            print
            print 'FROM QUESTION:'
            for node in firstVBNN:
                print node.getData()
            for node in firstNNVB:
                print node.getData()
            print
            print 'FROM NPVP & OTHERS:'
            for i in range(len(NNNode)):
                print NNNode[i].getParent().getData(), NNNode[i].getData(), '\t', \
                    VBNode[i].getParent().getData(), VBNode[i].getData()
            print

        """ canonicalize """
        if firstVBNN:
            if not (firstVBNN[0].getParent().getData() in VBList and firstVBNN[1].getParent().getData() in NNList):
                firstVBNN = []
        if firstNNVB:
            if not (firstNNVB[1].getParent().getData() in VBList and firstNNVB[0].getParent().getData() in NNList):
                firstNNVB = []
        for v in VBNode:
            i = VBNode.index(v)
            if not (NNNode[i].getParent().getData() in NNList and VBNode[i].getParent().getData() in VBList):
                del NNNode[i]
                del VBNode[i]

        """ Finally! We add codes! """
        if config.print_standard_answer:
            """ FROM QUESTION """
            if sqFlag or gqFlag:
                if firstVBNN:
                    v = firstVBNN[0]
                    n = firstVBNN[1]
                    if n.getData().lower() in PRPThirdList or n.getParent().getData() in [u'NN', u'NNP']:  # single noun
                        if v.getParent().getData() in [u'VB', u'VBP']:
                            errorCode.append(v.getCode())
                    else:
                        if v.getParent().getData() == u'VBZ':
                            errorCode.append(v.getCode())
                if firstNNVB:
                    v = firstNNVB[1]
                    if v.getParent().getData() == u'VBZ':
                        errorCode.append(v.getCode())
            else:
                if firstNNVB:
                    if not (firstNNVB[0] in NNNode and firstNNVB[1] in VBNode):
                        NNNode.append(firstNNVB[0])
                        VBNode.append(firstNNVB[1])
            """ FROM NPVP & OTHERS """
            for i in range(len(NNNode)):
                n = NNNode[i]
                v = VBNode[i]
                if n.getData().lower() in PRPThirdList or n.getParent().getData() in [u'NN', u'NNP']:  # single noun
                    if v.getParent().getData() in [u'VB', u'VBP']:
                        errorCode.append(v.getCode())
                else:
                    if v.getParent().getData() == u'VBZ':
                        errorCode.append(v.getCode())
        errorCode = list(set(errorCode))
        errorCode.sort()
        if errorCode:
            for i in errorCode:
                f.write(str(i + 1) + ' ')
        else:
            f.write(str(-1) + ' ')
        f.write(line)
        if config.print_vb:
            printVB(root)  # print all verbs and MDs in the tree
            print
        if config.print_empty_line:
            print '\n' * 2
    time2 = time()
    if config.print_total_time:
        print 'Total Execution Time: ', time2 - time1, 's'
    if config.show_nltk_tree:
        Tkinter._test()  # show the nltk tree graph
from stat_parser import Parser
from graph import Graph, get_leaves, merge_graphs
from nltk.tree import ParentedTree

parser = Parser()
trees = []
trees.append(parser.parse("The food was on the table where the child likes to eat"))
trees.append(parser.parse("The money is on the table"))
trees.append(parser.parse("Put the data in the table"))
trees.append(parser.parse("Add more rows to the database table"))
trees.append(parser.parse("Add more rows to the database table"))
trees.append(parser.parse("Why is the table empty It should have data in it"))
trees.append(parser.parse("Do not put your elbows on the table while you eat"))

for tree in trees:
    tree = ParentedTree.convert(tree)

graphs = []
for tree in trees:
    g = Graph()
    g.update(tree)
    graphs.append(g)

new_graph = merge_graphs(graphs)
new_graph.draw("new_graph")
new_graph.save_to_file("new_graph.gml")
new_graph.load_from_file("new_graph.gml")
print new_graph.get_median_relatedness()
print new_graph.get_senses("table")
import nltk
import json
import yaml
from random import choice
from stat_parser import Parser, display_tree

parser = Parser()
d = json.load(open('tree_ship_words.json'))
f = open("ships.yml")
ships = yaml.load(f.read())
keywords = {}
keys = []

def is_leaf(tree):
    if len(tree.leaves()) == 1:
        return True
    else:
        return False

def check_for_roll(tree):
    leaves = list(tree)
    key = ""
    words = []
    for leaf in leaves:
        if is_leaf(leaf):
            bottom = leaf
            while type(bottom) not in [str, unicode]:
                leaf = bottom
                bottom = list(leaf)[0]
            key += leaf.node
            words.append(leaf.leaves()[0])
import os
import twitter

CONSUMER_KEY = os.environ['tw_pg_consumerkey']
CONSUMER_SECRET = os.environ['tw_pg_consumer']
OAUTH_TOKEN = os.environ['tw_pg_token']
OAUTH_TOKEN_SECRET = os.environ['tw_pg_secret']

auth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,
                           CONSUMER_KEY, CONSUMER_SECRET)
twitter_api = twitter.Twitter(domain='api.twitter.com', api_version='1.1',
                              auth=auth, format='json')
# does not fetch retweets right now; set include_rts = true if needed
posts = twitter_api.statuses.user_timeline(count='200')

# pull out tweets as a list
tweets = [ipost['text'] for ipost in posts]

#####
# apply regexp chunker to get noun phrases
# http://nltk.org/book3/ch07.html
# process: 1) sentence segmentation, 2) tokenization,
#          3) part-of-speech tagging, 4) entity detection
#
# stat_parser does all four steps above within its parser.
# TODO still need to check whether the results are reasonable.
parser = Parser()
result = [parser.parse(tweet) for tweet in tweets]
for res in result:
    print(res)
def testing_stat_parser(message):
    parser = Parser()
    return parser.parse(message)
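# Usage sketch for the helper above:
#
#   tree = testing_stat_parser("The old man the boat")
#   print(tree)          # an nltk Tree
#   tree.pretty_print()  # ASCII rendering of the parse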
# -*- coding: utf-8 -*-
# Import required libraries
import nltk, re, csv
from stat_parser import Parser, display_tree
parser = Parser()
from nltk.tree import Tree
from nltk.stem.wordnet import WordNetLemmatizer

# Define all the lists that are checked for the requirements
not_atomic_list = ["and that", "and also", "but ", "so that", "while ",
                   "however ", "whereas ", "on the other hand", "in addition to",
                   "respectively", "as well as", "thereby", "though ", "thus ",
                   " hence ", "therefore", "yet ", " including ", "in contrast",
                   "contrary to", " beside", "aside from", "other than",
                   "explaining", "which explains"]
not_independent_list = ["this study ", "our study", "the results ", "results ",
                        "the findings ", "the present study ", "these findings ",
                        "these results ", "this research ", "this data ",
                        "the data ", "these data", "our data",
                        "these observations", "this experiment ",
                        "this publication ", "this analysis", "these analyses",
                        "evidence", "this paper ", "the paper ", "this report ",
                        "the report ", "this effect ", "we ", "compared with",
                        "and other", "previous ", "previously", "the bacterium "]
not_declarative_list = ["?", "!"]
not_absolute_list = ["probabl", "perhaps", "potentially", "putative", "maybe",
                     "plausible", "possible", "likely", "feasible",
                     "hypothetical", "may", "could ", " seem ", "appears to",
                     "appear to", " appear ", " might ", " suggest ",
                     "minimally sufficient", "is predicted", "is foreseen",
                     "is envisioned", "revealed that", "reveals that",
                     "significant", "significantly", "to reveal",
                     " estimated ", " estimate"]

# From here on, all the functions are defined that check whether the sentence
# fulfills the AIDA rules; if a sentence does not, it is rewritten with
# individual functions per requirement.
# (Yes, for the moment nothing is done when a sentence is not atomic or not
# independent.)
def check_if_atomic(sentence, parsed_sentence, tags):
    counter = 0
    atomic_check = re.compile("|".join(not_atomic_list))
    tree = Tree('s', parsed_sentence)
    for child in tree:
        string = str(child)
        if string.startswith("(S"):
            counter += 1
    sentence_lower = sentence.lower()
    if atomic_check.search(sentence_lower):
        return False
    elif counter > 1:
        # more than one S child suggests a compound sentence
        # (completion assumed; the snippet is truncated here)
        return False
    return True
data = open("data/extracted.txt").read() data = ''.join([i if ord(i) < 128 else '' for i in data]) print "Tokenizing sentences" sentences = nltk.tokenize.sent_tokenize(data) open("token_cache", "w").write(pickle.dumps(sentences)) sentences = sentences[:500] process_count = cpu_count() # process_count = 1 # sentence_tasks = [sentences[i::process_count] for i in xrange(process_count)] print "Using %d processes" % (process_count,) parser = Parser() parsed = parser.parse("This is a very long sentence.") def delegate(task_queue, completed_queue): graph = Graph() parser = Parser() while True: try: sentence = task_queue.get(False) except: completed_queue.put(graph) print "My work here is done" return True print "Parsing sentence" parsed = parser.parse(sentence)
print ("Tagged words: %r\n" % tagged) #------------------------------------------------------------------------------- # Generate nltk tree #------------------------------------------------------------------------------- parser = Parser() # http://www.thrivenotes.com/the-last-question/ #sentance = "What is the population of the country France?" tree = parser.parse(sentence) print ("--- Printing trees -----") #print ("Tree 1: ",tree) #print("\nTree 2: ", tree.pformat_latex_qtree()) #print("\nPretty tree:\n")
import sys
import nltk
import nltk.data
import nltk.tree
from stat_parser import Parser
import re

parser = Parser()

def getNodes(parent):
    for node in parent:
        if type(node) is nltk.Tree:
            if not getNodes(node):
                if node.label() == "VP":
                    # we want to remove some sentences describing
                    # environment
                    sentence = " ".join(node.leaves()).lower()
                    commands = re.split(r';|\,|\.|\>|\band\b|\bor\b|\bthen\b', sentence)
                    done = False
                    for command in commands:
                        if re.match(r'^[a-zA-Z0-9\;\,\.\-\*\:\'\"\/\s]{1,80}$', command) \
                                and re.search(r'[a-zA-Z]', command):
                            tokens = nltk.word_tokenize(command)
                            if len(tokens) > 0 and len(tokens) <= 5:
                                tagged = nltk.pos_tag(tokens)
                                if tagged[0][1] not in \
                                        ["VBZ", "VBN", "VBD", "VBP", "VBG", "MD", "NNS", "DT", "JJ"]:
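The function body is truncated in the source, but its entry point can still be exercised; a minimal sketch (sentence invented):

# Sketch: mine short VP "commands" from one parsed sentence.
tree = parser.parse("Open the door and take the lamp.")
getNodes(tree)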
import re
import codecs
import nltk
import pattern.en
from pattern.en import conjugate
from nltk.tree import Tree
from nltk.corpus import wordnet
from stat_parser import Parser

class Translator:
    def __init__(self):
        self.filename = 'dictionary.txt'
        self.dict = {}
        # score Brown-corpus bigrams once, for picking collocations later
        bgm = nltk.collocations.BigramAssocMeasures()
        finder = nltk.collocations.BigramCollocationFinder.from_words(
            nltk.corpus.brown.words())
        scores = finder.score_ngrams(bgm.likelihood_ratio)
        self.scored = {}
        for key, score in scores:
            self.scored[key] = score
        self.specialWords = [u'了', u'的']
        self.directions = ['east', 'west', 'south',
                           'north', 'northeast', 'southeast',
                           'northwest', 'southwest']
        self.parser = Parser()

    def translate(self):
        pass

    def loadDictionary(self):
        f = codecs.open(self.filename, 'r', 'utf-8')
        regex = re.compile('(.*)\((.*)\)')
        ls = [line.strip() for line in f]
        for i in ls:
            t = i.split(':')
            cn_word = t[0]
            en_words = []
            for w in t[1].split(';'):
                word = w.strip()
                m = regex.match(word)
                if (m is not None):
                    en_words.append((m.group(1).strip(), m.group(2).strip()))
                else:
                    en_words.append((w.strip(), 'default'))
            self.dict[cn_word] = en_words
        f.close()

    def isNumerical(self, word):
        if word.isdigit():
            return True
        word = word.lower()
        if word in ['one', 'a', 'an']:
            return True
        else:
            return False

    def preProcess(self, sentence, pickWord):
        words = sentence.split(' ')
        en_sentence = []
        for word in words:
            # each input token is "word#TAG"
            w = word.split('#')
            word = w[0]
            t = w[1]
            if word == u'。':
                word = '.'
            elif word == u',':
                word = ','
            elif word == u'“' or word == u'”':
                word = '"'
            elif word == u'、':
                word = ','
            elif word == u'：':
                word = ':'
            elif word == u'``':
                word = '"'
            if word in self.specialWords:
                # add tense
                if word == u'了':
                    en_sentence[-1] = conjugate(en_sentence[-1].strip(), 'p')
                continue
            if word in self.dict:
                # remove measure words
                if 'M' == t:
                    if len(en_sentence) > 0 and self.isNumerical(en_sentence[-1]):
                        # change one to a
                        if en_sentence[-1].lower() == 'one':
                            en_sentence[-1] = 'a'
                        continue
                if pickWord == 'baseline':
                    en_sentence.append(self.dict[word][0][0])
                else:
                    if len(en_sentence) > 0:
                        en_sentence.append(self.pick(self.dict[word], t, en_sentence[-1]))
                    else:
                        en_sentence.append(self.pick(self.dict[word], t, ''))
            else:
                en_sentence.append(word)
        return en_sentence

    def pick(self, dict, t, prev):
        candidates = []
        for w in dict:
            if w[1] == t:
                candidates.append(w[0])
        if len(candidates) == 0:
            for w in dict:
                candidates.append(w[0])
        if prev == '':
            return candidates[0]
        else:
            # prefer the candidate that best collocates with the previous word
            return max(candidates,
                       key=lambda x: self.scored[(prev, x)] if (prev, x) in self.scored else 0)

    def parse(self, sentence):
        sent_str = ''
        for w in sentence:
            sent_str += w + ' '
        sent_str = sent_str.strip()
        tree = self.parser.parse(sent_str)
        return tree

    def orderOneOf(self, sentence):
        full_sentence = nltk.word_tokenize(' '.join(sentence))
        tags = nltk.pos_tag(full_sentence)
        new_sentence = []
        for i in range(len(full_sentence) - 1):
            if full_sentence[i] == 'one' and full_sentence[i + 1] == 'of':
                for j in reversed(range(i - 1)):
                    if 'VB' in tags[j][1] and tags[j][1] != 'VBD' and tags[j + 1][1] == 'RB':
                        new_sentence.insert(j + 2, 'of')
                        new_sentence.insert(j + 2, 'one')
                        break
                    elif tags[j][1] == 'IN' or ('VB' in tags[j][1] and tags[j][1] != 'VBD'):
                        new_sentence.insert(j + 1, 'of')
                        new_sentence.insert(j + 1, 'one')
                        break
                    elif tags[j][1] == 'DT':
                        new_sentence.insert(j, 'of')
                        new_sentence.insert(j, 'one')
                        break
            elif i < 2 or (full_sentence[i] != 'of' and full_sentence[i - 1] != 'one'):
                new_sentence.append(full_sentence[i])
        return new_sentence

    def pluralize(self, tree):
        if type(tree) is Tree:
            if tree.node in ['VB', 'VP'] and not type(tree[0]) is Tree:
                tree[0] = pattern.en.conjugate(tree[0], '3sg')
            #if tree.node == 'VBP':
            #    tree[0] = pattern.en.conjugate(tree[0], tense=PARTICIPLE, parse=True)
            if tree.node in ['NP', 'ADJP', 'UCP']:
                findCD = False
                for child in tree:
                    if child.node == 'CD' and not type(child[0]) is Tree \
                            and child[0].lower() not in ['1', 'a', 'an', 'one']:
                        findCD = True
                    if child.node == 'JJ' and not type(child[0]) is Tree \
                            and child[0].lower() in ['many', 'numerous', 'a lot']:
                        findCD = True
                    if child.node == 'QP':
                        findCD = True
                    if findCD and child.node == 'NN':
                        child[0] = pattern.en.pluralize(child[0])
            for child in tree:
                self.pluralize(child)

    def arrangeLocations(self, tree):
        if type(tree) is Tree:
            if tree.node == 'NAC':
                for i in range(0, len(tree)):
                    child = tree[i]
                    if i < len(tree) - 1 and child.node == 'NNP' \
                            and not type(tree[i + 1][0]) is Tree and \
                            tree[i + 1][0].lower() in ['state', 'city']:
                        del tree[i + 1]
                        del tree[i]
                        tree.insert(0, child)
                    if i >= len(tree) - 1:
                        break
            for child in tree:
                self.arrangeLocations(child)

    def uncompleteSentence(self, sentence):
        full_sentence = nltk.word_tokenize(' '.join(sentence))
        tags = nltk.pos_tag(full_sentence)
        new_sentence = []
        for i in range(len(full_sentence) - 1):
            isVerb = True
            synsets = wordnet.synsets(tags[i][0])
            for syn in synsets:
                if 'verb.' not in syn.lexname:
                    isVerb = False
                    break
            if ('NN' == tags[i][1] or tags[i][1] == 'RB') and isVerb:
                new_sentence.append(conjugate(tags[i][0], 'part'))
            elif tags[i][1] == 'JJ' and isVerb:
                new_sentence.append(conjugate(tags[i][0], 'ppart'))
            else:
                new_sentence.append(tags[i][0])
        new_sentence.append(full_sentence[-1])
        return new_sentence

    def forwardDirectionWord(self, tree):
        if type(tree) is Tree:
            if tree.node == 'NP':
                for i in range(0, len(tree)):
                    child = tree[i]
                    if child.node in ['NNP', 'NNPS'] and \
                            not type(child[0]) is Tree and child[0].lower() \
                            in self.directions:
                        del tree[i]
                        child[0] = 'the ' + child[0] + ' of'
                        tree.insert(0, child)
                        return
            for child in tree:
                self.forwardDirectionWord(child)

    def suchAs(self, sentence):
        st = ' '.join(sentence)
        reg = r'for example : ([\w\s,]+) etc\.'
        st = re.sub(reg, 'such as \g<1>, etc', st)
        return st.split(' ')

    def arrangeDate(self, sentence):
        year = r'[12]\d{3}'
        month = r'January|February|March|April|May|June|July|August|September|October|November|December'
        day = r'\d{1,2}'
        # while-loop so the index can actually skip past consumed date tokens
        i = 0
        while i < len(sentence):
            yWord = mWord = dWord = ''
            word = sentence[i]
            if re.match(year, word):
                yWord = sentence[i]
                if i + 1 < len(sentence):
                    nextWord = sentence[i + 1]
                    if re.match(month, nextWord):
                        mWord = nextWord.capitalize()
                        if i + 2 < len(sentence):
                            nextNextWord = sentence[i + 2]
                            if re.match(day, nextNextWord):
                                dWord = nextNextWord
            if yWord != '' and mWord != '' and dWord != '':
                sentence[i:i + 3] = [mWord, dWord, yWord]
                i += 3
            elif yWord != '' and mWord != '':
                sentence[i:i + 2] = [mWord, yWord]
                i += 2
            else:
                i += 1
        return sentence

    def superlative(self, tree):
        if type(tree) is Tree:
            for i in range(0, len(tree)):
                if i + 1 < len(tree):
                    if tree[i].node == 'RBS' and tree[i + 1].node == 'JJ':
                        if type(tree[i][0]) is not Tree and \
                                type(tree[i + 1][0]) is not Tree:
                            superWord = 'the ' + pattern.en.superlative(tree[i + 1][0])
                            if 'most' == superWord:
                                del tree[i + 1]
                            elif 'most' not in superWord:
                                tree[i + 1][0] = superWord
                                del tree[i]
                            else:
                                tree[i + 1][0] = superWord
                                del tree[i]
                            return
            for child in tree:
                self.superlative(child)

    def flatSentence(self, wl):
        result = ''
        for i in range(len(wl)):
            if i == 0:
                result = wl[0].capitalize()
            elif wl[i] == ',' or wl[i] == '.':
                result += wl[i]
            else:
                result += ' ' + wl[i]
        return result + '.'
    def postProcess(self, sentence):
        strategies = [
            (self.suchAs, False),
            (self.arrangeDate, False),
            (self.pluralize, True),
            (self.arrangeLocations, True),
            (self.superlative, True),
            (self.orderOneOf, False),
            (self.forwardDirectionWord, True),
            (self.uncompleteSentence, False),
        ]
        #Process flat sentence first
        for (func, isTree) in strategies:
            if not isTree:
                sentence = func(sentence)
            else:
                tree = self.parse(sentence)
                func(tree)
                sentence = tree.leaves()
        return self.flatSentence(sentence)
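A hypothetical end-to-end run of Translator; the dictionary file name comes from __init__, but the "word#TAG" input format and the dictionary line format are inferred from preProcess/loadDictionary and are not guaranteed:

# Hypothetical usage; input formats here are assumptions, not from the original.
t = Translator()
t.loadDictionary()  # expects lines like u"中文词:english (TAG); another (TAG)"
tokens = t.preProcess(u'他#PN 是#VC 学生#NN 。#PU', 'baseline')
print(t.postProcess(tokens))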
from nltk.corpus import stopwords
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import pprint
from stat_parser import Parser
import questanalyser
import sys

parser = Parser()
f = open('questions.txt', 'r')
cont = f.readlines()
f = open('cprogramming.txt', 'r')
train_text = f.readlines()

for sample_text in cont:
    if len(sample_text) < 3:
        break
    parsed_tree = parser.parse(sample_text)
    print ""
    # all noun phrases in the question
    np = [" ".join(i.leaves()) for i in parsed_tree.subtrees() if i.label() == 'NP']
    # multi-word noun phrases that contain no comma
    np_mwe_nocomma = [j for j in
                      [" ".join(i.leaves()) for i in parsed_tree.subtrees()
                       if i.label() == 'NP']
                      if j.count(' ') > 0 and j.count(',') == 0]
    x = []
    for i in sorted(np_mwe_nocomma, key=len):
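The loop above is cut off in the source; a plausible continuation, sketched rather than recovered, keeps the noun phrases that are not substrings of a longer one:

# Sketch of one plausible continuation (not recovered from the original).
x = []
for i in sorted(np_mwe_nocomma, key=len):
    if not any(i in longer for longer in np_mwe_nocomma if longer != i):
        x.append(i)
print(x)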
# Earlier (v0.1) training routine of the SySE classifier; assumes numpy as np,
# pandas as pd, and stat_parser.Parser are imported at module level.
def train(self, trainingSentences, labels, alpha=0.1, beta=0.1, debug=0):
    if debug > -1:
        print
        print '*********************************************************'
        print '                      SySE V 0.1                         '
        print 'Beginning Training Sequence with ' + \
            str(len(trainingSentences)) + ' training sentences...'
        print '*********************************************************'
    if debug > 0:
        print
        print 'Initializing... '
    if type(trainingSentences[0]) != list:
        print 'These sentences do not appear to have been parsed.'
        print 'They will be parsed now.'
        if len(trainingSentences) > 10:
            print 'Given their volume, this will take some time.'
        try:
            self.parser = Parser()
        except:
            print 'This environment should have pystatparser loaded ' + \
                  'in order to train on unparsed sentences.'
            print 'Exiting...'
            return
        trainingSentences = [self.parser.parse(x) for x in trainingSentences]

    ####Initialization
    #Save hyperparameters
    self.alpha = alpha
    self.beta = beta

    #See what tags are in the training data.
    tags = []
    for sentence in trainingSentences:
        flat = self.recursiveFlatten(sentence)
        for el in flat:
            if type(el) == unicode and el not in tags:
                tags.append(el)
    self.tags = set(tags)

    #What kind of root tags are there?
    self.sentenceTypes = set([x[0] for x in trainingSentences])

    #Which tags may contain other tags?
    self.phraseTags = []
    for sentence in trainingSentences:
        flat = self.recursiveFlatten(sentence)
        for i in range(0, len(flat)):
            try:
                if type(flat[i]) == unicode and type(flat[i + 1]) == unicode \
                        and flat[i] not in self.phraseTags:
                    self.phraseTags.append(flat[i])
            except IndexError:
                print 'We\'ve reached the end of this sentence'
    self.phraseTags = set(self.phraseTags) - self.sentenceTypes

    #Robustness
    labels = list(labels)

    #Split training sentences into Important (I) and Regular (R) (Unimportant)
    importantSentences = filter(lambda x: labels[trainingSentences.index(x)] == 1,
                                trainingSentences)
    regularSentences = filter(lambda x: not labels[trainingSentences.index(x)] == 1,
                              trainingSentences)
    self.classPriors = []

    ###Test inputs
    #Make sure labels are the right length for the sentences.
    if len(labels) != len(trainingSentences):
        print 'Labels and trainingSentences must be the same length!'
        return
    #Make sure labels are valid
    for label in labels:
        if label != 0 and label != 1:
            print 'Labels should be either 0 or 1.'
            print 'exiting...'
            return

    ###Train Class Priors
    self.classPriors.append(float(labels.count(0)) / float(len(labels)))
    self.classPriors.append(float(labels.count(1)) / float(len(labels)))
    if debug > 0:
        print '*********************************************************'
        print 'These are the class priors'
        print '*********************************************************'
        print self.classPriors
        print
        print

    ###Train Sentence Type
    self.importantRootProbabilities = dict(zip(list(self.sentenceTypes),
                                               np.zeros(len(list(self.sentenceTypes)))))
    self.regularRootProbabilities = dict(zip(list(self.sentenceTypes),
                                             np.zeros(len(list(self.sentenceTypes)))))
    #Get the count of each sentence type in I
    for sentence in importantSentences:
        #Make sure we get what we expect
        if type(sentence[0]) != unicode:
            print "Expected a unicode sentence type. exiting..."
            return
        #count this sentence type
        self.importantRootProbabilities[sentence[0]] += 1
    #Turn the counts into probabilities with additive smoothing
    for param in self.importantRootProbabilities:
        self.importantRootProbabilities[param] = \
            (float(self.importantRootProbabilities[param]) + alpha) / \
            (float(len(trainingSentences)) + alpha * (len(self.importantRootProbabilities) + 1))
    #Get the count of each sentence type in R
    for sentence in regularSentences:
        #Make sure we get what we expect
        if type(sentence[0]) != unicode:
            print "Expected a unicode sentence type. exiting..."
            return
        #count this sentence type
        self.regularRootProbabilities[sentence[0]] += 1
    #Turn the counts into probabilities
    for param in self.regularRootProbabilities:
        self.regularRootProbabilities[param] = \
            float(self.regularRootProbabilities[param]) / float(len(trainingSentences))
    if debug > 0:
        print '*********************************************************'
        print 'These are the sentence type parameters'
        print '*********************************************************'
        print ' --------------------------------------------------------'
        print ' For Important Sentences:'
        print self.importantRootProbabilities
        print ' --------------------------------------------------------'
        print ' For Regular Sentences:'
        print self.regularRootProbabilities
        print
        print

    ###Train Phrases
    ##Primitive Inference on Multiplicity Parameter
    #Dictionaries to store how many times a tag was included in a phrase level
    tagInclusionI = dict(zip(list(self.tags), np.zeros(len(list(self.tags)))))
    tagInclusionR = dict(zip(list(self.tags), np.zeros(len(list(self.tags)))))
    #Dictionaries to store how many times a tag was used at all
    tagCountI = dict(zip(list(self.tags), np.zeros(len(list(self.tags)))))
    tagCountR = dict(zip(list(self.tags), np.zeros(len(list(self.tags)))))
    #To store the simple Poisson parameter estimates
    self.importantMultiplicityParameter = dict(zip(list(self.tags),
                                                   np.zeros(len(list(self.tags)))))
    self.regularMultiplicityParameter = dict(zip(list(self.tags),
                                                 np.zeros(len(list(self.tags)))))

    #Get Inclusion for I
    for sentence in importantSentences:
        self.getInclusions([sentence, tagInclusionI, debug >= 2])
    #Get Inclusion for R
    for sentence in regularSentences:
        self.getInclusions([sentence, tagInclusionR, debug >= 2])
    #Get Counts for I
    for sentence in importantSentences:
        flat = self.recursiveFlatten(sentence)
        currentTags = filter(lambda x: type(x) == unicode, flat)
        for tag in currentTags[1:]:
            tagCountI[tag] += 1
    #Get Counts for R
    for sentence in regularSentences:
        flat = self.recursiveFlatten(sentence)
        currentTags = filter(lambda x: type(x) == unicode, flat)
        for tag in currentTags[1:]:
            tagCountR[tag] += 1
    #Estimate Parameters for I
    for tag in tagInclusionI.keys():
        if (tagCountI[tag] > 1):
            self.importantMultiplicityParameter[tag] = (tagCountI[tag] - 1) / tagInclusionI[tag]
    #Estimate Parameters for R
    for tag in tagInclusionR.keys():
        if (tagCountR[tag] > 1):
            self.regularMultiplicityParameter[tag] = (tagCountR[tag] - 1) / tagInclusionR[tag]
    if debug > 0:
        print '*********************************************************'
        print '        Estimation for Multiplicity Parameters           '
        print '*********************************************************'
        print
        print ' ------------------------------------------------------------------'
        print 'Tag Counts for Important Sentences:'
        print tagCountI
        print 'Tag Counts for Regular Sentences:'
        print tagCountR
        print ' ------------------------------------------------------------------'
        print 'Tag Inclusion for Important Sentences:'
        print tagInclusionI
        print 'Tag Inclusion for Regular Sentences:'
        print tagInclusionR
        print ' ------------------------------------------------------------------'
        print 'Dumb Parameter Estimates for Important Sentences:'
        print self.importantMultiplicityParameter
        print 'Dumb Parameter Estimates for Regular Sentences:'
        print self.regularMultiplicityParameter
        print
        print

    ##Primitive Inference on Presence Parameters
    #We need to find inclusions given the parent tag.
    #These conditional presence probabilities can almost be thought of
    #as transition probabilities.
    #This is the uninformed probability of a particular presence.
    ui = self.alpha / (self.alpha * (len(self.regularRootProbabilities) + 1))
    #For important phrases
    self.importantCondPresenceProbs = np.zeros(
        [len(self.tags), len(self.phraseTags) + len(self.sentenceTypes)])
    self.importantCondPresenceProbs = pd.DataFrame(
        self.importantCondPresenceProbs).applymap(lambda x: x + ui)
    self.importantCondPresenceProbs.columns = list(self.sentenceTypes) + list(self.phraseTags)
    self.importantCondPresenceProbs.index = list(self.tags)
    #For regular phrases
    self.regularCondPresenceProbs = np.zeros(
        [len(self.tags), len(self.phraseTags) + len(self.sentenceTypes)])
    self.regularCondPresenceProbs = pd.DataFrame(
        self.regularCondPresenceProbs).applymap(lambda x: x + ui)
    self.regularCondPresenceProbs.columns = list(self.sentenceTypes) + list(self.phraseTags)
    self.regularCondPresenceProbs.index = list(self.tags)
    #Dictionaries to store how many times a tag was used at all; this time
    #root/sentence tags are counted as well, unlike above.
    tagCountI = dict(zip(list(self.tags), np.zeros(len(list(self.tags)))))
    tagCountR = dict(zip(list(self.tags), np.zeros(len(list(self.tags)))))

    #Count Conditional Inclusions for Important Sentences
    for sentence in importantSentences:
        self.getInclusionsGivenParent(
            [sentence, self.importantCondPresenceProbs, sentence[0], debug >= 2])
    #Count Conditional Inclusions for Regular Sentences
    for sentence in regularSentences:
        self.getInclusionsGivenParent(
            [sentence, self.regularCondPresenceProbs, sentence[0], debug >= 2])
    #Get Counts for I
    for sentence in importantSentences:
        flat = self.recursiveFlatten(sentence)
        currentTags = filter(lambda x: type(x) == unicode, flat)
        for tag in currentTags:
            tagCountI[tag] += 1
    #Get Counts for R
    for sentence in regularSentences:
        flat = self.recursiveFlatten(sentence)
        currentTags = filter(lambda x: type(x) == unicode, flat)
        for tag in currentTags:
            tagCountR[tag] += 1
    #Calculate Conditional Presence Parameter for Important Sentences
    for column in self.importantCondPresenceProbs.columns:
        if tagCountI[column] > 0:
            #Additive smoothing
            num = self.importantCondPresenceProbs.loc[:, column] + alpha
            denom = tagCountI[column] + (len(self.importantCondPresenceProbs.columns) + 1) * alpha
            self.importantCondPresenceProbs.loc[:, column] = num / denom
    #Calculate Conditional Presence Parameter for Regular Sentences
    for column in self.regularCondPresenceProbs.columns:
        if tagCountR[column] > 0:
            #Additive smoothing
            num = self.regularCondPresenceProbs.loc[:, column] + alpha
            denom = tagCountR[column] + (len(self.regularCondPresenceProbs.columns) + 1) * alpha
            self.regularCondPresenceProbs.loc[:, column] = num / denom
    if debug > 1:
        print '*********************************************************'
        print 'Presence Parameter Estimation'
        print '*********************************************************'
        print
        print ' ------------------------------------------------------------------'
        print ' Conditional Parameters for Important Sentences'
        print self.importantCondPresenceProbs
        print
        print ' ------------------------------------------------------------------'
        print ' Conditional Parameters for Regular Sentences'
        print self.regularCondPresenceProbs
        print ' ------------------------------------------------------------------'
    if debug > -1:
        print
        print
        print '...Finished'
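The additive smoothing used for the root and presence probabilities above is easy to sanity-check by hand; a tiny standalone example with made-up counts:

# Standalone illustration of the additive smoothing above (made-up numbers).
alpha = 0.1
counts = {'S': 7.0, 'SBARQ': 2.0, 'FRAG': 0.0}
n = 10.0  # total training sentences
k = len(counts)
probs = {t: (c + alpha) / (n + alpha * (k + 1)) for t, c in counts.items()}
print(probs)  # the unseen 'FRAG' still gets small nonzero mass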
from nltk.corpus import stopwords
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import pprint
from stat_parser import Parser

parser = Parser()

EXAMPLE_TEXT = "What is a pointer on pointer?"

# drop English stopwords from the example question
stop_words = set(stopwords.words('english'))
word_tokens = word_tokenize(EXAMPLE_TEXT)
filtered_sentence = [w for w in word_tokens if not w in stop_words]
"""
print(word_tokens)
print(filtered_sentence)
"""

train_text = """Hello girls and guys, welcome to an in-depth and practical
machine learning course. The objective of this course is to give you a
wholistic understanding of machine learning, covering theory, application,
and inner workings of supervised, unsupervised, and deep learning algorithms.
In this series, we'll be covering linear regression, K Nearest Neighbors,
Support Vector Machines (SVM), flat clustering, hierarchical clustering, and
neural networks. For each major algorithm that we cover, we will discuss the
high level intuitions of the algorithms and how they are logically meant to
work. Next, we'll apply the algorithms in code using real world data sets
along with a module, such as with Scikit-Learn. Finally, we'll be diving into
the inner workings of each of the algorithms by recreating them in code, from
scratch, ourselves, including all of the math involved. This should give you
a complete understanding of exactly how the algorithms work, how they can be
tweaked, what advantages are, and what their disadvantages are. In order to
follow along with the series, I suggest you have at the very least a basic
understanding of Python. If you do not, I suggest you at least follow the
Python 3 Basics tutorial until the module"""
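The PunktSentenceTokenizer import above suggests train_text is meant as tokenizer training data; a short sketch of that standard usage (this wiring is an assumption, not part of the original snippet):

# Sketch: train the unsupervised Punkt tokenizer and split the text with it.
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
for sent in custom_sent_tokenizer.tokenize(train_text)[:3]:
    print(sent)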
import re
import math
import unicodedata
import numpy as np
import pandas as pd
from pkg_resources import resource_filename
from stat_parser import Parser

class SySE:
    def __init__(self):
        try:
            send = resource_filename(__name__, 'default.dat')
            self.loadParameters(send)
        except:
            print "Could not load default parameters."
            print "You should either train this object using the \"train\" " + \
                  "method, or load parameters with the \"loadParameters\" method"

    ####Supervised Training.
    #trainingSentences: sentences on which to train (may already be parsed)
    #labels: corresponding binary (1,0) labels.
    #HyperParams for the conjugate Beta and Gamma priors respectively.
    #The Gamma distribution is parameterized such that the term with the beta
    #parameter looks like this: e^(-x*beta) for random variable x.
    def train(self, trainingSentences, labels,
              binomHyperParams=[0.5, 0.5],
              poissonHyperParams=[0.0001, 0.005],
              debug=0):
        if debug > -1:
            print
            print '**********************************************************'
            print '                     SySE V 1.1.2                         '
            print 'Beginning Training Sequence with ' + \
                str(len(trainingSentences)) + ' training sentences...'
            print '**********************************************************'
        if debug > 0:
            print
            print 'Initializing... '
        if type(trainingSentences[0]) != list:
            print 'These sentences do not appear to have been parsed.'
            print 'They will be parsed now.'
            if len(trainingSentences) > 10:
                print 'Given their volume, this will take some time.'
            try:
                self.parser = Parser()
                trainingSentences = [self.parser.parse(x) for x in trainingSentences]
            except:
                print 'This environment should have pystatparser installed ' + \
                      'in order to train on unparsed sentences.'
                print 'Parameters could not be fit'
                print 'Exiting...'
                return

        ####Initialization
        #Save hyperparameters
        self.binomHyperParams = binomHyperParams
        self.poissonHyperParams = poissonHyperParams

        #See what tags are in the training data.
        tags = []
        for sentence in trainingSentences:
            flat = self.recursiveFlatten(sentence)
            for el in flat:
                if type(el) == unicode and el not in tags:
                    tags.append(el)
        self.tags = set(tags)

        #What kind of root tags are there?
        self.sentenceTypes = set([x[0] for x in trainingSentences])

        #Which tags may contain other tags?
        self.phraseTags = []
        for sentence in trainingSentences:
            flat = self.recursiveFlatten(sentence)
            for i in range(0, len(flat)):
                try:
                    if type(flat[i]) == unicode and type(flat[i + 1]) == unicode \
                            and flat[i] not in self.phraseTags:
                        self.phraseTags.append(flat[i])
                except IndexError:
                    print 'We\'ve reached the end of this sentence'
        self.phraseTags = set(self.phraseTags) - self.sentenceTypes

        #Robustness
        labels = list(labels)

        #Split training sentences into Important (I) and Regular (R) (Unimportant)
        importantSentences = filter(lambda x: labels[trainingSentences.index(x)] == 1,
                                    trainingSentences)
        regularSentences = filter(lambda x: not labels[trainingSentences.index(x)] == 1,
                                  trainingSentences)
        self.classPriors = []

        ###Test inputs
        #Make sure labels are the right length for the sentences.
        if len(labels) != len(trainingSentences):
            print 'Labels and trainingSentences must be the same length!'
            return
        #Make sure labels are valid
        for label in labels:
            if label != 0 and label != 1:
                print 'Labels should be either 0 or 1.'
                print 'exiting...'
                return

        ###Train Class Priors
        self.classPriors.append(float(labels.count(0)) / float(len(labels)))
        self.classPriors.append(float(labels.count(1)) / float(len(labels)))
        if debug > 0:
            print '*********************************************************'
            print 'These are the class priors'
            print '*********************************************************'
            print self.classPriors
            print
            print

        ###Train Sentence Type
        self.importantRootProbabilities = dict(zip(
            list(self.sentenceTypes),
            [binomialParamDist(self.binomHyperParams)
             for x in range(0, len(list(self.sentenceTypes)))]))
        self.regularRootProbabilities = dict(zip(
            list(self.sentenceTypes),
            [binomialParamDist(self.binomHyperParams)
             for x in range(0, len(list(self.sentenceTypes)))]))

        #Get the count of each sentence type in I
        for sentence in importantSentences:
            #Make sure we get what we expect
            if type(sentence[0]) != unicode:
                print "Expected a unicode sentence type. exiting..."
                return
            #Record a success for this type, a failure for all the others
            self.importantRootProbabilities[sentence[0]].update(1)
            for sentence1 in importantSentences:
                if sentence1 != sentence:
                    self.importantRootProbabilities[sentence1[0]].update(False)

        #Get the count of each sentence type in R
        for sentence in regularSentences:
            #Make sure we get what we expect
            if type(sentence[0]) != unicode:
                print "Expected a unicode sentence type. exiting..."
                return
            #Record a success for this type, a failure for all the others
            self.regularRootProbabilities[sentence[0]].update(1)
            for sentence1 in regularSentences:
                if sentence1 != sentence:
                    self.regularRootProbabilities[sentence1[0]].update(False)

        if debug > 0:
            print '*********************************************************'
            print 'These are the sentence type parameters'
            print '*********************************************************'
            print ' --------------------------------------------------------'
            print ' For Important Sentences:'
            print self.importantRootProbabilities
            print ' --------------------------------------------------------'
            print ' For Regular Sentences:'
            print self.regularRootProbabilities
            print
            print

        ###Train Phrases
        ##Primitive Inference on Multiplicity Parameter
        #To store Poisson beliefs over tag multiplicities
        self.importantMultiplicityParameters = dict(zip(
            list(self.tags),
            [poissonParamDist(self.poissonHyperParams)
             for x in range(0, len(list(self.tags)))]))
        self.regularMultiplicityParameters = dict(zip(
            list(self.tags),
            [poissonParamDist(self.poissonHyperParams)
             for x in range(0, len(list(self.tags)))]))
        #Get Inclusion for I
        for sentence in importantSentences:
            self.getInclusions([sentence, self.importantMultiplicityParameters, debug >= 2])
        #Get Inclusion for R
        for sentence in regularSentences:
            self.getInclusions([sentence, self.regularMultiplicityParameters, debug >= 2])

        #Get Counts for I
        for sentence in importantSentences:
            flat = self.recursiveFlatten(sentence)
            currentTags = filter(lambda x: type(x) == unicode, flat)
            for tag in currentTags[1:]:
                self.importantMultiplicityParameters[tag].updateCount(1)
        #Get Counts for R
        for sentence in regularSentences:
            flat = self.recursiveFlatten(sentence)
            currentTags = filter(lambda x: type(x) == unicode, flat)
            for tag in currentTags[1:]:
                self.regularMultiplicityParameters[tag].updateCount(1)

        #TODO: go over this again
        #Adjust Parameters for I
        for tag in self.importantMultiplicityParameters.keys():
            if (self.importantMultiplicityParameters[tag].alpha > 1):
                self.importantMultiplicityParameters[tag].updateCount(-1)
        #Adjust Parameters for R
        for tag in self.regularMultiplicityParameters.keys():
            if (self.regularMultiplicityParameters[tag].alpha > 1):
                self.regularMultiplicityParameters[tag].updateCount(-1)

        if debug > 0:
            print '*********************************************************'
            print '        Estimation for Multiplicity Parameters           '
            print '*********************************************************'
            print
            print 'Parameter Estimates for Important Sentences:'
            print self.importantMultiplicityParameters
            print 'Parameter Estimates for Regular Sentences:'
            print self.regularMultiplicityParameters
            print
            print

        ##Primitive Inference on Presence Parameters
        #We need to find inclusions given the parent tag.
        #These conditional presence probabilities can almost be thought of
        #as transition probabilities.
        #For important phrases
        self.importantCondPresenceProbs = np.zeros(
            [len(self.tags), len(self.phraseTags) + len(self.sentenceTypes)])
        self.importantCondPresenceProbs = pd.DataFrame(
            self.importantCondPresenceProbs).applymap(
                lambda x: binomialParamDist(self.binomHyperParams))
        self.importantCondPresenceProbs.columns = list(self.sentenceTypes) + list(self.phraseTags)
        self.importantCondPresenceProbs.index = list(self.tags)
        #For regular phrases
        self.regularCondPresenceProbs = np.zeros(
            [len(self.tags), len(self.phraseTags) + len(self.sentenceTypes)])
        self.regularCondPresenceProbs = pd.DataFrame(
            self.regularCondPresenceProbs).applymap(
                lambda x: binomialParamDist(self.binomHyperParams))
        self.regularCondPresenceProbs.columns = list(self.sentenceTypes) + list(self.phraseTags)
        self.regularCondPresenceProbs.index = list(self.tags)

        #Count Conditional Inclusions for Important Sentences
        for sentence in importantSentences:
            self.getInclusionsGivenParent(
                [sentence, self.importantCondPresenceProbs, sentence[0], debug >= 2])
        #Count Conditional Inclusions for Regular Sentences
        for sentence in regularSentences:
            self.getInclusionsGivenParent(
                [sentence, self.regularCondPresenceProbs, sentence[0], debug >= 2])

        if debug > 1:
            print '*********************************************************'
            print 'Presence Parameter Estimation'
            print '*********************************************************'
            print
            print ' ------------------------------------------------------------------'
            print ' Conditional Parameters for Important Sentences'
            print self.importantCondPresenceProbs
            print
            print ' ------------------------------------------------------------------'
            print ' Conditional Parameters for Regular Sentences'
            print self.regularCondPresenceProbs
            print ' ------------------------------------------------------------------'
        if debug > -1:
            print
            print
            print '...Finished'

    ####Classification
    def classify(self, sentence, debug=0, varianceExponent=0):
        #If the sentence hasn't been parsed, we must parse it.
        plaintext = False
        if type(sentence) != list:
            plaintext = True
            original = sentence
            try:
                sentence = self.parser.parse(sentence)
            except:
                try:
                    self.parser = Parser()
                    sentence = self.parser.parse(sentence)
                except:
                    print 'Couldn\'t create a parsing object.'
                    print 'Perhaps pystatparser is not loaded?'
                    print 'type \"from stat_parser import Parser\"'
                    print 'Otherwise, you must install it from Github as ' + \
                          'directed in the README'

        #Deal with new root types
        if sentence[0] not in self.importantRootProbabilities:
            self.importantRootProbabilities[sentence[0]] = binomialParamDist(self.binomHyperParams)
        if sentence[0] not in self.regularRootProbabilities:
            self.regularRootProbabilities[sentence[0]] = binomialParamDist(self.binomHyperParams)

        #Deal with new non-root tag types
        flat = self.recursiveFlatten(sentence)
        flat = filter(lambda x: type(x) == unicode, flat)
        for i, tag in enumerate(flat):
            if tag not in self.tags:
                #Set a priori beliefs for multiplicity parameters
                self.importantMultiplicityParameters[tag] = poissonParamDist(self.poissonHyperParams)
                self.regularMultiplicityParameters[tag] = poissonParamDist(self.poissonHyperParams)
                #Set a priori beliefs for conditional presence parameters
                #(being contained by anything else)
                self.importantCondPresenceProbs.loc[tag] = [
                    binomialParamDist(self.binomHyperParams)
                    for x in self.importantCondPresenceProbs.columns]
                self.regularCondPresenceProbs.loc[tag] = [
                    binomialParamDist(self.binomHyperParams)
                    for x in self.regularCondPresenceProbs.columns]
                if i + 1 < len(flat) and type(flat[i + 1]) == unicode:
                    #Set a priori beliefs for conditional presence parameters
                    #(containing other things)
                    self.importantCondPresenceProbs[tag] = [
                        binomialParamDist(self.binomHyperParams)
                        for x in self.regularCondPresenceProbs.index]
                    self.regularCondPresenceProbs[tag] = [
                        binomialParamDist(self.binomHyperParams)
                        for x in self.regularCondPresenceProbs.index]

        ##Get P(x|y = Important)
        PxGy1 = math.log(self.importantRootProbabilities[sentence[0]].getMean())
        PxGy1 = PxGy1 / self.importantRootProbabilities[sentence[0]].getVar()**varianceExponent
        PxGy1 += self.getConditionalLevelProbability([
            sentence, self.importantCondPresenceProbs,
            self.importantMultiplicityParameters, sentence[0],
            varianceExponent, debug >= 2])
        ##Get P(x|y = Regular)
        PxGy0 = math.log(self.regularRootProbabilities[sentence[0]].getMean())
        PxGy0 = PxGy0 / self.regularRootProbabilities[sentence[0]].getVar()**varianceExponent
        PxGy0 += self.getConditionalLevelProbability([
            sentence, self.regularCondPresenceProbs,
            self.regularMultiplicityParameters, sentence[0],
            varianceExponent, debug >= 2])

        #Get priors in log form:
        Py1 = math.log(self.classPriors[1])
        Py0 = math.log(self.classPriors[0])
        #Get log probabilities of each class through Bayes' Rule
        Py1Gx = PxGy1 + Py1
        Py0Gx = PxGy0 + Py0

        #Derive softmax shift parameter for very small probabilities.
        shift = 0
        if min([Py1Gx, Py0Gx]) < -20:
            shift = -1 * min([Py1Gx, Py0Gx]) - 20
        #SoftMax probabilities
        try:
            denom = math.log(math.e**(shift + Py1Gx) + math.e**(shift + Py0Gx))
            sPy1Gx = shift + Py1Gx - denom
            sPy0Gx = shift + Py0Gx - denom
            #Turn back into probabilities for output
            sPy1Gx = math.e**sPy1Gx
            sPy0Gx = math.e**sPy0Gx
        except OverflowError:
            if debug > -1:
                print 'Overflow error'
                if Py1Gx >= Py0Gx:
                    print 'Assigning important sentence with probability one.'
                else:
                    print 'Assigning regular sentence with probability one.'
                print 'Before softmax, log probabilities were:'
                print 'P(important | sentence) = ' + str(Py1Gx)
                print 'P(unimportant | sentence) = ' + str(Py0Gx)
            if Py1Gx >= Py0Gx:
                sPy1Gx = 1.0
                sPy0Gx = 0.0
            else:
                sPy1Gx = 0.0
                sPy0Gx = 1.0

        if debug > -1:
            print 'Estimating Class for sentence:'
            if plaintext:
                print '\"' + original + '\"'
            else:
                print sentence
        if debug > 0:
            print ' ------------------------------------------------------------------'
            print 'Class Priors (log probability):'
            print 'P(important) = ' + str(Py1)
            print 'P(unimportant) = ' + str(Py0)
            print ' ------------------------------------------------------------------'
            print 'Conditional Sentence Log Probabilities:'
            print 'P(sentence | important) = ' + str(PxGy1)
            print 'P(sentence | unimportant) = ' + str(PxGy0)
            print ' ------------------------------------------------------------------'
            print 'Unnormalized Conditional Class Log Probabilities'
            print 'P(important | sentence) = ' + str(Py1Gx)
            print 'P(unimportant | sentence) = ' + str(Py0Gx)
        if debug > -1:
            print ' ------------------------------------------------------------------'
            print 'Softmaxed Conditional Class Probabilities'
            print 'P(important | sentence) = ' + str(sPy1Gx)
            print 'P(unimportant | sentence) = ' + str(sPy0Gx)
        return(sPy1Gx)

    def summarize(self, article, verbosity=0.5, debug=0):
        sentences = self.split_into_sentences(article)
        keepers = []
        i = 0
        for sentence in sentences:
            i += 1
            try:
                if self.classify(sentence, debug=debug) > verbosity:
                    keepers.append(sentence)
            except:
                print 'Error classifying sentence ' + str(i)
                print 'FullText: '
                print sentence
        if len(keepers) == 0:
            print 'No sentences found important'
            return('')
        reduced = reduce(lambda x, y: x + ' ' + y, keepers)
        return(reduced)

    ####Function Definitions
    #Returns the log probability of a level occurring, using recursion to
    #score the levels contained therein. May be passed an entire sentence.
    def getConditionalLevelProbability(self, inputs):
        level = inputs[0]
        tagDF = inputs[1]
        mult = inputs[2]
        parent = inputs[3]
        varExp = inputs[4]
        debug = inputs[5]
        ret = 0
        if debug == 1:
            print 'Beginning Level...........'
            print level
        inTags = [x[0] for x in level[1:]]
        if u'' in inTags:
            inTags.remove(u'')
        #Do some recursion
        for i, tag in enumerate(inTags):
            if tag in self.phraseTags or tag in self.sentenceTypes:
                if debug == 1:
                    print 'beginning recursion due to:'
                    print tag
                ret = ret + self.getConditionalLevelProbability(
                    [level[i + 1], tagDF, mult, tag, varExp, debug])
        #Do multiplicity for this level
        for tag in inTags:
            x = inTags.count(tag)
            mu = mult[tag].getMean()
            ret = ret + math.log((math.exp(-mu) * mu**x / math.factorial(x))) / mult[tag].getVar()**varExp
        #Do presence for this level
        inTags = list(set(inTags))
        for tag in inTags:
            if type(tag) != unicode:
                #Some sentences contain only a word; nothing to add in that case.
                print 'Breaking due to non-unicode tag in getConditionalLevelProbability!'
                print tag
                break
            if debug == 1:
                print 'Probability of ' + tag + ' given ' + parent + ' is ' + str(tagDF.loc[tag, parent])
            ret = ret + math.log(tagDF.loc[tag, parent].getMean()) / tagDF.loc[tag, parent].getVar()**varExp
        return(ret)

    #To get the inclusions, conditioned on the parent tag
    def getInclusionsGivenParent(self, inputs):
        level = inputs[0]
        tagDF = inputs[1]
        parent = inputs[2]
        debug = inputs[3]
        if debug == 1:
            print 'Beginning Level...........'
            print level
        inTags = [x[0] for x in level[1:]]
        if u'' in inTags:
            inTags.remove(u'')
        #Do some recursion
        for i, tag in enumerate(inTags):
            if tag in self.phraseTags or tag in self.sentenceTypes:
                if debug == 1:
                    print 'beginning recursion due to:'
                    print tag
                self.getInclusionsGivenParent([level[i + 1], tagDF, tag, debug])
        #Update tags on this level
        inTags = list(set(inTags))
        for tag in inTags:
            if type(tag) != unicode:
                #Some sentences contain only a word; nothing to add in that case.
                break
            if debug == 1:
                print 'incrementing: ' + tag + ' when conditioned on ' + parent
            tagDF.loc[tag, parent].update(True)
        #Update tags not on this level
        for tag in self.tags:
            if tag not in inTags:
                tagDF.loc[tag, parent].update(False)

    #To get the inclusions in a level recursively.
    def getInclusions(self, inputs):
        level = inputs[0]
        tagDict = inputs[1]
        debug = inputs[2]
        if debug == 1:
            print 'Beginning Level...........'
            print level
        inTags = [x[0] for x in level[1:]]
        if u'' in inTags:
            inTags.remove(u'')
        #Do some recursion
        for i, tag in enumerate(inTags):
            if tag in self.phraseTags or tag in self.sentenceTypes:
                if debug == 1:
                    print 'beginning recursion due to:'
                    print tag
                self.getInclusions([level[i + 1], tagDict, debug])
        #Add count for this level
        inTags = list(set(inTags))
        for tag in inTags:
            if type(tag) != unicode:
                #Some sentences contain only a word; nothing to add in that case.
                break
            if debug == 1:
                print 'incrementing: ' + tag
            tagDict[tag].incrementTrials()

    #To find all PoS tags (pystatparser's documentation is literally non-existent)
    def getTagsRecursively(self, ss, knownTags=[], debug=0):
        ret = knownTags
        for sentence in ss:
            for phrase in sentence:
                for element in phrase:
                    if type(element) == unicode:
                        if element not in ret:
                            ret.append(element)
                    if type(element) == list:
                        ret.extend(self.getTagsRecursively(element))
        return(ret)

    #Flatten an n-dimensional list into a 1D list
    def recursiveFlatten(self, myList):
        ret = []
        for element in myList:
            if type(element) == list:
                element = self.recursiveFlatten(element)
            if type(element) == str or type(element) == unicode:
                ret.append(element)
            else:
                ret.extend(list(element))
        return(ret)

    #From http://stackoverflow.com/questions/4576077/python-split-text-on-sentences
    def split_into_sentences(self, text):
        if type(text) == unicode:
            text = unicode(text.encode('utf-8'), errors='ignore')
            text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
        caps = "([A-Z])"
        prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
        suffixes = "(Inc|Ltd|Jr|Sr|Co)"
        starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
        acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
        websites = "[.](com|net|org|io|gov)"
        text = " " + text + " "
        text = text.replace("\n", " ")
        text = re.sub(prefixes, "\\1<prd>", text)
        text = re.sub(websites, "<prd>\\1", text)
        if "Ph.D" in text:
            text = text.replace("Ph.D.", "Ph<prd>D<prd>")
        if 'a.m.' in text:
            text = text.replace('a.m.', 'a<prd>m<prd>')
        if 'p.m.' in text:
            text = text.replace('p.m.', 'p<prd>m<prd>')
        if '...' in text:
            text = text.replace('...', '<prd><prd><prd>')
        text = re.sub("\s" + caps + "[.] ", " \\1<prd> ", text)
        text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)
        text = re.sub(caps + "[.]" + caps + "[.]" + caps + "[.]", "\\1<prd>\\2<prd>\\3<prd>", text)
        text = re.sub(caps + "[.]" + caps + "[.]", "\\1<prd>\\2<prd>", text)
        text = re.sub(" " + suffixes + "[.] " + starters, " \\1<stop> \\2", text)
        text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)
        text = re.sub(" " + caps + "[.]", " \\1<prd>", text)
        if "”" in text:
            text = text.replace(".”", "”.")
        if "\"" in text:
            text = text.replace(".\"", "\".")
        if "!" in text:
            text = text.replace("!\"", "\"!")
        if "?" in text:
            text = text.replace("?\"", "\"?")
        text = text.replace(".", ".<stop>")
        text = text.replace("?", "?<stop>")
        text = text.replace("!", "!<stop>")
        text = text.replace("<prd>", ".")
        sentences = text.split("<stop>")
        sentences = sentences[:-1]
        sentences = [s.strip() for s in sentences]
        return sentences

    #Write the parameters we have to file.
    #Passing the argument "default" to this function will overwrite the
    #parameters fit by the author.
    def storeParameters(self, target):
        try:
            str(target)
        except:
            print "storeParameters needs to be passed a string"
            return
        f = open(target, 'w')
        f.write(reduce(lambda x, y: str(x) + '/-_-/' + str(y), self.classPriors) + '\n')
        f.write(reduce(lambda x, y: str(x) + '/-_-/' + str(y), self.importantRootProbabilities.keys()) + '\n')
        f.write(reduce(lambda x, y: str(x) + '/-_-/' + str(y), [x.store() for x in self.importantRootProbabilities.values()]) + '\n')
        f.write(reduce(lambda x, y: str(x) + '/-_-/' + str(y), self.regularRootProbabilities.keys()) + '\n')
        f.write(reduce(lambda x, y: str(x) + '/-_-/' + str(y), [x.store() for x in self.regularRootProbabilities.values()]) + '\n')
        f.write(reduce(lambda x, y: str(x) + '/-_-/' + str(y), self.importantMultiplicityParameters.keys()) + '\n')
        f.write(reduce(lambda x, y: str(x) + '/-_-/' + str(y), [x.store() for x in self.importantMultiplicityParameters.values()]) + '\n')
        f.write(reduce(lambda x, y: str(x) + '/-_-/' + str(y), self.regularMultiplicityParameters.keys()) + '\n')
        f.write(reduce(lambda x, y: str(x) + '/-_-/' + str(y), [x.store() for x in self.regularMultiplicityParameters.values()]) + '\n')
        #f.write(reduce(lambda x, y: str(x) + ',' + str(y), self.alpha) + '\n')  # kept for the Bayes update
        #f.write(reduce(lambda x, y: str(x) + ',' + str(y), self.beta) + '\n')
        f.write(str(self.binomHyperParams[0]) + '/-_-/' + str(self.binomHyperParams[1]) + '\n')
        f.write(str(self.poissonHyperParams[0]) + '/-_-/' + str(self.poissonHyperParams[1]) + '\n')
        f.write(reduce(lambda x, y: str(x) + '/-_-/' + str(y), self.tags) + '\n')
        f.write(reduce(lambda x, y: str(x) + '/-_-/' + str(y), self.sentenceTypes) + '\n')
        f.write(reduce(lambda x, y: str(x) + '/-_-/' + str(y), self.phraseTags) + '\n')
        #Element-wise store of the DataFrame parameters
        ICP = []
        for i in self.importantCondPresenceProbs.index:
            for j in self.importantCondPresenceProbs.columns:
                ICP.append(self.importantCondPresenceProbs.loc[i, j].store())
        RCP = []
        for i in self.regularCondPresenceProbs.index:
            for j in self.regularCondPresenceProbs.columns:
                RCP.append(self.regularCondPresenceProbs.loc[i, j].store())
        f.write(reduce(lambda x, y: str(x) + '/-_-/' + str(y), ICP) + '\n')
        f.write(reduce(lambda x, y: str(x) + '/-_-/' + str(y), RCP) + '\n')
        f.close()

    #Load parameters from file. Simply provide the name you gave to
    #storeParameters. The argument "default" will load the parameters fit
    #by the author.
    def loadParameters(self, target):
        try:
            str(target)
        except:
            print "loadParameters needs to be passed a string"
            return
        f = open(target, 'r')
        groups = [x.split('/-_-/') for x in f.read().split('\n')]
        self.classPriors = [float(x) for x in groups[0]]
        self.importantRootProbabilities = dict(zip([unicode(x) for x in groups[1]],
                                                   [binomialParamDist().load(x) for x in groups[2]]))
        self.regularRootProbabilities = dict(zip([unicode(x) for x in groups[3]],
                                                 [binomialParamDist().load(x) for x in groups[4]]))
        self.importantMultiplicityParameters = dict(zip([unicode(x) for x in groups[5]],
                                                        [poissonParamDist().load(x) for x in groups[6]]))
        self.regularMultiplicityParameters = dict(zip([unicode(x) for x in groups[7]],
                                                      [poissonParamDist().load(x) for x in groups[8]]))
        #self.alpha = groups[10]  # coming with the Bayes update
        #self.beta = groups[11]
        self.binomHyperParams = [float(x) for x in groups[9]]
        self.poissonHyperParams = [float(x) for x in groups[10]]
        self.tags = [unicode(x) for x in groups[11]]
        self.sentenceTypes = [unicode(x) for x in groups[12]]
        self.phraseTags = [unicode(x) for x in groups[13]]
        #Unpack DataFrames
        self.importantCondPresenceProbs = np.zeros([len(self.tags), len(self.phraseTags) + len(self.sentenceTypes)])
        self.importantCondPresenceProbs = pd.DataFrame(self.importantCondPresenceProbs)
        self.importantCondPresenceProbs.columns = list(self.sentenceTypes) + list(self.phraseTags)
        self.importantCondPresenceProbs.index = list(self.tags)
        self.regularCondPresenceProbs = np.zeros([len(self.tags), len(self.phraseTags) + len(self.sentenceTypes)])
        self.regularCondPresenceProbs = pd.DataFrame(self.regularCondPresenceProbs)
        self.regularCondPresenceProbs.columns = list(self.sentenceTypes) + list(self.phraseTags)
        self.regularCondPresenceProbs.index = list(self.tags)
        for i, row in enumerate(self.importantCondPresenceProbs.index):
            for j, column in enumerate(self.importantCondPresenceProbs.columns):
                self.importantCondPresenceProbs.loc[row, column] = \
                    binomialParamDist().load(groups[14][i * len(self.importantCondPresenceProbs.columns) + j])
        for i, row in enumerate(self.regularCondPresenceProbs.index):
            for j, column in enumerate(self.regularCondPresenceProbs.columns):
                self.regularCondPresenceProbs.loc[row, column] = \
                    binomialParamDist().load(groups[15][i * len(self.regularCondPresenceProbs.columns) + j])
        f.close()

#NOTE: these self-recursive wrappers appear to be stand-ins; the real
#binomialParamDist / poissonParamDist distribution classes (with update,
#getMean, getVar, store, and load methods) are defined elsewhere in the project.
def binomialParamDist(params):
    return(binomialParamDist(params))

def poissonParamDist(params):
    return(poissonParamDist(params))
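classify's shift-and-exponentiate step above is a hand-rolled guard against underflow; the textbook way to get the same two-class softmax is the log-sum-exp trick, sketched standalone below (function name invented for illustration):

# Standalone log-sum-exp sketch of the two-class softmax used in classify.
import math

def softmax2(logp1, logp0):
    # subtracting the max keeps both exponents <= 0, so exp never overflows
    m = max(logp1, logp0)
    denom = m + math.log(math.exp(logp1 - m) + math.exp(logp0 - m))
    return math.exp(logp1 - denom), math.exp(logp0 - denom)

print(softmax2(-1000.0, -1001.0))  # ~(0.731, 0.269) despite tiny raw probabilities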
class SySE: def __init__(self): try: send = resource_filename(__name__, 'default.dat') self.loadParameters(send) except: print "Could not load default parameters." print "You should either train this object using the \"train\" " +\ "method, or load parameters with the \"loadParameters\" method" ####Supervised Training. #trainingSentences: sentences on which to train (May already be parsed) #labels: corresponding binary (1,0) labels. #HyperParams for the conjugate Beta and Gamma priors respectively. #The Gamma distributionis parameterized such that the term with the beta \ #parameters looks like this: e^(-x*beta) for random variable x. def train(self, trainingSentences, labels, binomHyperParams=[0.5, 0.5], poissonHyperParams=[0.0001, 0.005], debug=0): if debug > -1: print print '**********************************************************' print ' SySE V 1.1.2 ' print 'Beginning Training Sequence with ' + \ str(len(trainingSentences)) + ' training sentences...' print '**********************************************************' if debug > 0: print print 'Initializing... ' if type(trainingSentences[0]) != list: print 'These sentences do not appear to have been parsed.' print 'They will be parsed now.' if len(trainingSentences) > 10: print 'Given their volume, this will take some time.' try: self.parser = Parser() trainingSentences = [ self.parser.parse(x) for x in trainingSentences ] except: print 'This environment should have pystatparser installed ' +\ 'in order to train on unparsed sentences.' print 'Parameters could not be fit' print 'Exiting...' return ####Initialization #Save hyperparameters self.binomHyperParams = binomHyperParams self.poissonHyperParams = poissonHyperParams #See what tags are in the training data. tags = [] for sentence in trainingSentences: flat = self.recursiveFlatten(sentence) for el in flat: if type(el) == unicode and el not in tags: tags.append(el) self.tags = set(tags) #What kind of root tags are there? self.sentenceTypes = set([x[0] for x in trainingSentences]) #Which tags may contain other tags? self.phraseTags = [] for sentence in trainingSentences: flat = self.recursiveFlatten(sentence) for i in range(0, len(flat)): try: if type(flat[i]) == unicode and type(flat[ i + 1]) == unicode and flat[i] not in self.phraseTags: self.phraseTags.append(flat[i]) except IndexError: print 'We\'ve reached the end of this sentence' self.phraseTags = set(self.phraseTags) - self.sentenceTypes #Robustness labels = list(labels) #Split training sentences into Important (I) and Regular (R) (Unimportant) importantSentences = filter( lambda x: labels[trainingSentences.index(x)] == 1, trainingSentences) regularSentences = filter( lambda x: not labels[trainingSentences.index(x)] == 1, trainingSentences) self.classPriors = [] ###Test inputs #Make sure labels are right length for sentences. if len(labels) != len(trainingSentences): print 'Labels and trainingSentencs must be the same length!' return #Make sure labels are valid for label in labels: if label != 0 and label != 1: print 'Lables should be either 0 or 1.' print 'exiting...' 
return ###Train Class Priors self.classPriors.append(float(labels.count(0)) / float(len(labels))) self.classPriors.append(float(labels.count(1)) / float(len(labels))) if debug > 0: print '*********************************************************' print 'These are the class priors' print '*********************************************************' print self.classPriors print print ###Train Sentence Type self.importantRootProbabilities = dict( zip(list(self.sentenceTypes), [ binomialParamDist(self.binomHyperParams) for x in range(0, len(list(self.sentenceTypes))) ])) self.regularRootProbabilities = dict( zip(list(self.sentenceTypes), [ binomialParamDist(self.binomHyperParams) for x in range(0, len(list(self.sentenceTypes))) ])) #Get the count of each sentence type in I for sentence in importantSentences: #Make sure we get what we expect if type(sentence[0]) != unicode: print "We are looking for a non-unicode sentence type. exiting..." break return #if it isn't in the list yet, add it. self.importantRootProbabilities[sentence[0]].update(1) for sentence1 in importantSentences: if sentence1 != sentence: self.importantRootProbabilities[sentence1[0]].update(False) #Get the count of each sentence type in R for sentence in regularSentences: #Make sure we get what we expect if type(sentence[0]) != unicode: print "We are looking for a non-unicode sentence type. exiting..." break return #if it isn't in the list yet, add it. self.regularRootProbabilities[sentence[0]].update(1) for sentence1 in importantSentences: if sentence1 != sentence: self.regularRootProbabilities[sentence1[0]].update(False) if debug > 0: print '*********************************************************' print 'These are the sentence type parameters' print '*********************************************************' print ' --------------------------------------------------------' print ' For Important Sentences:' print self.importantRootProbabilities print ' --------------------------------------------------------' print ' For Regular Sentences:' print self.regularRootProbabilities print print ###Train Phrases ##Primitive Inference on Multiplicity Parameter #To store poisson beliefs self.importantMultiplicityParameters = dict( zip(list(self.tags), [ poissonParamDist(self.poissonHyperParams) for x in range(0, len(list(self.tags))) ])) #For storing parameter estimates. self.regularMultiplicityParameters = dict( zip(list(self.tags), [ poissonParamDist(self.poissonHyperParams) for x in range(0, len(list(self.tags))) ])) #For storing parameter estimates. 
        #Get Inclusions for I
        for sentence in importantSentences:
            self.getInclusions(
                [sentence, self.importantMultiplicityParameters, debug >= 2])
        #Get Inclusions for R
        for sentence in regularSentences:
            self.getInclusions(
                [sentence, self.regularMultiplicityParameters, debug >= 2])
        #Get Counts for I
        for sentence in importantSentences:
            flat = self.recursiveFlatten(sentence)
            currentTags = filter(lambda x: type(x) == unicode, flat)
            for tag in currentTags[1:]:
                self.importantMultiplicityParameters[tag].updateCount(1)
        #Get Counts for R
        for sentence in regularSentences:
            flat = self.recursiveFlatten(sentence)
            currentTags = filter(lambda x: type(x) == unicode, flat)
            for tag in currentTags[1:]:
                self.regularMultiplicityParameters[tag].updateCount(1)
        #Estimate Parameters for I
        for tag in self.importantMultiplicityParameters.keys():
            if (self.importantMultiplicityParameters[tag].alpha > 1):
                self.importantMultiplicityParameters[tag].updateCount(-1)
        #Estimate Parameters for R
        for tag in self.regularMultiplicityParameters.keys():
            if (self.regularMultiplicityParameters[tag].alpha > 1):
                self.regularMultiplicityParameters[tag].updateCount(-1)
        if debug > 0:
            print '*********************************************************'
            print '        Estimation for Multiplicity Parameters           '
            print '*********************************************************'
            print
            print 'Dumb Parameter Estimates for Important Sentences:'
            print self.importantMultiplicityParameters
            print 'Dumb Parameter Estimates for Regular Sentences:'
            print self.regularMultiplicityParameters
            print
            print
        ##Primitive Inference on Presence Parameters
        #We need to find inclusions given parent.
        #To store conditional presence probabilities, which can almost be
        #thought of as transition probabilities.
        #For important phrases
        self.importantCondPresenceProbs = np.zeros(
            [len(self.tags), len(self.phraseTags) + len(self.sentenceTypes)])
        self.importantCondPresenceProbs = pd.DataFrame(
            self.importantCondPresenceProbs).applymap(
                lambda x: binomialParamDist(self.binomHyperParams))
        self.importantCondPresenceProbs.columns = list(
            self.sentenceTypes) + list(self.phraseTags)
        self.importantCondPresenceProbs.index = list(self.tags)
        #For regular phrases
        self.regularCondPresenceProbs = np.zeros(
            [len(self.tags), len(self.phraseTags) + len(self.sentenceTypes)])
        self.regularCondPresenceProbs = pd.DataFrame(
            self.regularCondPresenceProbs).applymap(
                lambda x: binomialParamDist(self.binomHyperParams))
        self.regularCondPresenceProbs.columns = list(
            self.sentenceTypes) + list(self.phraseTags)
        self.regularCondPresenceProbs.index = list(self.tags)
        #Count Conditional Inclusions for Important Sentences
        for sentence in importantSentences:
            self.getInclusionsGivenParent([
                sentence, self.importantCondPresenceProbs, sentence[0],
                debug >= 2
            ])
        #Count Conditional Inclusions for Regular Sentences
        for sentence in regularSentences:
            self.getInclusionsGivenParent([
                sentence, self.regularCondPresenceProbs, sentence[0],
                debug >= 2
            ])
        if debug > 1:
            print '*********************************************************'
            print 'Presence Parameter Estimation'
            print '*********************************************************'
            print
            print ' ------------------------------------------------------------------'
            print '  Conditional Parameters for Important Sentences'
            print self.importantCondPresenceProbs
            print
            print ' ------------------------------------------------------------------'
            print '  Conditional Parameters for Regular Sentences'
            print self.regularCondPresenceProbs
            print ' ------------------------------------------------------------------'
        if debug > -1:
            print
            print
            print '...Finished'

    ####Classification
    def classify(self, sentence, debug=0, varianceExponent=0):
        #If the sentence hasn't been parsed, we must parse it.
        plaintext = False
        if type(sentence) != list:
            plaintext = True
            original = sentence
            try:
                sentence = self.parser.parse(sentence)
            except:
                try:
                    self.parser = Parser()
                    sentence = self.parser.parse(sentence)
                except:
                    print 'Couldn\'t create a parsing object.'
                    print 'Perhaps pystatparser is not loaded?'
                    print 'Type \"from stat_parser import Parser\"'
                    print 'Otherwise, you must install it from GitHub as ' + \
                        'directed in the README.'
                    return
        #Deal with new root types
        if sentence[0] not in self.importantRootProbabilities:
            self.importantRootProbabilities[
                sentence[0]] = self.binomialParamDist(self.binomHyperParams)
        if sentence[0] not in self.regularRootProbabilities:
            self.regularRootProbabilities[
                sentence[0]] = self.binomialParamDist(self.binomHyperParams)
        #Deal with new non-root tag types
        flat = self.recursiveFlatten(sentence)
        flat = filter(lambda x: type(x) == unicode, flat)
        for i, tag in enumerate(flat):
            if tag not in self.tags:
                #Set a priori beliefs for multiplicity parameters
                self.importantMultiplicityParameters[
                    tag] = self.poissonParamDist(self.poissonHyperParams)
                self.regularMultiplicityParameters[
                    tag] = self.poissonParamDist(self.poissonHyperParams)
                #Set a priori beliefs for conditional presence parameters
                #(being contained by anything else)
                self.importantCondPresenceProbs.loc[tag] = [
                    self.binomialParamDist(self.binomHyperParams)
                    for x in self.importantCondPresenceProbs.columns
                ]
                self.regularCondPresenceProbs.loc[tag] = [
                    self.binomialParamDist(self.binomHyperParams)
                    for x in self.regularCondPresenceProbs.columns
                ]
                if i + 1 < len(flat) and type(flat[i + 1]) == unicode:
                    #Set a priori beliefs for conditional presence parameters
                    #(containing other things)
                    self.importantCondPresenceProbs[tag] = [
                        self.binomialParamDist(self.binomHyperParams)
                        for x in self.importantCondPresenceProbs.index
                    ]
                    self.regularCondPresenceProbs[tag] = [
                        self.binomialParamDist(self.binomHyperParams)
                        for x in self.regularCondPresenceProbs.index
                    ]
        ##Get P(x|y = Important)
        PxGy1 = math.log(
            self.importantRootProbabilities[sentence[0]].getMean())
        PxGy1 = PxGy1 / self.importantRootProbabilities[
            sentence[0]].getVar()**varianceExponent
        PxGy1 += self.getConditionalLevelProbability([
            sentence, self.importantCondPresenceProbs,
            self.importantMultiplicityParameters, sentence[0],
            varianceExponent, debug >= 2
        ])
        ##Get P(x|y = Regular)
        PxGy0 = math.log(self.regularRootProbabilities[sentence[0]].getMean())
        PxGy0 = PxGy0 / self.regularRootProbabilities[
            sentence[0]].getVar()**varianceExponent
        PxGy0 += self.getConditionalLevelProbability([
            sentence, self.regularCondPresenceProbs,
            self.regularMultiplicityParameters, sentence[0],
            varianceExponent, debug >= 2
        ])
        #Get priors in log form:
        Py1 = math.log(self.classPriors[1])
        Py0 = math.log(self.classPriors[0])
        #Get log probabilities of each class through Bayes' Rule
        Py1Gx = PxGy1 + Py1
        Py0Gx = PxGy0 + Py0
        #Derive softmax shift parameter for very small probabilities.
        shift = 0
        if min([Py1Gx, Py0Gx]) < -20:
            shift = -1 * min([Py1Gx, Py0Gx]) - 20
        #Softmax probabilities
        try:
            denom = math.log(
                math.e**(shift + Py1Gx) + math.e**(shift + Py0Gx))
            sPy1Gx = shift + Py1Gx - denom
            sPy0Gx = shift + Py0Gx - denom
            #Turn back into probabilities for output
            sPy1Gx = math.e**sPy1Gx
            sPy0Gx = math.e**sPy0Gx
        except OverflowError:
            if debug > -1:
                print 'Overflow error'
                if Py1Gx >= Py0Gx:
                    print 'Assigning important sentence with probability one.'
                else:
                    print 'Assigning regular sentence with probability one.'
                print 'Before softmax, log probabilities were:'
                print 'P(important | sentence) = ' + str(Py1Gx)
                print 'P(unimportant | sentence) = ' + str(Py0Gx)
            if Py1Gx >= Py0Gx:
                sPy1Gx = 1.0
                sPy0Gx = 0.0
            else:
                sPy1Gx = 0.0
                sPy0Gx = 1.0
        if debug > -1:
            print 'Estimating Class for sentence:'
            if plaintext:
                print '\"' + original + '\"'
            else:
                print sentence
        if debug > 0:
            print ' ------------------------------------------------------------------'
            print 'Class Priors (log probability):'
            print 'P(important) = ' + str(Py1)
            print 'P(unimportant) = ' + str(Py0)
            print ' ------------------------------------------------------------------'
            print 'Conditional Sentence Log Probabilities:'
            print 'P(sentence | important) = ' + str(PxGy1)
            print 'P(sentence | unimportant) = ' + str(PxGy0)
            print ' ------------------------------------------------------------------'
            print 'Unnormalized Conditional Class Log Probabilities:'
            print 'P(important | sentence) = ' + str(Py1Gx)
            print 'P(unimportant | sentence) = ' + str(Py0Gx)
        if debug > -1:
            print ' ------------------------------------------------------------------'
            print 'Softmaxed Conditional Class Probabilities:'
            print 'P(important | sentence) = ' + str(sPy1Gx)
            print 'P(unimportant | sentence) = ' + str(sPy0Gx)
        return (sPy1Gx)

    def summarize(self, article, verbosity=0.5, debug=0):
        sentences = self.split_into_sentences(article)
        keepers = []
        i = 0
        for sentence in sentences:
            i += 1
            try:
                if self.classify(sentence, debug=debug) > verbosity:
                    keepers.append(sentence)
            except:
                print 'Error classifying sentence ' + str(i)
                print 'Full text:'
                print sentence
        if len(keepers) == 0:
            print 'No sentences were found important.'
            return ('')
        reduced = reduce(lambda x, y: x + ' ' + y, keepers)
        return (reduced)

    ####Function Definitions
    #Returns the log probability of a level occurring, recursing into the
    #levels contained therein. May be passed an entire sentence.
    def getConditionalLevelProbability(self, inputs):
        level = inputs[0]
        tagDF = inputs[1]
        mult = inputs[2]
        parent = inputs[3]
        varExp = inputs[4]
        debug = inputs[5]
        ret = 0
        if debug == 1:
            print 'Beginning Level...........'
            print level
        inTags = [x[0] for x in level[1:]]
        if u'' in inTags:
            inTags.remove(u'')
        #Do some recursion
        for i, tag in enumerate(inTags):
            if tag in self.phraseTags or tag in self.sentenceTypes:
                if debug == 1:
                    print 'Beginning recursion due to:'
                    print tag
                ret = ret + self.getConditionalLevelProbability(
                    [level[i + 1], tagDF, mult, tag, varExp, debug])
        #Do multiplicity for this level
        for tag in inTags:
            x = inTags.count(tag)
            mu = mult[tag].getMean()
            ret = ret + math.log(
                (math.exp(-mu) * mu**x /
                 math.factorial(x))) / mult[tag].getVar()**varExp
        #Do presence for this level
        inTags = list(set(inTags))
        for tag in inTags:
            #Some sentences contain only a word; nothing to add in that case.
            if type(tag) != unicode:
                print 'Breaking due to non-unicode tag in getConditionalLevelProbability!'
                print tag
                break
            if debug == 1:
                print 'Probability of ' + tag + ' given ' + parent + ' is ' + str(
                    tagDF.loc[tag, parent])
            ret = ret + math.log(tagDF.loc[tag, parent].getMean()) / tagDF.loc[
                tag, parent].getVar()**varExp
        return (ret)

    #To get the inclusions given the parent.
    def getInclusionsGivenParent(self, inputs):
        level = inputs[0]
        tagDF = inputs[1]
        parent = inputs[2]
        debug = inputs[3]
        if debug == 1:
            print 'Beginning Level...........'
            print level
        inTags = [x[0] for x in level[1:]]
        if u'' in inTags:
            inTags.remove(u'')
        #Do some recursion
        for i, tag in enumerate(inTags):
            if tag in self.phraseTags or tag in self.sentenceTypes:
                if debug == 1:
                    print 'Beginning recursion due to:'
                    print tag
                self.getInclusionsGivenParent(
                    [level[i + 1], tagDF, tag, debug])
        #Update tags on this level
        inTags = list(set(inTags))
        for tag in inTags:
            #Some sentences contain only a word; nothing to add in that case.
            if type(tag) != unicode:
                break
            if debug == 1:
                print 'Incrementing: ' + tag + ' when conditioned on ' + parent
            tagDF.loc[tag, parent].update(True)
        #Update tags not on this level
        for tag in self.tags:
            if tag not in inTags:
                tagDF.loc[tag, parent].update(False)

    #To get the inclusions in a level recursively.
    def getInclusions(self, inputs):
        level = inputs[0]
        tagDict = inputs[1]
        debug = inputs[2]
        if debug == 1:
            print 'Beginning Level...........'
            print level
        inTags = [x[0] for x in level[1:]]
        if u'' in inTags:
            inTags.remove(u'')
        #Do some recursion
        for i, tag in enumerate(inTags):
            if tag in self.phraseTags or tag in self.sentenceTypes:
                if debug == 1:
                    print 'Beginning recursion due to:'
                    print tag
                self.getInclusions([level[i + 1], tagDict, debug])
        #Add count for this level
        inTags = list(set(inTags))
        for tag in inTags:
            #Some sentences contain only a word; nothing to add in that case.
            if type(tag) != unicode:
                break
            if debug == 1:
                print 'Incrementing: ' + tag
            tagDict[tag].incrementTrials()

    #To find all PoS tags (pystatparser's documentation is essentially non-existent)
    def getTagsRecursively(self, ss, knownTags=None, debug=0):
        #avoid the mutable-default-argument trap
        ret = knownTags if knownTags is not None else []
        for sentence in ss:
            for phrase in sentence:
                for element in phrase:
                    if type(element) == unicode:
                        if element not in ret:
                            ret.append(element)
                    if type(element) == list:
                        ret.extend(self.getTagsRecursively(element))
        return (ret)

    #Flatten an n-dimensional list into a 1D list
    def recursiveFlatten(self, myList):
        ret = []
        for element in myList:
            if type(element) == list:
                element = self.recursiveFlatten(element)
            if type(element) == str or type(element) == unicode:
                ret.append(element)
            else:
                ret.extend(list(element))
        return (ret)

    #From http://stackoverflow.com/questions/4576077/python-split-text-on-sentences
    def split_into_sentences(self, text):
        if type(text) == unicode:
            text = unicode(text.encode('utf-8'), errors='ignore')
            text = unicodedata.normalize('NFKD', text).encode(
                'ascii', 'ignore')
        caps = "([A-Z])"
        prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
        suffixes = "(Inc|Ltd|Jr|Sr|Co)"
        starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
        acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
        websites = "[.](com|net|org|io|gov)"
        text = " " + text + " "
        text = text.replace("\n", " ")
        text = re.sub(prefixes, "\\1<prd>", text)
        text = re.sub(websites, "<prd>\\1", text)
        if "Ph.D" in text:
            text = text.replace("Ph.D.", "Ph<prd>D<prd>")
        if 'a.m.' in text:
            text = text.replace('a.m.', 'a<prd>m<prd>')
        if 'p.m.' in text:
            text = text.replace('p.m.', 'p<prd>m<prd>')
        if '...' in text:
            text = text.replace('...', '<prd><prd><prd>')
        text = re.sub("\s" + caps + "[.] ", " \\1<prd> ", text)
        text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)
        text = re.sub(caps + "[.]" + caps + "[.]" + caps + "[.]",
                      "\\1<prd>\\2<prd>\\3<prd>", text)
        text = re.sub(caps + "[.]" + caps + "[.]", "\\1<prd>\\2<prd>", text)
        text = re.sub(" " + suffixes + "[.] " + starters, " \\1<stop> \\2",
                      text)
        text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)
        text = re.sub(" " + caps + "[.]", " \\1<prd>", text)
        if "”" in text:
            text = text.replace(".”", "”.")
        if "\"" in text:
            text = text.replace(".\"", "\".")
        if "!" in text:
            text = text.replace("!\"", "\"!")
        if "?" in text:
            text = text.replace("?\"", "\"?")
        text = text.replace(".", ".<stop>")
        text = text.replace("?", "?<stop>")
        text = text.replace("!", "!<stop>")
        text = text.replace("<prd>", ".")
        sentences = text.split("<stop>")
        sentences = sentences[:-1]
        sentences = [s.strip() for s in sentences]
        return sentences

    #Write the parameters we have to a file.
    #Passing the target "default" to this function will overwrite the
    #parameters fit by the author.
    def storeParameters(self, target):
        try:
            str(target)
        except:
            print "storeParameters needs to be passed a string"
            return
        f = open(target, 'w')
        f.write(
            reduce(lambda x, y: str(x) + '/-_-/' + str(y), self.classPriors)
            + '\n')
        f.write(
            reduce(lambda x, y: str(x) + '/-_-/' + str(y),
                   self.importantRootProbabilities.keys()) + '\n')
        f.write(
            reduce(
                lambda x, y: str(x) + '/-_-/' + str(y),
                [x.store() for x in self.importantRootProbabilities.values()])
            + '\n')
        f.write(
            reduce(lambda x, y: str(x) + '/-_-/' + str(y),
                   self.regularRootProbabilities.keys()) + '\n')
        f.write(
            reduce(
                lambda x, y: str(x) + '/-_-/' + str(y),
                [x.store() for x in self.regularRootProbabilities.values()])
            + '\n')
        f.write(
            reduce(lambda x, y: str(x) + '/-_-/' + str(y),
                   self.importantMultiplicityParameters.keys()) + '\n')
        f.write(
            reduce(lambda x, y: str(x) + '/-_-/' + str(y), [
                x.store()
                for x in self.importantMultiplicityParameters.values()
            ]) + '\n')
        f.write(
            reduce(lambda x, y: str(x) + '/-_-/' + str(y),
                   self.regularMultiplicityParameters.keys()) + '\n')
        f.write(
            reduce(lambda x, y: str(x) + '/-_-/' + str(y), [
                x.store()
                for x in self.regularMultiplicityParameters.values()
            ]) + '\n')
        f.write(
            str(self.binomHyperParams[0]) + '/-_-/' +
            str(self.binomHyperParams[1]) + '\n')
        f.write(
            str(self.poissonHyperParams[0]) + '/-_-/' +
            str(self.poissonHyperParams[1]) + '\n')
        f.write(
            reduce(lambda x, y: str(x) + '/-_-/' + str(y), self.tags) + '\n')
        f.write(
            reduce(lambda x, y: str(x) + '/-_-/' + str(y), self.sentenceTypes)
            + '\n')
        f.write(
            reduce(lambda x, y: str(x) + '/-_-/' + str(y), self.phraseTags)
            + '\n')
        #Element-wise store of the conditional presence parameters
        ICP = []
        for i in self.importantCondPresenceProbs.index:
            for j in self.importantCondPresenceProbs.columns:
                ICP.append(self.importantCondPresenceProbs.loc[i, j].store())
        RCP = []
        for i in self.regularCondPresenceProbs.index:
            for j in self.regularCondPresenceProbs.columns:
                RCP.append(self.regularCondPresenceProbs.loc[i, j].store())
        f.write(reduce(lambda x, y: str(x) + '/-_-/' + str(y), ICP) + '\n')
        f.write(reduce(lambda x, y: str(x) + '/-_-/' + str(y), RCP) + '\n')
        f.close()

    #Load parameters from file. Simply provide it with the name you provided
    #to storeParameters. The target "default" will load the parameters fit
    #by the author.
    def loadParameters(self, target):
        try:
            str(target)
        except:
            print "loadParameters needs to be passed a string"
            return
        f = open(target, 'r')
        groups = [x.split('/-_-/') for x in f.read().split('\n')]
        self.classPriors = [float(x) for x in groups[0]]
        self.importantRootProbabilities = dict(
            zip([unicode(x) for x in groups[1]],
                [binomialParamDist().load(x) for x in groups[2]]))
        self.regularRootProbabilities = dict(
            zip([unicode(x) for x in groups[3]],
                [binomialParamDist().load(x) for x in groups[4]]))
        self.importantMultiplicityParameters = dict(
            zip([unicode(x) for x in groups[5]],
                [poissonParamDist().load(x) for x in groups[6]]))
        self.regularMultiplicityParameters = dict(
            zip([unicode(x) for x in groups[7]],
                [poissonParamDist().load(x) for x in groups[8]]))
        self.binomHyperParams = [float(x) for x in groups[9]]
        self.poissonHyperParams = [float(x) for x in groups[10]]
        self.tags = [unicode(x) for x in groups[11]]
        self.sentenceTypes = [unicode(x) for x in groups[12]]
        self.phraseTags = [unicode(x) for x in groups[13]]
        #Unpack dataframes
        self.importantCondPresenceProbs = np.zeros(
            [len(self.tags), len(self.phraseTags) + len(self.sentenceTypes)])
        self.importantCondPresenceProbs = pd.DataFrame(
            self.importantCondPresenceProbs)
        self.importantCondPresenceProbs.columns = list(
            self.sentenceTypes) + list(self.phraseTags)
        self.importantCondPresenceProbs.index = list(self.tags)
        self.regularCondPresenceProbs = np.zeros(
            [len(self.tags), len(self.phraseTags) + len(self.sentenceTypes)])
        self.regularCondPresenceProbs = pd.DataFrame(
            self.regularCondPresenceProbs)
        self.regularCondPresenceProbs.columns = list(
            self.sentenceTypes) + list(self.phraseTags)
        self.regularCondPresenceProbs.index = list(self.tags)
        for i, row in enumerate(self.importantCondPresenceProbs.index):
            for j, column in enumerate(
                    self.importantCondPresenceProbs.columns):
                self.importantCondPresenceProbs.loc[
                    row, column] = binomialParamDist().load(groups[14][
                        i * len(self.importantCondPresenceProbs.columns) + j])
        for i, row in enumerate(self.regularCondPresenceProbs.index):
            for j, column in enumerate(self.regularCondPresenceProbs.columns):
                self.regularCondPresenceProbs.loc[
                    row, column] = binomialParamDist().load(
                        groups[15]
                        [i * len(self.regularCondPresenceProbs.columns) + j])
        f.close()

    #Thin instance-method wrappers around the module-level prior classes, so
    #that classify() can call self.binomialParamDist(...) and
    #self.poissonParamDist(...).
    def binomialParamDist(self, params):
        return binomialParamDist(params)

    def poissonParamDist(self, params):
        return poissonParamDist(params)
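#A minimal usage sketch for SySE above. The sentences and labels are made up
#for illustration; it assumes pystatparser is installed and that the
#binomialParamDist and poissonParamDist prior classes are defined earlier in
#this module.
if __name__ == '__main__':
    syse = SySE()
    sentences = [
        'The reactor core temperature exceeded the safety threshold.',
        'He had toast for breakfast.',
        'The committee approved the emergency budget.',
        'It was a bit cloudy that afternoon.',
    ]
    labels = [1, 0, 1, 0]  #1 = important, 0 = regular
    syse.train(sentences, labels)
    #classify returns the softmaxed P(important | sentence)
    print syse.classify('The dam failed shortly after midnight.')
    #summarize keeps sentences whose importance exceeds the verbosity cutoff
    print syse.summarize('The dam failed shortly after midnight. '
                         'Breakfast was served at eight.', verbosity=0.5)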
import nltk
from stat_parser import Parser
import sys
"""Helper script to add POS tags (headline and body) and syntactic parse
(headline only) to a file of training data.

First argument is the input file, second argument is the output file."""
reload(sys)
sys.setdefaultencoding('utf8')
parse_dict = {}
parser = Parser()
input_file = sys.argv[1]
file_with_feats = sys.argv[2]
with open(input_file, 'rU') as f:
    with open(file_with_feats, 'w') as w:
        i = 1
        for line in f:
            print(i)
            line = line.strip()
            art_id = str(i)
            try:
                art_title, art_text, source, source_type = line.split('\t')
                try:
                    title_tokens = nltk.word_tokenize(art_title)
                    title_pos_tags = [
                        x[1] for x in nltk.pos_tag(title_tokens)
                    ]
                except:
                    title_pos_tags = ['n/a']
                try:
                    text_tokens = nltk.word_tokenize(art_text)
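#The input format implied by line.split('\t') above: one article per line
#with four tab-separated fields. A made-up record for illustration (all
#field values and the file name are hypothetical):
sample = '\t'.join([
    'Dam fails shortly after midnight',        #art_title
    'The dam failed shortly after midnight.',  #art_text
    'example.com',                             #source
    'news',                                    #source_type
])
with open('sample_input.tsv', 'w') as sample_file:
    sample_file.write(sample + '\n')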
from stat_parser import Parser, display_tree

parser = Parser()
#parse() returns a single most-probable tree (cf. the other snippets in this
#file), even for a structurally ambiguous sentence like this PP-attachment
#classic, so there is only one tree to display.
tree = parser.parse("John saw Mary with the telescope")
display_tree(tree)
from stat_parser import Parser, display_tree

parser = Parser()
tree = parser.parse(
    "Multiple SQL injection vulnerabilities in myBloggie 2.1.6 and earlier "
    "allow remote attackers to execute arbitrary SQL commands via the (1) "
    "cat_id or (2) year parameter to index.php in a viewuser action, "
    "different vectors than CVE-2005-1500 and CVE-2005-4225. ")
display_tree(tree)
from stat_parser import Parser, display_tree

parser = Parser()
# http://www.thrivenotes.com/the-last-question/
tree = parser.parse(
    "How can the net amount of entropy of the universe be massively decreased?"
)
display_tree(tree)
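#The object returned by parse() behaves like an nltk Tree, so the usual
#traversal API (label, leaves, subtrees) applies. A small sketch; the
#sentence is arbitrary:
from stat_parser import Parser

parser = Parser()
tree = parser.parse("they had good pretzels")
print tree.label()   #root tag of the parse
print tree.leaves()  #the tokens, in order
#walk every constituent, printing its tag and the words it covers
for subtree in tree.subtrees():
    print subtree.label(), ' '.join(subtree.leaves())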
import nltk
import json
import yaml
import pdb
from stat_parser import Parser, display_tree

f = open("ships.yml")
f2 = open("ships2.yml")
all_tokens = []
ships = yaml.load(f.read())
ships2 = yaml.load(f2.read())
d = dict()
parser = Parser()


def is_leaf(tree):
    if len(tree.leaves()) == 1:
        return True
    else:
        return False


def top_level_leaves(tree):
    leaves = list(tree)
    is_top_level = True
    for leaf in leaves:
        if not is_leaf(leaf):
            is_top_level = False
    return is_top_level


def check_for_roll(tree):
    leaves = list(tree)
    key = ""
import nltk
import json
import yaml
from random import choice
from stat_parser import Parser, display_tree

parser = Parser()
d = json.load(open('tree_ship_words.json'))
f = open("ships.yml")
ships = yaml.load(f.read())
name = choice(ships)
print name.lower()
#tree = parser.parse("Anticipation of a new lover's arrival")
tree = parser.parse("they had good pretzels")
display_tree(tree)
#! /usr/bin/env python2
import nltk
from stat_parser import Parser
import os
import pickle, string
import hw1_util

spunctuation = set(string.punctuation)
poem_names = []
poem_names.extend(['gb_poems/' + s for s in os.listdir("gb_poems")])
poem_names.extend(['lh_poems/' + s for s in os.listdir("lh_poems")])
#poem_names = ['gb_poems/Sadie_and_Maud.txt']
parser = Parser()
vocab = dict()
parsed_lines = dict()
rhymes = dict()
for poem_name in poem_names:
    with open(poem_name, 'rb') as f:
        parsed_lines[poem_name] = list()
        rhyme_count = ord('A')
        rhymes[poem_name] = list()
        print poem_name
        for line in f.read().split('\n'):
            line = line.decode('utf-8')
            line = line.replace("'", "")
            line = line.replace('"', '')
            last_word = ''
            try:
                if line == "" or line.isspace():
                    raise TypeError
#!/usr/bin/env python
# vim: set noexpandtab tabstop=2 shiftwidth=2 softtabstop=-1 fileencoding=utf-8:
import sys
from stat_parser import Parser

parser = Parser()
print 'Init complete'
print parser.parse(sys.argv[1])
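#Usage from a shell (the file name parse.py is an assumption); after the
#'Init complete' line, the script prints the bracketed parse of the argument,
#shown elided here:
#  $ python2 parse.py "the cat sat on the mat"
#  Init complete
#  (S (NP ...) (VP ...))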
from nltk.corpus import brown
import sys
from graph import Graph, merge_graphs
from nltk.tree import ParentedTree
from stat_parser import Parser

parser = Parser()
user_sentence = sys.argv[1]
query = sys.argv[2]
trees = []
done = 0
# for sentence in brown.sents():
#     if done >= 20:
#         break
#     if not query in sentence:
#         continue
#     if len(sentence) > 20:
#         continue
#     try:
#         trees.append(parser.parse(" ".join(sentence)))
#         done += 1
#         print done
#     except:
#         print "oops couldn't parse that one"
# trees = []
# trees.append(parser.parse("The food was on the table where the child likes to eat"))
# trees.append(parser.parse("I eat food at the table"))
# trees.append(parser.parse("I eat the food that is on the table"))
# trees.append(parser.parse("The money is on the table"))
# trees.append(parser.parse("Put the data in the table"))
# trees.append(parser.parse("Add more rows to the database table"))
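#A sketch of the tree side of the pipeline above (the Graph API from graph.py
#is not shown in this file, so this stops at the parse): convert a
#stat_parser tree to an nltk ParentedTree and pull out the constituent
#surrounding a query word, the kind of context the commented-out code
#appears to collect.
from nltk.tree import ParentedTree
from stat_parser import Parser

parser = Parser()
ptree = ParentedTree.convert(parser.parse("I eat food at the table"))
query = "table"
for leaf_pos in ptree.treepositions('leaves'):
    if ptree[leaf_pos] == query:
        phrase = ptree[leaf_pos[:-2]]  #constituent above the token's POS node
        print phrase.label(), ' '.join(phrase.leaves())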