Example #1
def quicktree(sentence):
    """Parse a sentence and return a visual representation in IPython"""
    import os
    from nltk import Tree
    from nltk.draw.util import CanvasFrame
    from nltk.draw import TreeWidget
    from stat_parser import Parser
    try:
        from IPython.display import display
        from IPython.display import Image
    except ImportError:
        pass
    # Probe for IPython: calling getoutput() with no arguments raises TypeError
    # inside IPython and NameError outside it.
    try:
        get_ipython().getoutput()
    except TypeError:
        have_ipython = True
    except NameError:
        import subprocess
        have_ipython = False
    parser = Parser()
    parsed = parser.parse(sentence)
    cf = CanvasFrame()
    tc = TreeWidget(cf.canvas(),parsed)
    cf.add_widget(tc,10,10) # (10,10) offsets
    cf.print_to_file('tree.ps')
    cf.destroy()
    if have_ipython:
        tregex_command = 'convert tree.ps tree.png'
        result = get_ipython().getoutput(tregex_command)
    else:
        tregex_command = ["convert", "tree.ps", "tree.png"]
        result = subprocess.check_output(tregex_command)    
    os.remove("tree.ps")
    return Image(filename='tree.png')
    os.remove("tree.png")
Example #3
def parse_sentence(my_sentence):
    '''
    Generate nonterminal rules using a stochastic sentence parser
    
    Parameters
    ----------
    my_sentence : str
        A single sentence (str) 
    
    '''

    parser = Parser()
    parsee = parser.parse(my_sentence)

    rules = ""

    # possibly add: brackets, double quotes

    for production in parsee.productions():
        if not is_terminal(production.rhs()[0]):
            rules += str(production) + '\n'

    # now re-tag special characters
    # (to_replace and replacements are module-level lists defined elsewhere)
    swappairs = zip(to_replace, replacements)
    for member in swappairs:
        rules = rules.replace(member[0], member[1])

    return rules
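A brief usage sketch, assuming stat_parser is importable and that the module-level names the function relies on (is_terminal, to_replace, replacements) are defined as above:

# Hypothetical call: prints one nonterminal production per line, e.g. "S -> NP VP"
rules = parse_sentence("The cat sat on the mat.")
print(rules)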
Example #4
def best_candidate(Sentence, Question):
    #Sentence = 'Notre Dame\'s most recent when?'

    #Sentence = Sentence.replace('[',' ')
    #Sentence = Sentence.replace(']',' ')
    #        print "Sentence: " +  Sentence
    #print Question
    key = get_md5(Sentence)
    if key in parse_cache:
        print "hit"
        tree = parse_cache[key]
    else:
        try:
            parser = Parser()
            tree = parser.parse(Sentence)
            parse_cache[key] = tree
        except:  # the parser failed on this sentence; fall back to a blank answer
            return " "
    list1 = []
    list2 = []
    traverseTree(tree, list1, list2, Question.split())
    min_overlap = min(list2)
    num = [[i, len(list1[i])] for i in range(len(list1))
           if list2[i] == min_overlap]
    s = sorted(num, key=lambda x: -x[1])
    return " ".join(list1[s[0][0]])
Example #5
def question_noun_phrase(query):
    if (len(query.split()) <= 1):
        return query

    query = oclock_remover(query)
    query = benedict_remover(query)
    parser = Parser()
    tree = parser.parse(query)

    if tree.label() in ('NP', 'NP+NP', 'NX+NX', 'NX+NP',
                        'NP+NX', 'FRAG', 'NX'):
        words = query
        noun_phrase = []
        # this is code for finding the noun phrase
        noun_phrase = tree.leaves()

        # this code removes the article from the beginning
        if noun_phrase and noun_phrase[0] in ('a', 'an', 'the'):
            del noun_phrase[0]

        #print noun_phrase
        noun_phrase = ' '.join(noun_phrase)

        return noun_phrase

    for element in [tree] + [e for e in tree]:  # include the root element in the loop
        if "SBAR" in element.label():
            for subtree in element.subtrees():
                if "W" in subtree.label():
                    noun_phrase = []
                    #print noun_phrase
                    # this is code for finding the noun phrase
                    for noun_subtree in element.subtrees():
                        if not "SBAR" in noun_subtree.label() \
                        and not "W" in noun_subtree.label() \
                        and "NP" in noun_subtree.label() \
                        and len(noun_subtree.leaves()) > len(noun_phrase):

                            noun_phrase = noun_subtree.leaves()

                    # this code removes the article from the beginning
                    if noun_phrase and noun_phrase[0] in ('a', 'an', 'the'):
                        del noun_phrase[0]

                    noun_phrase = ' '.join(noun_phrase)

                    return noun_phrase

    return ""
def ret_tree(sentence,rep,model):
    parser = Parser()

    tree_list = []
    # sentence = "How are you"


    tree = parser.parse(sentence)
    #tree.draw()
    all_nodes = []


    def compute_tree_list(t, root_ptr1,rep,model):
        # if len(t.leaves()) == 2:
        # 	# tree_list.append(t.)
        # 	l = t.leaves()
        # 	# print l
        #print root_ptr1
        # 	# print l[0]
        # 	# print l[1]
        # 	return Node(l[0], l[1], True)
        if len(t.leaves()) == 1:
            # tree_list.append(t.leaves())
            l = t.leaves()
            # print l[0]
            return Node(l[0])
        else:
            subts = list(t)
            left_id = root_ptr1+1
            right_id = root_ptr1*2

#print "left id = %f" % left_id
 #           print "right id = %f" % right_id
            # print len(subts)
            left_tree = compute_tree_list(subts[0], left_id,rep,model)
            right_tree = compute_tree_list(subts[1], right_id,rep,model)
            if isinstance(left_tree, Node):
                left_id = left_tree.left
                rep[left_id]=np.transpose(model[left_id]).reshape([300,1])
   #             print rep[left_id].shape

            if isinstance(right_tree, Node):
                right_id = right_tree.left
                w = model.most_similar(positive=right_id, topn=1)
                rep[right_id]=np.transpose(model[right_id]).reshape([300,1])
  #              print rep[right_id].shape
            # print "root ptr.... = %f" % root_ptr
            tree_list.append({"ip1": left_id, "ip2": right_id, "op": root_ptr1})

            # return Node(left_tree, right_tree)

    compute_tree_list(tree, 10000,rep,model)
    print "Tree List",tree_list
    return tree_list,rep
Example #8
def chat_with_robo():
    parser = Parser()

    flag = True
    print("The instructions for talk with me: \n",
          "If you want finish the conversation, please type thanks or bye.\n")
    print("ROBO: Hi, my name is Robo.")
    while flag == True:
        message = input()
        message = message.lower()

        if message != 'bye':
            # Analyzing the input
            print('\nvocabulary: ', nltk.word_tokenize(message))
            print('\nword frequency: ',
                  nltk.FreqDist(nltk.word_tokenize(message)).most_common(10))

            # -----------
            # add part-of-speech tags to text
            # -----------
            # Tagging message with basic nltk tokenize
            print(nltk.pos_tag(nltk.word_tokenize(message)))
            # It has trouble identifying the pronoun 'I'; it tags it as a noun

            # Tagging message

            # trace = 1: then the parser will report the steps that it takes as it parses a text.
            # rd_parser = nltk.RecursiveDescentParser(, trace = 1)

            # Review grammar
            # rd_parser = nltk.RecursiveDescentParser(nltk.ChartParser)
            rd_parser = parser.parse(message)

            i = 1
            wrong_syntax = 1  # stays 1 if the parser produced no tree

            for tree_struc in rd_parser:
                print(str(i) + ' tree_struc: ', tree_struc)

                s = tree_struc
                wrong_syntax = 0
                print("\n Correct Grammar")
                i += 1
            if wrong_syntax == 1:
                print("\n Wrong Grammar")

                # write_output_file(...

        else:
            flag = False
            print("ROBO: Bye! take care..")
Example #9
def quicktree(sentence):
    """Parse a sentence and return a visual representation"""
    from nltk import Tree
    from nltk.draw.util import CanvasFrame
    from nltk.draw import TreeWidget
    from stat_parser import Parser
    from IPython.display import display
    from IPython.display import Image
    parser = Parser()
    parsed = parser.parse(sentence)
    cf = CanvasFrame()
    tc = TreeWidget(cf.canvas(), parsed)
    cf.add_widget(tc, 10, 10)  # (10,10) offsets
    cf.print_to_file('tree.ps')
    cf.destroy()
Example #10
def main():
    text = "Smoking Mothers May Alter the DNA of Their Children."
    parser = Parser()
    tree = parser.parse(text)
    print "Parse Tree:\n" + str(tree) + "\n"
    phrasesTree = extractTaggedPhrases(tree, 'NP')
    print "Extracted Phrases:\n" + str(phrasesTree) + "\n"
    phrases = []
    for phrase in phrasesTree:
        phrases.append(" ".join(phrase.leaves()))

    imagesDict = buildImagesDict(phrases)
    for phrase, images in imagesDict.iteritems():
        print phrase + ":"
        print "\n".join([image for image in images])
        print
def delegate(task_queue, completed_queue):
    graph = Graph()
    parser = Parser()

    while True:
        try:
            sentence = task_queue.get(False)
        except:  # Queue.Empty: no more sentences to parse
            completed_queue.put(graph)
            print "My work here is done"
            return True
        print "Parsing sentence"
        parsed = parser.parse(sentence)
        print "Adding sentence to graph"
        # graph.update(parsed)
        print "Added"
Example #12
def generate(filename, word_limit=None):
    global syntaxes
    parser = Parser()
    if not os.path.exists(SYNTAXES_FILE):
        #  sents = nltk.corpus.gutenberg.sents('results.txt')
        # NOTE: results.txt is a big file of raw text not included in source control, provide your own corpus.
        with codecs.open(filename, encoding='utf-8') as corpus:
            sents = nltk.sent_tokenize(corpus.read())
            if word_limit:
                sents = [sent for sent in sents if len(sent) < word_limit]
            sent_limit = min(1500, len(sents))
            sents = sents[0:sent_limit]
            for sent in tqdm(sents):
                try:
                    parsed = parser.parse(sent)
                except TypeError:
                    continue  # skip sentences the parser cannot handle
                syntax_signature(parsed, save=True)
        with open(SYNTAXES_FILE, 'wb+') as pickle_file:
            pickle.dump(syntaxes, pickle_file)
    else:
        with open(SYNTAXES_FILE, 'rb+') as pickle_file:
            syntaxes = pickle.load(pickle_file)

    if not os.path.exists(CFDS_FILE):
        with codecs.open(filename, encoding='utf-8') as corpus:
            cfds = [make_cfd(corpus.read(), i, exclude_punctuation=False, case_insensitive=True) for i in range(2, 5)]
            with open(CFDS_FILE, 'wb+') as pickle_file:
                pickle.dump(cfds, pickle_file)
    else:
        with open(CFDS_FILE, 'rb+') as pickle_file:
            cfds = pickle.load(pickle_file)

    sents = nltk.corpus.gutenberg.sents('austen-emma.txt')
    if word_limit:
        sents = [sent for sent in sents if len(sent) < word_limit]
    sent = random.choice(sents)
    parsed = parser.parse(' '.join(sent))
    print(parsed)
    print(' '.join(parsed.leaves()))
    replaced_tree = tree_replace(parsed, cfds, [])
    print('=' * 30)
    print(' '.join(replaced_tree.leaves()))
    print(replaced_tree)
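A usage sketch, assuming the module-level SYNTAXES_FILE, CFDS_FILE, syntax_signature, make_cfd and tree_replace used above are defined; the corpus path is a placeholder:

# Hypothetical call: builds (or loads) the cached syntax signatures and CFDs,
# then prints an Austen sentence alongside its word-swapped variant
generate("my_corpus.txt", word_limit=40)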
Example #13
def generate():
    global syntaxes
    parser = Parser()
    if not os.path.exists(SYNTAXES_FILE):
        #  sents = nltk.corpus.gutenberg.sents('results.txt')
        # NOTE: results.txt is a big file of raw text not included in source control, provide your own corpus.
        with codecs.open('results.txt', encoding='utf-8') as corpus:
            sents = nltk.sent_tokenize(corpus.read())
            sents = [sent for sent in sents if len(sent) < 150][0:1500]
            for sent in tqdm(sents):
                try:
                    parsed = parser.parse(sent)
                except TypeError:
                    continue  # skip sentences the parser cannot handle
                syntax_signature(parsed, save=True)
        with open(SYNTAXES_FILE, 'wb+') as pickle_file:
            pickle.dump(syntaxes, pickle_file)
    else:
        with open(SYNTAXES_FILE, 'rb+') as pickle_file:
            syntaxes = pickle.load(pickle_file)

    if not os.path.exists(CFDS_FILE):
        #  corpus = nltk.corpus.gutenberg.raw('results.txt')
        with codecs.open('results.txt', encoding='utf-8') as corpus:
            cfds = [make_cfd(corpus.read(), i, exclude_punctuation=False, case_insensitive=True) for i in range(2, 5)]
            with open(CFDS_FILE, 'wb+') as pickle_file:
                pickle.dump(cfds, pickle_file)
    else:
        with open(CFDS_FILE, 'rb+') as pickle_file:
            cfds = pickle.load(pickle_file)

    sents = nltk.corpus.gutenberg.sents('austen-emma.txt')
    sents = [sent for sent in sents if len(sent) < 50]
    sent = random.choice(sents)
    parsed = parser.parse(' '.join(sent))
    print(parsed)
    print(' '.join(parsed.leaves()))
    replaced_tree = tree_replace(parsed, cfds, [])
    print('=' * 30)
    print(' '.join(replaced_tree.leaves()))
    print(replaced_tree)
Example #14
	def sentsSelector(self,sents):
		texts = []
		new_indices = []
		index = 0
		parser = Parser()
		for sent in sents:
			# Extract triplets & store in neo4j database
			tripletExtractor(parser.parse(sent))
			# Process sent
			tokens = sent.split()
			if len(self.removeSingleOccurWords(self.removeStopWords(tokens)))>0:
				texts.append(self.removeStopWords(tokens))
				new_indices.append(index)
				#print(self.removeSingleOccurWords(self.removeStopWords(tokens)))
			index += 1

		self.makeDictAndCorpus(texts)
		new_sents = []
		for index in self.DocumentSIMQuery():
			new_sents.append(sents[index])
		return new_sents
Example #15
 def __init__(self):
     self.filename='dictionary.txt'
     self.dict = {}
     bgm  = nltk.collocations.BigramAssocMeasures()
     finder = nltk.collocations.BigramCollocationFinder.from_words(nltk.corpus.brown.words())
     scores = finder.score_ngrams(bgm.likelihood_ratio)
     self.scored = {}
     for key, score in scores:
         self.scored[key] = score
     self.specialWords = [u'了', u'的']
     self.directions = ['east', 'west', 'south', 'north',
                        'northeast', 'southeast', 'northwest', 'southwest']
     self.parser = Parser()
Example #16
def shortestPath(sentence, word1, word2):

    parser = Parser()
    tree = parser.parse(sentence)

    print(tree)
    #print(type(tree))

    path1 = findword(tree, word1.lower())
    path2 = findword(tree, word2.lower())
    #print(path1)
    #print(path2)

    # compare both paths
    #   -> find first different element
    j = 0
    for i in range(1, min(len(path1), len(path2))):
        if path1[i] != path2[i]:
            j = i - 1
            break

    # now join both lists from the jth element
    # we need to take into account the "order" of appearance in the tree
    # (left or right): the second path is reversed before joining, e.g.
    #  S VP NP Mary
    #  S VP NP Bob <-> Bob NP VP S
    #  the reversed list always goes first
    sublist1 = path1[j:]
    #print("sublist1",sublist1)
    if j < len(path2) - 1:
        j = j + 1
    sublist2 = path2[j:]
    #print("sublist2",sublist2)
    sublist2.reverse()
    #print("sublist2",sublist2)
    shortestpath = sublist2 + sublist1

    return shortestpath
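A usage sketch, assuming findword(tree, word) returns the list of node labels from the root down to the given word:

# Hypothetical call: the constituent-label path connecting the two words
path = shortestPath("Mary saw Bob with a telescope", "mary", "bob")
print(path)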
Example #18
def pipeline(records):
    wordnet_lemmatizer = WordNetLemmatizer() 
    parser = Parser()
    for record in records:
        sentences = nltk.sent_tokenize(record)
        for sentence in sentences:
            words = nltk.word_tokenize(sentence)
            print(sentence)
            print(nltk.pos_tag(words))
            print parser.parse(sentence)
            for word in words:
                print("Word: ", word)
                print("Lemma: ", wordnet_lemmatizer.lemmatize(word))
                for synset in wordnet.synsets(word):
                    print (synset, synset.hypernyms())
                    print (synset, synset.hyponyms())
                    print (synset, synset.part_meronyms())
                    print (synset, synset.substance_meronyms())
                    print (synset, synset.part_holonyms())
                    print (synset, synset.substance_holonyms())

          
    print("*********")
Example #19
def string_to_query(string):
    print("Helloooo")
    parser = Parser()
    parsed = parser.parse(string)
    question = []
    question.extend(
        [word for word, pos in parsed.pos() if pos == 'WP' or pos == 'WRB'])
    action = []
    action.extend([word for word, pos in parsed.pos() if 'VB' in pos])
    affected_entity = []
    affected_entity.extend(
        [word for word, pos in parsed.pos() if 'NN' in pos or pos == "VBG"])
    particle = list(parsed.subtrees(filter=lambda x: x.label() == 'PRT'))
    tree = parsed
    verb_phrases_list = list(
        parsed.subtrees(filter=lambda x: "V" in x.label()))[0]
    entity = None
    try:
        entity = [x for x in parsed.pos() if x[0] == affected_entity[0]][0]
    except:
        entity = None
    print(question)
    print(action)
    print(affected_entity)
    print(parsed.pos())
    try:
        print(parsed.pos()[parsed.pos().index(entity) + 1:])
    except ValueError:  # entity is None or not present in the tagged tokens
        print("entity is None")
    output = {
        "question": question,
        "action": action,
        "affected_entity": affected_entity,
        "rest": None,  #parsed.pos()[parsed.pos().index(entity)+1:],
        "sentence": string
    }
    return output
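A usage sketch of the query builder above; the question is illustrative:

# Hypothetical call: pulls WH-words, verbs and noun-like entities out of the question
query = string_to_query("When does the library open on Sunday")
print(query["question"], query["action"], query["affected_entity"])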
Example #20
def best_candidate_token(Sentence, Question, token):
    #parser = Parser()
    #tree = parser.parse(Sentence)

    print "Sentence: " + Sentence
    key = get_md5(Sentence)
    if key in parse_cache:
        print "hit"
        tree = parse_cache[key]
    else:
        try:
            parser = Parser()
            tree = parser.parse(Sentence)
            parse_cache[key] = tree
        except:  # the parser failed on this sentence; fall back to a blank answer
            return " "
    list1 = []
    list2 = []
    traverseTree_token(tree, list1, list2, Question.split(), token)
    min_overlap = min(list2)
    num = [[i, len(list1[i])] for i in range(len(list1))
           if list2[i] == min_overlap]
    s = sorted(num, key=lambda x: -x[1])
    return " ".join(list1[s[0][0]])
class Solution(object):
    def __init__(self):
        print("Initializing Data Loader...")
        # Load Parser and process it
        self.parser = Parser()
        self.count = 0

    def is_question(self, sentence):
        """
        Class function for the boolean tagging
        :param sentence:
        :return:
        """
        self.count += 1  # To check which line gives error
        print(self.count)
        result = str(self.parser.parse(sentence)).split()
        if '(SBARQ' in result[0]:
            return "QUESTION_CODE"
        else:
            return "n/a"

    def question_classify(self):
        """
        This function classifies each sentence in the input file and outputs a tsv into a result.txt file
        :return: result.txt file
        """
        with open(DATA_FILE, 'r') as doc:
            with open("result.txt", 'w') as target:
                dat = doc.read()
                lines = dat.splitlines()
                for line in lines[1:]:
                    query, freq = line.strip().split("\t")
                    try:
                        # Since some sentences may not have any valid parse
                        is_q = self.is_question(query)
                        bin_val = [0, 1][is_q == "QUESTION_CODE"]
                    except TypeError:
                        # it is highly likely that it won't be a question
                        is_q = "n/a"
                        bin_val = 0

                    target.write("%s\t%s\t%s\n" % (query, bin_val, is_q))
        # both files are closed automatically by the with blocks
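A usage sketch, assuming DATA_FILE points at a tab-separated file with a header row and query<TAB>frequency lines:

# Hypothetical run: writes query, 0/1 flag and label as tab-separated rows to result.txt
sol = Solution()
sol.question_classify()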
Example #22
 def __init__(self, text):
     parser = Parser()
     self.math_ops = ('add', 'multiply', 'divide', 'subtract', 'power')
     self.ops = ('summarize', 'translate')
     custom_sent_tokenizer = PunktSentenceTokenizer(text)
     tokenized = custom_sent_tokenizer.tokenize(text)
     words = nltk.word_tokenize(tokenized[0])
     self.parsed_text = nltk.pos_tag(words)
     self.verb = []
     self.verb_ranges = []
     self.language = ''
     self.keywords = [
         'odd', 'even', 'prime', 'composite', 'squares', 'square', 'cubes',
         'cube'
     ]
     self.special = sum(
         [word[0] in self.keywords for word in self.parsed_text]) > 0
     self.special_keywords = set()
     for word in self.parsed_text:
         if word[0] in self.keywords:
             self.special_keywords.add(word[0])
     self.special_keywords = list(self.special_keywords)
     self.d = d = {
         "odd":
         lambda x: x % 2 == 1,
         "even":
         lambda x: x % 2 == 0,
         "square":
         lambda x: math.sqrt(x).is_integer(),
         "cube":
         lambda x: (x**(1. / 3.)).is_integer(),
         "squares":
         lambda x: math.sqrt(x).is_integer(),
         "cubes":
         lambda x: (x**(1. / 3.)).is_integer(),
         "prime":
         lambda x: x > 1 and all(
             x % i for i in islice(count(2), int(sqrt(x) - 1))),
         "composite":
         lambda x: x > 1 and not all(
             x % i for i in islice(count(2), int(sqrt(x) - 1)))
     }
Example #23
def batch_parse_multiprocessing(sentences_list, num_processes=1):

    sentences_queue = JoinableQueue()
    parsed_queue = JoinableQueue()
    parser = Parser()

    for _ in range(num_processes):
        proc = Process(target=batch_parse,
                       args=(parser, sentences_queue, parsed_queue))
        proc.daemon = True
        proc.start()

    sr = SaveResults(parsed_queue)

    for sent in sentences_list:
        sentences_queue.put(sent)

    sentences_queue.join()
    parsed_queue.join()

    return sr.get_ordered_results()
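A usage sketch, assuming the batch_parse worker and SaveResults consumer referenced above are defined in the same module:

# Hypothetical call: parse a small batch of sentences with two worker processes
sents = ["The cat sat on the mat.", "Dogs bark loudly."]
trees = batch_parse_multiprocessing(sents, num_processes=2)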
Example #24
    def __init__(self,
                 pronouncer=Pronouncer(),
                 phoneme_to_digit_dict=None,
                 max_vocab_size=10000,
                 parser=Parser(),
                 evaluator=NgramEvaluator(2)):
        '''
        Initializes the ParserEncoder.
        '''
        super(ParserEncoder,
              self).__init__(pronouncer=pronouncer,
                             phoneme_to_digit_dict=phoneme_to_digit_dict)
        # set up our size-limited vocab
        if max_vocab_size is not None:
            vocabulary = self._get_vocab(max_vocab_size)
            self.phonemes_to_words_dict = self._get_phonemes_to_words_dict(
                vocabulary)
        else:
            self.phonemes_to_words_dict = self._get_phonemes_to_words_dict()

        self.parser = parser
        self.evaluator = evaluator
Example #25
import nltk
from parsedatetime import Calendar
from nltk.tag import pos_tag, map_tag
from stat_parser import Parser, display_tree
from time import mktime
from datetime import datetime , timedelta
from dateutil.relativedelta import *

parser = Parser()	# Build this outside the fn. so it doesn't rebuild each time
cal = Calendar()

schedule_verbs = ['add', 'set', 'make', 'create', 'get', 'schedule', 'appoint',
				 'slate', 'arrange', 'organize', 'construct', 'coordinate',
				 'establish', 'form', 'formulate', 'run', 'compose', 'have', 'meet',
				 'reschedule', 'find'] #'find' is for schedule-suggesting; be careful

schedule_suggest_verbs = ['suggest', 'recommend', 'propose', 'show']

schedule_nouns = ['appointment', 'meeting', 'meetup', 'reservation', 'session',
				 'talk', 'call', 'powwow', 'meet', 'rendezvous', 'event', 'conference', 'time']

doc_verbs = ['open', 'open up', 'view', 'launch', 'look','display', 'check', 'start',
				'begin','create', 'make', 'get', 'have', 'set', 'generate', 'show', 'pull']

avail_words = ['free', 'available', 'works', 'potential', 'options']

time_words = ['tomorrow', 'today', 'Monday', 'Tuesday', 'Wednesday', 'Thursday',
							'Friday', 'Saturday', 'Sunday','a.m.', 'am', 'p.m.', 'pm', 'week',
							'month', 'day', 'time', 'year', 'date']

doc_nouns = ['doc', 'dog', 'dock' , 'document', 'script', 'record', 'report', 'page', 'notepad']
Example #26
from nltk.grammar import Nonterminal, induce_pcfg
from nltk.parse.generate import generate
from nltk.tokenize import PunktSentenceTokenizer
from stat_parser import Parser

from contractions import contractions


sent_tokenizer = PunktSentenceTokenizer()

with open("<source of text>", "r") as f:
    text = f.read()

for k, v in contractions.items():
    text = text.replace(k, v)

sents = []
for paragraph in text.split('\n'):
    sents += sent_tokenizer.tokenize(paragraph)

parser = Parser()

productions = []
for sent in sents[:25]:
    try:
        tree = parser.parse(sent)
        productions += tree.productions()
    except:  # skip sentences the parser cannot handle
        pass

S = Nonterminal('S')
grammar = induce_pcfg(S, productions)

for sentence in generate(grammar, depth=5):
    print " ".join(sentence) + "\n"
Example #27
from flask import Flask, send_file, request, jsonify
import nltk
from nltk import Tree
from stat_parser import Parser
import sys
import json

app = Flask(__name__)


@app.route("/")
def index():
    return send_file("static/index.html")


parser = Parser()


#converts an nltk Tree to a dictionary
def tree2dict(tree, parent=None):
    return {
        "parent":
        parent,
        "name":
        tree.label(),
        "children": [
            tree2dict(t, tree.label()) if isinstance(t, Tree) else {
                "name": t,
                "parent": tree.label(),
                "children": None
            } for t in tree
        ],
    }
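A usage sketch for the converter above; the sentence is illustrative:

# Hypothetical call: parse a sentence and serialize the tree for the static page
tree = parser.parse("the dog chased the cat")
print(json.dumps(tree2dict(tree), indent=2))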
Example #29
class SySE:
    ####Supervised Training.
    #trainingSentences: sentences on which to train (Must already be parsed)
    #labels: corresponding binary (1,0) labels.
    #alpha: laplace/additive smoothing parameter (default = 0.1)
    #beta: a priori multiplicity parameter (default = 0.1)
    def train(self, trainingSentences, labels, alpha = 0.1, beta = 0.1, debug = 0):
        if debug > -1:
            print
            print '*********************************************************'
            print '                   SySE V 0.1 '
            print 'Beginning Training Sequence with ' + \
                str(len(trainingSentences)) + ' training sentences...'
            print '*********************************************************'
            if debug > 0:
                print
                print 'Initializing... '
        
        if type(trainingSentences[0]) != list:
            print 'These sentences do not appear to have been parsed.'
            print 'They will be parsed now.'
            if len(trainingSentences) > 10:
                print 'Given their volume, this will take some time.'
            try:
                self.parser = Parser()
            except:
                print 'This environment should have pystatparser loaded ' + \
                'in order to train on unparsed sentences.'
                print 'Exiting...'
                return
            trainingSentences = [self.parser.parse(x) for x in trainingSentences]
        
        ####Initialization
        #Save hyperparameters
        self.alpha = alpha
        self.beta = beta
        
        #See what tags are in the training data.
        tags = []
        for sentence in trainingSentences:
            flat = self.recursiveFlatten(sentence)
            for el in flat:
                if type(el) == unicode and el not in tags:
                    tags.append(el)            
            
        self.tags = set(tags)
        
        #What kind of root tags are there?
        self.sentenceTypes = set([x[0] for x in trainingSentences])
        
        #Which tags may contain other tags?
        self.phraseTags = []
        for sentence in trainingSentences:
            flat = self.recursiveFlatten(sentence)
            for i in range(0,len(flat)):
                try:
                    if type(flat[i]) == unicode and type(flat[i+1]) == unicode and flat[i] not in self.phraseTags:
                        self.phraseTags.append(flat[i])
                except IndexError:
                    print 'We\'ve reached the end of this sentence'
                    
        self.phraseTags = set(self.phraseTags) - self.sentenceTypes
       
        #Robustness
        labels = list(labels)    
        
        #Split training sentences into Important (I) and Regular (R) (Unimportant)
        importantSentences = filter(lambda x: labels[trainingSentences.index(x)]==1, trainingSentences)
        regularSentences = filter(lambda x: not labels[trainingSentences.index(x)]==1, trainingSentences)
    
        self.classPriors = []
        
        ###Test inputs
        #Make sure labels are right length for sentences.
        if len(labels) != len(trainingSentences):
            print 'Labels and trainingSentences must be the same length!'
            return
        #Make sure labels are valid
        for label in labels:
            if label != 0 and label != 1:
                print 'Labels should be either 0 or 1.'
                print 'exiting...'
                return
        
        #Split training sentences into Important (I) and Regular (R) (Unimportant)
        self.importantRootProbabilities = filter(lambda x: labels[trainingSentences.index(x)]==1, trainingSentences)
        self.regularRootProbabilities = filter(lambda x: not labels[trainingSentences.index(x)]==1, trainingSentences)
                            
        ###Train Class Priors
        self.classPriors.append(float(labels.count(0))/float(len(labels)))
        self.classPriors.append(float(labels.count(1))/float(len(labels)))
        
        if debug > 0:
            print '*********************************************************'
            print 'These are the class priors'
            print '*********************************************************'    
            print self.classPriors
            print
            print      
        
        ###Train Sentence Type
        self.importantRootProbabilities = dict(zip(list(self.sentenceTypes),np.zeros(len(list(self.sentenceTypes)))))
        self.regularRootProbabilities = dict(zip(list(self.sentenceTypes),np.zeros(len(list(self.sentenceTypes)))))
        
        #Get the count of each sentence type in I
        for sentence in importantSentences:
            #Make sure we get what we expect
            if type(sentence[0]) != unicode:
                print "We are looking for a non-unicode sentence type. exiting..."
                break
                return            
            #if it isn't in the list yet, add it.
            self.importantRootProbabilities[sentence[0]] += 1
            
        #We will now implement a softmax to turn the counts into probabilities
        for param in self.importantRootProbabilities:
            self.importantRootProbabilities[param]=(float(self.importantRootProbabilities[param]) + alpha)/ \
                (float(len(trainingSentences)) + alpha*(len(self.importantRootProbabilities)+1))
                
        #Get the count of each sentence type in R
        for sentence in regularSentences:
            #Make sure we get what we expect
            if type(sentence[0]) != unicode:
                print "We are looking for a non-unicode sentence type. exiting..."
                break
                return            
            #if it isn't in the list yet, add it.
            self.regularRootProbabilities[sentence[0]] += 1
        
        #We will now implement a softmax to turn the counts into probabilities
        for param in self.regularRootProbabilities:
            self.regularRootProbabilities[param]=float(self.regularRootProbabilities[param])/ \
                float(len(trainingSentences))
        
        if debug > 0:
            print '*********************************************************'
            print 'These are the sentence type parameters'
            print '*********************************************************'
            
            print ' --------------------------------------------------------'
            print ' For Important Sentences:'
            print self.importantRootProbabilities
            
            print ' --------------------------------------------------------'
            print ' For Regular Sentences:'
            print self.regularRootProbabilities
            print
            print
        
        ###Train Phrases
        
        ##Primitive Inference on Multiplicity Parameter
        #Define dictionaries to store times a tag was included in a phrase
        tagInclusionI = dict(zip(list(self.tags), np.zeros(len(list(self.tags)))))#How many times is a tag in a level?
        tagInclusionR = dict(zip(list(self.tags), np.zeros(len(list(self.tags)))))#How many times is a tag in a level?
        #Define dictionaries to store times a tag was used at all.
        tagCountI = dict(zip(list(self.tags), np.zeros(len(list(self.tags)))))#How many total times does the tag appear?
        tagCountR = dict(zip(list(self.tags), np.zeros(len(list(self.tags)))))#How many total times does the tag appear?
        #To store dumb poisson inference
        self.importantMultiplictyParameter = dict(zip(list(self.tags), np.zeros(len(list(self.tags)))))#For storing parameter estimates.
        self.regularMultiplictyParameter = dict(zip(list(self.tags), np.zeros(len(list(self.tags)))))#For storing parameter estimates.
        
        #Get Inclusion for I
        for sentence in importantSentences:
            self.getInclusions([sentence,tagInclusionI,debug>=2])
        
        #Get Inclusion for R
        for sentence in regularSentences:
            self.getInclusions([sentence,tagInclusionR,debug>=2])
        
        #Get Counts for I
        for sentence in importantSentences:
            flat = self.recursiveFlatten(sentence)
            currentTags = filter(lambda x: type(x)==unicode, flat)
            for tag in currentTags[1:]:
                tagCountI[tag] += 1
        
        #Get Counts for R
        for sentence in regularSentences:
            flat = self.recursiveFlatten(sentence)
            currentTags = filter(lambda x: type(x)==unicode, flat)
            for tag in currentTags[1:]:
                tagCountR[tag] += 1
        
        #Estimate Parameters for I
        for tag in tagInclusionI.keys():
            if (tagCountI[tag] > 1):
                self.importantMultiplictyParameter[tag] = (tagCountI[tag]-1) / tagInclusionI[tag]
                
        #Estimate Parameters for R
        for tag in tagInclusionR.keys():
            if (tagCountR[tag] > 1):
                self.regularMultiplictyParameter[tag] = (tagCountR[tag]-1) / tagInclusionR[tag]
        
            if debug > 0:
                print '*********************************************************'
                print ' Estimation for Multiplicity Parameters '
                print '*********************************************************'
                print
                print ' ------------------------------------------------------------------'
                print 'Tag Counts for Important Sentences:'
                print tagCountI    
                print 'Tag Counts for Regular Sentences:'
                print tagCountR
                print ' ------------------------------------------------------------------'
                print 'Tag Inclusion for Important Sentences:'
                print tagInclusionI
                print 'Tag Inclusion for Regular Sentences:'
                print tagInclusionR
                print ' ------------------------------------------------------------------'
                print 'Dumb Parameter Estimates for Imporant Sentences:'
                print self.importantMultiplictyParameter
                print 'Dumb Parameter Estimates for Regular Sentences:'
                print self.regularMultiplictyParameter    
                print
                print
        
        ##Primitive Inference on Presence Parameters
        
        #We need to find inclusions given parent
        #To store conditional presence probabilities, what can almost be \
            #thought of as transition probabilities.
        #This is the uninformed probability of a particular presence.
        ui = self.alpha / (self.alpha*(len(self.regularRootProbabilities) + 1))
        #For important phrases
        self.importantCondPresenceProbs = np.zeros([len(self.tags),len(self.phraseTags) + len(self.sentenceTypes)])
        self.importantCondPresenceProbs = pd.DataFrame(self.importantCondPresenceProbs).applymap(lambda x: x + ui)
        self.importantCondPresenceProbs.columns = list(self.sentenceTypes) + list(self.phraseTags)
        self.importantCondPresenceProbs.index = list(self.tags)
        
        #For regularPhrases
        self.regularCondPresenceProbs = np.zeros([len(self.tags),len(self.phraseTags) + len(self.sentenceTypes)])
        self.regularCondPresenceProbs = pd.DataFrame(self.regularCondPresenceProbs).applymap(lambda x: x + ui)
        self.regularCondPresenceProbs.columns = list(self.sentenceTypes) + list(self.phraseTags)
        self.regularCondPresenceProbs.index = list(self.tags)
        
        #Define dictionaries to store times a tag was used at all. This time, \
            #We care about root/sentence tags as well.
        tagCountI = dict(zip(list(self.tags), np.zeros(len(list(self.tags)))))#How many total times does the tag appear?
        tagCountR = dict(zip(list(self.tags), np.zeros(len(list(self.tags)))))#How many total times does the tag appear?
        
        #Tag counts, but on sentences as well, unlike above.
        
        
        #Count Conditional Inclusions for Important Sentences
        for sentence in importantSentences:
            self.getInclusionsGivenParent([sentence,self.importantCondPresenceProbs,sentence[0],debug>=2])
        
        #Count Conditional Inclusions for Regular Sentences
        for sentence in regularSentences:
            self.getInclusionsGivenParent([sentence,self.regularCondPresenceProbs,sentence[0],debug>=2])
        
        #Get Counts for I
        for sentence in importantSentences:
            flat = self.recursiveFlatten(sentence)
            currentTags = filter(lambda x: type(x)==unicode, flat)
            for tag in currentTags:
                tagCountI[tag] += 1
        
        #Get Counts for R
        for sentence in regularSentences:
            flat = self.recursiveFlatten(sentence)
            currentTags = filter(lambda x: type(x)==unicode, flat)
            for tag in currentTags:
                tagCountR[tag] += 1
        
        #Calculate Conditional Presence Parameter for Important Sentences
        for column in self.importantCondPresenceProbs.columns:
            if tagCountI[column] > 0:
                num = self.importantCondPresenceProbs.loc[:,column] + alpha
                denom = tagCountI[column] + (len(self.importantCondPresenceProbs.columns) + 1)*alpha
                self.importantCondPresenceProbs.loc[:,column] = num/denom
            
        #Calculate Conditional Presence Parameter for Regular Sentences
        for column in self.regularCondPresenceProbs.columns:
            if tagCountR[column] > 0:
                #AdditiveSmoothing
                num = self.regularCondPresenceProbs.loc[:,column] + alpha
                denom = tagCountR[column] + (len(self.regularCondPresenceProbs.columns) + 1)*alpha
                self.regularCondPresenceProbs.loc[:,column] = num/denom
        
        if debug > 1:
            print '*********************************************************'
            print 'Presence Parameter Estimation'
            print '*********************************************************'
            print
            print ' ------------------------------------------------------------------'
            print ' Conditional Parameters for Important Sentences'
            print self.importantCondPresenceProbs
            print
            print ' ------------------------------------------------------------------'
            print ' Conditional Parameters for Regular Sentences'
            print self.regularCondPresenceProbs
            print ' ------------------------------------------------------------------'    
        
        if debug > -1:
            print
            print
            print '...Finished'
        
        ####Classification
    def classify(self, sentence, debug = 0):
        #If the sentence hasn't been parsed, we must parse it.
        plaintext = False
        if type(sentence) != list:
            plaintext = True
            original = sentence
            try:
                sentence = self.parser.parse(sentence)
            except:
                try:
                    self.parser = Parser()
                    sentence = self.parser.parse(sentence)
                except:
                    print 'Couldn\'t create a parsing object.'
                    print 'Perhaps pystatparser is not loaded?'
                    print 'type "from stat_parser import Parser"'
            
        #Deal with new root types
        if sentence[0] not in self.importantRootProbabilities:
            self.importantRootProbabilities[sentence[0]] = self.alpha/(self.alpha*(len(self.importantRootProbabilities) + 1))
        if sentence[0] not in self.regularRootProbabilities:
            self.regularRootProbabilities[sentence[0]] = self.alpha/(self.alpha*(len(self.regularRootProbabilities) + 1))
        
        #Deal with new non-root tag types
        flat = self.recursiveFlatten(sentence)
        flat = filter(lambda x: type(x) == unicode, flat)
        for i,tag in enumerate(flat):
            if tag not in self.tags:
                #Set a priori beliefs for multiplicity parameters
                self.importantMultiplictyParameter[tag] = self.beta
                self.regularMultiplictyParameter[tag] = self.beta
                
                #Set a priori beliefs for conditional presence parameters
                self.importantCondPresenceProbs.loc[tag] = np.repeat(self.alpha / (self.alpha*(len(self.regularRootProbabilities) + 1)),len(self.importantCondPresenceProbs.columns))
                self.regularCondPresenceProbs.loc[tag] = np.repeat(self.alpha / (self.alpha*(len(self.regularRootProbabilities) + 1)),len(self.regularCondPresenceProbs.columns))
                if type(flat[i+1])==unicode:
                    #Set a priori beliefs for conditional presence parameters
                    self.importantCondPresenceProbs[tag] = np.repeat(self.alpha / (self.alpha*(len(self.regularRootProbabilities) + 1)),len(self.importantCondPresenceProbs.index))
                    self.regularCondPresenceProbs[tag] = np.repeat(self.alpha / (self.alpha*(len(self.regularRootProbabilities) + 1)),len(self.regularCondPresenceProbs.index))
        
        ##Get P(x|y = IMPORTANT)
        PxGy1 = math.log(self.importantRootProbabilities[sentence[0]]) + self.getConditionalLevelProbability([sentence,self.importantCondPresenceProbs,self.importantMultiplictyParameter,sentence[0],debug>=2])
        
        ##Get P(x|y = REGULAR) 
        PxGy0 = math.log(self.regularRootProbabilities[sentence[0]]) + self.getConditionalLevelProbability([sentence,self.regularCondPresenceProbs,self.regularMultiplictyParameter,sentence[0],debug>=2])
        
        #Get priors in a log form:
        Py1 = math.log(self.classPriors[1])
        Py0 = math.log(self.classPriors[0])
        
        #Get log Probabilities of each class through Bayes' Rule
        Py1Gx = PxGy1+Py1
        Py0Gx = PxGy0+Py0
        
        #SoftMax probabilities
        denom = math.log(math.e**Py1Gx + math.e**Py0Gx)
        
        sPy1Gx = Py1Gx-denom
        sPy0Gx = Py0Gx-denom
        
        #Turn back into probabilities for output
        sPy1Gx = math.e**sPy1Gx
        sPy0Gx = math.e**sPy0Gx
        
        if debug > -1:
            print 'Estimating Class for sentence:'
            if plaintext:
                print '\"' + original + '\"'
            else:
                print sentence
        if debug > 0:
            print ' ------------------------------------------------------------------'
            print 'Class Priors (log probability):'
            print 'P(important) = ' + str(Py1)
            print 'P(unimportant) = ' + str(Py0)
            print ' ------------------------------------------------------------------'
            print 'Conditional Sentence Log Probabilities:'
            print 'P(sentence | important) = ' + str(PxGy1)
            print 'P(sentence | unimportant) = ' + str(PxGy0)
            print ' ------------------------------------------------------------------'
            print 'Unnormalized Conditional Class Log Probabilities'
            print 'P(important | sentence) = ' + str(Py1Gx)
            print 'P(unimportant | sentence) = ' + str(Py0Gx)
        if debug > -1:
            print ' ------------------------------------------------------------------'
            print 'Softmaxed Conditional Class Probabilities'
            print 'P(important | sentence) = ' + str(sPy1Gx)
            print 'P(unimportant | sentence) = ' + str(sPy0Gx)
        return(sPy1Gx)
    
    def summarize(self, article, verbosity = 0.5, debug = 0):
        sentences = self.split_into_sentences(article)
        
        keepers = []        
        i = 0
        for sentence in sentences:
            i += 1
            try:
                if self.classify(sentence, debug = debug) > verbosity:
                    keepers.append(sentence)
            except:
                print 'Error classifying sentence ' + str(i)
                print 'FullText: ' 
                print sentence
        if len(keepers) == 0:
            print 'No sentences found important'
            return('')
        reduced = reduce(lambda x,y: x + ' ' + y, keepers)
        return(reduced)
    
    ####Function Definitions
    #Returns the log probability of a level occurring, using recursion to
    #find the levels contained therein. May be passed an entire sentence.
    def getConditionalLevelProbability(self, inputs):
        level = inputs[0]
        tagDF = inputs[1]
        mult = inputs[2]
        parent = inputs[3]
        debug = inputs[4]
        ret = 0
        if debug == 1:
            print 'Beginning Level...........'
            print level
        inTags = [x[0] for x in level[1:]]
        if u'' in inTags:
            inTags.remove(u'')
        
        #Do some recursion
        for i,tag in enumerate(inTags):
            if tag in self.phraseTags or tag in self.sentenceTypes:
                if debug == 1:
                    print 'beginning recursion due to:'
                    print tag
                ret = ret + self.getConditionalLevelProbability([level[i+1],tagDF,mult,tag,debug])
        
        #Do multiplicity for this level
        for tag in inTags:
            x = inTags.count(tag)
            mu = mult[tag]
            ret = ret + 0#math.log((math.exp(-mu) * mu**x / math.factorial(x)))
        
        #Do presence for this level
        inTags = list(set(inTags))
        for tag in inTags:
            if type(tag) != unicode:#Some sentences contain only a word, and we won't need to add anything in that case.
                print 'Breaking Due to non-unicode tag in getConditionalLevelProbability!'
                print tag
                break
            if debug == 1:
                print 'Probability of ' + tag + ' given ' + parent + ' is ' + str(tagDF.loc[tag,parent])
            ret = ret + math.log(tagDF.loc[tag,parent])
        return(ret)
        
    #To get the inclusion
    def getInclusionsGivenParent(self, inputs):
        level = inputs[0]
        tagDF = inputs[1]
        parent = inputs[2]
        debug = inputs[3]
        if debug == 1:
            print 'Beginning Level...........'
            print level
        inTags = [x[0] for x in level[1:]]
        if u'' in inTags:
            inTags.remove(u'')
            
        #Do some recursion
        for i,tag in enumerate(inTags):
            if tag in self.phraseTags or tag in self.sentenceTypes:
                if debug == 1:
                    print 'beginning recursion due to:'
                    print tag
                self.getInclusionsGivenParent([level[i+1],tagDF,tag,debug])
        
        #Add count for this level
        inTags = list(set(inTags))
        for tag in inTags:
            if type(tag) != unicode:#Some sentences contain only a word, and we won't need to add anything in that case.
                break
            if debug == 1:
                print 'incrementing: ' + tag + ' when conditioned on ' + parent
            tagDF.loc[tag,parent] += 1
            
            
    #To get the inclusions in a level recursively.
    def getInclusions(self,inputs):
        level = inputs[0]
        tagDict = inputs[1]
        debug = inputs[2]
        if debug == 1:
            print 'Beginning Level...........'
            print level
        inTags = [x[0] for x in level[1:]]
        if u'' in inTags:
            inTags.remove(u'')
            
        #Do some recursion
        for i,tag in enumerate(inTags):
            if tag in self.phraseTags or tag in self.sentenceTypes:
                if debug == 1:
                    print 'beginning recursion due to:'
                    print tag
                self.getInclusions([level[i+1],tagDict,debug])
        
        #Add count for this level
        inTags = list(set(inTags))
        for tag in inTags:
            if type(tag) != unicode:#Some sentences contain only a word, and we won't need to add anything in that case.
                break
            if debug == 1:
                print 'incrementing: ' + tag
            tagDict[tag] += 1
    
    #To find all PoS tags (pystatparser's documentation is literally non-existent)
    def getTagsRecursively(self, ss, knownTags = [], debug = 0):
        ret = knownTags
        for sentence in ss:
            for phrase in sentence:
                for element in phrase:
                    if type(element) == unicode:
                        if element not in ret:
                            ret.append(element)        
                    if type(element) == list:
                        ret.extend(self.getTagsRecursively(element))
        return(ret)
        
    #Flatten an n-dimensional list into a 1D list
    def recursiveFlatten(self, myList):
        ret = []
        for element in myList:
            if type(element) == list:
                element = self.recursiveFlatten(element)
            if type(element) == str or type(element) == unicode:
                ret.append(element)
            else:
                ret.extend(list(element))
        return(ret)
    
    #From http://stackoverflow.com/questions/4576077/python-split-text-on-sentences
    def split_into_sentences(self, text):
        if type(text) == unicode:
            text = unicode(text.encode('utf-8'), errors = 'ignore')
            text = unicodedata.normalize('NFKD',text).encode('ascii','ignore')
        caps = "([A-Z])"
        prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
        suffixes = "(Inc|Ltd|Jr|Sr|Co)"
        starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
        acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
        websites = "[.](com|net|org|io|gov)"
        text = " " + text + "  "
        text = text.replace("\n"," ")
        text = re.sub(prefixes,"\\1<prd>",text)
        text = re.sub(websites,"<prd>\\1",text)
        if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
        if 'a.m.' in text: text = text.replace('a.m.','a<prd>m<prd>')
        if 'p.m.' in text: text = text.replace('p.m.','p<prd>m<prd>')
        if '...' in text: text = text.replace('...','<prd><prd><prd>')
        text = re.sub("\s" + caps + "[.] "," \\1<prd> ",text)
        text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
        text = re.sub(caps + "[.]" + caps + "[.]" + caps + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
        text = re.sub(caps + "[.]" + caps + "[.]","\\1<prd>\\2<prd>",text)
        text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
        text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
        text = re.sub(" " + caps + "[.]"," \\1<prd>",text)
        if "”" in text: text = text.replace(".”","”.")
        if "\"" in text: text = text.replace(".\"","\".")
        if "!" in text: text = text.replace("!\"","\"!")
        if "?" in text: text = text.replace("?\"","\"?")
        text = text.replace(".",".<stop>")
        text = text.replace("?","?<stop>")
        text = text.replace("!","!<stop>")
        text = text.replace("<prd>",".")
        sentences = text.split("<stop>")
        sentences = sentences[:-1]
        sentences = [s.strip() for s in sentences]
        return sentences
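The summarize method above keeps only the sentences whose classify score exceeds the verbosity threshold and then joins the survivors. A minimal standalone sketch of that keep-if-above-threshold pattern, with a toy scoring function standing in for classify (illustrative only, not part of the class above):

def toy_score(sentence):
    # Toy stand-in for classify(): favors longer sentences.
    return min(1.0, len(sentence.split()) / 10.0)

sentences = ["Short one.", "This considerably longer sentence carries much more of the important detail."]
verbosity = 0.5
keepers = [s for s in sentences if toy_score(s) > verbosity]
summary = reduce(lambda x, y: x + ' ' + y, keepers) if keepers else ''
print summary  # only the long sentence survives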
Ejemplo n.º 30
0
def main():
    time1 = time()
    parser = Parser()

    inFile = sys.argv[1]
    outFile = sys.argv[2]
    f = open(outFile,'w+')
    
    for line in open(inFile):
        if config.print_line: 
            print line

        global code, errorCode, nodeList, nodeNum, firstVBNN, firstNNVB, found, sqFlag, gqFlag, NNNode, VBNode
        code = 0
        nodeNum = 0
        errorCode = []
        nodeList = []
        firstVBNN = []
        firstNNVB = []
        found = 0
        sqFlag = 0
        gqFlag = 0
        NNNode = []
        VBNode = []
            
        wordNum = len(line.split())
        if config.print_word_num:
            print 'Word Num: ', wordNum

        if config.ignore_long_sentence and wordNum>config.max_word_num:
            # print 'Long Sentence'
            # print '\n'*2
            # print -1, line,
            f.write(str(-1) + ' ' + line)
            continue 

        try:
            start = time()
            indentTree = parser.raw_parse(line)     # raw parse for indent tree list
            end = time()
        except:
            # print 'Parsing Error'
            # print '\n'*2
            # print -1, line,
            f.write(str(-1) + ' ' + line)
            continue

        if config.print_parse_time:
            print 'Raw Parse Time: ', end-start, 's'
        
        if config.print_indent_tree:
            pprint(indentTree)                      # unit test

        root = trans(indentTree)                    # recursively transform list to "tree of node"
        assignCode(root)
        assignDesNum(root)
        
        if config.dfs_indent:
            dfsIndent(root,0)                       # unit test

        if config.print_node_list:                  # unit test
            for node in nodeList:
                print node.getData(), '\t',
            print
        
        if config.show_nltk_tree:
            start = time()
            nltkTree = parser.parse(line)           # nltkTree, could be drawn into graph
            end = time()
            print 'NLTK Tree Parse Time: ', end-start, 's'
            display_tree(nltkTree)                  # unit test

        """ Now the check begins! """
        preCheck()
        totalCheck(root)

        """ If there is some NP+VP left """
        for i in range(len(nodeList)-1):
            node1 = nodeList[i]
            node2 = nodeList[i+1]
            if node2.getParent().getData() == u'RB' and i+2<len(nodeList):
                node2 = nodeList[i+2]
            if node1.getParent().getData() in NNList and node2.getParent().getData() in VBList:
                if (node1 in NNNode and node2 in VBNode) or (firstNNVB and (node1==firstNNVB[0] and node2==firstNNVB[1])):
                    pass
                else:
                    NNNode.append(node1)
                    VBNode.append(node2)

        """ Be Check """
        for i in range(len(nodeList)-1):
            node1 = nodeList[i]
            node2 = nodeList[i+1]
            if node2.getParent().getData() == u'RB' and i+2<len(nodeList):
                node2 = nodeList[i+2]
            if node1.getData().lower() == 'i':
                if node2.getData() in BEList and node2.getData() not in ['am', 'was',"'m"]:
                    errorCode.append(node2.getCode())
            elif node1.getData().lower() in PRPSecondList:
                if node2.getData() in BEList and node2.getData() not in ['are', 'were',"'re"]:
                    errorCode.append(node2.getCode())
            elif node1.getData().lower() in PRPThirdList:
                if node1.getData().lower()=='that':
                    if i>0 and nodeList[i-1].getParent().getData() in NNList:
                        continue
                if node2.getData() in BEList and node2.getData() not in ['is', 'was',"'s"]:
                    errorCode.append(node2.getCode())
            else:
                pass

        """ del duplicates """
        if firstNNVB:
            n = firstNNVB[0]
            v = firstNNVB[1]
            if n in NNNode and v in VBNode:
                NNNode.remove(n)
                VBNode.remove(v)
                

        """ replace RB with the word after (Maybe VB) """
        if firstNNVB:
            v = firstNNVB[1]
            if v.getParent().getData()==u'RB':
                code = v.getCode()
                new = nodeList[code+1]
                if new.getParent().getData() in VBList:
                    firstNNVB[1] = new
                else:
                    pass

        for v in VBNode:
            if v.getParent().getData()==u'RB':
                code = v.getCode()
                new = nodeList[code+1]
                if new.getParent().getData() in VBList:
                    VBNode[VBNode.index(v)] = new
                else:
                    pass

        """ print dependencies """
        if config.print_npvp:
            print 'FROM ERRORCODE:'
            for i in errorCode:
                print nodeList[i].getData(), '\t',
            print

            print 'FROM QUESTION:'
            for node in firstVBNN:
                print node.getData()
            for node in firstNNVB:
                print node.getData()
            print

            print 'FROM NPVP & OTHERS:'
            for i in range(len(NNNode)):
                print NNNode[i].getParent().getData(), NNNode[i].getData(), '\t', VBNode[i].getParent().getData(), VBNode[i].getData()
            print

        """ canonicalize """
        if firstVBNN:
            if not (firstVBNN[0].getParent().getData() in VBList and firstVBNN[1].getParent().getData() in NNList):
                firstVBNN = []

        if firstNNVB:
            if not (firstNNVB[1].getParent().getData() in VBList and firstNNVB[0].getParent().getData() in NNList):
                firstNNVB = []

        for v in VBNode:
            i = VBNode.index(v)
            if not (NNNode[i].getParent().getData() in NNList and VBNode[i].getParent().getData() in VBList):
                del NNNode[i]
                del VBNode[i]
        

        """ Finally! We add codes! """
        if config.print_standard_answer:
            """ FROM QUESTION """
            if sqFlag or gqFlag:
                if firstVBNN:
                    v = firstVBNN[0]
                    n = firstVBNN[1]
                    if n.getData().lower() in PRPThirdList or n.getParent().getData() in [u'NN', u'NNP']:       # single noun
                        if v.getParent().getData() in [u'VB', u'VBP']:
                            errorCode.append(v.getCode())
                    else:
                        if v.getParent().getData()==u'VBZ':
                            errorCode.append(v.getCode())

                    if firstNNVB:
                            v = firstNNVB[1]
                            if v.getParent().getData()==u'VBZ':
                                errorCode.append(v.getCode())

                else:
                    if firstNNVB:
                        if not (firstNNVB[0] in NNNode and firstNNVB[1] in VBNode):
                            NNNode.append(firstNNVB[0])
                            VBNode.append(firstNNVB[1])

            """ FROM NPVP & OTHERS """
            for i in range(len(NNNode)):
                n = NNNode[i]
                v = VBNode[i]
                if n.getData().lower() in PRPThirdList or n.getParent().getData() in [u'NN', u'NNP']:       # single noun
                    if v.getParent().getData() in [u'VB', u'VBP']:
                        errorCode.append(v.getCode())
                else:
                    if v.getParent().getData()==u'VBZ':
                        errorCode.append(v.getCode())

            errorCode = list(set(errorCode))
            errorCode.sort()
            if errorCode:
                for i in errorCode:
                    # print i+1,
                    f.write(str(i+1) + ' ')
            else:
                # print -1,
                f.write(str(-1) + ' ')
            # print line,
            f.write(line)


        if config.print_vb:
            printVB(root)                           # print all verb and MD in tree
            print

        if config.print_empty_line:
            print '\n'*2

    time2 = time()
    if config.print_total_time:
        print 'Total Execution Time: ', time2-time1, 's'
        
    if config.show_nltk_tree:
        Tkinter._test()                             # show the nltkTree graph
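A small standalone sketch of the pronoun/be-verb agreement idea behind the "Be Check" block above; the word lists here are illustrative stand-ins for BEList, PRPSecondList and PRPThirdList, which are defined elsewhere in that project:

BE = ['am', 'is', 'are', 'was', 'were', "'m", "'s", "'re"]
SECOND = ['you', 'we', 'they']
THIRD = ['he', 'she', 'it', 'that', 'this']

def be_agreement_errors(tokens):
    # Return the indices of be-verbs that disagree with the preceding pronoun.
    errors = []
    for i in range(len(tokens) - 1):
        w1, w2 = tokens[i].lower(), tokens[i + 1]
        if w2 not in BE:
            continue
        if w1 == 'i' and w2 not in ['am', 'was', "'m"]:
            errors.append(i + 1)
        elif w1 in SECOND and w2 not in ['are', 'were', "'re"]:
            errors.append(i + 1)
        elif w1 in THIRD and w2 not in ['is', 'was', "'s"]:
            errors.append(i + 1)
    return errors

print be_agreement_errors("I is happy".split())   # [1]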
Ejemplo n.º 31
0
from stat_parser import Parser
from graph import Graph,get_leaves,merge_graphs
from nltk.tree import ParentedTree

parser = Parser()
trees = []
trees.append(parser.parse("The food was on the table where the child likes to eat"))
trees.append(parser.parse("The money is on the table"))
trees.append(parser.parse("Put the data in the table"))
trees.append(parser.parse("Add more rows to the database table"))
trees.append(parser.parse("Add more rows to the database table"))
trees.append(parser.parse("Why is the table empty It should have data in it"))
trees.append(parser.parse("Do not put your elbows on the table while you eat"))

trees = [ParentedTree.convert(tree) for tree in trees]
graphs = []
for tree in trees:
	g = Graph()
	g.update(tree)
	graphs.append(g)
new_graph = merge_graphs(graphs)
new_graph.draw("new_graph")
new_graph.save_to_file("new_graph.gml")
new_graph.load_from_file("new_graph.gml")
print new_graph.get_median_relatedness()
print new_graph.get_senses("table")
Ejemplo n.º 32
0
import nltk
import json
import yaml
from random import choice
from stat_parser import Parser, display_tree
parser = Parser()
d = json.load(open('tree_ship_words.json'))
f = open("ships.yml")
ships = yaml.load(f.read())

keywords = {}
keys = []

def is_leaf(tree):
  if len(tree.leaves()) == 1:
    return True
  else:
    return False

def check_for_roll(tree):
  leaves = list(tree)
  key = ""
  words = []
  for leaf in leaves:
    if is_leaf(leaf):
      bottom = leaf
      while type(bottom) not in [str, unicode]:
        leaf = bottom
        bottom = list(leaf)[0]
      key += leaf.node
      words.append(leaf.leaves()[0])
CONSUMER_KEY = os.environ['tw_pg_consumerkey']
CONSUMER_SECRET = os.environ['tw_pg_consumer']
OAUTH_TOKEN = os.environ['tw_pg_token']
OAUTH_TOKEN_SECRET = os.environ['tw_pg_secret']

auth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,
                           CONSUMER_KEY, CONSUMER_SECRET)

twitter_api = twitter.Twitter(domain = 'api.twitter.com', api_version = '1.1', auth = auth, format = 'json')
posts = twitter_api.statuses.user_timeline(count = '200') #does not fetch retweets right now, set include_rts = true if needed

# pull out tweets as list
tweets = [ipost['text'] for ipost in posts] 

#####
# apply regexp chunker to get noun phrases
# http://nltk.org/book3/ch07.html
# process: 1) sentence segmentation, 2) tokenization, 3) part of speech tagging
# 4) entity detection
#
# stat_parser does all four steps above within its parser (a standalone NLTK
# sketch of the same pipeline follows this snippet).
# TODO: still need to check whether the results are reasonable.

parser = Parser()
result = [parser.parse(tweet) for tweet in tweets]

for res in result:
    print(res)
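For comparison, a minimal sketch of the same four steps done explicitly with NLTK rather than through stat_parser (the tweet text is illustrative, and the usual NLTK data packages such as punkt and the tagger/chunker models are assumed to be downloaded):

import nltk

tweet = "NASA launched a new probe toward Jupiter on Friday."
for sent in nltk.sent_tokenize(tweet):            # 1) sentence segmentation
    tokens = nltk.word_tokenize(sent)             # 2) tokenization
    tagged = nltk.pos_tag(tokens)                 # 3) part-of-speech tagging
    entities = nltk.ne_chunk(tagged)              # 4) entity detection
    print entities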

Ejemplo n.º 34
0
def testing_stat_parser(message):
    parser = Parser()
    return parser.parse(message)
# -*- coding: utf-8 -*-
# Import required libraries

import nltk, re, csv
from stat_parser import Parser, display_tree
parser = Parser()
from nltk.tree import Tree
from nltk.stem.wordnet import WordNetLemmatizer

# Define all the lists that are checked for the requirements

not_atomic_list = ["and that", "and also", "but ", "so that", "while ", "however ", "whereas ", "on the other hand", "in addition to", "respectively", "as well as", "thereby", "though ",  "thus ", " hence ", "therefore", "yet ", " including ", "in contrast", "contrary to", " beside", "aside from", "other than", "explaining", "which explains"]
not_independent_list = ["this study ", "our study", "the results ", "results ", "the findings ", "the present study ", "these findings ", "these results ", "this research ", "this data ", "the data ", "these data", "our data", "these observations", "this experiment ", "this publication ", "this analysis", "these analyses", "evidence", "this paper ", "the paper ", "this report ", "the report ", "this effect ", "we ", "compared with", "and other", "previous ", "previously", "the bacterium "]
not_declarative_list = ["?", "!"]
not_absolute_list = ["probabl", "perhaps", "potentially", "putative", "maybe", "plausible", "possible", "likely", "feasible", "hypothetical", "may", "could ", " seem ", "appears to", "appear to", " appear ", " might ", " suggest ", "minimally sufficient", "is predicted", "is foreseen", "is envisioned", "revealed that", "reveals that", "significant", "significantly", "to reveal", " estimated ", " estimate"]

# From here on, all the functions are defined that check whether the sentence fulfills the AIDA rules,
# and if they do not, the sentence is rewritten with individual functions per requirement
# (Yes, for the moment nothing is done when a sentence is not atomic or not independent..)

def check_if_atomic(sentence, parsed_sentence, tags):
    counter = 0
    atomic_check = re.compile("|".join(not_atomic_list))
    tree = Tree('s', parsed_sentence)
    for child in tree:
        string = str(child)
        if string.startswith("(S"):
            counter += 1
    if atomic_check.search(sentence.lower()):
        return False
    elif counter > 1:
    data = open("data/extracted.txt").read()
    data = ''.join([i if ord(i) < 128 else '' for i in data])

    print "Tokenizing sentences"
    sentences = nltk.tokenize.sent_tokenize(data)
    open("token_cache", "w").write(pickle.dumps(sentences))

sentences = sentences[:500]

process_count = cpu_count()
# process_count = 1
# sentence_tasks = [sentences[i::process_count] for i in xrange(process_count)]

print "Using %d processes" % (process_count,)

parser = Parser()
parsed = parser.parse("This is a very long sentence.")

def delegate(task_queue, completed_queue):
    graph = Graph()
    parser = Parser()

    while True:
        try:
            sentence = task_queue.get(False)
        except:
            completed_queue.put(graph)
            print "My work here is done"
            return True
        print "Parsing sentence"
        parsed = parser.parse(sentence)
Ejemplo n.º 37
0
print ("Tagged words: %r\n" % tagged)





#-------------------------------------------------------------------------------

# Generate nltk tree

#-------------------------------------------------------------------------------



parser = Parser()

# http://www.thrivenotes.com/the-last-question/

#sentence = "What is the population of the country France?"

tree = parser.parse(sentence)

print ("--- Printing trees -----")

#print ("Tree 1: ",tree)

#print("\nTree 2: ", tree.pformat_latex_qtree())

#print("\nPretty tree:\n")
Ejemplo n.º 38
0
import sys
import nltk
import nltk.data
import nltk.tree
from stat_parser import Parser
import re

parser = Parser()


def getNodes(parent):
    for node in parent:
        if type(node) is nltk.Tree:
            if not getNodes(node):
                if node.label() == "VP":
                    # we want to remove some sentences describing
                    # environment
                    sentence = " ".join(node.leaves()).lower()
                    commands = re.split(r';|\,|\.|\>|\band\b|\bor\b|\bthen\b',
                                        sentence)
                    done = False
                    for command in commands:
                        if re.match(r'^[a-zA-Z0-9\;\,\.\-\*\:\'\"\/\s]{1,80}$',
                                    command) \
                                and re.search(r'[a-zA-Z]', command):
                            tokens = nltk.word_tokenize(command)
                            if len(tokens) > 0 and len(tokens) <= 5:
                                tagged = nltk.pos_tag(tokens)
                                if tagged[0][1] not in \
                                        ["VBZ", "VBN", "VBD", "VBP", "VBG",
                                         "MD", "NNS", "DT", "JJ"]:
Ejemplo n.º 39
0
class Translator:
    def __init__(self):
        self.filename='dictionary.txt'
        self.dict = {}
        bgm  = nltk.collocations.BigramAssocMeasures()
        finder = nltk.collocations.BigramCollocationFinder.from_words(nltk.corpus.brown.words())
        scores = finder.score_ngrams(bgm.likelihood_ratio)
        self.scored = {}
        for key, score in scores:
            self.scored[key] = score
        self.specialWords = [u'了', u'的']
        self.directions = ['east', 'west', 'south', \
        'north','northeast', 'southeast', 'northwest', 'southwest']
        self.parser = Parser()
    
    def tranlate(self):
        pass

    def loadDictionary(self):
        f = codecs.open(self.filename,'r','utf-8')
        regex = re.compile('(.*)\((.*)\)')
        ls = [ line.strip() for line in f]
        for i in ls :
            t = i.split(':')
            cn_word = t[0]
            en_words = []
            for w in t[1].split(';'):
                word = w.strip()
                m = regex.match(word) 
                if (m is not None):
                    en_words.append((m.group(1).strip(), m.group(2).strip()))
                else:
                    en_words.append((w.strip(), 'default'))
            self.dict[cn_word] = en_words
        f.close()

    def isNumerical(self, word):
        if word.isdigit():
            return True
        word = word.lower()
        if word in ['one', 'a', 'an']:
            return True
        else:
            return False

    def preProcess(self, sentence, pickWord):
        words = sentence.split(' ')
        en_sentence = []
        for word in words:
            w = word.split('#')
            word = w[0]
            t = w[1]
            if word == u'。':
                word = '.'
            elif word == u',':
                word = ','
            elif word == u'“' or word == u'”':
                word = '"'
            elif word == u'、':
                word = ','
            elif word == u':':
                word = ':'
            elif word == u'``':
                word = '"'
            if word in self.specialWords:
                #add tense
                if word == u'了':
                    en_sentence[-1] = conjugate(en_sentence[-1].strip(), 'p')
                continue
            if word in self.dict:
                #remove measure words
                if 'M' == t:
                    if len(en_sentence) > 0 and self.isNumerical(en_sentence[-1]):
                        #change one to a
                        if en_sentence[-1].lower() == 'one':
                            en_sentence[-1] = 'a'
                    continue
                if pickWord == 'baseline':
                    en_sentence.append(self.dict[word][0][0])
                else:
                    if len(en_sentence) > 0:
                        en_sentence.append(self.pick(self.dict[word], t, en_sentence[-1]))
                    else:
                        en_sentence.append(self.pick(self.dict[word], t, ''))
            else:
                en_sentence.append(word)
        return en_sentence

    def pick(self, dict, t, prev):
        candidates = []
        for w in dict:
            if w[1] == t:
                candidates.append(w[0])
        if len(candidates) == 0:
            for w in dict:
                candidates.append(w[0])
        if prev == '':
            return candidates[0]
        else:
            return max(candidates, key=lambda x:self.scored[(prev, x)] if (prev, x) in self.scored else 0)

    def parse(self, sentence):
        sent_str = ''
        for w in sentence:
            sent_str += w + ' '
        sent_str = sent_str.strip()
        tree = self.parser.parse(sent_str)
        return tree

    def orderOneOf(self, sentence):
        full_sentence = nltk.word_tokenize(' '.join(sentence))
        tags = nltk.pos_tag(full_sentence)
        new_sentence = []
        for i in range(len(full_sentence) - 1):
            if full_sentence[i] == 'one' and full_sentence[i + 1] == 'of':
                for j in reversed(range(i - 1)):
                    if 'VB' in tags[j][1] and tags[j][1] != 'VBD' and tags[j + 1][1] == 'RB':
                        new_sentence.insert(j + 2, 'of')
                        new_sentence.insert(j + 2, 'one')
                        break
                    elif tags[j][1] == 'IN' or ('VB' in tags[j][1] and tags[j][1] != 'VBD'):
                        new_sentence.insert(j + 1, 'of')
                        new_sentence.insert(j + 1, 'one')
                        break
                    elif tags[j][1] == 'DT':
                        new_sentence.insert(j, 'of')
                        new_sentence.insert(j, 'one')
                        break
            elif i < 2 or (full_sentence[i] != 'of' and full_sentence[i - 1] != 'one'):
                new_sentence.append(full_sentence[i])
        return new_sentence

    def pluralize(self, tree):
        if type(tree) is Tree:
            if tree.node in ['VB', 'VP'] and not type(tree[0]) is Tree:
                tree[0] = pattern.en.conjugate(tree[0], '3sg')
            #if tree.node == 'VBP':
            #   tree[0] = pattern.en.conjugate(tree[0], tense=PARTICIPLE, parse=True)
            if tree.node in ['NP','ADJP','UCP']:
                findCD = False
                for child in tree:
                    if child.node == 'CD' and not type(child[0]) is Tree\
                    and child[0].lower() not in ['1', 'a', 'an', 'one']:
                        findCD = True
                    if child.node == 'JJ' and not type(child[0]) is Tree\
                    and child[0].lower() in ['many', 'numerous', 'a lot']:
                        findCD = True
                    if child.node == 'QP':
                        findCD = True
                    if findCD and child.node == 'NN':
                        child[0] = pattern.en.pluralize(child[0])

            for child in tree:
                self.pluralize(child)

    def arrangeLocations(self, tree):
        if type(tree) is Tree:
            if tree.node == 'NAC':
                for i in range(0, len(tree)):
                    child = tree[i]
                    if i<len(tree)-1 and child.node == 'NNP' \
                    and not type(tree[i+1][0]) is Tree and \
                    tree[i+1][0].lower() in ['state', 'city']:
                        del tree[i+1]
                        del tree[i]
                        tree.insert(0, child)
                    if i >= len(tree)-1:
                        break
            for child in tree:
                self.arrangeLocations(child)

    def uncompleteSentence(self, sentence):
        full_sentence = nltk.word_tokenize(' '.join(sentence))
        tags = nltk.pos_tag(full_sentence)
        new_sentence = []
        for i in range(len(full_sentence) - 1):
            isVerb = True
            synsets = wordnet.synsets(tags[i][0])
            for syn in synsets:
                if 'verb.' not in syn.lexname:
                    isVerb = False
                    break
            if ('NN' == tags[i][1] or tags[i][1] == 'RB') and isVerb:
                new_sentence.append(conjugate(tags[i][0], 'part'))
            elif tags[i][1] == 'JJ' and isVerb:
                new_sentence.append(conjugate(tags[i][0], 'ppart'))
            else:
                new_sentence.append(tags[i][0])
        new_sentence.append(full_sentence[-1])
        return new_sentence

    def forwardDirectionWord(self, tree):
        if type(tree) is Tree:
            if tree.node == 'NP':
                for i in range(0, len(tree)):
                    child = tree[i]
                    if child.node in ['NNP', 'NNPS'] and \
                    not type(child[0]) is Tree and child[0].lower() \
                    in self.directions:
                        del tree[i]
                        child[0] = 'the '+child[0]+' of'
                        tree.insert(0, child)
                        return
            for child in tree:
                self.forwardDirectionWord(child)

    def suchAs(self,sentence):
        st=' '.join(sentence)
        reg=r'for example : ([\w\s,]+) etc\.'
        st=re.sub(reg, 'such as \g<1>, etc', st)
        return st.split(' ')


    def arrangeDate(self,sentence):
        year=r'[12]\d{3}'
        month=r'January|February|March|April|May|June|July|August|September|October|November|December' 
        day=r'\d{1,2}'
        for i in range(len(sentence)):
            yWord=mWord=dWord=''
            word=sentence[i]
            if re.match(year,word):
                yWord=sentence[i]
                if i+1<len(sentence):
                    nextWord=sentence[i+1]
                    if re.match(month, nextWord):
                        mWord=nextWord.capitalize()
                        if i+2<len(sentence):
                            nextNextWord=sentence[i+2]
                            if re.match(day, nextNextWord):
                                dWord=nextNextWord
            if yWord!='' and mWord!='' and dWord!='':
                sentence[i:i+3]=[mWord, dWord, yWord]
                i+=3
            elif yWord!='' and mWord!='':
                sentence[i:i+2]=[mWord, yWord]
                i+=2

        return sentence

    def superlative(self, tree):
        if type(tree) is Tree:
            for i in range(0, len(tree)):
                if i+1<len(tree):
                    if tree[i].node=='RBS' and tree[i+1].node=='JJ':
                        if type(tree[i][0]) is not Tree and \
                        type(tree[i+1][0]) is not Tree:
                            superWord = 'the ' + pattern.en.superlative(tree[i+1][0])
                            if 'most'==superWord:
                                del tree[i+1]
                            elif 'most' not in superWord:
                                tree[i+1][0]=superWord
                                del tree[i]
                            else:
                                tree[i+1][0]=superWord
                                del tree[i]
                        return
            for child in tree:
                self.superlative(child)

    def flatSentence(self, wl):
        result=''
        for i in range(len(wl)):
            if i==0:
                result=wl[0].capitalize()
            elif wl[i]==',' or wl[i]=='.':
                result+=wl[i]
            else:
                result+=' '+wl[i]
        return result+'.'


    def postProcess(self,sentence):
        strategies=[\
        (self.suchAs, False),\
        (self.arrangeDate, False),\
        (self.pluralize, True), \
        (self.arrangeLocations, True),\
        (self.superlative, True),\
        (self.orderOneOf, False), \
        (self.forwardDirectionWord, True), \
        (self.uncompleteSentence, False) \
        ]

        #Process flat sentence first
        for (func,isTree) in strategies:
            if not isTree:
                sentence=func(sentence)
            else:
                tree=self.parse(sentence)
                func(tree)
                sentence=tree.leaves()

        return self.flatSentence(sentence)
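A minimal standalone sketch of the (function, isTree) strategy-dispatch pattern used by postProcess above; the two functions here are toy stand-ins, and the tree branch (reparse, mutate, take leaves) is omitted for brevity:

def lowercase_words(words):
    return [w.lower() for w in words]

def drop_empty(words):
    return [w for w in words if w]

strategies = [(lowercase_words, False), (drop_empty, False)]
sentence = ['Hello', '', 'World']
for func, isTree in strategies:
    if not isTree:
        sentence = func(sentence)
print sentence   # ['hello', 'world']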
Ejemplo n.º 40
0
 def classify(self, sentence, debug = 0, varianceExponent = 0):
     #If the sentence hasn't been parsed, we must parse it.
     plaintext = False
     if type(sentence) != list:
         plaintext = True
         original = sentence
         try:
             sentence = self.parser.parse(sentence)
         except:
             try:
                 self.parser = Parser()
                 sentence = self.parser.parse(sentence)
             except:
                 print 'Couldn\'t create a parsing object.'
                  print 'Perhaps pystatparser is not loaded?'
                 print 'type \"from stat_parser import Parser\"'
                 print 'Otherwise, you must install it from Github as ' + \
                     'directed in the README'
         
     #Deal with new root types
     if sentence[0] not in self.importantRootProbabilities:
         self.importantRootProbabilities[sentence[0]] = self.biomialParamDist(self.binomHyperParams)
     if sentence[0] not in self.regularRootProbabilities:
         self.regularRootProbabilities[sentence[0]] = self.biomialParamDist(self.binomHyperParams)
     
     #Deal with new non-root tag types
     flat = self.recursiveFlatten(sentence)
     flat = filter(lambda x: type(x) == unicode, flat)
     for i,tag in enumerate(flat):
         if tag not in self.tags:
             #Set a priori beliefs for multiplicity parameters
             self.importantMultiplicityParameters[tag] = self.poissonParamDist(self.poissonHyperParams)
             self.regularMultiplicityParameters[tag] = self.poissonParamDist(self.poissonHyperParams)
             
             #Set a priori beliefs for conditional presence parameters being contained by anything else
             self.importantCondPresenceProbs.loc[tag] = [self.biomialParamDist(self.binomHyperParams) for x in self.importantCondPresenceProbs.columns]
             self.regularCondPresenceProbs.loc[tag] = [self.biomialParamDist(self.binomHyperParams) for x in self.regularCondPresenceProbs.columns]
             if type(flat[i+1])==unicode:
                 #Set a priori beliefs for conditional presence parameters containing other things
                 self.importantCondPresenceProbs[tag] = [self.biomialParamDist(self.binomHyperParams) for x in self.regularCondPresenceProbs.index]
                 self.regularCondPresenceProbs[tag] = [self.biomialParamDist(self.binomHyperParams) for x in self.regularCondPresenceProbs.index]
     
     ##Get P(x|y = Important) 
     PxGy1 = math.log(self.importantRootProbabilities[sentence[0]].getMean()) 
     PxGy1 = PxGy1 / self.importantRootProbabilities[sentence[0]].getVar()**varianceExponent
     PxGy1 += self.getConditionalLevelProbability([sentence,self.importantCondPresenceProbs,self.importantMultiplicityParameters,sentence[0],varianceExponent,debug>=2])
     
     ##Get P(x|y = REGULAR) 
     PxGy0 = math.log(self.regularRootProbabilities[sentence[0]].getMean())
     PxGy0 = PxGy0/self.regularRootProbabilities[sentence[0]].getVar()**varianceExponent
     PxGy0 += self.getConditionalLevelProbability([sentence,self.regularCondPresenceProbs,self.regularMultiplicityParameters,sentence[0],varianceExponent,debug>=2])
     
     #Get priors in a log form:
     Py1 = math.log(self.classPriors[1])
     Py0 = math.log(self.classPriors[0])
     
     #Get log Probabilities of each class through Bayes' Rule
     Py1Gx = PxGy1+Py1
     Py0Gx = PxGy0+Py0
     
     #Derive softmax shift parameter for very small probabilities.
     shift = 0
     if min([Py1Gx,Py0Gx]) < -20:
         shift = -1*min([Py1Gx,Py0Gx]) - 20
     
     #SoftMax probabilities
     try:
         denom = math.log(math.e**(shift + Py1Gx) + math.e**(shift + Py0Gx))
         
         sPy1Gx = shift + Py1Gx-denom
         sPy0Gx = shift + Py0Gx-denom
         
         #Turn back into probabilities for output
         sPy1Gx = math.e**sPy1Gx
         sPy0Gx = math.e**sPy0Gx
     except OverflowError:
         if debug > -1:
             print 'Overflow error'
             if Py1Gx >= Py0Gx:
                 print 'Assigning important sentence with probability one.'
             else:
                 print 'Assigning regular sentence with probability one.'
             print 'Before softmax, log probabilities were:'
             print 'P(important | sentence) = ' + str(Py1Gx)
             print 'P(unimportant | sentence) = ' + str(Py0Gx)
         if Py1Gx >= Py0Gx:
             sPy1Gx = 1.0
             sPy0Gx = 0.0
         else:
             sPy1Gx = 0.0
             sPy0Gx = 1.0
             
     
     if debug > -1:
         print 'Estimating Class for sentence:'
         if plaintext:
             print '\"' + original + '\"'
         else:
             print sentence
     if debug > 0:
         print ' ------------------------------------------------------------------'
         print 'Class Priors (log probability):'
         print 'P(important) = ' + str(Py1)
         print 'P(unimportant) = ' + str(Py0)
         print ' ------------------------------------------------------------------'
         print 'Conditional Sentence Log Probabilities:'
         print 'P(sentence | important) = ' + str(PxGy1)
         print 'P(sentence | unimportant) = ' + str(PxGy0)
         print ' ------------------------------------------------------------------'
         print 'Unnormalized Conditional Class Log Probabilities'
         print 'P(important | sentence) = ' + str(Py1Gx)
         print 'P(unimportant | sentence) = ' + str(Py0Gx)
     if debug > -1:
         print ' ------------------------------------------------------------------'
         print 'Softmaxed Conditional Class Probabilities'
         print 'P(important | sentence) = ' + str(sPy1Gx)
         print 'P(unimportant | sentence) = ' + str(sPy0Gx)
     return(sPy1Gx)
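A small standalone sketch of the shift-then-softmax step used near the end of classify above to avoid underflow when both log posteriors are very negative (the two log values are illustrative):

import math

logp_important = -45.0   # stands in for Py1Gx
logp_regular = -48.0     # stands in for Py0Gx

# Shift both values so the smaller one sits at -20 before exponentiating.
shift = 0
if min(logp_important, logp_regular) < -20:
    shift = -1 * min(logp_important, logp_regular) - 20

denom = math.log(math.e ** (shift + logp_important) + math.e ** (shift + logp_regular))
p_important = math.e ** (shift + logp_important - denom)
p_regular = math.e ** (shift + logp_regular - denom)
print p_important, p_regular   # roughly 0.95 and 0.05; they sum to 1.0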
Ejemplo n.º 41
0
File: te.py Project: ankit96/lehrer
from nltk.corpus import stopwords
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import pprint
from stat_parser import Parser
import questanalyser
import sys
parser = Parser()



f=open('questions.txt','r')
cont=f.readlines()

f=open('cprogramming.txt','r')
train_text=f.readlines()
for sample_text in cont :
	if len(sample_text)<3:
		break
	

	parsed_tree =  parser.parse(sample_text)
	
	print ""
	np = [" ".join(i.leaves()) for i in parsed_tree.subtrees() if i.label() == 'NP']
	
	np_mwe_nocomma = [j for j in [" ".join(i.leaves()) for i in parsed_tree.subtrees() if i.label() == 'NP'] if j.count(' ') > 0 and j.count(',') == 0]
	x = []
	for i in sorted(np_mwe_nocomma, key=len):
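A minimal standalone sketch of the same NP-subtree extraction on a hand-built nltk Tree (the bracketed parse string is illustrative; a recent NLTK with Tree.fromstring and label() is assumed):

from nltk.tree import Tree

t = Tree.fromstring("(S (NP (DT the) (NN cat)) (VP (VBD sat) (PP (IN on) (NP (DT the) (NN mat)))))")
nps = [" ".join(sub.leaves()) for sub in t.subtrees() if sub.label() == 'NP']
print nps   # ['the cat', 'the mat']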
Ejemplo n.º 42
0
 def train(self, trainingSentences, labels, alpha = 0.1, beta = 0.1, debug = 0):
     if debug > -1:
         print
         print '*********************************************************'
         print '                   SySE V 0.1 '
         print 'Beginning Training Sequence with ' + \
             str(len(trainingSentences)) + ' training sentences...'
         print '*********************************************************'
         if debug > 0:
             print
             print 'Initializing... '
     
     if type(trainingSentences[0]) != list:
         print 'These sentences do not appear to have been parsed.'
         print 'They will be parsed now.'
          if len(trainingSentences) > 10:
             print 'Given their volume, this will take some time.'
         try:
             self.parser = Parser()
         except:
             print 'This environment should have pystatparser loaded ' + \
             'in order to train on unparsed sentences.'
             print 'Exiting...'
             return
          trainingSentences = [self.parser.parse(x) for x in trainingSentences]
     
     ####Initialization
     #Save hyperparameters
     self.alpha = alpha
     self.beta = beta
     
     #See what tags are in the training data.
     tags = []
     for sentence in trainingSentences:
         flat = self.recursiveFlatten(sentence)
         for el in flat:
             if type(el) == unicode and el not in tags:
                 tags.append(el)            
         
     self.tags = set(tags)
     
     #What kind of root tags are there?
     self.sentenceTypes = set([x[0] for x in trainingSentences])
     
     #Which tags may contain other tags?
     self.phraseTags = []
     for sentence in trainingSentences:
         flat = self.recursiveFlatten(sentence)
         for i in range(0,len(flat)):
             try:
                 if type(flat[i]) == unicode and type(flat[i+1]) == unicode and flat[i] not in self.phraseTags:
                     self.phraseTags.append(flat[i])
             except IndexError:
                 print 'We\'ve reached the end of this sentence'
                 
     self.phraseTags = set(self.phraseTags) - self.sentenceTypes
    
     #Robustness
     labels = list(labels)    
     
     #Split training sentences into Important (I) and Regular (R) (Unimportant)
     importantSentences = filter(lambda x: labels[trainingSentences.index(x)]==1, trainingSentences)
     regularSentences = filter(lambda x: not labels[trainingSentences.index(x)]==1, trainingSentences)
 
     self.classPriors = []
     
     ###Test inputs
     #Make sure labels are right length for sentences.
     if len(labels) != len(trainingSentences):
          print 'Labels and trainingSentences must be the same length!'
         return
     #Make sure labels are valid
     for label in labels:
         if label != 0 and label != 1:
              print 'Labels should be either 0 or 1.'
             print 'exiting...'
             return
     
     #Split training sentences into Important (I) and Regular (R) (Unimportant)
     self.importantRootProbabilities = filter(lambda x: labels[trainingSentences.index(x)]==1, trainingSentences)
     self.regularRootProbabilities = filter(lambda x: not labels[trainingSentences.index(x)]==1, trainingSentences)
                         
     ###Train Class Priors
     self.classPriors.append(float(labels.count(0))/float(len(labels)))
     self.classPriors.append(float(labels.count(1))/float(len(labels)))
     
     if debug > 0:
         print '*********************************************************'
         print 'These are the class priors'
         print '*********************************************************'    
         print self.classPriors
         print
         print      
     
     ###Train Sentence Type
     self.importantRootProbabilities = dict(zip(list(self.sentenceTypes),np.zeros(len(list(self.sentenceTypes)))))
     self.regularRootProbabilities = dict(zip(list(self.sentenceTypes),np.zeros(len(list(self.sentenceTypes)))))
     
     #Get the count of each sentence type in I
     for sentence in importantSentences:
         #Make sure we get what we expect
         if type(sentence[0]) != unicode:
             print "We are looking for a non-unicode sentence type. exiting..."
             break
             return            
         #if it isn't in the list yet, add it.
         self.importantRootProbabilities[sentence[0]] += 1
         
      #Normalize the counts into probabilities with additive smoothing
     for param in self.importantRootProbabilities:
         self.importantRootProbabilities[param]=(float(self.importantRootProbabilities[param]) + alpha)/ \
             (float(len(trainingSentences)) + alpha*(len(self.importantRootProbabilities)+1))
             
     #Get the count of each sentence type in R
     for sentence in regularSentences:
         #Make sure we get what we expect
         if type(sentence[0]) != unicode:
             print "We are looking for a non-unicode sentence type. exiting..."
             break
             return            
         #if it isn't in the list yet, add it.
         self.regularRootProbabilities[sentence[0]] += 1
     
      #Normalize the counts into probabilities
     for param in self.regularRootProbabilities:
         self.regularRootProbabilities[param]=float(self.regularRootProbabilities[param])/ \
             float(len(trainingSentences))
     
     if debug > 0:
         print '*********************************************************'
         print 'These are the sentence type parameters'
         print '*********************************************************'
         
         print ' --------------------------------------------------------'
         print ' For Important Sentences:'
         print self.importantRootProbabilities
         
         print ' --------------------------------------------------------'
         print ' For Regular Sentences:'
         print self.regularRootProbabilities
         print
         print
     
     ###Train Phrases
     
     ##Primitive Inference on Multiplicity Parameter
     #Define dictionaries to store times a tag was included in a phrase
     tagInclusionI = dict(zip(list(self.tags), np.zeros(len(list(self.tags)))))#How many times is a tag in a level?
     tagInclusionR = dict(zip(list(self.tags), np.zeros(len(list(self.tags)))))#How many times is a tag in a level?
     #Define dictionaries to store times a tag was used at all.
     tagCountI = dict(zip(list(self.tags), np.zeros(len(list(self.tags)))))#How many total times does the tag appear?
     tagCountR = dict(zip(list(self.tags), np.zeros(len(list(self.tags)))))#How many total times does the tag appear?
     #To store dumb poisson inference
     self.importantMultiplictyParameter = dict(zip(list(self.tags), np.zeros(len(list(self.tags)))))#For storing parameter estimates.
     self.regularMultiplictyParameter = dict(zip(list(self.tags), np.zeros(len(list(self.tags)))))#For storing parameter estimates.
     
     #Get Inclusion for I
     for sentence in importantSentences:
         self.getInclusions([sentence,tagInclusionI,debug>=2])
     
     #Get Inclusion for R
     for sentence in regularSentences:
         self.getInclusions([sentence,tagInclusionR,debug>=2])
     
     #Get Counts for I
     for sentence in importantSentences:
         flat = self.recursiveFlatten(sentence)
         currentTags = filter(lambda x: type(x)==unicode, flat)
         for tag in currentTags[1:]:
             tagCountI[tag] += 1
     
     #Get Counts for R
     for sentence in regularSentences:
         flat = self.recursiveFlatten(sentence)
         currentTags = filter(lambda x: type(x)==unicode, flat)
         for tag in currentTags[1:]:
             tagCountR[tag] += 1
     
     #Estimate Parameters for I
     for tag in tagInclusionI.keys():
         if (tagCountI[tag] > 1):
             self.importantMultiplictyParameter[tag] = (tagCountI[tag]-1) / tagInclusionI[tag]
             
     #Estimate Parameters for R
     for tag in tagInclusionR.keys():
         if (tagCountR[tag] > 1):
             self.regularMultiplictyParameter[tag] = (tagCountR[tag]-1) / tagInclusionR[tag]
     
         if debug > 0:
             print '*********************************************************'
             print ' Estimation for Multiplicity Parameters '
             print '*********************************************************'
             print
             print ' ------------------------------------------------------------------'
             print 'Tag Counts for Important Sentences:'
             print tagCountI    
             print 'Tag Counts for Regular Sentences:'
             print tagCountR
             print ' ------------------------------------------------------------------'
             print 'Tag Inclusion for Important Sentences:'
             print tagInclusionI
             print 'Tag Inclusion for Regular Sentences:'
             print tagInclusionR
             print ' ------------------------------------------------------------------'
              print 'Dumb Parameter Estimates for Important Sentences:'
             print self.importantMultiplictyParameter
             print 'Dumb Parameter Estimates for Regular Sentences:'
             print self.regularMultiplictyParameter    
             print
             print
     
     ##Primitive Inference on Presence Parameters
     
     #We need to find inclusions given parent
     #To store conditional presence probabilities, what can almost be \
         #thought of as transition probabilities.
      #This is the uninformed probability of a particular presence.
     ui = self.alpha / (self.alpha*(len(self.regularRootProbabilities) + 1))
     #For important phrases
     self.importantCondPresenceProbs = np.zeros([len(self.tags),len(self.phraseTags) + len(self.sentenceTypes)])
     self.importantCondPresenceProbs = pd.DataFrame(self.importantCondPresenceProbs).applymap(lambda x: x + ui)
     self.importantCondPresenceProbs.columns = list(self.sentenceTypes) + list(self.phraseTags)
     self.importantCondPresenceProbs.index = list(self.tags)
     
     #For regularPhrases
     self.regularCondPresenceProbs = np.zeros([len(self.tags),len(self.phraseTags) + len(self.sentenceTypes)])
     self.regularCondPresenceProbs = pd.DataFrame(self.regularCondPresenceProbs).applymap(lambda x: x + ui)
     self.regularCondPresenceProbs.columns = list(self.sentenceTypes) + list(self.phraseTags)
     self.regularCondPresenceProbs.index = list(self.tags)
     
     #Define dictionaries to store times a tag was used at all. This time, \
         #We care about root/sentence tags as well.
     tagCountI = dict(zip(list(self.tags), np.zeros(len(list(self.tags)))))#How many total times does the tag appear?
     tagCountR = dict(zip(list(self.tags), np.zeros(len(list(self.tags)))))#How many total times does the tag appear?
     
     #Tag counts, but on sentences as well, unlike above.
     
     
     #Count Conditional Inclusions for Important Sentences
     for sentence in importantSentences:
         self.getInclusionsGivenParent([sentence,self.importantCondPresenceProbs,sentence[0],debug>=2])
     
     #Count Conditional Inclusions for Regular Sentences
     for sentence in regularSentences:
         self.getInclusionsGivenParent([sentence,self.regularCondPresenceProbs,sentence[0],debug>=2])
     
     #Get Counts for I
     for sentence in importantSentences:
         flat = self.recursiveFlatten(sentence)
         currentTags = filter(lambda x: type(x)==unicode, flat)
         for tag in currentTags:
             tagCountI[tag] += 1
     
     #Get Counts for R
     for sentence in regularSentences:
         flat = self.recursiveFlatten(sentence)
         currentTags = filter(lambda x: type(x)==unicode, flat)
         for tag in currentTags:
             tagCountR[tag] += 1
     
     #Calculate Conditional Presence Parameter for Important Sentences
     for column in self.importantCondPresenceProbs.columns:
         if tagCountI[column] > 0:
             num = self.importantCondPresenceProbs.loc[:,column] + alpha
             denom = tagCountI[column] + (len(self.importantCondPresenceProbs.columns) + 1)*alpha
             self.importantCondPresenceProbs.loc[:,column] = num/denom
         
     #Calculate Conditional Presence Parameter for Regular Sentences
     for column in self.regularCondPresenceProbs.columns:
         if tagCountR[column] > 0:
             #AdditiveSmoothing
             num = self.regularCondPresenceProbs.loc[:,column] + alpha
             denom = tagCountR[column] + (len(self.regularCondPresenceProbs.columns) + 1)*alpha
             self.regularCondPresenceProbs.loc[:,column] = num/denom
     
     if debug > 1:
         print '*********************************************************'
         print 'Presence Parameter Estimation'
         print '*********************************************************'
         print
         print ' ------------------------------------------------------------------'
         print ' Conditional Parameters for Important Sentences'
         print self.importantCondPresenceProbs
         print
         print ' ------------------------------------------------------------------'
         print ' Conditional Parameters for Regular Sentences'
         print self.regularCondPresenceProbs
         print ' ------------------------------------------------------------------'    
     
     if debug > -1:
         print
         print
         print '...Finished'
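A minimal standalone sketch of the additive (Laplace-style) smoothing used above to turn root-type counts into probabilities; alpha and the counts are illustrative values, not taken from real training data:

alpha = 0.1
counts = {u'S': 7, u'SBARQ': 2, u'SINV': 1}
total = sum(counts.values())
k = len(counts)
probs = dict((tag, (c + alpha) / (total + alpha * (k + 1))) for tag, c in counts.items())
print probs
print sum(probs.values())   # slightly below 1.0; the leftover mass covers unseen root types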
Ejemplo n.º 43
0
from nltk.corpus import stopwords
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import pprint
from stat_parser import Parser
parser = Parser()

EXAMPLE_TEXT = "What is a pointer on pointer?"
stop_words = set(stopwords.words('english'))
word_tokens = word_tokenize(EXAMPLE_TEXT)

filtered_sentence = [w for w in word_tokens if not w in stop_words]
"""
print(word_tokens)
print(filtered_sentence)
"""
train_text = """Hello girls and guys, welcome to an in-depth and practical machine learning course.
The objective of this course is to give you a wholistic understanding of machine learning, covering
theory, application, and inner workings of supervised, unsupervised, and deep learning algorithms.
In this series, we'll be covering linear regression, K Nearest Neighbors, Support Vector Machines
(SVM), flat clustering, hierarchical clustering, and neural networks.
For each major algorithm that we cover, we will discuss the high level intuitions of the algorithms
and how they are logically meant to work. Next, we'll apply the algorithms in code using real world
data sets along with a module, such as with Scikit-Learn. Finally, we'll be diving into the inner
workings of each of the algorithms by recreating them in code, from scratch, ourselves, including
all of the math involved. This should give you a complete understanding of exactly how the
algorithms work, how they can be tweaked, what advantages are, and what their disadvantages are.
In order to follow along with the series, I suggest you have at the very least a basic understanding of
Python. If you do not, I suggest you at least follow the Python 3 Basics tutorial until the module
Ejemplo n.º 44
0
class SySE:
    def __init__(self):
        try:
            send = resource_filename(__name__, 'default.dat')
            self.loadParameters(send)
        except:
            print "Could not load default parameters."
            print "You should either train this object using the \"train\" " +\
                "method, or load parameters with the \"loadParameters\" method"
    ####Supervised Training.
    #trainingSentences: sentences on which to train (May already be parsed)
    #labels: corresponding binary (1,0) labels.
    #HyperParams for the conjugate Beta and Gamma priors respectively.
    #The Gamma distribution is parameterized such that the term with the beta \
    #parameters looks like this: e^(-x*beta) for random variable x.
    def train(self, trainingSentences, labels, binomHyperParams = [0.5,0.5], poissonHyperParams = [0.0001,0.005], debug = 0):
        if debug > -1:
            print
            print '**********************************************************'
            print '                   SySE V 1.1.2 '
            print 'Beginning Training Sequence with ' + \
                str(len(trainingSentences)) + ' training sentences...'
            print '**********************************************************'
            if debug > 0:
                print
                print 'Initializing... '
        
        if type(trainingSentences[0]) != list:
            print 'These sentences do not appear to have been parsed.'
            print 'They will be parsed now.'
            if len(trainingSentences) > 10:
                print 'Given their volume, this will take some time.'
            try:
                self.parser = Parser()
                trainingSentences = [self.parser.parse(x) for x in trainingSentences]
            except:
                print 'This environment should have pystatparser installed ' +\
                'in order to train on unparsed sentences.'
                print 'Parameters could not be fit'
                print 'Exiting...'
                return
            
        
        ####Initialization
        #Save hyperparameters
        self.binomHyperParams = binomHyperParams
        self.poissonHyperParams = poissonHyperParams
        
        #See what tags are in the training data.
        tags = []
        for sentence in trainingSentences:
            flat = self.recursiveFlatten(sentence)
            for el in flat:
                if type(el) == unicode and el not in tags:
                    tags.append(el)            
            
        self.tags = set(tags)
        
        #What kind of root tags are there?
        self.sentenceTypes = set([x[0] for x in trainingSentences])
        
        #Which tags may contain other tags?
        self.phraseTags = []
        for sentence in trainingSentences:
            flat = self.recursiveFlatten(sentence)
            for i in range(0,len(flat)):
                try:
                    if type(flat[i]) == unicode and type(flat[i+1]) == unicode and flat[i] not in self.phraseTags:
                        self.phraseTags.append(flat[i])
                except IndexError:
                    print 'We\'ve reached the end of this sentence'
                    
        self.phraseTags = set(self.phraseTags) - self.sentenceTypes
       
        #Robustness
        labels = list(labels)    
        
        #Split training sentences into Important (I) and Regular (R) (Unimportant)
        importantSentences = filter(lambda x: labels[trainingSentences.index(x)]==1, trainingSentences)
        regularSentences = filter(lambda x: not labels[trainingSentences.index(x)]==1, trainingSentences)
    
        self.classPriors = []
        
        ###Test inputs
        #Make sure labels are right length for sentences.
        if len(labels) != len(trainingSentences):
            print 'Labels and trainingSentences must be the same length!'
            return
        #Make sure labels are valid
        for label in labels:
            if label != 0 and label != 1:
                print 'Labels should be either 0 or 1.'
                print 'exiting...'
                return
                
        ###Train Class Priors
        self.classPriors.append(float(labels.count(0))/float(len(labels)))
        self.classPriors.append(float(labels.count(1))/float(len(labels)))
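        #e.g. labels = [0, 0, 1, 1] -> classPriors = [0.5, 0.5]
        #(index 0 holds P(y = 0), index 1 holds P(y = 1))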
        
        if debug > 0:
            print '*********************************************************'
            print 'These are the class priors'
            print '*********************************************************'    
            print self.classPriors
            print
            print      
        
        ###Train Sentence Type
        self.importantRootProbabilities = dict(zip(list(self.sentenceTypes),[binomialParamDist(self.binomHyperParams) for x in range(0,len(list(self.sentenceTypes)))]))
        self.regularRootProbabilities = dict(zip(list(self.sentenceTypes),[binomialParamDist(self.binomHyperParams) for x in range(0,len(list(self.sentenceTypes)))]))
        
        #Get the count of each sentence type in I
        for sentence in importantSentences:
            #Make sure we get what we expect
            if type(sentence[0]) != unicode:
                print "We are looking for a non-unicode sentence type. exiting..."
                break
                return            
            #if it isn't in the list yet, add it.
            self.importantRootProbabilities[sentence[0]].update(1)
            for sentence1 in importantSentences:
                if sentence1 != sentence:
                    self.importantRootProbabilities[sentence1[0]].update(False)
        
        #Get the count of each sentence type in R
        for sentence in regularSentences:
            #Make sure we get what we expect
            if type(sentence[0]) != unicode:
                print "We are looking for a non-unicode sentence type. exiting..."
                break
                return            
            #if it isn't in the list yet, add it.
            self.regularRootProbabilities[sentence[0]].update(1)
            for sentence1 in regularSentences:
                if sentence1 != sentence:
                    self.regularRootProbabilities[sentence1[0]].update(False)
        
        if debug > 0:
            print '*********************************************************'
            print 'These are the sentence type parameters'
            print '*********************************************************'
            
            print ' --------------------------------------------------------'
            print ' For Important Sentences:'
            print self.importantRootProbabilities
            
            print ' --------------------------------------------------------'
            print ' For Regular Sentences:'
            print self.regularRootProbabilities
            print
            print
        
        ###Train Phrases
        ##Primitive Inference on Multiplicity Parameter
        #To store poisson beliefs
        self.importantMultiplicityParameters = dict(zip(list(self.tags), [poissonParamDist(self.poissonHyperParams) for x in range(0,len(list(self.tags)))]))#For storing parameter estimates.
        self.regularMultiplicityParameters = dict(zip(list(self.tags), [poissonParamDist(self.poissonHyperParams) for x in range(0,len(list(self.tags)))]))#For storing parameter estimates.
        
        #Get Inclusion for I
        for sentence in importantSentences:
            self.getInclusions([sentence,self.importantMultiplicityParameters,debug>=2])
        
        #Get Inclusion for R
        for sentence in regularSentences:
            self.getInclusions([sentence,self.regularMultiplicityParameters,debug>=2])
        
        #Get Counts for I
        for sentence in importantSentences:
            flat = self.recursiveFlatten(sentence)
            currentTags = filter(lambda x: type(x)==unicode, flat)
            for tag in currentTags[1:]:
                self.importantMultiplicityParameters[tag].updateCount(1)
        
        #Get Counts for R
        for sentence in regularSentences:
            flat = self.recursiveFlatten(sentence)
            currentTags = filter(lambda x: type(x)==unicode, flat)
            for tag in currentTags[1:]:
                self.regularMultiplicityParameters[tag].updateCount(1)
        
        ####TODO: go over this section again
        #Estimate Parameters for I
        for tag in self.importantMultiplicityParameters.keys():
            if (self.importantMultiplicityParameters[tag].alpha > 1):
                self.importantMultiplicityParameters[tag].updateCount(-1)
                
        #Estimate Parameters for R
        for tag in self.regularMultiplicityParameters.keys():
            if (self.regularMultiplicityParameters[tag].alpha > 1):
                self.regularMultiplicityParameters[tag].updateCount(-1)
        
        if debug > 0:
            print '*********************************************************'
            print ' Estimation for Multiplicity Parameters '
            print '*********************************************************'
            print
            print 'Dumb Parameter Estimates for Important Sentences:'
            print self.importantMultiplicityParameters
            print 'Dumb Parameter Estimates for Regular Sentences:'
            print self.regularMultiplicityParameters
            print
            print
        
        ##Primitive Inference on Presence Parameters
        
        #We need to find inclusions given parent
        #To store conditional presence probabilities, what can almost be \
            #thought of as transition probabilities.
        #For important phrases
        self.importantCondPresenceProbs = np.zeros([len(self.tags),len(self.phraseTags) + len(self.sentenceTypes)])
        self.importantCondPresenceProbs = pd.DataFrame(self.importantCondPresenceProbs).applymap(lambda x: binomialParamDist(self.binomHyperParams))
        self.importantCondPresenceProbs.columns = list(self.sentenceTypes) + list(self.phraseTags)
        self.importantCondPresenceProbs.index = list(self.tags)
        
        #For regularPhrases
        self.regularCondPresenceProbs = np.zeros([len(self.tags),len(self.phraseTags) + len(self.sentenceTypes)])
        self.regularCondPresenceProbs = pd.DataFrame(self.regularCondPresenceProbs).applymap(lambda x: binomialParamDist(self.binomHyperParams))
        self.regularCondPresenceProbs.columns = list(self.sentenceTypes) + list(self.phraseTags)
        self.regularCondPresenceProbs.index = list(self.tags)
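        #Layout note: each cell holds a binomialParamDist belief (presumably a
        #Beta posterior) about P(row tag present at a level | the level's parent
        #is the column tag); rows cover every tag seen in training, columns are
        #the tags that can contain other tags (sentence types plus phrase tags).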
        
        #Count Conditional Inclusions for Important Sentences
        for sentence in importantSentences:
            self.getInclusionsGivenParent([sentence,self.importantCondPresenceProbs,sentence[0],debug>=2])
        
        #Count Conditional Inclusions for Regular Sentences
        for sentence in regularSentences:
            self.getInclusionsGivenParent([sentence,self.regularCondPresenceProbs,sentence[0],debug>=2])
        
        if debug > 1:
            print '*********************************************************'
            print 'Presence Parameter Estimation'
            print '*********************************************************'
            print
            print ' ------------------------------------------------------------------'
            print ' Conditional Parameters for Important Sentences'
            print self.importantCondPresenceProbs
            print
            print ' ------------------------------------------------------------------'
            print ' Conditional Parameters for Regular Sentences'
            print self.regularCondPresenceProbs
            print ' ------------------------------------------------------------------'    
        
        if debug > -1:
            print
            print
            print '...Finished'
        
        ####Classification
    def classify(self, sentence, debug = 0, varianceExponent = 0):
        #If the sentence hasn't been parsed, we must parse it.
        plaintext = False
        if type(sentence) != list:
            plaintext = True
            original = sentence
            try:
                sentence = self.parser.parse(sentence)
            except:
                try:
                    self.parser = Parser()
                    sentence = self.parser.parse(sentence)
                except:
                    print 'Couldn\'t create a parsing object.'
                    print 'Perhaps pystatparser is not loaded?'
                    print 'type \"from stat_parser import Parser\"'
                    print 'Otherwise, you must install it from Github as ' + \
                        'directed in the README'
            
        #Deal with new root types
        if sentence[0] not in self.importantRootProbabilities:
            self.importantRootProbabilities[sentence[0]] = self.binomialParamDist(self.binomHyperParams)
        if sentence[0] not in self.regularRootProbabilities:
            self.regularRootProbabilities[sentence[0]] = self.binomialParamDist(self.binomHyperParams)
        
        #Deal with new non-root tag types
        flat = self.recursiveFlatten(sentence)
        flat = filter(lambda x: type(x) == unicode, flat)
        for i,tag in enumerate(flat):
            if tag not in self.tags:
                #Set a priori beliefs for multiplicity parameters
                self.importantMultiplicityParameters[tag] = self.poissonParamDist(self.poissonHyperParams)
                self.regularMultiplicityParameters[tag] = self.poissonParamDist(self.poissonHyperParams)
                
                #Set a priori beliefs for conditional presence parameters being contained by anything else
                self.importantCondPresenceProbs.loc[tag] = [self.binomialParamDist(self.binomHyperParams) for x in self.importantCondPresenceProbs.columns]
                self.regularCondPresenceProbs.loc[tag] = [self.binomialParamDist(self.binomHyperParams) for x in self.regularCondPresenceProbs.columns]
                if i + 1 < len(flat) and type(flat[i+1]) == unicode:
                    #Set a priori beliefs for conditional presence parameters containing other things
                    self.importantCondPresenceProbs[tag] = [self.binomialParamDist(self.binomHyperParams) for x in self.regularCondPresenceProbs.index]
                    self.regularCondPresenceProbs[tag] = [self.binomialParamDist(self.binomHyperParams) for x in self.regularCondPresenceProbs.index]
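        #Note: the classification below is naive Bayes in log space:
        #log P(y | x) ~ log P(y) + log P(root | y) + sum of log-Poisson terms
        #for tag multiplicities + sum of log-Bernoulli terms for tag presence
        #given the parent tag; each term is divided by the parameter's posterior
        #variance raised to varianceExponent (a no-op when the exponent is 0).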
        
        ##Get P(x|y = Important) 
        PxGy1 = math.log(self.importantRootProbabilities[sentence[0]].getMean()) 
        PxGy1 = PxGy1 / self.importantRootProbabilities[sentence[0]].getVar()**varianceExponent
        PxGy1 += self.getConditionalLevelProbability([sentence,self.importantCondPresenceProbs,self.importantMultiplicityParameters,sentence[0],varianceExponent,debug>=2])
        
        ##Get P(x|y = REGULAR) 
        PxGy0 = math.log(self.regularRootProbabilities[sentence[0]].getMean())
        PxGy0 = PxGy0/self.regularRootProbabilities[sentence[0]].getVar()**varianceExponent
        PxGy0 += self.getConditionalLevelProbability([sentence,self.regularCondPresenceProbs,self.regularMultiplicityParameters,sentence[0],varianceExponent,debug>=2])
        
        #Get priors in a log form:
        Py1 = math.log(self.classPriors[1])
        Py0 = math.log(self.classPriors[0])
        
        #Get log Probabilities of each class through Bayes' Rule
        Py1Gx = PxGy1+Py1
        Py0Gx = PxGy0+Py0
        
        #Derive softmax shift parameter for very small probabilities.
        shift = 0
        if min([Py1Gx,Py0Gx]) < -20:
            shift = -1*min([Py1Gx,Py0Gx]) - 20
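        #Note: this shift plus the common denominator below is the standard
        #log-sum-exp trick; the shift cancels in (shift + Py1Gx) - denom, so it
        #only guards against underflow and leaves the probabilities unchanged.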
        
        #SoftMax probabilities
        try:
            denom = math.log(math.e**(shift + Py1Gx) + math.e**(shift + Py0Gx))
            
            sPy1Gx = shift + Py1Gx-denom
            sPy0Gx = shift + Py0Gx-denom
            
            #Turn back into probabilities for output
            sPy1Gx = math.e**sPy1Gx
            sPy0Gx = math.e**sPy0Gx
        except OverflowError:
            if debug > -1:
                print 'Overflow error'
                if Py1Gx >= Py0Gx:
                    print 'Assigning important sentence with probability one.'
                else:
                    print 'Assigning regular sentence with probability one.'
                print 'Before softmax, log probabilities were:'
                print 'P(important | sentence) = ' + str(Py1Gx)
                print 'P(unimportant | sentence) = ' + str(Py0Gx)
            if Py1Gx >= Py0Gx:
                sPy1Gx = 1.0
                sPy0Gx = 0.0
            else:
                sPy1Gx = 0.0
                sPy0Gx = 1.0
                
        
        if debug > -1:
            print 'Estimating Class for sentence:'
            if plaintext:
                print '\"' + original + '\"'
            else:
                print sentence
        if debug > 0:
            print ' ------------------------------------------------------------------'
            print 'Class Priors (log probability):'
            print 'P(important) = ' + str(Py1)
            print 'P(unimportant) = ' + str(Py0)
            print ' ------------------------------------------------------------------'
            print 'Conditional Sentence Log Probabilities:'
            print 'P(sentence | important) = ' + str(PxGy1)
            print 'P(sentence | unimportant) = ' + str(PxGy0)
            print ' ------------------------------------------------------------------'
            print 'Unnormalized Conditional Class Log Probabilities'
            print 'P(important | sentence) = ' + str(Py1Gx)
            print 'P(unimportant | sentence) = ' + str(Py0Gx)
        if debug > -1:
            print ' ------------------------------------------------------------------'
            print 'Softmaxed Conditional Class Probabilities'
            print 'P(important | sentence) = ' + str(sPy1Gx)
            print 'P(unimportant | sentence) = ' + str(sPy0Gx)
        return(sPy1Gx)
    
    def summarize(self, article, verbosity = 0.5, debug = 0):
        sentences = self.split_into_sentences(article)
        
        keepers = []        
        i = 0
        for sentence in sentences:
            i += 1
            try:
                if self.classify(sentence, debug = debug) > verbosity:
                    keepers.append(sentence)
            except:
                print 'Error classifying sentence ' + str(i)
                print 'FullText: ' 
                print sentence
        if len(keepers) == 0:
            print 'No sentences found important'
            return('')
        reduced = reduce(lambda x,y: x + ' ' + y, keepers)
        return(reduced)
    
    ####Function Definitions
    #Returns the log probability of a level occurring, recursing to score the
    #levels contained therein. May be passed an entire sentence.
    def getConditionalLevelProbability(self, inputs):
        level = inputs[0]
        tagDF = inputs[1]
        mult = inputs[2]
        parent = inputs[3]
        varExp = inputs[4]
        debug = inputs[5]
        ret = 0
        if debug == 1:
            print 'Beginning Level...........'
            print level
        inTags = [x[0] for x in level[1:]]
        if u'' in inTags:
            inTags.remove(u'')
        
        #Do some recursion
        for i,tag in enumerate(inTags):
            if tag in self.phraseTags or tag in self.sentenceTypes:
                if debug == 1:
                    print 'beginning recursion due to:'
                    print tag
                ret = ret + self.getConditionalLevelProbability([level[i+1],tagDF,mult,tag,varExp,debug])
        
        #Do multiplicity for this level
        for tag in inTags:
            x = inTags.count(tag)
            mu = mult[tag].getMean()
            ret = ret + math.log((math.exp(-mu) * mu**x / math.factorial(x)))/mult[tag].getVar()**varExp
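            #(log of the Poisson pmf e^(-mu) * mu^x / x!, evaluated at the
            # posterior mean rate mu, for x occurrences of this tag)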
        
        #Do presence for this level
        inTags = list(set(inTags))
        for tag in inTags:
            if type(tag) != unicode:#Some sentences contain only a word, and we won't need to add anything in that case.
                print 'Breaking Due to non-unicode tag in getConditionalLevelProbability!'
                print tag
                break
            if debug == 1:
                print 'Probability of ' + tag + ' given ' + parent + ' is ' + str(tagDF.loc[tag,parent])
            ret = ret + math.log(tagDF.loc[tag,parent].getMean())/tagDF.loc[tag,parent].getVar()**varExp
        return(ret)
        
    #To get the inclusion
    def getInclusionsGivenParent(self, inputs):
        level = inputs[0]
        tagDF = inputs[1]
        parent = inputs[2]
        debug = inputs[3]
        if debug == 1:
            print 'Beginning Level...........'
            print level
        inTags = [x[0] for x in level[1:]]
        if u'' in inTags:
            inTags.remove(u'')
            
        #Do some recursion
        for i,tag in enumerate(inTags):
            if tag in self.phraseTags or tag in self.sentenceTypes:
                if debug == 1:
                    print 'beginning recursion due to:'
                    print tag
                self.getInclusionsGivenParent([level[i+1],tagDF,tag,debug])
        
        #Update tags on this level
        inTags = list(set(inTags))
        for tag in inTags:
            if type(tag) != unicode:#Some sentences contain only a word, and we won't need to add anything in that case.
                break
            if debug == 1:
                print 'incrementing: ' + tag + ' when conditioned on ' + parent
            tagDF.loc[tag,parent].update(True)
        #Update tags not on this level
        for tag in self.tags:
            if tag not in inTags:
                tagDF.loc[tag,parent].update(False)
            
            
    #To get the inclusions in a level recursively.
    def getInclusions(self,inputs):
        level = inputs[0]
        tagDict = inputs[1]
        debug = inputs[2]
        if debug == 1:
            print 'Beginning Level...........'
            print level
        inTags = [x[0] for x in level[1:]]
        if u'' in inTags:
            inTags.remove(u'')
            
        #Do some recursion
        for i,tag in enumerate(inTags):
            if tag in self.phraseTags or tag in self.sentenceTypes:
                if debug == 1:
                    print 'beginning recursion due to:'
                    print tag
                self.getInclusions([level[i+1],tagDict,debug])
        
        #Add count for this level
        inTags = list(set(inTags))
        for tag in inTags:
            if type(tag) != unicode:#Some sentences contain only a word, and we won't need to add anything in that case.
                break
            if debug == 1:
                print 'incrementing: ' + tag
            tagDict[tag].incrementTrials()
    
    #To find all PoS tags (pystatparser's documentation is literally non-existent)
    def getTagsRecursively(self, ss, knownTags = [], debug = 0):
        ret = knownTags
        for sentence in ss:
            for phrase in sentence:
                for element in phrase:
                    if type(element) == unicode:
                        if element not in ret:
                            ret.append(element)        
                    if type(element) == list:
                        ret.extend(self.getTagsRecursively(element))
        return(ret)
        
    #Flatten an n-dimensional list into a 1D list
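    #e.g. recursiveFlatten([u'S', [u'NP', [u'DT', u'the'], [u'NN', u'dog']]])
    #     -> [u'S', u'NP', u'DT', u'the', u'NN', u'dog']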
    def recursiveFlatten(self, myList):
        ret = []
        for element in myList:
            if type(element) == list:
                element = self.recursiveFlatten(element)
            if type(element) == str or type(element) == unicode:
                ret.append(element)
            else:
                ret.extend(list(element))
        return(ret)
    
    #From http://stackoverflow.com/questions/4576077/python-split-text-on-sentences
    def split_into_sentences(self, text):
        if type(text) == unicode:
            text = unicode(text.encode('utf-8'), errors = 'ignore')
            text = unicodedata.normalize('NFKD',text).encode('ascii','ignore')
        caps = "([A-Z])"
        prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
        suffixes = "(Inc|Ltd|Jr|Sr|Co)"
        starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
        acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
        websites = "[.](com|net|org|io|gov)"
        text = " " + text + "  "
        text = text.replace("\n"," ")
        text = re.sub(prefixes,"\\1<prd>",text)
        text = re.sub(websites,"<prd>\\1",text)
        if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
        if 'a.m.' in text: text = text.replace('a.m.','a<prd>m<prd>')
        if 'p.m.' in text: text = text.replace('p.m.','p<prd>m<prd>')
        if '...' in text: text = text.replace('...','<prd><prd><prd>')
        text = re.sub("\s" + caps + "[.] "," \\1<prd> ",text)
        text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
        text = re.sub(caps + "[.]" + caps + "[.]" + caps + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
        text = re.sub(caps + "[.]" + caps + "[.]","\\1<prd>\\2<prd>",text)
        text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
        text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
        text = re.sub(" " + caps + "[.]"," \\1<prd>",text)
        if "”" in text: text = text.replace(".”","”.")
        if "\"" in text: text = text.replace(".\"","\".")
        if "!" in text: text = text.replace("!\"","\"!")
        if "?" in text: text = text.replace("?\"","\"?")
        text = text.replace(".",".<stop>")
        text = text.replace("?","?<stop>")
        text = text.replace("!","!<stop>")
        text = text.replace("<prd>",".")
        sentences = text.split("<stop>")
        sentences = sentences[:-1]
        sentences = [s.strip() for s in sentences]
        return sentences
    
    #Write the fitted parameters to a single file.
    #Passing the parameter "default" to this function will overwrite the \
    #parameters fit by the author.
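    #File layout (one line per item, values joined with the '/-_-/' separator):
    #class priors; important root keys; important root params; regular root keys;
    #regular root params; important multiplicity keys; important multiplicity
    #params; regular multiplicity keys; regular multiplicity params; binom
    #hyperparams; poisson hyperparams; tags; sentence types; phrase tags; then
    #the flattened important and regular conditional presence matrices.
    #loadParameters reads the same order back.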
    def storeParameters(self, target):
        try: str(target)
        except:
            print "store parameters needs to be passed a string"
            return
        f = open(target,'w')
        f.write(reduce(lambda x,y: str(x) + '/-_-/' + str(y), self.classPriors) + '\n')
        f.write(reduce(lambda x,y: str(x) + '/-_-/' + str(y), self.importantRootProbabilities.keys()) + '\n')
        f.write(reduce(lambda x,y: str(x) + '/-_-/' + str(y), [x.store() for x in self.importantRootProbabilities.values()]) + '\n')
        f.write(reduce(lambda x,y: str(x) + '/-_-/' + str(y), self.regularRootProbabilities.keys()) + '\n')
        f.write(reduce(lambda x,y: str(x) + '/-_-/' + str(y), [x.store() for x in self.regularRootProbabilities.values()]) + '\n')
        f.write(reduce(lambda x,y: str(x) + '/-_-/' + str(y), self.importantMultiplicityParameters.keys()) + '\n')
        f.write(reduce(lambda x,y: str(x) + '/-_-/' + str(y), [x.store() for x in self.importantMultiplicityParameters.values()]) + '\n')
        f.write(reduce(lambda x,y: str(x) + '/-_-/' + str(y), self.regularMultiplicityParameters.keys()) + '\n')
        f.write(reduce(lambda x,y: str(x) + '/-_-/' + str(y), [x.store() for x in self.regularMultiplicityParameters.values()]) + '\n')
        #f.write(reduce(lambda x,y: str(x) + ',' + str(y), self.alpha) + '\n')  #Good for Bayes
        #f.write(reduce(lambda x,y: str(x) + ',' + str(y), self.beta) + '\n')
        f.write(str(self.binomHyperParams[0]) + '/-_-/' +  str(self.binomHyperParams[1]) + '\n')
        f.write(str(self.poissonHyperParams[0]) + '/-_-/' + str(self.poissonHyperParams[1]) + '\n')
        f.write(reduce(lambda x,y: str(x) + '/-_-/' + str(y), self.tags) + '\n')
        f.write(reduce(lambda x,y: str(x) + '/-_-/' + str(y), self.sentenceTypes) + '\n')
        f.write(reduce(lambda x,y: str(x) + '/-_-/' + str(y), self.phraseTags) + '\n')
        #Element-wise store parameters
        ICP = []
        for i in self.importantCondPresenceProbs.index:
            for j in self.importantCondPresenceProbs.columns:
                ICP.append(self.importantCondPresenceProbs.loc[i,j].store())
        RCP = []
        for i in self.regularCondPresenceProbs.index:
            for j in self.regularCondPresenceProbs.columns:
                RCP.append(self.regularCondPresenceProbs.loc[i,j].store())
        f.write(reduce(lambda x,y: str(x) + '/-_-/' + str(y), ICP) + '\n')
        f.write(reduce(lambda x,y: str(x) + '/-_-/' + str(y), RCP) + '\n')
        f.close()
        
    #Load parameters from file. Simply provide it with the name you provided \
    #to storeParameters. The argument "default" will load the parameters fit \
    #by the author.
    def loadParameters(self, target):
        try: str(target)
        except:
            print "load parameters needs to be passed a string"
            return
        f = open(target,'r')
        groups = [x.split('/-_-/') for x in f.read().split('\n')]
        self.classPriors = [float(x) for x in groups[0]]
        self.importantRootProbabilities = dict(zip([unicode(x) for x in groups[1]],[binomialParamDist().load(x) for x in groups[2]]))
        self.regularRootProbabilities = dict(zip([unicode(x) for x in groups[3]],[binomialParamDist().load(x) for x in groups[4]]))
        self.importantMultiplicityParameters = dict(zip([unicode(x) for x in groups[5]],[poissonParamDist().load(x) for x in groups[6]]))
        self.regularMultiplicityParameters = dict(zip([unicode(x) for x in groups[7]],[poissonParamDist().load(x) for x in groups[8]]))
        #self.alpha = groups[10]  #coming with the Bayes update
        #self.beta = groups[11]
        self.binomHyperParams = [float(x) for x in groups[9]]
        self.poissonHyperParams = [float(x) for x in groups[10]]
        self.tags = [unicode(x) for x in groups[11]]
        self.sentenceTypes = [unicode(x) for x in groups[12]]
        self.phraseTags = [unicode(x) for x in groups[13]]
        
        #Unpack dataframes
        self.importantCondPresenceProbs = np.zeros([len(self.tags),len(self.phraseTags) + len(self.sentenceTypes)])
        self.importantCondPresenceProbs = pd.DataFrame(self.importantCondPresenceProbs)
        self.importantCondPresenceProbs.columns = list(self.sentenceTypes) + list(self.phraseTags)
        self.importantCondPresenceProbs.index = list(self.tags)
        
        self.regularCondPresenceProbs = np.zeros([len(self.tags),len(self.phraseTags) + len(self.sentenceTypes)])
        self.regularCondPresenceProbs = pd.DataFrame(self.regularCondPresenceProbs)
        self.regularCondPresenceProbs.columns = list(self.sentenceTypes) + list(self.phraseTags)
        self.regularCondPresenceProbs.index = list(self.tags)
        
        for i,row in enumerate(self.importantCondPresenceProbs.index):
            for j,column in enumerate(self.importantCondPresenceProbs.columns):
                self.importantCondPresenceProbs.loc[row,column] = binomialParamDist().load(groups[14][i*len(self.importantCondPresenceProbs.columns) + j])
        
        for i,row in enumerate(self.regularCondPresenceProbs.index):
            for j,column in enumerate(self.regularCondPresenceProbs.columns):
                self.regularCondPresenceProbs.loc[row,column] = binomialParamDist().load(groups[15][i*len(self.regularCondPresenceProbs.columns) + j])
        
        f.close()
    
    #Thin wrappers around the module-level helper classes so they can be
    #reached as bound methods (e.g. self.binomialParamDist(...)) in classify().
    def binomialParamDist(self, params):
        return(binomialParamDist(params))
    
    def poissonParamDist(self, params):
        return(poissonParamDist(params))
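A minimal usage sketch for the class above (hedged: the module name syse, the example sentences, and the labels are invented for illustration; training from plain text assumes pystatparser is installed, as the code itself warns):

from syse import SySE  #module name assumed

s = SySE()  #tries to load the bundled default.dat parameters on construction

#Training from plain-text sentences with 0/1 importance labels; unparsed
#sentences are parsed automatically via stat_parser.
sentences = ['The company reported record quarterly revenue.',
             'The meeting was moved to Tuesday.']
labels = [1, 0]
s.train(sentences, labels)

#Score one sentence or summarize a whole article, then save the fit.
p_important = s.classify('Revenue grew by forty percent.')
summary = s.summarize('Revenue grew by forty percent. Lunch was pizza.',
                      verbosity=0.5)
s.storeParameters('my_params.dat')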
Ejemplo n.º 45
0
class SySE:
    def __init__(self):
        try:
            send = resource_filename(__name__, 'default.dat')
            self.loadParameters(send)
        except:
            print "Could not load default parameters."
            print "You should either train this object using the \"train\" " +\
                "method, or load parameters with the \"loadParameters\" method"

    ####Supervised Training.
    #trainingSentences: sentences on which to train (May already be parsed)
    #labels: corresponding binary (1,0) labels.
    #HyperParams for the conjugate Beta and Gamma priors respectively.
    #The Gamma distribution is parameterized such that the term with the beta \
    #parameters looks like this: e^(-x*beta) for random variable x.
    def train(self,
              trainingSentences,
              labels,
              binomHyperParams=[0.5, 0.5],
              poissonHyperParams=[0.0001, 0.005],
              debug=0):
        if debug > -1:
            print
            print '**********************************************************'
            print '                   SySE V 1.1.2 '
            print 'Beginning Training Sequence with ' + \
                str(len(trainingSentences)) + ' training sentences...'
            print '**********************************************************'
            if debug > 0:
                print
                print 'Initializing... '

        if type(trainingSentences[0]) != list:
            print 'These sentences do not appear to have been parsed.'
            print 'They will be parsed now.'
            if len(trainingSentences) > 10:
                print 'Given their volume, this will take some time.'
            try:
                self.parser = Parser()
                trainingSentences = [
                    self.parser.parse(x) for x in trainingSentences
                ]
            except:
                print 'This environment should have pystatparser installed ' +\
                'in order to train on unparsed sentences.'
                print 'Parameters could not be fit'
                print 'Exiting...'
                return

        ####Initialization
        #Save hyperparameters
        self.binomHyperParams = binomHyperParams
        self.poissonHyperParams = poissonHyperParams

        #See what tags are in the training data.
        tags = []
        for sentence in trainingSentences:
            flat = self.recursiveFlatten(sentence)
            for el in flat:
                if type(el) == unicode and el not in tags:
                    tags.append(el)

        self.tags = set(tags)

        #What kind of root tags are there?
        self.sentenceTypes = set([x[0] for x in trainingSentences])

        #Which tags may contain other tags?
        self.phraseTags = []
        for sentence in trainingSentences:
            flat = self.recursiveFlatten(sentence)
            for i in range(0, len(flat)):
                try:
                    if type(flat[i]) == unicode and type(flat[
                            i +
                            1]) == unicode and flat[i] not in self.phraseTags:
                        self.phraseTags.append(flat[i])
                except IndexError:
                    print 'We\'ve reached the end of this sentence'

        self.phraseTags = set(self.phraseTags) - self.sentenceTypes

        #Robustness
        labels = list(labels)

        #Split training sentences into Important (I) and Regular (R) (Unimportant)
        importantSentences = filter(
            lambda x: labels[trainingSentences.index(x)] == 1,
            trainingSentences)
        regularSentences = filter(
            lambda x: not labels[trainingSentences.index(x)] == 1,
            trainingSentences)

        self.classPriors = []

        ###Test inputs
        #Make sure labels are right length for sentences.
        if len(labels) != len(trainingSentences):
            print 'Labels and trainingSentences must be the same length!'
            return
        #Make sure labels are valid
        for label in labels:
            if label != 0 and label != 1:
                print 'Labels should be either 0 or 1.'
                print 'exiting...'
                return

        ###Train Class Priors
        self.classPriors.append(float(labels.count(0)) / float(len(labels)))
        self.classPriors.append(float(labels.count(1)) / float(len(labels)))

        if debug > 0:
            print '*********************************************************'
            print 'These are the class priors'
            print '*********************************************************'
            print self.classPriors
            print
            print

        ###Train Sentence Type
        self.importantRootProbabilities = dict(
            zip(list(self.sentenceTypes), [
                binomialParamDist(self.binomHyperParams)
                for x in range(0, len(list(self.sentenceTypes)))
            ]))
        self.regularRootProbabilities = dict(
            zip(list(self.sentenceTypes), [
                binomialParamDist(self.binomHyperParams)
                for x in range(0, len(list(self.sentenceTypes)))
            ]))

        #Get the count of each sentence type in I
        for sentence in importantSentences:
            #Make sure we get what we expect
            if type(sentence[0]) != unicode:
                print "We are looking for a non-unicode sentence type. exiting..."
                break
                return
            #if it isn't in the list yet, add it.
            self.importantRootProbabilities[sentence[0]].update(1)
            for sentence1 in importantSentences:
                if sentence1 != sentence:
                    self.importantRootProbabilities[sentence1[0]].update(False)

        #Get the count of each sentence type in R
        for sentence in regularSentences:
            #Make sure we get what we expect
            if type(sentence[0]) != unicode:
                print "We are looking for a non-unicode sentence type. exiting..."
                break
                return
            #if it isn't in the list yet, add it.
            self.regularRootProbabilities[sentence[0]].update(1)
            for sentence1 in regularSentences:
                if sentence1 != sentence:
                    self.regularRootProbabilities[sentence1[0]].update(False)

        if debug > 0:
            print '*********************************************************'
            print 'These are the sentence type parameters'
            print '*********************************************************'

            print ' --------------------------------------------------------'
            print ' For Important Sentences:'
            print self.importantRootProbabilities

            print ' --------------------------------------------------------'
            print ' For Regular Sentences:'
            print self.regularRootProbabilities
            print
            print

        ###Train Phrases
        ##Primitive Inference on Multiplicity Parameter
        #To store poisson beliefs
        self.importantMultiplicityParameters = dict(
            zip(list(self.tags), [
                poissonParamDist(self.poissonHyperParams)
                for x in range(0, len(list(self.tags)))
            ]))  #For storing parameter estimates.
        self.regularMultiplicityParameters = dict(
            zip(list(self.tags), [
                poissonParamDist(self.poissonHyperParams)
                for x in range(0, len(list(self.tags)))
            ]))  #For storing parameter estimates.

        #Get Inclusion for I
        for sentence in importantSentences:
            self.getInclusions(
                [sentence, self.importantMultiplicityParameters, debug >= 2])

        #Get Inclusion for R
        for sentence in regularSentences:
            self.getInclusions(
                [sentence, self.regularMultiplicityParameters, debug >= 2])

        #Get Counts for I
        for sentence in importantSentences:
            flat = self.recursiveFlatten(sentence)
            currentTags = filter(lambda x: type(x) == unicode, flat)
            for tag in currentTags[1:]:
                self.importantMultiplicityParameters[tag].updateCount(1)

        #Get Counts for R
        for sentence in regularSentences:
            flat = self.recursiveFlatten(sentence)
            currentTags = filter(lambda x: type(x) == unicode, flat)
            for tag in currentTags[1:]:
                self.regularMultiplicityParameters[tag].updateCount(1)

        ####TODO: go over this section again
        #Estimate Parameters for I
        for tag in self.importantMultiplicityParameters.keys():
            if (self.importantMultiplicityParameters[tag].alpha > 1):
                self.importantMultiplicityParameters[tag].updateCount(-1)

        #Estimate Parameters for R
        for tag in self.regularMultiplicityParameters.keys():
            if (self.regularMultiplicityParameters[tag].alpha > 1):
                self.regularMultiplicityParameters[tag].updateCount(-1)

        if debug > 0:
            print '*********************************************************'
            print ' Estimation for Multiplicity Parameters '
            print '*********************************************************'
            print
            print 'Dumb Parameter Estimates for Important Sentences:'
            print self.importantMultiplicityParameters
            print 'Dumb Parameter Estimates for Regular Sentences:'
            print self.regularMultiplicityParameters
            print
            print

        ##Primitive Inference on Presence Parameters

        #We need to find inclusions given parent
        #To store conditional presence probabilities, what can almost be \
        #thought of as transition probabilities.
        #For important phrases
        self.importantCondPresenceProbs = np.zeros(
            [len(self.tags),
             len(self.phraseTags) + len(self.sentenceTypes)])
        self.importantCondPresenceProbs = pd.DataFrame(
            self.importantCondPresenceProbs).applymap(
                lambda x: binomialParamDist(self.binomHyperParams))
        self.importantCondPresenceProbs.columns = list(
            self.sentenceTypes) + list(self.phraseTags)
        self.importantCondPresenceProbs.index = list(self.tags)

        #For regularPhrases
        self.regularCondPresenceProbs = np.zeros(
            [len(self.tags),
             len(self.phraseTags) + len(self.sentenceTypes)])
        self.regularCondPresenceProbs = pd.DataFrame(
            self.regularCondPresenceProbs).applymap(
                lambda x: binomialParamDist(self.binomHyperParams))
        self.regularCondPresenceProbs.columns = list(
            self.sentenceTypes) + list(self.phraseTags)
        self.regularCondPresenceProbs.index = list(self.tags)

        #Count Conditional Inclusions for Important Sentences
        for sentence in importantSentences:
            self.getInclusionsGivenParent([
                sentence, self.importantCondPresenceProbs, sentence[0],
                debug >= 2
            ])

        #Count Conditional Inclusions for Regular Sentences
        for sentence in regularSentences:
            self.getInclusionsGivenParent([
                sentence, self.regularCondPresenceProbs, sentence[0],
                debug >= 2
            ])

        if debug > 1:
            print '*********************************************************'
            print 'Presence Parameter Estimation'
            print '*********************************************************'
            print
            print ' ------------------------------------------------------------------'
            print ' Conditional Parameters for Important Sentences'
            print self.importantCondPresenceProbs
            print
            print ' ------------------------------------------------------------------'
            print ' Conditional Parameters for Regular Sentences'
            print self.regularCondPresenceProbs
            print ' ------------------------------------------------------------------'

        if debug > -1:
            print
            print
            print '...Finished'

        ####Classification
    def classify(self, sentence, debug=0, varianceExponent=0):
        #If the sentence hasn't been parsed, we must parse it.
        plaintext = False
        if type(sentence) != list:
            plaintext = True
            original = sentence
            try:
                sentence = self.parser.parse(sentence)
            except:
                try:
                    self.parser = Parser()
                    sentence = self.parser.parse(sentence)
                except:
                    print 'Couldn\'t create a parsing object.'
                    print 'Perhaps pystatparser is not loaded?'
                    print 'type \"from stat_parser import Parser\"'
                    print 'Otherwise, you must install it from Github as ' + \
                        'directed in the README'

        #Deal with new root types
        if sentence[0] not in self.importantRootProbabilities:
            self.importantRootProbabilities[
                sentence[0]] = self.binomialParamDist(self.binomHyperParams)
        if sentence[0] not in self.regularRootProbabilities:
            self.regularRootProbabilities[sentence[0]] = self.binomialParamDist(
                self.binomHyperParams)

        #Deal with new non-root tag types
        flat = self.recursiveFlatten(sentence)
        flat = filter(lambda x: type(x) == unicode, flat)
        for i, tag in enumerate(flat):
            if tag not in self.tags:
                #Set a priori beliefs for multiplicity parameters
                self.importantMultiplicityParameters[
                    tag] = self.poissonParamDist(self.poissonHyperParams)
                self.regularMultiplicityParameters[
                    tag] = self.poissonParamDist(self.poissonHyperParams)

                #Set a priori beliefs for conditional presence parameters being contained by anything else
                self.importantCondPresenceProbs.loc[tag] = [
                    self.binomialParamDist(self.binomHyperParams)
                    for x in self.importantCondPresenceProbs.columns
                ]
                self.regularCondPresenceProbs.loc[tag] = [
                    self.binomialParamDist(self.binomHyperParams)
                    for x in self.regularCondPresenceProbs.columns
                ]
                if i + 1 < len(flat) and type(flat[i + 1]) == unicode:
                    #Set a priori beliefs for conditional presence parameters containing other things
                    self.importantCondPresenceProbs[tag] = [
                        self.binomialParamDist(self.binomHyperParams)
                        for x in self.regularCondPresenceProbs.index
                    ]
                    self.regularCondPresenceProbs[tag] = [
                        self.binomialParamDist(self.binomHyperParams)
                        for x in self.regularCondPresenceProbs.index
                    ]

        ##Get P(x|y = Important)
        PxGy1 = math.log(
            self.importantRootProbabilities[sentence[0]].getMean())
        PxGy1 = PxGy1 / self.importantRootProbabilities[
            sentence[0]].getVar()**varianceExponent
        PxGy1 += self.getConditionalLevelProbability([
            sentence, self.importantCondPresenceProbs,
            self.importantMultiplicityParameters, sentence[0],
            varianceExponent, debug >= 2
        ])

        ##Get P(x|y = REGULAR)
        PxGy0 = math.log(self.regularRootProbabilities[sentence[0]].getMean())
        PxGy0 = PxGy0 / self.regularRootProbabilities[
            sentence[0]].getVar()**varianceExponent
        PxGy0 += self.getConditionalLevelProbability([
            sentence, self.regularCondPresenceProbs,
            self.regularMultiplicityParameters, sentence[0], varianceExponent,
            debug >= 2
        ])

        #Get priors in a log form:
        Py1 = math.log(self.classPriors[1])
        Py0 = math.log(self.classPriors[0])

        #Get log Probabilities of each class through Bayes' Rule
        Py1Gx = PxGy1 + Py1
        Py0Gx = PxGy0 + Py0

        #Derive softmax shift parameter for very small probabilities.
        shift = 0
        if min([Py1Gx, Py0Gx]) < -20:
            shift = -1 * min([Py1Gx, Py0Gx]) - 20

        #SoftMax probabilities
        try:
            denom = math.log(math.e**(shift + Py1Gx) + math.e**(shift + Py0Gx))

            sPy1Gx = shift + Py1Gx - denom
            sPy0Gx = shift + Py0Gx - denom

            #Turn back into probabilities for output
            sPy1Gx = math.e**sPy1Gx
            sPy0Gx = math.e**sPy0Gx
        except OverflowError:
            if debug > -1:
                print 'Overflow error'
                if Py1Gx >= Py0Gx:
                    print 'Assigning important sentence with probability one.'
                else:
                    print 'Assigning regular sentence with probability one.'
                print 'Before softmax, log probabilities were:'
                print 'P(important | sentence) = ' + str(Py1Gx)
                print 'P(unimportant | sentence) = ' + str(Py0Gx)
            if Py1Gx >= Py0Gx:
                sPy1Gx = 1.0
                sPy0Gx = 0.0
            else:
                sPy1Gx = 0.0
                sPy0Gx = 1.0

        if debug > -1:
            print 'Estimating Class for sentence:'
            if plaintext:
                print '\"' + original + '\"'
            else:
                print sentence
        if debug > 0:
            print ' ------------------------------------------------------------------'
            print 'Class Priors (log probability):'
            print 'P(important) = ' + str(Py1)
            print 'P(unimportant) = ' + str(Py0)
            print ' ------------------------------------------------------------------'
            print 'Conditional Sentence Log Probabilities:'
            print 'P(sentence | important) = ' + str(PxGy1)
            print 'P(sentence | unimportant) = ' + str(PxGy0)
            print ' ------------------------------------------------------------------'
            print 'Unnormalized Conditional Class Log Probabilities'
            print 'P(important | sentence) = ' + str(Py1Gx)
            print 'P(unimportant | sentence) = ' + str(Py0Gx)
        if debug > -1:
            print ' ------------------------------------------------------------------'
            print 'Softmaxed Conditional Class Probabilities'
            print 'P(important | sentence) = ' + str(sPy1Gx)
            print 'P(unimportant | sentence) = ' + str(sPy0Gx)
        return (sPy1Gx)

    def summarize(self, article, verbosity=0.5, debug=0):
        sentences = self.split_into_sentences(article)

        keepers = []
        i = 0
        for sentence in sentences:
            i += 1
            try:
                if self.classify(sentence, debug=debug) > verbosity:
                    keepers.append(sentence)
            except:
                print 'Error classifying sentence ' + str(i)
                print 'FullText: '
                print sentence
        if len(keepers) == 0:
            print 'No sentences found important'
            return ('')
        reduced = reduce(lambda x, y: x + ' ' + y, keepers)
        return (reduced)

    ####Function Definitions
    #Returns the log probability of a level occurring, recursing to score the
    #levels contained therein. May be passed an entire sentence.
    def getConditionalLevelProbability(self, inputs):
        level = inputs[0]
        tagDF = inputs[1]
        mult = inputs[2]
        parent = inputs[3]
        varExp = inputs[4]
        debug = inputs[5]
        ret = 0
        if debug == 1:
            print 'Beginning Level...........'
            print level
        inTags = [x[0] for x in level[1:]]
        if u'' in inTags:
            inTags.remove(u'')

        #Do some recursion
        for i, tag in enumerate(inTags):
            if tag in self.phraseTags or tag in self.sentenceTypes:
                if debug == 1:
                    print 'beginning recursion due to:'
                    print tag
                ret = ret + self.getConditionalLevelProbability(
                    [level[i + 1], tagDF, mult, tag, varExp, debug])

        #Do multiplicity for this level
        for tag in inTags:
            x = inTags.count(tag)
            mu = mult[tag].getMean()
            ret = ret + math.log(
                (math.exp(-mu) * mu**x /
                 math.factorial(x))) / mult[tag].getVar()**varExp

        #Do presence for this level
        inTags = list(set(inTags))
        for tag in inTags:
            if type(
                    tag
            ) != unicode:  #Some sentences contain only a word, and we won't need to add anything in that case.
                print 'Breaking Due to non-unicode tag in getConditionalLevelProbability!'
                print tag
                break
            if debug == 1:
                print 'Probability of ' + tag + ' given ' + parent + ' is ' + str(
                    tagDF.loc[tag, parent])
            ret = ret + math.log(tagDF.loc[tag, parent].getMean()) / tagDF.loc[
                tag, parent].getVar()**varExp
        return (ret)

    #To get the inclusion
    def getInclusionsGivenParent(self, inputs):
        level = inputs[0]
        tagDF = inputs[1]
        parent = inputs[2]
        debug = inputs[3]
        if debug == 1:
            print 'Beginning Level...........'
            print level
        inTags = [x[0] for x in level[1:]]
        if u'' in inTags:
            inTags.remove(u'')

        #Do some recursion
        for i, tag in enumerate(inTags):
            if tag in self.phraseTags or tag in self.sentenceTypes:
                if debug == 1:
                    print 'beginning recursion due to:'
                    print tag
                self.getInclusionsGivenParent(
                    [level[i + 1], tagDF, tag, debug])

        #Update tags on this level
        inTags = list(set(inTags))
        for tag in inTags:
            if type(
                    tag
            ) != unicode:  #Some sentences contain only a word, and we won't need to add anything in that case.
                break
            if debug == 1:
                print 'incrementing: ' + tag + ' when conditioned on ' + parent
            tagDF.loc[tag, parent].update(True)
        #Update tags not on this level
        for tag in self.tags:
            if tag not in inTags:
                tagDF.loc[tag, parent].update(False)

    #To get the inclusions in a level recursively.
    def getInclusions(self, inputs):
        level = inputs[0]
        tagDict = inputs[1]
        debug = inputs[2]
        if debug == 1:
            print 'Beginning Level...........'
            print level
        inTags = [x[0] for x in level[1:]]
        if u'' in inTags:
            inTags.remove(u'')

        #Do some recursion
        for i, tag in enumerate(inTags):
            if tag in self.phraseTags or tag in self.sentenceTypes:
                if debug == 1:
                    print 'beginning recursion due to:'
                    print tag
                self.getInclusions([level[i + 1], tagDict, debug])

        #Add count for this level
        inTags = list(set(inTags))
        for tag in inTags:
            if type(
                    tag
            ) != unicode:  #Some sentences contain only a word, and we won't need to add anything in that case.
                break
            if debug == 1:
                print 'incrementing: ' + tag
            tagDict[tag].incrementTrials()

    #To find all PoS tags (pystatparser's documentation is literally non-existent)
    def getTagsRecursively(self, ss, knownTags=[], debug=0):
        ret = knownTags
        for sentence in ss:
            for phrase in sentence:
                for element in phrase:
                    if type(element) == unicode:
                        if element not in ret:
                            ret.append(element)
                    if type(element) == list:
                        ret.extend(self.getTagsRecursively(element))
        return (ret)

    #Flatten an n-dimensional list into a 1D list
    def recursiveFlatten(self, myList):
        ret = []
        for element in myList:
            if type(element) == list:
                element = self.recursiveFlatten(element)
            if type(element) == str or type(element) == unicode:
                ret.append(element)
            else:
                ret.extend(list(element))
        return (ret)

    #From http://stackoverflow.com/questions/4576077/python-split-text-on-sentences
    def split_into_sentences(self, text):
        if type(text) == unicode:
            text = unicode(text.encode('utf-8'), errors='ignore')
            text = unicodedata.normalize('NFKD',
                                         text).encode('ascii', 'ignore')
        caps = "([A-Z])"
        prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
        suffixes = "(Inc|Ltd|Jr|Sr|Co)"
        starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
        acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
        websites = "[.](com|net|org|io|gov)"
        text = " " + text + "  "
        text = text.replace("\n", " ")
        text = re.sub(prefixes, "\\1<prd>", text)
        text = re.sub(websites, "<prd>\\1", text)
        if "Ph.D" in text: text = text.replace("Ph.D.", "Ph<prd>D<prd>")
        if 'a.m.' in text: text = text.replace('a.m.', 'a<prd>m<prd>')
        if 'p.m.' in text: text = text.replace('p.m.', 'p<prd>m<prd>')
        if '...' in text: text = text.replace('...', '<prd><prd><prd>')
        text = re.sub("\s" + caps + "[.] ", " \\1<prd> ", text)
        text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)
        text = re.sub(caps + "[.]" + caps + "[.]" + caps + "[.]",
                      "\\1<prd>\\2<prd>\\3<prd>", text)
        text = re.sub(caps + "[.]" + caps + "[.]", "\\1<prd>\\2<prd>", text)
        text = re.sub(" " + suffixes + "[.] " + starters, " \\1<stop> \\2",
                      text)
        text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)
        text = re.sub(" " + caps + "[.]", " \\1<prd>", text)
        if "”" in text: text = text.replace(".”", "”.")
        if "\"" in text: text = text.replace(".\"", "\".")
        if "!" in text: text = text.replace("!\"", "\"!")
        if "?" in text: text = text.replace("?\"", "\"?")
        text = text.replace(".", ".<stop>")
        text = text.replace("?", "?<stop>")
        text = text.replace("!", "!<stop>")
        text = text.replace("<prd>", ".")
        sentences = text.split("<stop>")
        sentences = sentences[:-1]
        sentences = [s.strip() for s in sentences]
        return sentences

    #Write the fitted parameters to a single file.
    #Passing the parameter "default" to this function will overwrite the \
    #parameters fit by the author.
    def storeParameters(self, target):
        try:
            str(target)
        except:
            print "store parameters needs to be passed a string"
            return
        f = open(target, 'w')
        f.write(
            reduce(lambda x, y: str(x) + '/-_-/' + str(y), self.classPriors) +
            '\n')
        f.write(
            reduce(lambda x, y: str(x) + '/-_-/' + str(y),
                   self.importantRootProbabilities.keys()) + '\n')
        f.write(
            reduce(
                lambda x, y: str(x) + '/-_-/' + str(y),
                [x.store()
                 for x in self.importantRootProbabilities.values()]) + '\n')
        f.write(
            reduce(lambda x, y: str(x) + '/-_-/' + str(y),
                   self.regularRootProbabilities.keys()) + '\n')
        f.write(
            reduce(lambda x, y: str(x) + '/-_-/' + str(y),
                   [x.store()
                    for x in self.regularRootProbabilities.values()]) + '\n')
        f.write(
            reduce(lambda x, y: str(x) + '/-_-/' + str(y),
                   self.importantMultiplicityParameters.keys()) + '\n')
        f.write(
            reduce(lambda x, y: str(x) + '/-_-/' + str(y), [
                x.store()
                for x in self.importantMultiplicityParameters.values()
            ]) + '\n')
        f.write(
            reduce(lambda x, y: str(x) + '/-_-/' + str(y),
                   self.regularMultiplicityParameters.keys()) + '\n')
        f.write(
            reduce(lambda x, y: str(x) + '/-_-/' + str(y), [
                x.store() for x in self.regularMultiplicityParameters.values()
            ]) + '\n')
        #f.write(reduce(lambda x,y: str(x) + ',' + str(y), self.alpha) + '\n')  #kept for the Bayesian update
        #f.write(reduce(lambda x,y: str(x) + ',' + str(y), self.beta) + '\n')
        f.write(
            str(self.binomHyperParams[0]) + '/-_-/' +
            str(self.binomHyperParams[1]) + '\n')
        f.write(
            str(self.poissonHyperParams[0]) + '/-_-/' +
            str(self.poissonHyperParams[1]) + '\n')
        f.write(
            reduce(lambda x, y: str(x) + '/-_-/' + str(y), self.tags) + '\n')
        f.write(
            reduce(lambda x, y: str(x) + '/-_-/' + str(y), self.sentenceTypes)
            + '\n')
        f.write(
            reduce(lambda x, y: str(x) + '/-_-/' + str(y), self.phraseTags) +
            '\n')
        #Element-wise store parameters
        ICP = []
        for i in self.importantCondPresenceProbs.index:
            for j in self.importantCondPresenceProbs.columns:
                ICP.append(self.importantCondPresenceProbs.loc[i, j].store())
        RCP = []
        for i in self.regularCondPresenceProbs.index:
            for j in self.regularCondPresenceProbs.columns:
                RCP.append(self.regularCondPresenceProbs.loc[i, j].store())
        f.write(reduce(lambda x, y: str(x) + '/-_-/' + str(y), ICP) + '\n')
        f.write(reduce(lambda x, y: str(x) + '/-_-/' + str(y), RCP) + '\n')
        f.close()
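
    # The resulting file is plain text with one '/-_-/'-delimited record per
    # line, written in this order: class priors; important root keys and
    # (stored) values; regular root keys and values; important multiplicity
    # keys and values; regular multiplicity keys and values; binomial
    # hyperparameters; Poisson hyperparameters; tags; sentence types; phrase
    # tags; and finally the flattened important and regular conditional
    # presence matrices. loadParameters below reads the lines back in exactly
    # this order.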

    #Load parameters from file. Simply provide it with the name you provided \
    #to storeParameters. The argument "default" will load the parameters fit \
    #by the author.
    def loadParameters(self, target):
        if not isinstance(target, basestring):
            print "loadParameters needs to be passed a string"
            return
        f = open(target, 'r')
        groups = [x.split('/-_-/') for x in f.read().split('\n')]
        self.classPriors = [float(x) for x in groups[0]]
        self.importantRootProbabilities = dict(
            zip([unicode(x) for x in groups[1]],
                [binomialParamDist().load(x) for x in groups[2]]))
        self.regularRootProbabilities = dict(
            zip([unicode(x) for x in groups[3]],
                [binomialParamDist().load(x) for x in groups[4]]))
        self.importantMultiplicityParameters = dict(
            zip([unicode(x) for x in groups[5]],
                [poissonParamDist().load(x) for x in groups[6]]))
        self.regularMultiplicityParameters = dict(
            zip([unicode(x) for x in groups[7]],
                [poissonParamDist().load(x) for x in groups[8]]))
        #self.alpha = groups[10]  #coming with the Bayesian update
        #self.beta = groups[11]
        self.binomHyperParams = [float(x) for x in groups[9]]
        self.poissonHyperParams = [float(x) for x in groups[10]]
        self.tags = [unicode(x) for x in groups[11]]
        self.sentenceTypes = [unicode(x) for x in groups[12]]
        self.phraseTags = [unicode(x) for x in groups[13]]

        #Unpack dataframes
        self.importantCondPresenceProbs = np.zeros(
            [len(self.tags),
             len(self.phraseTags) + len(self.sentenceTypes)])
        self.importantCondPresenceProbs = pd.DataFrame(
            self.importantCondPresenceProbs)
        self.importantCondPresenceProbs.columns = list(
            self.sentenceTypes) + list(self.phraseTags)
        self.importantCondPresenceProbs.index = list(self.tags)

        self.regularCondPresenceProbs = np.zeros(
            [len(self.tags),
             len(self.phraseTags) + len(self.sentenceTypes)])
        self.regularCondPresenceProbs = pd.DataFrame(
            self.regularCondPresenceProbs)
        self.regularCondPresenceProbs.columns = list(
            self.sentenceTypes) + list(self.phraseTags)
        self.regularCondPresenceProbs.index = list(self.tags)

        for i, row in enumerate(self.importantCondPresenceProbs.index):
            for j, column in enumerate(
                    self.importantCondPresenceProbs.columns):
                self.importantCondPresenceProbs.loc[
                    row, column] = binomialParamDist().load(groups[14][
                        i * len(self.importantCondPresenceProbs.columns) + j])

        for i, row in enumerate(self.regularCondPresenceProbs.index):
            for j, column in enumerate(self.regularCondPresenceProbs.columns):
                self.regularCondPresenceProbs.loc[
                    row, column] = binomialParamDist().load(
                        groups[15][i *
                                   len(self.regularCondPresenceProbs.columns) +
                                   j])

        f.close()

    def binomialParamDist(self, params):
        #Thin wrappers so instance code can construct fresh parameter
        #distributions; the unqualified names below resolve to the
        #module-level binomialParamDist / poissonParamDist classes.
        return binomialParamDist(params)

    def poissonParamDist(self, params):
        return poissonParamDist(params)
Example #46
    def classify(self, sentence, debug=0, varianceExponent=0):
        #If the sentence hasn't been parsed, we must parse it.
        plaintext = False
        if type(sentence) != list:
            plaintext = True
            original = sentence
            try:
                sentence = self.parser.parse(sentence)
            except:
                try:
                    self.parser = Parser()
                    sentence = self.parser.parse(sentence)
                except:
                    print 'Couldn\'t create a parsing object.'
                    print 'Perhaps pystatparser is not loaded?'
                    print 'Try \"from stat_parser import Parser\".'
                    print 'Otherwise, you must install it from GitHub as ' + \
                        'directed in the README.'
                    return

        #Deal with new root types
        if sentence[0] not in self.importantRootProbabilities:
            self.importantRootProbabilities[
                sentence[0]] = self.binomialParamDist(self.binomHyperParams)
        if sentence[0] not in self.regularRootProbabilities:
            self.regularRootProbabilities[
                sentence[0]] = self.binomialParamDist(self.binomHyperParams)

        #Deal with new non-root tag types
        flat = self.recursiveFlatten(sentence)
        flat = filter(lambda x: type(x) == unicode, flat)
        for i, tag in enumerate(flat):
            if tag not in self.tags:
                #Set a priori beliefs for multiplicity parameters
                self.importantMultiplicityParameters[
                    tag] = self.poissonParamDist(self.poissonHyperParams)
                self.regularMultiplicityParameters[
                    tag] = self.poissonParamDist(self.poissonHyperParams)

                #Set a priori beliefs for conditional presence parameters being contained by anything else
                self.importantCondPresenceProbs.loc[tag] = [
                    self.binomialParamDist(self.binomHyperParams)
                    for x in self.importantCondPresenceProbs.columns
                ]
                self.regularCondPresenceProbs.loc[tag] = [
                    self.binomialParamDist(self.binomHyperParams)
                    for x in self.regularCondPresenceProbs.columns
                ]
                if i + 1 < len(flat) and type(flat[i + 1]) == unicode:
                    #Set a priori beliefs for conditional presence parameters containing other things
                    self.importantCondPresenceProbs[tag] = [
                        self.binomialParamDist(self.binomHyperParams)
                        for x in self.importantCondPresenceProbs.index
                    ]
                    self.regularCondPresenceProbs[tag] = [
                        self.binomialParamDist(self.binomHyperParams)
                        for x in self.regularCondPresenceProbs.index
                    ]

        ##Get P(x|y = Important)
        PxGy1 = math.log(
            self.importantRootProbabilities[sentence[0]].getMean())
        PxGy1 = PxGy1 / self.importantRootProbabilities[
            sentence[0]].getVar()**varianceExponent
        PxGy1 += self.getConditionalLevelProbability([
            sentence, self.importantCondPresenceProbs,
            self.importantMultiplicityParameters, sentence[0],
            varianceExponent, debug >= 2
        ])

        ##Get P(x|y = REGULAR)
        PxGy0 = math.log(self.regularRootProbabilities[sentence[0]].getMean())
        PxGy0 = PxGy0 / self.regularRootProbabilities[
            sentence[0]].getVar()**varianceExponent
        PxGy0 += self.getConditionalLevelProbability([
            sentence, self.regularCondPresenceProbs,
            self.regularMultiplicityParameters, sentence[0], varianceExponent,
            debug >= 2
        ])

        #Get priors in a log form:
        Py1 = math.log(self.classPriors[1])
        Py0 = math.log(self.classPriors[0])

        #Get log Probabilities of each class through Bayes' Rule
        Py1Gx = PxGy1 + Py1
        Py0Gx = PxGy0 + Py0

        #Derive softmax shift parameter for very small probabilities.
        shift = 0
        if min([Py1Gx, Py0Gx]) < -20:
            shift = -1 * min([Py1Gx, Py0Gx]) - 20

        #SoftMax probabilities
        try:
            denom = math.log(math.e**(shift + Py1Gx) + math.e**(shift + Py0Gx))

            sPy1Gx = shift + Py1Gx - denom
            sPy0Gx = shift + Py0Gx - denom

            #Turn back into probabilities for output
            sPy1Gx = math.e**sPy1Gx
            sPy0Gx = math.e**sPy0Gx
        except OverflowError:
            if debug > -1:
                print 'Overflow error'
                if Py1Gx >= Py0Gx:
                    print 'Assigning important sentence with probability one.'
                else:
                    print 'Assigning regular sentence with probability one.'
                print 'Before softmax, log probabilities were:'
                print 'P(important | sentence) = ' + str(Py1Gx)
                print 'P(unimportant | sentence) = ' + str(Py0Gx)
            if Py1Gx >= Py0Gx:
                sPy1Gx = 1.0
                sPy0Gx = 0.0
            else:
                sPy1Gx = 0.0
                sPy0Gx = 1.0
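
        # Note: the shift implements the usual log-sum-exp stabilization. When
        # the log scores are very negative, both are moved by the same
        # constant (so their ratio is unchanged) and the smaller one lands at
        # -20, keeping math.e**x well inside floating-point range. For
        # example, with Py1Gx = -750 and Py0Gx = -752 the shift is 732, giving
        # exp(-18) / (exp(-18) + exp(-20)) ~= 0.88 for the important class.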

        if debug > -1:
            print 'Estimating Class for sentence:'
            if plaintext:
                print '\"' + original + '\"'
            else:
                print sentence
        if debug > 0:
            print ' ------------------------------------------------------------------'
            print 'Class Priors (log probability):'
            print 'P(important) = ' + str(Py1)
            print 'P(unimportant) = ' + str(Py0)
            print ' ------------------------------------------------------------------'
            print 'Conditional Sentence Log Probabilities:'
            print 'P(sentence | important) = ' + str(PxGy1)
            print 'P(sentence | unimportant) = ' + str(PxGy0)
            print ' ------------------------------------------------------------------'
            print 'Unnormalized Conditional Class Log Probabilities'
            print 'P(important | sentence) = ' + str(Py1Gx)
            print 'P(unimportant | sentence) = ' + str(Py0Gx)
        if debug > -1:
            print ' ------------------------------------------------------------------'
            print 'Softmaxed Conditional Class Probabilities'
            print 'P(important | sentence) = ' + str(sPy1Gx)
            print 'P(unimportant | sentence) = ' + str(sPy0Gx)
        return (sPy1Gx)
Example #47
    def train(self,
              trainingSentences,
              labels,
              binomHyperParams=[0.5, 0.5],
              poissonHyperParams=[0.0001, 0.005],
              debug=0):
        if debug > -1:
            print
            print '**********************************************************'
            print '                   SySE V 1.1.2 '
            print 'Beginning Training Sequence with ' + \
                str(len(trainingSentences)) + ' training sentences...'
            print '**********************************************************'
            if debug > 0:
                print
                print 'Initializing... '

        if type(trainingSentences[0]) != list:
            print 'These sentences do not appear to have been parsed.'
            print 'They will be parsed now.'
            if len(trainingSentences) > 10:
                print 'Given their volume, this will take some time.'
            try:
                self.parser = Parser()
                trainingSentences = [
                    self.parser.parse(x) for x in trainingSentences
                ]
            except:
                print 'This environment should have pystatparser installed ' +\
                'in order to train on unparsed sentences.'
                print 'Parameters could not be fit'
                print 'Exiting...'
                return

        ####Initialization
        #Save hyperparameters
        self.binomHyperParams = binomHyperParams
        self.poissonHyperParams = poissonHyperParams

        #See what tags are in the training data.
        tags = []
        for sentence in trainingSentences:
            flat = self.recursiveFlatten(sentence)
            for el in flat:
                if type(el) == unicode and el not in tags:
                    tags.append(el)

        self.tags = set(tags)

        #What kind of root tags are there?
        self.sentenceTypes = set([x[0] for x in trainingSentences])

        #Which tags may contain other tags?
        self.phraseTags = []
        for sentence in trainingSentences:
            flat = self.recursiveFlatten(sentence)
            for i in range(0, len(flat)):
                try:
                    if type(flat[i]) == unicode and type(flat[
                            i +
                            1]) == unicode and flat[i] not in self.phraseTags:
                        self.phraseTags.append(flat[i])
                except IndexError:
                    print 'We\'ve reached the end of this sentence'

        self.phraseTags = set(self.phraseTags) - self.sentenceTypes

        #Robustness
        labels = list(labels)

        #Split training sentences into Important (I) and Regular (R) (Unimportant)
        importantSentences = filter(
            lambda x: labels[trainingSentences.index(x)] == 1,
            trainingSentences)
        regularSentences = filter(
            lambda x: not labels[trainingSentences.index(x)] == 1,
            trainingSentences)

        self.classPriors = []

        ###Test inputs
        #Make sure labels are right length for sentences.
        if len(labels) != len(trainingSentences):
            print 'Labels and trainingSentences must be the same length!'
            return
        #Make sure labels are valid
        for label in labels:
            if label != 0 and label != 1:
                print 'Labels should be either 0 or 1.'
                print 'exiting...'
                return

        ###Train Class Priors
        self.classPriors.append(float(labels.count(0)) / float(len(labels)))
        self.classPriors.append(float(labels.count(1)) / float(len(labels)))

        if debug > 0:
            print '*********************************************************'
            print 'These are the class priors'
            print '*********************************************************'
            print self.classPriors
            print
            print

        ###Train Sentence Type
        self.importantRootProbabilities = dict(
            zip(list(self.sentenceTypes), [
                binomialParamDist(self.binomHyperParams)
                for x in range(0, len(list(self.sentenceTypes)))
            ]))
        self.regularRootProbabilities = dict(
            zip(list(self.sentenceTypes), [
                binomialParamDist(self.binomHyperParams)
                for x in range(0, len(list(self.sentenceTypes)))
            ]))

        #Get the count of each sentence type in I
        for sentence in importantSentences:
            #Make sure we get what we expect
            if type(sentence[0]) != unicode:
                print "Expected a unicode sentence-type tag but found something else. Exiting..."
                return
            #if it isn't in the list yet, add it.
            self.importantRootProbabilities[sentence[0]].update(1)
            for sentence1 in importantSentences:
                if sentence1 != sentence:
                    self.importantRootProbabilities[sentence1[0]].update(False)

        #Get the count of each sentence type in R
        for sentence in regularSentences:
            #Make sure we get what we expect
            if type(sentence[0]) != unicode:
                print "Expected a unicode sentence-type tag but found something else. Exiting..."
                return
            #if it isn't in the list yet, add it.
            self.regularRootProbabilities[sentence[0]].update(1)
            for sentence1 in regularSentences:
                if sentence1 != sentence:
                    self.regularRootProbabilities[sentence1[0]].update(False)

        if debug > 0:
            print '*********************************************************'
            print 'These are the sentence type parameters'
            print '*********************************************************'

            print ' --------------------------------------------------------'
            print ' For Important Sentences:'
            print self.importantRootProbabilities

            print ' --------------------------------------------------------'
            print ' For Regular Sentences:'
            print self.regularRootProbabilities
            print
            print

        ###Train Phrases
        ##Primitive Inference on Multiplicity Parameter
        #To store poisson beliefs
        self.importantMultiplicityParameters = dict(
            zip(list(self.tags), [
                poissonParamDist(self.poissonHyperParams)
                for x in range(0, len(list(self.tags)))
            ]))  #For storing parameter estimates.
        self.regularMultiplicityParameters = dict(
            zip(list(self.tags), [
                poissonParamDist(self.poissonHyperParams)
                for x in range(0, len(list(self.tags)))
            ]))  #For storing parameter estimates.

        #Get Inclusion for I
        for sentence in importantSentences:
            self.getInclusions(
                [sentence, self.importantMultiplicityParameters, debug >= 2])

        #Get Inclusion for R
        for sentence in regularSentences:
            self.getInclusions(
                [sentence, self.regularMultiplicityParameters, debug >= 2])

        #Get Counts for I
        for sentence in importantSentences:
            flat = self.recursiveFlatten(sentence)
            currentTags = filter(lambda x: type(x) == unicode, flat)
            for tag in currentTags[1:]:
                self.importantMultiplicityParameters[tag].updateCount(1)

        #Get Counts for R
        for sentence in regularSentences:
            flat = self.recursiveFlatten(sentence)
            currentTags = filter(lambda x: type(x) == unicode, flat)
            for tag in currentTags[1:]:
                self.regularMultiplicityParameters[tag].updateCount(1)

        ####TODO: review this adjustment to the multiplicity counts
        #Estimate Parameters for I
        for tag in self.importantMultiplicityParameters.keys():
            if (self.importantMultiplicityParameters[tag].alpha > 1):
                self.importantMultiplicityParameters[tag].updateCount(-1)

        #Estimate Parameters for R
        for tag in self.regularMultiplicityParameters.keys():
            if (self.regularMultiplicityParameters[tag].alpha > 1):
                self.regularMultiplicityParameters[tag].updateCount(-1)

        if debug > 0:
            print '*********************************************************'
            print ' Estimation for Multiplicity Parameters '
            print '*********************************************************'
            print
            print 'Dumb Parameter Estimates for Important Sentences:'
            print self.importantMultiplicityParameters
            print 'Dumb Parameter Estimates for Regular Sentences:'
            print self.regularMultiplicityParameters
            print
            print

        ##Primitive Inference on Presence Parameters

        #We need to find inclusions given parent
        #To store conditional presence probabilities, what can almost be \
        #thought of as transition probabilities.
        #For important phrases
        self.importantCondPresenceProbs = np.zeros(
            [len(self.tags),
             len(self.phraseTags) + len(self.sentenceTypes)])
        self.importantCondPresenceProbs = pd.DataFrame(
            self.importantCondPresenceProbs).applymap(
                lambda x: binomialParamDist(self.binomHyperParams))
        self.importantCondPresenceProbs.columns = list(
            self.sentenceTypes) + list(self.phraseTags)
        self.importantCondPresenceProbs.index = list(self.tags)

        #For regularPhrases
        self.regularCondPresenceProbs = np.zeros(
            [len(self.tags),
             len(self.phraseTags) + len(self.sentenceTypes)])
        self.regularCondPresenceProbs = pd.DataFrame(
            self.regularCondPresenceProbs).applymap(
                lambda x: binomialParamDist(self.binomHyperParams))
        self.regularCondPresenceProbs.columns = list(
            self.sentenceTypes) + list(self.phraseTags)
        self.regularCondPresenceProbs.index = list(self.tags)

        #Count Conditional Inclusions for Important Sentences
        for sentence in importantSentences:
            self.getInclusionsGivenParent([
                sentence, self.importantCondPresenceProbs, sentence[0],
                debug >= 2
            ])

        #Count Conditional Inclusions for Regular Sentences
        for sentence in regularSentences:
            self.getInclusionsGivenParent([
                sentence, self.regularCondPresenceProbs, sentence[0],
                debug >= 2
            ])

        if debug > 1:
            print '*********************************************************'
            print 'Presence Parameter Estimation'
            print '*********************************************************'
            print
            print ' ------------------------------------------------------------------'
            print ' Conditional Parameters for Important Sentences'
            print self.importantCondPresenceProbs
            print
            print ' ------------------------------------------------------------------'
            print ' Conditional Parameters for Regular Sentences'
            print self.regularCondPresenceProbs
            print ' ------------------------------------------------------------------'

        if debug > -1:
            print
            print
            print '...Finished'
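
    # A minimal end-to-end sketch (hypothetical sentences and labels; the
    # enclosing class is assumed to be instantiated as, e.g., `se = SySE()`):
    #
    #   se = SySE()
    #   se.train([u"Profits fell sharply last quarter.",
    #             u"The meeting starts at noon."], [1, 0])
    #   se.classify(u"Revenue dropped again.")   # -> P(important | sentence)
    #
    # train() parses raw strings with pystatparser if they are not already
    # parse trees, and classify() returns the softmaxed probability that the
    # sentence belongs to the "important" class.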
Example #48
import nltk
from stat_parser import Parser
import sys
"""Helper script to add POS tags (headline and body) and syntactic parse (headline only) to a file of training data"""
"""First argument is the input file, second argument is the output file"""

reload(sys)
sys.setdefaultencoding('utf8')

parse_dict = {}
parser = Parser()

input_file = sys.argv[1]
file_with_feats = sys.argv[2]

with open(input_file, 'rU') as f:
    with open(file_with_feats, 'w') as w:
        i = 1
        for line in f:
            print(i)
            line = line.strip()
            art_id = str(i)
            try:
                art_title, art_text, source, source_type = line.split('\t')
                try:
                    title_tokens = nltk.word_tokenize(art_title)
                    title_pos_tags = [x[1] for x in nltk.pos_tag(title_tokens)]
                except:
                    title_pos_tags = ['n/a']
                try:
                    text_tokens = nltk.word_tokenize(art_text)
Example #49
from stat_parser import Parser, display_tree


parser = Parser()

[tree1, tree2] = parser.parse("John saw Mary with the telescope")

display_tree([tree1, tree2])
Example #50
from stat_parser import Parser, display_tree


parser = Parser()

tree = parser.parse("Multiple SQL injection vulnerabilities in myBloggie 2.1.6 and earlier allow remote attackers to execute arbitrary SQL commands via the (1) cat_id or (2) year parameter to index.php in a viewuser action, different vectors than CVE-2005-1500 and CVE-2005-4225. ")

display_tree(tree)
Example #52
from stat_parser import Parser, display_tree

parser = Parser()

# http://www.thrivenotes.com/the-last-question/
tree = parser.parse(
    "How can the net amount of entropy of the universe be massively decreased?"
)

display_tree(tree)
Example #53
import nltk
import json
import yaml
import pdb

from stat_parser import Parser, display_tree
f = open("ships.yml")
f2 = open("ships2.yml")
all_tokens = []
ships = yaml.load(f.read())
ships2 = yaml.load(f2.read())
d = dict()
parser = Parser()

def is_leaf(tree):
  if len(tree.leaves()) == 1:
    return True
  else:
    return False

def top_level_leaves(tree):
  leaves = list(tree)
  is_top_level = True
  for leaf in leaves:
    if not is_leaf(leaf):
      is_top_level = False
  return is_top_level
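
# A quick hypothetical check of the helpers above using an nltk Tree:
#
#   t = nltk.Tree.fromstring("(NP (DT the) (NN dog))")
#   is_leaf(t)            # False: the NP spans two leaves
#   is_leaf(t[0])         # True: the DT subtree covers a single leaf
#   top_level_leaves(t)   # True: every direct child is a single-leaf subtree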

def check_for_roll(tree):
  leaves = list(tree)
  key = ""
Example #54
import nltk
import json
import yaml
from random import choice
from stat_parser import Parser, display_tree
parser = Parser()
d = json.load(open('tree_ship_words.json'))
f = open("ships.yml")
ships = yaml.load(f.read())
name = choice(ships)
print name.lower()
#tree = parser.parse("Anticipation of a new lover's arrival")
tree = parser.parse("they had good pretzels")
display_tree(tree)
Example #55
#! /usr/bin/env python2
import nltk
from stat_parser import Parser
import os
import pickle, string
import hw1_util

spunctuation = set(string.punctuation)
poem_names = []
poem_names.extend(['gb_poems/' + s for s in os.listdir("gb_poems")])
poem_names.extend(['lh_poems/' + s for s in os.listdir("lh_poems")])

#poem_names = ['gb_poems/Sadie_and_Maud.txt']
parser = Parser()
vocab = dict()
parsed_lines = dict()
rhymes = dict()
for poem_name in poem_names:
    with open(poem_name, 'rb') as f:
        parsed_lines[poem_name] = list()
        rhyme_count = ord('A')
        rhymes[poem_name] = list()
        print poem_name
        for line in f.read().split('\n'):
            line = line.decode('utf-8')
            line = line.replace("'", "")
            line = line.replace('"', '')
            last_word = ''
            try:
                if line == "" or line.isspace():
                    raise TypeError
Example #56
#!/usr/bin/env python
# vim: set noexpandtab tabstop=2 shiftwidth=2 softtabstop=-1 fileencoding=utf-8:

import sys
from stat_parser import Parser

parser = Parser()
print 'Init complete'
print parser.parse(sys.argv[1])
from nltk.corpus import brown
import sys
from graph import Graph,merge_graphs
from nltk.tree import ParentedTree
from stat_parser import Parser
parser = Parser()

user_sentence = sys.argv[1]
query = sys.argv[2]
trees = []
done = 0
# for sentence in brown.sents():
# 	if done >= 20:
# 		break
# 	if not query in sentence:
# 		continue
# 	if len(sentence) > 20:
# 		continue
# 	try:
# 		trees.append(parser.parse(" ".join(sentence)))
# 		done += 1
# 		print done
# 	except:
# 		print "oops couldn't parse that one"
# trees = []
# trees.append(parser.parse("The food was on the table where the child likes to eat"))
# trees.append(parser.parse("I eat food at the table"))
# trees.append(parser.parse("I eat the food that is on the table"))
# trees.append(parser.parse("The money is on the table"))
# trees.append(parser.parse("Put the data in the table"))
# trees.append(parser.parse("Add more rows to the database table"))