Example #1
from nltk.parse.corenlp import CoreNLPParser

def _create_parser(url):
    try:
        parser = CoreNLPParser(url=url)
        # Probe the server with a throwaway parse; fall back to None if unreachable.
        parser.raw_parse('This is a test sentence.')
    except Exception:
        parser = None
    return parser
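A usage sketch, assuming a CoreNLP server is already running at http://localhost:9000 (Example #14 shows the java command that starts one):

parser = _create_parser('http://localhost:9000')
if parser is not None:
    tree = next(parser.raw_parse('The quick brown fox jumps over the lazy dog.'))
    tree.pretty_print()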
Example #2
from nltk.parse.corenlp import CoreNLPParser, CoreNLPDependencyParser

class CNLP:
    CNLPServerURL = 'http://localhost:9000'

    def __init__(self):
        self.parser = CoreNLPParser(url=self.CNLPServerURL)
        self.dep_parser = CoreNLPDependencyParser(url=self.CNLPServerURL)
        self.ner_tagger = CoreNLPParser(url=self.CNLPServerURL, tagtype='ner')
        self.pos_tagger = CoreNLPParser(url=self.CNLPServerURL, tagtype='pos')

    def getParse(self, sentence):
        if isinstance(sentence, list):
            return self.parser.parse(sentence)
        else:
            return self.parser.raw_parse(sentence)

    def getDepParse(self, sentence):
        if isinstance(sentence, list):
            return self.dep_parser.parse(sentence)
        else:
            return self.dep_parser.raw_parse(sentence)

    def getNERTags(self, sentence):
        if not isinstance(sentence, list):
            sentence = sentence.split()
        return self.ner_tagger.tag(sentence)

    def getPOSTags(self, sentence):
        if not isinstance(sentence, list):
            sentence = sentence.split()
        # Mirror getNERTags: tag() returns (word, pos) pairs, whereas the
        # original parse()/raw_parse() calls returned constituency trees.
        return self.pos_tagger.tag(sentence)
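A usage sketch against the same local server; getParse and getDepParse return generators, so they are materialized with list() here:

nlp = CNLP()
print(list(nlp.getParse('The cat sat on the mat.')))
print(nlp.getNERTags('Rami Eid is studying at Stony Brook University in NY'))
print(nlp.getPOSTags('What is the airspeed of an unladen swallow ?'))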
Example #3
from nltk.parse.corenlp import CoreNLPParser

class Parser:
    def __init__(self):
        self.parser = CoreNLPParser()
        # Ignore proxy environment variables for requests to the local server.
        self.parser.session.trust_env = False

    def parse(self, sentence):
        return self.parser.raw_parse(sentence)
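A usage sketch; with no url argument, CoreNLPParser defaults to http://localhost:9000:

p = Parser()
tree = next(p.parse('Colorless green ideas sleep furiously.'))
print(tree)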
Example #4
def get_bigram_and_deep_syntax_feature(review, speller, stop_words, ps, preprocess):
    res = ""
    productions = []

    parser = CoreNLPParser(url='http://localhost:9500')

    for sentence in re.split(r"[.!?]", review):
        try:
            tree = next(parser.raw_parse(sentence))

            # Optimize by creating Chomsky normal form
            tree.collapse_unary(collapsePOS=False)
            tree.chomsky_normal_form(horzMarkov=2)
            productions += tree.productions()

        except StopIteration:
            # End of review reached
            break

    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)

    count = 0
    for line in str(grammar).split("\n"):
        if count == 0:
            count += 1
            continue
        elif "'" in line:
            res += re.sub(r"[(->) `\'\"\[\d\]]", "", line) + " "

    res += bipos.get_bigrams_and_unigrams_of_sentence(
        bow.sanitize_sentence(review, speller, stop_words, ps, preprocess))

    return res
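The PCFG-induction core of this example can be exercised on its own. A minimal sketch, assuming the same server on port 9500 and substituting toy sentences for a real review:

from nltk import Nonterminal, induce_pcfg
from nltk.parse.corenlp import CoreNLPParser

parser = CoreNLPParser(url='http://localhost:9500')
productions = []
for sentence in ['The food was great', 'The service was slow']:
    tree = next(parser.raw_parse(sentence))
    tree.collapse_unary(collapsePOS=False)
    tree.chomsky_normal_form(horzMarkov=2)
    productions += tree.productions()
print(induce_pcfg(Nonterminal('S'), productions))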
Example #5
from nltk.parse.corenlp import CoreNLPParser

def parse_tree(s):
    parser = CoreNLPParser()

    parse = next(parser.raw_parse(s))
    # parse.draw()

    return parse
Example #6
from nltk.parse.corenlp import CoreNLPParser, CoreNLPDependencyParser
from nltk.parse.dependencygraph import DependencyGraph

class NLTK_NLP:

    def __init__(self, ip_port):
        self.dep_parser = CoreNLPDependencyParser(url=ip_port)
        self.ner_parser = CoreNLPParser(url=ip_port, tagtype='ner')
        self.parser = CoreNLPParser(url=ip_port)
        self.pos_tagger = CoreNLPParser(url=ip_port, tagtype='pos')

    def generate_dependency_tree(self, sentence):
        '''what is the name of the asteroid ?'''
        dependency_tree, = self.dep_parser.raw_parse(sentence=sentence)
        return dependency_tree

    def generate_dependency_graph(self, sentence):
        '''12 {'address': 12, 'word': '.', 'lemma': '.', 'ctag': '.', 'tag': '.', 'feats': '', 'head': 1, 'deps': defaultdict(<class 'list'>, {}), 'rel': 'punct'}
        7-tuple, where the values are ``word, lemma, ctag, tag, feats, head, rel``.'''
        dependency_tree, = self.dep_parser.raw_parse(sentence=sentence)
        return DependencyGraph(dependency_tree.to_conll(10))

    def generate_constituency_tree(self, sentence):
        '''input: one question'''
        tree_list = list(self.parser.raw_parse(sentence=sentence))
        return tree_list[0]

    def get_pos(self, sentence):
        '''What is the airspeed of an unladen swallow ?
        [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]
        '''
        pos_list = list(self.pos_tagger.tag(sentence.split()))
        # tokens = nltk.word_tokenize(sentence)
        # wordpos = nltk.pos_tag(tokens)
        return pos_list

    def get_pos_by_tokens(self, tokens):
        '''What is the airspeed of an unladen swallow ?'''
        pos_list = list(self.pos_tagger.tag(tokens))
        return pos_list

    def get_ner(self, sentence):
        # tokens = 'Rami Eid is studying at Stony Brook University in NY'.split()
        '''april the 26th, 1882 is the birth date of which athletes ?
        [('april', 'DATE'), ('the', 'DATE'), ('26th', 'DATE'), (',', 'DATE'), ('1882', 'DATE'),
        ('is', 'O'), ('the', 'O'), ('birth', 'O'), ('date', 'O'), ('of', 'O'), ('which', 'O'),
        ('athletes', 'O'), ('?', 'O')]'''
        sequence_ner_tuple_list = self.ner_parser.tag(sentence.split())
        sequence_ner_list = []
        for i, (word, ner_tag) in enumerate(sequence_ner_tuple_list):
            sequence_ner_list.append(ner_tag)
        return sequence_ner_list

    def get_toknizer(self, sentence):
        return list(self.parser.tokenize(sentence))

    def find_phrases(self, tree, phrase_tag='NP'):
        return [subtree.leaves() for subtree in tree.subtrees(lambda t: t.label()==phrase_tag)]
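A usage sketch; despite the ip_port name, the constructor expects the full http://host:port URL:

nlp = NLTK_NLP('http://localhost:9000')
tree = nlp.generate_constituency_tree('What is the airspeed of an unladen swallow ?')
print(nlp.find_phrases(tree, 'NP'))
print(nlp.get_ner('april the 26th , 1882 is the birth date of which athletes ?'))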
Example #7
from nltk.parse.corenlp import CoreNLPParser

def stanford_nlp():
    parser = CoreNLPParser()
    text1 = "There is still a place for mercenaries working for NGOs."
    text2 = "The Rich Poor Gap Silences the Political Voice of the Poor"
    text3 = "Legislation against mercenaries"
    for text in [text1, text2, text3]:
        parse = next(parser.raw_parse(text))
        print(parse)
        has_sent = any(item.label() == "S" for item in parse.subtrees())
        print(has_sent)
Example #8
from nltk.parse.corenlp import CoreNLPParser

def convert_text_tree(sentence):
    """ Converts a given sentence into a sentiment treebank like tree.

    :param sentence:
        String that needs to be converted.
    :return:
        String encoding tree structure.
    """
    parser = CoreNLPParser()

    # Parse sentence in nltk tree nodes
    root, = next(parser.raw_parse(sentence))

    # Recursively build text
    return get_node_text(root)
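get_node_text is not part of this excerpt; a hypothetical sketch of such a helper, emitting sentiment-treebank-style brackets with a placeholder 0 score:

def get_node_text(node):  # hypothetical stand-in, not the original helper
    # Leaves of an nltk.Tree are plain strings.
    if isinstance(node, str):
        return node
    if len(node) == 1 and isinstance(node[0], str):
        return '(0 ' + node[0] + ')'
    return '(0 ' + ' '.join(get_node_text(child) for child in node) + ')'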
Example #9
def parse_consituency_tree(sentence_list):

    pos_parent = []
    right_sublings_list = []
    chunk_position = []
    sen = mergeWords(sentence_list)
    parser = CoreNLPParser(url="http://localhost:9000")
    parse, = parser.raw_parse(sen)
    parse.pretty_print()
    newtree = ParentedTree.convert(parse)
    leaf_values = newtree.leaves()
    for i, word in enumerate(sentence_list):
        index = find_closest_words(i, word, leaf_values)
        if index >= 0 and index < len(leaf_values):
            tree_location = newtree.leaf_treeposition(index)
            parent = newtree[tree_location[:-2]].label()
            pos_parent.append(parent)

            #####################find right_sibling###########################
            right_sibling = newtree[tree_location[:-1]].right_sibling()
            #count = calcuate_nodes((right_sibling))
            if parent == "NP" and right_sibling is not None and calcuate_nodes(
                    right_sibling) == 1:
                count = calcuate_nodes(right_sibling)
                #print(count)
                right_sublings_list.append(right_sibling.leaves()[0])
            else:
                right_sublings_list.append(" ")

            ###########################find chunk item position##########################
            height = newtree[tree_location[:-2]].height()
            # Only handle the bottom-most NP (tree height == 3)
            if parent == "NP" and height == 3:
                chunk_item_list = newtree[tree_location[:-2]].leaves()
                print(newtree[tree_location[:-2]].height())
                for j, item in enumerate(chunk_item_list):
                    if item == leaf_values[index]:
                        chunk_position.append(j + 1)
                        break

            else:
                chunk_position.append(" ")

        else:
            pos_parent.append("null")
            right_sublings_list.append("null")
            chunk_position.append(" ")
    return pos_parent, right_sublings_list, chunk_position
Example #10
from nltk.parse.corenlp import CoreNLPParser, CoreNLPDependencyParser, CoreNLPServer

class TreeParser:
    def __init__(self):
        self.parser = None
        self.server = None
        self.dependency_parser = None

    def setup(self):
        url = settings.CORENLP_URL

        if url is None:
            server = CoreNLPServer(
               settings.CORENLP_PATH,
               settings.CORENLP_MODEL_PATH,
            )
            server.start()

            self.server = server
            url = server.url

        else:
            print("[TreeParser] Using existing CoreNLP Server...")

        self.parser = CoreNLPParser(url=url)

        # maybe separated with another class...
        self.dependency_parser = CoreNLPDependencyParser(url=url)

        return self.parser

    def parse(self, sentence):
        if not self.parser:
            raise AttributeError('parser is not set up')

        return self.parser.raw_parse(sentence)

    def free(self):
        if not self.server:
            return

        self.server.stop()

    def dependency_parse(self, sentence):
        if not self.dependency_parser:
            raise AttributeError('dependency parser is not set up')

        return self.dependency_parser.raw_parse(sentence)
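A sketch of the intended lifecycle; settings is project-specific configuration, assumed here to have CORENLP_URL pointing at a running server:

tp = TreeParser()
tp.setup()
try:
    tree = next(tp.parse('The show must go on.'))
    tree.pretty_print()
finally:
    tp.free()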
Example #11
class JeopardyParser:
    def __init__(self, url='http://localhost:9000', encoding='utf8'):
        """Start the parsers to make sure they're running before calling.

        CoreNLP runs by default on port 9000, but if an external server is used
          or a different port is selected when started, the url will need to be
          explicitly passed.
        """
        self.NERT = CoreNLPNERTagger(url=url)
        self.Parser = CoreNLPParser(url=url, encoding=encoding)
        self.dep_parser = DepParser(url=url)

    def tag(self, sentence):
        """Return the sentence after tagging named entities."""
        sentence = self.clean_sentence(sentence)
        sentence = sentence.split()
        sentence = self.NERT.tag(sentence)
        return (sentence)

    def lexname(self, word, index=0):
        """Return the lexname entry for a word in WordNet."""
        synset = wordnet.synsets(word)
        lex = synset[index].lexname()
        return (lex)

    def parse(self, sentence):
        """Return the syntactic Tree object for a sentence."""
        sentence = self.clean_sentence(sentence)
        parse, = self.Parser.raw_parse(sentence)
        return (parse)

    def clean_sentence(self, sentence):
        """Remove backslash apostrophes from the data."""
        s = sentence.replace("\\'", "'")
        return (s)

    def check_syntax(self, tree, labels=('SBARQ', 'WHNP')):
        """Return True if the sentence type is one of the provided labels."""
        if tree[0].label() not in labels:
            print('Malformed question', tree[0].label())
            return (False)
        return (True)
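A usage sketch; CoreNLPNERTagger, DepParser, and wordnet are assumed to be imported in the surrounding module, with a CoreNLP server listening on port 9000:

jp = JeopardyParser()
tree = jp.parse('Which composer wrote The Magic Flute?')
print(jp.check_syntax(tree))
print(jp.lexname('composer'))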
Example #12
def test_leaves():
    parser = CoreNLPParser(url="http://localhost:9000")
    parse, = parser.raw_parse(
        "we will collect user informaiton, and google user emails")
    sen_list = [
        "we", "will", "collect", "user", "informaiton,", "and", "google",
        "user", "emails."
    ]
    parse.pretty_print()
    newtree = ParentedTree.convert(parse)
    leaf_values = newtree.leaves()

    for i, word in enumerate(sen_list):
        node_index = find_closest_words(i, word, leaf_values)
        tree_location = newtree.leaf_treeposition(node_index)
        print(i)
        print(word)
        print("---------------------")
        print(node_index)
        print(newtree[tree_location[:-1]].leaves()[0])
        print("\n")
Example #13
import os

from nltk import Tree
from nltk.parse.corenlp import CoreNLPParser

def get_parser_tree_from_phrase(phrase):
    #ini_path = "/stanford/jars"
    #os.environ['STANFORD_PARSER'] = ini_path
    #os.environ['STANFORD_MODELS'] = ini_path
    '''
    parser = stanford.StanfordParser(model_path= ini_path + "/stanford-parser-3.9.2-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
    parse_generator = parser.raw_parse(phrase)
    for line in parse_generator:
        parse_tree = line
        break
    '''
    parse_tree = Tree(None, [])
    parser = CoreNLPParser(url=os.environ['STANFORD_NLP_TOOLS'])
    try:
        parse_generator = parser.raw_parse(phrase)
        for line in parse_generator:
            parse_tree = line
            break
    except Exception:
        print('Something went wrong when trying to get the parse tree from the Stanford parser!')

    return parse_tree
Example #14
def create_grammar_of_sample(review_type, sample_id):
    # DON'T FORGET TO START THE STANFORD CORENLP SERVER BY RUNNING THIS JAVA COMMAND IN THE ROOT FOLDER:
    # java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -annotators "tokenize,ssplit,pos,lemma,parse,sentiment" -port 9500 -timeout 30000

    if review_type == 'regular':
        reader = yelp.get_regular_balanced_sample_reader(sample_id)
    elif review_type == '45stars':
        reader = yelp.get_45stars_balanced_sample_reader(sample_id)
    elif review_type == '12stars':
        reader = yelp.get_12stars_balanced_sample_reader(sample_id)

    parser = CoreNLPParser(url='http://localhost:9500')
    productions = []

    label, review = yelp.get_next_review_and_label(reader)

    while label != "-1":
        for sentence in re.split(r"[.!?]", review):
            try:
                tree = next(parser.raw_parse(sentence))

                # Optimize by creating Chomsky normal form
                tree.collapse_unary(collapsePOS=False)
                tree.chomsky_normal_form(horzMarkov=2)
                productions += tree.productions()

            except StopIteration:
                # End of review reached
                break

        # Advance to the next review only after all of its sentences are parsed
        # (originally indented inside the sentence loop, which skipped reviews).
        label, review = yelp.get_next_review_and_label(reader)

    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)

    output = open('../grammars/sample_' + review_type + "_" + str(sample_id), 'wb')
    dump(grammar, output, -1)
    output.close()
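The pickled grammar can be reloaded later. A sketch, assuming dump above is pickle.dump and using a hypothetical sample_regular_0 output file:

from pickle import load

with open('../grammars/sample_regular_0', 'rb') as f:
    grammar = load(f)
print(grammar.productions()[:10])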
Example #15
def getParser():
    parser = CoreNLPParser(url="http://localhost:9000")
    parse, = parser.raw_parse(
        "Do not interfere with, intercept, disrupt, filter, or disable any features of Google or the Twitter API,  including the Twitter Content of embedded Tweets and embedded timelines"
    )

    parse.pretty_print()

    newtree = ParentedTree.convert(parse)
    #for i, child in enumerate(newtree):
    #print(type(child))
    leaf_values = newtree.leaves()
    print(leaf_values[12:])
    if 'Twitter' in leaf_values[12:]:
        leaf_index = leaf_values.index('Twitter')
        tree_location = newtree.leaf_treeposition(leaf_index)
        print(tree_location[:-1])
        #print (newtree[tree_location[:-2]].right_sibling().leaves())
        #print (newtree[tree_location[:-2]].right_sibling().height())

        #print ( calcuate_nodes(newtree[tree_location[:-1]].right_sibling()))

        #print(newtree[tree_location[:-1]].right_sibling().height())
        #print (tree_location[:-2])
        #print (newtree[tree_location[:-1]].label())
        '''
        path = []
        for l in range(1,len(tree_location)):
            print(newtree[tree_location[:-l]].label())
            path.append(newtree[tree_location[:-l]].label())
        print(obtain_dash_list(path[::-1]))
        '''

        #subtree.left_sibling()

        parent_tree = newtree[tree_location[:-2]].leaves()
        print(parent_tree)
Example #16
class SVO:
    def __init__(self, sentence):
        config = ApplicationConfig.get_corenlp_config()
        self._parser = CoreNLPParser(url=f"http://{config['host']}:{config['port']}")
        self._dependency = CoreNLPDependencyParser(url=f"http://{config['host']}:{config['port']}")
        sentence = sentence.replace('  ', ' ')
        sentence = sentence.replace('.', '')
        self._load(sentence)
        self.original = sentence

    def get_dependency_tree(self):
        return self._dependency
    def get_parser_tree(self):
        return self.t
                                                   
    def _load(self, sentence):
        self.t = list(self._parser.raw_parse(sentence))[0]
        self.t = ParentedTree.convert(self.t)

    def show(self):
        self.t.pretty_print()
        
    def find_svo(self):
        self._queue = []

        # The sentence must be an S or NP to extract an SVO & find conjunctions
        for i in self.t.subtrees(lambda i: i.label() != 'ROOT'):
#             if i.label() in ['S','NP','SINV','SBAR','FRAG','X','PP']:
            remover = self._find_conj()

            # refresh
            for i in remover:
                self.original = self.original.replace(i, '')
            self._load(self.original) 
            self.pos = self.t.pos()
            self._root = SVONode(('main', self.t), None)
            self._queue.append(self._root)
            break
#             else:
#                 return 'Sentence can not find SVO.'  
                              
        # find SVO   
        while self._queue != []:
            self._data = self._queue.pop(0)
            tmp = list(self._data.data.flatten())
            if ',' in tmp:
                tmp.remove(',')
            if len(tmp) == 1:
                continue
            sentence = ' '.join(self._data.data.flatten())
            self.t = self._data.data

            # Find subordinate clauses, coordinating conjunctions, and participles
#             self.show()
            if self._data.relation != 'appos':
                self._find_SBAR()
#             self.show()
#             self._remove_comma()
#             self.show()
            self._data.svo = collections.defaultdict(list)

            # Find Subject
            tmp = self._find_subject()
            if isinstance(tmp, list):
                self._data.svo['subject'] = tmp
            else:
                self._data.svo['subject'] = self._add_conj(tmp)

            # Find Predicate
            tmp = self._find_predicate()
            self._data.svo['predicate'] = self._add_conj(tmp)
            
            # Find Object
            tmp = self._find_object(self._data.svo['predicate'])
            self._data.svo['object'] = self._add_conj(tmp)                
            
            self._all = collections.defaultdict(list)
            self._flatten(self._data.svo['predicate'])
            self._data.svo['object'] = self._filter(self._data.svo['object'])
            
            for s in self.t.subtrees():
                if s.label() != 'ROOT':
                    break
                else:
                    for i in self.t.subtrees(lambda i:i.label() != 'ROOT'):
                        if i.label() in ['FRAG']:
                            continue
                        if i.label() in ['S','SINV']:
                            for n in i.subtrees(lambda n: n.label() == 'S' and n != i):
                                flag = True
                                test = n
                                while test.parent():
                                    if test.parent() == i:
                                        flag = False
                                        break
                                    test = test.parent()
                                if flag:
                                    tmp = self._del(' '.join(n.flatten()))
                                    if tmp:
                                        self._refresh(n)
                                        kid = SVONode(('', self.t), self._data)
                                        self._data.child.append(kid)
                                        self._queue.append(kid)
                                break
                        break
                break
                                                   
        # Integrate
        self._result = collections.defaultdict(list)
        self._traversal(self._root)
        
        return self._result                                           
                                                   
    def _filter(self, x):
        for i in x:
            if i[1] != []:
                for j in i[1]:
                    if isinstance(j,dict):
                        for k in ['predicate', 'object']:
                            tmp = self._filter(j[k])
                            if tmp == []:
                                del j[k]
                    else:
                        if j in self._all['predicate']:
                            i[1].remove(j)
            if i[0] in self._all['predicate']:
                x.remove(i)
        return x
                                                   
    def _flatten(self, x):
        for i in x:
            self._all['predicate'].append(i[0])
            if i[1] != []:
                for j in i[1]:
                    if isinstance(j,dict):
                        for k in j.keys():
                            self._flatten(j[k])
                    else:
                        self._all['predicate'].append(j)
    
    def _traversal(self, node):
        if node.svo != None and (node.svo['subject']!=[] or node.svo['predicate']!=[] or node.svo['object']!=[]):
            self._result[node.relation].append({'subject':node.svo['subject'], 'predicate':node.svo['predicate'], 'object':node.svo['object']})
        for i in node.child:
            self._traversal(i)
    
    def _add_conj(self, tmp):
        result = []
        if isinstance(tmp, tuple):
            flag = tmp[0].split(' ')
            if len(flag) <= 5:
                for k in flag:
                    if k in self._dic.keys():
                        # Splice the conj entries back in
                        for j in self._dic[k]:
                            if j[0] == 'attr':
                                tree = list(self._parser.raw_parse(tmp[0]+' is '+j[1]))[0]
                                tree = ParentedTree.convert(tree)
                                kid = SVONode(('appos', tree), self._data)
                                self._data.child.append(kid)
                                self._queue.append(kid)
                                self._dic[k].remove(j)
#                                 a = tmp[0]
#                                 b = tmp[1]
#                                 result.append((a, b+[j[1]]))
                            else:
                                result.append((j[1], j[2]))

        if isinstance(tmp, tuple) and tmp[0] not in [x[0] for x in result]:
            result.append(tmp)
        result.reverse()
        return result
    
    def _remove_comma(self):
        for i in self.t.subtrees(lambda i:i[0] in [',', ';']):
            if i.left_sibling() and i.left_sibling().label() not in ['NP','S','VP','PP','JJ','SINV','ADJP'] and 'VB' not in i.left_sibling().label():
                if ' '.join(i.left_sibling().flatten()) != ' '.join(self.t.flatten()):
                    self._refresh(i.left_sibling())
                if ' '.join(i.flatten()) != ' '.join(self.t.flatten()):
                    self._refresh(i)
    
    # Put the removed clause into a child node
    def _child(self, a, b):
        kid = SVONode((a, b), self._data)
        self._data.child.append(kid)
        self._queue.append(kid)                                               
        self._refresh(b, a)
    
    # Can we refresh? (i.e. is the removed clause different from the whole sentence?)
    def _del(self, tmp_1):
        tmp = ' '.join(self.t.flatten())
        tmp = tmp.replace(tmp_1, '')   
        tmp = tmp.strip(',; ') 
        if tmp != '':
            return True
        else:
            return False                                       
                                                   
    def _find_SBAR(self):
        # Check for coordinating conjunctions
        for i in self.t.subtrees(lambda i: i.label() == 'CC'):
            if i.right_sibling() and i.right_sibling().label() in ['S','VP']:
                tmp = self._del(i[0]+' '+' '.join(i.right_sibling().flatten()))
                if tmp and [x for x in self._queue if ' '.join(i.right_sibling().flatten()) in ' '.join(x.data.flatten())] == []:
                    self._child(i[0], i.right_sibling())                               
                                                   
        # Check for subordinate clauses
        for node in self.t.subtrees(lambda node: node.label() == 'SBAR'):
            if 'VB' in node.pos()[0][1]:
                continue
            tmp = self._del(' '.join(node.flatten()))   
            if tmp:
                conj = []
                # Collect the conjunction words
                for s in node.subtrees(lambda s: s.label() != 'SBAR'):
                    if s.label() not in ['S','ADVP','RB'] and 'VB' not in s.label():
                        if s.leaves()[0] not in conj:
                            conj.append(s.leaves()[0])
                    elif s.label() in ['ADVP','RB']:
                        continue
                    else:
                        break
                conj = ' '.join(conj)
                for s in node.subtrees(lambda s: s.label() == 'S'):
                    # SBARs can repeat
                    if [x for x in self._queue if ' '.join(s.flatten()) in ' '.join(x.data.flatten())] == []:
                        if node.left_sibling() and node.left_sibling().label() == 'IN' and node.parent().label() != 'S':
                            tmp = self._del(' '.join(node.parent().flatten()))                       
                            if tmp:
                                self._child(conj, s)
                        else:
                            self._child(conj, s)
                    break
                                                  
        # Participles
        participle = [x[0] for x in self.t.pos() if x[1] in ['VBG','VBN']]
        for i in participle:
            if i in self.t.leaves():
                candidate = [x for x, y in enumerate(self.t.leaves()) if y == i]
                if candidate[-1] == 0:
                    pos = ''
                else:
                    before = self.t.leaves()[candidate[-1]-1]
                    pos = [x for x in self.t.pos() if x[0] == before][0][1]
                IN = ['when','while','before','after','till','since','because','as','so','although','though','if','unless','upon','once']
                                                   
                if pos == 'IN' and before.lower() in IN:
#                 candidate[-1]-2 >= 0 and 'VB' not in [x for x in self.t.pos() if x[0] == self.t.leaves()[candidate[-1]-2]][0][1]
                    for j in self.t.subtrees(lambda j: j[0] == before):
                        tmp = self._del(' '.join(j.parent().flatten()))                           
                        if tmp and j.parent().label() != 'NP' and j.right_sibling() and [x for x in self._queue if ' '.join(j.right_sibling().flatten()) in ' '.join(x.data.flatten())] == []:
                            self._child(before, j.right_sibling())
                            
                if ('VB' not in pos) and (pos not in ['IN','RB','MD','POS', 'TO']):
                    for j in self.t.subtrees(lambda j: j[0] == i):
                        tmp = self._del(' '.join(j.parent().flatten()))                                                       
                        if tmp and j.parent().label() not in ['NP','ADJP'] and j.right_sibling() and [x for x in self._queue if ' '.join(j.parent().flatten()) in ' '.join(x.data.flatten())] == []:
                            self._child('', j.parent())                       
    
                                                   
    def _refresh(self, node, conj=''):
        sentence = ' '.join(self.t.flatten())
        if conj == '':
            tmp = ' '.join(node.flatten())
        else:
            tmp = conj + ' ' + ' '.join(node.flatten())
        if tmp in sentence:
            idx = sentence.index(tmp)
            if idx-2 >= 0 and sentence[idx-2] == ',':
                tmp = ', ' + tmp
            if idx+len(tmp)+1 < len(sentence) and sentence[idx+len(tmp)+1] == ',':
                tmp = tmp +' ,'
        sentence = sentence.replace(tmp, '')
        self._load(sentence)
    
    def _find_conj(self):
        self._dic = collections.defaultdict(list)
        dep, = self._dependency.raw_parse(self.original)
        remover = []      
        pool_conj = []
        pool_appos = []
        for governor, bridge, dependent in dep.triples():
            # Coordinating conjunction
            if bridge == 'conj':
                # NN conj NN
                if 'NN' in governor[1] and 'NN' in dependent[1]:
                    address = [x['deps'] for x in dep.nodes.values() if x['word']==governor[0]][0]['conj']
                    for add in address:
                        if add not in pool_conj:
                            tmp = []
                            r = []
                            pool_conj.append(add)
                            for key, value in dep.get_by_address(add)['deps'].items():
                                if key not in ['conj', 'cc', 'nmod', 'nmod:poss']:
                                    for j in value:
                                        tmp.append(dep.get_by_address(j)['word'])
                                        r.append(dep.get_by_address(j)['word'])
                                if key in ['nmod']:
                                    r.append(dep.get_by_address(add)['word'])
                                    for j in value:
                                        for key1, value1 in dep.get_by_address(j)['deps'].items():
                                            if key1 not in ['conj', 'cc']:
                                                for k in value1:
                                                    r.append(dep.get_by_address(k)['word'])
                                        r.append(dep.get_by_address(j)['word'])
                                if key in ['nmod:poss']:
                                    for j in value:
                                        for key1, value1 in dep.get_by_address(j)['deps'].items():
                                            if key1 not in ['conj', 'cc', 'case']:
                                                for k in value1:
                                                   tmp.append(dep.get_by_address(k)['word'])
                                                   r.append(dep.get_by_address(k)['word'])
                                            if key1 in ['case']:
                                                tmp.append(dep.get_by_address(j)['word'])
                                                r.append(dep.get_by_address(j)['word'])
                                                for k in value1:
                                                   tmp.append(dep.get_by_address(k)['word'])
                                                   r.append(dep.get_by_address(k)['word'])
                                    if dep.get_by_address(j)['word'] not in tmp:
                                        tmp.append(dep.get_by_address(j)['word'])
                                        r.append(dep.get_by_address(j)['word'])    
                            if dep.get_by_address(add)['word'] not in tmp:
                                tmp.append(dep.get_by_address(add)['word'])
                            if dep.get_by_address(add)['word'] not in r:
                                r.append(dep.get_by_address(add)['word'])

                            for i in self.t.subtrees(lambda i: i.leaves() == r):
                                for n in i.subtrees(lambda n: n[0] == dependent[0]):
                                    self._dic[governor[0]].append(('entity', ' '.join(tmp), self._find_attrs(n, ' '.join(tmp))))
                                    remover.append(' '.join(r))
                                    break
                                break
                            if ' '.join(r) not in remover:
                                self._dic[governor[0]].append(('entity', ' '.join(tmp), []))
                                remover.append(' '.join(r))
                            
                    
                # VB conj VB O
                elif 'VB' in governor[1] and 'VB' in dependent[1] and governor[1] == dependent[1]:   
                    gov_key = [x['deps'] for x in dep.nodes.values() if x['word']==governor[0]][0].keys()
                    dep_key = [x['deps'] for x in dep.nodes.values() if x['word']==dependent[0]][0].keys()
                    if [j for j in gov_key if j in ['dobj','xcomp','ccomp', 'nmod', 'nsubjpass']]==[] or [j for j in dep_key if j in ['dobj','xcomp','ccomp', 'nmod', 'nsubjpass', 'nsubj']]==[]:  
                        for i in self.t.subtrees(lambda i: i[0] == dependent[0]):
                            self._dic[governor[0]].append(('entity', dependent[0],  self._find_attrs(i, dependent[0])))
                            remover.append(dependent[0])
                            break
                        
            # Apposition (returns the whole span)
            elif bridge == 'appos':
                tmp = []
                address = [x['deps'] for x in dep.nodes.values() if x['word']==governor[0]][0]['appos']
                for add in address:
                    if add not in pool_appos:
                        tmp = []
                        pool_appos.append(add)    
                        for key, value in dep.get_by_address(add)['deps'].items():
                            if key in ['compound', 'amod']:
                                for j in value:
                                    tmp.append(dep.get_by_address(j)['word'])
                            if key in ['nmod']:
                                tmp.append(dep.get_by_address(add)['word'])
                                for j in value:
                                    for key1, value1 in dep.get_by_address(j)['deps'].items():
                                        if key1 not in ['conj', 'cc']:
                                            for k in value1:
                                                tmp.append(dep.get_by_address(k)['word'])
                                    tmp.append(dep.get_by_address(j)['word'])
                        if dep.get_by_address(add)['word'] not in tmp:
                            tmp.append(dep.get_by_address(add)['word'])                        
                        self._dic[governor[0]].append(('attr', ' '.join(tmp), []))
                        remover.append(' '.join(tmp))
        
        for i in range(len(remover)):
            # All possible positions
            can = [m.start() for m in re.finditer(remover[i], self.original)]
            flag = False
            for j in can:
                if self.original[j-2] == ',':
                    remover[i] = ', ' + remover[i]
                    flag = True
                    break
                elif self.original[j-4:j-1] == 'and':
                    remover[i] = 'and ' + remover[i]
                    flag = True
                    break
            if not flag:
                remover[i] = ' ' + remover[i]
        return remover        
                                                   
    # Breadth First Search the tree and take the first noun in the NP subtree.
    def _find_subject(self):
        synonym = ['', 'which', 'that', 'who', 'whom', 'where', 'when', 'what', 'why', 'how', 'whether', 'in']
        for i in self.t.subtrees(lambda i: i.label() == 'SBAR'):
            dep, = self._dependency.raw_parse(' '.join(self.t.flatten()))
            sub = [z for x, y, z in dep.triples() if y in ['nsubj', 'nsubjpass']]
            if sub != []:
                for s in self.t.subtrees(lambda s:s[0] == sub[0][0]):
                    return self._find_NOUN(s)   
            for s in i.subtrees(lambda s: s.label() == 'NP'):
                for n in s.subtrees(lambda n: n.label().startswith('NN') or n.label() in 'PRP'):
                    return self._find_NOUN(n)
                for n in s.subtrees(lambda n: n.label() == 'DT'):
                    return (n[0], self._find_attrs(n, n[0]))
        for i in self.t.subtrees(lambda i: i.label() not in ['S', 'ROOT', 'PP', 'FRAG']):  
            # Has a subject
            dep, = self._dependency.raw_parse(' '.join(self.t.flatten()))
            sub = [z for x, y, z in dep.triples() if y in ['nsubj', 'nsubjpass']]
            if sub != []:
                for s in self.t.subtrees(lambda s:s[0] == sub[0][0]):
                    return self._find_NOUN(s)   
                                                   
            if i.label() not in ['VP','PP'] and 'VB' not in i.label():                                
                for s in self.t.subtrees(lambda s: s.label() == 'NP'): 
                    for n in s.subtrees(lambda n: n.label().startswith('NN') or n.label() == 'PRP'):                          
                        return self._find_NOUN(n)
                    for n in s.subtrees(lambda n: n.label() == 'DT'):
                        return (n[0], self._find_attrs(n, n[0]))
            
            # Imperative sentence
            elif (i.label() == 'VP' or i.label().startswith('VB')) and self._data.relation == 'main':
                if [x for x in self.t.pos()][0][1] not in ['RB','MD'] and 'VB' not in [x for x in self.t.pos()][0][1]:
                    for s in self.t.subtrees(lambda s: s.label() == 'NP'): 
                        for n in s.subtrees(lambda n: n.label().startswith('NN') or n.label() == 'PRP'):                          
                            return self._find_NOUN(n)
                        for n in s.subtrees(lambda n: n.label() == 'DT'):
                            return (n[0], self._find_attrs(n, n[0]))
                    return None
                else:
                    return None
                                                   
            # No subject & the relation is a pronoun
            elif (i.label() == 'VP' or i.label().startswith('VB')) and self._data.relation in synonym:
                dep, = self._dependency.raw_parse(self.original)
                candidate = [x for x in dep.triples() if x[1] in ['acl:relcl','acl'] and x[2][0] in self.t.flatten()]
                if candidate != []:
                    compound = self._find_compound(candidate[0][0][0], dep)
                    sub = []
                    if compound != '':
                        for com in compound:
                            sub.append(com)
                    sub.append(candidate[0][0][0])
                    return (' '.join(sub), [])
                else:
                    sent = [x[0] for x in self.pos]
                    if self._data.relation != '':
                        candidate = [x for x, y in enumerate(sent) if y == self._data.relation.split(' ')[0]]
                        after = self.t.pos()[0][0]
                    else:
                        candidate = [x for x, y in enumerate(sent) if y == self.t.pos()[0][0]]
                        if len(self.t.pos()) > 1:                               
                            after = self.t.pos()[1][0]
                        else:
                            after = ''                           
                    before = candidate[0] - 1 
                    for x in candidate:
                        if sent[x+1] == after:
                            before = x - 1
                    
                    if before == -1:
                        return None

                    # Is the preceding word in the original sentence an NN?
                    if 'NN' in [x[1] for x in self.pos if x[0] == sent[before]][0] or [x[1] for x in self.pos if x[0] == sent[before]][0] in ['PRP']:
                        sub = [sent[before]]
                        before -= 1
                        while 'NN' in [x[1] for x in self.pos if x[0] == sent[before]][0]:
                            sub.append(sent[before])
                            before -= 1
                        return (' '.join(reversed(sub)), [])
                    elif [x[1] for x in self.pos if x[0] == sent[before]][0] in ['IN',','] and 'NN' in [x[1] for x in self.pos if x[0] == sent[before-1]][0]:
                        before -= 1                               
                        sub = [sent[before]]
                        before -= 1
                        while before != -1 and 'NN' in [x[1] for x in self.pos if x[0] == sent[before]][0]:
                            sub.append(sent[before])
                            before -= 1
                        return (' '.join(reversed(sub)), [])

                    # Find the nearest candidate in the parent
                    else:
                        target = self.t.pos()[0][0]
                        if self._data.parent.svo['subject'] == []:
                            sub = -1    
                        else:
                            sub = self._data.parent.svo['subject'][0][0].split(' ')[-1]
                        if self._data.parent.svo['object'] == []:
                            obj = -1
                        else:
                            obj = self._data.parent.svo['object'][0][0].split(' ')[-1]
                        if sub == -1 and obj != -1:
                            return self._data.parent.svo['object']
                        elif sub != -1 and obj == -1:
                            return self._data.parent.svo['subject']
                        elif sub != -1 and obj != -1:
                            if abs(self.original.find(target)-self.original.find(sub)) <= abs(self.original.find(target)-self.original.find(obj)):
                                return self._data.parent.svo['subject']
                            else:
                                return self._data.parent.svo['object']

            # No subject & the relation is a conjunction
            elif i.label() == 'VP' or i.label().startswith('VB'):                                   
                if self._data.parent != None:
                    return self._data.parent.svo['subject']
            else:                                  
                return None
                                                   
    def _find_compound(self, word, dep):
        deps = [x['deps'] for x in dep.nodes.values() if x['word'] == word]
        com = []
        deps = [x for x in deps if 'compound' in x]                                           
        for i in deps:
            for j in i['compound']:
                com.append(dep.get_by_address(j)['word'])  
        deps = [x for x in deps if 'dep' in x]                                           
        for i in deps:
            com.append(dep.get_by_address(i['dep'][0])['word'])                                            
        return com
                                                   
    
    def _compound(self, compound, before):
        obj = []
        if compound != '':
            for n in self.t.subtrees(lambda n:n[0] == before):
                for com in compound:
                    for s in n.parent().subtrees(lambda s:s[0] == com):
                        obj.append(com)
        return obj
                                                   
                                                   
    def _dobj(self, candidate, dep, before):
        if 'dobj' in candidate.keys():
            word = dep.nodes[candidate['dobj'][0]]['word']
            tag = dep.nodes[candidate['dobj'][0]]['tag']
        else:
            word = dep.nodes[candidate['xcomp'][0]]['word']
            tag = dep.nodes[candidate['xcomp'][0]]['tag'] 
        compound = self._find_compound(word, dep)
        obj = self._compound(compound, before)
        if tag != 'TO':
            for n in self.t.subtrees(lambda n:n[0] == before):
                for s in n.parent().subtrees(lambda s:s[0] == word):
                    obj.append(s[0])
                    return (' '.join(obj), self._find_attrs(s, ' '.join(obj)))                                           
        

    def _find_object(self, predicate, node = '', data = ''):
        if node == '':
            node = self.t
        if data == '':
            data = self._data
        synonym = ['which', 'that', 'who', 'whom']                                          
        if data != None and data.relation == 'appos':
            dep, = self._dependency.raw_parse(' '.join(node.flatten()))
        else:
            dep, = self._dependency.raw_parse(self.original)
        for i in predicate:
            pre = i[0].split(' ')
            for j in range(len(pre)-1, -1, -1):
                if len([x['deps'] for x in dep.nodes.values() if x['word']==pre[j]]) > 1:
                    dep, = self._dependency.raw_parse(' '.join(node.flatten()))

                candidate = [x['deps'] for x in dep.nodes.values() if x['word']==pre[j]][0]
                candidate_1 = [x for x in dep.triples() if x[2][0]==pre[j]]
                                                   
                if 'dobj' in candidate.keys() or 'xcomp' in candidate.keys():
                    return self._dobj(candidate, dep, pre[j])
                                                   
                elif 'ccomp' in candidate.keys():
                    word = dep.nodes[candidate['ccomp'][0]]['word']
                    tag = dep.nodes[candidate['ccomp'][0]]['tag']
                    dic = collections.defaultdict(list)
                    deps = [x['deps'] for x in dep.nodes.values() if x['word'] == word][0]
                                                   
                    if 'nsubj' in deps.keys():
                        compound = self._find_compound(dep.get_by_address(deps['nsubj'][0])['word'], dep)
                        obj = self._compound(compound, pre[j])
                        obj.append(dep.get_by_address(deps['nsubj'][0])['word'])
                        if 'dobj' in deps.keys() or 'xcomp' in deps.keys():
                            for n in self.t.subtrees(lambda n:n[0] == word):
                                dic['predicate'].append((word, self._find_attrs(n, word))) 
                            dic['object'] = self._add_conj(self._dobj(deps, dep, word))
                            return (' '.join(obj), [dic])
                     
                    elif 'dobj' in deps.keys():
                        compound = self._find_compound(dep.get_by_address(deps['dobj'][0])['word'], dep)
                        obj = self._compound(compound, pre[j])
                        for n in self.t.subtrees(lambda n:n[0] == dep.get_by_address(deps['dobj'][0])['word']):
                            obj.append(n[0])
                            return (' '.join(obj), self._find_attrs(n, ' '.join(obj)))
#                     else:
#                         return None
                                                   
                elif 'cop' in [x[1] for x in candidate_1]:
                    tmp = [x for x in candidate_1 if x[1] == 'cop'][0]
                    compound = self._find_compound(tmp[0][0], dep)
                    obj = self._compound(compound, pre[j])
                    for j in self.t.subtrees(lambda j:j[0] == tmp[0][0]):
                        obj.append(j[0])
                        return (' '.join(obj), self._find_attrs(j, ' '.join(obj)))    
                elif 'case' in [x[1] for x in candidate_1]:
                    tmp = [x for x in candidate_1 if x[1] == 'case'][0]
                    compound = self._find_compound(tmp[0][0], dep)
                    obj = self._compound(compound, pre[j])
                    for j in self.t.subtrees(lambda j:j[0] == tmp[0][0]):
                        obj.append(j[0])
                        return (' '.join(obj), self._find_attrs(j, ' '.join(obj)))
                                                   
                elif 'auxpass' in candidate.keys():
                    sent = [x[0] for x in self.pos]
                    if data != None and data.relation in synonym:
                        relation = sent.index(data.relation.split(' ')[0])
                        if 'IN' in [x[1] for x in self.pos if x[0] == sent[relation]][0]:
                            return (sent[relation-1], [])
                    return None
                                
                # No object
                elif data != None and data.relation in synonym:
                    sent = [x[0] for x in self.pos]
                    before = sent.index(data.relation.split(' ')[0])-1
                    # Is the preceding word in the original sentence an NN?
                    if 'NN' in [x[1] for x in self.pos if x[0] == sent[before]][0]:
                        return (sent[before], [])
                    elif 'IN' in [x[1] for x in self.pos if x[0] == sent[before]][0] and 'NN' in [x[1] for x in self.pos if x[0] == sent[before-1]][0]:
                        return (sent[before-1], [])
                    elif data.child != []:
                        kid = data.child[0]
                        if kid.relation != 'appos':
                            return (kid.relation+' '+' '.join(kid.data.flatten()), [])
                    else:
                        return None

                # The object is a clause
                elif data != None and data.child != []:
                    kid = data.child[0]
                    if kid.relation != 'appos':
                        return (kid.relation+' '+' '.join(kid.data.flatten()), [])
                elif [x for x in dep.nodes.values() if x['word']==pre[j]][0]['tag'] == 'RP':
                    continue
                else:
                    return None
                                                   
    def _find_predicate(self):
        tmp = self.t.flatten()
        for n in self.t.subtrees(lambda n: n.label().startswith('VB')):
            if n.parent().label() in ['ADJP']:
                continue
            i = tmp.index(n[0])
            sub = []
            while self.t.pos()[i-1][1] in ['MD','RB']:
                sub.append(self.t.pos()[i-1][0])
                i -= 1
            sub.reverse()
            i = tmp.index(n[0])
            while i+1 < len(tmp):
                if [x[1] for x in self.t.pos() if x[0] == tmp[i+1]][0].startswith('VB') or [x[1] for x in self.t.pos() if x[0] == tmp[i+1]][0] == 'RP':
                    sub.append(tmp[i])
                    i += 1
                elif [x[1] for x in self.t.pos() if x[0] == tmp[i+1]][0] in ['RB','MD']:
                    if i+2 >= len(tmp):
                        break
                    count = i+2
                    while count+1 < len(tmp) and [x[1] for x in self.t.pos() if x[0] == tmp[count]][0] in ['RB','MD']:
                        count += 1
                    if count < len(tmp) and [x[1] for x in self.t.pos() if x[0] == tmp[count]][0].startswith('VB') or [x[1] for x in self.t.pos() if x[0] == tmp[count]][0] == 'TO':
                        sub.append(tmp[i])
                        i += 1
                    else:
                        break
                else:
                    break
            flag = i
            sub.append(tmp[flag])
            # Infinitive
            for j in self.t.subtrees(lambda j:j[0] == tmp[flag]):
                if j.right_sibling() and j.right_sibling().label() == 'PP' and j.right_sibling().leaves()[0] != 'to':
                    start = tmp.index(j.right_sibling().leaves()[-1])
                    has_PP = True
                else:
                    start = flag
                    has_PP = False

                if start+1 < len(tmp) and tmp[start+1] == 'to':
                    for i in range(start+1, len(tmp)):                                                   
                        if [x[1] for x in self.t.pos() if x[0] == tmp[i]][0].startswith('VB') or [x[1] for x in self.t.pos() if x[0] == tmp[i]][0] in ['TO','RB']:
                            sub.append(tmp[i])
                            if [x[1] for x in self.t.pos() if x[0] == tmp[i]][0].startswith('VB'):
                                flag = i
                        else:
                            break

                    if has_PP:
                        for i in self.t.subtrees(lambda i:i[0] == sub[-1]):
                            return (' '.join(sub), self._find_attrs(i, ' '.join(sub)))
                    else:
                        for i in self.t.subtrees(lambda i:i[0] == tmp[flag]):
                            return (' '.join(sub), self._find_attrs(i, ' '.join(sub)))
                else:
                    for i in self.t.subtrees(lambda i:i[0] == tmp[flag]):
                        return (' '.join(sub), self._find_attrs(i, ' '.join(sub)))
                                                   
           
    def _find_NOUN(self, n):
        # Possessive
        if n.parent().right_sibling() and n.parent().right_sibling().label().startswith('NN'):
            sub = n.parent().leaves()
            p = n.parent()
            while p.right_sibling():
                if p.right_sibling().label().startswith('NN') or p.right_sibling().label() in ['PRP','CD','DT']:
                    p = p.right_sibling()
                    sub.append(p[0])   
                else:
                    break
            return (' '.join(sub), self._find_attrs(p, ' '.join(sub)))
        else:
            sub = []
            pp = n.parent()   
            flag = ''
            for l in pp:
                if l.label().startswith('NN') or l.label() in ['PRP','CD','DT']:
                    if l[0] not in sub:
                        sub.append(l[0])
                        flag = l 
            if flag == '':
                sub.append(n[0])
                flag = n
            return (' '.join(sub), self._find_attrs(flag, ' '.join(sub)))
                                                   
    def _find_to(self, node):
        dic = collections.defaultdict(list)
        flag = node.leaves().index('to')
        tmp = node.leaves()[flag:]
        predicate = []
        for i in tmp:
            if [x[1] for x in self.t.pos() if x[0] == i][0] in 'TO' or 'VB' in [x[1] for x in self.t.pos() if x[0] == i][0]:
                predicate.append(i)
            else:
                break    
        for n in node.subtrees(lambda n: n[0] == predicate[-1]):        
            dic['predicate'].append((' '.join(predicate), self._find_attrs(n, ' '.join(predicate))))
        if predicate[-1] == 'be':
            for n in node.subtrees(lambda n: n.label() in ['NP', 'PP']):
                if n.label() in ['NP', 'PP']:
                    for c in n.subtrees(lambda c: c.label().startswith('NN') or c.label() in ['PRP', 'CD']):
                        a = self._find_NOUN(c)
                        dic['object'] = self._add_conj(a)
                        return dic
        else:
            tmp = self._find_object(dic['predicate'], node, None)
            dic['object'] = self._add_conj(tmp)
            return dic 
                                                   
    def _toV(self, node):
        # There may be multiple occurrences of the same word
        flat = list(self.t.flatten())
        candidate = [x for x, y in enumerate(flat) if y == node[0]]
        flag = candidate[0]
        if node.left_sibling():
            before = node.left_sibling().leaves()[-1]
            for i in candidate:
                if flat[i-1] == before:
                    flag = i
                    break
        elif node.right_sibling():
            after = node.right_sibling().leaves()[0]
            for i in candidate:
                if flat[i+1] == after:
                    flag = i
                    break 
        elif node.parent().left_sibling():
            before = node.parent().left_sibling().leaves()[-1]
            for i in candidate:
                if flat[i-1] == before:
                    flag = i
                    break
        elif node.parent().right_sibling():
            after = node.parent().right_sibling().leaves()[0]
            for i in candidate:
                if flat[i+1] == after:
                    flag = i
                    break 
        
        if not node.label().startswith('VB') and flag+2 < len(flat) and flat[flag+1] == 'to' and [x[1] for x in self.t.pos() if x[0] == flat[flag+2]][0] in 'VB':
            for i in self.t.subtrees(lambda i: i[0] == 'to'):                                 
                if flat[flag] not in i.parent().flatten():
                    return i.parent()

        else:
            return None
               
    def _PP(self, s, name, attrs):
        if ' '.join(s.flatten()) not in name:
            if len(s[0]) != 1:
                for i in s.subtrees(lambda i: i.label() == 'PP'):
                    if i.parent() == s:
                        a = self._proposition(i)
                        if a != []:
                            attrs.append(a)
                        else:
                            attrs.append(' '.join(s.flatten()))
            else:
                a = self._proposition(s)
                if a != []:
                    attrs.append(a)
                else:
                    attrs.append(' '.join(s.flatten()))
        return attrs
                                                   
                                                   
    def _find_attrs(self, node, name): 
        attrs = []
        p = node.parent()
        toV = self._toV(node)
        name = name.split(' ')
        # Search siblings of adjective for adverbs
        if node.label().startswith('JJ'):
            for s in p:
                if s.label() == 'RB':
                    if s[0] not in name:
                        attrs.append(s[0])
                elif s.label() == 'PP':
                    attrs = self._PP(s, name, attrs)
                elif s.label() == 'NP':
                    if ' '.join(s.flatten()) not in name:
                        attrs.append(' '.join(s.flatten()))                 

        elif node.label().startswith('NN') or node.label() in ['PRP', 'CD', 'DT']:
            for s in p:
                if s != node and (s.label() in ['DT','PRP$','POS','CD','IN'] or s.label().startswith('JJ') or s.label().startswith('NN')):
                    if s[0] not in name:
                        attrs.append(s[0])
                elif s != node and s.label() in ['ADJP','NP','QP', 'VP']:
                    if ' '.join(s.flatten()) not in name:
                        attrs.append(' '.join(s.flatten()))
                elif s != node and s.label() in ['PP']:
                    attrs = self._PP(s, name, attrs)

        # Search siblings of verbs for adverb phrase
        elif node.label().startswith('VB'):   
            for s in p:
#                 if s.label() in ['ADVP','MD','RB']:
                if s.label() in ['ADVP', 'RB', 'MD']:
                    if ' '.join(s.flatten()) not in name:
                        attrs.append(' '.join(s.flatten()))

                elif s.label() == 'PP':
                    attrs = self._PP(s, name, attrs)

            
        # Search uncles
        # if the node is noun or adjective search for prepositional phrase
        if node.label().startswith('JJ') or node.label().startswith('NN') or node.label() in ['PRP', 'CD', 'DT']:
            if p.label() == 'QP':
                p = p.parent()
            for s in p.parent():
                if s != p and s.label() in ['PP']:
                    attrs = self._PP(s, name, attrs)
                elif s != p and ('NN' in s.label() or s.label() == 'JJ'):
                    if s[0] not in name:
                        attrs.append(s[0])
                elif s != p and s.label() == 'VP' and s.parent().label() == 'NP':
                    if ' '.join(s.flatten()) not in name:
                        if toV != None:
                            if ' '.join(s.flatten()[:3]) != ' '.join(toV.flatten()[:3]):
                                attrs.append(' '.join(s.flatten()))
                        else:
#                             self._refresh(s)
                            attrs.append(' '.join(s.flatten()))

        elif node.label().startswith('VB') or node.label() == 'RP':
            if p.parent():
                tmp = node
                for s in p.parent():
                    if s != p and s.label().startswith('ADVP'):
                        if ' '.join(s.flatten()) not in name:
                            attrs.append(' '.join(s.flatten()))
    #                 elif s != p and s.label() in ['MD','RB']:
    #                     attrs.append(s[0])
                    elif s != p and s.label() == 'PP' and s == tmp.right_sibling():       
                        attrs = self._PP(s, name, attrs)
                        tmp = s
        
        if toV != None:
            attrs.append(self._find_to(toV))
            self._refresh(toV) 
        
        return attrs  
                                                   
    def _proposition(self, node):
        dic = collections.defaultdict(list)
        tmp = node.leaves()
        if len(tmp) == 1:
            return []
        for k in node.subtrees(lambda k: k.label() in ['IN', 'TO']):  
            if tmp.index(k[0])+1 < len(tmp):
                VB = [x for x in node.pos() if x[0] == tmp[tmp.index(k[0])+1]]
                if VB != [] and 'VB' in VB[0][1]:                                   
                    dic['predicate'].append((k[0]+' '+VB[0][0], []))
                else:
                    dic['predicate'].append((k[0], []))  
            else:
                dic['predicate'].append((k[0], []))                                   
            if k.right_sibling():
                for c in k.right_sibling().subtrees(lambda c: c.label().startswith('NN') or c.label() in ['JJ', 'CD']):
                    # possessive / compound noun span
                    if c.parent().right_sibling() and c.parent().right_sibling().label().startswith('NN'):
                        sub = c.parent().leaves()
                        p = c.parent()
                        while p.right_sibling():
                            if p.right_sibling().label().startswith('NN') or p.right_sibling().label() in ['PRP','CD']:
                                p = p.right_sibling()
                                sub.append(p[0])
                                flag = p
                            else:
                                break
                    else:
                        sub = []
                        pp = c.parent()
                        for l in pp:
                            if l.label().startswith('NN') or l.label() in ['PRP','CD', 'JJ']:
                                if l[0] not in sub:
                                    sub.append(l[0])
                                    flag = l
                    dic['object'].append((' '.join(sub), self._find_attrs(flag, ' '.join(sub))))
                    dic['object'] = self._add_conj(dic['object'][0])   
                    return dic
                return []
            else:
                return []
        return []                                           
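# Hedged illustration (an addition, not part of the snippet above): the
# to-infinitive pattern that _toV/_find_to chase can be reproduced standalone,
# assuming a CoreNLP server on localhost:9000.
from nltk.parse.corenlp import CoreNLPParser
from nltk.tree import ParentedTree

_parser = CoreNLPParser(url='http://localhost:9000')
_tree = ParentedTree.convert(next(_parser.raw_parse('She decided to leave early.')))
for _sub in _tree.subtrees(lambda s: s[0] == 'to'):
    print(' '.join(_sub.parent().leaves()))  # -> to leave early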
Example #17
0
import sys

from nltk.parse.corenlp import CoreNLPDependencyParser, CoreNLPParser

parser = CoreNLPParser(url='http://localhost:9000')

dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')

parses = dep_parser.parse('What is the airspeed of an unladen swallow ?'.split())

dp = [[(governor, dep, dependent) for governor, dep, dependent in parse.triples()] for parse in parses]
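# For reference (an addition, not from the original): with a stock English
# model the triples come out roughly as below; exact relations vary across
# CoreNLP versions.
#   (('What', 'WP'), 'cop', ('is', 'VBZ'))
#   (('What', 'WP'), 'nsubj', ('airspeed', 'NN'))
#   (('airspeed', 'NN'), 'det', ('the', 'DT'))
#   ...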


#normal_parse = parser.raw_parse("What does the important inscription on the tomb of Ankhtifi, a nomarch during the early First Intermediate Periodi, describe?")

text = "During the Old Kingdom, the king of Egypt (not called the Pharaoh until the New Kingdom) became a living god who ruled absolutely and could demand the services and wealth of his subjects."

actual_parse = parser.raw_parse(text)

actual_tree = [t for t in actual_parse][0]

actual_tree.pretty_print()

def findRelatives(t, label=None, word=None):

    if label is None and word is None:
        print("please specify either the label or the word to search for")
        return None

    q = [(t,[])]

    curr = 0
    while curr < len(q):
        # NOTE: the original snippet is truncated here; the loop body below is
        # a hedged completion of the breadth-first search implied above.
        subtree, path = q[curr]
        curr += 1
        if label is not None and subtree.label() == label:
            print(path, '->', ' '.join(subtree.leaves()))
        for child in subtree:
            if isinstance(child, str):
                if word is not None and child == word:
                    print(path, '->', child)
            else:
                q.append((child, path + [subtree.label()]))
Example #18
0
import requests
import spacy
import wikipedia
from nltk.parse.corenlp import CoreNLPParser
datum = (wikipedia.page(title="School").content)
#doc = nlp(datum)

print(requests.post(
    'http://[::]:9000/?properties={"annotators":"tokenize,ssplit,pos","outputFormat":"json"}',
    data={
        'data': datum
    }).text)
#server.start()

question = []
nlp = spacy.load('en_core_web_sm')

parser = CoreNLPParser()
parse = next(parser.raw_parse("I put the book in the box on the table."))
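# Hedged aside (an addition): the PP-attachment ambiguity in this sentence is
# visible by listing the prepositional phrases in the parse.
for pp in parse.subtrees(lambda s: s.label() == 'PP'):
    print(' '.join(pp.leaves()))  # e.g. 'in the box on the table', 'on the table'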


class SST():
    def __init__(self, label, children):
        self.label = label
        self.children = children


# Sentence Structure Leaf
class SSL():
    def __init__(self, label):
        self.label = label


simple_predicate = SST('ROOT', [SST('S', [SSL('NP'), SSL('VP'), SSL('.')])])
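
# Hedged helper sketch (an addition, not from the original): collapse an
# nltk.Tree into the SST/SSL skeleton above, keeping `depth` levels of
# structure; on a simple declarative parse it reproduces simple_predicate.
def to_skeleton(tree, depth=2):
    if isinstance(tree, str):   # bare leaf token
        return SSL(tree)
    if depth == 0:              # cut off here: keep only the phrase label
        return SSL(tree.label())
    return SST(tree.label(), [to_skeleton(child, depth - 1) for child in tree])

skeleton = to_skeleton(parse)  # SST('ROOT', [SST('S', [SSL('NP'), SSL('VP'), SSL('.')])])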
Example #19
0
import os
import tempfile
from pathlib import Path
from subprocess import Popen
from sys import stderr
from zipfile import ZipFile

import wget
from nltk.parse.corenlp import CoreNLPParser


class StanfordClient:
    def __init__(self, core_nlp_version: str = '2018-10-05', annotators=None):
        if annotators is None or not isinstance(annotators, list):
            annotators = ['openie', 'dcoref']
        self.remote_url = 'https://nlp.stanford.edu/software/stanford-corenlp-full-{}.zip'.format(
            core_nlp_version)
        self.install_dir = Path('~/.stanfordnlp_resources/').expanduser()
        self.install_dir.mkdir(exist_ok=True)
        if not (self.install_dir / Path(
                'stanford-corenlp-full-{}'.format(core_nlp_version))).exists():
            print('Downloading from %s.' % self.remote_url)
            output_filename = wget.download(self.remote_url,
                                            out=str(self.install_dir))
            print('\nExtracting to %s.' % self.install_dir)
            zf = ZipFile(output_filename)
            zf.extractall(path=self.install_dir)
            zf.close()

        os.environ['CORENLP_HOME'] = str(
            self.install_dir / 'stanford-corenlp-full-{}'.format(core_nlp_version))
        from stanfordnlp.server import CoreNLPClient
        self.client = CoreNLPClient(annotators=annotators, memory='8G')
        self.parser = CoreNLPParser()

    def parse(self,
              text: str,
              properties_key: str = None,
              properties: dict = None,
              output_format='json'):
        core_nlp_output = self.client.annotate(text=text,
                                               annotators=['parse'],
                                               output_format=output_format,
                                               properties_key=properties_key,
                                               properties=properties)
        return core_nlp_output

    def nltk_parse(self, text: str):
        return [tree for tree in self.parser.raw_parse(text)][0]

    def pos(self,
            text: str,
            properties_key: str = None,
            properties: dict = None):
        core_nlp_output = self.client.annotate(text=text,
                                               annotators=['pos'],
                                               output_format='json',
                                               properties_key=properties_key,
                                               properties=properties)
        return core_nlp_output

    def kbp(self,
            text: str,
            properties_key: str = None,
            properties: dict = None,
            simple_format: bool = True):
        core_nlp_output = self.client.annotate(text=text,
                                               annotators=['kbp'],
                                               output_format='json',
                                               properties_key=properties_key,
                                               properties=properties)
        if simple_format:
            return self.__parse_triples(core_nlp_output, key='kbp')
        else:
            return core_nlp_output

    def openie(self,
               text: str,
               properties_key: str = None,
               properties: dict = None,
               simple_format: bool = True):
        """
        :param (str | unicode) text: raw text for the CoreNLPServer to parse
        :param (str) properties_key: key into properties cache for the client
        :param (dict) properties: additional request properties (written on top of defaults)
        :param (bool) simple_format: whether to return the full format of CoreNLP or a simple dict.
        :return: Depending on simple_format: full or simpler format of triples <subject, relation, object>.
        """
        # https://stanfordnlp.github.io/CoreNLP/openie.html
        core_nlp_output = self.client.annotate(text=text,
                                               annotators=['openie'],
                                               output_format='json',
                                               properties_key=properties_key,
                                               properties=properties)
        if simple_format:
            return self.__parse_triples(core_nlp_output, key='openie')
        else:
            return core_nlp_output

    @staticmethod
    def __parse_triples(core_nlp_output, key):
        triples = []
        for sentence in core_nlp_output['sentences']:
            for triple in sentence[key]:
                triples.append({
                    'subject': triple['subject'],
                    'relation': triple['relation'],
                    'object': triple['object']
                })
        return triples

    def coref(self,
              text: str,
              properties_key: str = None,
              properties: dict = None,
              simple_format: bool = True):
        core_nlp_output = self.client.annotate(text=text,
                                               annotators=['dcoref'],
                                               output_format='json',
                                               properties_key=properties_key,
                                               properties=properties)
        if simple_format:
            chains = []
            for _, chain in core_nlp_output['corefs'].items():
                if len(chain) > 1:
                    # there is a coreference found
                    chains.append([link['text'] for link in chain])
            return chains
        return core_nlp_output

    def generate_graphviz_graph(self,
                                text: str,
                                png_filename: str = './out/graph.png'):
        """
       :param (str | unicode) text: raw text for the CoreNLPServer to parse
       :param (list | string) png_filename: list of annotators to use
       """
        entity_relations = self.openie(text, simple_format=True)
        """digraph G {
        # a -> b [ label="a to b" ];
        # b -> c [ label="another label"];
        }"""
        graph = list()
        graph.append('digraph {')
        for er in entity_relations:
            graph.append('"{}" -> "{}" [ label="{}" ];'.format(
                er['subject'], er['object'], er['relation']))
        graph.append('}')

        output_dir = os.path.dirname(png_filename)
        if output_dir and not os.path.exists(output_dir):
            os.makedirs(output_dir)

        out_dot = os.path.join(tempfile.gettempdir(), 'graph.dot')
        with open(out_dot, 'w') as output_file:
            output_file.write('\n'.join(graph))

        command = 'dot -Tpng {} -o {}'.format(out_dot, png_filename)
        dot_process = Popen(command, stdout=stderr, shell=True)
        dot_process.wait()
        assert not dot_process.returncode, 'ERROR: Call to dot exited with a non-zero code status.'

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        pass

    def __del__(self):
        self.client.stop()
        if 'CORENLP_HOME' in os.environ:
            del os.environ['CORENLP_HOME']
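

# Hedged usage sketch (an addition): the first instantiation downloads the
# CoreNLP distribution (several hundred MB) and needs a local Java runtime.
if __name__ == '__main__':
    with StanfordClient(annotators=['openie']) as client:
        print(client.openie('Barack Obama was born in Hawaii.'))
        # e.g. [{'subject': 'Barack Obama', 'relation': 'was born in',
        #        'object': 'Hawaii'}]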
Example #20
0
import os
import nltk
from nltk.parse.corenlp import CoreNLPServer
from nltk.parse.corenlp import CoreNLPParser
from nltk.parse.corenlp import CoreNLPDependencyParser

STANFORD = "stanford-corenlp-full-2018-10-05"

jars = (
    os.path.join(STANFORD, "stanford-corenlp-3.9.2.jar"),
    os.path.join(STANFORD, "stanford-corenlp-3.9.2-models.jar"),
)

text = "turn right and go up the stairs and stand at the top."
#text = "Walk out of the closet and into the hallway. Walk through the hallway entrance on the left. Stop just inside the entryway."
#text = "Turn, putting the exit of the building on your left. Walk to the end of the entrance way and turn left. Travel across the kitchen area with the counter and chairs on your right. Continue straight until you reach the dining room. Enter the room and stop and wait one meter from the closest end of the long dining table."
print(text)
with CoreNLPServer(*jars):

    parser = CoreNLPParser()
    for i in parser.parse_text(text):
        print(i)

    parser = CoreNLPDependencyParser()
    for i in parser.raw_parse(text):
        print(i)
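
    # Hedged extension (an addition): still inside the server block, the
    # dependency graphs can also be dumped in CoNLL format.
    for dg in parser.raw_parse(text):
        print(dg.to_conll(4))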
Example #21
0
comp_sent_rec = [
    'Install App and import local SMS from the device.',
    'hey i have a problem and that is after building project when i try to edit the saved video, app crashesh on multiple devices (m above).',
    'During multiple input of identical values e.g. 10 km (by day) and 10 litres some calculations fail. E.g. first calculates correcty 100 l/km but then next two results are \"infinity l / 100 km.',
    'Plotting will death lock program then and you have to restart device.'

]
result = []

for sent in comp_sent_rec:



# with open(sys.argv[1], 'r') as file:
#     lines = file.readlines()
#     for sent in lines:
        parse = next(parser.raw_parse(sent))
        t = Tree.fromstring(str(parse))

        t.pretty_print()
        # print(t.leaves())

        for i in break_up_sent(t):
            result.append(i)
            print(i)
        # result = break_up_sent(t)
        # print(result)
        # print()

# for i in result:
#     print(i)
Example #22
0
with open(filename, encoding='utf-8') as f:
    file_content = f.read().replace('\n', '')
    file_content = ' '.join(file_content.split())
    lines = sent_tokenize(file_content)
    lines = [line for line in lines if line != ''
             and len(word_tokenize(line)) <= 10
             and line[-1] in '.?!'
             and line[0].isupper()]
    print(len(lines))
    wrong_lines_count = 0
    pic_count = 0
    for i, line in enumerate(lines):
        if wrong_lines_count == 5:
            break
        print('Original line: ' + line)
        tree = next(parser.raw_parse(line))
        if pic_count < 5 and len(word_tokenize(line)) == 10:
            filename = get_valid_filename(line)
            TreeView(tree)._cframe.print_to_file(filename + '.ps')
            pic_count += 1
        errors = ATD.checkDocument(line)
        if len(list(errors)) == 0:
            print('**No errors** ({}/{})'.format(i + 1, len(lines)))
            continue
        else:
            print()
        correct_line = correct(line, errors)
        tree.pretty_print()
        print('Correct line: ' + correct_line)
        correct_tree = next(parser.raw_parse(correct_line))
        correct_tree.pretty_print()
Example #23
0
 def show_parse_tree(self, query):
     # show it with tree
     from nltk.parse.corenlp import CoreNLPParser
     cnlp = CoreNLPParser('http://localhost:9000')
     next(cnlp.raw_parse(query)).draw()
Example #24
0
path_list = []
examples = []
augment_examples = []

res = open(tag_path, 'w')

with open(train_path) as f:
    for i, line in enumerate(f):
        label = line[0]
        text = line[2:]
        if i % 100 == 0:
            print(i)

        path_list = []
        try:
            parsed_sen = next(parser.raw_parse(text))
        except requests.exceptions.HTTPError:
            print(str(i + 1) + ' : pass sentence')
            continue
        except StopIteration:
            print(str(i + 1) + ' : StopIteration')
            continue

        path_list = traverse_tree(parsed_sen, word_path)
    
        tag_dict = {}
        for p in position_set:
            tag_dict[p] = []
        
        for words in path_list:
            length = len(words[1])
Example #25
0
 def parser_generator(_words):
     lemma_sent = " ".join(_words)
     parser = CoreNLPParser(url='http://localhost:9000')
     # next(parser.raw_parse(lemma_sent)).pretty_print()
     return parser.raw_parse(lemma_sent)
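 # Hedged usage sketch (an addition): feed pre-lemmatized tokens and take the
 # first parse from the returned generator.
 #     tree = next(parser_generator(['the', 'dog', 'barks']))
 #     tree.pretty_print()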
Example #26
0
    else:
        label = tree.label()
        #         print(type(label))
        return reduce(lambda x, y: Tree(label, (binarize(x), binarize(y))),
                      tree)


# In[7]:
import os

parser = CoreNLPParser(url='http://localhost:8000')

# In[8]:

my_sentence = "I love you."
t, = parser.raw_parse(my_sentence)
# t.draw()
bt = binarize(t)
# bt.draw()
tree = bt.pformat()
input = SenTreeTest.getTree(tree)
# input.draw()
model.eval()

# print(type(dev[0]))
# print(type(input))
predictions, loss = model.getLoss(input)
# print((predictions.data))
# print((model.labelList.data))
pred = predictions.data
label = model.labelList.data
Example #27
0
            lisDic[i] = jaccard_distance(question, token_sen)
        minInd = min(lisDic, key=lisDic.get)
        relevent.append(sentences[minInd])

    return relevent


def ans_type(question, relevent):
    wh_root = determine_wh(question)
    if wh_root in aux_words:
        ans_binary(question, relevent)
    else:
        ans_wh(question, relevent)


q_tree = parser.raw_parse(question)
r_tree = parser.raw_parse(relevent)

q_tree = list(parser.raw_parse(questions[0]))
r_tree = list(parser.raw_parse(relevent[0]))

newtree = ParentedTree.convert(r_tree[0])
newtree.draw()

for subtree in newtree:
    print(subtree.left_sibling())

print(q_tree)
for item in q_tree:
    print(item.label())
Example #28
0
import collections
import re

from nltk.parse.corenlp import CoreNLPDependencyParser, CoreNLPParser
from nltk.tree import ParentedTree

# ApplicationConfig and SVONode are project-local helpers not shown in this
# snippet.


class SVO():
    def __init__(self, sentence):
        config = ApplicationConfig.get_corenlp_config()
        self._parser = CoreNLPParser(
            url=f"http://{config['host']}:{config['port']}")
        self._dependency = CoreNLPDependencyParser(
            url=f"http://{config['host']}:{config['port']}")
        sentence = sentence.replace('  ', ' ')
        sentence = sentence.replace('.', '')
        self._load(sentence)
        self.original = sentence
#         self._ner = self._parser.tag(sentence.split(' '))

    def _load(self, sentence):
        self.t = list(self._parser.raw_parse(sentence))[0]
        self.t = ParentedTree.convert(self.t)

    def show(self):
        self.t.pretty_print()

    def find_svo(self):
        self._queue = []

        # the sentence must be an S or NP to extract SVO; also collect conjunctions
        for i in self.t.subtrees(lambda i: i.label() != 'ROOT'):
            if i.label() in ['S', 'NP']:
                remover = self._find_conj()
                #                 print(remover)
                # refresh
                for i in remover:
                    self.original = self.original.replace(i, '')
                self._load(self.original)
                self.pos = self.t.pos()
                self._root = SVONode(('main', self.t), None)
                self._queue.append(self._root)
                break
            else:
                #                 return [], []
                return 'Sentence can not find SVO.'

        # find SVO
        while self._queue != []:
            data = self._queue.pop(0)
            sentence = ' '.join(data.data.flatten())
            self._load(sentence)
            # find subordinate clauses, coordinating conjunctions, and participles
            #             self.show()
            self._find_SBAR(data)
            #             self.show()
            self._remove_comma()
            #             self.show()
            data.svo = collections.defaultdict(list)

            # Find Subject
            tmp = self._find_subject(data)
            if isinstance(tmp, list):
                data.svo['subject'] = tmp
            else:
                data.svo['subject'] = self._add_conj(tmp)
            #             print(data.svo['subject'])

            # Find Predicate
            tmp = self._find_predicate()
            data.svo['predicate'] = self._add_conj(tmp)
            #             print(data.svo['predicate'])

            # Find Object
            tmp = self._find_object(data, data.svo['predicate'])
            data.svo['object'] = self._add_conj(tmp)
            #             print(data.svo['object'])

        # Integrate
        result = collections.defaultdict(list)
        result = self._traversal(self._root, result)

        return result

    def _traversal(self, node, result):
        if node.svo['subject'] != [] or node.svo[
                'predicate'] != [] or node.svo['object'] != []:
            result[node.relation].append({
                'subject': node.svo['subject'],
                'predicate': node.svo['predicate'],
                'object': node.svo['object']
            })
        for i in node.child:
            result = self._traversal(i, result)
        return result

    def _add_conj(self, tmp):
        result = []
        if isinstance(tmp, tuple):
            flag = tmp[0].split(' ')
            if len(flag) <= 5:
                for k in flag:
                    if k in self._dic.keys():
                        # splice the conjoined partners back in
                        for j in self._dic[k]:
                            if j[0] == 'attr':
                                a = tmp[0]
                                b = tmp[1]
                                result.append((a, b + [j[1]]))
                            else:
                                result.append((j[1], j[2]))

        if isinstance(tmp, tuple) and tmp[0] not in [x[0] for x in result]:
            result.append(tmp)
        return result

    def _remove_comma(self):
        for i in self.t.subtrees(lambda i: i[0] == ','):
            if i.left_sibling() and i.left_sibling().label() not in [
                    'NP', 'S', 'VP'
            ]:
                if ' '.join(i.left_sibling().flatten()) != ' '.join(
                        self.t.flatten()):
                    self._refresh(i.left_sibling())
            if ' '.join(i.flatten()) != ' '.join(self.t.flatten()):
                self._refresh(i)

    def _find_SBAR(self, data):
        # any coordinating conjunctions?
        for i in self.t.subtrees(lambda i: i.label() == 'CC'):
            if i.right_sibling() and i.right_sibling().label() in ['S', 'VP']:
                if [
                        x for x in self._queue if ' '.join(i.right_sibling(
                        ).flatten()) in ' '.join(x.data.flatten())
                ] == [] and i[0] + ' ' + ' '.join(
                        i.right_sibling().flatten()) != ' '.join(
                            self.t.flatten()):
                    kid = SVONode((i[0], i.right_sibling()), data)
                    data.child.append(kid)
                    self._queue.append(kid)
                    # refresh
                    sentence = ' '.join(self.t.flatten())
                    tmp = i[0] + ' ' + ' '.join(i.right_sibling().flatten())
                    sentence = sentence.replace(tmp, '')
                    self._load(sentence)

        # any subordinate clauses?
        for node in self.t.subtrees(lambda node: node.label() == 'SBAR'):
            if ' '.join(node.flatten()) != ' '.join(self.t.flatten()):
                conj = []
                # 連接詞
                for s in node.subtrees(lambda s: s.label() != 'SBAR'):
                    if s.label() != 'S':
                        if s.leaves()[0] not in conj:
                            conj.append(s.leaves()[0])
                    else:
                        break
                conj = ' '.join(conj)
                for s in node.subtrees(lambda s: s.label() == 'S'):
                    # SBAR can match repeatedly
                    if [
                            x for x in self._queue if ' '.join(s.flatten()) in
                            ' '.join(x.data.flatten())
                    ] == []:
                        kid = SVONode((conj, s), data)
                        data.child.append(kid)
                        self._queue.append(kid)
                        if node.left_sibling() and node.left_sibling().label(
                        ) == 'IN' and node.parent().label() != 'S':
                            self._refresh(node.parent())
                        else:
                            self._refresh(node)
                        break

        # participles
        participle = [x[0] for x in self.t.pos() if x[1] in ['VBG', 'VBN']]
        for i in participle:
            if i in self.t.leaves():
                candidate = [
                    x for x, y in enumerate(self.t.leaves()) if y == i
                ]
                before = self.t.leaves()[candidate[-1] - 1]
                pos = [x for x in self.t.pos() if x[0] == before][0][1]
                if pos == 'IN' and candidate[-1] - 2 >= 0 and 'VB' not in [
                        x for x in self.t.pos()
                        if x[0] == self.t.leaves()[candidate[-1] - 2]
                ][0][1]:
                    for j in self.t.subtrees(lambda j: j[0] == before):
                        if j.parent().label() != 'NP' and j.right_sibling(
                        ) and [
                                x for x in self._queue
                                if ' '.join(j.right_sibling().flatten()) in
                                ' '.join(x.data.flatten())
                        ] == [] and ' '.join(j.parent().flatten()) != ' '.join(
                                self.t.flatten()):
                            kid = SVONode((before, j.right_sibling()), data)
                            data.child.append(kid)
                            self._queue.append(kid)
                            self._refresh(j.parent())
                elif ('VB' not in pos) and (pos
                                            not in ['IN', 'RB', 'MD', 'POS']):
                    for j in self.t.subtrees(lambda j: j[0] == i):
                        if j.parent().label() not in [
                                'NP', 'ADJP'
                        ] and j.right_sibling() and [
                                x for x in self._queue if ' '.join(j.parent(
                                ).flatten()) in ' '.join(x.data.flatten())
                        ] == [] and ' '.join(j.parent().flatten()) != ' '.join(
                                self.t.flatten()):
                            kid = SVONode(('', j.parent()), data)
                            data.child.append(kid)
                            self._queue.append(kid)
                            self._refresh(j.parent())

    def _refresh(self, node):
        sentence = ' '.join(self.t.flatten())
        tmp = ' '.join(node.flatten())
        sentence = sentence.replace(tmp, '')
        self._load(sentence)

    def _find_conj(self):
        self._dic = collections.defaultdict(list)
        dep, = self._dependency.raw_parse(self.original)
        remover = []
        for governor, bridge, dependent in dep.triples():
            # coordinating conjunction
            if bridge == 'conj':
                # NN conj NN
                if 'NN' in governor[1] and 'NN' in dependent[1]:
                    tmp = []
                    for key, value in [
                            x['deps'] for x in dep.nodes.values()
                            if x['word'] == dependent[0]
                    ][0].items():
                        if key not in ['conj', 'cc']:
                            tmp.append(dep.get_by_address(value[0])['word'])
                    tmp.append(dependent[0])
                    for i in self.t.subtrees(lambda i: i[0] == dependent[0]):
                        self._dic[governor[0]].append(
                            ('entity', ' '.join(tmp),
                             self._find_attrs(i, ' '.join(tmp))))
                        remover.append(' '.join(tmp))
                        break

                # VB conj VB O
                elif 'VB' in governor[1] and 'VB' in dependent[1]:
                    gov_key = [
                        x['deps'] for x in dep.nodes.values()
                        if x['word'] == governor[0]
                    ][0].keys()
                    dep_key = [
                        x['deps'] for x in dep.nodes.values()
                        if x['word'] == dependent[0]
                    ][0].keys()
                    if [j for j in gov_key if j in ['dobj', 'xcomp', 'ccomp']
                        ] == [] or [
                            j for j in dep_key
                            if j in ['dobj', 'xcomp', 'ccomp']
                        ] == []:
                        for i in self.t.subtrees(
                                lambda i: i[0] == dependent[0]):
                            self._dic[governor[0]].append(
                                ('entity', dependent[0],
                                 self._find_attrs(i, dependent[0])))
                            remover.append(dependent[0])
                            break

            # apposition (return the whole span)
            elif bridge == 'appos':
                tmp = []
                for i in [
                        x['deps'] for x in dep.nodes.values()
                        if x['word'] == dependent[0]
                ][0].values():
                    tmp.append(dep.get_by_address(i[0])['word'])
                tmp.append(dependent[0])
                self._dic[governor[0]].append(('attr', ' '.join(tmp), []))
                remover.append(' '.join(tmp))

        for i in range(len(remover)):
            # all possible positions of the matched span
            can = [m.start() for m in re.finditer(remover[i], self.original)]
            for j in can:
                if self.original[j - 2] == ',':
                    remover[i] = ', ' + remover[i]
                    break
                elif self.original[j - 4:j - 1] == 'and':
                    remover[i] = 'and ' + remover[i]
                    break
        return remover

    # Breadth First Search the tree and take the first noun in the NP subtree.
    def _find_subject(self, data):
        synonym = [
            '', 'which', 'that', 'who', 'whom', 'where', 'when', 'what', 'why',
            'how', 'whether', 'in'
        ]
        for i in self.t.subtrees(
                lambda i: i.label() != 'S' and i.label() != 'ROOT'):
            # a subject is present
            if i.label() not in ['VP', 'PP'] and 'VB' not in i.label():
                for s in self.t.subtrees(lambda t: t.label() == 'NP'):
                    for n in s.subtrees(lambda n: n.label().startswith('NN') or
                                        n.label() == 'PRP'):
                        return self._find_NOUN(n)
                    for n in s.subtrees(lambda n: n.label() == 'DT'):
                        return (n[0], self._find_attrs(n, n[0]))

            # no subject & the relation is a pronoun
            elif i.label() != 'S' and i.label(
            ) == 'VP' and data.relation in synonym:
                sent = [x[0] for x in self.pos]
                if data.relation != '':
                    candidate = [
                        x for x, y in enumerate(sent)
                        if y == data.relation.split(' ')[0]
                    ]
                    after = self.t.pos()[0][0]
                else:
                    candidate = [
                        x for x, y in enumerate(sent)
                        if y == self.t.pos()[0][0]
                    ]
                    after = self.t.pos()[1][0]
                before = candidate[0] - 1
                for x in candidate:
                    if sent[x + 1] == after:
                        before = x - 1

                # is the preceding word in the original sentence a noun?
                if 'NN' in [x[1]
                            for x in self.pos if x[0] == sent[before]][0] or [
                                x[1] for x in self.pos if x[0] == sent[before]
                            ][0] in ['PRP']:
                    sub = [sent[before]]
                    before -= 1
                    while 'NN' in [
                            x[1] for x in self.pos if x[0] == sent[before]
                    ][0]:
                        sub.append(sent[before])
                        before -= 1
                    return (' '.join(reversed(sub)), [])
                elif [x[1] for x in self.pos if x[0] == sent[before]
                      ][0] in ['IN', ','] and 'NN' in [
                          x[1] for x in self.pos if x[0] == sent[before - 1]
                      ][0]:
                    before -= 1
                    sub = [sent[before]]
                    before -= 1
                    while 'NN' in [
                            x[1] for x in self.pos if x[0] == sent[before]
                    ][0]:
                        sub.append(sent[before])
                        before -= 1
                    return (' '.join(reversed(sub)), [])

                # otherwise take the nearest candidate from the parent
                else:
                    target = self.t.pos()[0][0]
                    if data.parent.svo['subject'] == []:
                        sub = -1
                    else:
                        sub = data.parent.svo['subject'][0][0].split(' ')[-1]
                    if data.parent.svo['object'] == []:
                        obj = -1
                    else:
                        obj = data.parent.svo['object'][0][0].split(' ')[-1]
                    if sub == -1 and obj != -1:
                        return data.parent.svo['object']
                    elif sub != -1 and obj == -1:
                        return data.parent.svo['subject']
                    elif sub != -1 and obj != -1:
                        if abs(
                                self.original.find(target) -
                                self.original.find(sub)) <= abs(
                                    self.original.find(target) -
                                    self.original.find(obj)):
                            return data.parent.svo['subject']
                        else:
                            return data.parent.svo['object']

            # no subject & the relation is a conjunction
            elif i.label() != 'S' and (i.label() == 'VP'
                                       or i.label().startswith('VB')):
                if data.parent != None:
                    return data.parent.svo['subject']
            else:
                return None

    def _find_compound(self, word, dep):
        deps = [x['deps'] for x in dep.nodes.values() if x['word'] == word][0]
        if 'compound' in deps:
            return dep.get_by_address(deps['compound'][0])['word']
        else:
            return ''

    def _find_object(self, data, predicate):
        synonym = ['which', 'that', 'who', 'whom']
        dep, = self._dependency.raw_parse(' '.join(self.t.flatten()))
        for i in predicate:
            pre = i[0].split(' ')
            for j in range(len(pre) - 1, -1, -1):
                for governor, bridge, dependent in dep.triples():
                    if governor[0] == pre[j] and bridge in ['dobj', 'xcomp']:
                        obj = []
                        compound = self._find_compound(dependent[0], dep)
                        if compound != '':
                            obj.append(compound)
                        if dependent[1] != 'TO':
                            for j in self.t.subtrees(
                                    lambda j: j[0] == dependent[0]):
                                obj.append(j[0])
                                return (' '.join(obj),
                                        self._find_attrs(j, ' '.join(obj)))
                    elif governor[0] == pre[j] and bridge == 'ccomp':
                        dic = collections.defaultdict(list)
                        deps = [
                            x['deps'] for x in dep.nodes.values()
                            if x['word'] == dependent[0]
                        ][0]
                        if 'nsubj' in deps:
                            obj = []
                            compound = self._find_compound(
                                dep.get_by_address(deps['nsubj'][0])['word'],
                                dep)
                            if compound != '':
                                obj.append(compound)
                            obj.append(
                                dep.get_by_address(deps['nsubj'][0])['word'])
                            if 'dobj' in deps:
                                dic['predicate'].append(dependent[0])
                                for j in self.t.subtrees(
                                        lambda j: j[0] == dep.get_by_address(
                                            deps['dobj'][0])['word']):
                                    dic['object'].append(
                                        (j[0], self._find_attrs(j, j[0])))
                                return (' '.join(obj), [dic])
                        elif 'dobj' in deps:
                            obj = []
                            compound = self._find_compound(
                                dep.get_by_address(deps['dobj'][0])['word'],
                                dep)
                            if compound != '':
                                obj.append(compound)
                            for j in self.t.subtrees(
                                    lambda j: j[0] == dep.get_by_address(deps[
                                        'dobj'][0])['word']):
                                obj.append(j[0])
                                return (' '.join(obj),
                                        self._find_attrs(j, ' '.join(obj)))
                    elif dependent[0] == pre[j] and bridge == 'cop':
                        obj = []
                        compound = self._find_compound(governor[0], dep)
                        if compound != '':
                            obj.append(compound)
                        for j in self.t.subtrees(
                                lambda j: j[0] == governor[0]):
                            obj.append(j[0])
                            return (' '.join(obj),
                                    self._find_attrs(j, ' '.join(obj)))

        # no object found through the dependencies
        if data != None and data.relation in synonym:
            sent = [x[0] for x in self.pos]
            before = sent.index(data.relation.split(' ')[0]) - 1
            # is the preceding word in the original sentence a noun?
            if 'NN' in [x[1] for x in self.pos if x[0] == sent[before]][0]:
                return (sent[before], [])
            elif 'IN' in [x[1] for x in self.pos if x[0] == sent[before]
                          ][0] and 'NN' in [
                              x[1]
                              for x in self.pos if x[0] == sent[before - 1]
                          ][0]:
                return (sent[before - 1], [])

        # the object is a clause
        elif data != None and data.child != []:
            kid = data.child[0]
            return (kid.relation + ' ' + ' '.join(kid.data.flatten()), [])
        else:
            return None

    def _find_predicate(self):
        for s in self.t.subtrees(lambda s: s.label() == 'VP'):
            tmp = s.flatten()
            for n in s.subtrees(lambda n: n.label().startswith('VB')):
                i = tmp.index(n[0])
                sub = []
                while i + 1 < len(tmp):
                    if [x[1] for x in self.t.pos()
                            if x[0] == tmp[i + 1]][0].startswith('VB'):
                        sub.append(tmp[i])
                        i += 1
                    elif [x[1] for x in self.t.pos()
                          if x[0] == tmp[i + 1]][0] in ['RB', 'MD']:
                        count = i + 2
                        while count < len(tmp) and [
                                x[1]
                                for x in self.t.pos() if x[0] == tmp[count]
                        ][0] in ['RB', 'MD']:
                            count += 1
                        if count < len(tmp) and [
                                x[1]
                                for x in self.t.pos() if x[0] == tmp[count]
                        ][0].startswith('VB'):
                            sub.append(tmp[i])
                            i += 1
                        else:
                            break
                    else:
                        break
                flag = i
                sub.append(tmp[flag])
                # infinitive
                for j in self.t.subtrees(lambda j: j[0] == tmp[flag]):
                    if j.right_sibling() and j.right_sibling().label(
                    ) == 'PP' and j.right_sibling().leaves()[0] != 'to':
                        start = tmp.index(j.right_sibling().leaves()[-1])
                        has_PP = True
                    else:
                        start = flag
                        has_PP = False

                    if start + 1 < len(tmp) and tmp[start + 1] == 'to':
                        for i in range(start + 1, len(tmp)):
                            if [x[1] for x in self.t.pos() if x[0] == tmp[i]
                                ][0].startswith('VB') or [
                                    x[1]
                                    for x in self.t.pos() if x[0] == tmp[i]
                                ][0] == 'TO':
                                sub.append(tmp[i])
                            else:
                                break

                        if has_PP:
                            for i in self.t.subtrees(
                                    lambda i: i[0] == sub[-1]):
                                return (' '.join(sub),
                                        self._find_attrs(i, ' '.join(sub)))
                        else:
                            for i in self.t.subtrees(
                                    lambda i: i[0] == tmp[flag]):
                                return (' '.join(sub),
                                        self._find_attrs(i, ' '.join(sub)))
                    else:
                        for i in self.t.subtrees(lambda i: i[0] == tmp[flag]):
                            return (' '.join(sub),
                                    self._find_attrs(i, ' '.join(sub)))

        for s in self.t.subtrees(lambda s: s.label().startswith('VB')):
            return (s[0], [])

    def _find_NOUN(self, n):
        # possessive / compound noun span
        if n.parent().right_sibling() and n.parent().right_sibling().label(
        ).startswith('NN'):
            sub = n.parent().leaves()
            p = n.parent()
            while p.right_sibling():
                if p.right_sibling().label().startswith(
                        'NN') or p.right_sibling().label() in ['PRP', 'CD']:
                    p = p.right_sibling()
                    sub.append(p[0])
                else:
                    break
            return (' '.join(sub), self._find_attrs(p, ' '.join(sub)))
        else:
            sub = []
            pp = n.parent()
            for l in pp:
                if l.label().startswith('NN') or l.label() in ['PRP', 'CD']:
                    if l[0] not in sub:
                        sub.append(l[0])
                        flag = l
            return (' '.join(sub), self._find_attrs(flag, ' '.join(sub)))

    def _find_to(self, node):
        dic = collections.defaultdict(list)
        tmp = node.flatten()
        predicate = []
        for i in tmp:
            if [x[1]
                    for x in self.t.pos() if x[0] == i][0] == 'TO' or 'VB' in [
                        x[1] for x in self.t.pos() if x[0] == i
                    ][0]:
                predicate.append(i)
            else:
                break
        dic['predicate'].append((' '.join(predicate), []))
        if predicate[-1] == 'be':
            for n in node.subtrees(lambda n: n.label() in ['NP', 'PP']):
                if n.label() in ['NP', 'PP']:
                    for c in n.subtrees(lambda c: c.label().startswith('NN') or
                                        c.label() in ['PRP', 'CD']):
                        a = self._find_NOUN(c)
                        dic['object'] = self._add_conj(a)
                        return dic
        else:
            tmp = self._find_object(None, dic['predicate'])
            dic['object'] = self._add_conj(tmp)
            return dic

    def _find_attrs(self, node, name):
        attrs = []
        p = node.parent()
        flat = list(self.t.flatten())
        # there may be multiple occurrences of the same word
        candidate = [x for x, y in enumerate(flat) if y == node[0]]
        flag = candidate[0]
        if node.left_sibling():
            before = node.left_sibling().leaves()[-1]
            for i in candidate:
                if flat[i - 1] == before:
                    flag = i
                    break
        elif node.right_sibling():
            after = node.right_sibling().leaves()[0]
            for i in candidate:
                if flat[i + 1] == after:
                    flag = i
                    break

        if not node.label().startswith('VB') and flag + 2 < len(flat) and flat[
                flag + 1] == 'to' and [
                    x[1] for x in self.t.pos() if x[0] == flat[flag + 2]
                ][0] == 'VB':
            for i in self.t.subtrees(lambda i: i[0] == 'to'):
                if flat[flat.index(node[0]) + 2] in i.parent().flatten():
                    toV = i.parent()
        else:
            toV = None

        # Search siblings of adjective for adverbs
        if node.label().startswith('JJ'):
            for s in p:
                if s.label() == 'RB':
                    if s[0] not in name:
                        attrs.append(s[0])
                elif s.label() == 'PP':
                    if ' '.join(s.flatten()) not in name:
                        a = self._proposition(s)
                        if a != []:
                            attrs.append(a)
                elif s.label() == 'NP':
                    if ' '.join(s.flatten()) not in name:
                        attrs.append(' '.join(s.flatten()))

        elif node.label().startswith('NN') or node.label() in [
                'PRP', 'CD', 'DT'
        ]:
            for s in p:
                if s != node and (s.label() in [
                        'DT', 'PRP$', 'POS', 'CD', 'IN', 'VBG', 'VBN'
                ] or s.label().startswith('JJ')):
                    if s[0] not in name:
                        attrs.append(s[0])
                elif s != node and s.label() in ['ADJP', 'NP', 'QP', 'VP']:
                    if ' '.join(s.flatten()) not in name:
                        attrs.append(' '.join(s.flatten()))

        # Search siblings of verbs for adverb phrase
        elif node.label().startswith('VB'):
            tmp = node
            for s in p:
                #                 if s.label() in ['ADVP','MD','RB']:
                if s.label() in ['ADVP', 'RB', 'MD']:
                    if ' '.join(s.flatten()) not in name:
                        attrs.append(' '.join(s.flatten()))
                        tmp = s
                elif s.label() == 'PP' and s == tmp.right_sibling():
                    if ' '.join(s.flatten()) not in name:
                        a = self._proposition(s)
                        if a != []:
                            attrs.append(a)
                            tmp = s

        # Search uncles
        # if the node is noun or adjective search for prepositional phrase
        if node.label().startswith('JJ') or node.label().startswith(
                'NN') or node.label() in ['PRP', 'CD', 'DT']:
            for s in p.parent():
                if s != p and s.label() in ['PP', 'IN']:
                    if ' '.join(s.flatten()) not in name:
                        a = self._proposition(s)
                        if a != []:
                            attrs.append(a)
                elif s != p and s.label() == 'VP' and s.parent().label(
                ) == 'NP':
                    if ' '.join(s.flatten()) not in name:
                        self._refresh(s)
                        attrs.append(' '.join(s.flatten()))
#                 # infinitive
#                 elif s != p and s.label() == 'S' and 'to' in s.flatten() and s.flatten() != toV.flatten():
#                     attrs.append(self._find_to(s))

        elif node.label().startswith('VB'):
            for s in p.parent():
                if s != p and s.label().startswith('ADVP'):
                    if ' '.join(s.flatten()) not in name:
                        attrs.append(' '.join(s.flatten()))
#                 elif s != p and s.label() in ['MD','RB']:
#                     attrs.append(s[0])
                elif s != p and s.label() == 'PP' and s == node.right_sibling(
                ):
                    if ' '.join(s.flatten()) not in name:
                        a = self._proposition(s)
                        if a != []:
                            attrs.append(a)

        if toV != None:
            attrs.append(self._find_to(toV))
            self._refresh(toV)

        return attrs

    def _proposition(self, node):
        dic = collections.defaultdict(list)
        for k in node.subtrees(lambda k: k.label() in ['IN', 'TO']):
            dic['predicate'].append((k[0], []))
            if k.right_sibling():
                for c in k.right_sibling().subtrees(lambda c: c.label(
                ).startswith('NN') or c.label() in ['JJ', 'CD']):
                    # possessive / compound noun span
                    if c.parent().right_sibling() and c.parent().right_sibling(
                    ).label().startswith('NN'):
                        sub = c.parent().leaves()
                        p = c.parent()
                        while p.right_sibling():
                            if p.right_sibling().label().startswith(
                                    'NN') or p.right_sibling().label() in [
                                        'PRP', 'CD'
                                    ]:
                                p = p.right_sibling()
                                sub.append(p[0])
                            else:
                                break
                    else:
                        sub = []
                        pp = c.parent()
                        for l in pp:
                            if l.label().startswith('NN') or l.label() in [
                                    'PRP', 'CD', 'JJ'
                            ]:
                                if l[0] not in sub:
                                    sub.append(l[0])
                                    flag = l
                    dic['object'].append((' '.join(sub), []))
                    return dic

            else:
                return []
        return []
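
# Hedged usage sketch (an addition): assumes ApplicationConfig points at a
# reachable CoreNLP server and that SVONode is importable; the sentence is
# illustrative.
svo = SVO('John, my neighbor, bought a car and sold his bike.')
svo.show()
print(svo.find_svo())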
Example #29
0
from nltk.parse.corenlp import CoreNLPParser


def standford_parse_tree(sentence):
    parser = CoreNLPParser()
    return next(parser.raw_parse(sentence))
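
# Hedged usage sketch (an addition, assumes a CoreNLP server on the default
# http://localhost:9000):
standford_parse_tree('Time flies like an arrow.').pretty_print()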