Example #1
def main():
    sents = 0
    words_tot = 0
    yngve_tot = 0
    frazier_tot = 0
    nodes_tot = 0
    for line in sys.stdin:
        if line.strip() == "":
            continue
        t = Tree.parse(line)
        words = calc_words(t)
        words_tot += words
        sents += 1
        yngve = calc_yngve(t, 0)
        yngve_avg = float(yngve)/words
        yngve_tot += yngve_avg
        nodes = calc_nodes(t)
        nodes_avg = float(nodes)/words
        nodes_tot += nodes_avg
        frazier = calc_frazier(t, 0, "")
        frazier_avg = float(frazier)/words
        frazier_tot += frazier_avg
        # print "Sentence=%d\twords=%d\tyngve=%f\tfrazier=%f\tnodes=%f" % (sents, words, yngve_avg, frazier_avg, nodes_avg)
    yngve_avg = float(yngve_tot)/sents
    frazier_avg = float(frazier_tot)/sents
    nodes_avg = float(nodes_tot)/sents
    words_avg = float(words_tot)/sents
    print "Total\tsents=%d\twords=%f\tyngve=%f\tfrazier=%f\tnodes=%f" % (sents, words_avg, yngve_avg, frazier_avg, nodes_avg)
Example #2
def load_ace_file(textfile, fmt):
    print('  - %s' % os.path.split(textfile)[1])
    annfile = textfile+'.tmx.rdc.xml'

    # Read the xml file, and get a list of entities
    entities = []
    xml = ET.parse(open(annfile)).getroot()
    for entity in xml.findall('document/entity'):
        typ = entity.find('entity_type').text
        for mention in entity.findall('entity_mention'):
            if mention.get('TYPE') != 'NAME': continue # only NEs
            s = int(mention.find('head/charseq/start').text)
            e = int(mention.find('head/charseq/end').text)+1
            entities.append( (s, e, typ) )

    # Read the text file, and mark the entities.
    text = open(textfile).read()
    
    # Strip XML tags, since they don't count towards the indices
    text = re.sub('<(?!/?TEXT)[^>]+>', '', text)

    # Blank out anything before/after <TEXT>
    def subfunc(m): return ' '*(m.end()-m.start()-6)
    text = re.sub('[\s\S]*<TEXT>', subfunc, text)
    text = re.sub('</TEXT>[\s\S]*', '', text)

    # Simplify quotes
    text = re.sub("``", ' "', text)
    text = re.sub("''", '" ', text)

    entity_types = set(typ for (s,e,typ) in entities)

    # Binary distinction (NE or not NE)
    if fmt == 'binary':
        i = 0
        toks = nltk.Tree('S', [])
        for (s,e,typ) in sorted(entities):
            if s < i: s = i # Overlapping!  Deal with this better?
            if e <= s: continue
            toks.extend(nltk.word_tokenize(text[i:s]))
            toks.append(nltk.Tree('NE', text[s:e].split()))
            i = e
        toks.extend(nltk.word_tokenize(text[i:]))
        yield toks

    # Multiclass distinction (NE type)
    elif fmt == 'multiclass':
        i = 0
        toks = nltk.Tree('S', [])
        for (s,e,typ) in sorted(entities):
            if s < i: s = i # Overlapping!  Deal with this better?
            if e <= s: continue
            toks.extend(nltk.word_tokenize(text[i:s]))
            toks.append(nltk.Tree(typ, text[s:e].split()))
            i = e
        toks.extend(nltk.word_tokenize(text[i:]))
        yield toks

    else:
        raise ValueError('bad fmt value')
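
The tag-stripping regex above can be checked in isolation. A minimal sketch with a made-up ACE-style fragment (the later blanking and quote-normalization steps are omitted):

import re

raw = '<DOC><TEXT>Douglas <ENAMEX TYPE="PERSON">Hurd</ENAMEX> spoke.</TEXT></DOC>'
# Remove every tag except <TEXT>/</TEXT>, so character offsets inside TEXT stay aligned
print(re.sub(r'<(?!/?TEXT)[^>]+>', '', raw))   # <TEXT>Douglas Hurd spoke.</TEXT>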
Example #3
    def test_current_production(self):
        inputs_ = [("""
                (S
                    (sentence
                        (type_1_sentence_coord_1
                        (type_1_sentence_coord_2
                            (type_2_sentence
                            (THERE There)
                            (AUX is)
                            (Noun_Phrase
                                (det (DET an))
                                (Noun_w_support
                                (Adj_phrase
                                    (Adj_core (JJ small))
                                    (AND and)
                                    (Adj_phrase (Adj_core (JJ red))))
                                (Noun_Count (NN apple)))))))
                        (PERIOD .)))
                """, Production(Nonterminal("S"), [Nonterminal("sentence")]))]

        for i, (input_, expect_) in enumerate(inputs_):
            tree = Tree.parse(input_)
            production = current_production(tree)

            self.assertEqual(expect_, production)
Example #4
 def parse_trees(self, flatten=False):
   trees = []
   for sentence in self.result['sentences']:
     ptree = Tree.parse(sentence['parsetree'])
     if flatten:
       ptree = flatten_deeptree(ptree)
     trees.append(ptree)
   return trees
Example #5
 def parse_trees(self, flatten=False):
     trees = []
     for sentence in self.result['sentences']:
         ptree = Tree.parse(sentence['parsetree'])
         if flatten:
             ptree = flatten_deeptree(ptree)
         trees.append(ptree)
     return trees
Example #6
 def loadHeadTrees(self,filename):
     """load trees with head annotated with ps2ds"""
     trees = []
     inf = codecs.open(filename,'r','utf-8')
     for s in inf.readlines():
         head_tree = Tree.parse(s)
         head_tree = Tree('TOP',[head_tree]) # coordinate with original tree structure
         trees.append(head_tree)
     return trees
Example #7
def get_semantics_from_parse_tree(parse_tree_string):
    """Take a string representation of a parse tree and return the semantic
    parse: a list of tuples, each containing a VerbNet frame and its
    associated tree."""
    parse_tree = Tree.parse(parse_tree_string)
    # parse_tree.draw()

    split_clause_dict = split_clauses(parse_tree)

    for key, (clause, conjunction) in split_clause_dict.items():
        activized_clause = activize_clause(clause)
        split_clause_dict[key] = (activized_clause, conjunction)

    result_list = []

    for position, (clause, conjunction) in split_clause_dict.items():
        split_tree_dict = split_conjunctions(clause)

        if conjunction != "":
            result_list.append(conjunction)

        for split, (split_tree, conjunction) in split_tree_dict.items():
            if conjunction != "":
                result_list.append(conjunction)

            for tree in split_tree:
                tree = existential_there_insertion(tree)
                tree = invert_clause(tree)
                tree = wh_movement(tree)

                tree.draw()

                # Regex for finding verbs
                verb_finder = re.compile(r"(?<=VB[ DGNPZ]) *\w*(?=\))")

                # Get the lemma of the verb for searching verbnet
                verbs = (word.strip().lower() for word in verb_finder.findall(str(tree)))

                for verb in verbs:

                    lemmatized_verb = lemmatizer.lemmatize(verb, "v")
                    vfo_list = create_VerbFrameObjects(lemmatized_verb)

                    match_list = []

                    for vfo in vfo_list:
                        match = vfo.match_parse(tree)

                        if match:
                            match_list.append(match)

                    best_match = pick_best_match(match_list)
                    if best_match is not None:
                        result_list.append((best_match, tree))

    return result_list
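
The verb-matching regex used in the loop above can be exercised on its own; a minimal sketch with a hypothetical tree string (lemmatization and the VerbNet lookup are left out):

import re

# The lookbehind anchors on a VB/VBD/VBG/VBN/VBP/VBZ tag; the match is the token before ')'
verb_finder = re.compile(r"(?<=VB[ DGNPZ]) *\w*(?=\))")
tree_str = "(S (NP (PRP He)) (VP (VBD sat) (PP (IN on) (NP (DT a) (NN mat)))))"
print([w.strip().lower() for w in verb_finder.findall(tree_str)])   # ['sat']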
Example #8
 def _parse_trees_output(output_):
     res = []
     cur_lines = []
     for line in output_.splitlines(False):
         if line == '':
             res.append(Tree.parse('\n'.join(cur_lines)))
             cur_lines = []
         else:
             cur_lines.append(line)
     return res
Example #9
 def _parse_trees_output(output_):
     res = []
     cur_lines = []
     for line in output_.splitlines(False):
         if line == '':
             res.append(Tree.parse('\n'.join(cur_lines)))
             cur_lines = []
         else:
             cur_lines.append(line)
     return res
Example #10
    def _parse(self, t):
        try:
            return Tree.parse(self._normalize(t))

        except ValueError, e:
            sys.stderr.write("Bad tree detected; trying to recover...\n")
            # Try to recover, if we can:
            if e.args == ('mismatched parens',):
                for n in range(1, 5):
                    try:
                        v = Tree.parse(self._normalize(t+')'*n))
                        sys.stderr.write("  Recovered by adding %d close "
                                         "paren(s)\n" % n)
                        return v
                    except ValueError: pass
            # Try something else:
            sys.stderr.write("  Recovered by returning a flat parse.\n")
            #sys.stderr.write(' '.join(t.split())+'\n')
            return Tree('S', self._tag(t))
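
The close-paren recovery above can be sketched as a standalone snippet (assumes NLTK is installed; on NLTK 3.x the reader is Tree.fromstring rather than Tree.parse):

from nltk.tree import Tree

broken = "(S (NP (DT the) (NN cat)) (VP (VBD sat)"   # two closing parens missing
for n in range(5):
    try:
        tree = Tree.fromstring(broken + ")" * n)
        print("recovered by adding %d close paren(s)" % n)   # n == 2 here
        print(tree)
        break
    except ValueError:
        pass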
Example #11
def load_parse_doc(parse_path):
    parse_path = os.path.abspath(parse_path)
    parses = []
    with open(parse_path, 'r') as fp:
        for line in fp:
            line = line.strip()
            if line == '':
                continue
            parse = Tree.parse(line)
            parses.append(parse)
    return parses
Example #12
def build_tagged_sents(files):
    """
	Build the corpus of tagged sentences from the files of the sequoia corpus.
	"""
    sents = []
    for fname in files:
        fin = codecs.open(fname, "r", "utf-8")
        for line in fin:
            t = Tree.parse(line)
            sents.append(t.pos())
        fin.close()
    return sents
Example #13
def build_tagged_sents(files):
    """
	Build the corpus of tagged sentences from the files of the sequoia corpus.
	"""
    sents = []
    for fname in files:
        fin = codecs.open(fname, "r", "utf-8")
        for line in fin:
            t = Tree.parse(line)
            sents.append(t.pos())
        fin.close()
    return sents
Example #14
 def _load_sent_token(self):
     print "Loading sentences and tokens..."
     sent_elmts = self.c_root.findall(CTAKES_PREFIX + 'textspan.Sentence')
     t_counter = 0
     for sent_elmt in sent_elmts:
         sent_begin = int(sent_elmt.get('begin'))
         sent_end = int(sent_elmt.get('end'))
         sent_num = int(sent_elmt.get('sentenceNumber'))
         cursor = sent_begin
         sent_span = []
         token_offset = 0
         while cursor < sent_end:
             buf = self._find_token_elmt_with_attrib_of_val('begin', cursor)
             if len(buf) == 0:
                 cursor = cursor + 1
                 continue
             elif len(buf) > 1:
                 print 'More than one token appear to begin at ' + str(cursor) + \
                     '\nLoading ctakes xml file terminated'
                 return
             else:
                 token_elmt = buf[0]
                 t = Token(self.ds_id + '_t_' + str(t_counter))
                 t.type = token_elmt.tag.split('.')[-1][:-5]
                 # skipping 'newline' token when counting up tid
                 t_num = int(token_elmt.get('tokenNumber')) - sent_num
                 if t_num != t_counter:
                     print 'CAUTION: t_num does not equal to counter t_counter'
                 t.offset = token_offset
                 t.begin = int(token_elmt.get('begin'))
                 t.end = int(token_elmt.get('end'))
                 t.pos = token_elmt.get('partOfSpeech')
                 t.n_form = token_elmt.get('normalizedForm')
                 #t.c_form = token_elmt.get('canonicalForm')
                 #t.cap = int(token_elmt.get('capitalization'))
                 #t.num_p = int(token_elmt.get('numPosition'))
                 self.tokens.append(t)
             sent_span.append(t)
             cursor = t.end + 1
             token_offset = token_offset + 1
             t_counter += 1
             
         s = Sentence(self.ds_id + '_s_' + str(sent_num))
         s.span = sent_span
         s.num = sent_num
         #s.begin = sent_begin
         #s.end = sent_end
         s.parse = Tree.parse(self.p_fp.next())
         for t in s.span:
             t.sent = s
         self.sents.append(s)  
     return
Example #15
    def __init__(self, json_file):
        data = json.load(json_file)
        for k, v in data.iteritems():
            self.__setattr__(k, v)
        self.__raw_data = data # for future reference

        #print data
        self.spantree = SpanTree.parse(self.goldparse)
        self.spantree.convert()
        self.goldparse = Tree.parse(self.goldparse)

        self.text = data['text'].split()
        self.treebank_sentence = data['treebank_sentence'].split()
Example #16
    def findAmbiguities(self, line):
        result = self.parse(line)

        #if 'coref' in result:
        #    return 1

        trees = []
        retval = 0
        for i in range(len(result['sentences'])):
            tree = Tree.parse(result['sentences'][i]['parsetree'])
            trees.append(tree)
            # Since tree[0] is a S
            for subtree in tree:
                retval = max(retval, self.exploreSubTree(subtree))
        return retval
Example #17
 def findAmbiguities(self,line):    
     result = self.parse(line) 
     
     #if 'coref' in result:
     #    return 1
 
     trees = []
     retval = 0
     for i in range(len(result['sentences'])):
         tree = Tree.parse(result['sentences'][i]['parsetree'])
         trees.append(tree)
         # Since tree[0] is a S
         for subtree in tree:
             retval = max(retval, self.exploreSubTree(subtree))
     return retval
Example #18
    def read(klass, path=KNOWLEDGE_PATH):
       
        if not path:
            raise Exception("Specify a path to the verbframes.json as $WIMKB")

        with open(path, 'rb') as kbfile:
            data = json.load(kbfile, encoding="utf8")

            kwargs = {}
            for frame in data['frames']:
                for mapping in frame['mappings']:
                    # Update mapping with frame object
                    mapping['frame']   = frame['frame']

                    # Convert string reprs of Trees
                    mapping['verbmap'] = Tree.parse(mapping['verbmap'])

                    if 'parse' in mapping:
                        mapping['parse']   = Tree.parse(mapping['parse']) 

                # Convert kwargs
                kwargs[frame['frame']] = frame['mappings']

        return klass(**kwargs)
Example #19
    def read(klass, path=KNOWLEDGE_PATH):

        if not path:
            raise Exception("Specify a path to the verbframes.json as $WIMKB")

        with open(path, 'rb') as kbfile:
            data = json.load(kbfile, encoding="utf8")

            kwargs = {}
            for frame in data['frames']:
                for mapping in frame['mappings']:
                    # Update mapping with frame object
                    mapping['frame'] = frame['frame']

                    # Convert string reprs of Trees
                    mapping['verbmap'] = Tree.parse(mapping['verbmap'])

                    if 'parse' in mapping:
                        mapping['parse'] = Tree.parse(mapping['parse'])

                # Convert kwargs
                kwargs[frame['frame']] = frame['mappings']

        return klass(**kwargs)
Example #20
def tag_ptree(ptree, coreflist):
    """Tags given parse tree with coreferences

    Args:
        ptree: string, parenthesized str representation of parse tree
        coreflist: list of tuples, [('1', {'text': 'dog', 'ref': None})]

    Returns:
        string, tagged parse tree

    >>> ptree = '(S NP( (NN He)) VP( (V ran)))'
    >>> coreflist = [('1', {'text': 'He', 'ref': None})]
    >>> tag_ptree(ptree, coreflist)
    '(S NP( COREF_TAG_1( (NN He))) VP( (V ran)))'

    """
    pattern = r"""(?P<lp>\(?\s*)       # left parenthesis
                  (?P<tg>[a-zA-Z$]+)?  # POS tag
                  (?P<data>\s*%s)      # subtree of tag
                  (?P<rp>(?:\s*\))*)   # right parenthesis
               """
    for cid, coref in coreflist[::-1]:
        words = ''.join(word_tokenize(coref['text']))

        nltktree = Tree.parse(ptree)
        nltktree.reverse()  # perform search right to left
        data = None
        for subtree in nltktree.subtrees():  # depth-first traversal
            if ''.join(subtree.leaves()) == words:  # equal ignoring whitespace
                data = subtree.pprint()
                break

        # If found via depth-first search of parse tree
        if data:
            ptree = ptree.replace(data, '( COREF_TAG_%s%s)' % (cid, data))
        else:  # Try finding via regex matching instead
            dpattern = r'\s*'.join([r'\(\s*[a-zA-Z$]+\s+%s\s*\)' % word
                                    for word in word_tokenize(coref['text'])])
            found = re.findall(pattern % dpattern, ptree, re.X)
            if found:
                repl = '%s%s ( COREF_TAG_%s%s) %s' % (found[0][0],
                                                      found[0][1],
                                                      cid,
                                                      found[0][2],
                                                      found[0][3])
                ptree = re.sub(pattern % dpattern, repl, ptree, 1, re.X)

    return ptree
Example #21
 def parseQuestion(self, text):
     question = Question()
     print "RECEIVED DATA IS\n" + text
     wordList = nltk.word_tokenize(text)
     i = 0
     tokens = list()
     for word in wordList:
         print "WORD: "+str(word)
         if word.strip() not in ("", ".", "?", "!", ","):
             tokens.append(word)
         i+=1
     print tokens
     question.setTokens(tokens)
     result = self.parse(text)
     tree = Tree.parse(result['sentences'][0]['parsetree'])
     print TreeUtils.findPocs(tree)
Example #22
def create_trees_nltk(filename):    
    f = open(filename, "r")

    response = f.readlines(); f.close()
    valid_tree_texts = []   
    tree_text = '' 
    for line in response:
        line = line.strip()
        if(line == ""):
            valid_tree_texts.append(tree_text)
            tree_text = ""            
        else:
            tree_text += line+" "        
    trees = [Tree.parse(line) for line in valid_tree_texts]
    
    for i in range(len(trees)):
        trees[i].chomsky_normal_form() 
    
    return trees
Example #23
def create_trees_nltk(filename):
    f = open(filename, "r")

    response = f.readlines()
    f.close()
    valid_tree_texts = []
    tree_text = ''
    for line in response:
        line = line.strip()
        if (line == ""):
            valid_tree_texts.append(tree_text)
            tree_text = ""
        else:
            tree_text += line + " "
    trees = [Tree.parse(line) for line in valid_tree_texts]

    for i in range(len(trees)):
        trees[i].chomsky_normal_form()

    return trees
Example #24
def test_nltk_trees(parsed_text):
    ''' Example of parsed_text, stanford parser output :
    
        (ROOT
  (S
    (ADVP (RB However))
    (NP
      (NP (DT the) (NNS talks))
      (, ,)
      (VP (VBN hosted)
        (PP (IN by)
          (NP (NNP Douglas) (NNP Hurd))))
      (, ,))
    (VP (VBD ended)
      (PP (IN in)
        (NP (NN stalemate))))
    (. .)))
    
    '''
    nltree = Tree.parse(parsed_text)
    nltree.chomsky_normal_form()
    nltree.draw()
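
The normal-form step can be tried without opening the drawing window; a minimal sketch with a hypothetical sentence (assumes NLTK; Tree.fromstring on NLTK 3.x):

from nltk.tree import Tree

t = Tree.fromstring("(S (NP (DT the) (NNS talks)) (VP (VBD ended)) (. .))")
t.chomsky_normal_form()   # binarizes in place, adding artificial labels such as S|<VP-.>
print(t)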
Example #25
def test_nltk_trees(parsed_text):
    
    ''' Example of parsed_text, stanford parser output :
    
        (ROOT
  (S
    (ADVP (RB However))
    (NP
      (NP (DT the) (NNS talks))
      (, ,)
      (VP (VBN hosted)
        (PP (IN by)
          (NP (NNP Douglas) (NNP Hurd))))
      (, ,))
    (VP (VBD ended)
      (PP (IN in)
        (NP (NN stalemate))))
    (. .)))
    
    ''' 
    nltree = Tree.parse(parsed_text)
    nltree.chomsky_normal_form()
    nltree.draw()
Example #26
def _process_parse(parse, coreflist):
    """Tags parse tree with corefs and returns the tree, lexicon, dependencies
    and raw text as tuple

    Args:
        parse: list of stanford corenlp parsed sentences
        coreflist: list of coreferences from tagged xml

    Returns:
        tuple, (ptree, lexicon, dependencies, rawtext) if parse contains a
            sentence, else returns None

    """
    sentence = parse.get('sentences')
    if sentence:
        ptree = Tree.parse(tag_ptree(sentence[0]['parsetree'], coreflist))
        words = [(w[0], w[1]) for w in sentence[0]['words']]
        depends = [(d[0], d[1], d[2]) for d in sentence[0]['dependencies']]
        text = sentence[0]['text']

        return ptree, words, depends, text
    else:
        return None
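
For reference, a sketch of the shape of the `parse` argument this function expects; the field names are the ones read in the body above, while the values are invented:

# Hypothetical corenlp-style result; only the fields used by _process_parse are shown
parse = {
    'sentences': [{
        'parsetree': '(S (NP (PRP He)) (VP (VBD ran)) (. .))',
        'words': [['He', {'PartOfSpeech': 'PRP'}], ['ran', {'PartOfSpeech': 'VBD'}], ['.', {'PartOfSpeech': '.'}]],
        'dependencies': [['nsubj', 'ran', 'He'], ['root', 'ROOT', 'ran']],
        'text': 'He ran.',
    }]
}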
Example #27
def read_trees(filename, treelist, check=True):
    buffer = []
    for line in open(filename):
        if not line.strip():
            continue
        if line.startswith("(") and buffer:
            tree = ' '.join(buffer)
            tree = re.sub('\s+', ' ', tree)
            treelist.append(tree)
            buffer = []
        buffer.append(line.rstrip())
    if buffer:
        tree = ' '.join(buffer)
        tree = re.sub('\s+', ' ', tree)
        treelist.append(tree)

    if check:
        for idx, tree in enumerate(treelist):
            try:
                t = Tree.parse(tree)
                s = "  ".join(t.leaves())
            except ValueError:
                assert False, "f: %s, i: %s, t: %s" %(filename, idx, tree)
Example #28
    def test_nltk_trees(self):
        parsed_text =  """ (S
    (NP (PRP He))
    (VP (VBZ reckons)
      (SBAR
        (S
          (NP (DT the) (JJ current) (NN account) (NN deficit))
          (VP (MD will)
            (VP (VB narrow)
              (PP (TO to)
                (NP
                  (QP (RB only) (# #) (CD 1.8) (CD billion))))
              (PP (IN in)
                (NP (NNP September))))))))
    (. .)) """ 
#        parsed_text = """(S
#    (S
#      (NP
#        (NP (JJS Most))
#        (PP (IN of)
#          (NP (DT the) (NN commodity) (NN traffic))))
#      (VP (VBD was)
#        (ADJP (RP off))))
#    (, ,)
#    (NP (DT the) (NN company))
#    (VP (VBD said))
#    (. .)) """ 
#        """(S
#    (NP (DT The) (NN cat))
#    (VP (VBD sat)
#      (PP (IN on)
#        (NP (DT a) (NN mat))))
#    (. .))"""
        nltree = Tree.parse(parsed_text)
        nltree.chomsky_normal_form()
        nltree.draw()
Example #29
 def test_nltk_trees(self):
     parsed_text = """ (S
 (NP (PRP He))
 (VP (VBZ reckons)
   (SBAR
     (S
       (NP (DT the) (JJ current) (NN account) (NN deficit))
       (VP (MD will)
         (VP (VB narrow)
           (PP (TO to)
             (NP
               (QP (RB only) (# #) (CD 1.8) (CD billion))))
           (PP (IN in)
             (NP (NNP September))))))))
 (. .)) """
     #        parsed_text = """(S
     #    (S
     #      (NP
     #        (NP (JJS Most))
     #        (PP (IN of)
     #          (NP (DT the) (NN commodity) (NN traffic))))
     #      (VP (VBD was)
     #        (ADJP (RP off))))
     #    (, ,)
     #    (NP (DT the) (NN company))
     #    (VP (VBD said))
     #    (. .)) """
     #        """(S
     #    (NP (DT The) (NN cat))
     #    (VP (VBD sat)
     #      (PP (IN on)
     #        (NP (DT a) (NN mat))))
     #    (. .))"""
     nltree = Tree.parse(parsed_text)
     nltree.chomsky_normal_form()
     nltree.draw()
Example #30
def main():
  for line in sys.stdin:
    t = Tree.parse(line)
    t.draw()
Example #31
# sys.exit()

mode = 0
parse = ""

first_tree = True
for line in sys.stdin:
    # print 'mode:', mode
    line = line[:-1]  # remove newline

    if line == "Leaves:":
        assert mode == -2

        if mode == -2:
            t = Tree.parse(parse)
            assert t
            if not first_tree:
                print ""
            first_tree = False
            print t.pprint()
            parse = ""
        mode = 0
        continue

    if line.startswith("Tree:"):
        mode -= 1
    elif line == "-----":
        mode -= 1
    else:
        assert abs(mode) < 3
Example #32
def demo():
    import random
    def fill(cw):
        cw['fill'] = '#%06d' % random.randint(0,999999)

    cf = CanvasFrame(width=550, height=450, closeenough=2)

    t = Tree.parse('''
    (S (NP the very big cat)
       (VP (Adv sorta) (V saw) (NP (Det the) (N dog))))''')

    tc = TreeWidget(cf.canvas(), t, draggable=1,
                    node_font=('helvetica', -14, 'bold'),
                    leaf_font=('helvetica', -12, 'italic'),
                    roof_fill='white', roof_color='black',
                    leaf_color='green4', node_color='blue2')
    cf.add_widget(tc,10,10)

    def boxit(canvas, text):
        big = ('helvetica', -16, 'bold')
        return BoxWidget(canvas, TextWidget(canvas, text,
                                            font=big), fill='green')
    def ovalit(canvas, text):
        return OvalWidget(canvas, TextWidget(canvas, text),
                          fill='cyan')

    treetok = Tree.parse('(S (NP this tree) (VP (V is) (AdjP shapeable)))')
    tc2 = TreeWidget(cf.canvas(), treetok, boxit, ovalit, shapeable=1)

    def color(node):
        node['color'] = '#%04d00' % random.randint(0,9999)
    def color2(treeseg):
        treeseg.node()['fill'] = '#%06d' % random.randint(0,9999)
        treeseg.node().child()['color'] = 'white'

    tc.bind_click_trees(tc.toggle_collapsed)
    tc2.bind_click_trees(tc2.toggle_collapsed)
    tc.bind_click_nodes(color, 3)
    tc2.expanded_tree(1).bind_click(color2, 3)
    tc2.expanded_tree().bind_click(color2, 3)

    paren = ParenWidget(cf.canvas(), tc2)
    cf.add_widget(paren, tc.bbox()[2]+10, 10)

    tree3 = Tree.parse('''
    (S (NP this tree) (AUX was)
       (VP (V built) (PP (P with) (NP (N tree_to_treesegment)))))''')
    tc3 = tree_to_treesegment(cf.canvas(), tree3, tree_color='green4',
                              tree_xspace=2, tree_width=2)
    tc3['draggable'] = 1
    cf.add_widget(tc3, 10, tc.bbox()[3]+10)

    def orientswitch(treewidget):
        if treewidget['orientation'] == 'horizontal':
            treewidget.expanded_tree(1,1).subtrees()[0].set_text('vertical')
            treewidget.collapsed_tree(1,1).subtrees()[0].set_text('vertical')
            treewidget.collapsed_tree(1).subtrees()[1].set_text('vertical')
            treewidget.collapsed_tree().subtrees()[3].set_text('vertical')
            treewidget['orientation'] = 'vertical'
        else:
            treewidget.expanded_tree(1,1).subtrees()[0].set_text('horizontal')
            treewidget.collapsed_tree(1,1).subtrees()[0].set_text('horizontal')
            treewidget.collapsed_tree(1).subtrees()[1].set_text('horizontal')
            treewidget.collapsed_tree().subtrees()[3].set_text('horizontal')
            treewidget['orientation'] = 'horizontal'

    text = """
Try clicking, right clicking, and dragging
different elements of each of the trees.
The top-left tree is a TreeWidget built from
a Tree.  The top-right is a TreeWidget built
from a Tree, using non-default widget
constructors for the nodes & leaves (BoxWidget
and OvalWidget).  The bottom-left tree is
built from tree_to_treesegment."""
    twidget = TextWidget(cf.canvas(), text.strip())
    textbox = BoxWidget(cf.canvas(), twidget, fill='white', draggable=1)
    cf.add_widget(textbox, tc3.bbox()[2]+10, tc2.bbox()[3]+10)

    tree4 = Tree.parse('(S (NP this tree) (VP (V is) (Adj horizontal)))')
    tc4 = TreeWidget(cf.canvas(), tree4, draggable=1,
                     line_color='brown2', roof_color='brown2',
                     node_font=('helvetica', -12, 'bold'),
                     node_color='brown4', orientation='horizontal')
    tc4.manage()
    cf.add_widget(tc4, tc3.bbox()[2]+10, textbox.bbox()[3]+10)
    tc4.bind_click(orientswitch)
    tc4.bind_click_trees(tc4.toggle_collapsed, 3)

    # Run mainloop
    cf.mainloop()
Example #33
File: tree.py Project: gijs/nltk
def demo():
    import random

    def fill(cw):
        cw["fill"] = "#%06d" % random.randint(0, 999999)

    cf = CanvasFrame(width=550, height=450, closeenough=2)

    t = Tree.parse(
        """
    (S (NP the very big cat)
       (VP (Adv sorta) (V saw) (NP (Det the) (N dog))))"""
    )

    tc = TreeWidget(
        cf.canvas(),
        t,
        draggable=1,
        node_font=("helvetica", -14, "bold"),
        leaf_font=("helvetica", -12, "italic"),
        roof_fill="white",
        roof_color="black",
        leaf_color="green4",
        node_color="blue2",
    )
    cf.add_widget(tc, 10, 10)

    def boxit(canvas, text):
        big = ("helvetica", -16, "bold")
        return BoxWidget(canvas, TextWidget(canvas, text, font=big), fill="green")

    def ovalit(canvas, text):
        return OvalWidget(canvas, TextWidget(canvas, text), fill="cyan")

    treetok = Tree.parse("(S (NP this tree) (VP (V is) (AdjP shapeable)))")
    tc2 = TreeWidget(cf.canvas(), treetok, boxit, ovalit, shapeable=1)

    def color(node):
        node["color"] = "#%04d00" % random.randint(0, 9999)

    def color2(treeseg):
        treeseg.node()["fill"] = "#%06d" % random.randint(0, 9999)
        treeseg.node().child()["color"] = "white"

    tc.bind_click_trees(tc.toggle_collapsed)
    tc2.bind_click_trees(tc2.toggle_collapsed)
    tc.bind_click_nodes(color, 3)
    tc2.expanded_tree(1).bind_click(color2, 3)
    tc2.expanded_tree().bind_click(color2, 3)

    paren = ParenWidget(cf.canvas(), tc2)
    cf.add_widget(paren, tc.bbox()[2] + 10, 10)

    tree3 = Tree.parse(
        """
    (S (NP this tree) (AUX was)
       (VP (V built) (PP (P with) (NP (N tree_to_treesegment)))))"""
    )
    tc3 = tree_to_treesegment(cf.canvas(), tree3, tree_color="green4", tree_xspace=2, tree_width=2)
    tc3["draggable"] = 1
    cf.add_widget(tc3, 10, tc.bbox()[3] + 10)

    def orientswitch(treewidget):
        if treewidget["orientation"] == "horizontal":
            treewidget.expanded_tree(1, 1).subtrees()[0].set_text("vertical")
            treewidget.collapsed_tree(1, 1).subtrees()[0].set_text("vertical")
            treewidget.collapsed_tree(1).subtrees()[1].set_text("vertical")
            treewidget.collapsed_tree().subtrees()[3].set_text("vertical")
            treewidget["orientation"] = "vertical"
        else:
            treewidget.expanded_tree(1, 1).subtrees()[0].set_text("horizontal")
            treewidget.collapsed_tree(1, 1).subtrees()[0].set_text("horizontal")
            treewidget.collapsed_tree(1).subtrees()[1].set_text("horizontal")
            treewidget.collapsed_tree().subtrees()[3].set_text("horizontal")
            treewidget["orientation"] = "horizontal"

    text = """
Try clicking, right clicking, and dragging
different elements of each of the trees.
The top-left tree is a TreeWidget built from
a Tree.  The top-right is a TreeWidget built
from a Tree, using non-default widget
constructors for the nodes & leaves (BoxWidget
and OvalWidget).  The bottom-left tree is
built from tree_to_treesegment."""
    twidget = TextWidget(cf.canvas(), text.strip())
    textbox = BoxWidget(cf.canvas(), twidget, fill="white", draggable=1)
    cf.add_widget(textbox, tc3.bbox()[2] + 10, tc2.bbox()[3] + 10)

    tree4 = Tree.parse("(S (NP this tree) (VP (V is) (Adj horizontal)))")
    tc4 = TreeWidget(
        cf.canvas(),
        tree4,
        draggable=1,
        line_color="brown2",
        roof_color="brown2",
        node_font=("helvetica", -12, "bold"),
        node_color="brown4",
        orientation="horizontal",
    )
    tc4.manage()
    cf.add_widget(tc4, tc3.bbox()[2] + 10, textbox.bbox()[3] + 10)
    tc4.bind_click(orientswitch)
    tc4.bind_click_trees(tc4.toggle_collapsed, 3)

    # Run mainloop
    cf.mainloop()
Example #34
fout = open('tree.tex', 'w')
print >> fout, r'''\documentclass[tikz]{standalone}
\usepackage{CJKutf8}
\usepackage{color}
\usepackage{tikz}
\usepackage{tikz-qtree}
\thispagestyle{empty}
\begin{document}
\begin{CJK}{UTF8}{gbsn}

\begin{tikzpicture}'''
f = open(parse_file)
for i, s in enumerate(f):
    if i == line_num:
        s = s.replace('$', '\$')
        tree = Tree.parse(s)
        if flag == '0':
            h = tree.height()
            print >> fout, '''\\begin{{scope}}[frontier/.style={{distance from root={}}}]\n'''.format(
                h * 28)
            for pos in tree.treepositions('leaves'):
                tree[pos] = r'\edge[dotted]; {' + tree[pos] + '}'
            idx = 0
            for line in tree.pprint_latex_qtree().split('\n'):
                if ';' in line:
                    line = line.replace('{',
                                        '\\node(n{}) {{'.format(idx)).replace(
                                            '}', '};').replace('%', '\%')
                    idx += 1
                print >> fout, line
            for i in range(idx):
Example #35
    TreeView(*trees).mainloop()
    return

##//////////////////////////////////////////////////////
##  Demo Code
##//////////////////////////////////////////////////////

import random
if __name__ == '__main__':
    def fill(cw):
        cw['fill'] = '#%06d' % random.randint(0,999999)
    
    cf = CanvasFrame(width=550, height=450, closeenough=2)

    tree = Tree.parse('''
    (S (NP the very big cat)
       (VP (Adv sorta) (V saw) (NP (Det the) (N dog))))
    ''', leafparser = lambda t: Token(TEXT=t))
                
    tc = TreeWidget(cf.canvas(), tree, draggable=1, 
                    node_font=('helvetica', -14, 'bold'),
                    leaf_font=('helvetica', -12, 'italic'),
                    roof_fill='white', roof_color='black',
                    leaf_color='green4', node_color='blue2')
    cf.add_widget(tc,10,10)
    
    def boxit(canvas, text):
        big = ('helvetica', -16, 'bold')
        return BoxWidget(canvas, TextWidget(canvas, text,
                                            font=big), fill='green')
    def ovalit(canvas, text):
        return OvalWidget(canvas, TextWidget(canvas, text),
Example #36
fout = open('tree.tex','w')
print >>fout,r'''\documentclass[tikz]{standalone}
\usepackage{CJKutf8}
\usepackage{color}
\usepackage{tikz}
\usepackage{tikz-qtree}
\thispagestyle{empty}
\begin{document}
\begin{CJK}{UTF8}{gbsn}

\begin{tikzpicture}'''
f = open(parse_file)
for i,s in enumerate(f):
    if i == line_num:
        s = s.replace('$','\$')
        tree = Tree.parse(s)
        if flag == '0':
            h = tree.height()
            print >>fout,'''\\begin{{scope}}[frontier/.style={{distance from root={}}}]\n'''.format(h*28)
            for pos in tree.treepositions('leaves'):
                tree[pos] = r'\edge[dotted]; {' + tree[pos] + '}'
            idx = 0
            for line in tree.pprint_latex_qtree().split('\n'):
                if ';' in line:
                    line = line.replace('{','\\node(n{}) {{'.format(idx)).replace('}','};').replace('%','\%')
                    idx += 1
                print >>fout,line
            for i in range(idx):
                print >>fout,'\draw (n{} |- 0,{}pt) node {{{}}};'.format(i,-h*28-10,i)
        else:
            print >>fout,r'\begin{scope}'
Example #37
#!/usr/bin/python

from nltk.tree import Tree
import sys

# A program to display parse trees (in Penn treebank format) with NLTK
#
#  To install NLTK on ubuntu: sudo apt-get install python-nltk

for line in sys.stdin:
    t = Tree.parse(line)
    t.draw()
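
A note on the API used throughout these examples: Tree.parse is the NLTK 2.x name; on NLTK 3.x the same reader is called Tree.fromstring. A minimal sketch of the same stdin-to-draw loop against the newer name (the rename is the only assumption):

import sys
from nltk.tree import Tree

for line in sys.stdin:
    if line.strip():
        Tree.fromstring(line).draw()   # one drawing window per parse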
Example #38
    parser = argparse.ArgumentParser()
    parser.add_argument('ptb', action='store', help="ptb.json file")
    parser.add_argument('json', action='store', help="json input file")
    parser.add_argument('jsonout', action='store', help="json output file")
    parser.add_argument('-verbose', action='store_true')
    arguments = parser.parse_args(sys.argv[1:])

    treebank = json.load(open(arguments.ptb))

    docId, sentNr = re.search(r'wsj_(\d+).(\d+).json', arguments.json).groups()
    #print treebank.keys()
    #print docId
    #int(docId)
    sentNr = int(sentNr)
    data = json.load(open(arguments.json))

    if arguments.verbose:
        from nltk.tree import Tree
        sys.stderr.write("text:\n%s\n" % data['text'])
        sys.stderr.write("%s\n" %(treebank[docId][sentNr]))

        t = Tree.parse(treebank[docId][sentNr])
        sys.stderr.write("%s\n" %(" ".join(t.leaves())))

    assert docId in treebank
    #print treebank[docId]
    assert int(sentNr) < len(treebank[docId])

    data['ptbparse'] = treebank[docId][sentNr]
    json.dump(data, open(arguments.jsonout, 'w'), indent=2, sort_keys=True)
Example #39
#!/usr/bin/env python

# check if parse from .onf is equal to parse obtained from penn treebank

import sys
from collections import defaultdict
from itertools import imap, izip
import json
import re
from nltk.tree import Tree

if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('json', action='store', help="json input file")
    arguments = parser.parse_args(sys.argv[1:])

    data = json.load(open(arguments.json))

    ptb = Tree.parse(data['ptbparse'])
    onf = Tree.parse(data['goldparse'])

    equal = ptb[0].pprint() == onf[0].pprint()
    if not equal:
        print "0 parses from ptb and .onf differ in %s" %arguments.json
    if equal:
        print "1 parses from ptb and .onf do NOT differ in %s" %arguments.json
        #print ptb[0].pprint()
        #print onf[0].pprint()
Example #40
are brackets annotated. Export the content as a regular annotated corpus
for pos tagging learning.
"""

import sys, codecs
from nltk.tree import Tree

def treeSentenceToTuples(sent):
	"""
	:param sent: a Tree representing a sentence
	:type sent: nltk.tree.Tree
	"""
	return [u"%s/%s"%(t,p) for t,p in sent.pos() if not t in ["-LRB-", "-RRB-"]]

if __name__ == "__main__":
	if len(sys.argv) < 3:
		print "Usage:\n\t%s <destination> <corpus>" % sys.argv[0]
		sys.exit(-1)
	dest = sys.argv[1]
	fout = codecs.open(dest, "w", "utf-8")
	for fname in sys.argv[2:]:
		fin = codecs.open(fname, "r", "utf-8")
		for line in fin:
			t = Tree.parse(line)
			tokens = treeSentenceToTuples(t)
			fout.write(u" ".join(tokens))
			fout.write("\n")
		fin.close()
	fout.close()
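
A minimal check of the token/POS formatting produced by treeSentenceToTuples, with a hypothetical tree (assumes NLTK; Tree.fromstring on NLTK 3.x):

from nltk.tree import Tree

t = Tree.fromstring("(S (NP (DT the) (NN cat)) (VP (VBD sat)) (. .))")
# Tree.pos() yields (token, tag) pairs in surface order
print(["%s/%s" % (tok, tag) for tok, tag in t.pos()])   # ['the/DT', 'cat/NN', 'sat/VBD', './.']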
	
Example #41
import json
# from jsonrpc import ServerProxy, JsonRpc20, TransportTcpIp
import jsonrpclib
from pprint import pprint


class StanfordNLP:
    def __init__(self, port_number=8080):
        self.server = jsonrpclib.Server("http://localhost:%d" % port_number)

    def parse(self, text):
        return json.loads(self.server.parse(text))

nlp = StanfordNLP()
result = nlp.parse("Hello world!  It is so beautiful.")
pprint(result)

from nltk.tree import Tree
tree = Tree.parse(result['sentences'][0]['parsetree'])
pprint(tree)
Example #42
File: tree.py Project: sp00/nltk
def demo():
    import random
    def fill(cw):
        cw['fill'] = '#%06d' % random.randint(0,999999)

    cf = CanvasFrame(width=550, height=450, closeenough=2)

    t = Tree.parse('''
    (S (NP the very big cat)
       (VP (Adv sorta) (V saw) (NP (Det the) (N dog))))''')

    tc = TreeWidget(cf.canvas(), t, draggable=1,
                    node_font=('helvetica', -14, 'bold'),
                    leaf_font=('helvetica', -12, 'italic'),
                    roof_fill='white', roof_color='black',
                    leaf_color='green4', node_color='blue2')
    cf.add_widget(tc,10,10)

    def boxit(canvas, text):
        big = ('helvetica', -16, 'bold')
        return BoxWidget(canvas, TextWidget(canvas, text,
                                            font=big), fill='green')
    def ovalit(canvas, text):
        return OvalWidget(canvas, TextWidget(canvas, text),
                          fill='cyan')

    treetok = Tree.parse('(S (NP this tree) (VP (V is) (AdjP shapeable)))')
    tc2 = TreeWidget(cf.canvas(), treetok, boxit, ovalit, shapeable=1)

    def color(node):
        node['color'] = '#%04d00' % random.randint(0,9999)
    def color2(treeseg):
        treeseg.node()['fill'] = '#%06d' % random.randint(0,9999)
        treeseg.node().child()['color'] = 'white'

    tc.bind_click_trees(tc.toggle_collapsed)
    tc2.bind_click_trees(tc2.toggle_collapsed)
    tc.bind_click_nodes(color, 3)
    tc2.expanded_tree(1).bind_click(color2, 3)
    tc2.expanded_tree().bind_click(color2, 3)

    paren = ParenWidget(cf.canvas(), tc2)
    cf.add_widget(paren, tc.bbox()[2]+10, 10)

    tree3 = Tree.parse('''
    (S (NP this tree) (AUX was)
       (VP (V built) (PP (P with) (NP (N tree_to_treesegment)))))''')
    tc3 = tree_to_treesegment(cf.canvas(), tree3, tree_color='green4',
                              tree_xspace=2, tree_width=2)
    tc3['draggable'] = 1
    cf.add_widget(tc3, 10, tc.bbox()[3]+10)

    def orientswitch(treewidget):
        if treewidget['orientation'] == 'horizontal':
            treewidget.expanded_tree(1,1).subtrees()[0].set_text('vertical')
            treewidget.collapsed_tree(1,1).subtrees()[0].set_text('vertical')
            treewidget.collapsed_tree(1).subtrees()[1].set_text('vertical')
            treewidget.collapsed_tree().subtrees()[3].set_text('vertical')
            treewidget['orientation'] = 'vertical'
        else:
            treewidget.expanded_tree(1,1).subtrees()[0].set_text('horizontal')
            treewidget.collapsed_tree(1,1).subtrees()[0].set_text('horizontal')
            treewidget.collapsed_tree(1).subtrees()[1].set_text('horizontal')
            treewidget.collapsed_tree().subtrees()[3].set_text('horizontal')
            treewidget['orientation'] = 'horizontal'

    text = """
Try clicking, right clicking, and dragging
different elements of each of the trees.
The top-left tree is a TreeWidget built from
a Tree.  The top-right is a TreeWidget built
from a Tree, using non-default widget
constructors for the nodes & leaves (BoxWidget
and OvalWidget).  The bottom-left tree is
built from tree_to_treesegment."""
    twidget = TextWidget(cf.canvas(), text.strip())
    textbox = BoxWidget(cf.canvas(), twidget, fill='white', draggable=1)
    cf.add_widget(textbox, tc3.bbox()[2]+10, tc2.bbox()[3]+10)

    tree4 = Tree.parse('(S (NP this tree) (VP (V is) (Adj horizontal)))')
    tc4 = TreeWidget(cf.canvas(), tree4, draggable=1,
                     line_color='brown2', roof_color='brown2',
                     node_font=('helvetica', -12, 'bold'),
                     node_color='brown4', orientation='horizontal')
    tc4.manage()
    cf.add_widget(tc4, tc3.bbox()[2]+10, textbox.bbox()[3]+10)
    tc4.bind_click(orientswitch)
    tc4.bind_click_trees(tc4.toggle_collapsed, 3)

    # Run mainloop
    cf.mainloop()
Example #43
def process_file(json_filename, nb):
    docId, sentNr = re.search(r'wsj_(\d+).(\d+).json', json_filename).groups()
    sentNr = int(sentNr)
    data = json.load(open(json_filename))
    data['nom'] = []

    # index adjustments for consistency with ontonotes parses
    ptb_tree = Tree.parse(data['ptbparse'])
    ptbstring = tree_to_string(ptb_tree) # wrap traces

    onftree = Tree.parse(data['goldparse'])
    onfstring = tree_to_string(onftree) # wrap traces
    raw_onfstring = tree_to_string(onftree, wrap_traces=False)

    ptbstring_tok = add_spaces(ptbstring, onfstring)

    tokenize_offsets = split_offsets(ptbstring, ptbstring_tok)
    trace_offsets = Offset(ptbstring_tok.split(), onfstring.split(), ignore_braces=True)

    #print ptbstring
    #print ptbstring_tok
    #print onfstring
    #print tokenize_offsets
    #print trace_offsets

    pt = SpanTree.parse(data['ptbparse'])

    for nb_data in nb[docId][sentNr]:
        args = nb_data['args']

        # TODO: arguments that are chains or concatenations of multiple nodes

        new_args = []
        for pos, role in args:
            words, start, end = [], None, None
            leaf_id, depth = pt.parse_pos(pos)
            if leaf_id != None and depth != None:
                treepos = pt.get_treepos(leaf_id, depth)
                while is_trace(pt[treepos]):
                    trace_id = int(pt[treepos].leaves()[0].split('-')[-1])
                    print 'looking for trace', trace_id
                    tracepos = pt.find_trace(trace_id)
                    if tracepos != None:
                        print 'trace %s found! Here:', tracepos
                        print pt[tracepos].pprint()
                        treepos = tracepos
                    else:
                        break # could not follow trace

                words = pt[treepos].leaves()
                start, end = span_from_treepos(pt, treepos)
                #print start, end,

                # adjust of different tokenization
                assert start in tokenize_offsets
                start = min(tokenize_offsets[start])
                assert end in tokenize_offsets
                end = max(tokenize_offsets[end])

                # adjust of inserted traces in ontonotes
                start = trace_offsets.map_to_longer(start)
                end = trace_offsets.map_to_longer(end)
                #print '->', start, end

            phrase = ''
            if words:
                phrase = ' '.join(raw_onfstring.split()[start:end+1])
            new_args.append( [role, pos, start, end, phrase] )

        nb_data['args'] = new_args
        data['nom'].append(nb_data)

        #print nb_data
    json.dump(data, open(json_filename, 'w'), indent=2, sort_keys=True)
Example #44
import json
# from jsonrpc import ServerProxy, JsonRpc20, TransportTcpIp
import jsonrpclib
from pprint import pprint


class StanfordNLP:
    def __init__(self, port_number=8080):
        self.server = jsonrpclib.Server("http://localhost:%d" % port_number)

    def parse(self, text):
        return json.loads(self.server.parse(text))


nlp = StanfordNLP()
result = nlp.parse("Hello world!  It is so beautiful.")
pprint(result)

from nltk.tree import Tree
tree = Tree.parse(result['sentences'][0]['parsetree'])
pprint(tree)