Ejemplo n.º 1
0
def assign_slots(tokens, tag_tree, word_tree):
    """Tag every token with a slot label and group the words by slot.

    Each token is tagged via ``finalize_tags`` using the category-annotated
    versions of ``tag_tree`` and ``word_tree``.  The first token tagged
    USER, MEDIA or NETWORK becomes the SEARCH slot, and tokens tagged ``UNK``
    become keywords.

    Returns a dict mapping slot tag -> space-joined words, plus a
    ``'KEYWORDS'`` list holding keyword tokens that are neither English
    stopwords nor punctuation.  Tokens tagged ``'SKIP'`` are dropped.
    """
    stopword_list = stopwords.words('english')
    parented_words = ParentedTree.convert(word_tree)
    parented_tags = ParentedTree.convert(tag_tree)
    word_cats = tag_words_with_categories(parented_words)
    tag_cats = tag_words_with_categories(parented_tags)

    tagged = [
        (word, finalize_tags(i, word, tag_cats, word_cats))
        for i, word in enumerate(tokens)
    ]

    # Only the first focus candidate is promoted to SEARCH; later ones keep
    # their original tag.
    focus_tags = ['USER', 'MEDIA', 'NETWORK']
    found_query_focus = False
    for i, (word, tag) in enumerate(tagged):
        if not found_query_focus and tag in focus_tags:
            tagged[i] = (word, 'SEARCH')
            found_query_focus = True
        elif tag == UNK:
            tagged[i] = (word, 'KEYWORD')

    slots = {}
    for word, tag in tagged:
        if tag == 'SKIP':
            continue
        if tag == 'KEYWORD':
            # The KEYWORDS entry is created even when the word is filtered out.
            keywords = slots.setdefault('KEYWORDS', [])
            if not (word in stopword_list or word in PUNCTUATION):
                keywords.append(word)
        elif tag in slots:
            slots[tag] = ' '.join([slots[tag], word])
        else:
            slots[tag] = word
    return slots
Ejemplo n.º 2
0
def lappinleasse(parsetree, i):
    """Resolve pronouns in one parsed sentence against the global entity set.

    For every NP subtree of ``parsetree``:

    * if its first child is a pronoun (label contains ``PRP``), the most
      salient agreeing entity from the last few sentences
      (``sentencenum >= i - 4``) is chosen as referent, its salience, gender
      and phrases are updated, and the pronoun node is replaced in place by a
      ``SUB`` node carrying the referent's name;
    * if its first child is an existential (``EX``), it is skipped;
    * otherwise the NP is added to ``entitySet`` as a new entity.

    ``halve()`` is called once after the whole sentence has been processed.

    Parameters: ``parsetree`` is the sentence tree (mutated in place) and
    ``i`` is the sentence number.
    """
    global entitySet
    for np in parsetree.subtrees(lambda x: x.label() == 'NP'):
        if 'PRP' in np[0].label():
            if np[0, 0].lower() == 'it' and ispleonastic(np, parsetree):
                continue
            # Capture the pronoun before anything can fail so that every
            # "no substitution" message can name it.
            orig = np[0, 0]
            maxsalience = -1
            referent = None
            e = Entity(np, parsetree, i)
            for entity in entitySet:
                if entity.sentencenum >= i - 4 and e.agreeswith(entity) and maxsalience < entity.salience:
                    maxsalience = entity.salience
                    referent = entity
            if referent is None:
                # No agreeing entity in the window: nothing to substitute.
                print('No substitution found for ', orig)
                continue
            try:
                referent.salience += e.salience
                referent.gender = e.gender
                referent.phrases.add(orig + str(i))
                if np[0].label() == 'PRP$':
                    np[0] = ParentedTree.fromstring('(SUB <' + referent.name + "'s>)")
                    print('PRP$ substitution', orig, '-->', referent.name)
                else:
                    np[0] = ParentedTree.fromstring('(SUB <' + referent.name + '>)')
                    print('PRP substitution', orig, '-->', referent.name)
            except Exception:
                # Best effort, as before: a failed substitution is reported
                # and the sentence continues with the next NP.
                print('No substitution found for ', orig)
                continue

        elif np[0].label() == 'EX':
            continue
        else:
            entitySet.add(Entity(np, parsetree, i))
    halve()
Ejemplo n.º 3
0
def merge_tree_nnps(tree):
    """
    Return a copy of ``tree`` in which the children of every height-3 subtree
    have been passed through ``merge_tagged_nnps``.

    Height-3 subtrees are the phrases whose children are POS nodes that lead
    directly to words, e.g.:
        (NP
            (JJ old)
            (NNP Pierre)
            (NNP Vinken)
        )
    which, with consecutive NNPs merged, becomes:
        (NP
            (JJ old)
            (NNP PierreVinken)
        )
    """
    # A ParentedTree is needed so each subtree knows its own tree position.
    parented = ParentedTree.convert(tree)

    def has_height_three(node):
        return node.height() == 3

    for phrase in parented.subtrees(filter=has_height_three):
        # List form of the phrase's children: [(POS, word), ...]
        tagged_children = [(child.label(), child[0]) for child in phrase]
        merged_children = merge_tagged_nnps(tagged_children)
        children_text = " ".join(
            "(%s %s)" % (pos, word) for pos, word in merged_children
        )
        # Build the replacement phrase and splice it in at the old position.
        replacement = ParentedTree.fromstring(
            "(%s %s)" % (phrase.label(), children_text))
        parented[phrase.treeposition()] = replacement
    return Tree.convert(parented)
Ejemplo n.º 4
0
    def test_labeled_nodes(self):
        '''
        Test labeled nodes.

        Test case from Emily M. Bender.
        '''
        search = '''
            # macros
            @ SBJ /SBJ/;
            @ VP /VP/;
            @ VB /VB/;
            @ VPoB /V[PB]/;
            @ OBJ /OBJ/;

            # 1 svo
            S < @SBJ=s < (@VP=v < (@VB $.. @OBJ)) : =s .. =v'''
        # Subject-first sentence, then the same sentence with the subject last.
        sent1 = ParentedTree.fromstring(
            '(S (NP-SBJ I) (VP (VB eat) (NP-OBJ (NNS apples))))')
        sent2 = ParentedTree.fromstring(
            '(S (VP (VB eat) (NP-OBJ (NNS apples))) (NP-SBJ I))')
        # Macro definitions only, followed by the pattern without node labels.
        macro_section = search.split('\n\n')[0]
        search_firsthalf = macro_section + 'S < @SBJ < (@VP < (@VB $.. @OBJ))'
        search_rewrite = 'S < (/.*SBJ/ $.. (/VP/ < (/VB/ $.. /.*OBJ/)))'

        def find(pattern, sentence):
            return list(tgrep.tgrep_positions(pattern, [sentence]))

        self.assertTrue(find(search_firsthalf, sent1)[0])
        self.assertTrue(find(search, sent1)[0])
        self.assertTrue(find(search_rewrite, sent1)[0])
        self.assertEqual(find(search, sent1), find(search_rewrite, sent1))
        self.assertTrue(find(search_firsthalf, sent2)[0])
        self.assertFalse(find(search, sent2)[0])
        self.assertFalse(find(search_rewrite, sent2)[0])
        self.assertEqual(find(search, sent2), find(search_rewrite, sent2))
Ejemplo n.º 5
0
 def syntax_similarity_two_documents(self, doc1, doc2, average=False): #syntax similarity of two single documents
     """Return a syntactic similarity score for two raw-text documents.

     Both documents are split into sentences, parsed, and every sentence of
     ``doc1`` is compared with every sentence of ``doc2`` using
     ``simple_distance`` on the converted parse trees, normalised by the sum
     of the two sentences' node counts.

     :param doc1: text of the first document.
     :param doc2: text of the second document.
     :param average: if equal to True, return ``1 - mean`` of the whole cost
         matrix; otherwise return one minus the assignment-based total cost
         divided by the larger sentence count.
     :return: the similarity score, or the string ``"NA"`` when any sentence
         has more than 100 whitespace-separated tokens or parsing raises.
     """
     # NOTE(review): numnodes is reset here before each convert_mytree call and
     # read afterwards; presumably convert_mytree increments it -- confirm.
     global numnodes
     doc1sents = self.sent_detector.tokenize(doc1.strip())
     doc2sents = self.sent_detector.tokenize(doc2.strip())
     for s in doc1sents: # to handle unusual long sentences.
         if len(s.split())>100:
             return "NA"
     for s in doc2sents:
         if len(s.split())>100:
             return "NA"
     try: # to handle parse errors. Parser errors might happen in cases where there is an unusually long word in the sentence.
         doc1parsed = self.parser.raw_parse_sents((doc1sents))
         doc2parsed = self.parser.raw_parse_sents((doc2sents))
     except Exception as e:
         sys.stderr.write(str(e))
         return "NA"
     costMatrix = []
     # Keep only the first parse returned for each sentence.
     doc1parsed = list(doc1parsed)
     for i in range(len(doc1parsed)):
         doc1parsed[i] = list(doc1parsed[i])[0]
     doc2parsed = list(doc2parsed)
     for i in range(len(doc2parsed)):
         doc2parsed[i] = list(doc2parsed[i])[0]
     # Build the pairwise cost matrix: rows are doc1 sentences, columns doc2.
     for i in range(len(doc1parsed)):
         numnodes = 0
         sentencedoc1 = ParentedTree.convert(doc1parsed[i])
         tempnode = Node(sentencedoc1.root().label())
         new_sentencedoc1 = self.convert_mytree(sentencedoc1,tempnode)
         temp_costMatrix = []
         # Remember the count for sentence i before numnodes is reset below.
         sen1nodes = numnodes
         for j in range(len(doc2parsed)):
             # Float reset, so the division below is a float division.
             numnodes=0.0
             sentencedoc2 = ParentedTree.convert(doc2parsed[j])
             tempnode = Node(sentencedoc2.root().label())
             new_sentencedoc2 = self.convert_mytree(sentencedoc2,tempnode)
             ED = simple_distance(new_sentencedoc1, new_sentencedoc2)
             # Normalise the distance by the combined node count of both sentences.
             ED = ED / (numnodes + sen1nodes)
             temp_costMatrix.append(ED)
         costMatrix.append(temp_costMatrix)
     costMatrix = np.array(costMatrix)
     if average==True:
         return 1-np.mean(costMatrix)
     else:
         # Sum the cost of the (row, column) pairs chosen by the assignment.
         indexes = su.linear_assignment(costMatrix)
         total = 0
         rowMarked = [0] * len(doc1parsed)
         colMarked = [0] * len(doc2parsed)
         for row, column in indexes:
             total += costMatrix[row][column]
             rowMarked[row] = 1
             colMarked [column] = 1
         # Sentences left without a partner contribute their cheapest match.
         for k in range(len(rowMarked)):
             if rowMarked[k]==0:
                 total+= np.min(costMatrix[k])
         for c in range(len(colMarked)):
             if colMarked[c]==0:
                 total+= np.min(costMatrix[:,c])
         maxlengraph = max(len(doc1parsed),len(doc2parsed))
         return 1-(total/maxlengraph)
Ejemplo n.º 6
0
def test_exact_match():
    """Check match count and first match for a one-word and a two-word query."""
    tree = ParentedTree.fromstring('(S (NP (DT the) (JJ big) (NN cat)) (VP bit) (NP (DT a) (NN cat)))')

    # (query, expected number of matches, expected first match)
    cases = [
        ('cat', 2, '(NN cat)'),
        ('a cat', 1, '(NP (DT a) (NN cat))'),
    ]
    for query, expected_count, expected_first in cases:
        matches = search_by_exact_string_matching(tree, query)
        assert_equal(len(matches), expected_count)
        assert_equal(matches[0], ParentedTree.fromstring(expected_first))
Ejemplo n.º 7
0
    def __init__(self, nlp_sent):
        """
        Build the sentence wrapper from one Stanford CoreNLP sentence dict.

        :param nlp_sent: sentence extracted from a Stanford CoreNLP parse; the
            keys "tokens", "parse" and "enhancedPlusPlusDependencies" are read.
        """
        tokens = nlp_sent['tokens']
        self.raw_dict = {token['word']: token for token in tokens}
        const_parse = ParentedTree.fromstring(nlp_sent['parse'])
        self.clause_trees = parse_to_clauses(const_parse)

        # Dependencies: dependent word -> (relation, governor word).  Words
        # without an entry resolve to (None, None).
        deps = nlp_sent['enhancedPlusPlusDependencies']
        dep_dict = collections.defaultdict(lambda: (None, None))
        try:
            dep_dict.update({
                dep['dependentGloss']: (dep['dep'], dep['governorGloss'])
                for dep in deps
            })
        except Exception:
            # Best effort: malformed dependency data leaves dep_dict empty so
            # every lookup falls back to (None, None).  Unlike a bare
            # ``except``, this lets KeyboardInterrupt/SystemExit propagate.
            pass

        # Word objects, plus a lookup from surface form to word object.
        self.word_list = self.make_words(tokens, dep_dict)
        self.word_dict = dict(
            zip([token['word'] for token in tokens], self.word_list))

        self.clauses = self.integrate_tokens_to_clauses()
Ejemplo n.º 8
0
def get_candidates(treestring: str, verb_idx: int) -> List[List[str]]:
    """Collect multi-token candidate spans around the verb at ``verb_idx``.

    Starting at the verb's non-terminal node and walking up through its
    ancestors, the leaves of every sister constituent are collected.  ``CC``
    sisters are skipped, the immediate children of ``PP`` sisters are
    collected in addition to the PP itself, and punctuation-labelled sisters
    are left out.  Single-token candidates are dropped from the result.
    """
    punctuation_labels = (".", "``", ",", ":")

    tree = ParentedTree.fromstring(treestring)
    # The predicate is the first "current" node; then climb towards the root.
    node = get_verbs_non_terminal_node(tree, verb_idx=verb_idx)
    collected = []
    while node is not None:
        for sister in get_sisters(node):
            label = sister.label()
            if label == "CC":
                # Sisters coordinated with the predicate are not candidates.
                continue
            if label == "PP":
                # For a PP, its immediate children are candidates as well.
                collected.extend(child.leaves() for child in get_children(sister))
            if label not in punctuation_labels:
                collected.append(sister.leaves())
        node = node.parent()

    # A single token always respects the constraint, so it is not useful.
    return [candidate for candidate in collected if len(candidate) != 1]
Ejemplo n.º 9
0
def getConsituentTreeDistribution(core_nlp_files):
    """Count grammar productions per file and over all files.

    Each file must contain exactly one line holding a Python dict literal.
    Its ``SENTENCES`` entry is a list of sentences, and each sentence's
    ``PARSE_TREE`` entry is a bracketed parse string.

    :param core_nlp_files: iterable of ``(file_path, file_name)`` pairs.
    :return: ``(production_dict_for_files, diff_productions)`` where the
        first maps file name (with ``_corenlp1000.txt`` replaced by ``.txt``)
        to a ``{production: count}`` dict and the second is the
        ``{production: count}`` dict summed over all files.  Counts are
        floats.
    :raises AssertionError: if a file does not contain exactly one line.
    """
    import ast

    diff_productions = dict()
    production_dict_for_files = dict()
    for genre_file_path, genre_file_name in core_nlp_files:
        production_dict = dict()
        with open(genre_file_path) as f:
            lines = f.readlines()
        assert len(lines) == 1
        # literal_eval only accepts Python literals, so the file content
        # cannot execute code (the previous exec() could), and the result is
        # a real return value rather than an exec-assigned local, which
        # Python 3 does not support inside a function.
        dictionary = ast.literal_eval(lines[0].strip())
        for sent in dictionary[SENTENCES]:
            t = ParentedTree.fromstring(sent[PARSE_TREE])
            for prod in t.productions():
                diff_productions[prod] = diff_productions.get(prod, 0.0) + 1.0
                production_dict[prod] = production_dict.get(prod, 0.0) + 1.0
        production_dict_for_files[genre_file_name.replace('_corenlp1000.txt', '.txt')] = production_dict
    return production_dict_for_files, diff_productions
Ejemplo n.º 10
0
 def disfile2tree(dis_filepath):
     """Read a *.dis file and return its content as a ParentedTree (NLTK)."""
     with open(dis_filepath) as dis_file:
         raw_tree_str = dis_file.read().strip()
     # Repair the treebank string first, then convert its parentheses, so
     # that the result can be read by ParentedTree.fromstring.
     fixed_tree_str = fix_rst_treebank_tree_str(raw_tree_str)
     parseable_tree_str = convert_parens_in_rst_tree_str(fixed_tree_str)
     return ParentedTree.fromstring(parseable_tree_str)
Ejemplo n.º 11
0
 def get_triples(self, sentence):
     """Parse ``sentence`` and return its (subject, predicate, object) triple.

     Only the first parse returned by the parser is used.
     """
     parses = list(self.parser.raw_parse(sentence))
     tree = ParentedTree.convert(parses[0])
     return (
         self.find_subject(tree),
         self.find_predicate(tree),
         self.find_object(tree),
     )
Ejemplo n.º 12
0
 def create_tree(tree):
     """Recursively rebuild the children of ``tree`` as new ParentedTree nodes.

     A child that has subtrees of its own is rebuilt from its recursively
     converted children under the same label.  Any other child becomes a
     one-element tree holding a ``(decoded word, decoded label)`` tuple; that
     tree is labelled with the child's parent label, or ``None`` when there
     is no parent or the parent is labelled ``S`` or ``ROOT``.

     Relies on ``self`` from the enclosing scope for decoding.
     """
     top_level_labels = ['S', 'ROOT']
     rebuilt = []
     for node in tree:
         has_nested = any(
             True for _ in node.subtrees(filter=lambda k: k != node))
         if has_nested:
             children = create_tree(node)
             rebuilt.append(ParentedTree(node.label(), children))
             continue

         parent = node.parent()
         if parent is not None and parent.label() not in top_level_labels:
             parent_label = parent.label()
         else:
             parent_label = None
         word_and_tag = (self.__decode_(node[0]),
                         self.__decode_(node.label()))
         rebuilt.append(ParentedTree(parent_label, [word_and_tag]))
     return rebuilt
Ejemplo n.º 13
0
def extract_parse_actions(tree):
    """
    Extract a list of ``ShiftReduceAction`` objects for the given tree.

    An empty root label is replaced by ``"ROOT"`` in place before extraction.

    Parameters
    ----------
    tree : nltk.tree.ParentedTree
        The RST tree from which to extract the actions.

    Returns
    -------
    actseq : list
        List of ``ShiftReduceAction`` objects extracted from the tree.

    Raises
    ------
    AssertionError
        If the root label is neither empty nor ``"ROOT"``.
    """
    if tree.label() == '':
        tree.set_label("ROOT")
    assert tree.label() == "ROOT"

    # The helper fills ``raw_actions`` in place; the constituent stack starts
    # with a dummy tree.
    empty_stack = []
    constituent_stack = [ParentedTree.fromstring("(DUMMY0 (DUMMY1 DUMMY3))")]
    raw_actions = []
    _extract_parse_actions_helper(tree, empty_stack, constituent_stack,
                                  raw_actions)

    return _merge_constituent_end_shifts(raw_actions)
Ejemplo n.º 14
0
 def test_node_printing(self):
     '''Test that the tgrep print operator ' is properly ignored.'''
     tree = ParentedTree.fromstring('(S (n x) (N x))')
     # Each pattern must match the same positions with and without the
     # leading print operator.
     pattern_pairs = (
         ('N', "'N"),
         ('/[Nn]/', "'/[Nn]/"),
     )
     for plain, with_print_operator in pattern_pairs:
         self.assertEqual(
             list(tgrep.tgrep_positions(plain, [tree])),
             list(tgrep.tgrep_positions(with_print_operator, [tree])))
Ejemplo n.º 15
0
def get_modparse(sentence):
    """Return ``(parsetree, modparsetree)`` for a sentence.

    The stored parse is looked up through ``SentenceParse`` first.  If that
    lookup fails for any reason, the sentence is parsed and modified afresh
    and an attempt is made to store the result.

    Raises ``ParseError`` if parsing yields no parses, or if the modified
    parse contains no landmark phrase.
    """
    sp_db = SentenceParse.get_sentence_parse(sentence)
    try:
        res = sp_db.all()[0]
        parsetree = res.original_parse
        modparsetree = res.modified_parse
    except:
        # No usable stored parse (e.g. empty result): parse from scratch.
        print "parse.py: 103: " + sentence
        parses = parse_sentences([sentence])
        if len(parses) == 0:
            raise ParseError(printcolors.WARNING + ('ParseError: a sentence was empty'))

        modparses = modify_parses(parses)
        # For each chunk of alternatives, keep the first one containing a
        # landmark phrase (and the original parse at the same index).
        # Iterates over a copy because modparses is rewritten in place.
        for i,chunk in enumerate(modparses[:]):
            for j,modparse in enumerate(chunk):
                if 'LANDMARK-PHRASE' in modparse:
                    modparses[i] = modparse
                    parses[i] = parses[i][j]
                    break
            # Still a list means no alternative matched: take the first one.
            if isinstance(modparses[i],list):
                modparses[i] = modparses[i][0]
                parses[i] = parses[i][0]

        parsetree = parses[0]
        modparsetree = modparses[0]
        try:
            SentenceParse.add_sentence_parse(sentence, parsetree, modparsetree)
        except Exception as e:
            # Storing is best effort; the freshly computed parse is still used.
            print e

    if count_lmk_phrases(ParentedTree.parse(modparsetree)) < 1:
        raise ParseError(printcolors.WARNING + ('ParseError: Parse contained no Landmark phrase.\nSentence: %s\nParse: %s\nModparse: %s' % (sentence,parsetree,modparsetree)))

    return parsetree, modparsetree
Ejemplo n.º 16
0
def get_tree_part(sentence, part):
    """Return the first NP or VP match for ``sentence`` from the tregex service.

    :param sentence: text posted to the service.
    :param part: ``"NP"`` selects the noun-phrase pattern; any other value
        selects the verb-phrase pattern.
    :return: a ``ParentedTree`` built from the first match of the first
        sentence in the response.
    :raises Exception: if the request or JSON decoding fails, or if the
        response cannot be turned into a tree.
    """
    url = "http://corenlp.run:80/tregex"
    request_paramsN = {
        "pattern":
        "(NP[$VP]>S)|(NP[$VP]>S\\n)|(NP\\n[$VP]>S)|(NP\\n[$VP]>S\\n)|(NP[$VP]>SQ)"
    }
    request_paramsV = {
        "pattern":
        "(VP[$NP]>S)|(VP[$NP]>S\\n)|(VP\\n[$NP]>S)|(VP\\n[$NP]>S\\n)|(VP[$NP]>SQ)"
    }
    select = request_paramsN if part == "NP" else request_paramsV
    try:
        request = requests.post(url, data=sentence, params=select)
        # Named response_json so the local does not shadow the json module.
        response_json = request.json()
        if print_switch:
            print(response_json)
    except Exception:
        print("Cannot connect to coreNLP server. Try again later.")
        # Still a plain Exception for existing callers, now with a message.
        raise Exception("Cannot connect to coreNLP server.")
    try:
        string = str(dict(response_json['sentences'][0])['0']['match'])
        return ParentedTree.fromstring(string)
    except Exception:
        print("Parsing issue in sentence:", sentence)
        print("Recieved parse:", nlp.parse(sentence))
        raise Exception("Parsing issue in sentence: %s" % (sentence,))
Ejemplo n.º 17
0
 def test_rel_precedence(self):
     """
     Test matching nodes based on precedence relations.
     """
     tree = ParentedTree.fromstring("(S (NP (NP (PP x)) (NP (AP x)))"
                                    " (VP (AP (X (PP x)) (Y (AP x))))"
                                    " (NP (RC (NP (AP x)))))")
     # (pattern, expected positions), checked in this order.
     expectations = [
         ("* . X", [[(0, ), (0, 1), (0, 1, 0)]]),
         ("* . Y", [[(1, 0, 0), (1, 0, 0, 0)]]),
         ("* .. X", [[(0, ), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0)]]),
         ("* .. Y", [[(0, ), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0),
                      (1, 0, 0), (1, 0, 0, 0)]]),
         ("* , X", [[(1, 0, 1), (1, 0, 1, 0)]]),
         ("* , Y", [[(2, ), (2, 0), (2, 0, 0), (2, 0, 0, 0)]]),
         ("* ,, X", [[(1, 0, 1), (1, 0, 1, 0), (2, ), (2, 0), (2, 0, 0),
                      (2, 0, 0, 0)]]),
         ("* ,, Y", [[(2, ), (2, 0), (2, 0, 0), (2, 0, 0, 0)]]),
     ]
     for pattern, expected in expectations:
         self.assertEqual(list(tgrep.tgrep_positions(pattern, [tree])),
                          expected)
Ejemplo n.º 18
0
 def test_bad_operator(self):
     """
     Test error handling of undefined tgrep operators.
     """
     tree = ParentedTree.fromstring("(S (A (T x)) (B (N x)))")
     # Create the result object outside the assertion; only consuming it
     # with list() is expected to raise.
     positions = tgrep.tgrep_positions("* >>> S", [tree])
     with self.assertRaises(tgrep.TgrepException):
         list(positions)
Ejemplo n.º 19
0
def parse(sentence, use_cache=True, parser='stanford'):
    """Parse ``sentence`` and return a list of ParentedTree objects.

    ``parser`` selects the backend (``"stanford"`` or ``"malt"``); any other
    value yields an empty list.  When ``use_cache`` is true, the filtered
    parser output is read from and written to the cache under a key specific
    to the parser.
    """
    cache_key = "parse_trees_{0}".format(parser)

    # A missing or empty cache entry is treated as "not cached".
    valid_lines = None
    if use_cache:
        valid_lines = cache_get(cache_key, sentence) or None

    if valid_lines is None:
        if parser == "stanford":
            response = parse_stanford(sentence, use_cache=use_cache)
        elif parser == "malt":
            response = parse_malt(sentence, use_cache=use_cache)
        else:
            return []

        # Keep only lines that look like a bracketed tree; everything else in
        # the parser's response is thrown away.  Note this also hides parser
        # error output.
        valid_lines = [
            line for line in response.split("\n")
            if len(line) > 2 and line.startswith("(") and line.endswith(")")
        ]

        if use_cache:
            cache_set(cache_key, sentence, valid_lines)

    return [ParentedTree.parse(line) for line in valid_lines]
Ejemplo n.º 20
0
def add_indices_to_terminals(treestring):
    """Return the tree string with ``_<leaf index>`` appended to every leaf."""
    tree = ParentedTree.fromstring(treestring)
    leaf_count = len(tree.leaves())
    for leaf_index in range(leaf_count):
        # Drop the last path component to reach the node directly above the leaf.
        parent_position = tree.leaf_treeposition(leaf_index)[:-1]
        preterminal = tree[parent_position]
        preterminal[0] = preterminal[0] + "_" + str(leaf_index)
    return str(tree)
Ejemplo n.º 21
0
    def get_example(
            self,  # type: ignore
            tree: ParentedTree,
            ancestor: str):
        """
        Given a ParentedTree, return its tokens and one ancestor label per
        lowest-level subtree.

        Parameters
        ----------
        tree: ParentedTree
            ParentedTree to extract the example from.
        ancestor: str
            Whether the labels should be the parent, grandparent, or great-grandparent
            of each leaf.
            NOTE(review): this argument is not read; the body uses
            ``self._ancestor`` everywhere -- confirm this is intended.

        Returns
        -------
        A ``(tokens, labels)`` tuple: the leaves of ``tree`` and the collected
        labels.
        """
        leaf_tokens = tree.leaves()
        labels: List[str] = []
        for child in tree:
            if not isinstance(child, ParentedTree):
                continue
            # subtrees() includes the child itself, so more than one entry
            # means the child still has nested subtrees to descend into.
            has_nested_subtrees = len(list(child.subtrees())) > 1
            if has_nested_subtrees:
                _, nested_labels = self.get_example(child, self._ancestor)
                labels.extend(nested_labels)
            else:
                labels.append(self._get_label(child, self._ancestor))
        return leaf_tokens, labels
Ejemplo n.º 22
0
def gen_instances(dataset, parses, model):
    """Build segmentation training instances from a discourse dataset.

    Every character of a sentence that matches one of ``model.candidate``
    becomes an instance.  Its label is 1 when its offset is the first or last
    character offset of an EDU of that sentence, else 0.

    Returns ``(instances, labels)`` as two parallel lists.
    """
    candidate_re = re.compile("[%s]" % model.candidate)
    instances, labels = [], []
    for paragraph in chain(*dataset):
        root = paragraph.root_relation()
        if not root:
            continue
        sentences = list(root.iterfind(filter=node_type_filter(Sentence)))
        for sentence in sentences:
            # Offsets on either side of each split point: the first and last
            # character offset of every EDU in the sentence.
            boundary_offsets = set()
            offset = 0
            for edu in sentence.iterfind(filter=node_type_filter(EDU)):
                edu_length = len(edu.text)
                boundary_offsets.update((offset, offset + edu_length - 1))
                offset += edu_length
            # convert tree in parented tree for feature extraction
            parse = ParentedTree.fromstring(parses[sentence.sid].pformat())
            for match in candidate_re.finditer(sentence.text):
                position = match.start()
                instances.append(model.extract_features(position, parse))
                labels.append(int(position in boundary_offsets))
    return instances, labels
def test_reconstruct_training_examples():
    """Check extracted actions for entire training data."""
    # For every training document, the actions extracted from its gold tree
    # must let the parser rebuild that same tree from the document's EDUs.

    # Skip the test when the training data file is not available.
    file_path = Path('rst_discourse_tb_edus_TRAINING_TRAIN.json')
    if not file_path.exists():
        raise SkipTest("training data JSON file not found")

    with open(file_path) as train_data_file:
        documents = json.load(train_data_file)

    rst_parser = Parser(max_acts=1, max_states=1, n_best=1)

    for doc_dict in documents:
        # Gold tree -> actions -> reconstructed tree.
        original_tree = ParentedTree.fromstring(doc_dict['rst_tree'])
        actions = extract_parse_actions(original_tree)
        first_result = next(rst_parser.parse(doc_dict,
                                             gold_actions=actions,
                                             make_features=False))

        eq_(first_result['tree'], original_tree)
Ejemplo n.º 24
0
    def get_all_parts_of_ctree(self, cparse, clabeldict, learn_features):
        """Index the constituency parse by node id and derive span information.

        A copy of ``cparse`` is stored on ``self.cparse``.  Its leaves are
        replaced by consecutive ids starting at 0, then every subtree label is
        replaced by the next ids, with the original label (as returned by
        ``clabeldict.addstr``) recorded in ``self.idxlabelmap``.  When
        ``learn_features`` is true, leaf nodes, paths to the root and the
        lowest common ancestor of every leaf pair are computed as well.

        Raises ``Exception`` if the parse and ``self.tokens`` differ in length
        or the leaf nodes are not in id order.
        """
        self.cparse = ParentedTree.fromstring(str(cparse))
        if len(cparse.leaves()) != len(self.tokens):
            raise Exception("sentences do not line up!")

        # Replace leaves with node-ids.
        next_id = 0
        for leaf_position in self.cparse.treepositions('leaves'):
            self.cparse[leaf_position] = next_id
            next_id += 1
        # Replace internal nodes with node-ids, continuing the numbering.
        for subtree in self.cparse.subtrees():
            self.idxlabelmap[next_id] = clabeldict.addstr(subtree.label())
            subtree.set_label(next_id)
            next_id += 1
        self.get_all_constit_spans()

        if not learn_features:
            return

        # Get stuff for constit features: the nodes directly above the leaves.
        self.leafnodes = [
            node for node in self.cparse.subtrees(lambda t: t.height() == 2)
        ]
        for expected_id, leaf_node in enumerate(self.leafnodes):
            if leaf_node[0] != expected_id:
                raise Exception("order mixup!")
        self.get_cpath_to_root()

        # Get all lowest common ancestors, for every pair with j <= k.
        leaf_count = len(self.leafnodes)
        for j in range(leaf_count):
            for k in range(j, leaf_count):
                lca, lcaid = self.get_lca(self.leafnodes[j], self.leafnodes[k])
                self.lca[(j, k)] = (lca, lcaid)
Ejemplo n.º 25
0
 def test_rel_precedence(self):
     '''
     Test matching nodes based on precedence relations.
     '''
     tree = ParentedTree.fromstring('(S (NP (NP (PP x)) (NP (AP x)))'
                                    ' (VP (AP (X (PP x)) (Y (AP x))))'
                                    ' (NP (RC (NP (AP x)))))')
     # (pattern, expected positions), checked in this order.
     expectations = [
         ('* . X', [[(0, ), (0, 1), (0, 1, 0)]]),
         ('* . Y', [[(1, 0, 0), (1, 0, 0, 0)]]),
         ('* .. X', [[(0, ), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0)]]),
         ('* .. Y', [[(0, ), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0),
                      (1, 0, 0), (1, 0, 0, 0)]]),
         ('* , X', [[(1, 0, 1), (1, 0, 1, 0)]]),
         ('* , Y', [[(2, ), (2, 0), (2, 0, 0), (2, 0, 0, 0)]]),
         ('* ,, X', [[(1, 0, 1), (1, 0, 1, 0), (2, ), (2, 0), (2, 0, 0),
                      (2, 0, 0, 0)]]),
         ('* ,, Y', [[(2, ), (2, 0), (2, 0, 0), (2, 0, 0, 0)]]),
     ]
     for pattern, expected in expectations:
         self.assertEqual(list(tgrep.tgrep_positions(pattern, [tree])),
                          expected)
Ejemplo n.º 26
0
def get_sentence_posteriors(sentence, iterations=1, extra_meaning=None):
    """Estimate normalised probabilities of meanings for ``sentence``.

    The sentence is parsed, then ``iterations`` meanings are sampled with
    ``get_meaning``.  Each distinct meaning is scored as the product of the
    tree probabilities from ``get_tree_probs``.  If ``extra_meaning`` (a
    ``(lmk, rel, ...)`` sequence accepted by ``m2s``) is given, it is scored
    too.  The scores are divided by their sum before being returned.

    :return: the ``items()`` of the ``{meaning: probability}`` dict.
    """
    meaning_probs = {}
    # parse sentence with charniak and apply surgeries
    print('parsing ...')
    modparse = get_modparse(sentence)
    t = ParentedTree.parse(modparse)
    print('\n%s\n' % t.pprint())
    num_ancestors = count_lmk_phrases(t) - 1

    for _ in range(iterations):
        (lmk, _, _), (rel, _, _) = get_meaning(num_ancestors=num_ancestors)
        meaning = m2s(lmk, rel)
        if meaning not in meaning_probs:
            ps = get_tree_probs(t, lmk, rel)[0]
            meaning_probs[meaning] = np.prod(ps)
        print('.')

    if extra_meaning:
        meaning = m2s(*extra_meaning)
        if meaning not in meaning_probs:
            # Score the extra meaning's own landmark and relation.  The
            # previous code reused lmk/rel left over from the sampling loop,
            # which scored the wrong meaning (or raised NameError when
            # iterations was 0).
            extra_lmk, extra_rel = extra_meaning[0], extra_meaning[1]
            ps = get_tree_probs(t, extra_lmk, extra_rel)[0]
            meaning_probs[meaning] = np.prod(ps)
        print('.')

    summ = sum(meaning_probs.values())
    for key in meaning_probs:
        meaning_probs[key] /= summ
    return meaning_probs.items()
Ejemplo n.º 27
0
    def __init__(self,
                 corpus_path,
                 pos_path=None,
                 parse_path=None,
                 dep_path=None):
        """Load the token corpus and any optional parallel annotation corpora.

        ``corpus_path`` is always loaded.  The POS, parse and dependency
        corpora are loaded only when their path is given and otherwise stay
        empty dicts.  Parse entries are converted to ParentedTree objects.
        EDU offsets are computed for every entry of the token corpus.

        Raises ``AssertionError`` if a dependency entry's length differs from
        the number of EDU offsets stored under the same key.
        """
        self.corpus = load_corpus(corpus_path)

        self.pos_corpus = load_corpus(pos_path) if pos_path is not None else {}

        if parse_path is not None:
            self.parse_corpus = load_corpus(
                parse_path, lambda x: ParentedTree.fromstring(x))
        else:
            self.parse_corpus = {}

        self.edu_corpus = {
            key: argument.get_EDU_offsets(tokens)
            for key, tokens in self.corpus.items()
        }

        self.dep_corpus = {}
        if dep_path is None:
            return
        self.dep_corpus = load_corpus(
            dep_path,
            postprocess_dep_entry,
            preprocess_dep_entry,
        )
        # Every dependency entry must line up with the EDUs of the same key.
        for key, dep_entry in self.dep_corpus.items():
            assert (len(dep_entry) == len(self.edu_corpus[key]))
Ejemplo n.º 28
0
def get_pp_old(text):
    """Collect the prepositions that head PPs in ``text``.

    Returns a dict whose keys are the collected phrases (every value is
    ``True``).  A PP with no left sibling contributes nothing.  Otherwise:

    * after a noun-labelled sibling, the key is the preposition alone;
    * after a verb- or adjective-labelled sibling, the key is the base form
      of that sibling's text followed by the preposition;
    * after any other sibling, nothing is recorded.
    """
    phrases = {}

    for structure in parser.parse(nltk.word_tokenize(text)):
        tree = ParentedTree.convert(structure)
        for subtree in tree.subtrees():
            if subtree.label() != "PP":
                continue
            preposition = subtree.leaves()[0]
            left_sibling = subtree.left_sibling()
            if left_sibling is None:
                continue

            sibling_label = left_sibling.label()
            if is_noun(sibling_label):
                key = preposition
            elif is_verb(sibling_label):
                base = convert_to_base_form(
                    " ".join(left_sibling.leaves()), 'v')
                key = base + " " + preposition
            elif is_adj(sibling_label):
                base = convert_to_base_form(
                    " ".join(left_sibling.leaves()), 'a')
                key = base + " " + preposition
            else:
                continue
            phrases[key] = True

    return phrases
Ejemplo n.º 29
0
    def _parse_trees(self, file):
        """Read ``file`` and return one ParentedTree per tokenized s-expression.

        Every line is stripped and the lines are concatenated without a
        separator before being handed to ``self._tokenizer``.
        """
        with open(file, 'r') as handle:
            joined_text = ''.join(line.strip() for line in handle)

        return [
            ParentedTree.fromstring(s_expr)
            for s_expr in self._tokenizer.tokenize(joined_text)
        ]
Ejemplo n.º 30
0
 def test_bad_operator(self):
     '''
     Test error handling of undefined tgrep operators.
     '''
     tree = ParentedTree.fromstring('(S (A (T x)) (B (N x)))')
     # Create the result object outside the assertion; only consuming it
     # with list() is expected to raise.
     positions = tgrep.tgrep_positions('* >>> S', [tree])
     with self.assertRaises(tgrep.TgrepException):
         list(positions)
Ejemplo n.º 31
0
def findSentencePTreeToken(sentence, keyword):
	"""Return the parent nodes of all leaves whose stem matches ``keyword``.

	``sentence`` is parsed with ``proc.parse_doc``.  In every parsed sentence,
	each leaf whose ``_stem_`` equals ``_lemma_(keyword)`` contributes the
	node directly above it.  Matches from all sentences are returned in one
	list.
	"""
	import nltk
	from nltk.tree import ParentedTree

	target = _lemma_(keyword)
	parsed_doc = proc.parse_doc(sentence)

	matches = []
	for parsed_sentence in parsed_doc['sentences']:
		ptree = ParentedTree.fromstring(parsed_sentence['parse'])
		for leaf_index in range(len(ptree.leaves())):
			leaf_position = ptree.leaf_treeposition(leaf_index)
			if _stem_(ptree[leaf_position]) == target:
				# Drop the last path component to get the leaf's parent node.
				matches.append(ptree[leaf_position[:-1]])
	return matches
Ejemplo n.º 32
0
def conll2tree(arr):
    """Build a tree from CoNLL rows and return the result of ``updateTree``.

    Row format: idx, word, _, pos, pos, _, head, _, _, _
    Only column 0 (token index) and column 6 (head index) are read.
    """
    # dependent index -> head index
    head_of = {int(row[0]): int(row[6]) for row in arr}

    # head index -> list of its dependents
    dependents_of = {}
    for dependent in head_of:
        dependents_of.setdefault(head_of[dependent], []).append(dependent)

    # head index -> one-level tree labelled with the head, holding the
    # dependents' indices as children.  Multiple roots are not checked here.
    subtree_of = {
        head: ParentedTree(head, dependents)
        for head, dependents in dependents_of.items()
    }

    # Index 0 is used as the starting (root) head.
    return updateTree(subtree_of, 0)
 def parse_sentences(self, filename, num_sentences):
     """Parses each one-line sentence into a syntax tree

     For every parse tree of every sentence, the ROOT count is incremented,
     rules are extracted (only for sentences longer than 8 tokens) and the
     bigram counts are updated.

     :param filename: path of a text file with one sentence per line.
     :param num_sentences: number of leading sentences to process, or the
         string ``'all'`` to process every line of the file.
     """
     # 'all' means no limit.  A slice bound of None keeps every line, whereas
     # the former -1 silently dropped the last sentence of the file.
     limit = None if num_sentences == 'all' else num_sentences
     # Read inside a context manager so the file is always closed.
     with open(filename, 'r') as f:
         sentences = f.readlines()[:limit]
     for count, sentence in enumerate(sentences):
         if count % 10 == 0:
             print("Number of sentences trained: ", count)
         # Get possible parse trees
         trees = self.parser.raw_parse(sentence.lower())
         for tree in trees:
             self.nonterminal_counts['ROOT'] += 1
             tokenized_sentence = self.tokenize_sentence(sentence)
             # Only extract rules from sentences with greater than 8 tokens,
             # to avoid adding rules that generate short, ungrammatical sentences
             if len(tokenized_sentence) > 8:
                 self.extract_rules(tree)
             # Convert the tree into a ParentedTree,
             # which is an NLTK tree that keeps pointers to each node's parent
             ptree = ParentedTree.convert(tree)
             # Calculate the bigram counts for this sentence
             self.get_bigram(ptree, tokenized_sentence)
Ejemplo n.º 34
0
 def test_node_printing(self):
     '''Test that the tgrep print operator ' is properly ignored.'''
     tree = ParentedTree.fromstring('(S (n x) (N x))')
     # Each pattern must match the same positions with and without the
     # leading print operator.
     pattern_pairs = (
         ('N', "'N"),
         ('/[Nn]/', "'/[Nn]/"),
     )
     for plain, with_print_operator in pattern_pairs:
         self.assertEqual(
             list(tgrep.tgrep_positions(plain, [tree])),
             list(tgrep.tgrep_positions(with_print_operator, [tree])))
def check(sent):
    """Parse ``sent`` and extract its subject, predicate and object.

    The sentence is parsed with a fresh ``StanfordParser``; the first parse is
    converted to a ``ParentedTree`` and handed to ``find_subject``,
    ``find_predicate`` and ``find_object``. Extraction is best-effort: if one
    of the finders raises, an empty list is returned for that slot.

    :param sent: the raw sentence to analyse
    :return: a ``(subj, pred, obj)`` tuple
    """
    parser = StanfordParser()

    # Only the first parse returned by the parser is used.
    t = list(parser.raw_parse(sent))[0]
    t = ParentedTree.convert(t)

    # `except Exception` keeps the best-effort fallback but, unlike a bare
    # `except:`, lets KeyboardInterrupt / SystemExit propagate.
    try:
        subj = find_subject(t)
    except Exception:
        subj = []
    try:
        pred = find_predicate(t)
    except Exception:
        pred = []
    try:
        obj = find_object(t)
    except Exception:
        obj = []

    return subj, pred, obj
Ejemplo n.º 36
0
    def parse_text(self, text):
        """Send ``text`` to the service at ``self.url`` and yield one tuple per sentence.

        The request asks for XML output; the response is parsed with
        ``xml.parse`` and each sentence yields
        ``(parse tree, dependencies, raw sentence)``. Leaves of the parse tree
        are lower-cased and node labels are cut at the first ``-``.
        """
        properties = {
            'outputFormat': 'xml',
            'annotators': 'tokenize,pos,lemma,ssplit,parse,depparse'
        }
        query = {'properties': json.dumps(properties)}
        payload = text.encode(self.encoding)

        response = self.session.post(self.url, params=query, data=payload, timeout=60)
        response.raise_for_status()

        document = xml.parse(response.text)
        found = document['root']['document']['sentences']['sentence']
        # a single sentence is not wrapped in a list; normalise to a list
        if not isinstance(found, list):
            found = [found]

        for entry in found:
            tree = ParentedTree.fromstring(
                entry['parse'],
                read_leaf=lambda leaf: leaf.lower(),
                read_node=lambda node: node.split("-")[0])
            deps = self.make_deps(entry['dependencies'][3]['dep'])
            raw_sentence = self.create_raw_sentence(entry['tokens']['token'])
            yield (tree, deps, raw_sentence)
Ejemplo n.º 37
0
 def test_use_macros(self):
     '''
     Define tgrep2 macros, use them in a pattern, and check that an
     undefined macro is rejected.
     '''
     tree = ParentedTree.fromstring(
         '(VP (VB sold) (NP (DET the) '
         '(NN heiress)) (NP (NN deed) (PREP to) '
         '(NP (DET the) (NN school) (NN house))))'
     )
     defined_pattern = '@ NP /^NP/;\n@ NN /^NN/;\n@NP !< @NP !$.. @NN'
     found = list(tgrep.tgrep_positions(defined_pattern, [tree]))
     self.assertEqual(found, [[(1,), (2, 2)]])
     # @CNP is never defined, so consuming the result must raise
     undefined_pattern = '@ NP /^NP/;\n@ NN /^NN/;\n@CNP !< @NP !$.. @NN'
     pending = tgrep.tgrep_positions(undefined_pattern, [tree])
     self.assertRaises(tgrep.TgrepException, list, pending)
Ejemplo n.º 38
0
 def test_node_nocase(self):
     '''
     Compare an exact node-name match with a case-insensitive (i@) one.
     '''
     tree = ParentedTree.fromstring('(S (n x) (N x))')
     exact_hits = list(tgrep.tgrep_positions('"N"', [tree]))
     self.assertEqual(exact_hits, [[(1,)]])
     nocase_hits = list(tgrep.tgrep_positions('i@"N"', [tree]))
     self.assertEqual(nocase_hits, [[(0,), (1,)]])
Ejemplo n.º 39
0
def _is_leaf(tree: ParentedTree):
    """
    Tells whether the given tree counts as a leaf, i.e. its height is exactly 2.
    :param tree: a ParentedTree instance
    :return: True when ``tree.height()`` equals 2, False otherwise
    """
    height = tree.height()
    return height == 2
Ejemplo n.º 40
0
    def add_tree(self, datum):
        """Binarize the tree in ``datum["raw_tree"]`` and write it to the output files.

        The tree is parsed, put into Chomsky normal form, unary chains are
        collapsed, and it is converted to a ``ParentedTree``. One line of
        parent indices is written to ``self.parents_file`` and one line of
        leaf strings to ``self.sents_file``; ``datum`` is then stored in
        ``self.trees``.

        :param datum: mapping with a ``"raw_tree"`` bracketed tree string
        :return: the index of ``datum`` in ``self.trees`` (its ID)
        """
        # `unicode` only exists on Python 2. Referencing it unconditionally
        # raised NameError on Python 3 for every subtree whose first child is
        # itself a tree.
        try:
            string_types = (str, unicode)  # noqa: F821
        except NameError:
            string_types = (str,)

        # parse tree and binarize
        tree = Tree.fromstring(datum["raw_tree"])
        tree.chomsky_normal_form()
        tree.collapse_unary(collapsePOS=True)
        tree = ParentedTree.convert(tree)

        # assign indices to subtrees, in traversal order
        indices = {}
        for counter, t in enumerate(tree.subtrees()):
            indices[t.treeposition()] = counter

        # generate parent pointers and labels
        # (labels = one instance of sent in sents by treelstm terminology)
        parents = []
        labels = []
        for t in tree.subtrees():
            parent = t.parent()
            # the root has no parent and contributes no entry
            if parent is not None:
                parents.append(indices[parent.treeposition()])
            if isinstance(t[0], string_types):
                labels.append(t[0])

        self.parents_file.write(" ".join(map(str, parents)) + "\n")
        self.sents_file.write(" ".join(labels) + "\n")
        self.trees.append(datum)
        return len(self.trees) - 1  # ID
Ejemplo n.º 41
0
 def test_node_nocase(self):
     '''
     Compare an exact node-name match with a case-insensitive (i@) one.
     '''
     tree = ParentedTree.fromstring('(S (n x) (N x))')
     expectations = (
         ('"N"', [[(1,)]]),
         ('i@"N"', [[(0,), (1,)]]),
     )
     for pattern, expected in expectations:
         self.assertEqual(list(tgrep.tgrep_positions(pattern, [tree])), expected)
Ejemplo n.º 42
0
 def test_use_macros(self):
     '''
     Define tgrep2 macros, use them in a pattern, and check that an
     undefined macro is rejected.
     '''
     tree = ParentedTree.fromstring(
         '(VP (VB sold) (NP (DET the) '
         '(NN heiress)) (NP (NN deed) (PREP to) '
         '(NP (DET the) (NN school) (NN house))))'
     )
     trees = [tree]
     positions = tgrep.tgrep_positions(
         '@ NP /^NP/;\n@ NN /^NN/;\n@NP !< @NP !$.. @NN', trees
     )
     self.assertEqual(list(positions), [[(1,), (2, 2)]])
     # the second pattern refers to @CNP, which was never defined
     unresolved = tgrep.tgrep_positions(
         '@ NP /^NP/;\n@ NN /^NN/;\n@CNP !< @NP !$.. @NN', trees
     )
     self.assertRaises(tgrep.TgrepException, list, unresolved)
Ejemplo n.º 43
0
def gen(files):
    """Yield one row per file in ``files``.

    Each file is read in full and parsed with ``ParentedTree.parse``; the
    tree's ``pos()`` output is passed through ``getLocalContext`` and the
    result, together with the file name, is given to ``makeRow``.
    """
    for path in files:
        with open(path) as handle:
            text = handle.read()
            tagged = ParentedTree.parse(text).pos()
        context = getLocalContext(tagged)
        yield makeRow(context, path)
def j_is_subject(feats):
    """Build the ``j_is_subject=...`` feature string for ``feats.token_ref``.

    The sentence tree is looked up in ``TREES_DICTIONARY`` under
    ``feats.article + ".raw"`` at index ``feats.sentence_ref``; the answer
    itself comes from ``__is_subject__``.
    """
    article_trees = TREES_DICTIONARY[feats.article+".raw"]
    ptree = ParentedTree.convert(article_trees[int(feats.sentence_ref)])
    parent = __get_parent_tree__(feats.token_ref, ptree)
    verdict = __is_subject__(ptree, feats.token_ref, parent, ptree)
    return "j_is_subject={}".format(verdict)
Ejemplo n.º 45
0
def _preprosess(root: ParentedTree):
    """
    Works on a deep copy of the lexical tree: strips everything from the first '-' onwards
    in each label and overwrites each token with its 1-based position in the sentence.
    :param root: the root of the lexical tree
    :return: a tuple of the processed copy and the list of (tag, token) pairs
    """
    working_copy: ParentedTree = root.copy(deep=True)

    # i.e. [('NR', '上海'), ('NR', '浦东'), ('NN', '开发'), ('CC', '与'), ...]
    collected = []

    def _walk(node: ParentedTree):
        # keep only the part of the tag before the first '-'
        label = node.label()
        if '-' in label:
            node.set_label(label.split('-')[0])
        if _is_leaf(node):
            collected.append((node.label(), node[0]))  # (tag, token)
            # the token's index is its position among the leaves seen so far
            node[0] = len(collected)
            return
        for child in node:
            _walk(child)

    _walk(working_copy)

    return working_copy, collected
def is_pred_nominal(feats):
    """Build the ``is_pred_nominal=...`` feature string.

    The result is False when the two mentions are in different sentences.
    Otherwise the sentence tree is searched for the subtree of ``feats.token``
    whose right sibling is a "VP" that starts with a copula verb and has the
    maximal projection of ``feats.token_ref``'s subtree among its children.
    """
    if feats.sentence != feats.sentence_ref:
        return "is_pred_nominal={}".format(False)

    article_trees = TREES_DICTIONARY[feats.article+".raw"]
    s_tree = ParentedTree.convert(article_trees[int(feats.sentence)])
    NP_i = __get_parent_tree__(feats.token, s_tree)
    NP_j = __get_parent_tree__(feats.token_ref, s_tree)
    nominal = __get_max_projection__(s_tree, NP_j)
    copula_verbs = ("is", "are", "were", "was", "am")

    def check_nominal_construction(tree):
        for t in tree:
            if not isinstance(t, ParentedTree):
                continue
            if t == NP_i:
                brother = t.right_sibling()
                if not (isinstance(brother, ParentedTree) and brother.node == "VP"):
                    continue
                # the first leaf of the VP has to be a copula
                if brother.leaves()[0] not in copula_verbs:
                    continue
                if any(subtree == nominal for subtree in brother):
                    return True
            elif check_nominal_construction(t):
                return True
        return False

    return "is_pred_nominal={}".format(check_nominal_construction(s_tree))
Ejemplo n.º 47
0
 def syntax_similarity_conversation(self, documents1, average=False): #syntax similarity of each document with its before and after document
     """Score the syntactic similarity of each document with the one that follows it.

     Every document is split into sentences and parsed; each pair of
     consecutive documents ``(d1, d1 + 1)`` is then compared through a cost
     matrix of normalised ``simple_distance`` values between their sentence
     trees.

     :param documents1: sequence of document strings, in conversation order
     :param average: when True, return ``1 - mean(cost matrix)`` for the first
         comparable pair instead of building the result mapping
     :return: an ``OrderedDict`` mapping ``(d1, d2)`` to a similarity score, or
         a single number when ``average`` is True
     """
     # NOTE(review): numnodes is a module-level counter that is reset here before
     # each convert_mytree call and read afterwards; presumably convert_mytree
     # increments it per node -- confirm against its definition.
     global numnodes
     documents1parsed = []
     for d1 in range(len(documents1)):
         # progress indicator on stderr
         sys.stderr.write(str(d1)+"\n")
         # print documents1[d1]
         tempsents = (self.sent_detector.tokenize(documents1[d1].strip()))
         for s in tempsents:
             # a sentence of more than 100 whitespace-separated tokens marks the
             # whole document as "NA"; it is not parsed
             if len(s.split())>100:
                 documents1parsed.append("NA")
                 break
         else:
             # for/else: reached only when no sentence triggered the break above
             temp = list(self.parser.raw_parse_sents((tempsents)))
             for i in range(len(temp)):
                 # keep only the first parse of each sentence
                 temp[i] = list(temp[i])[0]
                 temp[i] = ParentedTree.convert(temp[i])
             documents1parsed.append(list(temp))
     results = OrderedDict()
     for d1 in range(len(documents1parsed)):
         d2 = d1+1
         # the last document has no successor
         if d2 == len(documents1parsed):
             break
         # skip pairs in which either document was not parsed
         if documents1parsed[d1] == "NA" or documents1parsed[d2]=="NA":
             continue
         # costMatrix[i][j]: normalised distance between sentence i of d1 and sentence j of d2
         costMatrix = []
         for i in range(len(documents1parsed[d1])):
             numnodes = 0
             tempnode = Node(documents1parsed[d1][i].root().label())
             new_sentencedoc1 = self.convert_mytree(documents1parsed[d1][i],tempnode)
             temp_costMatrix = []
             # value of the global counter after converting the d1 sentence
             sen1nodes = numnodes
             for j in range(len(documents1parsed[d2])):
                 numnodes=0.0
                 tempnode = Node(documents1parsed[d2][j].root().label())
                 new_sentencedoc2 = self.convert_mytree(documents1parsed[d2][j],tempnode)
                 ED = simple_distance(new_sentencedoc1, new_sentencedoc2)
                 # normalise by the sum of the two counter values
                 ED = ED / (numnodes + sen1nodes)
                 temp_costMatrix.append(ED)
             costMatrix.append(temp_costMatrix)
         costMatrix = np.array(costMatrix)
         if average==True:
             # NOTE(review): this returns from inside the loop, so only the first
             # comparable pair is ever scored in average mode -- confirm intended.
             return 1-np.mean(costMatrix)
         else:
             # pair up sentences via su.linear_assignment and sum the matched costs
             indexes = su.linear_assignment(costMatrix)
             total = 0
             rowMarked = [0] * len(documents1parsed[d1])
             colMarked = [0] * len(documents1parsed[d2])
             for row, column in indexes:
                 total += costMatrix[row][column]
                 rowMarked[row] = 1
                 colMarked [column] = 1
             # sentences left out of the assignment contribute their smallest cost
             for k in range(len(rowMarked)):
                 if rowMarked[k]==0:
                     total+= np.min(costMatrix[k])
             for c in range(len(colMarked)):
                 if colMarked[c]==0:
                     total+= np.min(costMatrix[:,c])
             # normalise by the larger sentence count and turn cost into similarity
             maxlengraph = max(len(documents1parsed[d1]),len(documents1parsed[d2]))
             results[(d1,d2)] = 1-total/maxlengraph#, minWeight/minlengraph, randtotal/lengraph
     return results
Ejemplo n.º 48
0
 def test_rel_precedence(self):
     '''
     Check the precedence operators (. .. , ,,) against one fixed tree.
     '''
     tree = ParentedTree.fromstring('(S (NP (NP (PP x)) (NP (AP x)))'
                                    ' (VP (AP (X (PP x)) (Y (AP x))))'
                                    ' (NP (RC (NP (AP x)))))')
     # (pattern, expected positions), checked in this order
     expectations = [
         ('* . X', [[(0,), (0, 1), (0, 1, 0)]]),
         ('* . Y', [[(1, 0, 0), (1, 0, 0, 0)]]),
         ('* .. X', [[(0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0)]]),
         ('* .. Y', [[(0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0),
                      (1, 0, 0), (1, 0, 0, 0)]]),
         ('* , X', [[(1, 0, 1), (1, 0, 1, 0)]]),
         ('* , Y', [[(2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)]]),
         ('* ,, X', [[(1, 0, 1), (1, 0, 1, 0), (2,), (2, 0), (2, 0, 0),
                      (2, 0, 0, 0)]]),
         ('* ,, Y', [[(2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)]]),
     ]
     for pattern, expected in expectations:
         self.assertEqual(list(tgrep.tgrep_positions(pattern, [tree])), expected)
def apposition(feats):
    """Build the ``apposition=...`` feature string.

    The result is False when the two mentions are in different sentences.
    Otherwise the sentence tree is searched for a child of an "NP" node that
    contains a word of ``feats.token_ref``, whose left sibling is a "," node,
    whose own left sibling contains a word of ``feats.token``.
    """
    if feats.sentence != feats.sentence_ref:
        return "apposition={}".format(False)

    article_trees = TREES_DICTIONARY[feats.article+".raw"]
    ptree = ParentedTree.convert(article_trees[int(feats.sentence_ref)])
    # multi-word mentions arrive joined with "_"
    token_ref = set(feats.token_ref.split("_"))
    token = set(feats.token.split("_"))

    def is_j_apposition(curr_tree):
        for child in curr_tree:
            if not isinstance(child, ParentedTree):
                continue
            child_leaves = set(child.leaves())
            if len(token_ref.intersection(child_leaves)) > 0 and curr_tree.node == "NP":
                brother = child.left_sibling()
                if not (isinstance(brother, ParentedTree) and brother.node == ","):
                    continue
                antecedent = brother.left_sibling()
                if not isinstance(antecedent, ParentedTree):
                    continue
                previous_words = set(antecedent.leaves())
                if len(token.intersection(previous_words)) > 0:
                    return True
            elif is_j_apposition(child):
                return True
        return False

    return "apposition={}".format(is_j_apposition(ptree))
Ejemplo n.º 50
0
def get_right_sibling(tree, pos, ct):
    """Collect the leaves that follow the connective at leaf index ``pos``.

    :param tree: parse tree that contains the connective
    :param pos: index of the connective among the tree's leaves
    :param ct: connective object; its ``token`` attribute is looked up among
        the collected leaves
    :return: the leaves of the right sibling when that sibling is labelled
        ``'S'``; otherwise the leaves of the connective's parent (extended
        with the parent's right sibling when the connective has no right
        sibling of its own) from the connective onwards; ``None`` when ``pos``
        is not a valid leaf index
    :raises ValueError: when ``ct.token`` is not among the collected leaves
    """
    if pos not in range(len(tree.pos())):
        return None

    nodepos = tree.leaf_treeposition(pos)
    pt = ParentedTree.convert(tree)
    # node directly above the connective leaf
    conn_node = pt[nodepos[:-1]]
    rs = conn_node.right_sibling()
    if rs:
        if rs.label() == 'S':
            # the conn is connecting one or two S-es, take the right sibling S as int arg
            return rs.leaves()
        # assuming that there are no duplicates of the connective anymore at this level of detail
        leaves = conn_node.parent().leaves()
    else:
        # it's on the same level with its arg, which is not an S-clause
        parent = conn_node.parent()
        leaves = parent.leaves()
        # the connective may well be at the end of the clause, in which case the
        # parent's right sibling is included too -- but only if one exists;
        # right_sibling() is None for the last child and used to crash here
        right_sibling = parent.right_sibling()
        if right_sibling is not None:
            leaves = leaves + right_sibling.leaves()

    connindex = leaves.index(ct.token)
    return leaves[connindex:]
Ejemplo n.º 51
0
 def test_bad_operator(self):
     '''
     An undefined tgrep operator must raise TgrepException when the
     result is consumed.
     '''
     tree = ParentedTree.fromstring('(S (A (T x)) (B (N x)))')
     pending = tgrep.tgrep_positions('* >>> S', [tree])
     self.assertRaises(tgrep.TgrepException, list, pending)
Ejemplo n.º 52
0
 def test_node_regex(self):
     '''
     Match node names with a regular expression.
     '''
     tree = ParentedTree.fromstring('(S (NP-SBJ x) (NP x) (NNP x) (VP x))')
     # /^NP/ matches every node whose name starts with NP,
     # NP-SBJ included
     hits = list(tgrep.tgrep_positions('/^NP/', [tree]))
     self.assertEqual(hits, [[(0,), (1,)]])
Ejemplo n.º 53
0
 def test_node_quoted(self):
     '''
     Select nodes by quoted node names, including escaped quotes.
     '''
     tree = ParentedTree.fromstring('(N ("N" x) (N" x) ("\\" x))')
     # (pattern, expected positions), checked in this order
     expectations = (
         ('"N"', [[()]]),
         ('"\\"N\\""', [[(0,)]]),
         ('"N\\""', [[(1,)]]),
         ('"\\"\\\\\\""', [[(2,)]]),
     )
     for pattern, expected in expectations:
         self.assertEqual(list(tgrep.tgrep_positions(pattern, [tree])), expected)
Ejemplo n.º 54
0
def get_sentence_meaning_likelihood(sentence, lmk, rel):
    """Score ``sentence`` against a landmark and relation.

    The sentence is turned into a modified parse by ``get_modparse``, read
    into a ``ParentedTree`` and printed; ``get_tree_probs`` then supplies the
    per-node probabilities and entropies. A zero probability product is
    reported through ``logger``.

    :param sentence: the sentence to score
    :param lmk: landmark passed through to ``get_tree_probs``
    :param rel: relation passed through to ``get_tree_probs``
    :return: ``(product of probs, sum of entropies, lrpc, tps)``
    """
    modparse = get_modparse(sentence)
    t = ParentedTree.parse(modparse)
    # Parenthesised single-argument form: same output on Python 2, and no
    # longer a SyntaxError on Python 3.
    print('\n%s\n' % t.pprint())

    probs, entropies, lrpc, tps = get_tree_probs(t, lmk, rel)
    # compute the product once and reuse it for the check and the result
    likelihood = np.prod(probs)
    if likelihood == 0.0:
        logger('ERROR: Probability product is 0 for sentence: %s, lmk: %s, rel: %s, probs: %s' % (sentence, lmk, rel, str(probs)))
    return likelihood, sum(entropies), lrpc, tps
def vertical_imbalance(furcation_node_dict):
    """Return the largest standard deviation of child heights over all nodes.

    Each item of ``furcation_node_dict`` is read as a bracketed tree string;
    the standard deviation of its children's heights is computed with numpy,
    and the maximum over all items is returned (0 if none is greater than 0).
    """
    largest = 0
    for bracketed in furcation_node_dict:
        subtree = ParentedTree.fromstring(bracketed)
        heights = numpy.array([kid.height() for kid in subtree])
        spread = numpy.std(heights)
        largest = spread if spread > largest else largest
    return largest
def span(feats):
    """Build the ``span=...`` feature string.

    The result is False when the two mentions are in different sentences;
    otherwise it reports whether ``__get_parent_tree__`` returns equal
    subtrees for ``feats.token`` and ``feats.token_ref``.
    """
    if feats.sentence != feats.sentence_ref:
        return "span={}".format(False)

    article_trees = TREES_DICTIONARY[feats.article+".raw"]
    s_tree = ParentedTree.convert(article_trees[int(feats.sentence)])
    i_parent = __get_parent_tree__(feats.token, s_tree)
    j_parent = __get_parent_tree__(feats.token_ref, s_tree)
    same_span = i_parent == j_parent
    return "span={}".format(same_span)
Ejemplo n.º 57
0
 def test_node_noleaves(self):
     '''
     Match a leaf name with the default arguments and again with False
     passed as the third argument (search_leaves).
     '''
     tree = ParentedTree.fromstring('(S (A (T x)) (B (N x)))')
     with_leaves = list(tgrep.tgrep_positions('x', [tree]))
     self.assertEqual(with_leaves, [[(0, 0, 0), (1, 0, 0)]])
     without_leaves = list(tgrep.tgrep_positions('x', [tree], False))
     self.assertEqual(without_leaves, [[]])