def parse(text):
    """
    Primary function to run syntaxnet and PredPatt over input sentences.
    """
    parse_tree, trace = annotate_text(text)
    conll_parsed = parse_to_conll(parse_tree)

    conll_pp = [ud_parse for sent_id, ud_parse in load_conllu(conll_parsed)][0]

    # PredPatt options. Modify as needed.
    resolve_relcl = True  # relative clauses
    resolve_appos = True  # appositional modifiers
    resolve_amod = True   # adjectival modifiers
    resolve_conj = True   # conjunction
    resolve_poss = True   # possessives
    ud = dep_v2.VERSION   # the version of UD
    opts = PredPattOpts(resolve_relcl=resolve_relcl,
                        resolve_appos=resolve_appos,
                        resolve_amod=resolve_amod,
                        resolve_conj=resolve_conj,
                        resolve_poss=resolve_poss,
                        ud=ud)
    ppatt = PredPatt(conll_pp, opts=opts)
    predicate_deps, arg_deps = get_ud_fragments(ppatt)

    # NOTE:
    # This returns the pretty-print formatted string from PredPatt, largely as
    # a placeholder for JSON compatibility within the REST API.
    return {'predpatt': {'predicate_deps': predicate_deps,
                         'arg_deps': arg_deps},
            'conll': conll_parsed,
            'original': text}
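A quick usage sketch, hedged: annotate_text and parse_to_conll come from the surrounding service code and assume a running SyntaxNet backend.

result = parse("Chris loves Pat .")
print(result['predpatt']['predicate_deps'])
print(result['conll'])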
Example #2
def extract_triples(input_remaining, params):
    opts = PredPattOpts(
        resolve_relcl=True,  # relative clauses
        resolve_appos=True,  # appositional modifiers
        resolve_amod=True,  # adjectival modifiers
        resolve_conj=True,  # conjunction
        resolve_poss=True,  # possessives
        ud=dep_v1.VERSION,  # the version of UD
    )
    triples = {}
    remaining = {}
    for idx in input_remaining:
        for line in input_remaining[idx]:
            if line.strip():
                try:
                    pp = PredPatt.from_sentence(line,
                                                opts=opts,
                                                cacheable=False)
                    extractions = get_predpatt_triples(pp, line)
                    if extractions:
                        triples.setdefault(idx, []).extend(extractions)
                except KeyError:
                    pass
        if idx not in triples:
            remaining[idx] = input_remaining[idx]
            triples[idx] = []
    return triples, remaining
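An illustrative call, assuming the input maps an index to a list of sentence lines; get_predpatt_triples is a project-local helper, and params is unused in the body shown here.

docs = {0: ["Chris loves Pat .", "Pat was hired by ACME ."]}
triples, remaining = extract_triples(docs, params=None)
for idx, extracted in triples.items():
    print(idx, extracted)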
Example #3
def setup_graph():
    # `listtree` and `rawtree` are assumed module-level fixtures holding the
    # same CoNLL-U parse as a list of token rows and as a raw string.
    ud = DependencyGraphBuilder.from_conll(listtree, 'tree1')

    pp = PredPatt(next(load_conllu(rawtree))[1],
                  opts=PredPattOpts(resolve_relcl=True,
                                    borrow_arg_for_relcl=True,
                                    resolve_conj=False,
                                    cut=True))

    graph = PredPattGraphBuilder.from_predpatt(pp, ud, 'tree1')

    return pp, graph
Example #4
def extract_predpatt(path='../../data/corpora/ud/UD_English-EWT-r1.2/'):
    '''
        Extract PredPatt objects from CoNLL-U files.
    '''

    # Resolve relative clauses.
    options = PredPattOpts(resolve_relcl=True,
                           borrow_arg_for_relcl=True,
                           resolve_conj=False,
                           cut=True)
    patt = {}

    for file in os.listdir(path):
        if file.endswith('.conllu'):
            with open(os.path.join(path, file), 'r') as infile:
                for sent_id, ud_parse in load_conllu(infile.read()):
                    patt[file + " " + sent_id] = PredPatt(ud_parse,
                                                          opts=options)

    return patt
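Example traversal of the returned dict; keys have the form "<filename> <sent_id>", and pprint is PredPatt's own pretty-printer.

patt = extract_predpatt()
for key, pp in sorted(patt.items())[:3]:
    print(key)
    print(pp.pprint())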
Example #5
    def __init__(
        self,
        path_to_udpipe: str,
        resolve_relcl: bool = True,
        resolve_appos: bool = True,
        resolve_amod: bool = True,
        resolve_conj: bool = True,
        resolve_poss: bool = True,
        ud=dep_v2.VERSION,
    ):
        super().__init__()
        self.model = Model.load(path_to_udpipe)
        self.pipeline = Pipeline(
            self.model, "tokenize", Pipeline.DEFAULT, Pipeline.DEFAULT, "conllu"
        )
        self._error = ProcessingError()
        self._opts = PredPattOpts(
            resolve_relcl=resolve_relcl,
            resolve_appos=resolve_appos,
            resolve_amod=resolve_amod,
            resolve_conj=resolve_conj,
            resolve_poss=resolve_poss,
            ud=ud,
        )
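A minimal sketch of the companion method such an extractor typically exposes; the method name and return shape are assumptions here, while Pipeline.process, ProcessingError, and load_conllu are the UDPipe/PredPatt APIs already used above.

    def __call__(self, sentence: str) -> PredPatt:
        # Tokenize and parse with UDPipe, then hand the CoNLL-U to PredPatt.
        conllu = self.pipeline.process(sentence, self._error)
        if self._error.occurred():
            raise ValueError(self._error.message)  # assumed error handling
        _, ud_parse = next(load_conllu(conllu))
        return PredPatt(ud_parse, opts=self._opts)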
Example #6
from collections import defaultdict, Counter
from itertools import product
from tqdm import tqdm
#import matplotlib.pyplot as plt
import re
import json
import argparse
from predpatt import load_conllu
from predpatt import PredPatt
from predpatt import PredPattOpts
from nltk import DependencyGraph
from recast_utils import *

options = PredPattOpts(resolve_relcl=True,
                       borrow_arg_for_relcl=True,
                       resolve_conj=False,
                       cut=True)

# Data locations:
ud_path = "../../data/raw_data/UD_English-EWT-r1.3/"
ud_train = "../../data/raw_data/UD_English-EWT-r1.3/en-ud-train.conllu"
ud_dev = "../../data/raw_data/UD_English-EWT-r1.3/en-ud-dev.conllu"
ud_test = "../../data/raw_data/UD_English-EWT-r1.3/en-ud-test.conllu"
ud_data = [ud_train, ud_dev, ud_test]

# Hypothesis generation functions
duration_dict = defaultdict(str)
duration_dict[0] = 'instantaneously'
duration_dict[1] = 'second'
duration_dict[2] = 'minute'
duration_dict[3] = 'hour'
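These paths and options feed the same load_conllu/PredPatt pattern used elsewhere in this file; a minimal sketch of the loading step, assuming the r1.3 files exist at the paths above.

patt = {}
for conllu_path in ud_data:
    with open(conllu_path, 'r') as infile:
        for sent_id, ud_parse in load_conllu(infile.read()):
            patt[conllu_path + " " + sent_id] = PredPatt(ud_parse, opts=options)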
Example #7
        print(' ', a, a.phrase())

        # Uncomment to list the rules which fired on this proposition, along
        # with an explanation.
        # for r in a.rules:
        #     print('    %s: %s' % (r, r.explain()))

print('______________________________________________________________________________')
print()

# To change certain behaviors, you can pass different options for the PredPatt
# instance. For example, to disable expansion of conjunctions and extraction of
# amods, use the following:
from predpatt import PredPattOpts
P = PredPatt.from_sentence(sentence,
                           opts=PredPattOpts(resolve_amod=0, resolve_conj=0))

print(P.pprint(color=1))

print('______________________________________________________________________________')
print()

#______________________________________________________________________________
# Bonus material

# Already have a constituency parse? No problem!
P = PredPatt.from_constituency(
    '( (S (NP (NNP Chris)) (VP (VBZ loves) (NP (NNP Pat))) (. .)) )')
print(P.pprint(track_rule=True, color=True))

print('______________________________________________________________________________')
Example #8
# pylint: disable=W0221
# pylint: disable=R0903
# pylint: disable=R1704
"""Module for converting PredPatt objects to networkx digraphs"""

from os.path import basename, splitext
from typing import Tuple, Hashable, TextIO, Optional, Union
from networkx import DiGraph
from predpatt import load_conllu, PredPatt, PredPattOpts
from ..corpus import Corpus
from ..syntax.dependency import CoNLLDependencyTreeCorpus

# Resolve relative clauses.
DEFAULT_PREDPATT_OPTIONS = PredPattOpts(resolve_relcl=True,
                                        borrow_arg_for_relcl=True,
                                        resolve_conj=False,
                                        cut=True)


class PredPattCorpus(Corpus):
    """Container for predpatt graphs"""

    def _graphbuilder(self,
                      graphid: Hashable,
                      predpatt_depgraph: Tuple[PredPatt, DiGraph]) -> DiGraph:
        """
        Parameters
        ----------
        graphid
            an identifier for the graph
        predpatt_depgraph
            a pairing of the predpatt for a dependency parse and the graph
Example #9
def main():

    patterns = ''
    sentence = 'The quick brown fox jumped over the lazy dog .'
    tags = ''
    parse = ''
    if request.GET.get('sentence', '').strip():
        sentence = request.GET.get('sentence', '').strip()

    pp_opts = PredPattOpts()
    for k, v in sorted(PredPattOpts().__dict__.items()):
        # All options are boolean for now.
        v = int(float(request.GET.get(k, v)))
        setattr(pp_opts, k, v)

    if sentence:

        # for sent in sent_detector.tokenize('"John saw Mary", said Jason. Larry met Sally for dinner.'):
        #     print(tokenize(sent))

        original_sentence = sentence
        parse = parser(sentence, tokenized=False)

        P = PredPatt(parse, opts=pp_opts)
        patterns = P.pprint(track_rule=True)
        tags = ' '.join('%s/%s' % x for x in zip(parse.tokens, parse.tags))
        parse = parse.pprint(K=3)

        # remove predpatt's bracketed comments
        patterns = re.sub(r'\s*\[.*?\]', '', patterns)
        patterns = dedent(patterns)

    opts = []
    for k, v in sorted(pp_opts.__dict__.items()):
        # Create a hidden input with the false value because the values of
        # unchecked boxes don't get posted with the form.
        opts.append('<input type="hidden" value="0" name="%s">' % (k,))
        opts.append('<input type="checkbox" name="%s" value="1" %s> %s<br/>' %
                    (k, 'checked' if v else '', k))

    options = '\n'.join(opts)

    return template("""
<html>
<head>


<!-- JQuery -->
<script src="//code.jquery.com/jquery-2.1.4.min.js"></script>
<!-- Bootstrap -->
<link rel="stylesheet" href="//maxcdn.bootstrapcdn.com/bootstrap/3.3.1/css/bootstrap.min.css"/>
<link rel="stylesheet" href="//maxcdn.bootstrapcdn.com/bootstrap/3.3.1/css/bootstrap-theme.min.css"/>
<script src="//maxcdn.bootstrapcdn.com/bootstrap/3.3.1/js/bootstrap.min.js"></script>
<!-- Chosen Dropdown Library -->
<link rel="stylesheet" href="//cdnjs.cloudflare.com/ajax/libs/chosen/1.4.2/chosen.css"/>
<script src="//cdnjs.cloudflare.com/ajax/libs/chosen/1.4.2/chosen.jquery.min.js"></script>

<style>
html {
     overflow: -moz-scrollbars-vertical;
     overflow: scroll;
}
</style>
</head>
<body>
<div style="width: 800px; padding: 10px; margin-left: auto; margin-right: auto;">
<h1>PredPatt</h1>
<strong>Sentence</strong>
<pre>{{sentence}}</pre>

<strong>Propositions</strong>
<div id="propositions">
<pre>
{{patterns}}
</pre>

<div>
<button class="btn" data-toggle="collapse" data-target="#parse" style="margin-bottom: 10px;">Toggle Parse</button>
<div id="parse" class="collapse">
<strong>Tags</strong>
<pre>
{{tags}}
</pre>
<strong>Parse</strong>
<pre>
{{parse}}
</pre>
</div>
</div>
<strong>Input</strong>
<form action="/" method="GET">
<textarea type="text" name="sentence" style="height:50px; width: 100%;"
placeholder="e.g., The quick brown fox jumped over the lazy dog."
class="form-control"
autofocus>{{original_sentence}}</textarea>
<div style="padding: 10px;"><strong>Options</strong><br/>""" + options + """
</div>
<br/>
<input type="submit" name="save" value="submit">
</form>
</div>
</body>
</html>
    """,
                    sentence=sentence,
                    original_sentence=original_sentence,
                    patterns=patterns,
                    tags=tags,
                    parse=parse,
                    options=options)
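The handler above leans on bottle's request and template; a minimal sketch of the wiring the original script presumably provides (the route path and port are assumptions).

from bottle import request, route, run, template

route('/')(main)
run(host='localhost', port=8080)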
Example #10
def hand_engineering(prot, batch_size, data, data_dev):
    '''
        Hand-engineered feature extraction. Supports the following: UD
        features, VerbNet class IDs, WordNet supersenses, concreteness
        ratings, and LCS eventivity scores.
    '''
    home = expanduser("~")
    framnet_posdict = {
        'V': 'VERB',
        'N': 'NOUN',
        'A': 'ADJ',
        'ADV': 'ADV',
        'PREP': 'ADP',
        'NUM': 'NUM',
        'INTJ': 'INTJ',
        'ART': 'DET',
        'C': 'CCONJ',
        'SCON': 'SCONJ',
        'PRON': 'PRON',
        'IDIO': 'X',
        'AVP': 'ADV'
    }
    # Load the features
    features = {}
    with open(home + '/Desktop/protocols/data/features-2.tsv', 'r') as f:
        for line in f.readlines():
            feats = line.split('\t')
            features[feats[0]] = (feats[1].split(), feats[2].split())

    # Load the predpatt objects for creating features
    files = [
        '/Downloads/UD_English-r1.2/en-ud-train.conllu',
        '/Downloads/UD_English-r1.2/en-ud-dev.conllu',
        '/Downloads/UD_English-r1.2/en-ud-test.conllu'
    ]
    # Resolve relative clauses.
    options = PredPattOpts(resolve_relcl=True,
                           borrow_arg_for_relcl=True,
                           resolve_conj=False,
                           cut=True)
    patt = {}

    for file in files:
        path = home + file
        with open(path, 'r') as infile:
            for sent_id, ud_parse in load_conllu(infile.read()):
                # Key on '<split> <sent_id>'; the slice bounds assume the
                # path layout in `files` above.
                patt[file[33:][:-7] + " " + sent_id] = PredPatt(ud_parse,
                                                                opts=options)

    data['Structure'] = data['Split.Sentence.ID'].map(lambda x:
                                                      (patt[x], features[x]))
    data_dev['Structure'] = data_dev['Split.Sentence.ID'].map(
        lambda x: (patt[x], features[x]))

    raw_x = data['Structure'].tolist()
    raw_dev_x = data_dev['Structure'].tolist()

    all_x = raw_x + raw_dev_x
    all_feats = '|'.join(['|'.join(all_x[i][1][0]) for i in range(len(all_x))])
    feature_cols = Counter(all_feats.split('|'))

    # All UD dataset features
    all_ud_feature_cols = list(
        feature_cols.keys()) + [(a + "_dep") for a in feature_cols.keys()]

    # Concreteness
    with open(home + '/Desktop/protocols/data/concrete.pkl', 'rb') as f:
        concreteness = pickle.load(f)
    if prot == 'arg':
        conc_cols = ['concreteness']
    else:
        conc_cols = ['concreteness', 'max_conc', 'min_conc']

    # LCS eventivity
    from lcsreader import LexicalConceptualStructureLexicon
    lcs = LexicalConceptualStructureLexicon(
        home + '/Desktop/protocols/data/verbs-English.lcs')
    lcs_feats = ['lcs_eventive', 'lcs_stative']

    # WordNet supersenses (lexicographer names)
    supersenses = list(
        set(['supersense=' + x.lexname() for x in wordnet.all_synsets()]))

    # Framenet
    lem2frame = {}
    for lm in framenet.lus():
        for lemma in lm['lexemes']:
            lem2frame[lemma['name'] + '.' +
                      framnet_posdict[lemma['POS']]] = lm['frame']['name']
    frame_names = ['frame=' + x.name for x in framenet.frames()]

    # Verbnet classids
    verbnet_classids = ['classid=' + vcid for vcid in verbnet.classids()]

    # Lexical features
    lexical_feats = [
        'can', 'could', 'should', 'would', 'will', 'may', 'might', 'must',
        'ought', 'dare', 'need'
    ] + [
        'the', 'an', 'a', 'few', 'another', 'some', 'many', 'each', 'every',
        'this', 'that', 'any', 'most', 'all', 'both', 'these'
    ]

    dict_feats = {}
    for f in (verbnet_classids + lexical_feats + supersenses + frame_names +
              lcs_feats + all_ud_feature_cols + conc_cols):
        dict_feats[f] = 0

    x_pd = pd.DataFrame([
        features_func(sent_feat=sent,
                      token=token,
                      lemma=lemma,
                      dict_feats=dict_feats.copy(),
                      prot=prot,
                      concreteness=concreteness,
                      lcs=lcs,
                      l2f=lem2frame) for sent, token, lemma in
        zip(raw_x, data['Root.Token'].tolist(), data['Lemma'].tolist())
    ])

    dev_x_pd = pd.DataFrame([
        features_func(sent_feat=sent,
                      token=token,
                      lemma=lemma,
                      dict_feats=dict_feats.copy(),
                      prot=prot,
                      concreteness=concreteness,
                      lcs=lcs,
                      l2f=lem2frame)
        for sent, token, lemma in zip(raw_dev_x, data_dev['Root.Token'].tolist(
        ), data_dev['Lemma'].tolist())
    ])

    # Figure out which columns to drop (they're always zero).
    todrop1 = dev_x_pd.columns[(dev_x_pd == 0).all()].values.tolist()
    todrop = x_pd.columns[(x_pd == 0).all()].values.tolist()
    intdrop = [a for a in todrop if a not in todrop1]
    cols_to_drop = list(set(todrop) - set(intdrop))

    x = x_pd.drop(cols_to_drop, axis=1).values.tolist()
    dev_x = dev_x_pd.drop(cols_to_drop, axis=1).values.tolist()

    x = [[a[:] for a in x[i:i + batch_size]]
         for i in range(0, len(data), batch_size)]
    dev_x = [[a[:] for a in dev_x[i:i + batch_size]]
             for i in range(0, len(data_dev), batch_size)]
    return x, dev_x
Example #11
"""
Documentation test runner.
"""

from __future__ import print_function

import re, codecs
from predpatt import PredPatt, PredPattOpts, Parser
from termcolor import colored

ppattopts = PredPattOpts(simple=False,
                         cut=False,
                         resolve_relcl=True,
                         resolve_appos=True,
                         resolve_amod=True,
                         resolve_conj=True,
                         resolve_poss=True,
                         borrow_arg_for_relcl=True,
                         big_args=False,
                         ud="1.0")


def test():
    from argparse import ArgumentParser
    p = ArgumentParser()
    p.add_argument('--filename', default='doc/DOCTEST.md')
    args = p.parse_args()

    sentences = re.findall(
        r'^> (.*)\n([\w\W]*?)(?=^>|<END>)',
        codecs.open(args.filename, encoding='utf-8').read() + '\n<END>',