def parse(text):
    """Run syntaxnet and PredPatt over the input sentences.

    Returns a JSON-compatible dict holding the PredPatt dependency
    fragments, the CoNLL-formatted parse, and the original text.
    """
    parse_tree, trace = annotate_text(text)
    conll_parsed = parse_to_conll(parse_tree)

    # load_conllu yields (sent_id, parse) pairs; keep the first parse.
    parsed_sentences = list(load_conllu(conll_parsed))
    conll_pp = parsed_sentences[0][1]

    # PredPatt options. Modify as needed.
    opts = PredPattOpts(
        resolve_relcl=True,   # relative clauses
        resolve_appos=True,   # appositional modifiers
        resolve_amod=True,    # adjectival modifiers
        resolve_conj=True,    # conjunctions
        resolve_poss=True,    # possessives
        ud=dep_v2.VERSION,    # the version of UD
    )
    ppatt = PredPatt(conll_pp, opts=opts)
    predicate_deps, arg_deps = get_ud_fragments(ppatt)

    # NOTE: the pretty-print formatted strings from PredPatt are returned
    # largely as a placeholder for JSON compatibility within the REST API.
    return {
        'predpatt': {
            'predicate_deps': predicate_deps,
            'arg_deps': arg_deps,
        },
        'conll': conll_parsed,
        'original': text,
    }
def extract_triples(input_remaining, params):
    """Extract PredPatt triples for each indexed group of sentences.

    ``params`` is accepted for interface compatibility but is not used
    here. Returns ``(triples, remaining)``: ``triples`` maps each index to
    its extractions (an empty list when none were found), and ``remaining``
    keeps the original sentences for every index that yielded nothing.
    """
    pp_options = PredPattOpts(
        resolve_relcl=True,   # relative clauses
        resolve_appos=True,   # appositional modifiers
        resolve_amod=True,    # adjectival modifiers
        resolve_conj=True,    # conjunctions
        resolve_poss=True,    # possessives
        ud=dep_v1.VERSION,    # the version of UD
    )

    triples = {}
    remaining = {}
    for idx, sentences in input_remaining.items():
        for sentence in sentences:
            if not sentence.strip():
                continue
            try:
                pp = PredPatt.from_sentence(sentence,
                                            opts=pp_options,
                                            cacheable=False)
                extractions = get_predpatt_triples(pp, sentence)
                if extractions:
                    triples.setdefault(idx, []).extend(extractions)
            except KeyError:
                # PredPatt occasionally raises KeyError on odd parses;
                # treat the sentence as unextractable and move on.
                pass
        if idx not in triples:
            # Nothing extracted for this index: keep it for a later pass.
            remaining[idx] = sentences
            triples[idx] = []
    return triples, remaining
def setup_graph():
    """Build a PredPatt graph for ``tree1`` from the module-level parses.

    Returns the ``(PredPatt, graph)`` pair built from ``listtree`` (CoNLL
    dependency tree) and ``rawtree`` (raw CoNLL-U text).
    """
    ud = DependencyGraphBuilder.from_conll(listtree, 'tree1')
    predpatt_opts = PredPattOpts(resolve_relcl=True,
                                 borrow_arg_for_relcl=True,
                                 resolve_conj=False,
                                 cut=True)
    # load_conllu yields (sent_id, parse) pairs; use the first parse.
    _, first_parse = next(load_conllu(rawtree))
    pp = PredPatt(first_parse, opts=predpatt_opts)
    graph = PredPattGraphBuilder.from_predpatt(pp, ud, 'tree1')
    return pp, graph
def extract_predpatt(path='../../data/corpora/ud/UD_English-EWT-r1.2/'):
    '''Extract PredPatt objects from CoNLL-U files.

    Parameters
    ----------
    path : str
        Directory containing ``.conllu`` files (trailing separator optional).

    Returns
    -------
    dict
        Maps ``"<filename> <sent_id>"`` to the corresponding PredPatt object.
    '''
    options = PredPattOpts(resolve_relcl=True,
                           borrow_arg_for_relcl=True,
                           resolve_conj=False,
                           cut=True)  # Resolve relative clause
    patt = {}
    for fname in os.listdir(path):
        if fname.endswith('.conllu'):
            # os.path.join is robust to a missing trailing separator in
            # `path`, unlike the previous `path + file` concatenation.
            with open(os.path.join(path, fname), 'r') as infile:
                for sent_id, ud_parse in load_conllu(infile.read()):
                    patt[fname + " " + sent_id] = PredPatt(ud_parse,
                                                           opts=options)
    return patt
def __init__(
    self,
    path_to_udpipe: str,
    resolve_relcl: bool = True,
    resolve_appos: bool = True,
    resolve_amod: bool = True,
    resolve_conj: bool = True,
    resolve_poss: bool = True,
    ud=dep_v2.VERSION,
):
    """Load a UDPipe model and configure PredPatt extraction options.

    Parameters
    ----------
    path_to_udpipe
        Filesystem path to the UDPipe model file to load.
    resolve_relcl
        Resolve relative clauses.
    resolve_appos
        Resolve appositional modifiers.
    resolve_amod
        Resolve adjectival modifiers.
    resolve_conj
        Resolve conjunctions.
    resolve_poss
        Resolve possessives.
    ud
        Universal Dependencies version passed through to PredPattOpts.
    """
    super().__init__()
    self.model = Model.load(path_to_udpipe)
    # Pipeline tokenizes raw text and emits CoNLL-U, using the model's
    # default tagger/parser settings.
    self.pipeline = Pipeline(
        self.model, "tokenize", Pipeline.DEFAULT, Pipeline.DEFAULT, "conllu"
    )
    self._error = ProcessingError()
    self._opts = PredPattOpts(
        resolve_relcl=resolve_relcl,
        resolve_appos=resolve_appos,
        resolve_amod=resolve_amod,
        resolve_conj=resolve_conj,
        resolve_poss=resolve_poss,
        ud=ud,
    )
from collections import defaultdict, Counter
from itertools import product
from tqdm import tqdm
#import matplotlib.pyplot as plt
import re
import json
import argparse
from predpatt import load_conllu
from predpatt import PredPatt
from predpatt import PredPattOpts
from nltk import DependencyGraph
import re
from recast_utils import *

# Shared PredPatt options: resolve relative clauses (borrowing arguments
# for them), leave conjunctions unexpanded, and cut.
options = PredPattOpts(resolve_relcl=True,
                       borrow_arg_for_relcl=True,
                       resolve_conj=False,
                       cut=True)

# Data Locations:
ud_path = "../../data/raw_data/UD_English-EWT-r1.3/"
ud_train = "../../data/raw_data/UD_English-EWT-r1.3/en-ud-train.conllu"
ud_dev = "../../data/raw_data/UD_English-EWT-r1.3/en-ud-dev.conllu"
ud_test = "../../data/raw_data/UD_English-EWT-r1.3/en-ud-test.conllu"
ud_data = [ud_train, ud_dev, ud_test]

# #### Hypothesis Generation Functions

# Duration buckets; defaultdict(str) returns '' for unmapped keys.
duration_dict = defaultdict(str)
duration_dict[0] = 'instantaneously'
duration_dict[1] = 'second'
duration_dict[2] = 'minute'
duration_dict[3] = 'hour'
# NOTE(review): fragment of a Python 2 demo script; `a` (a proposition
# argument) and `sentence` are defined earlier, outside this excerpt.
print ' ', a, a.phrase()

# Uncomment to list rules which fired on this proposition. Along with
# an explanation.
#for r in a.rules:
#    print '  %s: %s' % (r, r.explain())

print '______________________________________________________________________________'
print

# To change certain behaviors, you can pass different options for the PredPatt
# instance. For example, to disable expansion of conjunctions and extraction of
# amods, use the following:
from predpatt import PredPattOpts
P = PredPatt.from_sentence(sentence, opts=PredPattOpts(resolve_amod=0, resolve_conj=0))
print P.pprint(color=1)

print '______________________________________________________________________________'
print

#______________________________________________________________________________
# Bonus material

# Already have a constituency parse? No problem!
P = PredPatt.from_constituency(
    '( (S (NP (NNP Chris)) (VP (VBZ loves) (NP (NNP Pat))) (. .)) )')
print P.pprint(track_rule=True, color=True)
print '______________________________________________________________________________'
# pylint: disable=W0221 # pylint: disable=R0903 # pylint: disable=R1704 """Module for converting PredPatt objects to networkx digraphs""" from os.path import basename, splitext from typing import Tuple, Hashable, TextIO, Optional, Union from networkx import DiGraph from predpatt import load_conllu, PredPatt, PredPattOpts from ..corpus import Corpus from ..syntax.dependency import CoNLLDependencyTreeCorpus DEFAULT_PREDPATT_OPTIONS = PredPattOpts(resolve_relcl=True, borrow_arg_for_relcl=True, resolve_conj=False, cut=True) # Resolve relative clause class PredPattCorpus(Corpus): """Container for predpatt graphs""" def _graphbuilder(self, graphid: Hashable, predpatt_depgraph: Tuple[PredPatt, DiGraph]) -> DiGraph: """ Parameters ---------- treeid an identifier for the tree predpatt_depgraph a pairing of the predpatt for a dependency parse and the graph
def main():
    """Bottle request handler: parse a sentence from the query string with
    PredPatt and render the propositions, tags, and parse as HTML.

    NOTE(review): Python 2 code (`iteritems`, bottle `request`/`template`
    presumably imported at module level, outside this excerpt).
    """
    patterns = ''
    sentence = 'The quick brown fox jumped over the lazy dog .'
    tags = ''
    parse = ''
    if request.GET.get('sentence', '').strip():
        sentence = request.GET.get('sentence', '').strip()
    # Copy PredPatt options from the query string onto a fresh opts object.
    pp_opts = PredPattOpts()
    for k, v in sorted(PredPattOpts().__dict__.iteritems()):
        v = int(float(request.GET.get(
            k, v)))  # all options are true/false for now.
        setattr(pp_opts, k, v)
    if sentence:
        #for sent in sent_detector.tokenize('"John saw Mary", said Jason. Larry met Sally for dinner.'):
        #    print tokenize(sent)
        original_sentence = sentence
        parse = parser(sentence, tokenized=False)
        P = PredPatt(parse, opts=pp_opts)
        patterns = P.pprint(track_rule=True)
        tags = ' '.join('%s/%s' % x for x in zip(parse.tokens, parse.tags))
        parse = parse.pprint(K=3)
        # remove predpatt's bracketed comments
        patterns = re.sub(r'\s*\[.*?\]', '', patterns)
        patterns = dedent(patterns)
    opts = []
    for k, v in sorted(pp_opts.__dict__.iteritems()):
        # Create a hidden textbox with the false value because the values of
        # "unchecked" boxes don't get posted with form.
        opts.append('<input type="hidden" value="0" name="%s">' % (k, ))
        opts.append('<input type="checkbox" name="%s" value="1" %s> %s<br/>' %
                    (k, 'checked' if v else '', k))
    options = '\n'.join(opts)
    return template("""
<html>
<head>
<!-- JQuery -->
<script src="//code.jquery.com/jquery-2.1.4.min.js"></script>
<!-- Bootstrap -->
<link rel="stylesheet" href="//maxcdn.bootstrapcdn.com/bootstrap/3.3.1/css/bootstrap.min.css"/>
<link rel="stylesheet" href="//maxcdn.bootstrapcdn.com/bootstrap/3.3.1/css/bootstrap-theme.min.css"/>
<script src="//maxcdn.bootstrapcdn.com/bootstrap/3.3.1/js/bootstrap.min.js"></script>
<!-- Chosen Dropdown Library -->
<link rel="stylesheet" href="//cdnjs.cloudflare.com/ajax/libs/chosen/1.4.2/chosen.css"/>
<script src="//cdnjs.cloudflare.com/ajax/libs/chosen/1.4.2/chosen.jquery.min.js"></script>
<style>
html { overflow: -moz-scrollbars-vertical; overflow: scroll; }
</style>
</head>
<body>
<div style="width: 800px; padding: 10px; margin-left: auto; margin-right: auto;">
<h1>PredPatt</h1>
<strong>Sentence</strong>
<pre>{{sentence}}</pre>
<strong>Propositions</strong>
<div id="propositions">
<pre>
{{patterns}}
</pre>
<div>
<button class="btn" data-toggle="collapse" data-target="#parse" style="margin-bottom: 10px;">Toggle Parse</button>
<div id="parse" class="collapse">
<strong>Tags</strong>
<pre>
{{tags}}
</pre>
<strong>Parse</strong>
<pre>
{{parse}}
</pre>
</div>
</div>
<strong>Input</strong>
<form action="/" method="GET">
<textarea type="text" name="sentence" style="height:50px; width: 100%;" placeholder="e.g., The quick brown fox jumped over the lazy dog." class="form-control" autofocus>{{original_sentence}}</textarea>
<div style="padding: 10px;"><strong>Options</strong><br/>""" + options + """
</div>
<br/>
<input type="submit" name="save" value="submit">
</form>
</div>
</body>
</html>
""", sentence=sentence, original_sentence=original_sentence,
                    patterns=patterns, tags=tags, parse=parse,
                    options=options)
def hand_engineering(prot, batch_size, data, data_dev):
    '''
    Hand engineered feature extraction. Supports the following -
    UD, Verbnet classids, Wordnet supersenses, concreteness ratings,
    LCS eventivity scores

    Parameters
    ----------
    prot : str
        Protocol name; 'arg' selects argument-level concreteness columns only.
    batch_size : int
        Number of examples per minibatch in the returned lists.
    data, data_dev : pandas.DataFrame
        Train and dev frames; must contain 'Split.Sentence.ID', 'Root.Token'
        and 'Lemma' columns. Both gain a 'Structure' column as a side effect.

    Returns
    -------
    (x, dev_x)
        Lists of feature-vector minibatches for train and dev.
    '''
    home = expanduser("~")
    # FrameNet POS tags -> UD POS tags.
    framnet_posdict = {
        'V': 'VERB',
        'N': 'NOUN',
        'A': 'ADJ',
        'ADV': 'ADV',
        'PREP': 'ADP',
        'NUM': 'NUM',
        'INTJ': 'INTJ',
        'ART': 'DET',
        'C': 'CCONJ',
        'SCON': 'SCONJ',
        'PRON': 'PRON',
        'IDIO': 'X',
        'AVP': 'ADV'
    }

    # Load the features, keyed by sentence id.
    features = {}
    with open(home + '/Desktop/protocols/data/features-2.tsv', 'r') as f:
        for line in f.readlines():
            feats = line.split('\t')
            features[feats[0]] = (feats[1].split(), feats[2].split())

    # Load the predpatt objects for creating features.
    files = [
        '/Downloads/UD_English-r1.2/en-ud-train.conllu',
        '/Downloads/UD_English-r1.2/en-ud-dev.conllu',
        '/Downloads/UD_English-r1.2/en-ud-test.conllu'
    ]
    options = PredPattOpts(resolve_relcl=True,
                           borrow_arg_for_relcl=True,
                           resolve_conj=False,
                           cut=True)  # Resolve relative clause
    patt = {}
    for file in files:
        path = home + file
        with open(path, 'r') as infile:
            for sent_id, ud_parse in load_conllu(infile.read()):
                # file[33:][:-7] turns '.../en-ud-train.conllu' into 'train',
                # so keys look like 'train <sent_id>'.
                patt[file[33:][:-7] + " " + sent_id] = PredPatt(ud_parse,
                                                                opts=options)

    data['Structure'] = data['Split.Sentence.ID'].map(
        lambda x: (patt[x], features[x]))
    data_dev['Structure'] = data_dev['Split.Sentence.ID'].map(
        lambda x: (patt[x], features[x]))

    raw_x = data['Structure'].tolist()
    raw_dev_x = data_dev['Structure'].tolist()
    all_x = raw_x + raw_dev_x
    all_feats = '|'.join(['|'.join(all_x[i][1][0]) for i in range(len(all_x))])
    feature_cols = Counter(all_feats.split('|'))

    # All UD dataset features, plus a *_dep copy for the dependent token.
    all_ud_feature_cols = list(
        feature_cols.keys()) + [(a + "_dep") for a in feature_cols.keys()]

    # Concreteness ratings. (Was a bare open/close pair; the context manager
    # guarantees the handle is released even if pickle.load raises.)
    with open(home + '/Desktop/protocols/data/concrete.pkl', 'rb') as f:
        concreteness = pickle.load(f)
    if prot == 'arg':
        conc_cols = ['concreteness']
    else:
        conc_cols = ['concreteness', 'max_conc', 'min_conc']

    # LCS eventivity.
    from lcsreader import LexicalConceptualStructureLexicon
    lcs = LexicalConceptualStructureLexicon(
        home + '/Desktop/protocols/data/verbs-English.lcs')
    lcs_feats = ['lcs_eventive', 'lcs_stative']

    # Wordnet supersenses (lexicographer names).
    supersenses = list(
        set(['supersense=' + x.lexname() for x in wordnet.all_synsets()]))

    # Framenet: map lemma.POS -> frame name.
    lem2frame = {}
    for lm in framenet.lus():
        for lemma in lm['lexemes']:
            lem2frame[lemma['name'] + '.' +
                      framnet_posdict[lemma['POS']]] = lm['frame']['name']
    frame_names = ['frame=' + x.name for x in framenet.frames()]

    # Verbnet classids.
    verbnet_classids = ['classid=' + vcid for vcid in verbnet.classids()]

    # Lexical features: modals followed by determiners.
    lexical_feats = [
        'can', 'could', 'should', 'would', 'will', 'may', 'might', 'must',
        'ought', 'dare', 'need'
    ] + [
        'the', 'an', 'a', 'few', 'another', 'some', 'many', 'each', 'every',
        'this', 'that', 'any', 'most', 'all', 'both', 'these'
    ]

    # Template feature dict with every known feature initialised to 0; each
    # example gets its own copy below.
    dict_feats = {}
    for feat in (verbnet_classids + lexical_feats + supersenses + frame_names
                 + lcs_feats + all_ud_feature_cols + conc_cols):
        dict_feats[feat] = 0

    x_pd = pd.DataFrame([
        features_func(sent_feat=sent,
                      token=token,
                      lemma=lemma,
                      dict_feats=dict_feats.copy(),
                      prot=prot,
                      concreteness=concreteness,
                      lcs=lcs,
                      l2f=lem2frame)
        for sent, token, lemma in zip(raw_x, data['Root.Token'].tolist(),
                                      data['Lemma'].tolist())
    ])

    dev_x_pd = pd.DataFrame([
        features_func(sent_feat=sent,
                      token=token,
                      lemma=lemma,
                      dict_feats=dict_feats.copy(),
                      prot=prot,
                      concreteness=concreteness,
                      lcs=lcs,
                      l2f=lem2frame)
        for sent, token, lemma in zip(raw_dev_x,
                                      data_dev['Root.Token'].tolist(),
                                      data_dev['Lemma'].tolist())
    ])

    # Figure out which columns to drop (they're always zero in BOTH splits).
    todrop1 = dev_x_pd.columns[(dev_x_pd == 0).all()].values.tolist()
    todrop = x_pd.columns[(x_pd == 0).all()].values.tolist()
    intdrop = [a for a in todrop if a not in todrop1]
    # Was `cols_to_drop = cols_to_drop = list(...)` -- duplicated assignment.
    cols_to_drop = list(set(todrop) - set(intdrop))

    x = x_pd.drop(cols_to_drop, axis=1).values.tolist()
    dev_x = dev_x_pd.drop(cols_to_drop, axis=1).values.tolist()

    # Chop into minibatches of `batch_size`, copying each row.
    x = [[a[:] for a in x[i:i + batch_size]]
         for i in range(0, len(data), batch_size)]
    dev_x = [[a[:] for a in dev_x[i:i + batch_size]]
             for i in range(0, len(data_dev), batch_size)]
    return x, dev_x
""" Documentation test runner. """ from __future__ import print_function import re, codecs from predpatt import PredPatt, PredPattOpts, Parser from termcolor import colored ppattopts = PredPattOpts(simple=False, cut=False, resolve_relcl=True, resolve_appos=True, resolve_amod=True, resolve_conj=True, resolve_poss=True, borrow_arg_for_relcl=True, big_args=False, ud="1.0") def test(): from argparse import ArgumentParser p = ArgumentParser() p.add_argument('--filename', default='doc/DOCTEST.md') args = p.parse_args() sentences = re.findall( '^> (.*)\n([\w\W]*?)(?=^>|<END>)', codecs.open(args.filename, encoding='utf-8').read() + '\n<END>',