Ejemplo n.º 1
0
def parse(text):
    """
    Run the syntactic parser and PredPatt over ``text``.

    The text is annotated with ``annotate_text``, the resulting parse tree is
    converted with ``parse_to_conll``, and the first sentence yielded by
    ``load_conllu`` is handed to PredPatt.

    Returns a dict with three keys: ``'predpatt'`` (PredPatt's pretty-printed
    output), ``'conll'`` (the value returned by ``parse_to_conll``) and
    ``'original'`` (the input ``text``).
    """
    # annotate_text returns a pair; only the parse tree is needed here.
    parse_tree, _ = annotate_text(text)
    conll_parsed = parse_to_conll(parse_tree)

    # Load every sentence, then keep only the first one.
    ud_parses = [ud_parse for _sent_id, ud_parse in load_conllu(conll_parsed)]
    conll_pp = ud_parses[0]

    # PredPatt options. Modify as needed.
    opts = PredPattOpts(
        resolve_relcl=True,   # relative clauses
        resolve_appos=True,   # appositional modifiers
        resolve_amod=True,    # adjectival modifiers
        resolve_conj=True,    # conjunction
        resolve_poss=True,    # possessives
        ud=dep_v2.VERSION)    # the version of UD
    ppatt = PredPatt(conll_pp, opts=opts)

    # NOTE:
    # The PredPatt result is returned as its pretty-printed string. This is
    # largely a placeholder for JSON compatibility within the REST API.
    return {
        'predpatt': ppatt.pprint(),
        'conll': conll_parsed,
        'original': text,
    }
Ejemplo n.º 2
0
def main():
    """
    Render the PredPatt demo page for the current request.

    Query parameters read from ``request.GET``:

    * ``sentence`` -- the sentence to analyse; when missing or blank a
      built-in demo sentence is used instead.
    * one parameter per attribute of ``PredPattOpts`` -- parsed as a number
      and stored on the options object as an int (all options are
      true/false for now).  Values that cannot be parsed are ignored and
      the option keeps its default.

    Returns whatever ``template`` produces for the HTML page, which shows the
    sentence, the extracted propositions, the tags and parse, and a form for
    re-submitting with different options.
    """
    patterns = ''
    tags = ''
    parse = ''

    # Look the parameter up once; fall back to the demo sentence when blank.
    sentence = (request.GET.get('sentence', '').strip()
                or 'The quick brown fox jumped over the lazy dog .')
    # Bound unconditionally so the template call below can never hit an
    # unbound local.
    original_sentence = sentence

    pp_opts = PredPattOpts()
    # sorted() snapshots the (name, default) pairs, so updating pp_opts inside
    # the loop is safe.  .items() works on both Python 2 and Python 3.
    for k, default in sorted(pp_opts.__dict__.items()):
        try:
            # all options are true/false for now.
            v = int(float(request.GET.get(k, default)))
        except (TypeError, ValueError, OverflowError):
            # Malformed query value: keep the default instead of failing the
            # whole request.
            continue
        setattr(pp_opts, k, v)

    if sentence:
        ud_parse = parser(sentence, tokenized=False)

        P = PredPatt(ud_parse, opts=pp_opts)
        patterns = P.pprint(track_rule=True)
        tags = ' '.join('%s/%s' % x
                        for x in zip(ud_parse.tokens, ud_parse.tags))
        parse = ud_parse.pprint(K=3)

        # remove predpatt's bracketed comments
        patterns = re.sub(r'\s*\[.*?\]', '', patterns)
        patterns = dedent(patterns)

    opts = []
    for k, v in sorted(pp_opts.__dict__.items()):
        # Create a hidden textbox with the false value because the values of
        # "unchecked" boxes don't get posted with form.
        opts.append('<input type="hidden" value="0" name="%s">' % (k, ))
        opts.append('<input type="checkbox" name="%s" value="1" %s> %s<br/>' %
                    (k, 'checked' if v else '', k))

    options = '\n'.join(opts)

    # The option checkboxes are spliced into the template source directly
    # rather than passed as a template variable.
    return template("""
<html>
<head>


<!-- JQuery -->
<script src="//code.jquery.com/jquery-2.1.4.min.js"></script>
<!-- Bootstrap -->
<link rel="stylesheet" href="//maxcdn.bootstrapcdn.com/bootstrap/3.3.1/css/bootstrap.min.css"/>
<link rel="stylesheet" href="//maxcdn.bootstrapcdn.com/bootstrap/3.3.1/css/bootstrap-theme.min.css"/>
<script src="//maxcdn.bootstrapcdn.com/bootstrap/3.3.1/js/bootstrap.min.js"></script>
<!-- Chosen Dropdown Library -->
<link rel="stylesheet" href="//cdnjs.cloudflare.com/ajax/libs/chosen/1.4.2/chosen.css"/>
<script src="//cdnjs.cloudflare.com/ajax/libs/chosen/1.4.2/chosen.jquery.min.js"></script>

<style>
html {
     overflow: -moz-scrollbars-vertical;
     overflow: scroll;
}
</style>
</head>
<body>
<div style="width: 800px; padding: 10px; margin-left: auto; margin-right: auto;">
<h1>PredPatt</h1>
<strong>Sentence</strong>
<pre>{{sentence}}</pre>

<strong>Propositions</strong>
<div id="propositions">
<pre>
{{patterns}}
</pre>

<div>
<button class="btn" data-toggle="collapse" data-target="#parse" style="margin-bottom: 10px;">Toggle Parse</button>
<div id="parse" class="collapse">
<strong>Tags</strong>
<pre>
{{tags}}
</pre>
<strong>Parse</strong>
<pre>
{{parse}}
</pre>
</div>
</div>
<strong>Input</strong>
<form action="/" method="GET">
<textarea type="text" name="sentence" style="height:50px; width: 100%;"
placeholder="e.g., The quick brown fox jumped over the lazy dog."
class="form-control"
autofocus>{{original_sentence}}</textarea>
<div style="padding: 10px;"><strong>Options</strong><br/>""" + options + """
</div>
<br/>
<input type="submit" name="save" value="submit">
</form>
</div>
</body>
</html>
    """,
                    sentence=sentence,
                    original_sentence=original_sentence,
                    patterns=patterns,
                    tags=tags,
                    parse=parse,
                    options=options)
Ejemplo n.º 3
0
def test():
    """
    Run the doctest-style regression suite stored in a markdown file.

    The file (``--filename``, default ``doc/DOCTEST.md``) is split into
    entries: a line starting with ``> `` holds the sentence, and everything up
    to the next ``>`` line is that sentence's chunk.  Within a chunk:

    * lines indented by four spaces (tabs count as four spaces) form the
      expected PredPatt output;
    * optional ``<!--parse=...-->`` and ``<!--tags=...-->`` comments supply a
      cached parse, in which case the parser is not invoked for that sentence.

    For each sentence the PredPatt output (with bracketed annotations removed)
    is compared against the expected text.  Failures are printed in detail and
    a pass/fail summary is printed at the end.
    """
    from argparse import ArgumentParser
    p = ArgumentParser()
    p.add_argument('--filename', default='doc/DOCTEST.md')
    args = p.parse_args()

    # Close the file deterministically instead of leaking the handle.
    with codecs.open(args.filename, encoding='utf-8') as f:
        contents = f.read()

    # The '<END>' sentinel lets the lookahead terminate the final chunk.
    sentences = re.findall(
        r'^> (.*)\n([\w\W]*?)(?=^>|<END>)',
        contents + '\n<END>',
        re.MULTILINE)

    # TODO: Use PredPatt.from_string instead of duplicating code here.
    parser = Parser.get_instance()

    passed = 0
    failed = 0
    blank = 0
    for s, chunk in sentences:
        s = s.strip()
        if not s:
            continue

        # use cached parse listed in doctest chunk.
        parse_chunk = re.findall(r'<!--parse=([\w\W]+?)-->', chunk)
        if parse_chunk:
            from predpatt.UDParse import DepTriple, UDParse
            [parse_chunk] = parse_chunk
            # Each edge is written as rel(tok/a, tok/b); DepTriple receives
            # the two indices in swapped order: (rel, b, a).
            triples = [
                DepTriple(r, int(b), int(a)) for r, a, b in re.findall(
                    r'(\S+)\(\S+?/(\d+), \S+?/(\d+)\)', parse_chunk)
            ]
            tokens = s.split()
            [tags_chunk] = re.findall(r'<!--tags=([\w\W]+?)-->', chunk)
            tags = re.findall(r'\S+/(\S+)', tags_chunk)
            parse = UDParse(tokens, tags, triples)

        else:
            parse = parser(s)

        P = PredPatt(parse, ppattopts)
        relations = P.pprint(track_rule=True)
        tags = ' '.join('%s/%s' % x for x in zip(parse.tokens, parse.tags))
        parse = parse.pprint(K=4)

        # Drop the first four characters of every output line so the text
        # lines up with the de-indented expectation built below.
        relations = relations.replace('\t', '    ')
        relations = '\n'.join(line[4:].rstrip()
                              for line in relations.split('\n'))

        # Only four-space-indented lines of the chunk are expected output.
        chunk = chunk.replace('\t', '    ')
        expected = '\n'.join(line[4:].rstrip()
                             for line in chunk.split('\n')
                             if line.startswith('    '))

        if not expected.strip():
            blank += 1

        got = relations.strip() or '<empty>'
        # Remove the bracketed annotations before comparing.
        got = re.sub(r'\s*\[.*\]', '', got)

        if expected.strip() == got.strip():
            passed += 1
        else:
            print()
            print(colored('> ' + s, 'yellow'))
            print(colored('fail', 'red'))
            print('expected:')
            for line in expected.split('\n'):
                print('   ', colored(line, 'blue'))
            print('got:')
            for line in got.split('\n'):
                print('   ', line)
            print()
            print(colored(tags, 'magenta'))
            print()
            print(colored(parse, 'magenta'))
            failed += 1

    total = passed + failed
    # Guard against a doctest file with no runnable sentences, which would
    # otherwise raise ZeroDivisionError here.
    percent = passed * 100.0 / total if total else 0.0
    msg = '[doctest] %.f%% (%s/%s) passed' % (percent, passed, total)
    if failed == 0:
        print(msg)
    else:
        print()
        print(msg)
        print()
        if blank:
            print('blank:', blank)