def parse(self, sentences, translate):
        """Parse a list of sentences.

        :param sentences: a sequence of strings.
        :param translate: optional callable applied to the raw parser
            output; when falsy, the raw dependency lines are returned.
        :returns: a list of dependency trees (a forest).
        """
        def tmpfile(prefix):
            # In debug mode, keep the temp file in the working directory
            # so it can be inspected after the run.
            return NamedTemporaryFile(
                dir='.', delete=False,
                prefix=prefix) if self.debug else NamedTemporaryFile(
                    prefix=prefix)

        with uwriter(tmpfile('in-')) as infile, uwriter(
                tmpfile('err-')) as errfile:
            # write(), not writelines(): we pass one newline-joined string.
            infile.write(u'\n'.join(sanitized(s) for s in sentences))
            infile.seek(0)  # rewind so the parser subprocess reads from the start
            dependencies = check_output(self.command,
                                        stdin=infile,
                                        stderr=errfile).split('\n')
            process, keep = self.config
            if translate:
                return translate(dependencies,
                                 sentences,
                                 base=0,
                                 parser=self.name,
                                 process=process,
                                 keep=keep)
            else:
                return dependencies
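Note: every example here imports ureader/uwriter (and friends) from a local
util module that is not shown. A minimal sketch of what they presumably are,
assuming the obvious codecs-based implementation (the same codecs.getwriter
idiom appears inline in Example #2):

import codecs

def ureader(stream, encoding='utf8'):
    # Wrap a byte stream so that reads yield unicode strings.
    return codecs.getreader(encoding)(stream)

def uwriter(stream, encoding='utf8'):
    # Wrap a byte stream so that unicode writes are encoded on the way out.
    return codecs.getwriter(encoding)(stream)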
Example #2
def main(args):
    import codecs
    from util import ureader, uwriter, uopen

    stderr = uwriter(sys.stderr)

    def handler(x):
        # Log the offending span to stderr, then drop it from the output.
        v = x.object[x.start:x.end]
        print >> stderr, repr(v), v
        return (u'', x.end)

    codecs.register_error('clear', handler)

    if '-t' not in args:
        usage(args)

    # Tags to keep, lowercased.
    tag = [t.lower() for t in args[1 + args.index('-t')].split(',')]
    enc = args[1 + args.index('-e')] if '-e' in args else 'utf8'
    stdin = ureader(sys.stdin) if '-i' not in args else uopen(
        args[1 + args.index('-i')])
    stdout = codecs.getwriter(enc)(
        sys.stdout if '-o' not in args else open(args[1 + args.index('-o')], 'wb'),
        errors='clear')
    for l in strip(stdin.read(), keep=tag):
        try:
            print >> stdout, l
        except UnicodeError:  # both encode and decode failures end up here
            print 'problem with', l
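A hypothetical invocation of this filter, based on the flags it parses
(-t is required; -e, -i and -o are optional; the script name is invented):

#   python striptags.py -t p,title -e utf8 -i corpus.xml -o stripped.txt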
Example #3
def main(args):
    from util import ureader, uwriter

    if '-l' not in args:
        usage()

    lang = args[1 + args.index('-l')]

    stdin, stdout = ureader(sys.stdin), uwriter(sys.stdout)
    uprint = partial(print, file=stdout)
    for s, (sid, relations) in parse(stdin, lang):
        uprint(u'[{}] {}'.format(sid, s))
        for sid, (_, _, r, d, h) in relations:
            if r != '**UNK**':
                # Adj relations print head before dependent; the rest dependent first.
                if r == 'Adj':
                    uprint(u'[{0}] {1} {2[0]} {3[0]}'.format(sid, r, h, d))
                else:
                    uprint(u'[{0}] {1} {2[0]} {3[0]}'.format(sid, r, d, h))
        uprint()
Example #4
def main(args):
    if args.debug_meta:
        entries = args.__dict__
        print('Metaphors for language {lang}, seed file {seed_fn}:'.format(
            **entries))
        for n, v in all_metaphors(**entries):
            print(u'{0[0]}.{0[1]} {1[0]}.{1[1]}'.format(n, v),
                  file=uwriter(sys.stdout))
    else:
        with open_file(args.json_fn) as jsonf:
            json_out = m4detect(json_in=json.load(fp=jsonf, encoding='utf-8'),
                                **args.__dict__)
            if args.out_fn == '-':
                json_dump(obj=json_out, fp=uwriter(sys.stdout))
            else:
                with uopen(args.out_fn, mode='w+b') as out_f:
                    json_dump(obj=json_out, fp=out_f)
Example #5
def findm(index, relations, outs):
    for record in index:
        try:
            sent_id, _, _, k, n, v = record
        except ValueError:
            print(u'problem with', tabjoin(record), file=uwriter(sys.stderr))
            continue  # skip malformed records; k, n, v below would be stale
        if (k, n, v) in relations:
            print(sent_id, k, n, v, sep=u'\t', file=outs)
            print(sentence(int(sent_id)), file=outs)
Example #6
def split(args):
    try:
        assert '-p' in args

        parser_kind = args[args.index('-p') + 1]
        split_re = splitter(parser_kind)

        sin = sys.stdin  # raw bytes in; each line is decoded below
        sout = uwriter(sys.stdout)
        tokenjoin = spacejoin if '-l' not in args else nljoin
        eos = [args[args.index('-eos') + 1]] if '-eos' in args else []
        lines = (l.rstrip().decode('utf8') for l in sin)
        for line in lines:
            print(tokenjoin(split_re.findall(line) + eos), file=sout)
            # Separate sentences with a blank line.
            print(file=sout)

    except AssertionError:
        usage(args)
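The splitter() helper used above is not shown. A minimal sketch of the
assumed interface -- it returns a compiled regex whose findall() tokenizes
a line -- with a deliberately naive, made-up pattern:

import re

def splitter(parser_kind):
    # Hypothetical stand-in: the real patterns are parser-specific.
    # This one merely separates word characters from punctuation.
    return re.compile(r'\w+|[^\w\s]', re.UNICODE)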
Example #7
def streams_for(categories, mode):
    # One gzipped unicode output stream per category; out_fname is a free
    # variable expected from the enclosing scope.
    return [uwriter(gzip.open('{}.{}.gz'.format(out_fname, c.lower()), mode))
            for c in categories]
def dump(self, stream=uwriter(sys.stderr)):
    for n, v in self.metaphors:
        print(u'{0[0]}.{0[1]} {1[0]}.{1[1]}'.format(n, v), file=stream)
"""Extract metaphors from a parsed Russian web blog corpus.

Created on October 24, 2012

.. moduleauthor:: ChrisXie, Luca Gilardi <*****@*****.**>
"""

from __future__ import print_function

import sys, argparse
from codecs import open
from util import uwriter, Environment, derivations, update, uopen, dprint

from pprint import pprint, pformat

ustderr = uwriter(sys.stderr)

env = Environment({
    'BASE': '/u/metanet/corpolexica/RU/RU-WAC/Russian Metaphor Extraction',
    'GW': '/u/metanet/Parsing/Results/GW',
    'BNC': '/u/metanet/corpolexica/EN',
    'DEMO': '/u/metanet/demo',
    'SEED_DIR': '/u/metanet/extraction/seeds'
})


def main():
    parser = create_parser()
    name_space = parser.parse_args()
    args = vars(name_space)
def main(args):
    """Main program.
    """
    lang = args.lang
    uout = uwriter(sys.stdout)

    def openf(fn):
        return uopen(fn) if fn != '-' else ureader(sys.stdin)

    def output(json_doc):
        """Write our JSON file out in a 'standard' way.
        """
        dump(json_doc,
             uout,
             encoding='utf-8',
             sort_keys=True,
             indent=2,
             ensure_ascii=False,
             separators=(',', ': '))

    def process_conllx(args):
        sin = openf(args.conllx_fn)
        json_out = dict(JDATA)
        json_out.update(sentences=to_json(
            combine_conllx, blocks(lambda r: len(r) > 1, lines(sin, u'\t'))))
        output(json_out)

    def process_json(args):
        sin = openf(args.json_fn)
        json_in = load(fp=sin, encoding='utf-8')
        sentences_in = json_in['sentences']
        try:
            json_out = parse(lang,
                             [sanitized(s['ctext']) for s in sentences_in])
        except KeyError:

            def make_ctext(s):  # note: mutates the sentence dict in place
                ctext = sanitized(s['text'])
                s['ctext'] = ctext
                return ctext

            json_out = parse(lang, [make_ctext(s) for s in sentences_in])

        json_out.update((k, v) for k, v in json_in.items() if k != 'sentences')
        # Sanity check (disabled): verify we haven't modified ctext.
        if False:
            for idx, (sent_in,
                      sent_out) in enumerate(zip(json_in['sentences'],
                                                 json_out['sentences']),
                                             start=1):
                ctext_in, ctext_out = sent_in['ctext'], sent_out['ctext']
                try:
                    assert ctext_in == ctext_out
                except AssertionError:
                    dprint(u'error at line {}:\n  {}  \n!=\n  {}'.format(
                        idx, ctext_in, ctext_out))

        output(json_out)

    if args.test:
        import doctest
        doctest.testmod()


    elif args.conllx_fn:
        process_conllx(args)
    else:
        process_json(args)
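The lines() and blocks() helpers used by process_conllx() are not shown.
A plausible reconstruction, assuming lines() splits each input line on a
separator and blocks() groups consecutive rows that satisfy a predicate
(here: the non-blank, tab-separated token rows of one CoNLL-X sentence):

from itertools import groupby

def lines(stream, sep=None):
    # Split each line on `sep`; a blank line becomes [u''].
    return (l.rstrip(u'\n').split(sep) for l in stream)

def blocks(pred, rows):
    # Yield maximal runs of consecutive rows satisfying `pred`.
    for key, group in groupby(rows, key=pred):
        if key:
            yield list(group)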
Example #11
def clean(args):
    sin = ureader(sys.stdin)
    sout = uwriter(sys.stdout)
    for line in sin:
        # Trailing comma suppresses print's own newline (Python 2).
        print >> sout, cleaned(line),
Example #12
def split(f, decode=False, sep=None):
    # Signature inferred from the call sites at the bottom of this example.
    return (l.decode('utf8').rstrip().split(sep)
            for l in f) if decode else (l.rstrip().split(sep) for l in f)


from os import path
from itertools import islice


def sentence(sent_id, files=512, chunk=4096):
    # Sentences are stored `chunk` per file and `files` files per directory,
    # under hex-named paths such as 00/0001.ss.
    fn, pos = divmod(sent_id - 1, chunk)
    dn = fn // files
    with uopen(path.join('%.2x' % dn, '%.4x.ss' % fn)) as f:
        return islice(f, pos, pos + 1).next()
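For illustration, a hypothetical helper exposing the same arithmetic, to
show where a given sentence ID lands on disk:

def sentence_path(sent_id, files=512, chunk=4096):
    # Mirrors sentence(): `chunk` sentences per file, `files` files per dir.
    fn, pos = divmod(sent_id - 1, chunk)
    dn = fn // files
    return path.join('%.2x' % dn, '%.4x.ss' % fn), pos

# Sentence 5000 is the 904th line (0-based offset 903) of 00/0001.ss:
# sentence_path(5000) == ('00/0001.ss', 903)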


if __name__ == '__main__':
    args = sys.argv[1:]

    multiple = '-m' in args
    if not multiple:
        main(*args)
    else:
        index, vs, vo = args[1:4]  # the three paths after -m
        r = dict(vo=u'1-компл', vs=u'предик')
        outs = uwriter(sys.stdout)
        with uopen(vs) as vss, uopen(vo) as vos, open(index) as indexs:
            relations = read(split(vss), r['vs']) | read(split(vos), r['vo'])

            findm(split(indexs, decode=True, sep=u'\t'), relations, outs)