コード例 #1
0
ファイル: view.py プロジェクト: brendano/gfl_syntax
    def parseiter(filename):
        """Yield (annotation_text, parse) pairs for each sentence in *filename*.

        Two input formats are supported:
          * JSON lines: tab-separated records whose LAST field is a parse
            graph as JSON (the first field is a sentence id, unused here).
          * GFL annotation files, possibly containing several sentences.

        For GFL input, sentences with no tokens or no annotation code yield
        (text, None); parse failures also yield (text, None) in batch mode
        (otherwise the exception propagates).

        Fixed: the JSON branch mixed tab- and space-indentation, which is a
        TabError under Python 3.
        """
        if file_is_json(filename):  # JSON input
            assert filename != '/dev/stdin', "can't view JSON on stdin sorry!"
            with codecs.open(filename, 'r', 'utf-8') as inF:
                for line in inF:
                    row = line.rstrip('\n').split('\t')
                    # The parse graph lives in the last tab-separated field.
                    obj = json.loads(row[-1])
                    parse = gfl_parser.Parse.from_json(obj)
                    anno_text = u' '.join(obj['tokens'])
                    yield anno_text, parse
        else:   # GFL annotation input
            tokens_codes_texts = process_potentially_multifile(filename)
            for tokens, code, text in tokens_codes_texts:
                if not code or not tokens:
                    # Nothing to parse for this sentence.
                    yield text, None
                else:
                    try:
                        if not is_balanced(code):
                            raise Exception("Unbalanced parentheses, brackets, or braces in annotation:\n"+code)
                        if VERBOSE:
                            print('Parsing: '+str(tokens))
                        parse = gfl_parser.parse(tokens, code, check_semantics=True)
                        yield text, parse
                    except Exception:
                        # Show the offending annotation; in batch mode log
                        # the traceback and keep going, otherwise re-raise.
                        print(code)
                        if not batch_mode: raise
                        traceback.print_exc()
                        yield text, None
コード例 #2
0
 def parseiter(filename):
     """Generate (text, parse) pairs for every sentence in *filename*.

     JSON-lines input produces a parse per record; GFL annotation input
     yields (text, None) for sentences that are empty or, in batch mode,
     fail to parse.
     """
     if not file_is_json(filename):
         # GFL annotation input (possibly several sentences per file).
         for tokens, code, text in process_potentially_multifile(filename):
             if not code or not tokens:
                 yield text, None
                 continue
             try:
                 if not is_balanced(code):
                     raise Exception(
                         "Unbalanced parentheses, brackets, or braces in annotation:\n"
                         + code)
                 if VERBOSE:
                     print('Parsing: ' + str(tokens))
                 yield text, gfl_parser.parse(tokens, code, check_semantics=True)
             except Exception:
                 # Dump the annotation; keep going only in batch mode.
                 print(code)
                 if not batch_mode: raise
                 traceback.print_exc()
                 yield text, None
     else:
         # JSON input: one tab-separated record per line, parse JSON last.
         assert filename != '/dev/stdin', "can't view JSON on stdin sorry!"
         with codecs.open(filename, 'r', 'utf-8') as fh:
             for lineno, raw in enumerate(fh):
                 fields = raw.rstrip('\n').split('\t')
                 sentid = fields[0]
                 graph = json.loads(fields[-1])
                 parsed = gfl_parser.Parse.from_json(graph)
                 yield u' '.join(graph['tokens']), parsed
コード例 #3
0
ファイル: view.py プロジェクト: nsaphra/gfl_syntax
    for filename in args:
        print "FILE",filename
        # Output files are named after the input file; stdin gets 'tmp'.
        if filename=='/dev/stdin':
            bigbase = 'tmp'
        else:
            bigbase = re.sub(r'\.(txt|anno)$','', filename)

        tokens_codes_texts = process_potentially_multifile(filename)

        # Single-sentence file: parse, render, optionally open, then move on.
        if len(tokens_codes_texts)==1:
            tokens,code,anno_text = tokens_codes_texts[0]
            try:
                if not is_balanced(code):
                    raise Exception("Unbalanced parentheses, brackets, or braces in annotation")
                parse = gfl_parser.parse(tokens, code, check_semantics=True)
            except Exception:
                # In batch mode log the failure and continue with the next
                # file; otherwise abort immediately.
                if not batch_mode: raise
                traceback.print_exc()
                continue
            base = bigbase
            process_one_parse(parse, base)
            htmlfile = make_html(base, anno_text, base+'.png')
            if do_open:
                # Open either the HTML view or the raw PNG, per options.
                if opts.open_html:
                    desktop_open(htmlfile)
                else:
                    desktop_open("{base}.png".format(**locals()))
            continue  ## to next file

        # Multi-sentence file: collect parses (continues past this excerpt).
        parses = []
コード例 #4
0
ファイル: make_json.py プロジェクト: ldmt-muri/gfl_syntax
SentenceID TAB SpaceSepTokens TAB {ParseGraphAsJson}

E.g.:
  scripts/make_json.py anno/tweets/dev.0000.anno

... It may be desirable to use ID information contained in other parts of the
container, but I guess we'll use filenames for now...
"""
import sys,re,os
try:
  import ujson as json
except ImportError:
  import json
import view
import gfl_parser

args = sys.argv[1:]
for filename in args:
    # Each input file is one document; sentences are enumerated within it.
    tokens_codes_annos = view.process_potentially_multifile(filename)
    doc_id = re.sub(r'\.(anno|txt)$','', filename)
    multi = len(tokens_codes_annos) > 1

    for idx, (tokens, code, anno) in enumerate(tokens_codes_annos):
        if not code:
            continue
        # Multi-sentence documents get a ':<index>' suffix on the id.
        sentence_id = doc_id + (':' + str(idx) if multi else '')
        parseJ = gfl_parser.parse(tokens, code).to_json()
        print("{id}\t{tokens}\t{parse}".format(
            id=sentence_id, tokens=' '.join(tokens), parse=json.dumps(parseJ)))

コード例 #5
0
try:
    import ujson as json
except ImportError:
    import json

import view

sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'gflparser'))
import gfl_parser

args = sys.argv[1:]
for filename in args:
    # Each input file is one document; sentences are enumerated within it.
    records = view.process_potentially_multifile(filename)
    doc_id = re.sub(r'\.(anno|txt)$', '', filename)
    multi = len(records) > 1

    for idx, (tokens, code, anno) in enumerate(records):
        if not code:
            continue
        # Multi-sentence documents get a ':<index>' suffix on the id.
        sentence_id = doc_id + (':' + str(idx) if multi else '')
        try:
            parse = gfl_parser.parse(tokens, code, check_semantics=True)
        except gfl_parser.GFLError:
            # Report which sentence failed before propagating the error.
            print(idx, file=sys.stderr)
            print(anno, file=sys.stderr)
            raise
        parseJ = parse.to_json()
        print(sentence_id,
              ' '.join(tokens).encode('utf-8'),
              json.dumps(parseJ),
              sep='\t')
コード例 #6
0
def getArcLists(filename, leftMulti=False, uneven=False):
    """Build per-sentence arc whitelists and blacklists from a GFL file.

    Each line of *filename* is a JSON object with "sent" (space-separated
    tokens) and "anno" (a GFL annotation).  For every sentence, two lists
    of directed-arc strings of the form "HEAD -> CHILD" (1-based token
    indices) are produced: arcs that should be enforced (whitelist) and
    arcs that should be excluded (blacklist).

    Parameters:
        filename  -- path to a JSON-lines annotation file
        leftMulti -- build left-branching instead of right-branching arcs
                     for multiword expressions
        uneven    -- alternative handling for coordinations with exactly
                     two conjuncts and a single word coordinator

    Returns:
        (whiteLists, blackLists) -- parallel lists, one entry per input line.

    NOTE(review): dict.iteritems() makes this Python 2 only; also the file
    handle is not closed if gfl_parser.parse raises (no with/finally).
    """
    whiteLists = list()
    blackLists = list()
    rf = open(filename)
    for line in rf:
        whiteList = list()
        blackList = list()
        brackets = list()  # [left, right] 1-based token spans of FE nodes
        jsonobj = json.loads(line)
        sentenceLength = len(jsonobj["sent"].split())
        # Normalize line endings / blank lines and turn square brackets
        # into parentheses before handing the annotation to the parser.
        parse = gfl_parser.parse(jsonobj["sent"].split(),
                                 jsonobj["anno"].replace("\r\n", "\n").replace(
                                     "\n\n",
                                     "\n").replace("[", "(").replace("]", ")"),
                                 check_semantics=True)
        parseJ = parse.to_json()
        # feCoverage: FE node -> list of 1-based token positions and/or
        # names of nested FE nodes not yet resolved to positions.
        feCoverage = {}
        # feSolidCover: FE node -> [min, max] token span, once fully resolved.
        feSolidCover = {}

        # Collect the direct constituents of each FE node.
        for dep in [dep for dep in parseJ['deps'] if dep[2] == 'fe']:
            if dep[0] not in feCoverage.keys():
                feCoverage[dep[0]] = []

            if dep[1][0:2] != "FE":
                feCoverage[dep[0]].append(
                    parseJ['tokens'].index(dep[1][2:-1]) + 1)
            else:
                # Nested FE: keep the node name; resolved in the loop below.
                feCoverage[dep[0]].append(dep[1])

        # Seed the solid covers with FEs containing only token positions.
        for k, v in feCoverage.iteritems():
            if len([item for item in v if isinstance(item, str)]) == 0:
                feSolidCover[k] = [sorted(v)[0], sorted(v)[-1]]

        # Fixpoint: expand nested FE references until every FE has a span.
        while len(feSolidCover.keys()) != len(feCoverage.keys()):
            # Replace strings with limits if possible
            for k, v in feCoverage.iteritems():
                for solidKey in feSolidCover.keys():
                    if solidKey in v:
                        i = v.index(solidKey)
                        v[i:i + 1] = feSolidCover[solidKey][0], feSolidCover[
                            solidKey][1]
                feCoverage[k] = v

            # Put completed FE nodes into Solid
            for k, v in feCoverage.iteritems():
                if len([item for item in v if isinstance(item, str)]) == 0:
                    feSolidCover[k] = [sorted(v)[0], sorted(v)[-1]]

        for k, v in feSolidCover.iteritems():
            brackets.append(v)

        # Handle Multiword Expressions
        for node in parseJ["nodes"]:
            if node[0:2] == "MW":
                if leftMulti:
                    gen = CreateLeftBranchingMultiword(node[3:-1])
                else:
                    gen = CreateRightBranchingMultiword(node[3:-1])
                for dep in gen:
                    whiteList.append(dep.strip())

        # Handle Coordination Nodes
        for coord in parseJ["coords"]:
            if uneven:
                # Chain: first conjunct -> coordinator -> second conjunct.
                if len(coord[2]) == 1 and coord[2][0][0] == "W" and len(
                        coord[1]) == 2:
                    whiteList.append(
                        str(parseJ['tokens'].index(coord[1][0][2:-1]) + 1) +
                        " -> " +
                        str(parseJ['tokens'].index(coord[2][0][2:-1]) + 1))
                    whiteList.append(
                        str(parseJ['tokens'].index(coord[2][0][2:-1]) + 1) +
                        " -> " +
                        str(parseJ['tokens'].index(coord[1][1][2:-1]) + 1))
            else:
                # Flat: coordinator heads every word conjunct.
                if len(coord[2]) == 1 and coord[2][0][0] == "W":
                    for target in coord[1]:
                        if target[0] == "W":
                            whiteList.append(
                                str(parseJ['tokens'].index(coord[2][0][2:-1]) +
                                    1) + " -> " +
                                str(parseJ['tokens'].index(target[2:-1]) + 1))
        deps = parseJ["deps"]

        # Translate word/MW dependencies into "HEAD -> CHILD" index strings.
        # For MW nodes, the last word of the expression stands in for it.
        for dep in deps:
            if dep[0][0] == "W" and dep[1][0] == "W":
                whiteList.append(
                    str(parseJ['tokens'].index(dep[0][2:-1]) + 1) + " -> " +
                    str(parseJ['tokens'].index(dep[1][2:-1]) + 1))
            elif dep[0][0:2] == "MW":
                whiteList.append(
                    str(parseJ['tokens'].index(dep[0][3:-1].split("_")[-1]) +
                        1) + " -> " +
                    str(parseJ['tokens'].index(dep[1][2:-1]) + 1))
            elif dep[1][0:2] == "MW":
                whiteList.append(
                    str(parseJ['tokens'].index(dep[0][2:-1]) + 1) + " -> " +
                    str(parseJ['tokens'].index(dep[1][3:-1].split("_")[-1]) +
                        1))

        # Add reversed whitelist (Only really useful if we are not doing 'absolute' white-listing, but doesn't hurt in any case)
        #for dep in whiteList:
        #    deps = dep.strip().split(" -> ")
        #    blackList.append(deps[1]+" -> "+deps[0])

        # Add 'No External Children' and 'Single External Head' constraints
        for bracket in brackets:
            b_limit_l = bracket[0]
            b_limit_r = bracket[1]
            b_hasHead = True  # FIXME: Determine whether the head of the bracket is known.
            # If it is, we need to blacklist any arcs with heads external to the bracket.
            # If it isn't, still do the blacklisting, except leave external heads to the bracket head as neutral.

            for headIndex in range(b_limit_l, b_limit_r):
                for childIndex in range(0, sentenceLength):
                    if ((childIndex < b_limit_l) or childIndex > b_limit_r):
                        # blacklist external children
                        blackList.append(
                            str(headIndex) + " -> " + str(childIndex))

                        # blacklist external heads (notation reversed)
                        blackList.append(
                            str(childIndex) + " -> " + str(headIndex))

            if b_hasHead:
                # The bracket head is assumed to be its leftmost token here.
                head = b_limit_l
                # FIXME: Unblacklist external connections to the bracket head.
                for headIndex in range(0, sentenceLength):
                    if ((headIndex < b_limit_l) or headIndex > b_limit_r):
                        try:
                            blackList.remove(
                                str(headIndex) + " -> " + str(head))
                        except ValueError:
                            pass
                pass

        # Clean up
        blackList = [dep for dep in blackList if dep not in whiteList]

        whiteLists.append(whiteList)
        blackLists.append(blackList)
    rf.close()

    return whiteLists, blackLists
コード例 #7
0
ファイル: gflDeps.py プロジェクト: jmielens/convex-mst
def getArcLists(filename, leftMulti=False, uneven=False):
    """Build per-sentence arc whitelists and blacklists from annotations.

    Two input formats are dispatched on the file extension:
      * "*json"  -- JSON lines with "sent" (space-separated tokens) and
        "anno" (a GFL annotation); whitelists and blacklists of directed
        arcs "HEAD -> CHILD" (1-based token indices) are derived from the
        parsed GFL graph, including FE-span constraints.
      * "*conll" -- CoNLL dependency format; gold arcs become whitelists
        and blacklists stay empty.

    Parameters:
        filename  -- path to the annotation file
        leftMulti -- build left-branching instead of right-branching arcs
                     for multiword expressions (JSON input only)
        uneven    -- alternative handling for coordinations with exactly
                     two conjuncts and one word coordinator (JSON only)

    Returns:
        (whiteLists, blackLists) -- parallel lists, one entry per sentence.

    NOTE(review): dict.iteritems() and the bare print make this Python 2
    only; the file handle also leaks if gfl_parser.parse raises.
    """
    whiteLists = list()
    blackLists = list()
    if filename[-4:] == "json":
        rf = open(filename)
        for line in rf:
            whiteList = list()
            blackList = list()
            brackets = list()  # [left, right] token spans of FE nodes
            jsonobj = json.loads(line)
            sentenceLength = len(jsonobj["sent"].split())
            # Normalize line endings / blank lines and turn square brackets
            # into parentheses before handing the annotation to the parser.
            parse = gfl_parser.parse(
                jsonobj["sent"].split(),
                jsonobj["anno"].replace("\r\n",
                                        "\n").replace("\n\n", "\n").replace(
                                            "[", "(").replace("]", ")"),
                check_semantics=True)
            parseJ = parse.to_json()
            # feCoverage: FE node -> token positions and/or unresolved
            # nested FE names; feSolidCover: FE node -> [min, max] span.
            feCoverage = {}
            feSolidCover = {}

            # Collect the direct constituents of each FE node.
            for dep in [dep for dep in parseJ['deps'] if dep[2] == 'fe']:
                if dep[0] not in feCoverage.keys():
                    feCoverage[dep[0]] = []

                if dep[1][0:2] != "FE":
                    feCoverage[dep[0]].append(
                        parseJ['tokens'].index(dep[1][2:-1]) + 1)
                else:
                    # Nested FE: keep the name; resolved in the loop below.
                    feCoverage[dep[0]].append(dep[1])

            # Seed the solid covers with FEs containing only positions.
            for k, v in feCoverage.iteritems():
                if len([item for item in v if isinstance(item, str)]) == 0:
                    feSolidCover[k] = [sorted(v)[0], sorted(v)[-1]]

            # Fixpoint: expand nested FE references until all have spans.
            while len(feSolidCover.keys()) != len(feCoverage.keys()):
                # Replace strings with limits if possible
                for k, v in feCoverage.iteritems():
                    for solidKey in feSolidCover.keys():
                        if solidKey in v:
                            i = v.index(solidKey)
                            v[i:i + 1] = feSolidCover[solidKey][
                                0], feSolidCover[solidKey][1]
                    feCoverage[k] = v

                # Put completed FE nodes into Solid
                for k, v in feCoverage.iteritems():
                    if len([item for item in v if isinstance(item, str)]) == 0:
                        feSolidCover[k] = [sorted(v)[0], sorted(v)[-1]]

            for k, v in feSolidCover.iteritems():
                brackets.append(v)

            # Handle Multiword Expressions
            for node in parseJ["nodes"]:
                if node[0:2] == "MW":
                    if leftMulti:
                        gen = CreateLeftBranchingMultiword(node[3:-1])
                    else:
                        gen = CreateRightBranchingMultiword(node[3:-1])
                    for dep in gen:
                        whiteList.append(dep.strip())

            # Handle Coordination Nodes
            for coord in parseJ["coords"]:
                if uneven:
                    # Chain: conjunct -> coordinator -> other conjunct.
                    if len(coord[2]) == 1 and coord[2][0][0] == "W" and len(
                            coord[1]) == 2:
                        whiteList.append(
                            str(parseJ['tokens'].index(coord[1][0][2:-1]) +
                                1) + " -> " +
                            str(parseJ['tokens'].index(coord[2][0][2:-1]) + 1))
                        whiteList.append(
                            str(parseJ['tokens'].index(coord[2][0][2:-1]) +
                                1) + " -> " +
                            str(parseJ['tokens'].index(coord[1][1][2:-1]) + 1))
                else:
                    # Flat: coordinator heads every word conjunct.
                    if len(coord[2]) == 1 and coord[2][0][0] == "W":
                        for target in coord[1]:
                            if target[0] == "W":
                                whiteList.append(
                                    str(parseJ['tokens'].index(
                                        coord[2][0][2:-1]) + 1) + " -> " +
                                    str(parseJ['tokens'].index(target[2:-1]) +
                                        1))
            deps = parseJ["deps"]

            # Translate word/MW dependencies into "HEAD -> CHILD" strings;
            # an MW node is represented by the last word of the expression.
            for dep in deps:
                if dep[0][0] == "W" and dep[1][0] == "W":
                    whiteList.append(
                        str(parseJ['tokens'].index(dep[0][2:-1]) + 1) +
                        " -> " + str(parseJ['tokens'].index(dep[1][2:-1]) + 1))
                elif dep[0][0:2] == "MW":
                    whiteList.append(
                        str(parseJ['tokens'].index(dep[0][3:-1].split("_")[-1])
                            + 1) + " -> " +
                        str(parseJ['tokens'].index(dep[1][2:-1]) + 1))
                elif dep[1][0:2] == "MW":
                    whiteList.append(
                        str(parseJ['tokens'].index(dep[0][2:-1]) + 1) +
                        " -> " + str(parseJ['tokens'].index(dep[1][3:-1].split(
                            "_")[-1]) + 1))

            # Add reversed whitelist (Only really useful if we are not doing 'absolute' white-listing, but doesn't hurt in any case)
            #for dep in whiteList:
            #    deps = dep.strip().split(" -> ")
            #    blackList.append(deps[1]+" -> "+deps[0])

            # Children of FE Nodes
            # NOTE(review): `dep[2] == None` — prefer `is None`.
            for dep in [
                    dep for dep in parseJ['deps']
                    if (dep[0][0:2] == 'FE' and dep[2] == None)
            ]:
                if dep[1][0] == '$' or dep[0][0] == '$':
                    continue
                if dep[1][0:2] == 'FE':  # FE -> FE
                    (head_fe_l, head_fe_r) = feSolidCover[dep[0]]
                    (tail_fe_l, tail_fe_r) = feSolidCover[dep[1]]

                    # Block all children in tail FE from being parent of any node in head FE
                    for tail in range(tail_fe_l, tail_fe_r + 1):
                        for head in range(head_fe_l, head_fe_r + 1):
                            blackList.append(str(tail) + " -> " + str(head))

                else:  # FE -> Word
                    # Block children from being parent of any node in the FE
                    (fe_l, fe_r) = feSolidCover[dep[0]]
                    for tail in range(fe_l, fe_r + 1):
                        blackList.append(
                            str(parseJ['tokens'].index(dep[1][2:-1]) + 1) +
                            " -> " + str(tail))

            # Parents of FE Nodes
            for dep in [
                    dep for dep in parseJ['deps']
                    if (dep[1][0:2] == 'FE' and dep[2] == None)
            ]:
                if dep[1][0] == '$' or dep[0][0] == '$':
                    continue
                if dep[0][0:2] == 'FE':  # FE -> FE
                    pass  # We already did this above...
                else:  # Word -> FE
                    (fe_l, fe_r) = feSolidCover[dep[1]]
                    for fe in range(fe_l, fe_r + 1):
                        # Block fe-parent from being tail of any node in the FE
                        blackList.append(
                            str(fe) + " -> " +
                            str(parseJ['tokens'].index(dep[0][2:-1]) + 1))

                        # Block fe from being tail of any node that isn't the annotated head
                        for head in range(0, sentenceLength + 1):
                            if (head < fe_l or head > fe_r
                                ) and str(head) != str(
                                    parseJ['tokens'].index(dep[0][2:-1]) + 1):
                                blackList.append(str(head) + " -> " + str(fe))

            # Clean up
            blackList = [dep for dep in blackList if dep not in whiteList]

            whiteLists.append(whiteList)
            blackLists.append(blackList)
        rf.close()
    elif filename[-5:] == "conll":
        raw = open(filename).read()
        # Sentences are blank-line separated in CoNLL format.
        sentences = raw.split("\n\n")
        for sentence in sentences:
            # Sanity checking for EOF/etc.
            if len(sentence) < 5:
                continue

            whiteList = list()
            blackList = list()

            for line in sentence.split("\n"):
                tokens = line.split("\t")
                if len(tokens) < 5:
                    continue

                # CoNLL columns: 0 = token index, 6 = head index.
                childIndex = tokens[0]
                parentIndex = tokens[6]

                # NOTE(review): parentIndex is a str, so `>= 0` compares a
                # string with an int — always True on Python 2, TypeError on
                # Python 3.  Likely intended: int(parentIndex) >= 0.
                if parentIndex >= 0:
                    whiteList.append(parentIndex + " -> " + childIndex)

            whiteLists.append(whiteList)
            blackLists.append(blackList)
    else:
        print "Error: Bad File Extension"

    return whiteLists, blackLists