def parseiter(filename):
    if file_is_json(filename):  # JSON input
        assert filename != '/dev/stdin', "can't view JSON on stdin sorry!"
        with codecs.open(filename, 'r', 'utf-8') as inF:
            for i, line in enumerate(inF):
                row = line.rstrip('\n').split('\t')
                sentid = row[0]
                obj = json.loads(row[-1])
                parse = gfl_parser.Parse.from_json(obj)
                anno_text = u' '.join(obj['tokens'])
                yield anno_text, parse
    else:  # GFL annotation input
        tokens_codes_texts = process_potentially_multifile(filename)
        for tokens, code, text in tokens_codes_texts:
            if not code or not tokens:
                yield text, None
            else:
                try:
                    if not is_balanced(code):
                        raise Exception("Unbalanced parentheses, brackets, or braces in annotation:\n" + code)
                    if VERBOSE:
                        print 'Parsing: ' + str(tokens)
                    parse = gfl_parser.parse(tokens, code, check_semantics=True)
                    yield text, parse
                except Exception:
                    print code
                    if not batch_mode:
                        raise
                    traceback.print_exc()
                    yield text, None
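# A minimal usage sketch for parseiter() (an illustration, not part of the
# original module): it assumes the module-level globals VERBOSE and batch_mode
# are already configured and that 'example.anno' is a hypothetical annotation file.
def _demo_parseiter(filename='example.anno'):
    for anno_text, parse in parseiter(filename):
        if parse is None:
            continue  # sentence had no annotation code, or parsing failed in batch mode
        print anno_text
        print parse.to_json()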
for filename in args:
    print "FILE", filename
    if filename == '/dev/stdin':
        bigbase = 'tmp'
    else:
        bigbase = re.sub(r'\.(txt|anno)$', '', filename)
    tokens_codes_texts = process_potentially_multifile(filename)
    if len(tokens_codes_texts) == 1:
        tokens, code, anno_text = tokens_codes_texts[0]
        try:
            if not is_balanced(code):
                raise Exception("Unbalanced parentheses, brackets, or braces in annotation")
            parse = gfl_parser.parse(tokens, code, check_semantics=True)
        except Exception:
            if not batch_mode:
                raise
            traceback.print_exc()
            continue
        base = bigbase
        process_one_parse(parse, base)
        htmlfile = make_html(base, anno_text, base + '.png')
        if do_open:
            if opts.open_html:
                desktop_open(htmlfile)
            else:
                desktop_open("{base}.png".format(**locals()))
        continue  ## to next file
    parses = []
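# The is_balanced() helper called above is not shown in this excerpt; a
# minimal stack-based sketch of what such a checker could look like (an
# assumption, not the project's actual implementation):
def _is_balanced_sketch(code):
    """Return True iff (), [], and {} nest properly in `code`."""
    pairs = {')': '(', ']': '[', '}': '{'}
    stack = []
    for ch in code:
        if ch in '([{':
            stack.append(ch)
        elif ch in pairs:
            if not stack or stack.pop() != pairs[ch]:
                return False
    return not stack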
SentenceID TAB SpaceSepTokens TAB {ParseGraphAsJson}

E.g.: scripts/make_json.py anno/tweets/dev.0000.anno ...

It may be desirable to use ID information contained in other parts of the
container, but I guess we'll use filenames for now...
"""
import sys, re, os
try:
    import ujson as json
except ImportError:
    import json

import view
import gfl_parser

args = sys.argv[1:]
for filename in args:
    tokens_codes_annos = view.process_potentially_multifile(filename)
    doc_id = re.sub(r'\.(anno|txt)$', '', filename)
    for i, (tokens, code, anno) in enumerate(tokens_codes_annos):
        if not code:
            continue
        sentence_id = doc_id
        if len(tokens_codes_annos) > 1:
            sentence_id += ':' + str(i)
        parse = gfl_parser.parse(tokens, code)
        parseJ = parse.to_json()
        print "{id}\t{tokens}\t{parse}".format(id=sentence_id, tokens=' '.join(tokens), parse=json.dumps(parseJ))
from __future__ import print_function  # the print() calls below need this under Python 2
import sys, re, os
try:
    import ujson as json
except ImportError:
    import json

import view
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'gflparser'))
import gfl_parser

args = sys.argv[1:]
for filename in args:
    tokens_codes_annos = view.process_potentially_multifile(filename)
    doc_id = re.sub(r'\.(anno|txt)$', '', filename)
    for i, (tokens, code, anno) in enumerate(tokens_codes_annos):
        if not code:
            continue
        sentence_id = doc_id
        if len(tokens_codes_annos) > 1:
            sentence_id += ':' + str(i)
        try:
            parse = gfl_parser.parse(tokens, code, check_semantics=True)
            parseJ = parse.to_json()
            print(sentence_id, ' '.join(tokens).encode('utf-8'), json.dumps(parseJ), sep='\t')
        except gfl_parser.GFLError:
            print(i, file=sys.stderr)
            print(anno, file=sys.stderr)
            raise
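# Hedged round-trip sketch (assumes a hypothetical output file 'dev.json'
# written by this script): each line is SentenceID, space-separated tokens,
# and the parse graph as JSON, tab-separated, mirroring the JSON branch of
# view.parseiter().
def _read_json_lines(path='dev.json'):
    import codecs
    with codecs.open(path, 'r', 'utf-8') as f:
        for line in f:
            sent_id, toks, parse_json = line.rstrip('\n').split('\t')
            yield sent_id, toks.split(), gfl_parser.Parse.from_json(json.loads(parse_json))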
def getArcLists(filename, leftMulti=False, uneven=False):
    whiteLists = list()
    blackLists = list()
    rf = open(filename)
    for line in rf:  # one JSON object (sentence + annotation) per line
        whiteList = list()
        blackList = list()
        brackets = list()
        jsonobj = json.loads(line)
        sentenceLength = len(jsonobj["sent"].split())
        # Normalize newlines and treat [ ] brackets as ( ) groups before parsing.
        parse = gfl_parser.parse(
            jsonobj["sent"].split(),
            jsonobj["anno"].replace("\r\n", "\n").replace("\n\n", "\n").replace("[", "(").replace("]", ")"),
            check_semantics=True)
        parseJ = parse.to_json()

        # Determine the token span covered by each FE node: collect direct
        # token positions, then iteratively splice in the spans of nested FE
        # nodes until every FE resolves to a [leftmost, rightmost] pair.
        feCoverage = {}
        feSolidCover = {}
        for dep in [dep for dep in parseJ['deps'] if dep[2] == 'fe']:
            if dep[0] not in feCoverage.keys():
                feCoverage[dep[0]] = []
            if dep[1][0:2] != "FE":
                feCoverage[dep[0]].append(parseJ['tokens'].index(dep[1][2:-1]) + 1)
            else:
                feCoverage[dep[0]].append(dep[1])
        for k, v in feCoverage.iteritems():
            if len([item for item in v if isinstance(item, str)]) == 0:
                feSolidCover[k] = [sorted(v)[0], sorted(v)[-1]]
        while len(feSolidCover.keys()) != len(feCoverage.keys()):
            # Replace strings with limits if possible
            for k, v in feCoverage.iteritems():
                for solidKey in feSolidCover.keys():
                    if solidKey in v:
                        i = v.index(solidKey)
                        v[i:i + 1] = feSolidCover[solidKey][0], feSolidCover[solidKey][1]
                feCoverage[k] = v
            # Put completed FE nodes into Solid
            for k, v in feCoverage.iteritems():
                if len([item for item in v if isinstance(item, str)]) == 0:
                    feSolidCover[k] = [sorted(v)[0], sorted(v)[-1]]
        for k, v in feSolidCover.iteritems():
            brackets.append(v)

        # Handle Multiword Expressions
        for node in parseJ["nodes"]:
            if node[0:2] == "MW":
                if leftMulti:
                    gen = CreateLeftBranchingMultiword(node[3:-1])
                else:
                    gen = CreateRightBranchingMultiword(node[3:-1])
                for dep in gen:
                    whiteList.append(dep.strip())

        # Handle Coordination Nodes
        for coord in parseJ["coords"]:
            if uneven:
                if len(coord[2]) == 1 and coord[2][0][0] == "W" and len(coord[1]) == 2:
                    whiteList.append(
                        str(parseJ['tokens'].index(coord[1][0][2:-1]) + 1) + " -> " +
                        str(parseJ['tokens'].index(coord[2][0][2:-1]) + 1))
                    whiteList.append(
                        str(parseJ['tokens'].index(coord[2][0][2:-1]) + 1) + " -> " +
                        str(parseJ['tokens'].index(coord[1][1][2:-1]) + 1))
            else:
                if len(coord[2]) == 1 and coord[2][0][0] == "W":
                    for target in coord[1]:
                        if target[0] == "W":
                            whiteList.append(
                                str(parseJ['tokens'].index(coord[2][0][2:-1]) + 1) + " -> " +
                                str(parseJ['tokens'].index(target[2:-1]) + 1))

        # Plain dependencies; an MW node is represented by its last token.
        deps = parseJ["deps"]
        for dep in deps:
            if dep[0][0] == "W" and dep[1][0] == "W":
                whiteList.append(
                    str(parseJ['tokens'].index(dep[0][2:-1]) + 1) + " -> " +
                    str(parseJ['tokens'].index(dep[1][2:-1]) + 1))
            elif dep[0][0:2] == "MW":
                whiteList.append(
                    str(parseJ['tokens'].index(dep[0][3:-1].split("_")[-1]) + 1) + " -> " +
                    str(parseJ['tokens'].index(dep[1][2:-1]) + 1))
            elif dep[1][0:2] == "MW":
                whiteList.append(
                    str(parseJ['tokens'].index(dep[0][2:-1]) + 1) + " -> " +
                    str(parseJ['tokens'].index(dep[1][3:-1].split("_")[-1]) + 1))

        # Add reversed whitelist (Only really useful if we are not doing
        # 'absolute' white-listing, but doesn't hurt in any case)
        #for dep in whiteList:
        #    deps = dep.strip().split(" -> ")
        #    blackList.append(deps[1] + " -> " + deps[0])

        # Add 'No External Children' and 'Single External Head' constraints
        for bracket in brackets:
            b_limit_l = bracket[0]
            b_limit_r = bracket[1]
            b_hasHead = True
            # FIXME: Determine whether the head of the bracket is known.
            # If it is, we need to blacklist any arcs with heads external to the bracket.
            # If it isn't, still do the blacklisting, except leave external heads
            # to the bracket head as neutral.
            for headIndex in range(b_limit_l, b_limit_r):
                for childIndex in range(0, sentenceLength):
                    if (childIndex < b_limit_l) or childIndex > b_limit_r:
                        # blacklist external children
                        blackList.append(str(headIndex) + " -> " + str(childIndex))
                        # blacklist external heads (notation reversed)
                        blackList.append(str(childIndex) + " -> " + str(headIndex))
            if b_hasHead:
                head = b_limit_l
                # FIXME: Unblacklist external connections to the bracket head.
                for headIndex in range(0, sentenceLength):
                    if (headIndex < b_limit_l) or headIndex > b_limit_r:
                        try:
                            blackList.remove(str(headIndex) + " -> " + str(head))
                        except ValueError:
                            pass

        # Clean up
        blackList = [dep for dep in blackList if dep not in whiteList]
        whiteLists.append(whiteList)
        blackLists.append(blackList)
    rf.close()
    return whiteLists, blackLists
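# Worked toy example of the FE-coverage fixpoint above (hypothetical data):
# FE1 directly covers tokens 2 and 3; FE2 covers token 5 plus all of FE1.
#
#     feCoverage = {'FE1': [2, 3], 'FE2': [5, 'FE1']}
#     pass 1:  feSolidCover = {'FE1': [2, 3]}              # FE1 has no FE children
#     pass 2:  FE2's list becomes [5, 2, 3]                # FE1's span spliced in
#              feSolidCover = {'FE1': [2, 3], 'FE2': [2, 5]}
#
# Each bracket is thus the [leftmost, rightmost] token span of one FE node.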
def getArcLists(filename, leftMulti=False, uneven=False):
    whiteLists = list()
    blackLists = list()
    if filename[-4:] == "json":
        rf = open(filename)
        for line in rf:  # one JSON object (sentence + annotation) per line
            whiteList = list()
            blackList = list()
            brackets = list()
            jsonobj = json.loads(line)
            sentenceLength = len(jsonobj["sent"].split())
            parse = gfl_parser.parse(
                jsonobj["sent"].split(),
                jsonobj["anno"].replace("\r\n", "\n").replace("\n\n", "\n").replace("[", "(").replace("]", ")"),
                check_semantics=True)
            parseJ = parse.to_json()

            # Determine the token span covered by each FE node, as above.
            feCoverage = {}
            feSolidCover = {}
            for dep in [dep for dep in parseJ['deps'] if dep[2] == 'fe']:
                if dep[0] not in feCoverage.keys():
                    feCoverage[dep[0]] = []
                if dep[1][0:2] != "FE":
                    feCoverage[dep[0]].append(parseJ['tokens'].index(dep[1][2:-1]) + 1)
                else:
                    feCoverage[dep[0]].append(dep[1])
            for k, v in feCoverage.iteritems():
                if len([item for item in v if isinstance(item, str)]) == 0:
                    feSolidCover[k] = [sorted(v)[0], sorted(v)[-1]]
            while len(feSolidCover.keys()) != len(feCoverage.keys()):
                # Replace strings with limits if possible
                for k, v in feCoverage.iteritems():
                    for solidKey in feSolidCover.keys():
                        if solidKey in v:
                            i = v.index(solidKey)
                            v[i:i + 1] = feSolidCover[solidKey][0], feSolidCover[solidKey][1]
                    feCoverage[k] = v
                # Put completed FE nodes into Solid
                for k, v in feCoverage.iteritems():
                    if len([item for item in v if isinstance(item, str)]) == 0:
                        feSolidCover[k] = [sorted(v)[0], sorted(v)[-1]]
            for k, v in feSolidCover.iteritems():
                brackets.append(v)

            # Handle Multiword Expressions
            for node in parseJ["nodes"]:
                if node[0:2] == "MW":
                    if leftMulti:
                        gen = CreateLeftBranchingMultiword(node[3:-1])
                    else:
                        gen = CreateRightBranchingMultiword(node[3:-1])
                    for dep in gen:
                        whiteList.append(dep.strip())

            # Handle Coordination Nodes
            for coord in parseJ["coords"]:
                if uneven:
                    if len(coord[2]) == 1 and coord[2][0][0] == "W" and len(coord[1]) == 2:
                        whiteList.append(
                            str(parseJ['tokens'].index(coord[1][0][2:-1]) + 1) + " -> " +
                            str(parseJ['tokens'].index(coord[2][0][2:-1]) + 1))
                        whiteList.append(
                            str(parseJ['tokens'].index(coord[2][0][2:-1]) + 1) + " -> " +
                            str(parseJ['tokens'].index(coord[1][1][2:-1]) + 1))
                else:
                    if len(coord[2]) == 1 and coord[2][0][0] == "W":
                        for target in coord[1]:
                            if target[0] == "W":
                                whiteList.append(
                                    str(parseJ['tokens'].index(coord[2][0][2:-1]) + 1) + " -> " +
                                    str(parseJ['tokens'].index(target[2:-1]) + 1))

            # Plain dependencies; an MW node is represented by its last token.
            deps = parseJ["deps"]
            for dep in deps:
                if dep[0][0] == "W" and dep[1][0] == "W":
                    whiteList.append(
                        str(parseJ['tokens'].index(dep[0][2:-1]) + 1) + " -> " +
                        str(parseJ['tokens'].index(dep[1][2:-1]) + 1))
                elif dep[0][0:2] == "MW":
                    whiteList.append(
                        str(parseJ['tokens'].index(dep[0][3:-1].split("_")[-1]) + 1) + " -> " +
                        str(parseJ['tokens'].index(dep[1][2:-1]) + 1))
                elif dep[1][0:2] == "MW":
                    whiteList.append(
                        str(parseJ['tokens'].index(dep[0][2:-1]) + 1) + " -> " +
                        str(parseJ['tokens'].index(dep[1][3:-1].split("_")[-1]) + 1))

            # Add reversed whitelist (Only really useful if we are not doing
            # 'absolute' white-listing, but doesn't hurt in any case)
            #for dep in whiteList:
            #    deps = dep.strip().split(" -> ")
            #    blackList.append(deps[1] + " -> " + deps[0])

            # Children of FE Nodes
            for dep in [dep for dep in parseJ['deps']
                        if (dep[0][0:2] == 'FE' and dep[2] is None)]:
                if dep[1][0] == '$' or dep[0][0] == '$':
                    continue
                if dep[1][0:2] == 'FE':
                    # FE -> FE
                    (head_fe_l, head_fe_r) = feSolidCover[dep[0]]
                    (tail_fe_l, tail_fe_r) = feSolidCover[dep[1]]
                    # Block all children in tail FE from being parent of any node in head FE
                    for tail in range(tail_fe_l, tail_fe_r + 1):
                        for head in range(head_fe_l, head_fe_r + 1):
                            blackList.append(str(tail) + " -> " + str(head))
                else:
                    # FE -> Word
                    # Block children from being parent of any node in the FE
                    (fe_l, fe_r) = feSolidCover[dep[0]]
                    for tail in range(fe_l, fe_r + 1):
                        blackList.append(
                            str(parseJ['tokens'].index(dep[1][2:-1]) + 1) + " -> " + str(tail))

            # Parents of FE Nodes
            for dep in [dep for dep in parseJ['deps']
                        if (dep[1][0:2] == 'FE' and dep[2] is None)]:
                if dep[1][0] == '$' or dep[0][0] == '$':
                    continue
                if dep[0][0:2] == 'FE':
                    # FE -> FE
                    pass  # We already did this above...
                else:
                    # Word -> FE
                    (fe_l, fe_r) = feSolidCover[dep[1]]
                    for fe in range(fe_l, fe_r + 1):
                        # Block fe-parent from being tail of any node in the FE
                        blackList.append(
                            str(fe) + " -> " + str(parseJ['tokens'].index(dep[0][2:-1]) + 1))
                        # Block fe from being tail of any node that isn't the annotated head
                        for head in range(0, sentenceLength + 1):
                            if (head < fe_l or head > fe_r) and str(head) != str(parseJ['tokens'].index(dep[0][2:-1]) + 1):
                                blackList.append(str(head) + " -> " + str(fe))

            # Clean up
            blackList = [dep for dep in blackList if dep not in whiteList]
            whiteLists.append(whiteList)
            blackLists.append(blackList)
        rf.close()
    elif filename[-5:] == "conll":
        raw = open(filename).read()
        sentences = raw.split("\n\n")
        for sentence in sentences:
            # Sanity checking for EOF/etc.
            if len(sentence) < 5:
                continue
            whiteList = list()
            blackList = list()
            for line in sentence.split("\n"):
                tokens = line.split("\t")
                if len(tokens) < 5:
                    continue
                childIndex = tokens[0]
                parentIndex = tokens[6]
                # The CoNLL fields are strings, so compare the head index numerically.
                if int(parentIndex) >= 0:
                    whiteList.append(parentIndex + " -> " + childIndex)
            whiteLists.append(whiteList)
            blackLists.append(blackList)
    else:
        print "Error: Bad File Extension"
    return whiteLists, blackLists
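# Example of the conll branch (a hypothetical two-token sentence in CoNLL-X
# layout, tab-separated, where field 0 is the token ID and field 6 its head):
#
#     1   Dogs   _   _   _   _   2   _   _   _
#     2   bark   _   _   _   _   0   _   _   _
#
# yields whiteList == ['2 -> 1', '0 -> 2'] and an empty blackList.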