# Shared imports for the functions below, inferred from usage. The ucca modules
# are part of the ucca package; get_passages and find_refined are assumed to
# come from the project's own usnacs module, which the original code references.
import json
import sys
import time
from collections import Counter

from ucca import constructions as uconstr
from ucca import convert as uconv
from ucca import layer0 as ul0
from ucca import layer1 as ul1
from ucca.convert import split_passage
from ucca.textutil import extract_terminals

from usnacs import find_refined, get_passages


def split(self, passage):
    ends = []
    ids = []
    tokens = []
    for terminal in extract_terminals(passage):
        tokens.append(terminal.text)
        sentence = " ".join(tokens)
        # if len(tokens) > max(map(len, map(str.split, sentence_to_index))):
        #     raise ValueError("Failed matching '%s'" % sentence)
        if self.index is not None and self.index < len(self.sentences) and \
                self.sentences[self.index].startswith(sentence):
            # Try matching the next expected sentence rather than the shortest match
            index = self.index if self.sentences[self.index] == sentence else None
        else:
            index = self.index = self.sentence_to_index.get(sentence)
        if index is not None:
            self.matched_indices.add(index)
            ends.append(terminal.position)
            ids.append(str(index))
            tokens = []
            self.index += 1
    return split_passage(passage, ends, ids=ids if self.enumerate else None,
                         suffix_format=self.suffix_format, suffix_start=self.suffix_start)
def split(self, passage):
    ends = []
    ids = []
    token_lists = []
    for terminal in extract_terminals(passage):
        token_lists.append([])
        # Grow every candidate token list (or only the first, if locked onto an
        # expected next sentence) until one of them matches a known sentence.
        for terminals in token_lists if self.index is None else [token_lists[0]]:
            terminals.append(terminal)
            sentence = " ".join(t.text for t in terminals)
            if self.index is not None and self.index < len(self.sentences) and \
                    self.sentences[self.index].startswith(sentence):
                # Try matching the next expected sentence rather than the shortest match
                index = self.index if self.sentences[self.index] == sentence else None
            else:
                indices = self.sentence_to_index.get(sentence)
                index = self.index = indices.pop(0) if indices else None
            if index is not None:
                self.matched_indices.add(index)
                last_end = terminals[0].position - 1
                if len(terminals) > 1 and last_end and last_end not in ends:
                    ends.append(last_end)
                ends.append(terminal.position)
                ids.append(str(index))
                token_lists = []
                self.index += 1
                break
    return split_passage(passage, ends, ids=ids if self.enumerate else None,
                         suffix_format=self.suffix_format, suffix_start=self.suffix_start)
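# The two split() methods above read their state off `self`. Below is a minimal
# sketch of the holder class they assume: the attribute names are taken from
# the method bodies, but the constructor itself is an assumption, not the
# project's actual code. It matches the second variant, whose sentence_to_index
# maps each sentence to a *list* of indices (it calls indices.pop(0)) so that
# duplicate sentences can each be matched once; the first variant instead
# stores a single index per sentence.
class Splitter:
    def __init__(self, sentences, enumerate_ids=False, suffix_format="%03d", suffix_start=0):
        self.sentences = sentences
        self.sentence_to_index = {}
        for i, sentence in enumerate(sentences):
            self.sentence_to_index.setdefault(sentence, []).append(i)
        self.matched_indices = set()
        self.index = None  # position of the next expected sentence, if known
        self.enumerate = enumerate_ids
        self.suffix_format = suffix_format
        self.suffix_start = suffix_start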
def split(passage, order):
    ends = []
    ids = []
    sentence = []
    for terminal in extract_terminals(passage):
        sentence.append(terminal.text)
        # if len(sentence) > max(map(len, map(str.split, order))):
        #     raise ValueError("Failed matching '%s'" % " ".join(sentence))
        index = order.get(" ".join(sentence))
        if index is not None:
            ends.append(terminal.position)
            ids.append(str(index))
            sentence = []
    return split_passage(passage, ends, ids=ids)
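# A minimal usage sketch for the standalone split() above, assuming `order`
# maps each space-joined sentence to its index. The file paths and the
# one-sentence-per-line input format are hypothetical placeholders.
from ucca import convert as uconv

with open('sentences.txt') as f:  # hypothetical: one sentence per line
    order = {line.strip(): i for i, line in enumerate(f)}
passage = uconv.file2passage('passage.xml')  # hypothetical input passage
for sub_passage in split(passage, order):
    uconv.passage2file(sub_passage, sub_passage.ID + '.xml')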
def main(args):
    streusle_file = args[0]
    ucca_path = args[1]
    outpath = args[2]

    for doc, passage, term2tok in get_passages(streusle_file, ucca_path, annotate=True, target='prep'):
        # Materialize the IDs as a list: they are consumed both by
        # split_passage and by the zip below (a one-shot map iterator
        # would be exhausted by the first use).
        sent_ids = [''.join(x['sent_id'].split('-')[-2:]) for x in doc['sents']]
        sent_passage = zip(sent_ids, uconv.split_passage(passage, doc['ends'], sent_ids))
        for sent, psg in sent_passage:
            p = uconv.join_passages([psg])
            l0 = p.layer(ul0.LAYER_ID)
            l1 = p.layer(ul1.LAYER_ID)
            for pos, terminal in l0.pairs:
                # Only consider terminals annotated with an adposition supersense ('p...')
                if 'ss' not in terminal.extra or not isinstance(terminal.extra['ss'], str) \
                        or terminal.extra['ss'][0] != 'p':
                    continue
                unit = doc["exprs"][tuple(map(int, terminal.extra["toknums"].split()))]
                ID = f'{doc["id"]}_{unit["sent_offs"]}_{unit["local_toknums"][0]}-{unit["local_toknums"][-1]}'
                refined, error = find_refined(terminal, dict(l0.pairs), local=True)
                for _, term in p.layer(ul0.LAYER_ID).pairs:
                    _pt = term.incoming[0].parent
                    toks = [t.text for t in _pt.get_terminals()]
                    term.extra['lexlemma'] = ' '.join(toks)
                    term.extra['lexcat'] = _pt.ftag
                    term.extra['is_part_of_mwe'] = len(toks) > 1
                    term.extra['identified_for_pss'] = str(term.ID == terminal.ID)
                edges = [c.edge for cs in uconstr.extract_candidates(p).values() for c in cs]
                for edge in edges:
                    edge.categories = []
                    edge.add(str(edge in refined))
                uconv.passage2file(p, f'{outpath}/{ID}.xml')
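# The sentence-ID mangling used in main() above, in isolation: it keeps the
# last two dash-separated fields of the STREUSLE sent_id and concatenates
# them. (The example ID is illustrative, not taken from the data.)
sent_id = 'reviews-086839-0003'
assert ''.join(sent_id.split('-')[-2:]) == '0868390003'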
def main(args):
    try:
        integrate_full = True
        integrate_term = False
        concatenate = False
        pss_feature = False
        annotate = True
        target_obj = False
        v2_only = True
        draw = False
        output = True
        inp_ucca = False
        if '-I' in args:
            args.remove('-I')
            args.append('--no-integrate')
        if '--no-integrate' in args:
            integrate_full = False
            args.remove('--no-integrate')
        if '-c' in args:
            args.remove('-c')
            args.append('--concatenate')
        if '--concatenate' in args:
            concatenate = True
            args.remove('--concatenate')
        if '-A' in args:
            args.remove('-A')
            args.append('--no-annotate')
        if '--no-annotate' in args:
            integrate_full = False
            annotate = False
            args.remove('--no-annotate')
        if '-s' in args:
            args.remove('-s')
            args.append('--pss-feature')
        if '--pss-feature' in args:
            pss_feature = True
            args.remove('--pss-feature')
        if '--term' in args:
            integrate_term = True
            integrate_full = False
            args.remove('--term')
        if '--inp_ucca' in args:
            inp_ucca = True
            args.remove('--inp_ucca')
        if '-o' in args:
            args.remove('-o')
            args.append('--object')
        if '--object' in args:
            target_obj = True
            args.remove('--object')
        if '-n' in args:
            args.remove('-n')
            args.append('--no-output')
        if '--no-output' in args:
            output = False
            args.remove('--no-output')
        if '--all' in args:
            v2_only = False
            args.remove('--all')
        if '--draw' in args:
            draw = True
            args.remove('--draw')
        import visualization as uviz
        import matplotlib.pyplot as plt
        streusle_file = args[0]  # e.g. '../../streusle/streusle.govobj.json'
        ucca_path = args[1]  # e.g. '../../UCCA_English-EWT'
        out_dir = args[2]
    except IndexError:
        print(f'usage: python3 {sys.argv[0]} STREUSLE_JSON UCCA_PATH OUT_DIR', file=sys.stderr)
        exit(1)

    with open(streusle_file) as f:
        streusle = json.load(f)

    print()

    global_error = Counter()
    unit_counter = 0
    successful_units = 0
    unsuccessful_units = 0
    deductible_multiple_successes = 0
    deductible_multiple_fails = 0
    deductible_fail_and_success = 0
    units_with_remote = 0
    doc_error = 0
    primary_edges = 0
    remote_edges = 0
    _doc_id = None

    v2_docids = set()
    if v2_only:
        with open(ucca_path + '/v2.txt') as f:
            for line in f:
                v2_docids.add(line.strip())

    ignore = []
    # ignore = """020851 020992 059005 059416 200957 210066
    #             211797 216456 217359 360937 399348""".split()

    unit_times = []
    tag_refinements = Counter()

    for doc, passage, term2tok in get_passages(streusle_file, ucca_path,
                                               annotate=(integrate_term or integrate_full or annotate),
                                               target='obj' if target_obj else 'prep',
                                               ignore=ignore, docids=v2_docids):

        if output and (not integrate_full and not integrate_term):
            for p in uconv.split_passage(passage, doc['ends'],
                                         [''.join(x['sent_id'].split('-')[-2:]) for x in doc['sents']]):
                uconv.passage2file(p, out_dir + '/' + p.ID + '.xml')
            continue

        l1 = passage.layer('1')

        if not output:
            primary_edges += len(uconstr.extract_candidates(
                passage, constructions=(uconstr.PRIMARY,))['primary'])
            remote_edges += len(uconstr.extract_candidates(
                passage, constructions=uconstr.get_by_names(['remote']))['remote'])

        for terminal in passage.layer('0').words:

            if integrate_term and concatenate:
                # Detach the terminal from its preterminal and re-attach it
                # under a new intermediate "Preterminal" node, preserving the
                # categories of the old terminal edge.
                old_term_edge = terminal.incoming[0]
                preterminal = old_term_edge.parent
                preterminal._outgoing.remove(old_term_edge)
                terminal._incoming.remove(old_term_edge)
                passage._remove_edge(old_term_edge)
                new_preterminal = l1.add_fnode(preterminal, 'Preterminal')
                new_preterminal.add_multiple(
                    [[c.tag, '', c.layer, ''] for c in old_term_edge.categories], terminal)

            pss_label = ''
            if 'ss' in terminal.extra:
                pss_label = terminal.extra['ss']
            if not pss_label.startswith('p'):
                continue

            start_time = time.time()
            unit_counter += 1

            if integrate_term:
                if concatenate:
                    refined = terminal.incoming[0].parent.incoming
                else:
                    refined = terminal.incoming
            else:
                refined, error = find_refined(terminal, dict(passage.layer(ul0.LAYER_ID).pairs))

                global_error += Counter({k: v for k, v in error.items() if isinstance(v, int)})

                if error['successes_for_unit'] >= 1:
                    successful_units += 1
                    deductible_multiple_successes += error['successes_for_unit'] - 1
                    if error['fails_for_unit'] >= 1:
                        deductible_fail_and_success += 1
                else:
                    unsuccessful_units += 1
                if error['fails_for_unit'] >= 1:
                    deductible_multiple_fails += error['fails_for_unit'] - 1

                if error['remotes'] >= 1:
                    units_with_remote += 1

                if not output:
                    if 'larger_UNA_warn' in error['failed_heuristics']:
                        print(terminal, terminal.incoming[0].parent)
                    if 'PP_idiom_not_UNA' in error['failed_heuristics']:
                        print('PP_idiom:', terminal.extra['lexlemma'], terminal,
                              terminal.incoming[0].parent)
                    if 'MWP_not_UNA' in error['failed_heuristics']:
                        print('MWP:', terminal.extra['lexlemma'], terminal,
                              terminal.incoming[0].parent)

            for r in refined:
                # TODO: deal with doubly refined edges
                if (not concatenate and r.refinement) or (concatenate and ':' in r.tag):
                    pass
                else:
                    if concatenate:
                        cats, r.categories = r.categories, []
                        for c in cats:
                            composit_tag = f'{c.tag}:{pss_label}'
                            r.add(composit_tag)
                            tag_refinements[composit_tag] += 1
                    else:
                        r.refinement = pss_label

            unit_times.append(time.time() - start_time)

            if not pss_feature:
                terminal.extra.pop('ss')  # ensure PSS is not also a feature

        if draw:
            for sent, psg in zip(doc['sents'], uconv.split_passage(passage, doc['ends'])):
                uviz.draw(psg)
                plt.savefig(f'../graphs/{sent["sent_id"]}.svg')
                plt.clf()

        if output:
            for p in uconv.split_passage(passage, doc['ends'],
                                         [''.join(x['sent_id'].split('-')[-2:]) for x in doc['sents']]):
                uconv.passage2file(p, out_dir + '/' + p.ID + '.xml')

    for x, y in tag_refinements.most_common():
        print(x, y, sep='\t')

    if integrate_full and not output:
        print('\n\n')
        print(f'total units\t{unit_counter}')
        print(f'document error\t{doc_error}\t{100*doc_error/unit_counter}%')
        print(f'document success\t{unit_counter - doc_error}\t{100-(100*doc_error/unit_counter)}%')
        print(f'total primary edges\t{primary_edges}')
        print(f'total remote edges\t{remote_edges}')
        print('----------------------------------------------------')
        print(f'successful units\t{successful_units}\t{100*successful_units/(unit_counter-doc_error)}%')
        print(f'unsuccessful units\t{unsuccessful_units}\t{100-(100*successful_units/(unit_counter-doc_error))}%')
        print(f'warnings\t{global_error["warnings"]}')
        print('---------------------------------')
        print(f'syntactic and semantic obj match\t{global_error["synt_sem_obj_match"]}')
        print('---------------------------------')
        print(f'\tMWE but not UNA\t{global_error["mwe_una_fail"]}')
        print(f'\tPP idiom\t{global_error["idiom"]}')
        print(f'\tR, N, F ({global_error["abgh"]}) but A and B miss\t{global_error["abgh_fail"]}')
        print(f'\tA (scene mod)\t{global_error["a"]}')
        print(f'\tB (non-scene mod)\t{global_error["b"]}')
        print(f'\tG (inh purpose)\t{global_error["g"]}')
        print(f'\t scn \t{global_error["g_scn_mod"]}')
        print(f'\t non scn \t{global_error["g"] - global_error["g_scn_mod"]}')
        print(f'\tH (approximator)\t{global_error["h"]}')
        print(f'\t scn \t{global_error["h_scn_mod"]}')
        print(f'\t non scn \t{global_error["h"] - global_error["h_scn_mod"]}')
        print(f'\tP, S ({global_error["c"]}) but C miss\t{global_error["c_fail"]}')
        print(f'\tL ({global_error["d"]}) but D miss\t{global_error["d_fail"]}')
        print(f'\tA, D, E, T ({global_error["ef"]}) but E miss\t{global_error["ef_fail"]}')
        print(f'\tE (intr adp)\t{global_error["e"]}')
        print(f'\t scn \t{global_error["e_scn_mod"]}')
        print(f'\t non scn \t{global_error["e"] - global_error["e_scn_mod"]}')
        print(f'\tF (poss pron)\t{global_error["f"]}')
        print(f'\t scn \t{global_error["f_scn_mod"]}')
        print(f'\t non scn \t{global_error["f"] - global_error["f_scn_mod"]}')
        print(f'\tno match\t{global_error["no_match"]}')
        print(f'\tnon-semantic role\t{global_error["non_semrole"]}')
        print(f'\tmultiple preterminals\t{global_error["multiple_preterminals"]}')
        print(f'\tunits with remote\t{units_with_remote} (total {global_error["remotes"]})')
        print('---------------------------------')
        print(f'\tdeductible (multiple successes for single unit)\t{deductible_multiple_successes}')
        print(f'\tdeductible (multiple fails for single unit)\t{deductible_multiple_fails}')
        print(f'\tdeductible (fail and success for single unit)\t{deductible_fail_and_success}')
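# A plausible command-line entry point for the second main() above (an
# assumption; the original driver is not shown). Its usage message implies
# three positional arguments plus the optional flags it strips from args.
if __name__ == '__main__':
    main(sys.argv[1:])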