def pmb2fol(pmb_dir, pd, sig=None, drawDRS=False):
    '''Turn the clausal form of one PMB document into a first-order logic formula

    The clauses are loaded with read_clf(pmb_dir, pd), validated with
    check_clf(clauses, sig), assembled into a DRS with boxes2drs and finally
    translated with drs.fol(). When drawDRS is set, the DRS is drawn first.

    Returns the formula, or None when no clauses were read, check_clf raised a
    RuntimeError, no DRS could be recovered, or the formula has free variables.
    '''
    debug("PMB document {}".format(pd))
    clauses = read_clf(pmb_dir, pd)
    if not clauses:
        return None

    # Validate the clauses and get the boxes together with their relations
    try:
        (box_dict, sub_rel, dir_subs,
         disc_rels, op_types, ops_fine) = check_clf(clauses, sig)
        if logging.DEBUG >= logging.root.level:
            for box_id in box_dict:
                pr_box(box_dict[box_id])
            debug("sub_rel: {}".format(pr_2rel(sub_rel)))
            debug("dir_sub: {}".format(pr_2rel(dir_subs)))
            debug("Disc rel: {}".format(disc_rels))
    except RuntimeError as error:
        warning("{} has error: {}".format(pd, error))
        return None

    # Build the DRS out of the boxes
    drs = boxes2drs(box_dict, sub_rel, disc_rels)
    if not drs:
        return None
    if drawDRS:
        drs.draw()

    # Translate the DRS; only a formula without free variables is returned
    formula = drs.fol()
    debug("FOL formula for {}:\n\t{}".format(pd, formula))
    if not formula.free():
        return formula
    warning("The FOL formula of {} has occurrences of free variables: {}".format(
        pd, formula.free()))
    return None
def is_well_formed_drs(drs, signature):
    '''Tell whether check_clf accepts the DRS under the given signature

    Returns False when check_clf raises a RuntimeError and True otherwise.
    '''
    try:
        check_clf(drs, signature, v=0)
    except RuntimeError:
        return False
    return True
def extensive_format_check(drs, pp_info):
    '''Do a more extensive semantic format check (referee)
       Also try to fix the DRSs, if specified

    drs is a list of clauses; each clause is converted to a tuple before it is
    handed to check_clf. pp_info supplies the signature, the switches fix,
    fix_disc and no_referee, the bookkeeping dict pp_dict, the index cur_idx
    of the current DRS and the baseline used for the dummy DRS.

    Returns drs itself when it is valid, a repaired DRS when one of the fixes
    works, drs again when it is invalid but pp_info.no_referee is set, and
    otherwise the result of default_drs(pp_info.baseline, list_output=True).
    '''
    fixed_drs = False
    err_cat = None
    try:
        _ = check_clf([tuple(c) for c in drs], pp_info.signature, v=0)
        return drs
    # DRS invalid, replace by dummy or try to fix
    except RuntimeError as err:
        err_message = str(err)
        # Try to fix subordinate loops by just merging/removing the offending box
        if pp_info.fix and 'Subordinate relation has a loop' in err_message:
            err_cat = "sub loop"
            box_num = err_message.split('||')[1].split('>')[0].strip()
            fixed_drs = solve_loops([tuple(c) for c in drs], box_num, pp_info)
        elif pp_info.fix_disc and "Boxes are not connected" in err_message:
            err_cat = "boxes disconnected"
            # Raw string: '\{' and '\}' are not valid string escape sequences
            boxes = re.findall(r'\{(.*?)\}', err_message)
            # Both box sets have to be present in the message, otherwise
            # there is nothing to reconnect and we fall through to the dummy
            if len(boxes) >= 2:
                fixed_drs = solve_non_connected(
                    drs,
                    boxes[0].replace(',', '').split(),
                    boxes[1].replace(',', '').split(),
                    pp_info.signature)

    # Only get here if DRS was invalid - if we don't have a fixed one, return dummy
    if fixed_drs:
        pp_info.pp_dict[err_cat].append(pp_info.cur_idx)
        return fixed_drs
    if pp_info.no_referee:
        # Don't want to do referee dummies, return initial DRS
        return drs
    pp_info.pp_dict["dummies-ref"].append(pp_info.cur_idx)
    return default_drs(pp_info.baseline, list_output=True)
def solve_loops(clf, box, pp_info):
    '''Recursively repair a DRS for which check_clf reported a subordinate loop

    First every other box is merged with the offending box; the first merge
    that check_clf accepts is returned. If no merge helps, the offending box
    is removed, the REF clauses are re-checked, and the function calls itself
    again when the result still contains a loop.

    Returns the repaired clause list, or False when clf is empty, removing the
    box changed nothing, or check_clf fails with a different error.
    '''
    # An empty DRS can not be repaired
    if not clf:
        return False

    # Attempt 1: merge the offending box with one of the other boxes
    for other in get_first_arg_boxes(clf):
        if other == box:
            continue
        merged = merge_boxes(clf, [other, box])
        try:
            check_clf(merged, pp_info.signature, v=0)
        except RuntimeError:
            continue
        # No error means the merged DRS is valid
        return merged

    # Attempt 2: drop the offending box, then insert/remove REFs where needed
    pruned = remove_by_first_arg_box(clf, box)
    pruned = [tuple(x) for x in check_ref_clauses(pruned, pp_info, do_print=False)]
    try:
        check_clf(pruned, pp_info.signature, v=0)
    except RuntimeError as err:
        message = str(err)
    else:
        return pruned

    # Any error other than another loop means this approach failed
    if 'Subordinate relation has a loop' not in message:
        return False
    next_box = message.split('||')[1].split('>')[0].strip()
    # Stop if removing the box had no effect, otherwise we would recurse forever
    if pruned == clf:
        return False
    return solve_loops(pruned, next_box, pp_info)
def solve_non_connected(drs, boxes1, boxes2, signature):
    '''Try to solve sets of unconnected boxes by changing a discourse variable
       to a disc var present in a different box

    For every REF clause whose box is in one of the two box sets, the clause
    is moved (via change_box_in_drs) to each box of the other set in turn.

    Returns the first candidate DRS that check_clf accepts, or False when no
    candidate is valid. Only RuntimeError counts as "invalid"; any other
    exception is propagated.
    '''
    for idx, clause in enumerate(drs):
        if clause[1] != "REF":
            continue
        # Introduce the variable in one of the boxes of the *other* set
        if clause[0] in boxes1:
            candidates = boxes2
        elif clause[0] in boxes2:
            candidates = boxes1
        else:
            continue
        for box in candidates:
            fixed_drs = change_box_in_drs(drs, idx, box)
            try:
                check_clf([tuple(c) for c in fixed_drs], signature, v=0)
            except RuntimeError:
                continue
            return fixed_drs
    # If we get here nothing worked, the caller falls back to a dummy
    return False
def remove_ill_formed_drss(drss, signature_file):
    '''Split a list of DRS strings into the valid ones and the indices of the rest

    Each DRS is converted with drs_string_to_list and checked with check_clf
    against the signature loaded from signature_file.

    Returns (kept, dropped): the DRSs that passed, in their original order,
    and the positions in drss of those that raised a RuntimeError.
    '''
    signature = get_signature(signature_file)
    kept, dropped = [], []
    for position, drs in enumerate(drss):
        clauses = drs_string_to_list(drs)
        try:
            check_clf([tuple(c) for c in clauses], signature, v=0)
        except RuntimeError:
            # Invalid DRS: only remember where it was
            dropped.append(position)
        else:
            kept.append(drs)
    return kept, dropped
def clf2graph(clf, alignment, signature=None, pars=None):
    '''Convert a CLF and alignments into a DRG graph

    clf and signature are passed to clfref.check_clf; alignment is passed to
    clause_alignment together with the clauses and their types. pars is a
    dict of options: it is forwarded to box2graph and its 'bm' entry is handed
    to remove_recoverable_edges, so a KeyError is raised when 'bm' is missing.

    Returns a triple (nodes, edges, top_nodes): the de-duplicated nodes with
    their 'type' feature removed, the cleaned edges, and the node ids of the
    top boxes.
    '''
    # Use a fresh dict per call; a mutable default would be shared by all calls
    if pars is None:
        pars = {}
    # parse clf and check on correctness
    (box_dict, top_boxes, disc_rels, presupp_rels, cl_types, arg_typing) = \
        clfref.check_clf(clf, signature)
    assert len(clf) == len(cl_types), '#clauses == #clause_types'
    # map clauses to alignments
    cl2al = clause_alignment(clf, cl_types, alignment)
    # convert constants to nodes and get a mapping from terms to DIs
    nodes, nid = process_vars_constants(arg_typing)
    next_id = len(nid)  # keep track of these
    edges = []
    # convert boxes into graph components
    for b, box in sorted(box_dict.items()):
        next_id = box2graph(box, nid, nodes, edges, next_id, arg_typing,
                            cl2al, pars=pars)
    # add discourse relations
    for (r, b1, b2) in sorted(disc_rels):
        add_edges(edges, [(nid[b1], nid[b2], r)], [cl2al[(r, b1, b2)]])
    # add presupposition relations
    for (b1, b2) in sorted(presupp_rels):
        add_edges(edges, [(nid[b1], nid[b2], 'PRESUPPOSITION')],
                  [cl2al[(b1, b2)]])
    # remove duplicate nodes but keep the order
    ord_set_nodes = sanity_check_nodes(nodes)
    if len(ord_set_nodes) != len(nodes):
        debug("After cleaning {} nodes remains {}".format(
            len(nodes), len(ord_set_nodes)))
    edges = clean_set(edges)
    connectivity_check(ord_set_nodes, edges)
    remove_recoverable_edges(ord_set_nodes, edges, pars['bm'])
    # remove type feature from nodes, not needed anymore
    for nd in ord_set_nodes:
        del nd['type']
    debug("edges ({}); nodes ({})".format(len(edges), len(ord_set_nodes)))
    return ord_set_nodes, edges, [nid[b] for b in top_boxes]
def extensive_format_check(drss_fixed, sig_file):
    '''Do a more extensive semantic format check (referee)

    drss_fixed is a list of DRSs, each a list of clause strings; every clause
    is split on whitespace before it is passed to check_clf with the
    signature loaded from sig_file. A DRS that raises a RuntimeError is
    replaced by the (space-joined) clauses of dummy_drs().

    Returns (drss_final, print_str, error_total): the checked/replaced DRSs,
    a summary string with the number and percentage of replaced DRSs followed
    by the error frequencies, and the number of replaced DRSs.
    '''
    drss_final = []
    signature = get_signature(sig_file)
    error_counter = Counter()
    for clf in drss_fixed:
        try:
            check_clf([tuple(c.split()) for c in clf], signature, v=1)
        except RuntimeError as err:
            # DRS invalid, replace by dummy.
            # Exceptions are not subscriptable in Python 3, so read the
            # message from .args instead of indexing the exception itself
            message = str(err.args[0]) if err.args else str(err)
            error_counter.update([message])
            drss_final.append([" ".join(x) for x in dummy_drs()])
        else:
            drss_final.append(clf)
    error_total = sum(error_counter.values())
    # Guard the percentage against an empty input list
    num_drss = len(drss_fixed)
    percentage = error_total * 100 / float(num_drss) if num_drss else 0.0
    print_str = "#wrong = {} ({:.2f}%)".format(error_total, percentage)
    print_str += ''.join('{} {} '.format(count, message)
                         for (message, count) in error_counter.most_common())
    return drss_final, print_str, error_total
def get_clauses(file_name, signature, ill_type):
    '''Function that returns a list of DRSs (that consists of clauses)

    The file is read as blocks of clause lines separated by empty lines.
    Lines starting with % are skipped and a trailing " %..." comment is cut
    off each clause. Every DRS is validated with check_clf; what happens to an
    ill-formed DRS depends on ill_type:
      'error' - raise a ValueError
      'dummy' - replace it by dummy_drs()
      'spar'  - replace it by spar_drs()
      'score' - keep it anyway
    For any other value the ill-formed DRS is not added at all.

    Returns (clauses, originals): the DRSs as lists of token lists and the
    matching original lines. Unless the module-level args.include_ref is set,
    both are first passed through remove_refs.
    '''
    clause_list, original_clauses = [], []

    def store_drs(clauses, orig):
        '''Validate one DRS and append it, or its ill_type replacement'''
        try:
            check_clf([tuple(c) for c in clauses], signature, v=False)
        except Exception as e:
            # Any failure of the check counts as ill-formed
            if ill_type == 'error':
                raise ValueError(e)
            elif ill_type == 'dummy':
                # FIXME (kept from the original): "uncomment" - intent unclear
                print('WARNING: DRS {0} is ill-formed and replaced by a dummy DRS'
                      .format(len(clause_list) + 1))
                clause_list.append(dummy_drs())
                original_clauses.append([" ".join(x) for x in dummy_drs()])
            elif ill_type == 'spar':
                print('WARNING: DRS {0} is ill-formed and replaced by the SPAR DRS'
                      .format(len(clause_list) + 1))
                clause_list.append(spar_drs())
                original_clauses.append([" ".join(x) for x in spar_drs()])
            elif ill_type == 'score':
                print('WARNING: DRS {0} is ill-formed, but try to give a score anyway - might still error later'
                      .format(len(clause_list) + 1))
                clause_list.append(clauses)
                original_clauses.append(orig)
        else:
            clause_list.append(clauses)
            original_clauses.append(orig)

    with open(file_name, 'r') as in_f:
        input_lines = in_f.read().split('\n')

    cur_clauses, cur_orig = [], []
    for line in input_lines:
        stripped = line.strip()
        if stripped.startswith('%'):
            continue  # skip comments
        if not stripped:
            # newline, so DRS is finished. Ignore double/clause newlines
            if cur_clauses:
                store_drs(cur_clauses, cur_orig)
            cur_clauses, cur_orig = [], []
        else:
            # remove comments
            cur_clauses.append(line.split(' %', 1)[0].strip().split())
            cur_orig.append(line)

    # No newline at the end: the last DRS is still pending. It goes through
    # the same validation as all others instead of being added unchecked.
    if cur_clauses:
        store_drs(cur_clauses, cur_orig)

    # Invert -of relations and reorder inv_boxes if they contain a constant between quotes
    inv_boxes = DRS(signature).inv_boxes
    for drs in clause_list:
        for clause in drs:
            if len(clause) != 4:
                # Both rewrites only apply to four-item clauses; checking the
                # length first also avoids indexing into too short clauses
                continue
            if is_role(clause[1]) and clause[1].endswith('Of') and len(clause[1]) > 2:
                # Switch clauses and remove the -Of
                clause[2], clause[3] = clause[3], clause[2]
                clause[1] = clause[1][:-2]
            elif (clause[1] in inv_boxes and between_quotes(clause[2])
                  and not between_quotes(clause[3])):
                # b1 NEQ x1 x2 is equal to b1 NEQ x2 x1
                # If one of the two arguments is between quotes, rewrite them in such a way
                # that it can always match
                # For example rewrite b1 NEQ "speaker" x1 to b1 NEQ x1 "speaker"
                # If there are two variables or two items between quotes, do nothing
                clause[2], clause[3] = clause[3], clause[2]

    # If we want to include REF clauses we are done now
    if args.include_ref:
        return clause_list, original_clauses
    # else remove redundant REF clauses
    final_clauses, final_original = remove_refs(clause_list, original_clauses)
    return final_clauses, final_original
signature = get_signature(args.sig_file, v=args.v) # define counters trg_err_counter = Counter() src_err_counter = Counter() # contrast CLFs sen_ids = [] for sid in trg_clf_dict: # read raw and CLFs (raw, trg_clf) = trg_clf_dict[sid] #pr_clf(trg_clf, pr=True, inline=False) (src_raw, src_clf) = src_clf_dict[sid] #print raw, src_raw #assert raw == src_raw or src_raw is None # check validity of Gold CLF. If it is invalid, report and go to next CLF try: check_clf(trg_clf, signature, v=args.v) except RuntimeError as e: trg_err_counter.update([e[0]]) print '!nvGold [{}] "{}":\tThe gold CLF is invalid'.format( sid, raw) continue # check validity of Source CLF try: check_clf(src_clf, signature, v=args.v) src_invalid = '' except RuntimeError as e: src_err_counter.update([e[0]]) #print '!nvSyst [{}] "{}":\tThe system produced CLF is invalid'.format(sid, raw) src_invalid = '!!!Invalid CLF ' # detect which filter to apply dnf = dnf_ops if dnf_ops else dnf_tks
info("{} mrps read".format(len(mrps))) # converting mrps into clfs one-by-one error_counter = Counter() drg_count = 0 clfs_info_list, meta_list, invalids = [], [], [] for mrp in mrps: if mrp['framework'] != 'drg' \ or args.ids and mrp['id'] not in args.ids: continue meta_list.append((mrp['id'], mrp['input'])) drg_count += 1 try: clf = mrp2clf(mrp, fix=['edge_lab']) # some graphs need this # if signature is if args.validate: clfref.check_clf(clf, sig) clfs_info_list.append(clf) except: if args.throw_error: raise err_message = repr(sys.exc_info()[1]) if not args.quiet: error("{}: {}".format(mrp['id'], err_message)) error_counter.update([re.sub('\d+', 'NUM', err_message)]) invalids.append(mrp['id']) clfs_info_list.append({ 'b REF x': ('b', 'REF', 'x'), 'b nevermatching "n.01" x': ('b', 'LEX', 'x') }) write_clfs(clfs_info_list, meta_list, filename=args.clf) if error_counter and not args.quiet: print("Frequencies of erros") for err, c in error_counter.most_common():