def build_template(cls, cat, final_atom=None, construct_empty=False):
    """Build a template.

    Args:
        cat: A Category instance or a category signature string.
        final_atom: Optional final atom category for functor template.
        construct_empty: If true the functor should be constructed with
            an empty DrsProduction as the final atom.

    Returns:
        A tuple of key string and a FunctorTemplate instance.

    Raises:
        TypeError: If cat or final_atom is neither a signature string
            nor a Category.

    Remarks:
        Used to load templates from a file.
    """
    def _as_category(value):
        # Accept either a signature string or an existing Category.
        if isinstance(value, (str, unicode)):
            return Category(value)
        if isinstance(value, Category):
            return value
        raise TypeError(
            'Model.build_template() expects signature or Category')

    cat = _as_category(cat)
    if final_atom:
        final_atom = _as_category(final_atom)
    return cat.clean(True), FunctorTemplate.create_from_category(
        cat, final_atom, construct_empty=construct_empty)
def build_unary_rule(result, argument):
    """Build a unary rule.

    Args:
        result: The result Category.
        argument: The argument category.

    Returns:
        A tuple of key string and a UnaryRule instance.

    Raises:
        TypeError: If result or argument is neither a signature string
            nor a Category.

    Remarks:
        Used to load unary rules from a file.
    """
    def _coerce(value, errmsg):
        # Signature strings are promoted to Category; anything else that
        # is not already a Category is rejected.
        if isinstance(value, (str, unicode)):
            return Category(value)
        if isinstance(value, Category):
            return value
        raise TypeError(errmsg)

    result = _coerce(
        result,
        'Model.build_unary_rule() expects signature or Category result')
    argument = _coerce(
        argument,
        'Model.build_unary_rule() expects signature or Category argument')
    rule = UnaryRule(result, argument)
    return rule.getkey(), rule
def build_from_model(fn_dict, outdir, modelPath, verbose=False, verify=True):
    """Build functor templates from an EasySRL model folder.

    Args:
        fn_dict: Dictionary mapping clean category signatures to
            FunctorTemplate instances. Updated in place.
        outdir: Output directory for the failed-rules report file.
        modelPath: Path of the model folder containing a 'markedup' file.
        verbose: If True, print each failed rule to stdout.
        verify: If True, sanity check each created template.

    Returns:
        The updated fn_dict.
    """
    print('Building function templates from model folder...')
    fname = os.path.join(modelPath, 'markedup')
    if not os.path.exists(fname) or not os.path.isfile(fname):
        print('Error: %s does not exist or is not a file' % fname)
        # BUG FIX: previously execution fell through to open() and crashed
        # with IOError after printing the error - bail out cleanly instead.
        return fn_dict
    with open(fname, 'r') as fd:
        signatures = fd.readlines()
    failed_rules = []
    progress = 0
    for sig in signatures:
        predarg = Category(sig.strip())
        progress = print_progress(progress, 1000)
        try:
            catkey = predarg.clean(True)
            template = FunctorTemplate.create_from_category(predarg)
            # None means this category needs no template (e.g. an atom).
            if template is None:
                continue
            if verify:
                # Unify scopes of an empty functor must pair off with the
                # atoms extracted from its category, and the functor's
                # category must unify with the cleaned predarg category.
                f = template.create_empty_functor()
                U1 = f.get_unify_scopes(False)
                U2 = f.category.extract_unify_atoms(False)
                assert len(U1) == len(U2), 'unify scope/atom count mismatch'
                C1 = f.category
                C2 = template.predarg_category.clean(True)
                assert C1.can_unify(C2), 'functor category cannot unify'
            if catkey.signature not in fn_dict:
                fn_dict[catkey.signature] = template
            elif verify:
                # Duplicate signature - templates must agree.
                f1 = fn_dict[catkey.signature]
                t1 = str(f1)
                t2 = str(template)
                assert t1 == t2, 'verify failed\n t1=%s\n t2=%s\n f1=%s\n f2=%s' % (
                    t1, t2, f1.predarg_category, predarg)
        except Exception as e:
            # Record the failure and keep processing remaining signatures.
            failed_rules.append(safe_utf8_encode('%s: %s' % (predarg, e)))
    print_progress(progress, done=True)
    if len(failed_rules) != 0:
        print('Warning: model - %d rules failed' % len(failed_rules))
        with open(os.path.join(outdir,
                               'functor_easysrl_templates_failed.dat'),
                  'w') as fd:
            fd.write(b'\n'.join(failed_rules))
        if verbose:
            for m in failed_rules:
                print(m)
    return fn_dict
def test8_Wsj0004_3(self):
    # WSJ 0004.3 CCGBANK derivation. The interesting part is the
    # conjunction of an NP with an S[em] clause: the S[em] must be
    # type-changed to NP[conj] (RL_TC_ATOM) before RCONJ applies.
    txt = r'''
(<T S[dcl] 0 2> (<T S[dcl] 1 2> (<T NP 0 1> (<T N 1 2> (<L N/N NN NN Compound N_309/N_309>)
(<L N NNS NNS yields N>) ) ) (<T S[dcl]\NP 0 2> (<L (S[dcl]\NP)/NP VBP VBP assume (S[dcl]\NP_236)/NP_237>)
(<T NP 0 2> (<T NP 0 2> (<T NP 0 1> (<L N NN NN reinvestment N>) ) (<T NP\NP 0 2>
(<L (NP\NP)/NP IN IN of (NP_248\NP_248)/NP_249>) (<T NP 0 1> (<L N NNS NNS dividends N>) ) ) )
(<T NP[conj] 1 2> (<L conj CC CC and conj>) (<T S[em] 0 2>
(<L S[em]/S[dcl] IN IN that S[em]/S[dcl]_257>) (<T S[dcl] 1 2> (<T NP 1 2>
(<L NP[nb]/N DT DT the NP[nb]_297/N_297>) (<T N 1 2> (<L N/N JJ JJ current N_292/N_292>)
(<L N NN NN yield N>) ) ) (<T S[dcl]\NP 0 2> (<L S[dcl]\NP VBZ VBZ continues S[dcl]\NP_262>)
(<T (S\NP)\(S\NP) 0 2> (<L ((S\NP)\(S\NP))/NP IN IN for ((S_275\NP_270)_275\(S_275\NP_270)_275)/NP_276>)
(<T NP 1 2> (<L NP[nb]/N DT DT a NP[nb]_283/N_283>) (<L N NN NN year N>) ) ) ) ) ) ) ) ) )
(<L . . . . .>) )
'''
    pt = parse_ccg_derivation(txt)
    ccg = Ccg2Drs()
    # conj + S[em] producing NP[conj] must select the atomic type-change rule.
    rule = get_rule(Category.from_cache('conj'), Category.from_cache('S[em]'),
                    Category.from_cache('NP[conj]'))
    self.assertEqual(rule, RL_TC_ATOM)
    ccg.build_execution_sequence(pt)
    # Check execution queue
    actual = [repr(x) for x in ccg.exeque]
    expected = [
        '<PushOp>:(compound, N/N, NN)',
        '<PushOp>:(yields, N, NNS)',
        '<ExecOp>:(2, FA N)',
        '<ExecOp>:(1, LP NP)',
        '<PushOp>:(assume, (S[dcl]\\NP)/NP, VBP)',
        '<PushOp>:(reinvestment, N, NN)',
        '<ExecOp>:(1, LP NP)',
        '<PushOp>:(of, (NP\\NP)/NP, IN)',
        '<PushOp>:(dividends, N, NNS)',
        '<ExecOp>:(1, LP NP)',
        '<ExecOp>:(2, FA NP\\NP)',
        '<ExecOp>:(2, BA NP)',
        '<PushOp>:(and, conj, CC)',
        '<PushOp>:(that, S[em]/S[dcl], IN)',
        '<PushOp>:(the, NP[nb]/N, DT)',
        '<PushOp>:(current, N/N, JJ)',
        '<PushOp>:(yield, N, NN)',
        '<ExecOp>:(2, FA N)',
        '<ExecOp>:(2, FA NP)',
        '<PushOp>:(continue, S[dcl]\\NP, VBZ)',
        '<PushOp>:(for, ((S\\NP)\\(S\\NP))/NP, IN)',
        '<PushOp>:(a, NP[nb]/N, DT)',
        '<PushOp>:(year, N, NN)',
        '<ExecOp>:(2, FA NP)',
        '<ExecOp>:(2, FA (S\\NP)\\(S\\NP))',
        '<ExecOp>:(2, BA S[dcl]\\NP)',
        '<ExecOp>:(2, BA S[dcl])',
        '<ExecOp>:(2, FA S[em])',
        '<ExecOp>:(2, ATOM_TC NP[conj])',
        '<ExecOp>:(2, RCONJ NP)',
        '<ExecOp>:(2, FA S[dcl]\\NP)',
        '<ExecOp>:(2, BA S[dcl])',
        '<PushOp>:(., ., .)',
        '<ExecOp>:(2, LP S[dcl])',
    ]
    self.assertListEqual(expected, actual)
def extract_predarg_categories_from_pt(pt, lst=None):
    """Extract the predicate-argument categories from a CCG parse tree.

    Args:
        pt: The parse tree returned from marbles.ie.ccg.parse_ccg_derivation2().
        lst: An optional list of existing predicate categories.

    Returns:
        A list of Category instances (lst extended in place when given).
    """
    # FIX: removed dead `global _PredArgIdx` - the name was never read or
    # assigned in this function.
    if future_string != unicode:
        pt = pt_to_utf8(pt)
    if lst is None:
        lst = []
    # Iterative depth-first traversal of the parse tree.
    stk = [pt]
    while len(stk) != 0:
        pt = stk.pop()
        if pt[-1] == 'T':
            # Interior node - push children.
            stk.extend(pt[1:-1])
        else:
            # Leaf nodes contains six fields:
            # <L CCGcat mod_POS-tag orig_POS-tag word PredArgCat>
            # PredArgCat example: (S[dcl]\NP_3)/(S[pt]_4\NP_3:B)_4>
            catkey = Category(pt[0])
            # Ignore atoms and conj rules.
            if not catkey.isfunctor or catkey.result_category() == CAT_CONJ \
                    or catkey.argument_category() == CAT_CONJ:
                continue
            predarg = Category(pt[4])
            # Sanity: the tagged category must clean back to the leaf category.
            assert catkey == predarg.clean(True)
            lst.append(predarg)
    return lst
def issupported(self, category):
    """Test a FunctorTemplate is in TEMPLATES with key=category."""
    # Exact hit first.
    if category in self._TEMPLATES:
        return True
    if not category.isfunctor:
        return False
    # Fall back to the feature-wildcard form of the signature.
    wildcard = Category.from_cache(
        self._Feature.sub('[X]', category.signature))
    return wildcard in self._TEMPLATES
def infer_unary(self, result, argument):
    """Attempt to build a unary modifier from existing templates if possible."""
    # Only identity-modifier rules (result == argument) can be inferred.
    if result != argument:
        return None
    template = self.lookup(argument)
    if template is None:
        return None
    tagged = template.predarg_category.complete_tags()
    # Build `tagged\tagged` as the backward-application modifier.
    return self.add_unary_rule(
        Category.combine(tagged, '\\', tagged, False), tagged)
def infer_template(self, category):
    """Attempt to build a template from existing templates if possible.

    Two inference paths are tried: type-raised categories whose parts are
    known (or atomic), and modifier categories whose result is known.
    Returns the newly added template, or None if nothing could be inferred.
    """
    category = category.remove_conj_feature()
    if category.isfunctor and not self.issupported(category):
        catArg = category.argument_category()
        catArgArg = catArg.argument_category()
        catResult = category.result_category()
        if category.istype_raised and (self.issupported(catResult) or catResult.isatom) \
                and (self.issupported(catArgArg) or catArgArg.isatom):
            global _logger
            # If the category is type raised then check if result type exists and build now.
            # TODO: This should be sent to a log
            _logger.info('Adding type-raised category %s to TEMPLATES' % category.signature)
            # Template categories contain predarg info so build new from these.
            # NOTE: catResult/catArgArg are rebound below from Category to
            # tagged Category - order of these rebinds matters.
            if catResult.isfunctor:
                catResult = self.lookup(catResult).predarg_category.complete_tags()
            else:
                catResult = Category(catResult.signature + '_999')  # synthesize pred-arg info
            if catArgArg.isfunctor:
                # FIXME: Should really check predarg info does not overlap with catResult. Chances are low.
                catArgArg = self.lookup(catArgArg).predarg_category.complete_tags()
            else:
                catArgArg = Category(catArgArg.signature + '_998')  # synthesize pred-arg info
            # Reassemble the type-raised category T|(T|A) from tagged parts.
            newcat = Category.combine(
                catResult, category.slash,
                Category.combine(catResult, category.argument_category().slash,
                                 catArgArg),
                False)
            return self.add_template(newcat)
        elif category.ismodifier and self.issupported(catResult):
            # Modifier X|X: reuse the tagged category of the known result.
            predarg = self.lookup(catResult).predarg_category.complete_tags()
            newcat = Category.combine(predarg, category.slash, predarg, False)
            return self.add_template(newcat)
    return None
def save_templates(self, filepath):
    """Save the functor templates to a file.

    Args:
        filepath: The filename and path.

    Remarks:
        Output format matches what Model.load_templates() reads: one
        predarg category per line, with an explicit final atom appended
        when it differs from the category's default.
    """
    with open(filepath, 'wb') as fd:
        # BUG FIX: _TEMPLATES is a dict (see lookup/issupported), so
        # iterating it directly yields keys only and `for k, v in ...`
        # raised at runtime. Iterate key/value pairs instead.
        for k, v in self._TEMPLATES.iteritems():
            final_atom = v.final_atom
            if final_atom != Category(k).extract_unify_atoms(False)[-1]:
                # Non-default final atom must be stored explicitly.
                fd.write(b'%s, %s\n' % (v.predarg_category, v.final_atom))
            else:
                fd.write(b'%s\n' % v.predarg_category)
def lookup(self, category):
    """Lookup a FunctorTemplate with key=category.

    Args:
        category: The Category key.

    Returns:
        A FunctorTemplate instance or None if not found.
    """
    category = category.remove_conj_feature()
    if category in self._TEMPLATES:
        return self._TEMPLATES[category]
    # Perform wildcard replacements on the feature and retry.
    if category.isfunctor:
        wc = Category.from_cache(
            self._Feature.sub('[X]', category.signature))
        try:
            return self._TEMPLATES[wc]
        except KeyError:
            # FIX: narrowed from bare `except Exception` - only a missing
            # key is expected here; anything else should propagate.
            pass
    return None
def lookup_unary(self, result, argument):
    """Lookup a UnaryRule with key created from result and argument.

    Args:
        result: The result Category or signature string.
        argument: The argument Category or signature string.

    Returns:
        A UnaryRule instance or None if not found.

    Raises:
        TypeError: If result or argument is neither a signature string
            nor a Category.
    """
    if isinstance(result, (str, unicode)):
        result = Category(result)
    elif not isinstance(result, Category):
        raise TypeError(
            'Model.lookup_unary() expects signature or Category result')
    if isinstance(argument, (str, unicode)):
        argument = Category(argument)
    elif not isinstance(argument, Category):
        raise TypeError(
            'Model.lookup_unary() expects signature or Category argument')
    key = UnaryRule.create_key(result, argument)
    try:
        return self._UNARY[key]
    except KeyError:
        # FIX: narrowed from bare `except Exception` - only a missing key
        # is expected from the dict lookup.
        pass
    # Perform wildcard replacements on the feature and retry.
    wc = Category.from_cache(self._Feature.sub('[X]', key.signature))
    try:
        return self._UNARY[wc]
    except KeyError:
        pass
    return None
def __init__(self, result, argument):
    """Constructor for unary rule `result <- argument`.

    Args:
        result: The result category.
        argument: The argument category.

    Raises:
        TypeError: If result or argument is not a Category.

    Remarks:
        Both categories must include predarg tags.
    """
    if not isinstance(result, Category):
        raise TypeError('UnaryRule expects a result Category')
    if not isinstance(argument, Category):
        raise TypeError('UnaryRule expects a argument Category')
    # A unary rule is realized as backward application of this functor.
    functor_cat = Category.combine(result.clean(), '\\', argument.clean(),
                                   cacheable=False)
    # Carry any functor-level tag on the result through to the functor.
    trimmed, final_tag = result.trim_functor_tag()
    if final_tag is not None:
        functor_cat = Category('(%s)_%s' % (functor_cat, final_tag))
    self._template = FunctorTemplate.create_from_category(functor_cat)
def create_key(result, argument):
    """Create a rule key from result and argument categories.

    Args:
        result: The result category.
        argument: The argument category.

    Returns:
        A Category combining result and argument, used as a UnaryRule key.

    Raises:
        TypeError: If result or argument is not a Category.

    Remarks:
        Both categories must NOT include predarg tags. To remove tags
        do Category.clean(True).
    """
    if not isinstance(result, Category):
        # FIX: removed stray trailing space from the error message so both
        # messages are identical and well-formed.
        raise TypeError(
            'UnaryRule.create_key() expects a Category instance')
    if not isinstance(argument, Category):
        raise TypeError(
            'UnaryRule.create_key() expects a Category instance')
    return Category.combine(result, '\\', argument)
def __init__(self, rule, predarg_category, finalRef, finalAtom,
             construct_empty=False):
    """Constructor.

    Args:
        rule: The production constructor rule.
        predarg_category: A predarg category.
        finalRef: The final referent result.
        finalAtom: The final atomic category result.
        construct_empty: If true the functor should be constructed with
            an empty DrsProduction as the final atom.
    """
    # Rule(s) used to construct productions from this template.
    self._constructor_rule = rule
    # Category annotated with predicate-argument tags.
    self._predarg_category = predarg_category
    # Cached untagged category - serves as the template lookup key.
    self._clean_category = Category.from_cache(
        predarg_category.clean(True))
    self._final_ref = finalRef
    self._final_atom = finalAtom
    self._construct_empty = construct_empty
def test7_Wsj0051_30(self):
    # WSJ 0051.30 CCGBANK derivation. Here a declarative sentence is
    # conjoined with an S[em] clause inside S[dcl][conj]; the conj +
    # S[em] combination must pass through (RL_RPASS), not type-change.
    txt = r'''
(<T S[dcl] 0 2> (<T S[dcl] 1 2> (<T NP 0 1> (<T N 1 2> (<L N NNP NNP Fujitsu N>)
(<T N[conj] 1 2> (<L conj CC CC and conj>) (<L N NNP NNP NEC N>) ) ) )
(<T S[dcl]\NP 0 2> (<L (S[dcl]\NP)/S[dcl] VBD VBD said (S[dcl]\NP_146)/S[dcl]_147>)
(<T S[dcl] 0 2> (<T S[dcl] 1 2> (<L NP PRP PRP they NP>) (<T S[dcl]\NP 0 2>
(<T (S[dcl]\NP)/(S[ng]\NP) 0 2>
(<L (S[dcl]\NP)/(S[ng]\NP) VBD VBD were (S[dcl]\NP_156)/(S[ng]_157\NP_156:B)_157>)
(<L (S\NP)\(S\NP) RB RB still (S_169\NP_164)_169\(S_169\NP_164)_169>) )
(<L S[ng]\NP VBG VBG investigating S[ng]\NP_174>) ) ) (<T S[dcl][conj] 1 2>
(<L , , , , ,>) (<T S[dcl][conj] 1 2> (<L conj CC CC and conj>) (<T S[em] 0 2>
(<L S[em]/S[dcl] IN IN that S[em]/S[dcl]_181>) (<T S[dcl] 1 2> (<T NP 0 2>
(<T NP 0 1> (<L N NN NN knowledge N>) ) (<T NP\NP 0 2>
(<L (NP\NP)/NP IN IN of (NP_207\NP_207)/NP_208>) (<T NP 0 1> (<T N 1 2>
(<L N/N JJR JJR more N_224/N_224>) (<T N 1 2> (<L N/N JJ JJ such N_217/N_217>)
(<L N NNS NNS bids N>) ) ) ) ) ) (<T S[dcl]\NP 0 2>
(<L (S[dcl]\NP)/(S[b]\NP) MD MD could (S[dcl]\NP_190)/(S[b]_191\NP_190:B)_191>)
(<L S[b]\NP VB VB emerge S[b]\NP_196>) ) ) ) ) ) ) ) ) (<L . . . .
.>) )
'''
    pt = parse_ccg_derivation(txt)
    ccg = Ccg2Drs()
    # conj + S[em] producing S[dcl][conj] must select the pass-through rule.
    rule = get_rule(Category.from_cache('conj'), Category.from_cache('S[em]'),
                    Category.from_cache('S[dcl][conj]'))
    self.assertEqual(rule, RL_RPASS)
    ccg.build_execution_sequence(pt)
    # Check execution queue
    actual = [repr(x) for x in ccg.exeque]
    expected = [
        '<PushOp>:(Fujitsu, N, NNP)',
        '<PushOp>:(and, conj, CC)',
        '<PushOp>:(NEC, N, NNP)',
        '<ExecOp>:(2, RP N[conj])',
        '<ExecOp>:(2, RCONJ N)',
        '<ExecOp>:(1, LP NP)',
        '<PushOp>:(say, (S[dcl]\\NP)/S[dcl], VBD)',
        '<PushOp>:(they, NP, PRP)',
        '<PushOp>:(be, (S[dcl]\\NP)/(S[ng]\\NP), VBD)',
        '<PushOp>:(still, (S\\NP)\\(S\\NP), RB)',
        '<ExecOp>:(2, BX (S[dcl]\\NP)/(S[ng]\\NP))',
        '<PushOp>:(investigate, S[ng]\\NP, VBG)',
        '<ExecOp>:(2, FA S[dcl]\\NP)',
        '<ExecOp>:(2, BA S[dcl])',
        '<PushOp>:(,, ,, ,)',
        '<PushOp>:(and, conj, CC)',
        '<PushOp>:(that, S[em]/S[dcl], IN)',
        '<PushOp>:(knowledge, N, NN)',
        '<ExecOp>:(1, LP NP)',
        '<PushOp>:(of, (NP\\NP)/NP, IN)',
        '<PushOp>:(more, N/N, JJR)',
        '<PushOp>:(such, N/N, JJ)',
        '<PushOp>:(bids, N, NNS)',
        '<ExecOp>:(2, FA N)',
        '<ExecOp>:(2, FA N)',
        '<ExecOp>:(1, LP NP)',
        '<ExecOp>:(2, FA NP\\NP)',
        '<ExecOp>:(2, BA NP)',
        '<PushOp>:(could, (S\\NP)/(S\\NP), MD)',
        '<PushOp>:(emerge, S[b]\\NP, VB)',
        '<ExecOp>:(2, FA S[dcl]\\NP)',
        '<ExecOp>:(2, BA S[dcl])',
        '<ExecOp>:(2, FA S[em])',
        '<ExecOp>:(2, RP S[dcl][conj])',
        '<ExecOp>:(2, RP S[dcl][conj])',
        '<ExecOp>:(2, RCONJ S[dcl])',
        '<ExecOp>:(2, FA S[dcl]\\NP)',
        '<ExecOp>:(2, BA S[dcl])',
        '<PushOp>:(., ., .)',
        '<ExecOp>:(2, LP S[dcl])'
    ]
    self.assertListEqual(expected, actual)
# Script entry: build functor templates into outdir from an EasySRL model
# and/or the LDC CCGBANK, optionally merging previously saved caches.
(options, args) = parser.parse_args()
fn_dict = {}
outdir = options.outdir or datapath
# Validate the output directory before doing any work.
if not os.path.exists(outdir):
    print('path does not exist - %s' % outdir)
    sys.exit(1)
if not os.path.isdir(outdir):
    print('path is not a directory - %s' % outdir)
    sys.exit(1)
tstart = datetime.datetime.now()
# Clear category cache
merge = Category.copy_cache() if options.merge else []
cache = Cache()
# Seed the cache from any template files given as positional args.
for a in args:
    tmp_cache = Model.load_templates(a)
    cache.initialize(tmp_cache)
# NOTE(review): clear_cache() placement relative to the loop above is
# assumed to be after it - confirm against revision history.
Category.clear_cache()
if options.esrl is not None:
    build_from_model(fn_dict, outdir, options.esrl, options.verbose)
if options.ldc:
    build_from_ldc_ccgbank(fn_dict, outdir, options.verbose)
elapsed = datetime.datetime.now() - tstart
print('Processing time = %d seconds' % elapsed.total_seconds())
def make_lexicon(daemon):
    """Build the lexicon files (az/ and rt/) for the given daemon's
    ccgbank derivations.

    Walks data/ldc/<daemon>/ccgbank/ccg_derivation*.txt, extracts a
    per-stem lexicon, then writes one file per initial letter (az/) and
    one file per return-type atom (rt/).
    """
    global pypath, projdir, datapath, idsrch
    allfiles = []
    projdir = os.path.dirname(os.path.dirname(__file__))
    easysrl_path = os.path.join(projdir, 'data', 'ldc', daemon, 'lexicon')
    if not os.path.exists(easysrl_path):
        os.makedirs(easysrl_path)
    if not os.path.exists(os.path.join(easysrl_path, 'rt')):
        os.makedirs(os.path.join(easysrl_path, 'rt'))
    if not os.path.exists(os.path.join(easysrl_path, 'az')):
        os.makedirs(os.path.join(easysrl_path, 'az'))
    # Get files
    ldcpath = os.path.join(projdir, 'data', 'ldc', daemon, 'ccgbank')
    dirlist1 = sorted(os.listdir(ldcpath))
    #dirlist1 = ['ccg_derivation00.txt']
    for fname in dirlist1:
        if 'ccg_derivation' not in fname:
            continue
        ldcpath1 = os.path.join(ldcpath, fname)
        if os.path.isfile(ldcpath1):
            allfiles.append(ldcpath1)
    failed_parse = 0
    failed_ccg_derivation = []
    start = 0
    progress = -1
    dictionary = None
    for fn in allfiles:
        # Derivation id is parsed from the file name.
        idx = idsrch.match(fn)
        if idx is None:
            continue
        idx = idx.group('id')
        with open(fn, 'r') as fd:
            lines = fd.readlines()
        name, _ = os.path.splitext(os.path.basename(fn))
        for i in range(start, len(lines)):
            start = 0
            ccgbank = lines[i].strip()
            if len(ccgbank) == 0 or ccgbank[0] == '#':
                continue
            if progress < 0:
                print('%s-%04d' % (name, i))
            else:
                progress = print_progress(progress, 10)
            try:
                # CCG parser is Java so output is UTF-8.
                ccgbank = safe_utf8_decode(ccgbank)
                pt = parse_ccg_derivation(ccgbank)
                s = sentence_from_pt(pt).strip()
            except Exception:
                failed_parse += 1
                # NOTE(review): `raise` makes the `continue` below
                # unreachable - looks like a debugging leftover; decide
                # whether failures should abort or be skipped.
                raise
                continue
            uid = '%s-%04d' % (idx, i)
            try:
                #dictionary[0-25][stem][set([c]), set(uid)]
                dictionary = extract_lexicon_from_pt(pt, dictionary, uid=uid)
            except Exception as e:
                print(e)
                # NOTE(review): same unreachable `continue` after `raise`
                # as above.
                raise
                continue
    # rtdict maps return-type atom -> {Category: [stems]}; filled while
    # writing the az/ files, then written out to rt/ below.
    rtdict = {}
    for idx in range(len(dictionary)):
        # One file per initial letter (0x40 offset: '@', 'A', 'B', ...).
        fname = unichr(idx + 0x40)
        filepath = os.path.join(easysrl_path, 'az', fname + '.txt')
        with open(filepath, 'w') as fd:
            d = dictionary[idx]
            for k, v in d.iteritems():
                # k == stem, v = {c: set(uid)}
                fd.write(b'<predicate name=\'%s\'>\n' % safe_utf8_encode(k))
                for x, w in v.iteritems():
                    fd.write(b'<usage \'%s\'>\n' % safe_utf8_encode(x))
                    nc = x.split(':')
                    if len(nc) == 2:
                        c = Category.from_cache(
                            Category(nc[1].strip()).clean(True))
                        # Return type atom
                        rt = c.extract_unify_atoms(False)[-1]
                        if rt in rtdict:
                            cdict = rtdict[rt]
                            if c in cdict:
                                cdict[c].append(nc[0])
                            else:
                                cdict[c] = [nc[0]]
                        else:
                            rtdict[rt] = {c: [nc[0]]}
                    for y in w:
                        fd.write(b'sentence id: ' + safe_utf8_encode(y))
                        fd.write(b'\n')
                    fd.write(b'</usage>\n')
                fd.write(b'</predicate>\n\n')
        # Free up memory
        dictionary[idx] = None
        d = None
    for rt, cdict in rtdict.iteritems():
        # One file per return-type atom, '[' / ']' mapped to '_' / ''.
        fname = rt.signature.replace('[', '_').replace(']', '')
        filepath = os.path.join(easysrl_path, 'rt', fname + '.txt')
        with open(filepath, 'w') as fd:
            for c, vs in cdict.iteritems():
                fd.write(b'<category signature=\'%s\'>\n' % safe_utf8_encode(c))
                for v in vs:
                    fd.write(v)
                    fd.write(b'\n')
                fd.write(b'</category>\n\n')