def test_Trigger(self):
    """Exercise Trigger comparison, evaluation, formatting and sorting."""
    from pyglottolog.util import Trigger

    t1 = Trigger('hhtype', 'grammar', 'phonologie AND NOT morphologie')
    t2 = Trigger('hhtype', 'phonology', 'phonologie')
    t3 = Trigger('hhtype', 'grammar', 'grammar')

    # Use TestCase assertions rather than a bare ``assert``: they are not
    # stripped under ``python -O`` and they report which comparison failed.
    self.assertNotEqual(t1, t3)
    self.assertEqual(t1, t1)

    allkeys = range(5)
    keys_by_word = dict(grammar=[1, 2], phonologie=[2, 3], morphologie=[3, 4])

    # "phonologie AND NOT morphologie": keys 2 and 3 carry "phonologie",
    # key 3 also carries "morphologie", so only key 2 is expected.
    self.assertEqual(t1(allkeys, keys_by_word), {2})
    self.assertEqual(t2(allkeys, keys_by_word), {2, 3})
    self.assertEqual(t3(allkeys, keys_by_word), {1, 2})

    self.assertIn('not morphologie and phonologie', Trigger.format('a', t1))

    # sorted() additionally requires the triggers to be orderable.
    for t in sorted([t1, t2, t3]):
        self.assertIn(t.type, Trigger.format(t.type, t))
def markall(e, trigs, verbose=True, rank=None):
    """Assign trigger-derived field values to entries, based on title words.

    :param e: mapping of bibitem key to a ``(type, fields)`` pair, where \
    ``fields`` is a mapping of field name to value. Updated in place.
    :param trigs: triggers; each exposes ``field`` and ``cls`` (a \
    ``(field, type)`` pair) and is callable with the candidate keys and the \
    word-to-keys index. Must support ``len()`` when `verbose` is true.
    :param verbose: if true, print summary counts once processing is done.
    :param rank: optional callable mapping a type label to a comparable rank; \
    when given, an already present field value is only replaced by a type that \
    ranks strictly higher. When falsy, a matched field is always overwritten.
    :return: the same mapping `e`.
    """
    # The set of fields the triggers relate to.
    target_fields = set(trigger.field for trigger in trigs)

    # All bibitems lacking at least one of the potentially triggered fields.
    unlabeled = {}
    for key, (entry_type, fields) in e.items():
        if any(name not in fields for name in target_fields):
            unlabeled[key] = (entry_type, fields)
    unlabeled_keys = set(list(unlabeled.keys()))

    # Map each word occurring in a title to the keys of the bibitems whose
    # title contains it.
    keys_by_word = defaultdict(set)
    for key, (_, fields) in unlabeled.items():
        for word in wrds(fields.get('title', '')):
            keys_by_word[word].add(key)

    # Per bibitem key, collect the matching triggers keyed by trigger class.
    matches = defaultdict(lambda: defaultdict(list))
    for _, grouped in Trigger.group(trigs):
        # Only the first trigger of a group is evaluated; its matches are
        # credited to every trigger in that group.
        for key in grouped[0](unlabeled_keys, keys_by_word):
            for trigger in grouped:
                matches[key][trigger.cls].append(trigger)

    for key, triggers_by_cls in matches.items():
        entry_type, fields = e[key]
        updated = dict(fields.items())

        # Make sure we handle the trigger class with the biggest number of
        # matching triggers last, so that it wins when several classes target
        # the same field.
        ordered = list(triggers_by_cls.items())
        ordered.sort(key=lambda item: len(item[1]))

        for (field, type_), class_triggers in ordered:
            # Only replace an already assigned value if something better
            # (i.e. strictly higher ranked) comes along.
            keep_existing = (
                rank
                and field in updated
                and rank(updated[field].split(' (comp')[0]) >= rank(type_)
            )
            if not keep_existing:
                updated[field] = Trigger.format(type_, class_triggers)

        e[key] = (entry_type, updated)

    if verbose:
        report = (
            ("trigs", trigs),
            ("label classes", target_fields),
            ("unlabeled refs", unlabeled),
            ("updates", matches),
        )
        for label, collection in report:
            print(label, len(collection))

    return e
def markall(e, trigs, verbose=True, rank=None):
    """Assign trigger-derived field values to entries, based on title words.

    :param e: mapping of bibitem key to a ``(type, fields)`` pair, where \
    ``fields`` is a mapping of field name to value. Updated in place.
    :param trigs: triggers; each exposes ``field`` and ``cls`` (a \
    ``(field, type)`` pair) and is callable with the candidate keys and the \
    word-to-keys index. Must support ``len()`` when `verbose` is true.
    :param verbose: if true, print summary counts once processing is done.
    :param rank: optional callable mapping a type label to a comparable rank; \
    when given, an already present field value is only replaced by a type that \
    ranks strictly higher. When falsy, a matched field is always overwritten.
    :return: the same mapping `e`.
    """
    # The set of fields the triggers relate to.
    target_fields = set(trigger.field for trigger in trigs)

    # All bibitems lacking at least one of the potentially triggered fields.
    unlabeled = {}
    for key, (entry_type, fields) in e.items():
        if any(name not in fields for name in target_fields):
            unlabeled[key] = (entry_type, fields)
    unlabeled_keys = set(list(unlabeled.keys()))

    # Map each word occurring in a title to the keys of the bibitems whose
    # title contains it.
    keys_by_word = defaultdict(set)
    for key, (_, fields) in unlabeled.items():
        for word in wrds(fields.get('title', '')):
            keys_by_word[word].add(key)

    # Per bibitem key, collect the matching triggers keyed by trigger class.
    matches = defaultdict(lambda: defaultdict(list))
    for _, grouped in Trigger.group(trigs):
        # Only the first trigger of a group is evaluated; its matches are
        # credited to every trigger in that group.
        for key in grouped[0](unlabeled_keys, keys_by_word):
            for trigger in grouped:
                matches[key][trigger.cls].append(trigger)

    for key, triggers_by_cls in matches.items():
        entry_type, fields = e[key]
        updated = dict(fields.items())

        # Make sure we handle the trigger class with the biggest number of
        # matching triggers last, so that it wins when several classes target
        # the same field.
        ordered = list(triggers_by_cls.items())
        ordered.sort(key=lambda item: len(item[1]))

        for (field, type_), class_triggers in ordered:
            # Only replace an already assigned value if something better
            # (i.e. strictly higher ranked) comes along.
            keep_existing = (
                rank
                and field in updated
                and rank(updated[field].split(' (comp')[0]) >= rank(type_)
            )
            if not keep_existing:
                updated[field] = Trigger.format(type_, class_triggers)

        e[key] = (entry_type, updated)

    if verbose:
        report = (
            ("trigs", trigs),
            ("label classes", target_fields),
            ("unlabeled refs", unlabeled),
            ("updates", matches),
        )
        for label, collection in report:
            print(label, len(collection))

    return e