Beispiel #1
0
def markall(e, trigs, verbose=True, rank=None):
    """Assign triggered field values to bibitems whose titles match triggers.

    Args:
        e: Mapping of bibitem key to a ``(type, fields)`` pair, where
            ``fields`` is a dict of field name to value. It is updated in
            place.
        trigs: Sized collection of trigger objects. Each trigger exposes
            ``field`` and ``cls`` (a ``(field, type_)`` pair) and is callable
            as ``trigger(keys, word_index)``, returning the matching keys.
        verbose: When true, print summary counts after processing.
        rank: Optional callable mapping a type label to a comparable rank.
            When given, an already-present field value is only replaced if
            the new type ranks strictly higher than the current one.

    Returns:
        The same ``e`` object, with updated ``(type, fields)`` pairs for every
        bibitem that matched at least one trigger.
    """
    # the set of fields triggers relate to:
    clss = {t.field for t in trigs}

    # all bibitems lacking any of the potential triggered fields:
    ei = {
        k: (typ, fields)
        for k, (typ, fields) in e.items() if any(c not in fields for c in clss)
    }
    eikeys = set(ei)

    # map words in titles to sets of bibitem keys having the word in the title:
    wk = defaultdict(set)
    for k, (_typ, fields) in ei.items():
        for w in wrds(fields.get('title', '')):
            wk[w].add(k)

    # bibitem key -> trigger class -> list of triggers that matched the item.
    u = defaultdict(lambda: defaultdict(list))
    for _clauses, triggers in Trigger.group(trigs):
        # Only the first trigger of a group is evaluated; its matches are
        # credited to every trigger in the group (presumably all members of a
        # group share the same clauses -- confirm against Trigger.group).
        for k in triggers[0](eikeys, wk):
            for t in triggers:
                u[k][t.cls].append(t)

    for k, t_by_c in u.items():
        t, f = e[k]
        # Work on a shallow copy so the original fields dict is not mutated.
        f2 = dict(f)
        # Ascending sort by number of matching triggers: the class with the
        # most matches is handled last and therefore wins when several classes
        # write to the same field.
        for (field, type_), triggers in sorted(t_by_c.items(),
                                               key=lambda i: len(i[1])):
            if rank and field in f2:
                # only update the assigned hhtype if something better comes along:
                # (anything from ' (comp' onwards is stripped before ranking)
                if rank(f2[field].split(' (comp')[0]) >= rank(type_):
                    continue
            f2[field] = Trigger.format(type_, triggers)
        e[k] = (t, f2)

    if verbose:
        print("trigs", len(trigs))
        print("label classes", len(clss))
        print("unlabeled refs", len(ei))
        print("updates", len(u))
    return e
Beispiel #2
0
def markall(e, trigs, verbose=True, rank=None):
    """Assign triggered field values to bibitems whose titles match triggers.

    Args:
        e: Mapping of bibitem key to a ``(type, fields)`` pair, where
            ``fields`` is a dict of field name to value. It is updated in
            place.
        trigs: Sized collection of trigger objects. Each trigger exposes
            ``field`` and ``cls`` (a ``(field, type_)`` pair) and is callable
            as ``trigger(keys, word_index)``, returning the matching keys.
        verbose: When true, print summary counts after processing.
        rank: Optional callable mapping a type label to a comparable rank.
            When given, an already-present field value is only replaced if
            the new type ranks strictly higher than the current one.

    Returns:
        The same ``e`` object, with updated ``(type, fields)`` pairs for every
        bibitem that matched at least one trigger.
    """
    # the set of fields triggers relate to:
    clss = {t.field for t in trigs}

    # all bibitems lacking any of the potential triggered fields:
    ei = {k: (typ, fields) for k, (typ, fields) in e.items()
          if any(c not in fields for c in clss)}
    eikeys = set(ei)

    # map words in titles to sets of bibitem keys having the word in the title:
    wk = defaultdict(set)
    for k, (_typ, fields) in ei.items():
        for w in wrds(fields.get('title', '')):
            wk[w].add(k)

    # bibitem key -> trigger class -> list of triggers that matched the item.
    u = defaultdict(lambda: defaultdict(list))
    for _clauses, triggers in Trigger.group(trigs):
        # Only the first trigger of a group is evaluated; its matches are
        # credited to every trigger in the group (presumably all members of a
        # group share the same clauses -- confirm against Trigger.group).
        for k in triggers[0](eikeys, wk):
            for t in triggers:
                u[k][t.cls].append(t)

    for k, t_by_c in u.items():
        t, f = e[k]
        # Work on a shallow copy so the original fields dict is not mutated.
        f2 = dict(f)
        # Ascending sort by number of matching triggers: the class with the
        # most matches is handled last and therefore wins when several classes
        # write to the same field.
        for (field, type_), triggers in sorted(t_by_c.items(), key=lambda i: len(i[1])):
            if rank and field in f2:
                # only update the assigned hhtype if something better comes along:
                # (anything from ' (comp' onwards is stripped before ranking)
                if rank(f2[field].split(' (comp')[0]) >= rank(type_):
                    continue
            f2[field] = Trigger.format(type_, triggers)
        e[k] = (t, f2)

    if verbose:
        print("trigs", len(trigs))
        print("label classes", len(clss))
        print("unlabeled refs", len(ei))
        print("updates", len(u))
    return e