def run_collapse_differences(fs, get=getdst):
    """Count the substitution rules between ``fs[0]`` and every other item.

    ``fs[0]`` is used as the base. Each remaining item ``f`` is passed to
    ``compare(base, f)``; from the concatenated result, the rules whose
    ``type`` is ``lev.SUB`` and whose ``dst`` differs from ``src`` are kept
    and mapped through ``get``.

    Args:
        fs: Sequence whose first element is the base and whose remaining
            elements are compared against it. The sequence is left
            untouched (the base is no longer deleted from the caller's
            list), so immutable sequences such as tuples work too.
        get: Callable applied to every kept rule; defaults to ``getdst``.

    Returns:
        The result of ``dct.zip`` applied to the count of all kept values
        pooled together, followed by one count per compared item, with
        ``default=0``.

    Raises:
        IndexError: If ``fs`` is an empty list.
    """
    # Slice instead of `del fs[0]` so the caller's sequence is not mutated.
    base, others = fs[0], fs[1:]
    subs = [
        [
            get(rule)
            for rule in concat(compare(base, f))
            if rule.type == lev.SUB and rule.dst != rule.src
        ]
        for f in others
    ]
    pooled = dct.count(concat(subs))
    return dct.zip(pooled, default=0, *map(dct.count, subs))
def sharedtopwords(talbanken, regions):
    """Print vocabulary and token overlap statistics for two word sources.

    Args:
        talbanken: Text whose lines are scanned for words. Empty lines and
            lines starting with a space are skipped; every other line
            contributes ``fst(line.split())`` (presumably its first
            whitespace-separated field -- confirm against ``fst``).
        regions: Iterable of strings; every line of every string is taken
            as one word.

    Prints, one value per line and in this order:
        1. size of the talbanken vocabulary
        2. size of the regions vocabulary
        3. size of the shared vocabulary
        4. number of region-only vocabulary items
        5. size of the union (both one-sided differences plus the shared part)
        6. talbanken token count over the shared vocabulary
        7. regions token count over the shared vocabulary
        8. all tokens of both sources minus the two counts above

    Returns:
        None; the function only prints.
    """
    # The text to scan is the `talbanken` argument (the body used to read
    # an undefined name here). Empty lines are skipped explicitly because
    # `line[0]` would raise IndexError on them.
    talwords = [
        fst(line.split())
        for line in talbanken.splitlines()
        if line and line[0] != ' '
    ]
    talvocab = set(talwords)
    talcount = dct.count(talwords)

    swewords = [w for r in regions for w in r.splitlines()]
    swevocab = set(swewords)
    swecount = dct.count(swewords)

    # NOTE(review): these sums previously iterated an undefined
    # `sharedVocab` while being stored in variables named "unshared...".
    # The vocabulary intersection is used here -- confirm this is the
    # intended set.
    shared_vocab = swevocab & talvocab
    tal_tokens = sum(talcount[w] for w in shared_vocab)
    swe_tokens = sum(swecount[w] for w in shared_vocab)

    print(len(talvocab))
    print(len(swevocab))
    print(len(shared_vocab))
    print(len(swevocab - talvocab))
    print(len(talvocab - swevocab) + len(swevocab - talvocab) + len(shared_vocab))
    print(tal_tokens)
    print(swe_tokens)
    print(len(talwords) + len(swewords) - tal_tokens - swe_tokens)
def bracketpaths(paths):
    """Add brackets to disambiguate paths (and remove the Eq wrapper).

    Args:
        paths: Sequence of paths, each a sequence of ``Eq``-wrapped nodes.
            It is traversed several times and also passed to
            ``reversed``, so it must be a real sequence, not a one-shot
            iterator.

    Returns:
        A list with one entry per input path. Each entry is the list of
        ``Eq.get(node)`` values of that path with at most one marker
        inserted: ``"["`` directly after the index reported by ``edge``
        for the ``firsts`` table, otherwise ``"]"`` directly before the
        index reported for the ``lasts`` table. An index of ``-1`` means
        "no marker"; the opening bracket takes precedence.
    """
    spans = dct.count(concat(paths))
    # Nodes that occur exactly once over all paths.
    hapax = {node for (node, n) in spans.items() if n == 1}
    # For every node: the last element of whatever `findif` returns when
    # searching the paths front-to-back / back-to-front (presumably the
    # first and last path that contain the node -- confirm with `findif`).
    firsts = {node: findif(elem(node), paths)[-1] for node in spans}
    lasts = {node: findif(elem(node), reversed(paths))[-1] for node in spans}

    def unwrap(nodes):
        # Build a real list so that `+` concatenation below works whether
        # `map` returns a list (Python 2) or a lazy iterator (Python 3).
        return [Eq.get(node) for node in nodes]

    @typecheck([Eq], [str])
    def bracket(path):
        first = edge(path, firsts, hapax)
        last = edge(path, lasts, hapax)
        if first != -1:
            return unwrap(path[:first + 1]) + ["["] + unwrap(path[first + 1:])
        elif last != -1:
            return unwrap(path[:last]) + ["]"] + unwrap(path[last:])
        else:
            return unwrap(path)

    return [bracket(path) for path in paths]
def testPaths(self):
    """Check ``path.paths`` on the ACL tree, the palm tree and an inline fixture."""
    expected_acl = [
        ["A-[-B-p", "A-]-B-q", "A-B-[-r", "]-A-B-s"],
        ["A-[-B-p", "A-B-q", "A-B-r", "]-A-B-s"],
    ]
    test(map(path.paths, acltree["A"]), expected_acl)

    expected_palm = [[
        'S-[-Ns-the',
        'S-Ns-closest',
        'S-Ns-thing',
        'S-Ns-P-[-to',
        'S-Ns-P-Ns-[-a',
        'S-]-Ns-P-Ns-home',
        'S-Vsb-was',
        'S-N-[-a',
        'S-N-string',
        'S-N-hammock',
        'S-N-S+-[-and',
        'S-N-S+-+,',
        'S-N-S+-Fa-[-Rq-when',
        'S-N-S+-Fa-Ni-it',
        'S-N-S+-]-Fa-Vd-rained',
        'S-N-S+-+,',
        'S-N-S+-Np-[-some',
        'S-N-S+-Np-palm',
        'S-N-S+-]-Np-fronds',
        'S-N-S+-Vd-draped',
        'S-N-S+-P-[-over',
        ']-S-N-S+-P-sticks',
    ]]
    test(map(path.paths, palmtree["A"]), expected_palm)

    expected_counts = {
        'hi-[-child-grandchild0': 1,
        ']-hi-child3': 1,
        'hi-child2': 1,
        'hi-]-child-grandchild1': 1,
    }
    # NOTE(review): the fixture is split on '\n' but contains no newline as
    # written here; its line breaks/indentation may have been lost --
    # confirm against the intended fixture text.
    fixture_lines = '''[<sent> <#1:1:A>] hi child grandchild0 grandchild1 child2 child3'''.split('\n')
    first_sentence = iceread.sentences(fixture_lines)['A'][0]
    test(expected_counts, dct.count(path.paths(first_sentence)))
def count(filepattern, params):
    """Count the (src, dst) pairs produced for every parameter set.

    For each entry of ``params`` the pattern is filled in with
    ``filepattern % tuple(param)``, the result is passed through ``clean``
    and then through ``significants(..., 1000)``; every ``(src, dst)`` pair
    that comes back is collected.

    Args:
        filepattern: ``%``-format string.
        params: Iterable of parameter sequences, one per formatted pattern.

    Returns:
        ``dct.count`` applied to the list of all collected pairs.
    """
    pairs = []
    for param in params:
        cleaned = clean(filepattern % tuple(param))
        # The meaning of the 1000 argument is defined by `significants`.
        for (src, dst) in significants(cleaned, 1000):
            pairs.append((src, dst))
    return dct.count(pairs)
def countpaths(a, b):
    """[Path]*[Path]->{Path:(float,float)}

    Count the items of ``a`` and of ``b`` separately with ``dct.count``
    and combine both tables with ``dct_zip(..., default=0)``.
    """
    # TODO: I should change default=0 to some smoothed value
    counts_a = dct.count(a)
    counts_b = dct.count(b)
    return dct_zip(counts_a, counts_b, default=0)
def tinify(regions):
    """Re-encode every item of every region through a table built by ``encode``.

    The values of ``regions`` are combined with ``mapn(concat, ...)`` and
    counted; the resulting ``(item, count)`` pairs are sorted with ``snd``
    as the key, and the ``fst`` of each pair is handed to ``encode`` in
    that order. The object returned by ``encode`` is then used as a lookup
    (``__getitem__``) applied two levels deep inside each region value.

    Args:
        regions: Mapping whose values are nested two levels deep
            (presumably lists of item sequences -- confirm with callers).

    Returns:
        ``dct.map`` of the two-level lookup over ``regions``.
    """
    tallies = dct.count(mapn(concat, regions.values()))
    ranked = sorted(tallies.items(), key=snd)
    code = encode(map(fst, ranked))
    recode_inner = cur(map, code.__getitem__)
    recode_region = cur(map, recode_inner)
    return dct.map(recode_region, regions)