def to_html_differences(f, name, combined):
    "file*str*{char:[int]} "
    # Write an HTML table of per-character counts to the open file handle f:
    # one row per character, one column per count, plus an average column.
    # NOTE(review): `fs` is a free variable (presumably a module-level list of
    # file names, sliced for short column headers) -- confirm it is defined
    # before this function is called.
    print >> f, """<h1>%s</h1>""" % name
    print >> f, "<table border=1 cellspacing=0 bordercolor='black'><tr><th>Char</th><th>All</th>",
    # FIX: the generator variable used to be `f`, shadowing the file-handle
    # parameter inside the generator expression; renamed to `fname`.
    print >> f, "".join("<th>%s</th>" % fname[21:24] for fname in fs[1:]), "<th>Avg</th></tr>"
    for char, counts in combined.items():
        print >> f, "<tr><td>%s</td>" % char,
        print >> f, "".join("<td>%s</td>" % c for c in counts),
        # First count ("All") is excluded from the average column.
        print >> f, "<td>%.2f</td></tr>" % avg(counts[1:])
    print >> f, "</table>"
def groupavg(pair):
    "group average"
    # PEP 3113: tuple parameters (`def f((a, b))`) were removed from the
    # language; unpack explicitly instead.  Callers still pass one
    # (cluster1, cluster2) tuple, so the interface is unchanged.
    c1, c2 = pair
    # Average edge weight over every cross-pairing of the flattened clusters;
    # each pair is frozen into a set to look up the (undirected) edge weight.
    return avg(map(compose(edges.__getitem__, frozenset),
                   cross(flatten(c1), flatten(c2))))
def sed_avg_total(regions):
    "([[{str:[float]}]],[[{str:[float]}]]) -> float"
    # PEP 3113: unpack the region pair explicitly rather than in the
    # signature; the call convention (one 2-tuple argument) is unchanged.
    region1, region2 = regions
    # Mean pairwise word-list distance between the two regions.
    # NOTE(review): the final halving mirrors the original code; its
    # rationale is not visible here -- confirm against the distance measure.
    return lst.avg(map(sed_avg, region1, region2)) / 2
def sed_avg(ws1, ws2):
    "[{str:[float]}]*[{str:[float]}] -> float"
    # BUG FIX: the original built BOTH segment lists from ws1
    # (`transpose_word(ws1)` twice), so ws2 was never consulted and the
    # result compared a word list against itself.  Use ws2 for the second.
    segs1, segs2 = (concat(transpose_word(ws1)), concat(transpose_word(ws2)))
    # Average feature distance over every cross-pairing of segments.
    return lst.avg(map(fnc.uncurry(feature_sub), lst.cross(segs1, segs2)))
def analyse(regions, avgs=None):
    # Compare every pair of regions; returns {(key1, key2): distance}.
    # NOTE(review): the `avgs` parameter is never used in this body --
    # confirm whether it was meant to override `avgregions` below.
    keys = lst.all_pairs(sorted(regions.keys()))
    # Rebinds `regions`: from the input dict to the list of all pairs of
    # its (flattened) values, in the same order as `keys`.
    regions = lst.all_pairs(flatten(regions))
    # Overall average distance, used as the insert/delete cost downstream.
    avgregions = lst.avg(map(sed_avg_total, regions))
    return dict(zip(keys, map(sed_distance(avgregions), regions)))
# NOTE(review): this chunk opened with an orphaned duplicate of the last two
# statements of analyse() ("avgregions = ..." and a module-level "return ..."),
# which is a syntax error outside a function; removed as paste residue.

def feature_sub(seg1, seg2):
    "({str:float}*{str:float}) -> float"
    # Segment distance: one point per feature present in only one segment,
    # plus the absolute value difference on each shared feature.
    return (len(set(seg1) ^ set(seg2))
            + sum(abs(f1 - f2) for f1, f2 in dct.zip(seg1, seg2).values()))

@curried
def sed_distance(avg, regionpair):
    "float*([[{str:[float]}]],[[{str:[float]}]])->float"
    # PEP 3113: unpack explicitly instead of a tuple parameter.
    region1, region2 = regionpair
    # Sum of per-word-list Levenshtein distances, walking both regions in step.
    return sum(map(sed_levenshtein(avg), zip(region1, region2)))

def transpose_word(word):
    "[{str:[float]}] -> [[{str:float}]]"
    # Expand a word of multi-valued segments into single-valued variants:
    # one variant word per index of the per-feature value lists.
    def transpose_segment(seg):
        return [dict(zip(seg.keys(), ns)) for ns in lst.transpose(seg.values())]
    return lst.transpose(map(transpose_segment, word))

@curried
def sed_levenshtein(avg, words):
    "float*([{str:[float]}],[{str:[float]}])->float"
    # PEP 3113: explicit unpacking replaces the old tuple parameters here
    # and in the inner helper.
    ws1, ws2 = words
    def levenshtein(pair):
        w1, w2 = pair
        # Insertion and deletion both cost the precomputed average distance;
        # substitution cost comes from feature_sub.  [-1][-1] is the final
        # cell of the dynamic-programming matrix.
        return lev._levenshtein(w1, w2, avg,
                                (lambda _: avg, lambda _: avg, feature_sub))[-1][-1]
    # Average over every cross-pairing of the single-valued word variants.
    return lst.avg(map(levenshtein,
                       lst.cross(transpose_word(ws1), transpose_word(ws2))))

def sed_avg(ws1, ws2):
    "[{str:[float]}]*[{str:[float]}] -> float"
    # BUG FIX: the original transposed ws1 twice, ignoring ws2 entirely.
    segs1, segs2 = (concat(transpose_word(ws1)), concat(transpose_word(ws2)))
    return lst.avg(map(fnc.uncurry(feature_sub), lst.cross(segs1, segs2)))

def sed_avg_total(regions):
    "([[{str:[float]}]],[[{str:[float]}]]) -> float"
    region1, region2 = regions  # PEP 3113: explicit unpacking
    # NOTE(review): the halving mirrors the original; rationale not visible here.
    return lst.avg(map(sed_avg, region1, region2)) / 2
def variance(freqs):
    # Spread of the counts around their mean, with the first entry excluded
    # and the result normalised by the mean itself (not by the count).
    tail = cdr(freqs)
    mean = avg(tail)
    squared_errors = [(count - mean) ** 2 for count in tail]
    return sum(squared_errors) / mean
def lst_except(l, *ns): """Totally inefficient! You have been warned, dude! (requiring ns to be ordered could help a lot if I actually cared)""" acc = [] for i, x in enumerate(l): if i not in ns: acc.append(x) return acc def find_collapsed(f, collapsed): "{char:[int]} -> [(char,int)] (sorted)" return sorted(dct.map(f, collapsed).items(), key=snd, reverse=True) diff = lambda freqs: avg([freqs[2], freqs[8]]) - avg(lst_except(freqs, 0, 2, 8)) def variance(freqs): average = avg(cdr(freqs)) return sum((average - c) ** 2 for c in cdr(freqs)) / average find_difference = cur(find_collapsed, diff) find_variance = cur(find_collapsed, variance) def to_html_group_differences(f, name, differences): print >> f, "<h1>%s</h1>" % name print >> f, "<table border=1 cellspacing=0 bordercolor='black'><tr><td></td><th>Char</th><th>Variance</th>", for i, (sub, variance) in enumerate(differences):