def build_extrinsic_tables(target, source, env):
    """
    SCons builder that renders extrinsic (ASR/KWS) evaluation results as a
    LaTeX tabular.

    source[0] is a Value node wrapping a dict that maps (language, pack) to a
    dict of setup -> (asr_filename, kws_filename); target[0] is the output
    .tex file.  Returns None so SCons treats the build as successful.
    NOTE(review): field names below assume the ASRResults/KWSResults
    accessors accept them — confirm against their definitions.
    """
    asr_fields = ["error", "substitutions", "deletions", "insertions"]
    kws_fields = ["pmiss", "mtwv"]
    table_rows = []
    for (language, pack), setups in source[0].read().iteritems():
        for setup, (asr_fname, kws_fname) in setups.iteritems():
            with meta_open(asr_fname) as asr_fd, meta_open(kws_fname) as kws_fd:
                asr = ASRResults(asr_fd)
                kws = KWSResults(kws_fd)
            scores = [asr.get(f) for f in asr_fields] + [kws.get(f) for f in kws_fields]
            table_rows.append([language, setup] + scores)
    # one LaTeX row per (language, setup), cells joined by " & ", terminated by "\\"
    formatted = []
    for row in table_rows:
        formatted.append(r" & ".join([str(cell) for cell in row]) + r" \\")
    with meta_open(target[0].rstr(), "w") as ofd:
        ofd.write(r"""
\begin{tabular}{|*{2}{l|}*{6}{r|}}
\hline
Language & Augmentation & \multicolumn{4}{|c|}{ASR} & \multicolumn{2}{|c|}{KWS} \\
 & & Errors & Subs & Dels & Ins & PMiss & MTWV \\
\hline
%s
\hline
\end{tabular}
""" % ("\n".join(formatted)))
    return None
def _average_length(values):
    """Mean string length of *values* formatted to 2 decimals, or "" when empty."""
    if len(values) > 0:
        return "%.2f" % (sum(map(len, values)) / float(len(values)))
    return ""


def _text_row(xml, cells):
    """Append one <tr> whose cells are plain-text <td> elements ("" gives an empty cell)."""
    xml.start("tr", {})
    for text in cells:
        xml.start("td", {})
        if text:
            xml.data(text)
        xml.end("td")
    xml.end("tr")


def _heading_row(xml, text, tag="h3"):
    """Append a single-cell <tr> holding a heading element around *text*."""
    xml.start("tr", {})
    xml.start("td", {})
    xml.start(tag, {})
    xml.data(text)
    xml.end(tag)
    xml.end("td")
    xml.end("tr")


def _count_length_table(xml, groups, lookup):
    """
    Append a bordered table with one row per morph class in *groups*
    (a dict of class-key -> list of morph strings): display name, count,
    and mean morph length.  Rows are ordered by class key.
    """
    xml.start("table", {"border" : "1"})
    _text_row(xml, ["Type", "Count", "Average length"])
    for key, values in sorted(groups.iteritems()):
        _text_row(xml, [lookup[key], str(len(values)), _average_length(values)])
    xml.end("table")


def build_site(target, source, env):
    """
    SCons builder that generates a static HTML report site.

    target: target[0] is the output HTML file; theme.css and an images/
        directory are created alongside it.
    source: three Value nodes wrapping dicts keyed by (language, pack):
        properties (file names of vocabularies and morphological resources),
        figures (plot image files), and results (ASR/KWS result files).
        -- schemas inferred from use here; confirm against the SConscript
        that invokes this builder.
    Returns None so SCons treats the build as successful.

    Changes from the previous version: the bare "except: pass" around
    os.makedirs now only ignores OSError (directory already exists) instead
    of swallowing every exception, and dead code was removed (the
    morfessor_input parse and the unused input_vocab / both_vocabs /
    morf_analysis_counts / morf_morph_counts / morf_morph_lengths locals).
    """
    properties, figures, results = [x.read() for x in source[0:3]]
    languages = set([x[0] for x in figures.keys()])
    lookup = {"PRE" : "Prefixes", "STM" : "Stems", "SUF" : "Suffixes"}
    packs = ["Limited"]
    base_path = os.path.dirname(target[0].rstr())
    try:
        os.makedirs(pjoin(base_path, "images"))
    except OSError:
        # directory already exists: fine, we only need it to be there
        pass
    with meta_open(pjoin(base_path, "theme.css"), "w") as ofd:
        for selector in ["body", "table", "tr", "td"]:
            ofd.write("%s {text-align : center; vertical-align : top;}\n" % (selector))
    with meta_open(target[0].rstr(), "w") as ofd:
        xml = et.TreeBuilder()
        xml.start("html", {})
        xml.start("head", {})
        xml.start("link", {"rel" : "stylesheet", "type" : "text/css", "href" : "theme.css"})
        xml.end("link")
        xml.end("head")
        xml.start("body", {})
        xml.start("table", {})
        for language in languages:
            language_properties = properties[(language, "Limited")]
            # BabelGUM morph lists (first token per line), minus epsilon entries
            with meta_open(language_properties["prefixes"]) as prefix_fd, meta_open(language_properties["stems"]) as stem_fd, meta_open(language_properties["suffixes"]) as suffix_fd:
                babel_output = {"PRE" : [l.strip().split()[0] for l in prefix_fd if "<epsilon>" not in l],
                                "STM" : [l.strip().split()[0] for l in stem_fd if "<epsilon>" not in l],
                                "SUF" : [l.strip().split()[0] for l in suffix_fd if "<epsilon>" not in l],
                                }
            with meta_open(language_properties["limited_vocabulary"]) as lim_fd, meta_open(language_properties["dev_vocabulary"]) as dev_fd:
                lim_vocab = set(FrequencyList(lim_fd).make_conservative().keys())
                dev_vocab = set(FrequencyList(dev_fd).make_conservative().keys())
            with meta_open(language_properties["morfessor_output"]) as output_fd:
                morf_output = MorfessorOutput(output_fd)
            # per-language section heading
            _heading_row(xml, language.title(), tag="h1")
            # three side-by-side summary tables: language / Morfessor / BabelGUM
            xml.start("table", {})
            xml.start("tr", {})
            for x in ["Language", "Morfessor", "BabelGUM"]:
                xml.start("td", {})
                xml.start("h3", {})
                xml.data("%s information" % (x))
                xml.end("h3")
                xml.end("td")
            xml.end("tr")
            xml.start("tr", {})
            # Language information
            xml.start("td", {})
            xml.start("table", {"border" : "1"})
            _text_row(xml, ["", "Count", "Average length"])
            _text_row(xml, ["Limited vocab", "%d" % (len(lim_vocab)), _average_length(lim_vocab)])
            _text_row(xml, ["Dev vocab", "%d" % (len(dev_vocab)), _average_length(dev_vocab)])
            xml.end("table")
            xml.end("td")
            # Morfessor information
            xml.start("td", {})
            _count_length_table(xml, morf_output.morphs, lookup)
            xml.end("td")
            # BabelGUM information
            xml.start("td", {})
            _count_length_table(xml, babel_output, lookup)
            xml.end("td")
            xml.end("tr")
            xml.end("table")
            # graphs of IV increase and OOV reduction, type-based and token-based
            _heading_row(xml, "Intrinsic performance evaluation")
            xml.start("tr", {})
            xml.start("td", {})
            xml.start("table", {})
            for pack in packs:
                image_file = "%s_%s.png" % (language, pack)
                shutil.copy(figures[(language, pack)], pjoin(base_path, "images", image_file))
                xml.start("tr", {})
                xml.start("td", {})
                xml.start("img", {"src" : pjoin("images", image_file)})
                xml.end("img")
                xml.end("td")
                xml.end("tr")
            xml.end("table")
            xml.end("td")
            xml.end("tr")
            # word error rate for ASR and maximum term-weighted value for KWS
            _heading_row(xml, "Extrinsic performance evaluation")
            xml.start("tr", {})
            xml.start("td", {})
            xml.start("table", {"border" : "1"})
            _text_row(xml, ["Augmentation", "Error", "Substitutions", "Deletions", "Insertions", "PMiss", "MTWV"])
            for name, values in sorted(results[(language, "Limited")].iteritems()):
                with meta_open(values["ASR"]) as asr_fd, meta_open(values["KWS"]) as kws_fd:
                    asr = ASRResults(asr_fd)
                    kws = KWSResults(kws_fd)
                scores = [asr.get(x) for x in ["error", "substitutions", "deletions", "insertions"]] + [kws.get(x) for x in ["pmiss", "mtwv"]]
                _text_row(xml, [name] + ["%.3f" % (s) for s in scores])
            xml.end("table")
            xml.end("td")
            xml.end("tr")
        xml.end("table")
        xml.end("body")
        xml.end("html")
        ofd.write(et.tostring(xml.close()))
    return None