def segmented_pronunciations(target, source, env):
    """Takes a word pronunciation file and a word segmentation file and creates a
    "segmented word pronunciation file" suitable for training G2P.

    Sources: pronunciation file, word segmentation file
    Targets: segmented word pronunciation file, morph list file
    """
    sep = "+"
    data = {}
    morphs = set()
    with meta_open(source[0].rstr()) as pron_fd, meta_open(source[1].rstr()) as seg_fd:
        for l in pron_fd:
            word, pron = re.match(r"^(\S+)\(\d+\) (.*)$", l.strip().replace(" [ wb ]", "")).groups()
            if env.get("LOWER_CASE"):
                word = word.lower()
            data[word] = data.get(word, []) + [pron]
        vals = [[y.strip(sep) for y in x.split()] for x in seg_fd]
    if env.get("LOWER_CASE"):
        vals = [[y.lower() for y in x] for x in vals]
    segs = {sep.join(x): x for x in vals}
    w2s = {"".join(x): sep.join(x) for x in vals}
    for m in segs.values():
        if len(m) == 1:
            morphs.add(m[0])
        if len(m) >= 2:
            # record word-initial, word-final, and word-internal morph forms
            morphs.add("%s%s" % (m[0], sep))
            morphs.add("%s%s" % (sep, m[-1]))
            for x in m[1:-1]:
                morphs.add("%s%s%s" % (sep, x, sep))
    with meta_open(target[0].rstr(), "w") as seg_ofd, meta_open(target[1].rstr(), "w") as morph_ofd:
        seg_ofd.write("\n".join(sum([["%s %s" % (w2s.get(k, k), p) for p in v] for k, v in data.iteritems()], [])))
        morph_ofd.write("\n".join(morphs))
    return None
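# Hedged usage sketch (not taken from the original build files): the functions in this
# module follow SCons's (target, source, env) action signature, so a SConstruct could
# register and invoke them roughly as below.  The builder name and file paths are
# hypothetical.
#
#   from SCons.Script import Builder, Environment
#   env = Environment(BUILDERS={"SegmentedPronunciations": Builder(action=segmented_pronunciations)})
#   env.SegmentedPronunciations(["work/segmented_pronunciations.txt", "work/morphs.txt"],
#                               ["data/pronunciations.txt", "data/segmentations.txt"])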
def split_words(target, source, env, words):
    """Run the MILA tokenizer and morphological analyzer over a word list and split it
    into words the analyzer recognizes ("good") and words it marks as unknown ("bad")."""
    good, bad = set(), set()
    r = True
    with temp_dir(remove=r) as raw, temp_dir(remove=r) as tokenized, temp_dir(remove=r) as analyzed:
        with meta_open(os.path.join(raw, "file.txt"), "w") as ofd:
            ofd.write(" ".join(words))
        cmd = env.subst("java -Xmx4024M -jar ${MILA_PATH}/tokenizer.jar %s %s" % (raw, tokenized))
        pid = Popen(cmd.split(), cwd=env.subst("${MILA_PATH}"), stdout=PIPE, stderr=PIPE)
        out, err = pid.communicate()
        cmd = env.subst("java -Xmx4024M -jar ${MILA_PATH}/morphAnalyzer.jar false %s %s" % (tokenized, analyzed))
        pid = Popen(cmd.split(), cwd=env.subst("${MILA_PATH}"), stdout=PIPE, stderr=PIPE)
        out, err = pid.communicate()
        with meta_open(os.path.join(analyzed, "file.xml")) as ifd:
            xml = et.parse(ifd)
        for token in xml.getiterator("token"):
            word = token.get("surface")
            unk = [x for x in token.getiterator("unknown")]
            if len(unk) == 0:
                good.add(word)
            else:
                bad.add(word)
    return (good, bad)
def alter_iv_oov(target, source, env):
    """
    NEEDS WORK!
    If the vocabulary has been expanded, some OOV terms are now IV.
    """
    iv_q, oov_q, iv, term_map, kw_file, w2w_file = source
    with meta_open(iv_q.rstr()) as iv_q_fd, meta_open(oov_q.rstr()) as oov_q_fd, meta_open(iv.rstr()) as iv_fd, \
            meta_open(term_map.rstr()) as term_map_fd, meta_open(kw_file.rstr()) as kw_file_fd, meta_open(w2w_file.rstr()) as w2w_fd:
        iv_queries = [x.strip() for x in iv_q_fd]
        oov_queries = [x.strip() for x in oov_q_fd]
        iv_words = [x.strip().split("(")[0] for x in iv_fd]
        oov_to_iv_indices = [i for i, q in enumerate(oov_queries) if all([x in iv_words for x in q.split()])]
        oov_to_oov_indices = enumerate([i for i, q in enumerate(oov_queries) if not all([x in iv_words for x in q.split()])])
        new_iv_queries = iv_queries + [oov_queries[i] for i in oov_to_iv_indices]
        new_oov_queries = [x for i, x in enumerate(oov_queries) if i not in oov_to_iv_indices]
        old_mapping = {(y[0], int(y[2])): y[1] for y in [x.strip().split() for x in term_map_fd]}
        new_mapping = old_mapping.copy()
        for i, old_oov_num in enumerate(oov_to_iv_indices):
            x = old_mapping[("oov", old_oov_num + 1)]
            new_iv_num = len(iv_queries) + i + 1
            del new_mapping[("oov", old_oov_num + 1)]
            new_mapping[("iv", new_iv_num)] = x
        for new_oov, old_oov in oov_to_oov_indices:
            x = old_mapping[("oov", old_oov + 1)]
            del new_mapping[("oov", old_oov + 1)]
            new_mapping[("oov", new_oov + 1)] = x
        new_w2w = [" ".join(y) for y in set([tuple(x.split()) for x in w2w_fd if len(x.split()) == 5] +
                                            [("0", "0", x, x, "0") for x in iv_words])]
        open(target[0].rstr(), "w").write("\n".join(new_iv_queries) + "\n")
        open(target[1].rstr(), "w").write("\n".join(new_oov_queries) + "\n")
        open(target[2].rstr(), "w").write("\n".join(["%s %s %0.5d" % (s, on, n) for (s, n), on in
                                                     sorted(new_mapping.iteritems(), lambda x, y: cmp(x[1], y[1]))]))
        open(target[3].rstr(), "w").write(kw_file_fd.read())
        open(target[4].rstr(), "w").write("\n".join(new_w2w) + "\n0\n")
    return None
def strip_logging(target, source, env):
    """Copy each source file to the corresponding target, dropping logging and assertion lines."""
    for t, s in zip(target, source):
        with meta_open(s.rstr()) as ifd:
            lines = [l for l in ifd if "logger.fin" not in l and "assert" not in l]
        with meta_open(t.rstr(), "w") as ofd:
            ofd.write("".join(lines))
    return None
def query_files(target, source, env):
    """Split a keyword query file into in-vocabulary and out-of-vocabulary terms.

    Sources: keyword XML file, in-vocabulary pronunciation file, language id, removal vocabulary
    Targets: IV query file, OOV query file, term map file, word-to-word file, rewritten keyword XML file
    """
    remove_vocab = source[-1].read()
    with meta_open(source[0].rstr()) as kw_fd, meta_open(source[1].rstr()) as iv_fd:
        keyword_xml = et.parse(kw_fd)
        keywords = set([(x.get("kwid"), x.find("kwtext").text.lower()) for x in keyword_xml.getiterator("kw")])
        vocab = [x.decode("utf-8") for x in Pronunciations(iv_fd).get_words()]
    if remove_vocab:
        remove_vocab = Vocabulary(meta_open(remove_vocab)).get_words()
    else:
        remove_vocab = []
    iv_keywords = sorted([(int(tag.split("-")[-1]), tag, term) for tag, term in keywords
                          if all([y in vocab for y in term.split()]) and term not in remove_vocab])
    oov_keywords = sorted([(int(tag.split("-")[-1]), tag, term) for tag, term in keywords
                           if any([y not in vocab for y in term.split()])])
    language_id = source[-2].read()
    with meta_open(target[0].rstr(), "w") as iv_ofd, meta_open(target[1].rstr(), "w") as oov_ofd, \
            meta_open(target[2].rstr(), "w") as map_ofd, meta_open(target[3].rstr(), "w") as w2w_ofd, \
            meta_open(target[4].rstr(), "w") as kw_ofd:
        iv_ofd.write("\n".join([x[2].encode("utf-8") for x in iv_keywords]))
        oov_ofd.write("\n".join([x[2].encode("utf-8") for x in oov_keywords]))
        map_ofd.write("\n".join(["%s %.5d %.5d" % x for x in
                                 sorted([("iv", gi, li) for li, (gi, tag, term) in enumerate(iv_keywords, 1)] +
                                        [("oov", gi, li) for li, (gi, tag, term) in enumerate(oov_keywords, 1)],
                                        lambda x, y: cmp(x[1], y[1]))]))
        w2w_ofd.write("\n".join([("0 0 %s %s 0" % (x.encode("utf-8"), x.encode("utf-8"))) for x in vocab if x != "VOCAB_NIL_WORD"] + ["0"]))
        for x in keyword_xml.getiterator("kw"):
            x.set("kwid", "KW%s-%s" % (language_id, x.get("kwid").split("-")[-1]))
        keyword_xml.write(kw_ofd)
    return None
def probability_list_to_vocabulary(target, source, env):
    """Convert a word probability list into a vocabulary file."""
    with meta_open(source[0].rstr()) as ifd:
        probs = ProbabilityList(ifd)
    with meta_open(target[0].rstr(), "w") as ofd:
        vocab = Vocabulary.from_set(probs.get_words())
        ofd.write(vocab.format())
    return None
def perform_search(target, source, env):
    """Searches for each query term in the index.

    Sources: index file, phone symbol file, index symbol file, fst header, keyword file 1, keyword file 2 ...
    Targets: search result file
    """
    index, phone_symbols, index_symbols, fst_header = source[0:4]
    terms = source[4:]
    prune = env.get("PRUNE")
    results = []
    for f in terms:
        with meta_open(f.rstr()) as ifd:
            for fname in ifd.getnames():
                key = fname.split(".")[0]
                query = ifd.extractfile(fname).read()
                cmd = env.subst("fstcompose - ${SOURCES[0]} | fstprune -weight=${PRUNE} - | fstrmepsilon | "
                                "fstprint -isymbols=${SOURCES[1]} -osymbols=${SOURCES[2]} - | cat ${SOURCES[3]} - | "
                                "bin/FsmOp -out-cost - -n-best 50000 -gen | perl bin/process.1.pl - 100 1e-40 %s | "
                                "sort -k 5 -gr | perl ${CN_KWS_SCRIPTS}/clean_result.words.pl -" % key,
                                source=source, target=target)
                pid = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
                out, err = pid.communicate(query)
                if not re.match(r"^\s*$", out):
                    for line in out.strip().split("\n"):
                        toks = line.split()
                        toks[3] = key
                        results.append(" ".join(toks))
    with meta_open(target[0].rstr(), "w", None) as ofd:
        ofd.write("\n".join(results).strip() + "\n")
    return None
def freqs_to_tab(target, source, env):
    """Merge per-subject word frequency files into a single tab-separated table of relative frequencies."""
    items = {}
    data = {}
    tempdata = {}
    totals = {}
    avs = {}
    for f in source[0].children():
        subject = os.path.basename(f.rstr()).split("_")[0]
        tempdata[subject] = {}
        for r in csv.reader(meta_open(f.rstr()), delimiter="\t"):
            if r[1] == "COUNT":
                continue
            tempdata[subject][r[0]] = r[1]
            totals[subject] = totals.get(subject, 0) + int(r[1])
            items[r[0]] = items.get(r[0], 0) + int(r[1])
    for s, wordcounts in tempdata.iteritems():
        data[s] = {}
        for w, c in wordcounts.iteritems():
            data[s][w] = float(tempdata[s].get(w, 0)) / float(totals[s])
            avs[w] = avs.get(w, []) + [data[s][w]]
    # keep the (at most) 2000 words with the highest summed relative frequency
    items = sorted([x for x, y in items.iteritems()],
                   lambda x, y: cmp(sum(avs[y]), sum(avs[x])))[0:min(2000, len(items))]
    fd = csv.writer(meta_open(target[0].rstr(), "w"), delimiter="\t")
    fd.writerow(["SUBJECT"] + ["'%s'" % esc_weka(x) for x in items])
    for k, v in data.iteritems():
        fd.writerow(["'%s'" % esc_weka(k)] + [v.get(x, 0) for x in items])
    return None
def prepare_segmentations_for_release(target, source, env):
    """Intended to produce files suitable for shipping directly to partners.

    This is not well-planned, and should probably not be used as-is, but I think it
    would be useful to incorporate something automatic into the build system to make
    our deliverables more consistent.

    Sources: segmentation file 1, word file 1, segmentation file 2, word file 2 ...
    Targets: deliverable file 1, deliverable file 2 ...
    """
    nag = env.get("NON_ACOUSTIC_GRAPHEMES")
    rx_str = "^(%s)+$" % ("|".join([unichr(int(x, base=16)) for x in nag]))
    rx = re.compile(rx_str)
    for (seg_file, word_file), out in zip(pairs(source, 2), target):
        with meta_open(seg_file.rstr()) as ifd:
            data = [line.strip().split() for line in ifd]
        morphs = {"".join([x.strip("+") for x in ms]): ms for ms in data}
        with meta_open(word_file.rstr()) as ifd:
            lines = [l.strip().split() for l in ifd if "_" not in l]
        for words in lines:
            for word in sum([x.split("-") for x in words], []):
                if word != "" and word not in morphs and "_" not in word and "<" not in word and not re.match(r"^\d+$", word):
                    return "%s, %s, %s" % (seg_file, word_file, word)
        with meta_open(out.rstr(), "w") as ofd:
            for morph, seg in sorted(morphs.iteritems()):
                ofd.write("%s\t%s\n" % (morph, " ".join(seg)))
    return None
def conllish_to_xml(target, source, env):
    """Read a CoNLL-style file (word/tag pairs, blank-line-separated sentences) into a DataSet and write it out."""
    with meta_open(source[0].rstr()) as ifd:
        sentences = [[(w, t, []) for w, t in [re.split(r"\s+", x) for x in s.split("\n") if not re.match(r"^\s*$", x)]]
                     for s in re.split(r"\n\n", ifd.read(), flags=re.M)]
    data = DataSet.from_sentences(sentences)
    with meta_open(target[0].rstr(), "w") as ofd:
        data.write(ofd)
    return None
def apply_morfessor(target, source, env):
    """Applies a trained Morfessor model to an unseen word list.

    Sources: morfessor model file, word list or keyword XML file(s)
    Targets: segmented word list
    """
    parser = get_default_argparser()
    args = parser.parse_args([])
    io = MorfessorIO(encoding=args.encoding, compound_separator=args.cseparator, atom_separator=args.separator)
    model = io.read_binary_model_file(source[0].rstr())
    words = []
    terms = {}
    for fname in source[1:]:
        try:
            # keyword XML input: collect the words from each query's text
            with meta_open(fname.rstr(), enc=None) as ifd:
                for t in et.parse(ifd).getiterator("kw"):
                    text = list(t.getiterator("kwtext"))[0].text
                    words += text.strip().split()
        except:
            # plain word list input: one word per line
            with meta_open(fname.rstr()) as ifd:
                words = [l.strip().split()[0] for l in ifd]
    words = set(sum([w.strip("-").split("-") for w in words if "_" not in w], []))
    for w in words:
        toks, score = model.viterbi_segment(w)
        if len(toks) >= 2:
            toks = ["%s+" % toks[0]] + ["+%s+" % t for t in toks[1:-1]] + ["+%s" % toks[-1]]
        terms[w] = toks
    with meta_open(target[0].rstr(), "w") as ofd:
        ofd.write(("\n".join(sorted(["%s" % (" ".join(v)) for k, v in terms.iteritems()]))) + "\n")
    return None
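# Illustration of the marking scheme above (the word and its segmentation are invented):
# if the Morfessor model segments "unhappiness" into ["un", "happi", "ness"], the entry
# written for it is "un+ +happi+ +ness", i.e. "+" marks every morph boundary on both of
# its sides, so word-internal morphs carry it on both ends.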
def train_language_model(target, source, env):
    """Train an n-gram language model using a plain text transcript.

    Uses IBM's compiled LM tools that ship with Attila.  This can also be used on a
    segmented transcript, in which case the n-grams are over morphs rather than words.

    Sources: transcript file, n
    Targets: language model file
    """
    text_file = source[0].rstr()
    n = source[1].read()
    with temp_dir() as prefix_dir, temp_file() as vocab_file, temp_file(suffix=".txt") as sentence_file, meta_open(text_file) as text_fd:
        sentences = ["<s> %s </s>" % (l.strip()) for l in text_fd]
        words = set(sum([s.split() for s in sentences], []) + ["<s>", "</s>", "<UNK>"])
        with meta_open(vocab_file, "w") as ofd:
            ofd.write("\n".join(words))
        with meta_open(sentence_file, "w") as ofd:
            ofd.write("\n".join(sentences))
        prefix = os.path.join(prefix_dir, "counts")
        cmd = "${ATTILA_PATH}/tools/lm_64/CountNGram -n %d %s %s %s" % (n, sentence_file, vocab_file, prefix)
        out, err, success = run_command(env.subst(cmd))
        if not success:
            return err
        lm = ".".join(target[0].rstr().split(".")[0:-2])
        cmd = "${ATTILA_PATH}/tools/lm_64/BuildNGram.sh -n %d -arpabo %s %s" % (n, prefix, lm)
        out, err, success = run_command(env.subst(cmd), env={"SFCLMTOOLS": env.subst("${ATTILA_PATH}/tools/lm_64")})
        if not success:
            return err
    return None
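# Hypothetical invocation from a SConscript (builder name and paths are assumptions, not
# taken from the original build files):
#
#   env.Append(BUILDERS={"TrainLanguageModel": Builder(action=train_language_model)})
#   env.TrainLanguageModel("work/lm.3gm.arpabo.gz", ["work/transcript.txt", Value(3)])
#
# Note that the target name needs at least two trailing extensions, since the action
# strips the last two dot-separated components to form the prefix it passes to
# BuildNGram.sh.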
def gaussier_morph(target, source, env):
    """Induce suffix-pair morphology (Gaussier-style) and write the relational families as a Proteus XML report."""
    if isinstance(source[0], Value):
        words = source[0].read()
    else:
        words = set([x.text for x in et.parse(meta_open(source[0].rstr())).getiterator("f")
                     if x.attrib["name"] == "morph" and x.text and len(x.text) > 4])
    pairs = gaussier.suffix_pairs(words, min_psimilarity=int(env["MIN_PSIM"]), min_occurrence=int(env["MIN_OCCURRENCES"]))
    rel_fams = dict([(i, [r + s for s in x[0] for r in x[1]]) for i, x in enumerate(pairs.iteritems())])
    p = Proteus(fams=rel_fams,
                title="Gaussier Morphology, min. p-similarity=%s, unique words=%s, min. occurrences=%s, clustering=(method=%s, threshold=%s)" % (
                    env["MIN_PSIM"], env["WORD_COUNT"], env["MIN_OCCURRENCES"], env["CLUSTERING_METHOD"], env["CLUSTERING_THRESHOLD"]))
    with meta_open(target[0].rstr(), "w") as fd:
        fd.write("""<?xml version="1.0" encoding="utf-8"?>
<?xml-stylesheet href="morphology.xsl" type="text/xsl"?>""")
        p.write(fd)
    return None
def create_data_list(target, source, env):
    """
    NEEDS WORK!
    Creates the master list of lattice transformations.
    """
    args = source[-1].read()
    data = {}
    for line in meta_open(source[0].rstr()):
        toks = line.split()
        bn = os.path.basename(toks[2])
        data[toks[0]] = data.get(toks[0], {})
        data[toks[0]][toks[1]] = (bn, toks[4], toks[5])
    ofd = meta_open(target[0].rstr(), "w")
    for lattice_file in glob(os.path.join(args["LATTICE_DIR"], "*")):
        bn = os.path.basename(lattice_file)
        path = os.path.join(env["BASE_PATH"], "lattices")
        uttname, delim, uttnum = re.match(r"(.*)([^\w])(\d+)\.%s$" % (args["oldext"]), bn).groups()
        try:
            name, time, timeend = data[uttname][uttnum]
            newname = os.path.abspath(os.path.join(path, "%s%s%s.%s" % (uttname, delim, uttnum, args["ext"])))
            ofd.write("%s %s %s %s %s.osym %s\n" % (os.path.splitext(name)[0], time, timeend, newname, newname, os.path.abspath(lattice_file)))
        except:
            return "lattice file not found in database: %s (are you sure your database file matches your lattice directory?)" % bn
    ofd.close()
    return None
def index_to_symbol_tables(target, source, env):
    """Create symbol and transducer symbol tables based on an index file.

    Sources: index file
    Targets: symbol file, bsymbol file
    """
    osyms = set()
    bsyms = set()
    with meta_open(source[0].rstr()) as ifd:
        for line in ifd:
            toks = line.strip().split()
            if len(toks) == 1:
                continue
            isym = toks[2]
            osym = toks[3]
            osyms.add(osym)
            if not (isym == "<epsilon>" and osym == "<epsilon>"):
                bsym = ("%s:%s" % (isym, osym)).replace("<epsilon>:", "").replace(":<epsilon>", "")
                bsyms.add(bsym)
    with meta_open(target[0].rstr(), "w") as osym_ofd:
        osym_ofd.write("<epsilon> 0\n")
        for i, osym in enumerate(osyms):
            osym_ofd.write("%s %d\n" % (osym, i + 1))
    with meta_open(target[1].rstr(), "w") as bsym_ofd:
        bsym_ofd.write("<epsilon> 0\n")
        for i, bsym in enumerate(bsyms):
            bsym_ofd.write("%s %d\n" % (bsym, i + 1))
    return None
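# Worked example for the function above (the index line is invented): given an arc line
# "0 1 a x 0.5", toks[2] ("a") is the input symbol and toks[3] ("x") the output symbol,
# so "x" is added to the symbol table and "a:x" to the bsymbol table; arcs whose input
# and output are both "<epsilon>" contribute nothing, and single-token (final-state)
# lines are skipped.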
def rasp_parse(target, source, env):
    """
    Parse one file, one sentence per line.
    """
    pid = Popen(["/bin/sh", "/home/tom/parsers/rasp/scripts/rasp.sh"],
                cwd="/home/tom/parsers/rasp", stdin=PIPE, stdout=PIPE, stderr=PIPE)
    out, err = pid.communicate("\n".join([meta_open(f.rstr()).read() for f in source[0:-1]]))
    meta_open(target[0].rstr(), "w").write(out)
    return None
def pronunciations_to_vocab_dict(target, source, env):
    """Convert a pronunciation file to a vocabulary file (IBM format).

    Sources: pronunciation file, dictionary file, graphemic flag
    Targets: vocabulary file, pronunciation dictionary file
    """
    graphemic = source[-1].read()
    prons = {}
    with meta_open(source[0].rstr()) as ifd:
        for l in ifd:
            try:
                morph, num, prob, phones = l.strip().split("\t")
            except:
                try:
                    morph, num, prob = l.strip().split("\t")
                except:
                    try:
                        morph, phones = l.strip().split("\t")
                        num = "1"
                    except:
                        morph = l.strip()
                        phones = "SIL"
                        num = "1"
            num = int(num) + 1
            prons["%s(%.2d)" % (morph, num)] = (morph, phones.split())
    with meta_open(target[0].rstr(), "w") as vocab_ofd, meta_open(target[1].rstr(), "w") as dict_ofd:
        # word-boundary markers are currently disabled (would be ["[", "wb", "]"])
        wb = []
        for w, (m, p) in prons.iteritems():
            if not graphemic:
                if len(p) == 1:
                    p = p + wb
                else:
                    p = [p[0]] + wb + p[1:] + wb
            dict_ofd.write("%s %s\n" % (w, " ".join(p)))
            vocab_ofd.write("%s %s\n" % (w, m))
        vocab_ofd.write("""<s>(01) <s>
</s>(01) </s>
~SIL(01) VOCAB_NIL_WORD 1.4771
~SIL(02) VOCAB_NIL_WORD 1.4771
~SIL(03) VOCAB_NIL_WORD 1.4771
""")
        # the graphemic and non-graphemic cases use identical silence entries
        dict_ofd.write("""<s>(01) SIL
</s>(01) SIL
~SIL(01) SIL
~SIL(02) NS
~SIL(03) VN
""")
    return None
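# Worked example for the function above (the input line is invented): a pronunciation
# line "tab\t0\t1.0\tt a b" produces the vocabulary entry "tab(01) tab" and the
# dictionary entry "tab(01) t a b"; the variant number from the file is incremented by
# one and zero-padded to two digits.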
def create_subset(target, source, env):
    amount = source[1].read()
    method = source[2].read()
    with meta_open(source[0].rstr()) as ifd:
        data = DataSet.from_stream(ifd)
    subset = data.get_subset(range(amount))
    with meta_open(target[0].rstr(), "w") as ofd:
        subset.write(ofd)
    return None
def evaluate_morphology(target, source, env):
    """Compare predicted morphological analyses to gold analyses (scoring not yet implemented)."""
    with meta_open(source[0].rstr()) as gold_fd, meta_open(source[1].rstr()) as pred_fd:
        gold = DataSet.from_stream(gold_fd)
        pred = DataSet.from_stream(pred_fd)
    gold_analyses = gold.get_analyses()
    pred_analyses = pred.get_analyses()
    with meta_open(target[0].rstr(), "w") as ofd:
        # placeholder: always writes a perfect score
        ofd.write("%f\n" % 1.0)
    return None
def scf_sents(target, source, env):
    """Write out the sentences (with their file and index) that match any of the verb patterns."""
    fd = meta_open(target[0].rstr(), "w")
    regexes = [re.compile(r"[^V]*%s\S+V.*" % x) for x in source[0].read()]
    for i, f in enumerate(env["FILES"]):
        logging.info("%d %s", i, f)
        for j, s in enumerate(re.split(r"\n\s*\n", meta_open(f).read())):
            if any([x.match(s) for x in regexes]):
                fd.write("%s %d\n" % (f, j) + s.strip() + "\n\n\n")
    return None
def lattice_list(target, source, env):
    """
    Creates a file that's simply a list of the lattices in the given directory
    (absolute paths, one per line).
    """
    lattice_dir = source[1].read()
    if not os.path.exists(lattice_dir):
        return "No such directory: %s" % lattice_dir
    meta_open(target[0].rstr(), "w").write("\n".join([os.path.abspath(x) for x in glob(os.path.join(lattice_dir, "*"))]) + "\n")
    return None
def conll_to_data(target, source, env):
    args = source[-1].read()
    if args.get("verbs"):
        keep_verbs = [x.split()[0] for x in meta_open(args["verbs"])]
    else:
        keep_verbs = []
    lookups = dict([(x, {}) for x in ["verb_to_id", "gr_to_id", "lemma_to_id"]])
    data = dict([(x, []) for x in ["instance_starts", "instance_verbs", "instance_lengths", "instance_grs"]])
    for fname in args.get("inputs", []):
        this_verb = None
        if "0parsed" in fname:
            this_verb = re.match(r".*0parsed\.(.*?)\..*", fname).group(1)
            if len(keep_verbs) > 0 and this_verb not in keep_verbs:
                continue
        fd = meta_open(fname)
        text = fd.read()
        fd.close()
        for stext in sentence_rx.split(text.strip()):
            try:
                sent = Sentence(stext)
            except:
                continue
            for verb in [x for x in sent if x.pos.startswith("V") and x.gr not in ["auxpass", "cop"] and
                         (not keep_verbs or x.lemma in keep_verbs) and (not this_verb or this_verb == x.lemma)]:
                try:
                    if verb.head():
                        if verb.head().pos[0] in fpos:
                            grs = ["%s(%s-%s, %s)" % (verb.gr, verb.head().pos, verb.head().lemma, verb.pos)]
                        else:
                            grs = ["%s(%s, %s)" % (verb.gr, verb.head().pos, verb.pos)]
                        lemmas = [verb.head().lemma]
                    else:
                        grs = []
                        lemmas = []
                except:
                    continue
                for tok in sent:
                    if tok.head_index == verb.index:
                        if tok.pos[0] in fpos:
                            grs.append("%s(%s, %s-%s)" % (tok.gr, verb.pos, tok.pos, tok.lemma))
                        else:
                            grs.append("%s(%s, %s)" % (tok.gr, verb.pos, tok.pos))
                        lemmas.append(tok.lemma)
                if len(grs) == 0:
                    continue
                lookups["verb_to_id"][verb.lemma] = lookups["verb_to_id"].get(verb.lemma, len(lookups["verb_to_id"]) + 1)
                data["instance_starts"].append(len(data["instance_grs"]) + 1)
                data["instance_verbs"].append(lookups["verb_to_id"][verb.lemma])
                data["instance_lengths"].append(len(grs))
                for gr in grs:
                    lookups["gr_to_id"][gr] = lookups["gr_to_id"].get(gr, len(lookups["gr_to_id"]) + 1)
                    data["instance_grs"].append(lookups["gr_to_id"][gr])
    data["verbs"] = len(lookups["verb_to_id"])
    data["grs"] = len(lookups["gr_to_id"])
    data["ns"] = max(data["instance_lengths"])
    pickle.dump((data, lookups), meta_open(target[0].rstr(), "w"))
    return None
def top_words(target, source, env):
    """Keep the N most probable words and the corresponding pronunciations."""
    args = source[-1].read()
    with meta_open(source[0].rstr()) as words_ifd, meta_open(source[1].rstr()) as pron_ifd:
        top = ProbabilityList(words_ifd).get_top_n(args["COUNT"])
        prons = Pronunciations(pron_ifd)
        prons.filter_by(top)
    with meta_open(target[0].rstr(), "w") as words_ofd, meta_open(target[1].rstr(), "w") as pron_ofd:
        words_ofd.write(top.format())
        pron_ofd.write(prons.format())
    return None
def fst_compile(target, source, env):
    """
    Compile an FST using OpenFST's binary 'fstcompile'.
    """
    command = env.subst("${FSTCOMPILE} --isymbols=${SOURCES[0]} --osymbols=${SOURCES[0]} ${SOURCES[1]}",
                        target=target, source=source)
    stdout, stderr, success = run_command(command, env={"LD_LIBRARY_PATH": env.subst(env["LIBRARY_OVERLAY"])},
                                          stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if not success:
        return stderr
    meta_open(target[0].rstr(), "w").write(stdout)
    return None
def merge_scores(target, source, env):
    """
    NEEDS WORK! CONVERT TO BUILDER!
    """
    stdout, stderr, success = run_command(env.subst("${MERGESCORESSUMPOSTNORMPL} ${SOURCES[0]}", target=target, source=source),
                                          env={"LD_LIBRARY_PATH": env.subst("${LIBRARY_OVERLAY}")})
    if not success:
        return stderr
    meta_open(target[0].rstr(), "w").write(stdout)
    return None
def cluster_verbs(target, source, env):
    """Cluster verbs with k-means (via R) over their normalized feature counts."""
    args = source[-1].read()
    verbs, samples = pickle.load(meta_open(source[0].rstr()))
    samples = samples.sum(2)
    data = numpy.transpose(samples.T / samples.sum(1))
    res = stats.kmeans(numpy2ri(data), centers=args.get("clusters", 20))
    ofd = meta_open(target[0].rstr(), "w")
    for c in set(res.rx2("cluster")):
        ofd.write(" ".join([verbs[i] for i, a in enumerate(res.rx2("cluster")) if a == c]) + "\n")
    return None
def plot_lsa(target, source, env):
    """Assign words to LSA clusters via vector quantization and write out the clusters."""
    args = source[-1].read()
    words, codebook, distortion, whitened = cPickle.load(meta_open(source[0].rstr()))
    words = [x.get("_NAME") for x in words]
    assignments = zip(vq(whitened, codebook)[0], words)
    clusters = dict([(x, [y[1] for y in assignments if y[0] == x]) for x in set([y[0] for y in assignments])])
    fd = meta_open(target[0].rstr(), "w")
    for k, v in clusters.iteritems():
        fd.write("%s\n\t%s\n\n" % (k, "\n\t".join(v).encode("utf-8")))
    return None
def evaluate_scfs(target, source, env):
    """Evaluate an induced subcategorization frame (SCF) lexicon against a gold standard."""
    args = source[-1].read()
    raw = tempfile.NamedTemporaryFile(dir=env["TEMP_DIR"], delete=False, prefix="tlraw")
    traw = tempfile.NamedTemporaryFile(dir=env["TEMP_DIR"], delete=False, prefix="tltraw")
    tout = tempfile.NamedTemporaryFile(dir=env["TEMP_DIR"], delete=False, prefix="tlout")
    lexicon = tempfile.NamedTemporaryFile(dir=env["TEMP_DIR"], delete=False, prefix="tllex")
    gold = tempfile.NamedTemporaryFile(dir=env["TEMP_DIR"], delete=False, prefix="tlgold")
    if source[1].rstr().endswith(".tgz"):
        tf = tarfile.open(source[1].rstr(), "r:gz")
        traw.write(tf.extractfile([x for x in tf.getnames() if x.endswith("count.scf")][0]).read())
    else:
        traw.write(meta_open(source[1].rstr()).read())
    traw.close()
    raw_v = set([l.split()[0] for l in meta_open(traw.name)])
    lex_v = set([l.split()[0] for l in meta_open(source[0].rstr())])
    gold_v = set([l.split()[0] for l in meta_open(source[2].rstr()) if len(l.split()) == 1])
    if "FILTER_BY" in args:
        filt_v = set([l.strip() for l in meta_open(args["FILTER_BY"])])
        verbs = set.intersection(raw_v, lex_v, gold_v, filt_v)
    else:
        verbs = set.intersection(raw_v, lex_v, gold_v)
    rx = re.compile(r"^(%s)\s+" % ("|".join(verbs)))
    for l in [l for l in meta_open(traw.name) if rx.match(l)]:
        verb, scf, freq, count = l.strip().split()
        scf = scf.split("_")[0].lstrip("0")
        raw.write("%s %s %s %s\n" % (verb, scf, freq, count))
    for l in [l for l in meta_open(source[0].rstr()) if rx.match(l)]:
        verb, scf, freq, count = l.strip().split()
        scf = scf.split("_")[0].lstrip("0")
        lexicon.write("%s %s %s %s\n" % (verb, scf, freq, count))
    gold.write(meta_open(source[2].rstr()).read())
    gold.close()
    raw.close()
    lexicon.close()
    pid = Popen(["%s/scripts/eval-scf-counts.pl" % env["SUBCAT_2009"], lexicon.name, tout.name, "-raw", raw.name, "-gold", gold.name])
    pid.communicate()
    text = open(tout.name).read()
    xml = et.TreeBuilder()
    xml.start("xml", {})
    for k, v in [x for x in args.iteritems() if not x[0].startswith("_")]:
        xml.start(k, {})
        xml.data(str(v))
        xml.end(k)
    xml.start("text", {})
    xml.data(text)
    xml.end("text")
    xml.end("xml")
    meta_open(target[0].rstr(), "w").write(et.tostring(xml.close()))
    os.remove(lexicon.name)
    os.remove(gold.name)
    os.remove(raw.name)
    os.remove(traw.name)
    os.remove(tout.name)
    return None
def graphemic_pronunciations(target, source, env):
    """Convert a list of words into a list of graphemic pronunciations.

    Sources: word list file
    Targets: graphemic pronunciation file
    """
    with meta_open(source[0].rstr()) as ifd:
        items = [x.strip() for x in ifd]
    with meta_open(target[0].rstr(), "w") as ofd:
        # keep only letter characters, skipping the Cyrillic soft and hard signs
        ofd.write("\n".join(["%s\t%s" % (w, " ".join(["u%.4x" % (ord(c)) for c in w
                                                      if unicodedata.category(c)[0] == "L" and c not in [unichr(1100), unichr(1098)]]))
                             for w in items]))
    return None
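# Worked example for the function above: the word "dog" becomes the line
# "dog\tu0064 u006f u0067", i.e. each letter is rendered as "u" followed by its
# four-digit hexadecimal code point; non-letter characters and the Cyrillic soft and
# hard signs (U+044C, U+044A) are dropped.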
def rtm_to_data(target, source, env):
    """Convert a transcript file (one utterance per line, words starting at the sixth column) into a DataSet."""
    sentences = []
    with meta_open(source[0].rstr()) as ifd:
        for sentence in ifd:
            words = [w for w in sentence.split()[5:] if w not in ["(())", "IGNORE_TIME_SEGMENT_IN_SCORING"]]
            if len(words) > 0:
                sentences.append(words)
    dataset = DataSet.from_sentences([[(w, None, []) for w in s] for s in sentences])
    with meta_open(target[0].rstr(), "w") as ofd:
        dataset.write(ofd)
    return None