def recoverUnknown(f_unknown_p, f_unknown, f_align_p, f_clean): """ Shows us the Foreign sentence that produced no formality while the English sentence had a "you". """ print "Recovering unknown sentences" unknown = loadStruct(f_unknown_p) align = loadStruct(f_align_p) with copen(f_unknown, "w", encoding="utf-8") as unknown_f: for doc, proj in unknown.iteritems(): if len(proj) > 0: de = [] links = align[doc] for p in proj: for link in links: if p in link[1].split(" "): de.extend(link[0].split(" ")) with copen(f_clean + doc[0].replace(".gz", "")) as doc_f: dom = parse(doc_f) nodes = dom.getElementsByTagName("s") for node in nodes: if node.getAttribute("id") in de: unknown_f.write("%s\n" % node.firstChild.nodeValue)
def computSentenceLength(f_align_p, f_clean): align_p_f = loadStruct(f_align_p) min = maxint max = 0 total = 0 avg = 0.0 longest = [] for k in align_p_f.iterkeys(): with open(f_clean + k[1].replace(".gz", "")) as clean_f: dom = parse(clean_f) nodes = dom.getElementsByTagName("s") for node in nodes: sentence = node.firstChild.nodeValue.split(" ") length = len(sentence) if length > max and "~" not in sentence and "subtitles" not in sentence and "subber" not in sentence: max = length longest = sentence if length != 0 and length < min: min = length avg += length total += len(nodes) print "Min: %d\nMax: %d\nAvg: %f\n" % (min, max, (avg / total)) print longest
def plainCopyDocuments(f_align_p, f_corpus, f_clean): """ Copies the files with alignments to a seperate folder. """ align_p_f = loadStruct(f_align_p) print "Copying %d documents" % len(align_p_f) for key in align_p_f.iterkeys(): to_de = f_clean + key[0] to_en = f_clean + key[1] createPath(to_de) createPath(to_en) copy(f_corpus + key[0], to_de) copy(f_corpus + key[1], to_en)
def compareSentenceCount(misc): """ Compares sentence count from grep -c "s id" for checking purposes. Save the grep output as lang_count.txt. """ print "Comparing sentence counts" for lang in langs: a = loadStruct(misc + lang + "_align.p") b = {} with open(misc + "%s_count.txt" % lang) as counts: for line in counts: k, v = line.strip().split(":") b.setdefault(k, v) for k1, v1 in a.iteritems(): for k2, v2 in b.iteritems(): if k1 == k2: if str(v1) != v2: print k1, k2 print v1, v2
def createProjection(f_align_p, f_stats, f_clean, f_proj, f_unknown_p):
    """ Creates the projection based on rules.

    For every aligned document pair, tags each German sentence with
    formality markers (informal i=1 if it contains "du"/"Du"; formal f=1 if
    "Sie" occurs after the first token), projects the resulting <s> meta
    tag onto the aligned English sentence ids, and writes one text file per
    document under f_proj.  Aggregate counts are appended to f_stats and
    the ids of English "you"-sentences that received no formality are
    pickled to f_unknown_p.
    """
    fcount = 0    # documents processed (progress reporting)
    de_count = 0  # total German <s> nodes seen
    en_count = 0  # total English ids that received a projection
    pos = 0       # EN sentences with "you" and a formality marker
    neg = 0       # EN sentences with "you" but no formality (-> unknown)
    lost = 0      # EN sentences without "you" and without formality tag match
    nn = 0        # DE sentences: neither formal nor informal
    on = 0        # DE sentences: formal only
    no = 0        # DE sentences: informal only
    oo = 0        # DE sentences: both markers (ambiguous)
    de_lost = 0   # DE sentences dropped because both markers were present
    scount = 0    # EN sentences actually written to the projection files
    align_p_f = loadStruct(f_align_p)
    total = len(align_p_f)
    unknown = {}
    for lang, rels in align_p_f.iteritems():
        fcount += 1
        if fcount % 500 == 0 or fcount == total or fcount == 1:
            print "Documents: %d/%d" % (fcount, total)
        # lang[0] is the German document path; lang[1] the English one.
        with copen(f_clean + lang[0].replace(".gz", "")) as xml_f:
            proj = {}  # English sentence id -> projected <s ...> meta tag
            dom = parse(xml_f)
            nodes = dom.getElementsByTagName("s")
            de_count += len(nodes)
            for link in rels:
                for node in nodes:
                    id_de = node.getAttribute("id")
                    links_de = link[0].split(" ")
                    # Only project links that actually have an English side.
                    if id_de in links_de and link[1] != "":
                        sentence = node.firstChild.nodeValue.split(" ")
                        meta = "<s id=\"0\" f=\"0\" i=\"0\">"
                        # "du"/"Du" anywhere -> informal.
                        if "du" in sentence or "Du" in sentence:
                            meta = meta.replace("i=\"0\"", "i=\"1\"")
                        # "Sie" after the first token -> formal (a leading
                        # capitalized "Sie" could just be "they/she").
                        if "Sie" in sentence[1:]:
                            meta = meta.replace("f=\"0\"", "f=\"1\"")
                        if "f=\"0\" i=\"0\"" in meta:
                            nn += 1
                        elif "f=\"1\" i=\"0\"" in meta:
                            on += 1
                        elif "f=\"0\" i=\"1\"" in meta:
                            no += 1
                        elif "f=\"1\" i=\"1\"" in meta:
                            oo += 1
                        # Ambiguous sentences (both formal and informal) are
                        # not projected at all.
                        if "f=\"1\" i=\"1\"" not in meta:
                            for id_en in link[1].split(" "):
                                proj[id_en] = meta.replace("id=\"0\"", "id=\"%s\"" % id_en)
                        else:
                            de_lost += 1
            en_count += len(proj)
        with copen(f_clean + lang[1].replace(".gz", "")) as xml_e:
            unknown.setdefault(lang, [])
            # Flatten the English path into a single projection file name,
            # e.g. "en/a/b.xml.gz" -> "a_b.txt".
            fname_e = f_proj + "_".join(lang[1].split("/")).replace(".xml.gz", ".txt").replace("en_", "")
            createPath(fname_e)
            with copen(fname_e, "w", encoding="utf-8") as txt_e:
                txt_e.write("<d src=\"%s\">\n" % lang[0].replace(".gz", ""))
                dom_e = parse(xml_e)
                nodes_e = dom_e.getElementsByTagName("s")
                for node in nodes_e:
                    id_e = node.getAttribute("id")
                    sent_e = node.firstChild.nodeValue
                    if id_e in proj:
                        proj_e = proj[id_e]
                        s_sent_e = sent_e.split(" ")
                        if "you" in s_sent_e and "f=\"0\" i=\"0\"" not in proj_e:
                            # "you" present and formality projected: keep.
                            pos += 1
                            scount += 1
                            txt_e.write("%s%s</s>\n" % (proj_e, sent_e))
                        elif "you" in s_sent_e and "f=\"0\" i=\"0\"" in proj_e:
                            # "you" present but no formality: remember for
                            # later recovery (see recoverUnknown).
                            neg += 1
                            unknown[lang].append(id_e)
                        elif "you" not in s_sent_e and "f=\"0\" i=\"0\"" in proj_e:
                            # No "you" and no formality: neutral, keep.
                            scount += 1
                            txt_e.write("%s%s</s>\n" % (proj_e, sent_e))
                        elif "you" not in s_sent_e and "f=\"0\" i=\"0\"" not in proj_e:
                            # Formality projected onto a sentence without
                            # "you": discard, count as lost.
                            lost += 1
                txt_e.write("</d>\n")
                txt_e.flush()
    with open(f_stats, "a") as stats:
        stats.write("PROJECTED DE_%d TO %d_EN\n"
                    "DE 0 0: %d\n"
                    "DE 1 0: %d\n"
                    "DE 0 1: %d\n"
                    "DE 1 1: %d\n"
                    "Y-Found: %d\n"
                    "Y-NotFound: %d\n"
                    "F-Lost: %d\n"
                    "Sentences: %d\n"
                    "DE no EN: %d" % (de_count, en_count, nn, on, no, oo, pos, neg, lost, scount, de_lost))
    dumpStruct(f_unknown_p, unknown)
# NOTE(review): this is an exact duplicate (only reformatted) of the
# createProjection defined earlier in this file.  At import time this later
# definition silently shadows the first; one of the two should be removed.
def createProjection(f_align_p, f_stats, f_clean, f_proj, f_unknown_p):
    """ Creates the projection based on rules.

    Tags each German sentence with formality markers ("du"/"Du" -> informal,
    "Sie" after the first token -> formal), projects the resulting <s> meta
    tag onto the aligned English sentences, writes one projection file per
    document under f_proj, appends aggregate counts to f_stats, and pickles
    the unresolved English "you"-sentence ids to f_unknown_p.
    """
    fcount = 0    # documents processed (progress reporting)
    de_count = 0  # total German <s> nodes seen
    en_count = 0  # total English ids that received a projection
    pos = 0       # EN "you"-sentences with a formality marker
    neg = 0       # EN "you"-sentences without formality (-> unknown)
    lost = 0      # EN sentences without "you" and without formality match
    nn = 0        # DE: neither formal nor informal
    on = 0        # DE: formal only
    no = 0        # DE: informal only
    oo = 0        # DE: both markers (ambiguous)
    de_lost = 0   # DE sentences dropped as ambiguous
    scount = 0    # EN sentences written out
    align_p_f = loadStruct(f_align_p)
    total = len(align_p_f)
    unknown = {}
    for lang, rels in align_p_f.iteritems():
        fcount += 1
        if fcount % 500 == 0 or fcount == total or fcount == 1:
            print "Documents: %d/%d" % (fcount, total)
        # lang[0] = German document path, lang[1] = English document path.
        with copen(f_clean + lang[0].replace(".gz", "")) as xml_f:
            proj = {}  # English sentence id -> projected <s ...> meta tag
            dom = parse(xml_f)
            nodes = dom.getElementsByTagName("s")
            de_count += len(nodes)
            for link in rels:
                for node in nodes:
                    id_de = node.getAttribute("id")
                    links_de = link[0].split(" ")
                    # Only project links that have an English side.
                    if id_de in links_de and link[1] != "":
                        sentence = node.firstChild.nodeValue.split(" ")
                        meta = "<s id=\"0\" f=\"0\" i=\"0\">"
                        if "du" in sentence or "Du" in sentence:
                            meta = meta.replace("i=\"0\"", "i=\"1\"")
                        # Skip a sentence-initial "Sie" (could be "they/she").
                        if "Sie" in sentence[1:]:
                            meta = meta.replace("f=\"0\"", "f=\"1\"")
                        if "f=\"0\" i=\"0\"" in meta:
                            nn += 1
                        elif "f=\"1\" i=\"0\"" in meta:
                            on += 1
                        elif "f=\"0\" i=\"1\"" in meta:
                            no += 1
                        elif "f=\"1\" i=\"1\"" in meta:
                            oo += 1
                        # Ambiguous (both markers): not projected.
                        if "f=\"1\" i=\"1\"" not in meta:
                            for id_en in link[1].split(" "):
                                proj[id_en] = meta.replace(
                                    "id=\"0\"", "id=\"%s\"" % id_en)
                        else:
                            de_lost += 1
            en_count += len(proj)
        with copen(f_clean + lang[1].replace(".gz", "")) as xml_e:
            unknown.setdefault(lang, [])
            # Flatten the English path into one projection file name.
            fname_e = f_proj + "_".join(lang[1].split("/")).replace(
                ".xml.gz", ".txt").replace("en_", "")
            createPath(fname_e)
            with copen(fname_e, "w", encoding="utf-8") as txt_e:
                txt_e.write("<d src=\"%s\">\n" % lang[0].replace(".gz", ""))
                dom_e = parse(xml_e)
                nodes_e = dom_e.getElementsByTagName("s")
                for node in nodes_e:
                    id_e = node.getAttribute("id")
                    sent_e = node.firstChild.nodeValue
                    if id_e in proj:
                        proj_e = proj[id_e]
                        s_sent_e = sent_e.split(" ")
                        if "you" in s_sent_e and "f=\"0\" i=\"0\"" not in proj_e:
                            # "you" + formality projected: keep.
                            pos += 1
                            scount += 1
                            txt_e.write("%s%s</s>\n" % (proj_e, sent_e))
                        elif "you" in s_sent_e and "f=\"0\" i=\"0\"" in proj_e:
                            # "you" but no formality: save for recovery.
                            neg += 1
                            unknown[lang].append(id_e)
                        elif "you" not in s_sent_e and "f=\"0\" i=\"0\"" in proj_e:
                            # Neutral sentence: keep.
                            scount += 1
                            txt_e.write("%s%s</s>\n" % (proj_e, sent_e))
                        elif "you" not in s_sent_e and "f=\"0\" i=\"0\"" not in proj_e:
                            # Formality without "you": discard.
                            lost += 1
                txt_e.write("</d>\n")
                txt_e.flush()
    with open(f_stats, "a") as stats:
        stats.write("PROJECTED DE_%d TO %d_EN\n"
                    "DE 0 0: %d\n"
                    "DE 1 0: %d\n"
                    "DE 0 1: %d\n"
                    "DE 1 1: %d\n"
                    "Y-Found: %d\n"
                    "Y-NotFound: %d\n"
                    "F-Lost: %d\n"
                    "Sentences: %d\n"
                    "DE no EN: %d" % (de_count, en_count, nn, on, no, oo, pos, neg, lost, scount, de_lost))
    dumpStruct(f_unknown_p, unknown)
def cleanCopyDocuments(f_align_p, f_corpus, f_clean, f_stats, f_rem, filter=True):
    """ Copies the documents with alignment in a clean format to a new
    folder as text files.

    Reads each gzipped corpus document, extracts the words of every <s>
    sentence, and writes a simplified XML file under f_clean.  English
    words are lowercased, stripped of apostrophes and (when `filter` is
    True) dropped unless alphabetic, longer than one character and not a
    stopword; German words are kept verbatim.  Sentences reduced to <= 1
    word are removed, logged to f_rem, and their alignment links are
    deleted from the pickled alignment structure, which is re-dumped at
    the end.  Word/sentence statistics are appended to f_stats.

    NOTE(review): the parameter `filter` shadows the builtin of the same
    name (kept for interface compatibility).
    """
    align_p_f = loadStruct(f_align_p)
    stopwords = getStopwords()
    n_docs = len(align_p_f)
    words_total = 0  # English words seen (before filtering)
    words_lost = 0   # English words removed by the filter
    sents_lost = 0   # sentences removed for having <= 1 remaining word
    with open(f_rem, "w") as rem_f:
        for i, key in enumerate(align_p_f.iterkeys()):
            if i % 500 == 0:
                print "Documents: %d/%d" % (i, n_docs)
            elif i == 0 or i == n_docs - 1:
                print "Documents: %d/%d" % (i + 1, n_docs)
            # key holds both document paths of the (de, en) pair.
            for lang in key:
                fname = f_clean + lang.replace(".gz", "")
                createPath(fname)
                with copen(fname, "w", encoding="utf-8") as xml_f:
                    doc = []     # list of sentences, each a list of words
                    last_id = 0  # id of the last <s> seen (used as sentence count)
                    words = 0    # words kept for this document
                    with gopen(f_corpus + lang) as clean_f:
                        for line in clean_f:
                            line = line.strip()
                            if line.startswith("<s"):
                                last_id = match(".*id=\"([0-9]+)\"", line).group(1)
                                doc.append([])
                            if line.startswith("<w"):
                                # Pull the token text out of the <w> tag.
                                m = match(".*>(.+)</", line)
                                if m:
                                    word = m.group(1)
                                    words += 1
                                    if lang.startswith("en"):
                                        words_total += 1
                                        # Normalize: lowercase, drop apostrophes.
                                        word = word.strip().lower().replace(
                                            "\'", "")
                                        if filter and word not in stopwords and len(
                                                word) > 1 and word.isalpha():
                                            doc[-1].append(word)
                                        elif not filter:
                                            doc[-1].append(word)
                                        else:
                                            words_lost += 1
                                    elif lang.startswith("de"):
                                        # German side is kept unfiltered.
                                        doc[-1].append(word)
                    # last_id doubles as the sentence count in the header —
                    # assumes sentence ids are sequential; TODO confirm.
                    xml_f.write(
                        "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<d s=\"%s\" w=\"%s\" f=\"%s\">\n" % (last_id, words, lang.replace(".gz", "")))
                    for k, v in enumerate(doc):
                        sid = k + 1  # sentence ids are 1-based
                        if len(v) > 1:
                            xml_f.write("<s id=\"%s\">%s</s>\n" % (sid, " ".join(v).decode("utf-8")))
                        if len(v) <= 1:
                            # Sentence shrank to nothing: log it and drop its
                            # alignment link.  The in-loop remove is safe only
                            # because we break immediately afterwards.
                            sents_lost += 1
                            rem_f.write("[R] %s %s %s\n" % (str(key), lang[0:2], sid))
                            for projection in align_p_f[key]:
                                if lang.startswith("de") and str(
                                        sid) in projection[0].split(" "):
                                    align_p_f[key].remove(projection)
                                    break
                                elif lang.startswith("en") and str(
                                        sid) in projection[1].split(" "):
                                    align_p_f[key].remove(projection)
                                    break
                    xml_f.write("</d>\n")
                    xml_f.flush()
    with open(f_stats, "a") as stats_f:
        stats_f.write("Removed: %d sentences\n" % sents_lost)
        scount = 0
        for v in align_p_f.itervalues():
            scount += len(v)
        stats_f.write("Remaining: %d sentences\n" % scount)
        stats_f.write("Total words: %d\n" % words_total)
        stats_f.write("Words lost: %d\n" % words_lost)
        stats_f.write("Words remmaining: %d\n" % (words_total - words_lost))
    # Persist the pruned alignment structure back to disk.
    dumpStruct(f_align_p, align_p_f)
# NOTE(review): this is an exact duplicate (only reformatted) of the
# cleanCopyDocuments defined earlier in this file.  At import time this
# later definition silently shadows the first; one should be removed.
def cleanCopyDocuments(f_align_p, f_corpus, f_clean, f_stats, f_rem, filter=True):
    """ Copies the documents with alignment in a clean format to a new
    folder as text files.

    English words are normalized (lowercased, apostrophes removed) and,
    when `filter` is True, dropped unless alphabetic, longer than one
    character and not a stopword; German words are kept verbatim.
    Sentences left with <= 1 word are removed, logged to f_rem, and their
    alignment links pruned from the pickled structure, which is re-dumped
    at the end.  Statistics are appended to f_stats.

    NOTE(review): the parameter `filter` shadows the builtin of the same
    name (kept for interface compatibility).
    """
    align_p_f = loadStruct(f_align_p)
    stopwords = getStopwords()
    n_docs = len(align_p_f)
    words_total = 0  # English words seen (before filtering)
    words_lost = 0   # English words removed by the filter
    sents_lost = 0   # sentences removed for having <= 1 remaining word
    with open(f_rem, "w") as rem_f:
        for i, key in enumerate(align_p_f.iterkeys()):
            if i % 500 == 0:
                print "Documents: %d/%d" % (i, n_docs)
            elif i == 0 or i == n_docs - 1:
                print "Documents: %d/%d" % (i + 1, n_docs)
            # key holds both document paths of the (de, en) pair.
            for lang in key:
                fname = f_clean + lang.replace(".gz", "")
                createPath(fname)
                with copen(fname, "w", encoding="utf-8") as xml_f:
                    doc = []     # list of sentences, each a list of words
                    last_id = 0  # id of the last <s> seen (used as sentence count)
                    words = 0    # words kept for this document
                    with gopen(f_corpus + lang) as clean_f:
                        for line in clean_f:
                            line = line.strip()
                            if line.startswith("<s"):
                                last_id = match('.*id="([0-9]+)"', line).group(1)
                                doc.append([])
                            if line.startswith("<w"):
                                # Pull the token text out of the <w> tag.
                                m = match(".*>(.+)</", line)
                                if m:
                                    word = m.group(1)
                                    words += 1
                                    if lang.startswith("en"):
                                        words_total += 1
                                        # Normalize: lowercase, drop apostrophes.
                                        word = word.strip().lower().replace("'", "")
                                        if filter and word not in stopwords and len(word) > 1 and word.isalpha():
                                            doc[-1].append(word)
                                        elif not filter:
                                            doc[-1].append(word)
                                        else:
                                            words_lost += 1
                                    elif lang.startswith("de"):
                                        # German side is kept unfiltered.
                                        doc[-1].append(word)
                    # last_id doubles as the sentence count in the header —
                    # assumes sentence ids are sequential; TODO confirm.
                    xml_f.write(
                        '<?xml version="1.0" encoding="utf-8"?>\n<d s="%s" w="%s" f="%s">\n' % (last_id, words, lang.replace(".gz", ""))
                    )
                    for k, v in enumerate(doc):
                        sid = k + 1  # sentence ids are 1-based
                        if len(v) > 1:
                            xml_f.write('<s id="%s">%s</s>\n' % (sid, " ".join(v).decode("utf-8")))
                        if len(v) <= 1:
                            # Sentence shrank to nothing: log it and drop its
                            # alignment link.  The in-loop remove is safe only
                            # because we break immediately afterwards.
                            sents_lost += 1
                            rem_f.write("[R] %s %s %s\n" % (str(key), lang[0:2], sid))
                            for projection in align_p_f[key]:
                                if lang.startswith("de") and str(sid) in projection[0].split(" "):
                                    align_p_f[key].remove(projection)
                                    break
                                elif lang.startswith("en") and str(sid) in projection[1].split(" "):
                                    align_p_f[key].remove(projection)
                                    break
                    xml_f.write("</d>\n")
                    xml_f.flush()
    with open(f_stats, "a") as stats_f:
        stats_f.write("Removed: %d sentences\n" % sents_lost)
        scount = 0
        for v in align_p_f.itervalues():
            scount += len(v)
        stats_f.write("Remaining: %d sentences\n" % scount)
        stats_f.write("Total words: %d\n" % words_total)
        stats_f.write("Words lost: %d\n" % words_lost)
        stats_f.write("Words remmaining: %d\n" % (words_total - words_lost))
    # Persist the pruned alignment structure back to disk.
    dumpStruct(f_align_p, align_p_f)