def countSentences(align, fout_stats):
    """ Count sentences from alignment structure. """
    print "Counting sentences"
    de_total = 0
    en_total = 0
    align_de = {}
    align_en = {}
    with open(fout_stats, "a") as stats:
        for doc, links in align.iteritems():
            de_doc = 0
            en_doc = 0
            for pro in links:
                for sid in pro[0].split(" "):
                    if sid != "":
                        de_doc += 1
                for sid in pro[1].split(" "):
                    if sid != "":
                        en_doc += 1
            align_de.setdefault(doc[0].rsplit("/", 1)[1].replace(".gz", ""), de_doc)
            align_en.setdefault(doc[1].rsplit("/", 1)[1].replace(".gz", ""), en_doc)
            stats.write("%s \t %s\n%d \t %d\n" % (doc[0], doc[1], de_doc, en_doc))
            de_total += de_doc
            en_total += en_doc
        stats.write("\nDE Sentences: %d\nEN Sentences: %d\n" % (de_total, en_total))
    dumpStruct(f_misc + "de_align.p", align_de)
    dumpStruct(f_misc + "en_align.p", align_en)
def extractAlignmentsRX(f_align, f_align_p, f_stats):
    """ Extracts the alignments with regex. Easier to parse HUN aligned files,
    which will be dropped due to inconsistencies. Mainly used for the small
    OpenSubtitles corpus, not the 2011 one. """
    print "Extracting alignments"
    alignments = {}
    final = {}
    hun_files = set()
    doc_count = 0
    link_count = 0
    with gopen(f_align) as align_f:
        for line in align_f:
            line = line.strip()
            if line.startswith("<linkGrp"):
                doc_count += 1
                m = search(r"fromDoc=\"(.+)\"\stoDoc=\"(.+)\"", line)
                if m:
                    key = (m.group(1), m.group(2))
                else:
                    m = search(r"toDoc=\"(.+)\"\sfromDoc=\"(.+)\"", line)
                    key = (m.group(2), m.group(1))
                alignments.setdefault(key, [])
            elif line.startswith("<link id="):
                link_count += 1
                m = search(r"xtargets=\"(.+?)\"", line)
                alignments[key].append(m.group(1).split(";"))
            elif line.startswith("<link certainty="):
                hun_files.add(key)
                if key in alignments:
                    del alignments[key]
    empty = set()
    for k, v in alignments.iteritems():
        if len(v) != 0:
            final.setdefault(k, v)
        else:
            empty.add(k)
    dumpStruct(f_align_p, final)
    createPath(f_stats)
    with open(f_stats, "w") as stats:
        stats.write("DOCS: %d\nHUN: %d\nEMPTY: %d\nLEFT: %d\nLINKS: %d\n\n"
                    % (doc_count, len(hun_files), len(empty), len(final), link_count))
        for k in hun_files:
            stats.write(k[0] + " || " + k[1] + "\n")
        stats.write("\n")
def extractAlignmentsLXML(f_align, f_align_p, f_stats):
    """ Extracts alignment information from the alignments file with LXML.
    Used for the large OpenSubtitles 2011 corpus for faster processing. """
    print "Extracting alignments"

    class Target(object):
        def __init__(self):
            self.d = dict()
            self.n_links = 0
            self.n_docs = 0

        def start(self, tag, attr):
            if tag == "linkGrp":
                self.n_docs += 1
                self.k = (attr["fromDoc"], attr["toDoc"])
                self.group = self.d[self.k] = []
            elif tag == "link":
                self.n_links += 1
                self.group.append(tuple(attr["xtargets"].split(";")))
                if "certainty" in attr:
                    print "Attention HUN: %s" % (self.k,)

        def close(self):
            pass

    with gopen(f_align) as xml:
        targets = Target()
        parser = etree.XMLParser(target=targets)
        etree.parse(xml, parser)
    alignments = targets.d
    # Documents with no alignments; collect first, then delete, so the dict
    # does not change size while it is being iterated.
    empty = set()
    for k, v in alignments.iteritems():
        if not len(v):
            empty.add(k)
    for k in empty:
        del alignments[k]
    dumpStruct(f_align_p, alignments)
    createPath(f_stats)
    with open(f_stats, "w") as stats:
        stats.write("DOCS: %d\nEMPTY: %d\nLEFT: %d\nLINKS: %d\n\n"
                    % (targets.n_docs, len(empty), len(alignments), targets.n_links))
        for k in empty:
            stats.write("!!! Empty files\n%s || %s\n" % (k[0], k[1]))
        stats.write("\n")
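# Note: both extractors pickle the same kind of structure, a dict that maps a
# (fromDoc, toDoc) pair to the list of its xtargets links, each holding the
# space-separated DE and EN sentence ids. A minimal sketch of the expected
# shape (the file names are made up for illustration):
#
#   {("de/1976/123.xml.gz", "en/1976/123.xml.gz"):
#       [("1", "1"), ("2 3", "2"), ("4", "")]}
#
# cleanCopyDocuments() and createProjection() below iterate over exactly this
# structure via loadStruct().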
def createProjection(f_align_p, f_stats, f_clean, f_proj, f_unknown_p):
    """ Creates the DE-to-EN projection based on rules: a German sentence is
    marked informal if it contains "du"/"Du" and formal if it contains a
    non-initial "Sie"; the markers are then projected onto the aligned
    English sentences. """
    fcount = 0
    de_count = 0
    en_count = 0
    pos = 0
    neg = 0
    lost = 0
    nn = 0
    on = 0
    no = 0
    oo = 0
    de_lost = 0
    scount = 0
    align_p_f = loadStruct(f_align_p)
    total = len(align_p_f)
    unknown = {}
    for lang, rels in align_p_f.iteritems():
        fcount += 1
        if fcount % 500 == 0 or fcount == total or fcount == 1:
            print "Documents: %d/%d" % (fcount, total)
        with copen(f_clean + lang[0].replace(".gz", "")) as xml_f:
            proj = {}
            dom = parse(xml_f)
            nodes = dom.getElementsByTagName("s")
            de_count += len(nodes)
            for link in rels:
                for node in nodes:
                    id_de = node.getAttribute("id")
                    links_de = link[0].split(" ")
                    if id_de in links_de and link[1] != "":
                        sentence = node.firstChild.nodeValue.split(" ")
                        meta = "<s id=\"0\" f=\"0\" i=\"0\">"
                        # Informal marker: "du"/"Du" anywhere in the sentence
                        if "du" in sentence or "Du" in sentence:
                            meta = meta.replace("i=\"0\"", "i=\"1\"")
                        # Formal marker: non-initial "Sie"
                        if "Sie" in sentence[1:]:
                            meta = meta.replace("f=\"0\"", "f=\"1\"")
                        if "f=\"0\" i=\"0\"" in meta:
                            nn += 1
                        elif "f=\"1\" i=\"0\"" in meta:
                            on += 1
                        elif "f=\"0\" i=\"1\"" in meta:
                            no += 1
                        elif "f=\"1\" i=\"1\"" in meta:
                            oo += 1
                        if "f=\"1\" i=\"1\"" not in meta:
                            for id_en in link[1].split(" "):
                                proj[id_en] = meta.replace("id=\"0\"", "id=\"%s\"" % id_en)
                        else:
                            de_lost += 1
            en_count += len(proj)
        with copen(f_clean + lang[1].replace(".gz", "")) as xml_e:
            unknown.setdefault(lang, [])
            fname_e = f_proj + "_".join(lang[1].split("/")).replace(".xml.gz", ".txt").replace("en_", "")
            createPath(fname_e)
            with copen(fname_e, "w", encoding="utf-8") as txt_e:
                txt_e.write("<d src=\"%s\">\n" % lang[0].replace(".gz", ""))
                dom_e = parse(xml_e)
                nodes_e = dom_e.getElementsByTagName("s")
                for node in nodes_e:
                    id_e = node.getAttribute("id")
                    sent_e = node.firstChild.nodeValue
                    if id_e in proj:
                        proj_e = proj[id_e]
                        s_sent_e = sent_e.split(" ")
                        if "you" in s_sent_e and "f=\"0\" i=\"0\"" not in proj_e:
                            pos += 1
                            scount += 1
                            txt_e.write("%s%s</s>\n" % (proj_e, sent_e))
                        elif "you" in s_sent_e and "f=\"0\" i=\"0\"" in proj_e:
                            neg += 1
                            unknown[lang].append(id_e)
                        elif "you" not in s_sent_e and "f=\"0\" i=\"0\"" in proj_e:
                            scount += 1
                            txt_e.write("%s%s</s>\n" % (proj_e, sent_e))
                        elif "you" not in s_sent_e and "f=\"0\" i=\"0\"" not in proj_e:
                            lost += 1
                txt_e.write("</d>\n")
                txt_e.flush()
    with open(f_stats, "a") as stats:
        stats.write("PROJECTED DE_%d TO %d_EN\n"
                    "DE 0 0: %d\n"
                    "DE 1 0: %d\n"
                    "DE 0 1: %d\n"
                    "DE 1 1: %d\n"
                    "Y-Found: %d\n"
                    "Y-NotFound: %d\n"
                    "F-Lost: %d\n"
                    "Sentences: %d\n"
                    "DE no EN: %d"
                    % (de_count, en_count, nn, on, no, oo, pos, neg, lost, scount, de_lost))
    dumpStruct(f_unknown_p, unknown)
def cleanCopyDocuments(f_align_p, f_corpus, f_clean, f_stats, f_rem, filter=True):
    """ Copies the documents that have alignments, in a cleaned format,
    to a new folder as text files. """
    align_p_f = loadStruct(f_align_p)
    stopwords = getStopwords()
    n_docs = len(align_p_f)
    words_total = 0
    words_lost = 0
    sents_lost = 0
    with open(f_rem, "w") as rem_f:
        for i, key in enumerate(align_p_f.iterkeys()):
            if (i + 1) % 500 == 0 or i == 0 or i == n_docs - 1:
                print "Documents: %d/%d" % (i + 1, n_docs)
            for lang in key:
                fname = f_clean + lang.replace(".gz", "")
                createPath(fname)
                with copen(fname, "w", encoding="utf-8") as xml_f:
                    doc = []
                    last_id = 0
                    words = 0
                    with gopen(f_corpus + lang) as clean_f:
                        for line in clean_f:
                            line = line.strip()
                            if line.startswith("<s"):
                                last_id = match(r".*id=\"([0-9]+)\"", line).group(1)
                                doc.append([])
                            if line.startswith("<w"):
                                m = match(".*>(.+)</", line)
                                if m:
                                    word = m.group(1)
                                    words += 1
                                    if lang.startswith("en"):
                                        words_total += 1
                                        word = word.strip().lower().replace("\'", "")
                                        if filter and word not in stopwords and len(word) > 1 and word.isalpha():
                                            doc[-1].append(word)
                                        elif not filter:
                                            doc[-1].append(word)
                                        else:
                                            words_lost += 1
                                    elif lang.startswith("de"):
                                        doc[-1].append(word)
                    xml_f.write("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
                                "<d s=\"%s\" w=\"%s\" f=\"%s\">\n"
                                % (last_id, words, lang.replace(".gz", "")))
                    for k, v in enumerate(doc):
                        sid = k + 1
                        if len(v) > 1:
                            xml_f.write("<s id=\"%s\">%s</s>\n" % (sid, " ".join(v).decode("utf-8")))
                        if len(v) <= 1:
                            sents_lost += 1
                            rem_f.write("[R] %s %s %s\n" % (str(key), lang[0:2], sid))
                            for projection in align_p_f[key]:
                                if lang.startswith("de") and str(sid) in projection[0].split(" "):
                                    align_p_f[key].remove(projection)
                                    break
                                elif lang.startswith("en") and str(sid) in projection[1].split(" "):
                                    align_p_f[key].remove(projection)
                                    break
                    xml_f.write("</d>\n")
                    xml_f.flush()
    with open(f_stats, "a") as stats_f:
        stats_f.write("Removed: %d sentences\n" % sents_lost)
        scount = 0
        for v in align_p_f.itervalues():
            scount += len(v)
        stats_f.write("Remaining: %d sentences\n" % scount)
        stats_f.write("Total words: %d\n" % words_total)
        stats_f.write("Words lost: %d\n" % words_lost)
        stats_f.write("Words remaining: %d\n" % (words_total - words_lost))
    dumpStruct(f_align_p, align_p_f)
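# A minimal driver sketch for the functions above. The paths below are made-up
# placeholders and the exact call order is an assumption; only the data flow
# (extract the alignments, clean-copy the documents, then project the DE
# formality markers onto EN) follows from the functions' inputs and outputs.
if __name__ == "__main__":
    f_align = "data/opensub/de-en.xml.gz"    # hypothetical alignment file
    f_align_p = "data/pickle/alignments.p"   # pickled alignment structure
    f_stats = "data/stats/stats.txt"         # statistics log
    f_corpus = "data/opensub/"               # hypothetical corpus root
    f_clean = "data/clean/"                  # cleaned per-document copies
    f_proj = "data/proj/"                    # projected EN documents
    f_rem = "data/stats/removed.txt"         # log of removed sentences
    f_unknown_p = "data/pickle/unknown.p"    # unresolved EN sentence ids

    extractAlignmentsLXML(f_align, f_align_p, f_stats)
    cleanCopyDocuments(f_align_p, f_corpus, f_clean, f_stats, f_rem)
    countSentences(loadStruct(f_align_p), f_stats)
    createProjection(f_align_p, f_stats, f_clean, f_proj, f_unknown_p)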