Example 1
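All of the snippets below appear to come from the same Python 2 module and share a set of imports and small helpers that the examples themselves never show. The block below is a best-guess reconstruction, assuming copen/gopen are codecs.open/gzip.open and that loadStruct/dumpStruct are thin pickle wrappers; the helper bodies are illustrative sketches, not the original implementations.

from codecs import open as copen      # unicode-aware file handles
from gzip import open as gopen        # the corpus documents are gzip-compressed
from re import match
from shutil import copy
from sys import maxint
from xml.dom.minidom import parse
from cPickle import dump, load
import os

def loadStruct(path):
    # Sketch: deserialize a pickled structure from disk.
    with open(path, "rb") as f:
        return load(f)

def dumpStruct(path, struct):
    # Sketch: pickle a structure back to disk.
    with open(path, "wb") as f:
        dump(struct, f)

def createPath(fname):
    # Sketch: make sure the directory that will hold fname exists.
    d = os.path.dirname(fname)
    if d and not os.path.isdir(d):
        os.makedirs(d)

def getStopwords():
    # Sketch: English stopwords used by cleanCopyDocuments(); location invented.
    with open("misc/stopwords.txt") as f:
        return set(line.strip() for line in f)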
def recoverUnknown(f_unknown_p, f_unknown, f_align_p, f_clean):
    """ Writes out the foreign (German) sentences that produced no formality
    label even though the aligned English sentence contains a "you".
    """
    print "Recovering unknown sentences"

    unknown = loadStruct(f_unknown_p)
    align = loadStruct(f_align_p)

    with copen(f_unknown, "w", encoding="utf-8") as unknown_f:
        for doc, proj in unknown.iteritems():
            if len(proj) > 0:
                de = []
                links = align[doc]

                # Collect the German sentence ids aligned to the unresolved
                # English ids; both sides of a link hold space-separated ids.
                for p in proj:
                    for link in links:
                        if p in link[1].split(" "):
                            de.extend(link[0].split(" "))

                # doc[0] is the German half of the document pair.
                with copen(f_clean + doc[0].replace(".gz", "")) as doc_f:
                    dom = parse(doc_f)
                    nodes = dom.getElementsByTagName("s")

                    for node in nodes:
                        if node.getAttribute("id") in de:
                            unknown_f.write("%s\n" % node.firstChild.nodeValue)
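The lookups above pin down the shape of the two pickled structures: document keys are (german_path, english_path) tuples, align maps each key to a list of (german_ids, english_ids) link pairs with space-separated ids, and unknown maps the same key to the English ids that still lack a label. A toy illustration with invented values:

doc_key = ("de/1999/12345.xml.gz", "en/1999/12345.xml.gz")  # invented paths
align = {doc_key: [("1 2", "1"), ("3", "2 3")]}             # de ids <-> en ids
unknown = {doc_key: ["2"]}                                  # en ids without a label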
Example 2
def computeSentenceLength(f_align_p, f_clean):
    """ Prints the minimum, maximum and average sentence length over all
    aligned English documents, plus the longest sentence found.
    """
    align_p_f = loadStruct(f_align_p)
    min_len = maxint
    max_len = 0
    total = 0
    length_sum = 0.0
    longest = []

    for k in align_p_f.iterkeys():
        # k[1] is the English half of the document pair.
        with open(f_clean + k[1].replace(".gz", "")) as clean_f:
            dom = parse(clean_f)
            nodes = dom.getElementsByTagName("s")

            for node in nodes:
                sentence = node.firstChild.nodeValue.split(" ")
                length = len(sentence)

                # Ignore subtitle credit lines when tracking the maximum.
                if length > max_len and "~" not in sentence \
                        and "subtitles" not in sentence \
                        and "subber" not in sentence:
                    max_len = length
                    longest = sentence

                if length != 0 and length < min_len:
                    min_len = length

                length_sum += length
            total += len(nodes)
    print "Min: %d\nMax: %d\nAvg: %f\n" % (min_len, max_len, (length_sum / total))
    print longest
Example 3
def plainCopyDocuments(f_align_p, f_corpus, f_clean):
    """ Copies the files with alignments to a separate folder.
    """
    align_p_f = loadStruct(f_align_p)

    print "Copying %d documents" % len(align_p_f)

    for key in align_p_f.iterkeys():
        # key is a (german_path, english_path) tuple.
        to_de = f_clean + key[0]
        to_en = f_clean + key[1]
        createPath(to_de)
        createPath(to_en)
        copy(f_corpus + key[0], to_de)
        copy(f_corpus + key[1], to_en)
Example 4
def compareSentenceCount(misc):
    """ Compares the pickled sentence counts against the output of
    grep -c "s id" as a sanity check.

    Save the grep output as <lang>_count.txt.
    """
    print "Comparing sentence counts"

    # `langs` is a module-level list of language codes, defined elsewhere.
    for lang in langs:
        a = loadStruct(misc + lang + "_align.p")
        b = {}

        with open(misc + "%s_count.txt" % lang) as counts:
            for line in counts:
                k, v = line.strip().split(":")
                b.setdefault(k, v)

        # Report every document whose pickled count disagrees with grep's.
        for k, v in a.iteritems():
            if k in b and str(v) != b[k]:
                print k
                print v, b[k]
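The parsing above expects one path:count pair per line, which is exactly what grep -c "s id" prints when given several files at once. If grep is not at hand, an equivalent count file can be generated in Python; the paths and directory layout below are invented for illustration:

from glob import glob

with open("misc/de_count.txt", "w") as out:           # invented location
    for fname in sorted(glob("corpus/de/*/*.xml")):   # invented layout
        with open(fname) as f:
            out.write("%s:%d\n" % (fname, sum(1 for line in f if "s id" in line)))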
Example 5
def createProjection(f_align_p, f_stats, f_clean, f_proj, f_unknown_p):
    """ Creates the projection based on rules: a German sentence is labelled
    informal if it contains "du"/"Du" and formal if it contains a non-initial
    "Sie"; the label is then projected onto the aligned English sentences.
    """
    fcount = 0
    de_count = 0    # German sentences seen
    en_count = 0    # English sentences that received a projection
    pos = 0         # English "you" sentences with a formality label
    neg = 0         # English "you" sentences without a label (unknown)
    lost = 0        # labelled projections whose English side has no "you"
    nn = 0          # neither formal nor informal
    on = 0          # formal only
    no = 0          # informal only
    oo = 0          # both formal and informal (contradictory, dropped)
    de_lost = 0     # German sentences aligned to nothing on the English side
    scount = 0      # sentences written out
    align_p_f = loadStruct(f_align_p)
    total = len(align_p_f)
    unknown = {}

    for lang, rels in align_p_f.iteritems():
        fcount += 1

        if fcount % 500 == 0 or fcount == total or fcount == 1:
            print "Documents: %d/%d" % (fcount, total)

        # lang[0] is the German document, lang[1] the English one.
        with copen(f_clean + lang[0].replace(".gz", "")) as xml_f:
            proj = {}
            dom = parse(xml_f)
            nodes = dom.getElementsByTagName("s")
            de_count += len(nodes)

            for link in rels:
                links_de = link[0].split(" ")

                for node in nodes:
                    id_de = node.getAttribute("id")

                    if id_de in links_de:
                        if link[1] == "":
                            de_lost += 1
                            continue

                        sentence = node.firstChild.nodeValue.split(" ")
                        meta = '<s id="0" f="0" i="0">'

                        # "du" anywhere marks the sentence informal.
                        if "du" in sentence or "Du" in sentence:
                            meta = meta.replace('i="0"', 'i="1"')
                        # A sentence-initial "Sie" is ambiguous ("they"/"she"),
                        # so only a non-initial "Sie" marks it formal.
                        if "Sie" in sentence[1:]:
                            meta = meta.replace('f="0"', 'f="1"')

                        if 'f="0" i="0"' in meta:
                            nn += 1
                        elif 'f="1" i="0"' in meta:
                            on += 1
                        elif 'f="0" i="1"' in meta:
                            no += 1
                        elif 'f="1" i="1"' in meta:
                            oo += 1

                        # Contradictory sentences are not projected.
                        if 'f="1" i="1"' not in meta:
                            for id_en in link[1].split(" "):
                                proj[id_en] = meta.replace('id="0"', 'id="%s"' % id_en)
            en_count += len(proj)

        with copen(f_clean + lang[1].replace(".gz", "")) as xml_e:
            unknown.setdefault(lang, [])
            fname_e = f_proj + "_".join(lang[1].split("/")).replace(".xml.gz", ".txt").replace("en_", "")
            createPath(fname_e)

            with copen(fname_e, "w", encoding="utf-8") as txt_e:
                txt_e.write('<d src="%s">\n' % lang[0].replace(".gz", ""))
                dom_e = parse(xml_e)
                nodes_e = dom_e.getElementsByTagName("s")

                for node in nodes_e:
                    id_e = node.getAttribute("id")
                    sent_e = node.firstChild.nodeValue

                    if id_e in proj:
                        proj_e = proj[id_e]
                        s_sent_e = sent_e.split(" ")

                        if "you" in s_sent_e and 'f="0" i="0"' not in proj_e:
                            # "you" with a formality label: keep it.
                            pos += 1
                            scount += 1
                            txt_e.write("%s%s</s>\n" % (proj_e, sent_e))
                        elif "you" in s_sent_e and 'f="0" i="0"' in proj_e:
                            # "you" but no label: remember it for recoverUnknown().
                            neg += 1
                            unknown[lang].append(id_e)
                        elif "you" not in s_sent_e and 'f="0" i="0"' in proj_e:
                            # No "you" and no label: keep as a neutral sentence.
                            scount += 1
                            txt_e.write("%s%s</s>\n" % (proj_e, sent_e))
                        elif "you" not in s_sent_e and 'f="0" i="0"' not in proj_e:
                            # Labelled on the German side but no "you" here.
                            lost += 1
                txt_e.write("</d>\n")
                txt_e.flush()

    with open(f_stats, "a") as stats:
        stats.write("PROJECTED DE_%d TO %d_EN\n"
                    "DE 0 0: %d\n"
                    "DE 1 0: %d\n"
                    "DE 0 1: %d\n"
                    "DE 1 1: %d\n"
                    "Y-Found: %d\n"
                    "Y-NotFound: %d\n"
                    "F-Lost: %d\n"
                    "Sentences: %d\n"
                    "DE no EN: %d" %
                    (de_count, en_count, nn, on, no, oo, pos, neg, lost, scount, de_lost))

    dumpStruct(f_unknown_p, unknown)
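To make the labelling rule concrete: "du" anywhere marks a sentence informal, a non-initial "Sie" marks it formal, and the resulting attributes are copied onto every aligned English id. A toy run of the same string logic (the sentence and id are invented):

sentence = "Kannst du mir helfen ?".split(" ")   # invented German sentence
meta = '<s id="0" f="0" i="0">'
if "du" in sentence or "Du" in sentence:
    meta = meta.replace('i="0"', 'i="1"')
if "Sie" in sentence[1:]:
    meta = meta.replace('f="0"', 'f="1"')
print meta.replace('id="0"', 'id="7"')           # -> <s id="7" f="0" i="1">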
Example 6
def cleanCopyDocuments(f_align_p, f_corpus, f_clean, f_stats, f_rem, filter=True):
    """ Copies the documents with alignment in a clean format to a new folder as text files.
    """
    align_p_f = loadStruct(f_align_p)
    stopwords = getStopwords()
    n_docs = len(align_p_f)
    words_total = 0
    words_lost = 0
    sents_lost = 0

    with open(f_rem, "w") as rem_f:
        for i, key in enumerate(align_p_f.iterkeys()):
            if i == 0 or i == n_docs - 1 or i % 500 == 0:
                print "Documents: %d/%d" % (i + 1, n_docs)

            for lang in key:
                fname = f_clean + lang.replace(".gz", "")
                createPath(fname)

                with copen(fname, "w", encoding="utf-8") as xml_f:
                    doc = []
                    last_id = 0
                    words = 0

                    with gopen(f_corpus + lang) as clean_f:
                        for line in clean_f:
                            line = line.strip()

                            # Each <s> opens a new sentence; <w> elements carry the tokens.
                            if line.startswith("<s"):
                                last_id = match('.*id="([0-9]+)"', line).group(1)
                                doc.append([])
                            if line.startswith("<w"):
                                m = match(".*>(.+)</", line)
                                if m:
                                    word = m.group(1)
                                    words += 1
                                    if lang.startswith("en"):
                                        words_total += 1
                                        word = word.strip().lower().replace("'", "")

                                        # English tokens are optionally filtered down to
                                        # alphabetic non-stopwords longer than one character.
                                        if filter and word not in stopwords and len(word) > 1 and word.isalpha():
                                            doc[-1].append(word)
                                        elif not filter:
                                            doc[-1].append(word)
                                        else:
                                            words_lost += 1
                                    elif lang.startswith("de"):
                                        doc[-1].append(word)

                    xml_f.write(
                        '<?xml version="1.0" encoding="utf-8"?>\n<d s="%s" w="%s" f="%s">\n'
                        % (last_id, words, lang.replace(".gz", ""))
                    )

                    for k, v in enumerate(doc):
                        sid = k + 1

                        if len(v) > 1:
                            xml_f.write('<s id="%s">%s</s>\n' % (sid, " ".join(v).decode("utf-8")))
                        if len(v) <= 1:
                            # Sentences reduced to one token or less are dropped,
                            # logged, and their alignment link removed.
                            sents_lost += 1
                            rem_f.write("[R] %s %s %s\n" % (str(key), lang[0:2], sid))

                            for projection in align_p_f[key]:
                                if lang.startswith("de") and str(sid) in projection[0].split(" "):
                                    align_p_f[key].remove(projection)
                                    break
                                elif lang.startswith("en") and str(sid) in projection[1].split(" "):
                                    align_p_f[key].remove(projection)
                                    break
                    xml_f.write("</d>\n")
                    xml_f.flush()
    with open(f_stats, "a") as stats_f:
        stats_f.write("Removed: %d sentences\n" % sents_lost)
        scount = 0

        for v in align_p_f.itervalues():
            scount += len(v)

        stats_f.write("Remaining: %d sentences\n" % scount)
        stats_f.write("Total words: %d\n" % words_total)
        stats_f.write("Words lost: %d\n" % words_lost)
        stats_f.write("Words remaining: %d\n" % (words_total - words_lost))

    dumpStruct(f_align_p, align_p_f)
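Read together, the examples suggest a processing order: cleanCopyDocuments() produces the cleaned documents and prunes the alignment structure, createProjection() writes the labelled English text and records the unknown ids, and recoverUnknown() dumps the German side of the unresolved cases; computeSentenceLength() and compareSentenceCount() are sanity checks. A hypothetical driver, where every path is invented for illustration:

f_align_p = "misc/de-en_align.p"    # invented paths throughout
cleanCopyDocuments(f_align_p, "corpus/", "clean/", "misc/stats.txt", "misc/removed.txt")
createProjection(f_align_p, "misc/stats.txt", "clean/", "proj/", "misc/unknown.p")
recoverUnknown("misc/unknown.p", "misc/unknown.txt", f_align_p, "clean/")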