def gutenbergStats(f_gproj):
    """ Statistics for Project Gutenberg corpus.

    Reads every file under ``f_gproj + corpus_type``, extracts sentence
    text from lines shaped like ``...>text</...``, filters out stopwords,
    one-character and non-alphabetic tokens, and prints word/sentence
    counts plus minimum/maximum/average sentence length.

    f_gproj -- root folder of the Gutenberg corpus
    """
    stopwords = getStopwords()
    lost_words = 0
    lost_sents = 0
    words = 0
    sents = 0
    sent_max = 0
    # Start impossibly high so the first kept sentence becomes the minimum.
    # (Replaces py2-only sys.maxint.)
    sent_min = float("inf")
    # Change between testing and training folder manually
    corpus_type = "test/"
    for f in listdir(f_gproj + corpus_type):
        with copen(f_gproj + corpus_type + f, encoding="utf-8") as proj_f:
            for line in proj_f:
                m = match(".*>(.+)</.*", line)
                if m:
                    sentence = m.group(1).strip().split(" ")
                    clean = []
                    for word in sentence:
                        word = word.lower()
                        # Keep alphabetic non-stopwords longer than one char.
                        if word not in stopwords and len(word) > 1 and word.isalpha():
                            words += 1
                            clean.append(word)
                        else:
                            lost_words += 1
                    # Sentences reduced to fewer than two words are discarded.
                    if len(clean) > 1:
                        sents += 1
                        if len(clean) > sent_max:
                            sent_max = len(clean)
                        if len(clean) < sent_min:
                            sent_min = len(clean)
                    else:
                        lost_sents += 1
    print("Lost Words: %d" % lost_words)
    print("Lost Sentences: %d" % lost_sents)
    print("Words: %d" % words)
    print("Sentences: %d" % sents)
    # BUG FIX: the original divided by sents unconditionally, raising
    # ZeroDivisionError on an empty corpus and printing maxint as the minimum.
    if sents > 0:
        print("Sentence minimum length: %d" % sent_min)
        print("Sentence maximum length: %d" % sent_max)
        print("Sentence average length: %f" % (float(words) / float(sents)))
def gutenbergStats(f_gproj):
    """ Statistics for Project Gutenberg corpus.

    Walks the chosen corpus folder, pulls sentence text out of
    ``...>text</...`` lines, filters the tokens and reports word and
    sentence counts together with min/max/average sentence length.
    """
    stop = getStopwords()
    dropped_words = 0
    dropped_sents = 0
    kept_words = 0
    kept_sents = 0
    longest = 0
    shortest = maxint
    # Change between testing and training folder manually
    corpus_type = "test/"
    base = f_gproj + corpus_type
    for fname in listdir(base):
        with copen(base + fname, encoding="utf-8") as proj_f:
            for line in proj_f:
                hit = match(".*>(.+)</.*", line)
                if not hit:
                    continue
                tokens = [t.lower() for t in hit.group(1).strip().split(" ")]
                # Keep alphabetic non-stopwords longer than one character.
                keep = [t for t in tokens
                        if t not in stop and len(t) > 1 and t.isalpha()]
                kept_words += len(keep)
                dropped_words += len(tokens) - len(keep)
                size = len(keep)
                if size > 1:
                    kept_sents += 1
                    longest = max(longest, size)
                    shortest = min(shortest, size)
                else:
                    # Fewer than two usable words: sentence is discarded.
                    dropped_sents += 1
    print("Lost Words: %d" % dropped_words)
    print("Lost Sentences: %d" % dropped_sents)
    print("Words: %d" % kept_words)
    print("Sentences: %d" % kept_sents)
    print("Sentence minimum length: %d" % shortest)
    print("Sentence maximum length: %d" % longest)
    print("Sentence average length: %f" % (float(kept_words) / float(kept_sents)))
def convertGutenberg(f_root, f_type, f_gproj, slda=True, filter=True):
    """ Converts the Gutenberg corpus to SLDA or LLDA format.

    Labels come from the f=/i= attributes on each sentence line:
      [2] neutral  (f=0, i=0) -- kept only if the sentence lacks "you"
      [1] formal   (f=1, i=0) -- kept only if the sentence contains "you"
      [0] informal (f=0, i=1) -- kept only if the sentence contains "you"
    Sentences labelled both formal and informal are counted but never written.

    f_root  -- output folder for the converted file
    f_type  -- corpus split folder name ("train" or "test")
    f_gproj -- root folder of the Gutenberg corpus
    slda    -- write SLDA format ("[label|words].") instead of LLDA ("[label] words")
    filter  -- drop stopwords, one-char and non-alphabetic tokens
    """
    stopwords = getStopwords()
    formal = 0
    informal = 0
    neutral = 0
    double = 0    # sentences labelled both formal and informal
    nn_lost = 0   # neutral sentences dropped because they contain "you"
    on_lost = 0   # formal sentences dropped because they lack "you"
    no_lost = 0   # informal sentences dropped because they lack "you"
    total = 0
    form = "SLDA"
    f_ttype = f_type
    if not slda:
        form = "LLDA"
        # LLDA evaluation expects the test split under the name "gold".
        if f_type == "test":
            f_ttype = "gold"
    print("Converting Gutenberg corpus to %s format for %sing" % (form, f_type))
    with copen(f_root + f_ttype, "w", encoding="utf-8") as lda_f:
        for f in listdir(f_gproj + f_type):
            with copen(f_gproj + f_type + "/" + f, encoding="utf-8") as proj_f:
                for line in proj_f:
                    m = match(".*>(.+)</.*", line)
                    if m:
                        sentence = m.group(1).strip().split(" ")
                        clean = []
                        for word in sentence:
                            word = word.lower()
                            if filter:
                                if word not in stopwords and len(word) > 1 and word.isalpha():
                                    clean.append(word)
                            else:
                                if word.isalpha():
                                    clean.append(word)
                        # Sentences reduced to fewer than two words are skipped.
                        if len(clean) > 1:
                            sentence = " ".join(clean)
                            if 'f="0" i="0"' in line:
                                if not "you" in clean:
                                    neutral += 1
                                    total += 1
                                    if slda:
                                        lda_f.write("[2|%s]." % sentence)
                                    else:
                                        lda_f.write("[2] %s\n" % sentence)
                                else:
                                    nn_lost += 1
                            elif 'f="1" i="0"' in line:
                                if not "you" in clean:
                                    on_lost += 1
                                else:
                                    formal += 1
                                    total += 1
                                    if slda:
                                        lda_f.write("[1|%s]." % sentence)
                                    else:
                                        lda_f.write("[1] %s\n" % sentence)
                            elif 'f="0" i="1"' in line:
                                if not "you" in clean:
                                    no_lost += 1
                                else:
                                    informal += 1
                                    total += 1
                                    if slda:
                                        lda_f.write("[0|%s]." % sentence)
                                    else:
                                        lda_f.write("[0] %s\n" % sentence)
                            elif 'f="1" i="1"' in line:
                                # BUG FIX: was "double += 0", so the
                                # "Formal + Informal" statistic was always 0.
                                double += 1
            # Newline terminates each source file's document.
            # NOTE(review): placement reconstructed from collapsed source; confirm.
            lda_f.write("\n")
    print("Formal: %d" % formal)
    print("Informal: %d" % informal)
    print("Neutral: %d" % neutral)
    print("Formal + Informal: %d" % double)
    print("You in Neutral: %d" % nn_lost)
    print("You not in Formal: %d" % on_lost)
    print("You not in Informal: %d" % no_lost)
    print("Total sentences %d" % total)
    print("")
def convertGutenberg(f_root, f_type, f_gproj, slda=True, filter=True):
    """ Converts the Gutenberg corpus to SLDA or LLDA format.

    Labels come from the f=/i= attributes on each sentence line:
      [2] neutral  (f=0, i=0) -- kept only if the sentence lacks "you"
      [1] formal   (f=1, i=0) -- kept only if the sentence contains "you"
      [0] informal (f=0, i=1) -- kept only if the sentence contains "you"
    Sentences labelled both formal and informal are counted but never written.

    f_root  -- output folder for the converted file
    f_type  -- corpus split folder name ("train" or "test")
    f_gproj -- root folder of the Gutenberg corpus
    slda    -- write SLDA format ("[label|words].") instead of LLDA ("[label] words")
    filter  -- drop stopwords, one-char and non-alphabetic tokens
    """
    stopwords = getStopwords()
    formal = 0
    informal = 0
    neutral = 0
    double = 0    # sentences labelled both formal and informal
    nn_lost = 0   # neutral sentences dropped because they contain "you"
    on_lost = 0   # formal sentences dropped because they lack "you"
    no_lost = 0   # informal sentences dropped because they lack "you"
    total = 0
    form = "SLDA"
    f_ttype = f_type
    if not slda:
        form = "LLDA"
        # LLDA evaluation expects the test split under the name "gold".
        if f_type == "test":
            f_ttype = "gold"
    print("Converting Gutenberg corpus to %s format for %sing" % (form, f_type))
    with copen(f_root + f_ttype, "w", encoding="utf-8") as lda_f:
        for f in listdir(f_gproj + f_type):
            with copen(f_gproj + f_type + "/" + f, encoding="utf-8") as proj_f:
                for line in proj_f:
                    m = match(".*>(.+)</.*", line)
                    if m:
                        sentence = m.group(1).strip().split(" ")
                        clean = []
                        for word in sentence:
                            word = word.lower()
                            if filter:
                                if word not in stopwords and len(word) > 1 and word.isalpha():
                                    clean.append(word)
                            else:
                                if word.isalpha():
                                    clean.append(word)
                        # Sentences reduced to fewer than two words are skipped.
                        if len(clean) > 1:
                            sentence = " ".join(clean)
                            if "f=\"0\" i=\"0\"" in line:
                                if not "you" in clean:
                                    neutral += 1
                                    total += 1
                                    if slda:
                                        lda_f.write("[2|%s]." % sentence)
                                    else:
                                        lda_f.write("[2] %s\n" % sentence)
                                else:
                                    nn_lost += 1
                            elif "f=\"1\" i=\"0\"" in line:
                                if not "you" in clean:
                                    on_lost += 1
                                else:
                                    formal += 1
                                    total += 1
                                    if slda:
                                        lda_f.write("[1|%s]." % sentence)
                                    else:
                                        lda_f.write("[1] %s\n" % sentence)
                            elif "f=\"0\" i=\"1\"" in line:
                                if not "you" in clean:
                                    no_lost += 1
                                else:
                                    informal += 1
                                    total += 1
                                    if slda:
                                        lda_f.write("[0|%s]." % sentence)
                                    else:
                                        lda_f.write("[0] %s\n" % sentence)
                            elif "f=\"1\" i=\"1\"" in line:
                                # BUG FIX: was "double += 0", so the
                                # "Formal + Informal" statistic was always 0.
                                double += 1
            # Newline terminates each source file's document.
            # NOTE(review): placement reconstructed from collapsed source; confirm.
            lda_f.write("\n")
    print("Formal: %d" % formal)
    print("Informal: %d" % informal)
    print("Neutral: %d" % neutral)
    print("Formal + Informal: %d" % double)
    print("You in Neutral: %d" % nn_lost)
    print("You not in Formal: %d" % on_lost)
    print("You not in Informal: %d" % no_lost)
    print("Total sentences %d" % total)
    print("")
def cleanCopyDocuments(f_align_p, f_corpus, f_clean, f_stats, f_rem, filter=True):
    """ Copies the documents with alignment in a clean format to a new folder as text files.

    f_align_p -- pickled structure mapping document-pair keys to lists of
                 sentence-alignment projections; updated in place and re-dumped
    f_corpus  -- root folder of the gzipped corpus files
    f_clean   -- destination folder for the cleaned XML copies
    f_stats   -- statistics file (appended to)
    f_rem     -- log file recording removed sentences
    filter    -- if True, drop stopwords, one-char and non-alphabetic tokens
                 from English documents (German text is copied verbatim)
    """
    align_p_f = loadStruct(f_align_p)
    stopwords = getStopwords()
    n_docs = len(align_p_f)
    words_total = 0   # English words seen
    words_lost = 0    # English words dropped by the filter
    sents_lost = 0    # sentences dropped (fewer than two kept words)
    with open(f_rem, "w") as rem_f:
        for i, key in enumerate(align_p_f.iterkeys()):
            # Progress report every 500 documents and at the last one.
            # NOTE(review): when i == 0 the first branch fires (0 % 500 == 0),
            # so the elif's i == 0 case is unreachable -- confirm intent.
            if i % 500 == 0:
                print "Documents: %d/%d" % (i, n_docs)
            elif i == 0 or i == n_docs - 1:
                print "Documents: %d/%d" % (i + 1, n_docs)
            # key holds one filename per language (en/de pair, judging by the
            # startswith checks below) -- TODO confirm against loadStruct.
            for lang in key:
                fname = f_clean + lang.replace(".gz", "")
                createPath(fname)
                with copen(fname, "w", encoding="utf-8") as xml_f:
                    doc = []      # one word-list per <s> sentence
                    last_id = 0   # id attribute of the last <s> element seen
                    words = 0
                    with gopen(f_corpus + lang) as clean_f:
                        for line in clean_f:
                            line = line.strip()
                            if line.startswith("<s"):
                                last_id = match(".*id=\"([0-9]+)\"", line).group(1)
                                doc.append([])
                            # NOTE(review): a <w before any <s would raise
                            # IndexError on doc[-1] below.
                            if line.startswith("<w"):
                                m = match(".*>(.+)</", line)
                                if m:
                                    word = m.group(1)
                                    words += 1
                                    # Only English text is cleaned; German words
                                    # are kept as-is.
                                    if lang.startswith("en"):
                                        words_total += 1
                                        word = word.strip().lower().replace("\'", "")
                                        if filter and word not in stopwords and len(word) > 1 and word.isalpha():
                                            doc[-1].append(word)
                                        elif not filter:
                                            doc[-1].append(word)
                                        else:
                                            words_lost += 1
                                    elif lang.startswith("de"):
                                        doc[-1].append(word)
                    # Header records last sentence id, word count and file name.
                    xml_f.write("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<d s=\"%s\" w=\"%s\" f=\"%s\">\n" % (last_id, words, lang.replace(".gz", "")))
                    for k, v in enumerate(doc):
                        sid = k + 1  # sentence ids are 1-based
                        if len(v) > 1:
                            xml_f.write("<s id=\"%s\">%s</s>\n" % (sid, " ".join(v).decode("utf-8")))
                        if len(v) <= 1:
                            # Sentence too short after cleaning: log it and
                            # remove the first alignment projection referencing
                            # it (remove+break keeps the iteration safe).
                            sents_lost += 1
                            rem_f.write("[R] %s %s %s\n" % (str(key), lang[0:2], sid))
                            for projection in align_p_f[key]:
                                # projection[0] is checked for German ids,
                                # projection[1] for English ids (space-separated).
                                if lang.startswith("de") and str(sid) in projection[0].split(" "):
                                    align_p_f[key].remove(projection)
                                    break
                                elif lang.startswith("en") and str(sid) in projection[1].split(" "):
                                    align_p_f[key].remove(projection)
                                    break
                    xml_f.write("</d>\n")
                    xml_f.flush()
    with open(f_stats, "a") as stats_f:
        stats_f.write("Removed: %d sentences\n" % sents_lost)
        scount = 0
        for v in align_p_f.itervalues():
            scount += len(v)
        stats_f.write("Remaining: %d sentences\n" % scount)
        stats_f.write("Total words: %d\n" % words_total)
        stats_f.write("Words lost: %d\n" % words_lost)
        # NOTE(review): "remmaining" typo is part of the output string; left
        # unchanged here since this edit is documentation-only.
        stats_f.write("Words remmaining: %d\n" % (words_total - words_lost))
    # Persist the pruned alignment structure.
    dumpStruct(f_align_p, align_p_f)
def cleanCopyDocuments(f_align_p, f_corpus, f_clean, f_stats, f_rem, filter=True):
    """ Copies the documents with alignment in a clean format to a new folder as text files.

    f_align_p -- pickled structure mapping document-pair keys to lists of
                 sentence-alignment projections; updated in place and re-dumped
    f_corpus  -- root folder of the gzipped corpus files
    f_clean   -- destination folder for the cleaned XML copies
    f_stats   -- statistics file (appended to)
    f_rem     -- log file recording removed sentences
    filter    -- if True, drop stopwords, one-char and non-alphabetic tokens
                 from English documents (German text is copied verbatim)
    """
    align_p_f = loadStruct(f_align_p)
    stopwords = getStopwords()
    n_docs = len(align_p_f)
    words_total = 0   # English words seen
    words_lost = 0    # English words dropped by the filter
    sents_lost = 0    # sentences dropped (fewer than two kept words)
    with open(f_rem, "w") as rem_f:
        for i, key in enumerate(align_p_f.iterkeys()):
            # Progress report every 500 documents and at the last one.
            # NOTE(review): when i == 0 the first branch fires (0 % 500 == 0),
            # so the elif's i == 0 case is unreachable -- confirm intent.
            if i % 500 == 0:
                print "Documents: %d/%d" % (i, n_docs)
            elif i == 0 or i == n_docs - 1:
                print "Documents: %d/%d" % (i + 1, n_docs)
            # key holds one filename per language (en/de pair, judging by the
            # startswith checks below) -- TODO confirm against loadStruct.
            for lang in key:
                fname = f_clean + lang.replace(".gz", "")
                createPath(fname)
                with copen(fname, "w", encoding="utf-8") as xml_f:
                    doc = []      # one word-list per <s> sentence
                    last_id = 0   # id attribute of the last <s> element seen
                    words = 0
                    with gopen(f_corpus + lang) as clean_f:
                        for line in clean_f:
                            line = line.strip()
                            if line.startswith("<s"):
                                last_id = match('.*id="([0-9]+)"', line).group(1)
                                doc.append([])
                            # NOTE(review): a <w before any <s would raise
                            # IndexError on doc[-1] below.
                            if line.startswith("<w"):
                                m = match(".*>(.+)</", line)
                                if m:
                                    word = m.group(1)
                                    words += 1
                                    # Only English text is cleaned; German words
                                    # are kept as-is.
                                    if lang.startswith("en"):
                                        words_total += 1
                                        word = word.strip().lower().replace("'", "")
                                        if filter and word not in stopwords and len(word) > 1 and word.isalpha():
                                            doc[-1].append(word)
                                        elif not filter:
                                            doc[-1].append(word)
                                        else:
                                            words_lost += 1
                                    elif lang.startswith("de"):
                                        doc[-1].append(word)
                    # Header records last sentence id, word count and file name.
                    xml_f.write(
                        '<?xml version="1.0" encoding="utf-8"?>\n<d s="%s" w="%s" f="%s">\n' % (last_id, words, lang.replace(".gz", ""))
                    )
                    for k, v in enumerate(doc):
                        sid = k + 1  # sentence ids are 1-based
                        if len(v) > 1:
                            xml_f.write('<s id="%s">%s</s>\n' % (sid, " ".join(v).decode("utf-8")))
                        if len(v) <= 1:
                            # Sentence too short after cleaning: log it and
                            # remove the first alignment projection referencing
                            # it (remove+break keeps the iteration safe).
                            sents_lost += 1
                            rem_f.write("[R] %s %s %s\n" % (str(key), lang[0:2], sid))
                            for projection in align_p_f[key]:
                                # projection[0] is checked for German ids,
                                # projection[1] for English ids (space-separated).
                                if lang.startswith("de") and str(sid) in projection[0].split(" "):
                                    align_p_f[key].remove(projection)
                                    break
                                elif lang.startswith("en") and str(sid) in projection[1].split(" "):
                                    align_p_f[key].remove(projection)
                                    break
                    xml_f.write("</d>\n")
                    xml_f.flush()
    with open(f_stats, "a") as stats_f:
        stats_f.write("Removed: %d sentences\n" % sents_lost)
        scount = 0
        for v in align_p_f.itervalues():
            scount += len(v)
        stats_f.write("Remaining: %d sentences\n" % scount)
        stats_f.write("Total words: %d\n" % words_total)
        stats_f.write("Words lost: %d\n" % words_lost)
        # NOTE(review): "remmaining" typo is part of the output string; left
        # unchanged here since this edit is documentation-only.
        stats_f.write("Words remmaining: %d\n" % (words_total - words_lost))
    # Persist the pruned alignment structure.
    dumpStruct(f_align_p, align_p_f)