def get_info(filename):
    """Describe a corpus file based on the shape of its path.

    Three kinds of path are distinguished, checked in this order:

    * ``<basename>/<l1>-<l2>.<backend>`` (two-character language codes):
      the file is loaded with ``Alignment.from_file`` and a dict with
      ``type`` set to ``'alignment2'`` is returned, holding the language
      codes, the paths of the two text files
      (``<basename>/<l1>.txt`` and ``<basename>/<l2>.txt``), the backend
      name, the alignment's summed cost and the length of its data.
    * ``<basename>/<lang>.txt``: the file is loaded with ``Text.from_file``
      and a dict with ``type`` set to ``'text'`` is returned, holding the
      paragraph count, the length of the text as a string and the first
      paragraph as the title.
    * anything else: a dict with just ``filename`` and ``size`` (in bytes,
      from ``os.path.getsize``).

    This is a best-effort helper and never raises for ordinary errors: if
    anything fails (unreadable file, import problem, a text with no
    paragraphs, ...) a ``defaultdict`` is returned in which every key maps
    to the string ``"<error>"``.
    """
    try:
        # alignment -- the dot before the backend name is a literal dot
        m = re.match(r'(.*)/(\w\w)-(\w\w)\.(\w+)$', filename)
        if m:
            # Imported lazily so that an import failure is also reported
            # through the "<error>" fallback below.
            from toolkit import Alignment
            a = Alignment.from_file(filename)
            basename, lang1, lang2, backend = m.groups()
            return {
                'type': 'alignment2',
                'basename': basename,
                'lang1': lang1,
                'lang2': lang2,
                'text1': "%s/%s.txt" % (basename, lang1),
                'text2': "%s/%s.txt" % (basename, lang2),
                'backend': backend,
                'cost': a.summed_cost(),
                'length': len(a.data),
            }

        # text file -- exactly the ".txt" extension, nothing looser
        m = re.match(r'(.*)/(\w\w)\.txt$', filename)
        if m:
            from toolkit import Text
            t = Text.from_file(filename)
            paragraphs = t.as_paragraphs()
            return {
                'type': 'text',
                'basename': m.group(1),
                'lang': m.group(2),
                'paragraphs': len(paragraphs),
                'length': len(t.as_string()),
                # An empty text raises IndexError here, which ends up in
                # the "<error>" fallback like any other failure.
                'title': paragraphs[0],
            }

        return {
            'filename': filename,
            'size': os.path.getsize(filename),
        }
    except Exception:
        # Deliberately broad: callers get a placeholder for every field
        # instead of an exception.
        from collections import defaultdict
        return defaultdict(lambda: "<error>")
def read_all_pairs(filename):
    """Iterate over the sentence pairs described by an alignment file.

    ``filename`` must look like ``<basename>/<lang1>-<lang2>.<ext>`` with
    two-character language codes. The alignment is loaded from that file
    and the two texts from ``<basename>/<lang1>.txt`` and
    ``<basename>/<lang2>.txt``.

    Side effect: the module-level globals ``lang1`` and ``lang2`` are set
    to the language codes taken from ``filename``.

    Yields ``(s1, s2)`` tuples. For every range pair produced by
    ``alignment.as_ranges`` the items on each side are joined with a
    diamond separator and passed through ``preprocess``.

    Nothing is yielded when ``Alignment.from_file`` raises ``ValueError``.

    Raises ``AssertionError`` if ``filename`` does not have the expected
    shape. Because this is a generator, the error surfaces when iteration
    starts, not when the function is called.
    """
    global lang1, lang2

    # The dot before the extension is a literal dot.
    m = re.match(r'(.*)/(\w\w)-(\w\w)\.\w+$', filename)
    if not m:
        # Raised explicitly instead of using `assert`, so the check is not
        # stripped when Python runs with -O; the exception type callers
        # see is the same as before.
        raise AssertionError("unexpected alignment filename: %r" % (filename,))
    basename, lang1, lang2 = m.groups()

    try:
        alignment = Alignment.from_file(filename)
    except ValueError:
        # An alignment that cannot be loaded yields no pairs at all.
        return

    t1 = Text.from_file(basename + '/' + lang1 + '.txt', lang1)
    t2 = Text.from_file(basename + '/' + lang2 + '.txt', lang2)
    seq1 = t1.as_sentences_flat()
    seq2 = t2.as_sentences_flat()

    separator = unicode(' ♦ ', 'utf-8')
    for s1, s2 in alignment.as_ranges(seq1, seq2):
        s1 = preprocess(separator.join(s1))
        s2 = preprocess(separator.join(s2))
        yield s1, s2
import sys from toolkit import Text from collections import defaultdict if __name__ == '__main__': paragraph_counts = defaultdict(lambda: 0) filenames = sys.argv[1:] if not filenames: print __doc__ sys.exit() for filename in filenames: t = Text.from_file(filename) for paragraph in t.as_paragraphs(): paragraph_counts[paragraph] += 1 paragraphs_as_list = [(count, paragraph) for (paragraph, count) in paragraph_counts.iteritems()] paragraphs_as_list.sort(reverse=True) for (count, paragraph) in paragraphs_as_list: if count > 1: print count, paragraph[:100].encode('utf-8')