Example #1
0
def get_info(filename):
    """Describe *filename* as a dict whose keys depend on the kind of file.

    The kind is decided from the path alone:

    * ``<base>/<l1>-<l2>.<backend>`` (two-character language codes) is an
      alignment.  It is loaded with ``toolkit.Alignment.from_file`` and the
      result has the keys ``type`` (``'alignment2'``), ``basename``,
      ``lang1``, ``lang2``, ``text1``, ``text2``, ``backend``, ``cost``
      and ``length``.
    * ``<base>/<ll>.txt`` is a text.  It is loaded with
      ``toolkit.Text.from_file`` and the result has the keys ``type``
      (``'text'``), ``basename``, ``lang``, ``paragraphs``, ``length``
      and ``title`` (the first paragraph).
    * Anything else yields just ``filename`` and ``size`` (bytes on disk).

    This is a best-effort helper: if anything goes wrong (missing file,
    import failure, empty text, ...) it does not raise but returns a
    ``defaultdict`` in which every key maps to ``"<error>"``.
    """
    try:
        # alignment
        # Raw string with an escaped dot: the extension separator must be a
        # literal '.', not "any character".
        m = re.match(r'(.*)/(\w\w)-(\w\w)\.(\w+)$', filename)
        if m:
            from toolkit import Alignment
            a = Alignment.from_file(filename)
            basename, lang1, lang2, backend = m.groups()
            return {'type': 'alignment2',
                    'basename': basename,
                    'lang1': lang1,
                    'lang2': lang2,
                    'text1': "%s/%s.txt" % (basename, lang1),
                    'text2': "%s/%s.txt" % (basename, lang2),
                    'backend': backend,
                    'cost': a.summed_cost(),
                    'length': len(a.data)}

        # text file
        # Only a real ".txt" suffix counts; the unescaped dot used to let
        # names such as "abctxt" through.
        m = re.match(r'(.*)/(\w\w)\.txt$', filename)
        if m:
            from toolkit import Text
            t = Text.from_file(filename)
            paragraphs = t.as_paragraphs()
            return {'type': 'text',
                    'basename': m.group(1),
                    'lang': m.group(2),
                    'paragraphs': len(paragraphs),
                    'length': len(t.as_string()),
                    # An empty text raises IndexError here and therefore
                    # ends up in the "<error>" fallback below.
                    'title': paragraphs[0]}

        # any other file
        return {'filename': filename,
                'size': os.path.getsize(filename)}
    except Exception:
        # Deliberate catch-all: callers can index the result with any key
        # and get a placeholder instead of a traceback.
        from collections import defaultdict
        return defaultdict(lambda: "<error>")
Example #2
0
def read_all_pairs(filename):
    """Iterate over the aligned sentence pairs of an alignment file.

    *filename* must have the form ``<base>/<l1>-<l2>.<ext>`` where ``l1``
    and ``l2`` are two-character language codes.  The alignment is loaded
    from *filename*; the two texts are loaded from ``<base>/<l1>.txt`` and
    ``<base>/<l2>.txt``.

    Side effect: the module-level globals ``lang1`` and ``lang2`` are
    rebound to the two language codes taken from *filename*.
    NOTE(review): presumably other code in this module reads them --
    confirm before removing.

    Yields:
        ``(s1, s2)`` tuples.  For each pair of sentence groups produced by
        ``alignment.as_ranges``, the sentences of a group are joined with
        ``' <U+2666> '`` and passed through ``preprocess``.

    Raises:
        AssertionError: if *filename* does not have the expected form.
            As this is a generator, it is raised on the first iteration.

    If ``Alignment.from_file`` raises ``ValueError`` the generator simply
    ends without yielding anything.
    """
    global lang1, lang2

    # Raw string with an escaped dot so that only a literal '.' separates
    # the language pair from the extension.
    m = re.match(r'(.*)/(\w\w)-(\w\w)\.\w+$', filename)
    if m is None:
        # Same exception type as the former ``assert m``, but raised
        # explicitly so the check is not stripped by ``python -O``.
        raise AssertionError(
            "not an alignment file name: %r" % (filename,))

    basename = m.group(1)
    lang1 = m.group(2)
    lang2 = m.group(3)

    try:
        alignment = Alignment.from_file(filename)
    except ValueError:
        return

    t1 = Text.from_file(basename + '/' + lang1 + '.txt', lang1)
    t2 = Text.from_file(basename + '/' + lang2 + '.txt', lang2)
    seq1 = t1.as_sentences_flat()
    seq2 = t2.as_sentences_flat()

    # U+2666 (black diamond suit) padded with spaces; written as an escape
    # so the value does not depend on the source file's encoding.
    separator = u' \u2666 '
    for s1, s2 in alignment.as_ranges(seq1, seq2):
        s1 = preprocess(separator.join(s1))
        s2 = preprocess(separator.join(s2))
        yield s1, s2
Example #3
0
import sys
from toolkit import Text
from collections import defaultdict

if __name__ == '__main__':
    # Report every paragraph that occurs more than once across the files
    # named on the command line, most frequent first.

    filenames = sys.argv[1:]
    if not filenames:
        # No input files: print the module docstring and stop.
        print(__doc__)
        sys.exit()

    # paragraph -> number of occurrences over all input files
    paragraph_counts = defaultdict(int)
    for filename in filenames:
        for paragraph in Text.from_file(filename).as_paragraphs():
            paragraph_counts[paragraph] += 1

    # Keep only repeated paragraphs.  Sorting (count, paragraph) tuples in
    # reverse orders by count first, then by paragraph, both descending.
    repeated = [(count, paragraph)
                for (paragraph, count) in paragraph_counts.items()
                if count > 1]
    repeated.sort(reverse=True)

    for (count, paragraph) in repeated:
        # Show at most the first 100 characters of each paragraph.
        # The single-argument print(...) form parses on Python 2 and 3.
        print("%d %s" % (count, paragraph[:100].encode('utf-8')))