Example #1
def from_aligned_file(syllabus_name, aligned_file, output_file):
    _log.start('Extracting gp-aligned words', nSteps=4)

    _log.log('Building set of expected words')
    include_set = set((w.surface, w.reading) for w in \
            align_core.iter_words(syllabus_name))

    _log.log('Loading alignments')
    alignments = AlignedFile(aligned_file)
    
    _log.log('Saving alignments')
    o_stream = sopen(output_file, 'w')
    for alignment in alignments:
        key = (alignment.grapheme, alignment.phoneme)
        if key in include_set:
            print >> o_stream, alignment.to_line()
            include_set.remove(key)
    o_stream.close()

    if include_set:
        _log.finish('%d entries not found (see missing.log)' % len(include_set))
        o_stream = sopen('missing.log', 'w')
        for surface, reading in sorted(include_set):
            print >> o_stream, '%s %s:%s %s' % (surface, reading, surface,
                reading)
        o_stream.close()
    else:
        _log.finish('All entries found')
Example #2
def escapeUtf8(inputFile, outputFile):
    iStream = sopen(inputFile, 'r', 'utf8')
    oStream = sopen(outputFile, 'w', 'unicode-escape')
    for line in iStream:
        oStream.write(line)
    oStream.close()
    iStream.close()
    return
Example #3
 def test_from_packed_file(self):
     test_filename = tempfile.mktemp()
     o_stream = sopen(test_filename, 'w')
     print >> o_stream, 'dog bark:9,pee:1'
     print >> o_stream, 'cat meow:10'
     o_stream.close()
     self._check_file(test_filename, 'packed')
Example #4
    def _build_alternation_map(self):
        """
        Calculates and returns an alternation map, from alternation to
        canonical reading. In other words, it maps (k, r) to r*.
        """
        # Generate an alternation distribution.
        from_canonical_reading = {}
        i_stream = sopen(_reading_counts_map_file, 'r')
        for line in i_stream:
            line = line.rstrip().split()
            kanji = line.pop(0)
            assert line

            for lineSeg in line:
                lineSeg = lineSeg.split(':')
                if len(lineSeg) == 2:
                    reading, count = lineSeg
                    alt_reading = reading
                elif len(lineSeg) == 3:
                    reading, alt_reading, count = lineSeg
                else:
                    raise Exception("File %s is badly formatted"
                                    % _reading_counts_map_file)

                key = (kanji, reading)
                if key in from_canonical_reading:
                    from_canonical_reading[key].add(alt_reading)
                else:
                    from_canonical_reading[key] = set([alt_reading])

        i_stream.close()

        return from_canonical_reading
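From the parser above, each line of _reading_counts_map_file evidently holds a kanji followed by segments of the form reading:count or reading:alt_reading:count. A minimal standalone sketch of the map this builds, using invented ASCII placeholder data:

    # Format inferred from the parser above; the data here is invented.
    #   <kanji> <reading>:<count> <reading>:<alt_reading>:<count> ...
    line = 'K r1:5 r1:r2:3'
    segments = line.rstrip().split()
    kanji = segments.pop(0)
    mapping = {}
    for seg in segments:
        parts = seg.split(':')
        reading = parts[0]
        alt_reading = parts[1] if len(parts) == 3 else parts[0]
        mapping.setdefault((kanji, reading), set()).add(alt_reading)
    # mapping == {('K', 'r1'): set(['r1', 'r2'])}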
Example #5
def dump_errors(output_file):
    ostream = sopen(output_file, 'w')
    fields = [
        'item', 'item_type', 'question_type', 'user_answer', 'correct_answer'
    ]
    for i in xrange(settings.N_DISTRACTORS - 1):
        fields.append('other_distractor_%d' % (i + 1))
    print >> ostream, '#' + ','.join(fields)
    writer = csv.writer(ostream)

    for response in models.MultipleChoiceResponse.objects.filter(
            option__is_correct=False):
        row = []
        row.append(response.question.pivot)
        row.append(response.question.pivot_type)
        row.append(response.question.question_type)

        user_answer = response.option.value
        all_options = response.option.question.options.all()
        correct_answer = all_options.get(is_correct=True).value
        distractors = [
            o.value for o in all_options
            if not o.is_correct and o.value != user_answer
        ]
        row.append(user_answer)
        row.append(correct_answer)
        row.extend(distractors)
        writer.writerow(row)
    ostream.close()
Example #6
    def load_file(self, fpath_or_buf, **csv_kwargs):
        """
        A generator reading a given file line by line.

        :param fpath_or_buf:
            This can either be a file path or open file buffer.

        :param csv_kwargs:
            By default, the kwargs passed to :py:func:`csv.reader` are those for
            a standard Tatoeba file. You can pass additional keyword arguments
            here.
        """
        reader_kwargs = dict(delimiter='\t')
        reader_kwargs.update(csv_kwargs)

        if PY2:
            encoding = None
            encode_row = lambda row: [col.decode('utf-8') for col in row]
        else:
            encoding = 'utf-8'
            encode_row = lambda row: row

        if getattr(fpath_or_buf, 'read', None) is None:
            cfile = sopen(fpath_or_buf, mode='r', encoding=encoding)
        else:
            cfile = _NullContextWrapper(fpath_or_buf)

        with cfile as f:
            reader = csv.reader(f, **reader_kwargs)

            for row in reader:
                yield encode_row(row)
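A minimal usage sketch for the generator above; reader stands in for an instance of whichever class defines load_file, and sentences.csv is a hypothetical tab-separated file:

    import csv

    # Iterating by path: each yielded row is a list of unicode columns.
    for row in reader.load_file('sentences.csv'):
        print(row)

    # An already-open buffer works too, and extra keyword arguments are
    # passed straight through to csv.reader.
    with open('sentences.csv') as f:
        rows = list(reader.load_file(f, quoting=csv.QUOTE_NONE))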
Example #7
def load_lexicon(filename=_jmdict_path):
    " Reloads the lexicon into the database."
    log.start('Rebuilding the lexicon', nSteps=5)
    if not Checksum.needs_update(_checksum_tag, _dependencies + [filename]):
        log.finish('Already up-to-date')
        return

    log.log('Loading probability distributions')
    models.initialise()
    
    log.start('Loading JMdict', nSteps=2)
    _clear_lexicon()
    log.log('Reading from %s' % path.basename(filename))
    iStream = sopen(filename, 'r', 'byte')
    data = iStream.read()
    iStream.close()
    log.log('Parsing XML tree')
    tree = ElementTree.fromstring(data)
    del data
    log.finish()
    
    _store_lexemes(tree.getchildren())

    log.log('Storing checksum')
    Checksum.store(_checksum_tag, _dependencies + [filename])
    
    log.finish()
Example #8
 def from_file(filename):
     dist = FreqDist()
     i_stream = sopen(filename)
     for line in i_stream:
         symbol, count = line.rstrip().split()
         dist.inc(symbol, int(count))
     i_stream.close()
     return dist
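This reads back the whitespace-separated 'symbol count' rows that to_file writes in Example #15 below, so the two round-trip. A minimal sketch, assuming from_file is exposed as a static constructor on the FreqDist class and using an invented file name:

    # Write a tiny fixture, then reload it; 'counts.txt' is hypothetical.
    o_stream = sopen('counts.txt', 'w')
    o_stream.write(u'dog 9\ncat 10\n')
    o_stream.close()

    dist = FreqDist.from_file('counts.txt')
    assert dist[u'dog'] == 9 and dist[u'cat'] == 10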
Example #9
 def __init__(self, filename):
     i_stream = sopen(filename)
     unique_kanji = scripts.unique_kanji
     for line in i_stream:
         if line.lstrip().startswith('#'):
             continue
         self.update(unique_kanji(line))
     i_stream.close()
Example #10
 def test_from_row_file(self):
     test_filename = tempfile.mktemp()
     o_stream = sopen(test_filename, 'w')
     print >> o_stream, 'dog bark 9'
     print >> o_stream, 'dog pee 1'
     print >> o_stream, 'cat meow 10'
     o_stream.close()
     self._check_file(test_filename, 'row')
Example #11
 def test_add(self):
     import add_syllabus
     from kanji_test.lexicon import load_lexicon
     import consoleLog
     consoleLog.default.oStream = sopen('/dev/null', 'w')
     load_lexicon.load_lexicon()
     add_syllabus.add_all_syllabi()
     models.Syllabus.validate()
Example #12
def _dumpArticle(text):
    global _lastArticle
    filename = 'article%.04d.txt' % _lastArticle
    print filename
    oStream = sopen(filename, 'w', 'utf8')
    oStream.write(text)
    oStream.close()
    _lastArticle += 1
    return
Example #13
def get_edict():
    if getattr(get_edict, '_cached', None) is not None:
        return get_edict._cached

    with sopen(get_data_loc('edict', extension=''), mode='r') as edf:
        edict = auto_format.load_dictionary(edf)

    get_edict._cached = edict

    return edict
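get_edict caches the parsed dictionary on a function attribute, so only the first call pays the cost of parsing EDICT. A minimal usage sketch, assuming the cjktools EDICT data file is installed:

    edict = get_edict()          # first call loads and parses the file
    assert get_edict() is edict  # later calls return the cached object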
Example #14
 def to_file(self, filename):
     """Stores the distribution to a file."""
     o_stream = sopen(filename, 'w')
     for condition in self.conditions():
         cond_dist = self[condition]
         for sample in cond_dist.samples():
             count = cond_dist[sample]
             print >> o_stream, u'%s %s %d' % (condition, sample, count)
     o_stream.close()
     return
Example #15
 def to_file(self, filename):
     o_stream = sopen(filename, 'w')
     for sample in self.samples():
         count = self[sample]
         sample = unicode(sample)
         if len(sample.split()) > 1:
             raise ValueError('sample contains whitespace')
         print >> o_stream, u'%s %d' % (sample, count)
     o_stream.close()
     return
Example #16
File: kanjidic.py, Project: eromoe/cjktools
    def __init__(self, kanjidic_files=None):
        dict.__init__(self)

        if kanjidic_files is None:
            kanjidic_files = [
                cjkdata.get_resource('kanjidic'),
                cjkdata.get_resource('kanjd212'),
            ]

        line_stream = reduce(chain, [sopen(f) for f in kanjidic_files])
        self._parse_kanjidic(line_stream)
Example #17
def unescapeUtf8(inputFile, outputFile):
    iStream = open(inputFile, 'r')
    oStream = sopen(outputFile, 'w', 'utf8')

    for line in iStream:
        line = unicode(line.replace('\\N', '\\\\N'), 'unicode-escape')
        oStream.write(line)

    oStream.close()
    iStream.close()
    return
Example #18
def parseSgml(inputFile, outputFile):
    """
    Extracts each <s>...</s> sentence span from an SGML file, writing one
    sentence per line.
    """
    iStream = sopen(inputFile, 'r')
    oStream = sopen(outputFile, 'w')

    pat = re.compile(r'<s>(.+?)</s>', re.MULTILINE | re.DOTALL | re.UNICODE)

    # Read in 1 MiB blocks; a match that straddles a block boundary
    # will be missed.
    blockSize = 1024 * 1024
    block = iStream.read(blockSize)
    while block:
        for match in pat.finditer(block):
            print >> oStream, match.group(1).replace('\n', ' ').strip(u'  ')
        block = iStream.read(blockSize)

    oStream.close()
    iStream.close()
    return
Example #19
    def __init__(self, filename):
        self._words = []
        i_stream = sopen(filename)
        for i, line in enumerate(i_stream):
            if line.lstrip().startswith('#'):
                continue

            try:
                self._words.append(WordEntry.from_line(line))
            except Exception:
                raise FormatError('on line %d of %s' % (i + 1, filename))
        i_stream.close()
Example #20
    def __init__(self):
        ConditionalFreqDist.__init__(self)

        kanji_script = scripts.Script.Kanji
        i_stream = sopen(_edict_aligned_file, 'r')
        for line in i_stream:
            alignment = Alignment.from_line(line)
            for (g, p) in alignment:
                if scripts.contains_script(kanji_script, g):
                    self[g].inc(scripts.to_hiragana(p))
        i_stream.close()
        return
Example #21
 def from_file_row_format(filename):
     """
     Loads a distribution from a row_format file.
     """
     dist = ConditionalFreqDist()
     i_stream = sopen(filename)
     for line in i_stream:
         condition, symbol, count = line.rstrip().split()
         count = int(count)
         dist[condition].inc(symbol, count)
     i_stream.close()
     return dist
Example #22
    def __init__(self, kanjidic_files=None):
        super(Kanjidic, self).__init__()

        if kanjidic_files is None:
            kanjidic_files = [
                cjkdata.get_resource('kanjidic'),
                cjkdata.get_resource('kanjd212'),
            ]

        with ExitStack() as stack:
            file_chain = (stack.enter_context(sopen(f, mode='r'))
                          for f in kanjidic_files)
            line_stream = reduce(chain, file_chain)

            self._parse_kanjidic(line_stream)
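Unlike the reduce(chain, ...) variant in Example #16, which never closes its file handles, the ExitStack here guarantees every opened file is closed when parsing finishes or fails. The same pattern generalises to any variable-length list of files; a minimal standalone sketch with hypothetical paths:

    from contextlib import ExitStack
    from itertools import chain

    def iter_lines(paths):
        # Register every file on one stack so that all of them are
        # closed when the with-block exits, even on error.
        with ExitStack() as stack:
            handles = (stack.enter_context(open(p)) for p in paths)
            for line in chain.from_iterable(handles):
                yield line

    # e.g. for line in iter_lines(['a.txt', 'b.txt']): ...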
Example #23
 def from_file_packed_format(filename):
     """
     Loads a distribution from a packed format file. Rows in this file
     look like:
     
     conditionA symA:1,symB:10
     """
     dist = ConditionalFreqDist()
     i_stream = sopen(filename)
     for line in i_stream:
         condition, symbol_counts = line.split()
         for symbol_count in symbol_counts.split(','):
             symbol, count_str = symbol_count.split(':')
             count = int(count_str)
             dist[condition].inc(symbol, count)
     i_stream.close()
     return dist
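The packed rows parsed here match the fixture written in Example #3 ('dog bark:9,pee:1'). A minimal trace of the inner parse, using that same data:

    line = 'dog bark:9,pee:1'
    condition, symbol_counts = line.split()
    pairs = [seg.split(':') for seg in symbol_counts.split(',')]
    # condition == 'dog'; pairs == [['bark', '9'], ['pee', '1']]
    # each count still needs int() before dist[condition].inc(symbol, count)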
Example #24
File: place.py, Project: eromoe/cjktools
    def from_file(cls, filename):
        i_stream = sopen(filename)
        lines = iter(enumerate(i_stream))

        depth, root_node = cls._from_line(lines.next()[1])
        if depth != 0:
            raise Exception("file %s should start with a root node"
                            % filename)

        path = [root_node]
        last_depth = depth
        last_node = root_node
        for line_no, line in lines:
            depth, node = cls._from_line(line)
            if depth == last_depth + 1:
                # One level deeper, the last node was the parent.
                path.append(last_node)

            elif depth == last_depth:
                # Same level, same parent.
                pass

            elif depth < last_depth:
                # Up one or more levels.
                depth_diff = last_depth - depth
                path = path[:-depth_diff]

            else:
                raise Exception("Strange depth found %s (line %d)"
                                % (filename, line_no + 1))

            path[-1].append(node)
            last_node = node
            last_depth = depth

        i_stream.close()

        return root_node
Example #25
def _store_words(syllabus, syllabus_bundle):
    """
    Try to find a matching lexicon word for each word in the syllabus, then
    store the limited knowledge we have about it in a partial lexeme object.
    """
    _log.start('Parsing word list', nSteps=1)
    n_ok = 0
    skipped_words = []
    for word in syllabus_bundle.words: 
        partial_lexeme = _find_in_lexicon(word, skipped_words, syllabus)
        if partial_lexeme:
            n_ok += 1
    _log.log('%d ok, %d skipped (see skipped.log)' % (n_ok,
            len(skipped_words)))

    o_stream = sopen('skipped.log', 'w')
    vim_header = "# vim: set ts=20 noet sts=20:"
    print >> o_stream, vim_header
    for word, reason in skipped_words:
        print >> o_stream, '%s\t%s' % (word.to_line(), reason)
    o_stream.close()
    _log.finish()
Example #26
    def from_file(cls, filename):
        with sopen(filename, mode='r') as i_stream:
            lines = iter(enumerate(i_stream))

            depth, root_node = cls._from_line(next(lines)[1])
            if depth != 0:
                raise Exception("file %s should start with a root node" %
                                filename)

            path = [root_node]
            last_depth = depth
            last_node = root_node
            for line_no, line in lines:
                depth, node = cls._from_line(line)
                if depth == last_depth + 1:
                    # One level deeper, the last node was the parent.
                    path.append(last_node)

                elif depth == last_depth:
                    # Same level, same parent.
                    pass

                elif depth < last_depth:
                    # Up one or more levels.
                    depth_diff = last_depth - depth
                    path = path[:-depth_diff]

                else:
                    raise Exception("Strange depth found %s (line %d)" %
                                    (filename, line_no + 1))

                path[-1].append(node)
                last_node = node
                last_depth = depth

        return root_node
Example #27
    def _load_alternation_dist(self, filename):
        """
        Loads an alternation distribution and returns it. This
        distribution gives P(r|r*).
        """
        alternation_dist = ConditionalFreqDist()
        # NB: reads the module-level _reading_counts_map_file; the
        # filename argument is never used here.
        i_stream = sopen(_reading_counts_map_file, 'r')
        for line in i_stream:
            line = line.rstrip().split()
            kanji = line.pop(0)
            for data in line:
                data = data.split(":")
                if len(data) == 2:
                    reading, count = data
                    count = int(count)
                    alt_reading = reading
                else:
                    reading, alt_reading, count = data
                    count = int(count)

                alternation_dist[reading].inc(alt_reading)

        i_stream.close()
        return alternation_dist
Example #28
def to_alignment_format(syllabus_name, output_file):
    o_stream = sopen(output_file, 'w')
    for word in align_core.iter_words(syllabus_name):
        if word.reading and word.has_kanji():
            print >> o_stream, word.surface, word.reading
    o_stream.close()
Example #29
 def dump(self, filename):
     with sopen(filename, 'w') as o_stream:
         for depth, place in self.walk():
             print(place._to_line(depth), file=o_stream)
Example #30
File: place.py, Project: eromoe/cjktools
 def dump(self, filename):
     o_stream = sopen(filename, 'w')
     for depth, place in self.walk():
         print >> o_stream, place._to_line(depth)
     o_stream.close()