def from_aligned_file(syllabus_name, aligned_file, output_file):
    _log.start('Extracting gp-aligned words', nSteps=4)
    _log.log('Building set of expected words')
    include_set = set((w.surface, w.reading) for w in
            align_core.iter_words(syllabus_name))

    _log.log('Loading alignments')
    alignments = AlignedFile(aligned_file)

    _log.log('Saving alignments')
    o_stream = sopen(output_file, 'w')
    for alignment in alignments:
        key = (alignment.grapheme, alignment.phoneme)
        if key in include_set:
            print >> o_stream, alignment.to_line()
            include_set.remove(key)
    o_stream.close()

    if include_set:
        _log.finish('%d entries not found (see missing.log)' %
                len(include_set))
        o_stream = sopen('missing.log', 'w')
        for surface, reading in sorted(include_set):
            print >> o_stream, '%s %s:%s %s' % (surface, reading, surface,
                    reading)
        o_stream.close()
    else:
        _log.finish('All entries found')
def escapeUtf8(inputFile, outputFile):
    iStream = sopen(inputFile, 'r', 'utf8')
    oStream = sopen(outputFile, 'w', 'unicode-escape')
    for line in iStream:
        oStream.write(line)
    oStream.close()
    iStream.close()
    return
def test_from_packed_file(self):
    test_filename = tempfile.mktemp()
    o_stream = sopen(test_filename, 'w')
    print >> o_stream, 'dog bark:9,pee:1'
    print >> o_stream, 'cat meow:10'
    o_stream.close()
    self._check_file(test_filename, 'packed')
def _build_alternation_map(self):
    """
    Calculates and returns an alternation map, from alternation to
    canonical reading. In other words, it maps (k, r) to r*.
    """
    # Generate an alternation distribution.
    from_canonical_reading = {}
    i_stream = sopen(_reading_counts_map_file, 'r')
    for line in i_stream:
        line = line.rstrip().split()
        kanji = line.pop(0)
        assert line
        for lineSeg in line:
            lineSeg = lineSeg.split(':')
            if len(lineSeg) == 2:
                reading, count = lineSeg
                alt_reading = reading
            elif len(lineSeg) == 3:
                reading, alt_reading, count = lineSeg
            else:
                raise Exception, "File %s is badly formatted" % \
                        _reading_counts_map_file

            key = (kanji, reading)
            if key in from_canonical_reading:
                from_canonical_reading[key].add(alt_reading)
            else:
                from_canonical_reading[key] = set([alt_reading])
    i_stream.close()

    return from_canonical_reading
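# Hedged sketch of the _reading_counts_map_file line format, inferred from the
# parser above: a kanji followed by space-separated segments, each either
# reading:count or reading:alt_reading:count. The values below are
# illustrative, not taken from the real data file.
#
#     海 うみ:12 かい:3
#     学 がく:がっ:7 がく:20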
def dump_errors(output_file):
    ostream = sopen(output_file, 'w')
    fields = ['item', 'item_type', 'question_type', 'user_answer',
            'correct_answer']
    for i in xrange(settings.N_DISTRACTORS - 1):
        fields.append('other_distractor_%d' % (i + 1))
    print >> ostream, '#' + ','.join(fields)
    writer = csv.writer(ostream)
    for response in models.MultipleChoiceResponse.objects.filter(
            option__is_correct=False):
        row = []
        row.append(response.question.pivot)
        row.append(response.question.pivot_type)
        row.append(response.question.question_type)
        user_answer = response.option.value
        all_options = response.option.question.options.all()
        correct_answer = all_options.get(is_correct=True).value
        distractors = [o.value for o in all_options
                if not o.is_correct and o.value != user_answer]
        row.append(user_answer)
        row.append(correct_answer)
        row.extend(distractors)
        writer.writerow(row)
    ostream.close()
def load_file(self, fpath_or_buf, **csv_kwargs):
    """
    A generator reading a given file line by line.

    :param fpath_or_buf: This can either be a file path or an open file
        buffer.
    :param csv_kwargs: By default, the kwargs passed to
        :py:func:`csv.reader` are those for a standard Tatoeba file. You
        can pass additional keyword arguments here.
    """
    reader_kwargs = dict(delimiter='\t')
    reader_kwargs.update(csv_kwargs)

    if PY2:
        encoding = None
        encode_row = lambda row: [col.decode('utf-8') for col in row]
    else:
        encoding = 'utf-8'
        encode_row = lambda row: row

    if getattr(fpath_or_buf, 'read', None) is None:
        cfile = sopen(fpath_or_buf, mode='r', encoding=encoding)
    else:
        cfile = _NullContextWrapper(fpath_or_buf)

    with cfile as f:
        reader = csv.reader(f, **reader_kwargs)
        for row in reader:
            yield encode_row(row)
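# Hedged usage sketch: iterating a Tatoeba-style sentences file through
# load_file. `TatoebaReader` and 'sentences.csv' are illustrative names only;
# the containing class is not shown in this snippet.
#
#     reader = TatoebaReader()
#     for sentence_id, lang, text in reader.load_file('sentences.csv'):
#         print(sentence_id, lang, text)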
def load_lexicon(filename=_jmdict_path):
    "Reloads the lexicon into the database."
    log.start('Rebuilding the lexicon', nSteps=5)
    if not Checksum.needs_update(_checksum_tag, _dependencies + [filename]):
        log.finish('Already up-to-date')
        return

    log.log('Loading probability distributions')
    models.initialise()

    log.start('Loading JMdict', nSteps=2)
    _clear_lexicon()
    log.log('Reading from %s' % path.basename(filename))
    iStream = sopen(filename, 'r', 'byte')
    data = iStream.read()
    iStream.close()

    log.log('Parsing XML tree')
    tree = ElementTree.fromstring(data)
    del data
    log.finish()

    _store_lexemes(tree.getchildren())

    log.log('Storing checksum')
    Checksum.store(_checksum_tag, _dependencies + [filename])

    log.finish()
def from_file(filename):
    dist = FreqDist()
    i_stream = sopen(filename)
    for line in i_stream:
        symbol, count = line.rstrip().split()
        dist.inc(symbol, int(count))
    i_stream.close()
    return dist
def __init__(self, filename):
    i_stream = sopen(filename)
    unique_kanji = scripts.unique_kanji
    for line in i_stream:
        if line.lstrip().startswith('#'):
            continue
        self.update(unique_kanji(line))
    i_stream.close()
def test_from_row_file(self):
    test_filename = tempfile.mktemp()
    o_stream = sopen(test_filename, 'w')
    print >> o_stream, 'dog bark 9'
    print >> o_stream, 'dog pee 1'
    print >> o_stream, 'cat meow 10'
    o_stream.close()
    self._check_file(test_filename, 'row')
def test_add(self):
    import add_syllabus
    from kanji_test.lexicon import load_lexicon
    import consoleLog
    consoleLog.default.oStream = sopen('/dev/null', 'w')
    load_lexicon.load_lexicon()
    add_syllabus.add_all_syllabi()
    models.Syllabus.validate()
def _dumpArticle(text):
    global _lastArticle
    filename = 'article%04d.txt' % _lastArticle
    print filename
    oStream = sopen(filename, 'w', 'utf8')
    oStream.write(text)
    oStream.close()
    _lastArticle += 1
    return
def get_edict():
    if getattr(get_edict, '_cached', None) is not None:
        return get_edict._cached

    with sopen(get_data_loc('edict', extension=''), mode='r') as edf:
        edict = auto_format.load_dictionary(edf)

    get_edict._cached = edict

    return edict
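# Hedged usage sketch for the cached loader above: the dictionary is parsed on
# the first call and reused afterwards. The dict-style lookup shown is an
# assumption about auto_format.load_dictionary's return value, not confirmed
# by this snippet.
#
#     edict = get_edict()     # parses EDICT on first call only
#     entry = edict[u'辞書']  # later calls reuse the cached object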
def to_file(self, filename):
    """Stores the distribution to a file."""
    o_stream = sopen(filename, 'w')
    for condition in self.conditions():
        cond_dist = self[condition]
        for sample in cond_dist.samples():
            count = cond_dist[sample]
            print >> o_stream, u'%s %s %d' % (condition, sample, count)
    o_stream.close()
    return
def to_file(self, filename):
    o_stream = sopen(filename, 'w')
    for sample in self.samples():
        count = self[sample]
        sample = unicode(sample)
        if len(sample.split()) > 1:
            raise ValueError('sample contains whitespace')
        print >> o_stream, u'%s %d' % (sample, count)
    o_stream.close()
    return
def __init__(self, kanjidic_files=None):
    dict.__init__(self)
    if kanjidic_files is None:
        kanjidic_files = [
            cjkdata.get_resource('kanjidic'),
            cjkdata.get_resource('kanjd212'),
        ]
    line_stream = reduce(chain, [sopen(f) for f in kanjidic_files])
    self._parse_kanjidic(line_stream)
def unescapeUtf8(inputFile, outputFile):
    iStream = open(inputFile, 'r')
    oStream = sopen(outputFile, 'w', 'utf8')
    for line in iStream:
        line = unicode(line.replace('\\N', '\\\\N'), 'unicode-escape')
        oStream.write(line)
    oStream.close()
    iStream.close()
    return
def parseSgml(inputFile, outputFile):
    """Extracts the text between <s>...</s> tags, one sentence per line."""
    iStream = sopen(inputFile, 'r')
    oStream = sopen(outputFile, 'w')
    pat = re.compile(r'<s>(.+?)</s>', re.MULTILINE | re.DOTALL | re.UNICODE)
    blockSize = 1024 * 1024
    # NOTE: sentences which straddle a block boundary are missed by this
    # block-at-a-time scan.
    block = iStream.read(blockSize)
    while block:
        for match in pat.finditer(block):
            print >> oStream, match.group(1).replace('\n', ' ').strip(u' ')
        block = iStream.read(blockSize)
    oStream.close()
    iStream.close()
    return
def __init__(self, filename):
    self._words = []
    i_stream = sopen(filename)
    for i, line in enumerate(i_stream):
        if line.lstrip().startswith('#'):
            continue
        try:
            self._words.append(WordEntry.from_line(line))
        except:
            raise FormatError('on line %d of %s' % (i + 1, filename))
    i_stream.close()
def __init__(self):
    ConditionalFreqDist.__init__(self)
    kanji_script = scripts.Script.Kanji
    i_stream = sopen(_edict_aligned_file, 'r')
    for line in i_stream:
        alignment = Alignment.from_line(line)
        for (g, p) in alignment:
            if scripts.contains_script(kanji_script, g):
                self[g].inc(scripts.to_hiragana(p))
    i_stream.close()
    return
def from_file_row_format(filename):
    """
    Loads a distribution from a row_format file.
    """
    dist = ConditionalFreqDist()
    i_stream = sopen(filename)
    for line in i_stream:
        condition, symbol, count = line.rstrip().split()
        count = int(count)
        dist[condition].inc(symbol, count)
    i_stream.close()
    return dist
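# Hedged sketch of the row-format input from_file_row_format expects: one
# "condition symbol count" triple per line (values illustrative, matching the
# test fixtures above):
#
#     dog bark 9
#     dog pee 1
#     cat meow 10
#
#     dist = from_file_row_format('counts.row')
#     dist['dog']['bark']  # -> 9, assuming an NLTK-style FreqDist interface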
def __init__(self, kanjidic_files=None):
    super(Kanjidic, self).__init__()
    if kanjidic_files is None:
        kanjidic_files = [
            cjkdata.get_resource('kanjidic'),
            cjkdata.get_resource('kanjd212'),
        ]
    with ExitStack() as stack:
        file_chain = (stack.enter_context(sopen(f, mode='r'))
                      for f in kanjidic_files)
        line_stream = reduce(chain, file_chain)
        self._parse_kanjidic(line_stream)
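# Hedged usage sketch: Kanjidic subclasses dict and is keyed by kanji. The
# shape of each entry is an assumption about _parse_kanjidic's output, not
# confirmed by this snippet.
#
#     kd = Kanjidic()
#     entry = kd[u'水']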
def from_file_packed_format(filename):
    """
    Loads a distribution from a packed format file. Rows in this file
    look like:

        conditionA symA:1,symB:10
    """
    dist = ConditionalFreqDist()
    i_stream = sopen(filename)
    for line in i_stream:
        condition, symbol_counts = line.split()
        for symbol_count in symbol_counts.split(','):
            symbol, count_str = symbol_count.split(':')
            count = int(count_str)
            dist[condition].inc(symbol, count)
    i_stream.close()
    return dist
def from_file(cls, filename):
    i_stream = sopen(filename)
    lines = iter(enumerate(i_stream))
    depth, root_node = cls._from_line(lines.next()[1])
    if depth != 0:
        raise Exception("file %s should start with a root node" % filename)

    path = [root_node]
    last_depth = depth
    last_node = root_node
    for line_no, line in lines:
        depth, node = cls._from_line(line)
        if depth == last_depth + 1:
            # One level deeper, the last node was the parent.
            path.append(last_node)
        elif depth == last_depth:
            # Same level, same parent.
            pass
        elif depth < last_depth:
            # Up one or more levels.
            depth_diff = last_depth - depth
            path = path[:-depth_diff]
        else:
            raise Exception, "Strange depth found %s (line %d)" % (
                    filename, line_no + 1)
        path[-1].append(node)
        last_node = node
        last_depth = depth
    i_stream.close()

    return root_node
def _store_words(syllabus, syllabus_bundle):
    """
    Try to find a matching lexicon word for each word in the syllabus,
    then store the limited knowledge we have about it in a partial lexeme
    object.
    """
    _log.start('Parsing word list', nSteps=1)
    n_ok = 0
    skipped_words = []
    for word in syllabus_bundle.words:
        partial_lexeme = _find_in_lexicon(word, skipped_words, syllabus)
        if partial_lexeme:
            n_ok += 1

    _log.log('%d ok, %d skipped (see skipped.log)' % (n_ok,
            len(skipped_words)))
    o_stream = sopen('skipped.log', 'w')
    vim_header = "# vim: set ts=20 noet sts=20:"
    print >> o_stream, vim_header
    for word, reason in skipped_words:
        print >> o_stream, '%s\t%s' % (word.to_line(), reason)
    o_stream.close()
    _log.finish()
def from_file(cls, filename):
    with sopen(filename, mode='r') as i_stream:
        lines = iter(enumerate(i_stream))
        depth, root_node = cls._from_line(next(lines)[1])
        if depth != 0:
            raise Exception("file %s should start with a root node" %
                            filename)

        path = [root_node]
        last_depth = depth
        last_node = root_node
        for line_no, line in lines:
            depth, node = cls._from_line(line)
            if depth == last_depth + 1:
                # One level deeper, the last node was the parent.
                path.append(last_node)
            elif depth == last_depth:
                # Same level, same parent.
                pass
            elif depth < last_depth:
                # Up one or more levels.
                depth_diff = last_depth - depth
                path = path[:-depth_diff]
            else:
                raise Exception("Strange depth found %s (line %d)" %
                                (filename, line_no + 1))
            path[-1].append(node)
            last_node = node
            last_depth = depth

    return root_node
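# Hedged sketch of the indented tree file that from_file parses: one node per
# line, the root at depth 0 and children one level deeper, with depth and
# label decoded by cls._from_line. The place names below are illustrative
# only; dump() below writes the inverse of this format.
#
#     Japan
#         Tokyo
#             Shinjuku
#         Osaka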
def _load_alternation_dist(self, filename):
    """
    Loads an alternation distribution and returns it. This distribution
    gives P(r|r*).
    """
    alternation_dist = ConditionalFreqDist()
    i_stream = sopen(filename, 'r')
    for line in i_stream:
        line = line.rstrip().split()
        kanji = line.pop(0)
        for data in line:
            data = data.split(':')
            if len(data) == 2:
                reading, count = data
                alt_reading = reading
            else:
                reading, alt_reading, count = data
            count = int(count)
            alternation_dist[reading].inc(alt_reading)
    i_stream.close()
    return alternation_dist
def to_alignment_format(syllabus_name, output_file):
    o_stream = sopen(output_file, 'w')
    for word in align_core.iter_words(syllabus_name):
        if word.reading and word.has_kanji():
            print >> o_stream, word.surface, word.reading
    o_stream.close()
def dump(self, filename):
    with sopen(filename, 'w') as o_stream:
        for depth, place in self.walk():
            print(place._to_line(depth), file=o_stream)
def dump(self, filename):
    o_stream = sopen(filename, 'w')
    for depth, place in self.walk():
        print >> o_stream, place._to_line(depth)
    o_stream.close()