def __init__(self, lt, config_file, table_file):
    """Set up the parser and the ocamorph-backed morphological analyzer.

    @param table_file an open output stream.
    """
    WikipediaParser.__init__(self, lt, config_file)
    self.table_file = table_file
    self.morph_set = set()
    cfg = self.lt.config
    # 'iso-8859-2' is the fallback encoding when the config omits one.
    self.ocamorph = Ocamorph(
        cfg['ocamorph_runnable'],
        cfg['ocamorph_model'],
        cfg.get('ocamorph_encoding', 'iso-8859-2'))
    self.morph_analyzer = OcamorphAnalyzer(self.ocamorph)
class WikitextToMorphTable(WikipediaParser):
    """Parses Wikipedia files and creates a morphtable from them.

    Word forms are accumulated in ``morph_set`` by ``process_tokens`` and
    then analyzed in batches and written to ``table_file`` by
    ``print_tokens``.
    """

    # How many words are sent to ocamorph per analyze() call.
    BATCH_SIZE = 25

    def __init__(self, lt, config_file, table_file):
        """
        @param table_file an open output stream.
        """
        WikipediaParser.__init__(self, lt, config_file)
        self.table_file = table_file
        self.morph_set = set()
        # 'iso-8859-2' is the fallback encoding when the config omits one.
        self.ocamorph = Ocamorph(
            self.lt.config['ocamorph_runnable'],
            self.lt.config['ocamorph_model'],
            self.lt.config.get('ocamorph_encoding', 'iso-8859-2'))
        self.morph_analyzer = OcamorphAnalyzer(self.ocamorph)

    def process_tokens(self, actual_title, tokens, templates):
        """Collect the normalized surface form of every token on the page."""
        # Lazy %-formatting: the message is only built if INFO is enabled.
        logging.info("TITLE %s", actual_title)
        for sentence in tokens:
            for token in sentence:
                # Well, that's it for loose coupling
                self.morph_set.add(self.morph_analyzer.replace_stuff(token[0]))

    def print_tokens(self):
        """Analyze the collected words in batches and write the morphtable.

        Each output row is the tab-joined token analysis; rows are separated
        by newlines and encoded with the analyzer's encoding.
        """
        sorted_words = sorted(self.morph_set)
        for i in xrange(0, len(sorted_words), self.BATCH_SIZE):
            batch = sorted_words[i:i + self.BATCH_SIZE]
            # analyze() is fed a single "sentence", so only its first (and
            # only) result is needed -- no need to materialize a full list.
            analyzed = next(iter(self.morph_analyzer.analyze([batch])))
            self.table_file.write(
                u"\n".join("\t".join(token)
                           for token in analyzed).encode(
                    self.ocamorph._encoding))
            self.table_file.write("\n")
            self.table_file.flush()

    def close(self):
        """Close the output stream; safe to call more than once."""
        if self.table_file is not None:
            self.table_file.close()
            self.table_file = None

    def __del__(self):
        # __del__ may run after a failed __init__ (table_file never set),
        # so probe with getattr instead of touching the attribute directly.
        if getattr(self, 'table_file', None) is not None:
            self.close()
def __init__(self, lt, config_file, table_file):
    """Initialize the morphtable builder.

    @param table_file an open output stream.
    """
    WikipediaParser.__init__(self, lt, config_file)
    self.table_file = table_file
    self.morph_set = set()
    runnable = self.lt.config['ocamorph_runnable']
    model = self.lt.config['ocamorph_model']
    # Fall back to Latin-2 when no encoding is configured.
    encoding = self.lt.config.get('ocamorph_encoding', 'iso-8859-2')
    self.ocamorph = Ocamorph(runnable, model, encoding)
    self.morph_analyzer = OcamorphAnalyzer(self.ocamorph)
class WikitextToMorphTable(WikipediaParser):
    """Parses Wikipedia files and creates a morphtable from them."""

    def __init__(self, lt, config_file, table_file):
        """
        @param table_file an open output stream.
        """
        WikipediaParser.__init__(self, lt, config_file)
        self.table_file = table_file
        self.morph_set = set()
        conf = self.lt.config
        self.ocamorph = Ocamorph(conf['ocamorph_runnable'],
                                 conf['ocamorph_model'],
                                 conf.get('ocamorph_encoding', 'iso-8859-2'))
        self.morph_analyzer = OcamorphAnalyzer(self.ocamorph)

    def process_tokens(self, actual_title, tokens, templates):
        """Record the normalized form of every token on the page."""
        logging.info("TITLE " + actual_title)
        # Hoist the bound methods out of the inner loop.
        add_word = self.morph_set.add
        normalize = self.morph_analyzer.replace_stuff
        for sentence in tokens:
            for token in sentence:
                add_word(normalize(token[0]))

    def print_tokens(self):
        """Analyze the collected words in batches of 25 and write the rows."""
        words = sorted(self.morph_set)
        start = 0
        while start < len(words):
            chunk = words[start:start + 25]
            start += 25
            analyzed = list(self.morph_analyzer.analyze([chunk]))[0]
            rows = u"\n".join("\t".join(token) for token in analyzed)
            self.table_file.write(rows.encode(self.ocamorph._encoding))
            self.table_file.write("\n")
            self.table_file.flush()

    def close(self):
        """Close the output stream; idempotent."""
        if self.table_file is not None:
            self.table_file.close()
            self.table_file = None

    def __del__(self):
        self.close()