Beispiel #1
0
 def __init__(self, lt, config_file, table_file):
     """
     Sets up the parser state and the morphological analyzer.

     @param lt forwarded unchanged to WikipediaParser.__init__.
     @param config_file forwarded unchanged to WikipediaParser.__init__.
     @param table_file an open output stream.
     """
     WikipediaParser.__init__(self, lt, config_file)
     self.table_file = table_file
     self.morph_set = set()

     # NOTE(review): self.lt is not assigned in this method; presumably the
     # base-class initializer stores it -- confirm against WikipediaParser.
     settings = self.lt.config
     runnable = settings['ocamorph_runnable']
     model = settings['ocamorph_model']
     encoding = settings.get('ocamorph_encoding', 'iso-8859-2')

     self.ocamorph = Ocamorph(runnable, model, encoding)
     self.morph_analyzer = OcamorphAnalyzer(self.ocamorph)
Beispiel #2
0
class WikitextToMorphTable(WikipediaParser):
    """
    Parses Wikipedia files and creates a morphtable from them.

    process_tokens() collects words into morph_set (a set, so duplicates
    are dropped); print_tokens() runs the collected words through the
    morphological analyzer and writes the result to table_file.
    """
    def __init__(self, lt, config_file, table_file):
        """
        @param lt forwarded unchanged to WikipediaParser.__init__.
        @param config_file forwarded unchanged to WikipediaParser.__init__.
        @param table_file an open output stream.
        """
        WikipediaParser.__init__(self, lt, config_file)
        self.table_file = table_file
        self.morph_set = set()
        # NOTE(review): self.lt is not assigned here; presumably the base
        # class initializer stores it -- confirm against WikipediaParser.
        config = self.lt.config
        self.ocamorph = Ocamorph(
            config['ocamorph_runnable'],
            config['ocamorph_model'],
            config.get('ocamorph_encoding', 'iso-8859-2'))
        self.morph_analyzer = OcamorphAnalyzer(self.ocamorph)

    def process_tokens(self, actual_title, tokens, templates):
        """
        Adds the first element of every token in tokens to morph_set.

        @param actual_title the title, used only for the log message.
        @param tokens an iterable of sentences, each an iterable of
               indexable tokens.
        @param templates not used by this method.
        """
        logging.info("TITLE %s", actual_title)
        # Well, that's it for loose coupling: the analyzer's own
        # replace_stuff() is applied to each word before it is stored.
        replace_stuff = self.morph_analyzer.replace_stuff
        for sentence in tokens:
            for token in sentence:
                self.morph_set.add(replace_stuff(token[0]))

    def print_tokens(self):
        """
        Analyzes the collected words and writes them to table_file.

        The words are sorted and handed to the analyzer in batches; every
        analyzed token becomes one line with its fields joined by tabs.
        The stream is flushed once at the end.
        """
        batch_size = 25
        encoding = self.ocamorph._encoding
        sorted_words = sorted(self.morph_set)
        for start in range(0, len(sorted_words), batch_size):
            batch = sorted_words[start:start + batch_size]
            # Each batch is passed as a single "sentence"; the analyzer
            # output is consumed completely and its first item is kept.
            analyzed = list(self.morph_analyzer.analyze([batch]))[0]
            lines = u"\n".join("\t".join(token) for token in analyzed)
            # The terminating newline is encoded together with the payload
            # so that everything written uses the same encoding.
            self.table_file.write((lines + u"\n").encode(encoding))
        self.table_file.flush()

    def close(self):
        """
        Closes the output stream if it is still open.

        Calling it again is a no-op. It is also safe on an instance whose
        __init__ failed before table_file was assigned.
        """
        table_file = getattr(self, 'table_file', None)
        if table_file is not None:
            table_file.close()
            self.table_file = None

    def __del__(self):
        # The finalizer also runs for objects whose __init__ raised part-way
        # through; close() copes with the attribute being absent.
        self.close()
Beispiel #3
0
 def __init__(self, lt, config_file, table_file):
     """
     Initializes the base parser, the output stream and the analyzer.

     @param lt handed on as-is to WikipediaParser.__init__.
     @param config_file handed on as-is to WikipediaParser.__init__.
     @param table_file an open output stream.
     """
     WikipediaParser.__init__(self, lt, config_file)
     self.table_file = table_file
     self.morph_set = set()

     # NOTE(review): self.lt is never assigned in this method; it is
     # presumably set by the base-class initializer -- confirm.
     cfg = self.lt.config
     ocamorph_binary = cfg['ocamorph_runnable']
     ocamorph_model = cfg['ocamorph_model']
     ocamorph_encoding = cfg.get('ocamorph_encoding', 'iso-8859-2')

     self.ocamorph = Ocamorph(
         ocamorph_binary, ocamorph_model, ocamorph_encoding)
     self.morph_analyzer = OcamorphAnalyzer(self.ocamorph)
Beispiel #4
0
class WikitextToMorphTable(WikipediaParser):
    """
    Parses Wikipedia files and creates a morphtable from them.

    process_tokens() collects words into morph_set (a set, so duplicates
    are dropped); print_tokens() runs the collected words through the
    morphological analyzer and writes the result to table_file.
    """
    def __init__(self, lt, config_file, table_file):
        """
        @param lt forwarded unchanged to WikipediaParser.__init__.
        @param config_file forwarded unchanged to WikipediaParser.__init__.
        @param table_file an open output stream.
        """
        WikipediaParser.__init__(self, lt, config_file)
        self.table_file = table_file
        self.morph_set = set()
        # NOTE(review): self.lt is not assigned here; presumably the base
        # class initializer stores it -- confirm against WikipediaParser.
        config = self.lt.config
        self.ocamorph = Ocamorph(config['ocamorph_runnable'],
                                 config['ocamorph_model'],
                                 config.get('ocamorph_encoding', 'iso-8859-2'))
        self.morph_analyzer = OcamorphAnalyzer(self.ocamorph)

    def process_tokens(self, actual_title, tokens, templates):
        """
        Adds the first element of every token in tokens to morph_set.

        @param actual_title the title, used only for the log message.
        @param tokens an iterable of sentences, each an iterable of
               indexable tokens.
        @param templates not used by this method.
        """
        logging.info("TITLE %s", actual_title)
        # Well, that's it for loose coupling: the analyzer's own
        # replace_stuff() is applied to each word before it is stored.
        replace_stuff = self.morph_analyzer.replace_stuff
        for sentence in tokens:
            for token in sentence:
                self.morph_set.add(replace_stuff(token[0]))

    def print_tokens(self):
        """
        Analyzes the collected words and writes them to table_file.

        The words are sorted and handed to the analyzer in batches; every
        analyzed token becomes one line with its fields joined by tabs.
        The stream is flushed once at the end.
        """
        batch_size = 25
        encoding = self.ocamorph._encoding
        sorted_words = sorted(self.morph_set)
        for start in range(0, len(sorted_words), batch_size):
            batch = sorted_words[start:start + batch_size]
            # Each batch is passed as a single "sentence"; the analyzer
            # output is consumed completely and its first item is kept.
            analyzed = list(self.morph_analyzer.analyze([batch]))[0]
            lines = u"\n".join("\t".join(token) for token in analyzed)
            # The terminating newline is encoded together with the payload
            # so that everything written uses the same encoding.
            self.table_file.write((lines + u"\n").encode(encoding))
        self.table_file.flush()

    def close(self):
        """
        Closes the output stream if it is still open.

        Calling it again is a no-op. It is also safe on an instance whose
        __init__ failed before table_file was assigned.
        """
        table_file = getattr(self, 'table_file', None)
        if table_file is not None:
            table_file.close()
            self.table_file = None

    def __del__(self):
        # The finalizer also runs for objects whose __init__ raised part-way
        # through; close() copes with the attribute being absent.
        self.close()