class CorpusReader(BaseReader):
    """Read an analysed corpus that stores one sentence per line.

    Every non-empty line of the input is handed to a ``SentenceReader``
    built around the supplied token reader; the parsed sentences are
    collected into a single ``Paragraph`` inside a new ``Document``.
    """

    def __init__(self, token_reader: BaseReader, linesep: str = os.linesep):
        """Set up the sentence parser.

        :param token_reader: reader used by the sentence parser for the
            individual tokens.
        :param linesep: line separator, forwarded to ``BaseReader``
            (presumably exposed there as ``self.linesep`` -- confirm in
            ``BaseReader``).
        """
        super().__init__(linesep=linesep)
        self.token_reader = token_reader
        self.sentence_parser = SentenceReader(token_reader)

    def read(self, text: str):
        """Parse the whole(!) analysed corpus held in *text*.

        :param text: corpus content; lines are split on ``self.linesep``.
        :return: a ``Document`` holding one ``Paragraph`` with every parsed
            sentence, in input order.
        """
        # Empty lines carry no sentence and are skipped.
        parsed = [
            self.sentence_parser.read(row)
            for row in text.split(self.linesep)
            if row
        ]
        paragraph = Paragraph(parsed)
        result = Document()
        result.append(paragraph)
        return result
class HunPosCorpusReader(BaseReader):
    """Read a tagged corpus whose sentences are separated by a blank line.

    This is the same kind of reader as ``CorpusReader``; only the encoding
    and the separator differ.
    TODO: refactor so that a single, parameterizable ``CorpusReader``
    covers both cases.
    """

    def __init__(self):
        """Configure the ISO-8859-2 encoding and the token/sentence parsers."""
        super().__init__(encoding="ISO-8859-2")
        # Presumably the tab separates the fields of a tagged token --
        # confirm against TaggedTokenReader.
        token_parser = TaggedTokenReader("\t")
        self.word_parser = token_parser
        self.sentence_parser = SentenceReader(token_parser, self.linesep)

    def read(self, text: str):
        """Parse *text* into a ``Document`` containing a single ``Paragraph``.

        :param text: corpus content; sentence blocks are delimited by two
            consecutive line separators.
        :return: a ``Document`` holding one ``Paragraph`` with every parsed
            sentence, in input order.
        """
        sentence_break = self.linesep + self.linesep
        # Only blocks longer than one character are treated as sentences.
        parsed = [
            self.sentence_parser.read(block)
            for block in text.split(sentence_break)
            if len(block) > 1
        ]
        paragraph = Paragraph(parsed)
        result = Document()
        result.append(paragraph)
        return result
def __init__(self):
    """Configure the ISO-8859-2 encoding and the token/sentence parsers."""
    super().__init__(encoding="ISO-8859-2")
    # Presumably the tab separates the fields of a tagged token --
    # confirm against TaggedTokenReader.
    token_parser = TaggedTokenReader("\t")
    self.word_parser = token_parser
    # The sentence parser reuses the token parser and this reader's
    # line separator.
    self.sentence_parser = SentenceReader(token_parser, self.linesep)
def __init__(self, token_reader: BaseReader, linesep: str = os.linesep):
    """Set up the sentence parser around the given token reader.

    :param token_reader: reader used by the sentence parser for the
        individual tokens.
    :param linesep: line separator, forwarded to the base class
        initializer.
    """
    super().__init__(linesep=linesep)
    self.token_reader = token_reader
    # The sentence parser is built on the same token reader that is
    # stored on the instance.
    self.sentence_parser = SentenceReader(token_reader)