class PlaintextParser(AbstractParser):
    """Parser that turns a plain-text (``.txt``) file into ``Text`` objects.

    The file is read line by line. A line starting with ``TEXT_SEPARATOR``
    closes the text collected so far; every other line is split into
    sentences, each of which is tokenized and appended to the current text.
    """

    def __init__(self, filename):
        """Set up the parser for *filename*.

        When ``wants_this_file()`` is false nothing beyond the base-class
        initialisation happens (no progress counters, no NLP pipeline).
        """
        super(PlaintextParser, self).__init__(filename)
        if not self.wants_this_file():
            return
        # presumably prepares the counters read by _line_count_progress();
        # confirm against AbstractParser
        self._init_line_count_progress()
        self.nlp_pipeline = NlpPipeline()

    def _wanted_file_endings(self):
        """Return the file endings handled by this parser."""
        return (".txt", )

    def parse(self):
        """Yield one ``Text`` per separator-delimited section of the file.

        Texts without any sentence are never yielded. ``self._progress`` is
        incremented once per line read.

        Raises:
            UnicodeDecodeError: if a line is not valid UTF-8.
        """
        text = Text()
        with open(self.filename, "r") as file_:
            for line_unenc in file_:
                self._progress += 1
                # Lines read from the file are byte strings; decode them as
                # UTF-8. Calling .encode() on a byte string (as was done
                # before) triggers an implicit ASCII decode and fails on any
                # non-ASCII character.
                line = line_unenc.decode('utf8')

                if line.startswith(TEXT_SEPARATOR):
                    if len(text.sentences) > 0:
                        yield text
                        text = Text()
                    continue

                sentences = self.nlp_pipeline.sentence_segmentation(line)
                for sentence in sentences:
                    s = Sentence()
                    s.set_sentence_text(sentence)
                    s.set_tokens(self.nlp_pipeline.parse_text(sentence))
                    text.add_sentence(s)

        # Emit whatever was collected after the last separator.
        if len(text.sentences) > 0:
            yield text

    def progress(self):
        """Return the value of ``_line_count_progress()``."""
        return self._line_count_progress()
class PlaintextParser(AbstractParser):
    """Read a ``.txt`` file and produce ``Text`` objects from its lines.

    Lines that start with ``TEXT_SEPARATOR`` mark the end of the current
    text. All other lines are segmented into sentences; each sentence is
    tokenized and added to the text being built.
    """

    def __init__(self, filename):
        """Initialise the parser for *filename*.

        If ``wants_this_file()`` is false, only the base-class constructor
        runs; the progress setup and the NLP pipeline are skipped.
        """
        super(PlaintextParser, self).__init__(filename)
        if not self.wants_this_file():
            return
        # presumably sets up the state used by _line_count_progress();
        # verify in AbstractParser
        self._init_line_count_progress()
        self.nlp_pipeline = NlpPipeline()

    def _wanted_file_endings(self):
        """Return the tuple of file endings this parser accepts."""
        return (".txt",)

    def parse(self):
        """Generate the texts contained in the file.

        A text is yielded only if it holds at least one sentence.
        ``self._progress`` grows by one for every line read.

        Raises:
            UnicodeDecodeError: if the file contains bytes that are not
                valid UTF-8.
        """
        text = Text()
        with open(self.filename, "r") as file_:
            for line_unenc in file_:
                self._progress += 1
                # The raw line is a byte string, so it has to be *decoded*
                # to obtain unicode. The previous .encode('utf8') call
                # implied an ASCII decode first and broke on non-ASCII text.
                line = line_unenc.decode('utf8')

                if line.startswith(TEXT_SEPARATOR):
                    if len(text.sentences) > 0:
                        yield text
                        text = Text()
                    continue

                sentences = self.nlp_pipeline.sentence_segmentation(line)
                for sentence in sentences:
                    s = Sentence()
                    s.set_sentence_text(sentence)
                    s.set_tokens(self.nlp_pipeline.parse_text(sentence))
                    text.add_sentence(s)

        # The last text has no trailing separator; emit it if non-empty.
        if len(text.sentences) > 0:
            yield text

    def progress(self):
        """Return the result of ``_line_count_progress()``."""
        return self._line_count_progress()
class InputText(object):
    """Holds a raw text together with the tokens parsed from it."""

    def __init__(self, text):
        """Store *text* and parse it once with a new ``NlpPipeline``."""
        self.text = text
        pipeline = NlpPipeline()
        self.nlp_pipeline = pipeline
        # Tokens are computed eagerly so later lookups are plain reads.
        self.gold_tokens = pipeline.parse_text(text)

    def get_gold_tokens(self):
        """Return the tokens produced when the instance was created."""
        tokens = self.gold_tokens
        return tokens
class XMLParser(AbstractParser):
    """Parser for ``.xml`` files whose root holds a ``srcset`` element.

    Every ``doc`` below ``srcset`` becomes one ``Text``; every ``seg`` inside
    a ``doc`` becomes one tokenized ``Sentence`` of that text.
    """

    def __init__(self, filename):
        """Initialise the parser; extra setup only runs for wanted files."""
        super(XMLParser, self).__init__(filename)
        if self.wants_this_file():
            self.nlp_pipeline = NlpPipeline()
            # Number of <doc> elements, stored as the total for progress.
            self._linenumber = self._count_docs()
            self._progress = 0

    def _wanted_file_endings(self):
        """Return the file endings handled by this parser."""
        return (".xml", )

    def _find_docs(self):
        """Parse the file and return the list of ``doc`` elements in ``srcset``."""
        root = xml.etree.ElementTree.parse(self.filename).getroot()
        return root.find("srcset").findall('doc')

    def parse(self):
        """Yield one ``Text`` per ``doc`` element, in document order.

        ``self._progress`` is incremented once for each ``doc`` visited.
        """
        for doc in self._find_docs():
            self._progress += 1
            talk = Text()
            for seg in doc.findall("seg"):
                # NOTE(review): unicode(None) yields u"None" for a <seg>
                # without text; confirm whether empty segments can occur.
                seg_text = unicode(seg.text)
                parsed = Sentence()
                parsed.set_sentence_text(seg_text)
                parsed.set_tokens(self.nlp_pipeline.parse_text(seg_text))
                talk.add_sentence(parsed)
            yield talk

    def progress(self):
        """Return the value of ``_line_count_progress()``."""
        return self._line_count_progress()

    def _count_docs(self):
        """Return how many ``doc`` elements the ``srcset`` contains."""
        return len(self._find_docs())
def _initialize_with_text(self, text):
    """Parse *text* with a new ``NlpPipeline`` and store the result in ``self.tokens``."""
    self.tokens = NlpPipeline().parse_text(text)