class PlaintextParser(AbstractParser):

    def __init__(self, filename):
        super(PlaintextParser, self).__init__(filename)
        if not self.wants_this_file():
            return

        self._init_line_count_progress()
        self.nlp_pipeline = NlpPipeline()

    def _wanted_file_endings(self):
        return (".txt", )

    def parse(self):
        text = Text()

        with open(self.filename, "r") as file_:
            for line_unenc in file_:
                self._progress += 1
                line = unicode(line_unenc.encode('utf8'))

                if line.startswith(TEXT_SEPARATOR):
                    if len(text.sentences) > 0:
                        yield text
                    text = Text()
                    continue

                sentences = self.nlp_pipeline.sentence_segmentation(line)
                for sentence in sentences:
                    s = Sentence()
                    s.set_sentence_text(sentence)
                    s.set_tokens(self.nlp_pipeline.parse_text(sentence))
                    text.add_sentence(s)

        if len(text.sentences) > 0:
            yield text

    def progress(self):
        return self._line_count_progress()
def _initialize_with_tokens(self, tokens):
    # convert tokens to WordTokens
    word_tokens = [WordToken(token) for token in tokens]

    # do pos_tagging if needed
    if sbd.config.getboolean('features', 'pos_tagging'):
        nlp_pipeline = NlpPipeline()
        nlp_pipeline.pos_tag(word_tokens)

    self.tokens = word_tokens
def _initialize_with_talks(self, talks):
    nlp_pipeline = NlpPipeline()
    word_tokens = []

    for talk in talks:
        for sentence in talk.sentences:
            sentence_tokens = []

            # get all word tokens
            for token in sentence.tokens:
                if not token.is_punctuation():
                    sentence_tokens.append(WordToken(token.word))

            # do pos_tagging if needed on sentence level
            if sbd.config.getboolean('features', 'pos_tagging'):
                nlp_pipeline.pos_tag(sentence_tokens)

            for t in sentence_tokens:
                t.word = t.word.lower()

            word_tokens += sentence_tokens

    self.tokens = word_tokens
class InputText(object):

    def __init__(self, text):
        self.text = text
        self.nlp_pipeline = NlpPipeline()
        self.gold_tokens = self.nlp_pipeline.parse_text(self.text)

    def get_gold_tokens(self):
        return self.gold_tokens
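A minimal usage sketch of InputText, assuming only what the class above shows (parse_text returns an iterable of tokens); the example string and variable names are illustrative, not from the original code:

# hypothetical example: wrap a raw string and read back its gold tokens
raw = "Hello world. This is a second sentence."
input_text = InputText(raw)
for token in input_text.get_gold_tokens():
    print(token)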
class XMLParser(AbstractParser):

    def __init__(self, filename):
        super(XMLParser, self).__init__(filename)
        if not self.wants_this_file():
            return

        self.nlp_pipeline = NlpPipeline()
        self._linenumber = self._count_docs()
        self._progress = 0

    def _wanted_file_endings(self):
        return (".xml", )

    def parse(self):
        mteval = xml.etree.ElementTree.parse(self.filename).getroot()
        srcset = mteval.find("srcset")

        for doc in srcset.findall('doc'):
            self._progress += 1
            talk = Text()

            for sentence in doc.findall("seg"):
                sentence_text = unicode(sentence.text)
                sentence = Sentence()
                sentence.set_sentence_text(sentence_text)
                sentence.set_tokens(
                    self.nlp_pipeline.parse_text(sentence_text))
                talk.add_sentence(sentence)

            yield talk

    def progress(self):
        return self._line_count_progress()

    def _count_docs(self):
        mteval = xml.etree.ElementTree.parse(self.filename).getroot()
        srcset = mteval.find("srcset")
        i = 0
        for doc in srcset.findall('doc'):
            i += 1
        return i
def _initialize_with_text(self, text):
    nlp_pipeline = NlpPipeline()
    self.tokens = nlp_pipeline.parse_text(text)
class LineParser(AbstractParser):

    def __init__(self, filename):
        super(LineParser, self).__init__(filename)
        if not self.wants_this_file():
            return

        self._init_line_count_progress()

        # if sbd.config.getboolean('features', 'use_question_mark'):
        #     raise ValueError("Question marks not supported by LineParser")

        self.POS_TAGGING = sbd.config.getboolean('features', 'pos_tagging')
        self.nlp_pipeline = NlpPipeline()

    def _wanted_file_endings(self):
        return (".line", )

    def parse(self):
        with open(self.filename, "r") as file_:
            text = Text()
            sentence = Sentence()
            sentence.tokens = []

            for line_unenc in file_:
                # end of a text reached
                if line_unenc.rstrip() == END_OF_TEXT_MARKER:
                    yield text
                    text = Text()
                    continue

                self._progress += 1

                # parse line
                line = unicode(line_unenc, errors='ignore')
                line = line.rstrip()

                # split line into word, pos_tags and type
                line_parts = line.split('\t')
                word = self._get_word(line_parts)
                if word is None:
                    continue

                pos_tags = self._get_pos_tags(line_parts)
                punctuation = self._get_punctuation(line_parts)

                sentence.tokens.extend(
                    self._create_tokens(word, pos_tags, punctuation))

                # we are at the end of a sentence
                if punctuation == 'PERIOD':
                    if self.POS_TAGGING and not pos_tags:
                        self.nlp_pipeline.pos_tag(sentence.tokens)
                    text.add_sentence(sentence)
                    sentence = Sentence()
                    sentence.tokens = []

            # if we do not have any end-of-text-marker
            # return everything as one text
            if len(text.sentences) > 0:
                yield text

    def _get_word(self, line_parts):
        word = unicode(line_parts[0])
        word = self.nlp_pipeline.process_word(word)

        # check if needed
        # if "?" in word and len(word) > 0:
        #     word = word.replace("?", "")

        return word

    def _get_punctuation(self, line_parts):
        if len(line_parts) == 2:
            return unicode(line_parts[1])
        else:
            return unicode(line_parts[2])

    def _get_pos_tags(self, line_parts):
        if len(line_parts) == 2:
            return set()
        else:
            pos_tag_str = line_parts[1].split(",")
            pos_tag_types = map(lambda x: x.split(".")[1], pos_tag_str)
            return set(map(lambda x: PosTag[x], pos_tag_types))

    def progress(self):
        return self._line_count_progress()

    def _create_tokens(self, word, pos_tags, punctuation):
        word_token = WordToken(word)
        word_token.set_pos_tags(pos_tags)

        punctuation_token = None
        if punctuation == 'PERIOD':
            punctuation_token = PunctuationToken(punctuation, Punctuation.PERIOD)
        elif punctuation == 'COMMA':
            punctuation_token = PunctuationToken(punctuation, Punctuation.COMMA)

        if punctuation_token is not None:
            return [word_token, punctuation_token]
        return [word_token]
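A hedged sketch of how one of these parsers might be driven, based only on the methods visible above (wants_this_file, parse as a generator of Text objects, progress); the file path is made up for illustration:

# hypothetical driver: feed a .line file through LineParser and report progress
parser = LineParser("corpus/train.line")  # illustrative path, not from the original code
if parser.wants_this_file():
    for text in parser.parse():
        # each yielded Text holds the sentences collected up to an end-of-text marker
        print("%d sentences (progress: %s)" % (len(text.sentences), parser.progress()))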