def split_into_segments(self):
    """Split ``self.text`` into a list of post-processed segments.

    The text is split on carriage returns, each piece is run through the
    language's newline/ellipsis rules and ``check_for_punctuation``, and
    every resulting piece is post-processed before the single-quote
    substitution rule is applied.

    Returns:
        list: the segments after ``SubSingleQuoteRule`` has been applied.
    """
    self.check_for_parens_between_quotes()

    # Split on carriage returns and drop empty / None entries.
    pieces = self.rm_none_flatten(self.text.split('\r'))
    pieces = [
        Text(piece).apply(self.lang.SingleNewLineRule,
                          *self.lang.EllipsisRules.All)
        for piece in pieces
    ]

    # The punctuation check can produce lists of sentences; flatten them.
    pieces = self.rm_none_flatten(
        [self.check_for_punctuation(piece) for piece in pieces])

    segments = []
    for piece in pieces:
        restored = Text(piece).apply(*self.lang.SubSymbolsRules.All)
        processed = self.post_process_segments(restored)
        if isinstance(processed, str):
            # Empty strings are dropped.
            if processed:
                segments.append(processed)
        elif isinstance(processed, list):
            segments.extend(processed)

    return [
        Text(segment).apply(self.lang.SubSingleQuoteRule)
        for segment in segments
    ]
def replace(self):
    """Run the abbreviation replacement pipeline on ``self.text``.

    Applies the possessive / Kommanditgesellschaft / single-letter rules,
    the abbreviation search, the multi-period abbreviation replacement, the
    AM/PM rules and finally the abbreviation-as-sentence-boundary pass.

    Returns:
        The updated ``self.text``.
    """
    leading_rules = (
        Common.PossessiveAbbreviationRule,
        Common.KommanditgesellschaftRule,
        *SingleLetterAbbreviationRules.All,
    )
    self.text = Text(self.text).apply(*leading_rules)
    self.text = self.search_for_abbreviations_in_string()
    self.replace_multi_period_abbreviations()
    self.text = Text(self.text).apply(*AmPmRules.All)
    self.text = replace_abbreviation_as_sentence_boundary(self.text)
    return self.text
def sentence_boundary_punctuation(self, txt):
    """Return every ``SENTENCE_BOUNDARY_REGEX`` match found in ``txt``.

    Two language rules are applied first, but only when the language
    defines them. A trailing ``&ᓴ&`` placeholder is turned back into ``!``
    before matching.

    Args:
        txt: text to split.

    Returns:
        list: matched substrings, in order of appearance.
    """
    optional_rule_names = (
        'ReplaceColonBetweenNumbersRule',
        'ReplaceNonSentenceBoundaryCommaRule',
    )
    for rule_name in optional_rule_names:
        if hasattr(self.lang, rule_name):
            txt = Text(txt).apply(getattr(self.lang, rule_name))

    # Retain the exclamation mark when it is the last character of the text.
    txt = re.sub(r'&ᓴ&$', '!', txt)

    boundary_matches = re.finditer(self.lang.SENTENCE_BOUNDARY_REGEX, txt)
    return [found.group() for found in boundary_matches]
def replace(self):
    """Run the abbreviation replacement pipeline on ``self.text``.

    The abbreviation search is done line by line (line endings kept) and
    the per-line results are concatenated back together.

    Returns:
        The updated ``self.text``.
    """
    self.text = Text(self.text).apply(
        self.lang.PossessiveAbbreviationRule,
        self.lang.KommanditgesellschaftRule,
        *self.lang.SingleLetterAbbreviationRules.All)

    # keepends=True so the rebuilt text keeps its original line breaks.
    self.text = "".join(
        self.search_for_abbreviations_in_string(line)
        for line in self.text.splitlines(True))

    self.replace_multi_period_abbreviations()
    self.text = Text(self.text).apply(*self.lang.AmPmRules.All)
    self.text = self.replace_abbreviation_as_sentence_boundary()
    return self.text
def replace_punctuation(match, match_type=None):
    """Replace punctuation inside a regex match with placeholder tokens.

    Regex-reserved characters in the matched text are escaped first, then
    each punctuation mark is swapped for its placeholder, and finally the
    escaping is undone.

    The ASCII and full-width variants of ``.``, ``!`` and ``?`` each have a
    distinct placeholder. The full-width characters are written as unicode
    escapes so they cannot be silently folded into their ASCII look-alikes;
    when that happens ASCII ``!`` ends up with the full-width placeholder
    ``&ᓳ&`` instead of ``&ᓴ&``.

    Args:
        match: regex match object; ``match.group()`` is the text processed.
        match_type: when equal to ``'single'`` apostrophes are left as-is,
            otherwise they are replaced with ``&⎋&``.

    Returns:
        The text with punctuation replaced by placeholders.
    """
    text = Text(match.group()).apply(*EscapeRegexReservedCharacters.All)

    substitutions = (
        ('.', '∯'),
        ('。', '&ᓰ&'),
        ('\uff0e', '&ᓱ&'),  # FULLWIDTH FULL STOP
        ('\uff01', '&ᓳ&'),  # FULLWIDTH EXCLAMATION MARK
        ('!', '&ᓴ&'),
        ('?', '&ᓷ&'),
        ('\uff1f', '&ᓸ&'),  # FULLWIDTH QUESTION MARK
    )
    for mark, placeholder in substitutions:
        text = text.replace(mark, placeholder)

    if match_type != 'single':
        text = text.replace("'", '&⎋&')

    return Text(text).apply(*SubEscapedRegexReservedCharacters.All)
def process_text(self, txt):
    """Prepare ``txt`` and split it at sentence boundaries.

    Args:
        txt: non-empty text (its last character is inspected).

    Returns:
        The result of ``sentence_boundary_punctuation`` on the prepared
        text.
    """
    # Append the 'ȸ' marker when the text does not end with punctuation.
    ends_with_punctuation = txt[-1] in self.lang.Punctuations
    if not ends_with_punctuation:
        txt += 'ȸ'

    txt = ExclamationWords.apply_rules(txt)
    txt = self.between_punctuation(txt)

    # Handle text having only double punctuations: skip the rules then.
    double_rules = self.lang.DoublePunctuationRules
    if re.match(double_rules.DoublePunctuation, txt) is None:
        txt = Text(txt).apply(*double_rules.All)

    txt = Text(txt).apply(self.lang.QuestionMarkInQuotationRule,
                          *self.lang.ExclamationPointRules.All)
    txt = ListItemReplacer(txt).replace_parens()
    return self.sentence_boundary_punctuation(txt)
def clean(self):
    """Run every cleaning step on ``self.text`` and return the result.

    Falsy text (``None`` or empty) is returned untouched.

    Returns:
        The cleaned ``self.text``.
    """
    if not self.text:
        return self.text

    # Newline normalisation.
    self.remove_all_newlines()
    self.replace_double_newlines()
    self.replace_newlines()
    self.replace_escaped_newlines()

    # Markup and bracket handling.
    self.text = Text(self.text).apply(*HTML.All)
    self.replace_punctuation_in_brackets()
    self.text = Text(self.text).apply(cr.InlineFormattingRule)

    # Remaining clean-up passes.
    self.clean_quotations()
    self.clean_table_of_contents()
    self.check_for_no_space_in_between_sentences()
    self.clean_consecutive_characters()

    return self.text
def post_process_segments(self, txt):
    """Post-process one segment.

    Args:
        txt: the segment text.

    Returns:
        ``txt`` unchanged when it is longer than two characters and purely
        ASCII-alphabetic; a list of parts when the segment matches the
        quotation-at-end-of-sentence regex; otherwise the segment with
        newlines removed and surrounding whitespace stripped.
    """
    is_plain_word = len(txt) > 2 and re.search(r'\A[a-zA-Z]*\Z', txt)
    if is_plain_word:
        return txt

    # pragmatic segmenter also returns early for consecutive underscores
    # or segments shorter than two characters; that check is not ported.
    if re.match(r'\t', txt):
        # Intentional no-op kept from the original implementation.
        pass

    # TODO: decide on keeping or removing Standard.ExtraWhiteSpaceRule;
    # it is left out here to retain the original text spans.
    txt = Text(txt).apply(*self.lang.ReinsertEllipsisRules.All)

    quotation_at_end = re.search(
        self.lang.QUOTATION_AT_END_OF_SENTENCE_REGEX, txt)
    if not quotation_at_end:
        return txt.replace('\n', '').strip()

    return re.split(
        self.lang.SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX, txt)
def replace_newlines(self):
    """Handle newlines in ``self.text`` according to ``self.doc_type``.

    PDF documents are delegated to ``remove_pdf_line_breaks``; every other
    document type gets the two generic newline rules applied.
    """
    if self.doc_type == 'pdf':
        self.remove_pdf_line_breaks()
        return

    newline_rules = (
        cr.NewLineFollowedByPeriodRule,
        cr.ReplaceNewlineWithCarriageReturnRule,
    )
    self.text = Text(self.text).apply(*newline_rules)
def clean_quotations(self):
    """Normalise quotation characters in ``self.text``.

    Back-ticks become apostrophes, then the two quotation rules are
    applied.
    """
    # Method added explicitly: pragmatic-segmenter applies this step at a
    # different location.
    self.text = self.text.replace('`', "'")
    quotation_rules = (cr.QuotationsFirstRule, cr.QuotationsSecondRule)
    self.text = Text(self.text).apply(*quotation_rules)
def add_line_breaks_for_numbered_list_with_periods(self):
    """Apply the list-item spacing rules to ``self.text`` when needed.

    Nothing happens when the text has no ``♨`` marker, when two markers
    are already separated by a newline or carriage return, or when a
    marker follows the pattern ``for <1-2 digits>`` and is followed by
    whitespace and a lower-case letter.
    """
    if '♨' not in self.text:
        return
    if re.search('♨.+(\n|\r).+♨', self.text):
        return
    if re.search(r'for\s\d{1,2}♨\s[a-z]', self.text):
        return

    self.text = Text(self.text).apply(
        self.SpaceBetweenListItemsFirstRule,
        self.SpaceBetweenListItemsSecondRule)
def search_for_connected_sentences(self, word, txt, regex, rule):
    """Rewrite ``word`` inside ``txt`` when it matches ``regex``.

    Args:
        word: candidate token taken from the text.
        txt: full text in which occurrences of ``word`` are replaced.
        regex: pattern the word must match to be rewritten.
        rule: rule applied to ``word`` to build its replacement.

    Returns:
        ``txt`` unchanged when the word does not match or contains one of
        ``cr.URL_EMAIL_KEYWORDS``; otherwise ``txt`` with every literal
        occurrence of ``word`` replaced.
    """
    if re.search(regex, word) is None:
        return txt

    for keyword in cr.URL_EMAIL_KEYWORDS:
        if keyword in word:
            return txt

    replacement = Text(word).apply(rule)
    # The word is escaped so it is matched literally, not as a pattern.
    # NOTE(review): the replacement is still a re.sub template, so a
    # backslash in it would be treated as an escape - confirm acceptable.
    return re.sub(re.escape(word), replacement, txt)
def replace(self):
    """Run the abbreviation replacement pipeline on ``self.text``.

    Besides the language's possessive and single-letter rules, two local
    rules replace the period after a single lower-case ASCII letter (after
    whitespace, or at the start of the text) with ``∯``.

    Returns:
        The updated ``self.text``.
    """
    # Rubular: http://rubular.com/r/B4X33QKIL8
    single_lower_case_letter = Rule(r'(?<=\s[a-z])\.(?=\s)', '∯')
    # Rubular: http://rubular.com/r/iUNSkCuso0
    single_lower_case_letter_at_start = Rule(r'(?<=^[a-z])\.(?=\s)', '∯')

    self.text = Text(self.text).apply(
        self.lang.PossessiveAbbreviationRule,
        *self.lang.SingleLetterAbbreviationRules.All,
        single_lower_case_letter,
        single_lower_case_letter_at_start)

    self.text = self.search_for_abbreviations_in_string()
    self.replace_multi_period_abbreviations()
    self.text = Text(self.text).apply(*self.lang.AmPmRules.All)
    self.text = self.replace_abbreviation_as_sentence_boundary()
    return self.text
def search_for_connected_sentences(self, word, txt, regex, rule):
    """Rewrite ``word`` inside ``txt`` when it matches ``regex``.

    Args:
        word: candidate token taken from the text.
        txt: full text in which occurrences of ``word`` are replaced.
        regex: pattern the word must match to be rewritten.
        rule: rule applied to ``word`` to build its replacement.

    Returns:
        ``txt`` unchanged when the word does not match ``regex``, contains
        one of ``cr.URL_EMAIL_KEYWORDS`` or contains one of
        ``Abbreviation.ABBREVIATIONS``; otherwise ``txt`` with every
        literal occurrence of ``word`` replaced.
    """
    if not re.search(regex, word):
        return txt
    if any(keyword in word for keyword in cr.URL_EMAIL_KEYWORDS):
        return txt
    if any(abbr in word for abbr in Abbreviation.ABBREVIATIONS):
        return txt

    new_word = Text(word).apply(rule)
    # ``word`` comes straight from the text, so it must be escaped before
    # being used as a pattern: unescaped, characters such as '(', '?' or
    # '+' raise re.error or match the wrong span.
    return re.sub(re.escape(word), new_word, txt)
def replace(self):
    """Replace periods after single upper-case Cyrillic letters.

    A period following one upper-case Cyrillic letter (at the start of the
    text or after whitespace) and followed by whitespace becomes ``∯``;
    multi-period abbreviations are then handled.

    Returns:
        The updated ``self.text``.
    """
    cyrillic_letter_rules = (
        # Letter at the very start of the text.
        Rule(r'(?<=^[А-ЯЁ])\.(?=\s)', '∯'),
        # Letter preceded by whitespace.
        Rule(r'(?<=\s[А-ЯЁ])\.(?=\s)', '∯'),
    )
    self.text = Text(self.text).apply(*cyrillic_letter_rules)
    self.replace_multi_period_abbreviations()
    return self.text
def between_punctuation(self, txt):
    """Protect punctuation that should not end a sentence.

    Runs the configured between-punctuation processor on ``txt``, then
    replaces ``?`` and ``!`` that are followed by optional whitespace and
    a dash with their placeholder tokens.

    Args:
        txt: text to process.

    Returns:
        The processed text.
    """
    txt = self.between_punctuation_processor(txt).replace()

    dash_rules = (
        # Rubular: http://rubular.com/r/WRWy56Z5zp
        Rule(r'(?<=)\?(?=\s*[-—]\s*)', '&ᓷ&'),
        # Rubular: http://rubular.com/r/lixxP7puSa
        Rule(r'(?<=)!(?=\s*[-—]\s*)', '&ᓴ&'),
    )
    return Text(txt).apply(*dash_rules)
def process(self):
    """Run the full processing pipeline on ``self.text``.

    Falsy text (``None`` or empty) is returned untouched.

    Returns:
        The output of ``split_into_segments`` for non-empty text.
    """
    if not self.text:
        return self.text

    self.text = ListItemReplacer(self.text).add_line_break()

    # Replacement passes that update self.text in place.
    self.replace_abbreviations()
    self.replace_numbers()
    self.replace_continuous_punctuation()
    self.replace_periods_before_numeric_references()

    self.text = Text(self.text).apply(
        self.lang.Abbreviation.WithMultiplePeriodsAndEmailRule,
        self.lang.GeoLocationRule,
        self.lang.FileFormatRule)

    return self.split_into_segments()
def process(self):
    """Run the full processing pipeline on ``self.text``.

    Falsy text (``None`` or empty) is returned untouched. Otherwise
    ``nlp(self.text)`` is stored on ``self.doc`` before processing.

    Returns:
        The segments from ``split_into_segments``, passed through
        ``sentences_with_char_spans`` when ``self.char_span`` is truthy.
    """
    if not self.text:
        return self.text

    self.doc = nlp(self.text)
    self.text = ListItemReplacer(self.text).add_line_break()

    # Replacement passes that update self.text in place.
    self.replace_abbreviations()
    self.replace_numbers()
    self.replace_continuous_punctuation()
    self.replace_periods_before_numeric_references()

    self.text = Text(self.text).apply(
        self.lang.Abbreviation.WithMultiplePeriodsAndEmailRule,
        self.lang.GeoLocationRule,
        self.lang.FileFormatRule)

    segments = self.split_into_segments()
    if not self.char_span:
        return segments
    return self.sentences_with_char_spans(segments)
def format_numbered_list_with_parens(self):
    """Format parenthesised numbered lists in ``self.text``.

    Replaces the parens of list items, adds line breaks between them and
    finally applies ``ListMarkerRule``.
    """
    self.replace_parens_in_numbered_list()
    self.add_line_breaks_for_numbered_list_with_parens()
    marker_rule = self.ListMarkerRule
    self.text = Text(self.text).apply(marker_rule)
def replace_escaped_newlines(self):
    """Apply the escaped newline / carriage-return rules to ``self.text``."""
    escaped_newline_rules = (
        cr.EscapedNewLineRule,
        cr.EscapedCarriageReturnRule,
        cr.TypoEscapedNewLineRule,
        cr.TypoEscapedCarriageReturnRule,
    )
    self.text = Text(self.text).apply(*escaped_newline_rules)
def format_numbered_list_with_periods(self):
    """Format period-style numbered lists in ``self.text``.

    Replaces the periods of list items, adds line breaks between them and
    finally applies ``SubstituteListPeriodRule``.
    """
    self.replace_periods_in_numbered_list()
    self.add_line_breaks_for_numbered_list_with_periods()
    period_rule = self.SubstituteListPeriodRule
    self.text = Text(self.text).apply(period_rule)
def remove_pdf_line_breaks(self):
    """Apply the bullet and PDF mid-sentence newline rules to ``self.text``."""
    pdf_rules = (
        cr.NewLineFollowedByBulletRule,
        PDF.NewLineInMiddleOfSentenceRule,
        PDF.NewLineInMiddleOfSentenceNoSpacesRule,
    )
    self.text = Text(self.text).apply(*pdf_rules)
def replace_double_newlines(self):
    """Apply the two double-newline rules to ``self.text``."""
    double_newline_rules = (
        cr.DoubleNewLineWithSpaceRule,
        cr.DoubleNewLineRule,
    )
    self.text = Text(self.text).apply(*double_newline_rules)
def remove_newline_in_middle_of_word(self):
    """Apply ``cr.NewLineInMiddleOfWordRule`` to ``self.text``."""
    mid_word_rule = cr.NewLineInMiddleOfWordRule
    self.text = Text(self.text).apply(mid_word_rule)
def replace_numbers(self):
    """Handle periods that belong to numbers in ``self.text``.

    Applies the language's number rules, then the Slovak date, ordinal
    numeral and Roman numeral passes.

    Returns:
        The updated ``self.text``.
    """
    number_rules = self.lang.Numbers.All
    self.text = Text(self.text).apply(*number_rules)

    self.replace_period_in_slovak_dates()
    self.replace_period_in_ordinal_numerals()
    self.replace_period_in_roman_numerals()

    return self.text
def clean_consecutive_characters(self):
    """Apply the consecutive period / forward-slash rules to ``self.text``."""
    consecutive_rules = (
        cr.ConsecutivePeriodsRule,
        cr.ConsecutiveForwardSlashRule,
    )
    self.text = Text(self.text).apply(*consecutive_rules)
def replace_numbers(self):
    """Handle periods that belong to numbers in ``self.text``.

    Applies the language's number rules, then the German date pass.

    Returns:
        The updated ``self.text``.
    """
    number_rules = self.lang.Numbers.All
    self.text = Text(self.text).apply(*number_rules)
    self.replace_period_in_deutsch_dates()
    return self.text
def clean_table_of_contents(self):
    """Apply the table-of-contents rule and the consecutive period /
    forward-slash rules to ``self.text``."""
    toc_rules = (
        cr.TableOfContentsRule,
        cr.ConsecutivePeriodsRule,
        cr.ConsecutiveForwardSlashRule,
    )
    self.text = Text(self.text).apply(*toc_rules)
def replace_numbers(self):
    """Apply the language's number rules to ``self.text`` in place."""
    number_rules = self.lang.Numbers.All
    self.text = Text(self.text).apply(*number_rules)
def remove_newline_in_middle_of_word(self):
    """Drop a newline that directly follows ``の`` and precedes a
    non-whitespace character in ``self.text``."""
    newline_after_no_rule = Rule(r'(?<=の)\n(?=\S)', '')
    self.text = Text(self.text).apply(newline_after_no_rule)