def replace(self):
    """Mark periods after single uppercase Cyrillic letters, then return the text.

    Two rules are applied to ``self.text``; both swap a period for '∯' when
    the period directly follows one uppercase Cyrillic letter (А-Я or Ё)
    and is followed by whitespace:

    * the letter is anchored by ``^``;
    * the letter is preceded by whitespace.

    Afterwards ``self.replace_multi_period_abbreviations()`` is invoked and
    the resulting ``self.text`` is returned.
    """
    # Letter anchored with `^`, e.g. an initial opening the text.
    letter_at_start_rule = Rule(r'(?<=^[А-ЯЁ])\.(?=\s)', '∯')
    # Letter that follows a whitespace character.
    letter_after_space_rule = Rule(r'(?<=\s[А-ЯЁ])\.(?=\s)', '∯')

    wrapped = Text(self.text)
    self.text = wrapped.apply(letter_at_start_rule, letter_after_space_rule)

    self.replace_multi_period_abbreviations()
    return self.text
def search_for_connected_sentences(self, word, txt, regex, rule):
    """Apply ``rule`` to ``word`` and substitute the result into ``txt``.

    ``txt`` is returned unchanged when any of the following holds:

    * ``regex`` does not match anywhere in ``word``;
    * ``word`` contains one of ``cr.URL_EMAIL_KEYWORDS``;
    * ``word`` contains one of ``self.lang.Abbreviation.ABBREVIATIONS``.

    Otherwise every literal occurrence of ``word`` in ``txt`` is replaced
    by ``Text(word).apply(rule)``.

    Args:
        word: The candidate token to inspect and rewrite.
        txt: The text in which ``word`` is replaced.
        regex: Pattern that ``word`` must match for any rewrite to happen.
        rule: Rule handed to ``Text(word).apply``.

    Returns:
        ``txt`` with the rewritten word substituted, or ``txt`` untouched.
    """
    if not re.search(regex, word):
        return txt
    if any(keyword in word for keyword in cr.URL_EMAIL_KEYWORDS):
        return txt
    if any(abbr in word for abbr in self.lang.Abbreviation.ABBREVIATIONS):
        return txt

    new_word = Text(word).apply(rule)
    # Literal substitution on purpose: passing ``new_word`` to ``re.sub`` as
    # the replacement would make the regex engine interpret any backslash in
    # it as an escape or group reference (wrong output or ``re.error``).
    return txt.replace(word, new_word)
def between_punctuation(self, txt):
    """Run the between-punctuation processor, then mark '?' / '!' before a dash.

    ``txt`` is first passed through
    ``self.between_punctuation_processor(txt).replace()``. After that, a
    '?' or '!' that is followed by optional whitespace, a hyphen or em
    dash, and optional whitespace is replaced by the placeholder '&ᓷ&' or
    '&ᓴ&' respectively.

    Args:
        txt: The text to process.

    Returns:
        The processed text.
    """
    txt = self.between_punctuation_processor(txt).replace()

    dash_rules = (
        # Question mark before a dash.
        # Rubular: http://rubular.com/r/WRWy56Z5zp
        Rule(r'(?<=)\?(?=\s*[-—]\s*)', '&ᓷ&'),
        # Exclamation mark before a dash.
        # Rubular: http://rubular.com/r/lixxP7puSa
        Rule(r'(?<=)!(?=\s*[-—]\s*)', '&ᓴ&'),
    )
    return Text(txt).apply(*dash_rules)
def process(self):
    """Run the replacement pipeline on ``self.text`` and split it into segments.

    A falsy ``self.text`` is returned as-is without any processing.
    Otherwise the steps run in this order: list-item line breaks,
    abbreviations, numbers, continuous punctuation, periods before numeric
    references, then the language's multi-period/email, geo-location and
    file-format rules. Finally the text is split via
    ``self.split_into_segments()``.

    Returns:
        The value of ``self.split_into_segments()``, or the falsy
        ``self.text`` when there is nothing to process.
    """
    if not self.text:
        return self.text

    self.text = ListItemReplacer(self.text).add_line_break()

    self.replace_abbreviations()
    self.replace_numbers()
    self.replace_continuous_punctuation()
    self.replace_periods_before_numeric_references()

    wrapped = Text(self.text)
    self.text = wrapped.apply(
        self.lang.Abbreviation.WithMultiplePeriodsAndEmailRule,
        self.lang.GeoLocationRule,
        self.lang.FileFormatRule,
    )

    return self.split_into_segments()
def process(self):
    """Run the replacement pipeline on ``self.text`` and split it into segments.

    A falsy ``self.text`` is returned as-is without any processing.
    Otherwise ``self.doc`` is set to ``nlp(self.text)`` (computed from the
    text before any replacement), and the steps run in this order:
    list-item line breaks, abbreviations, numbers, continuous punctuation,
    periods before numeric references, then the language's
    multi-period/email, geo-location and file-format rules.

    Returns:
        ``self.sentences_with_char_spans(segments)`` when ``self.char_span``
        is truthy, otherwise the segments from
        ``self.split_into_segments()``; or the falsy ``self.text`` when
        there is nothing to process.
    """
    if not self.text:
        return self.text

    self.doc = nlp(self.text)
    self.text = ListItemReplacer(self.text).add_line_break()

    self.replace_abbreviations()
    self.replace_numbers()
    self.replace_continuous_punctuation()
    self.replace_periods_before_numeric_references()

    wrapped = Text(self.text)
    self.text = wrapped.apply(
        self.lang.Abbreviation.WithMultiplePeriodsAndEmailRule,
        self.lang.GeoLocationRule,
        self.lang.FileFormatRule,
    )

    segments = self.split_into_segments()
    if not self.char_span:
        return segments
    return self.sentences_with_char_spans(segments)
def remove_newline_in_middle_of_word(self):
    """Apply ``cr.NewLineInMiddleOfWordRule`` to ``self.text`` in place."""
    wrapped = Text(self.text)
    self.text = wrapped.apply(cr.NewLineInMiddleOfWordRule)
def clean_consecutive_characters(self):
    """Apply the consecutive-period and consecutive-slash rules to ``self.text``.

    The rules are applied in the order ``cr.ConsecutivePeriodsRule``,
    ``cr.ConsecutiveForwardSlashRule``; the result is stored back on
    ``self.text``.
    """
    wrapped = Text(self.text)
    rules = (
        cr.ConsecutivePeriodsRule,
        cr.ConsecutiveForwardSlashRule,
    )
    self.text = wrapped.apply(*rules)
def replace_numbers(self):
    """Apply every rule in ``self.lang.Numbers.All`` to ``self.text`` in place."""
    wrapped = Text(self.text)
    number_rules = self.lang.Numbers.All
    self.text = wrapped.apply(*number_rules)
def add_line_breaks_for_numbered_list_with_parens(self):
    """Apply ``self.SpaceBetweenListItemsThirdRule`` when '☝' markers share a line.

    Nothing happens when ``self.text`` has no '☝' marker, or when two
    markers are already separated by a newline or carriage return (with at
    least one character on either side of it).
    """
    if '☝' not in self.text:
        return
    # Two markers already separated by a line break: leave the text alone.
    if re.search("☝.+\n.+☝|☝.+\r.+☝", self.text):
        return
    self.text = Text(self.text).apply(self.SpaceBetweenListItemsThirdRule)
def replace_escaped_newlines(self):
    """Apply the escaped newline / carriage-return rules to ``self.text``.

    Rules run in this order: ``cr.EscapedNewLineRule``,
    ``cr.EscapedCarriageReturnRule``, ``cr.TypoEscapedNewLineRule``,
    ``cr.TypoEscapedCarriageReturnRule``. The result replaces ``self.text``.
    """
    wrapped = Text(self.text)
    rules = (
        cr.EscapedNewLineRule,
        cr.EscapedCarriageReturnRule,
        cr.TypoEscapedNewLineRule,
        cr.TypoEscapedCarriageReturnRule,
    )
    self.text = wrapped.apply(*rules)
def replace_numbers(self):
    """Apply the language's number rules, then the German date handling.

    Every rule in ``self.lang.Numbers.All`` is applied to ``self.text``,
    after which ``self.replace_period_in_deutsch_dates()`` is called.

    Returns:
        The value of ``self.text`` after both steps.
    """
    wrapped = Text(self.text)
    number_rules = self.lang.Numbers.All
    self.text = wrapped.apply(*number_rules)

    self.replace_period_in_deutsch_dates()
    return self.text
def remove_newline_in_middle_of_word(self):
    """Apply a rule targeting a newline between 'の' and a non-space character.

    The rule pairs the pattern (newline preceded by 'の' and followed by a
    non-whitespace character) with an empty replacement string; the result
    of applying it is stored on ``self.text``.
    """
    newline_after_no_rule = Rule(r'(?<=の)\n(?=\S)', '')
    wrapped = Text(self.text)
    self.text = wrapped.apply(newline_after_no_rule)
def format_numbered_list_with_periods(self):
    """Run the three period-style numbered-list steps on ``self.text``.

    In order: ``replace_periods_in_numbered_list()``,
    ``add_line_breaks_for_numbered_list_with_periods()``, then
    ``self.SubstituteListPeriodRule`` is applied to ``self.text``.
    """
    self.replace_periods_in_numbered_list()
    self.add_line_breaks_for_numbered_list_with_periods()

    wrapped = Text(self.text)
    self.text = wrapped.apply(self.SubstituteListPeriodRule)
def format_numbered_list_with_parens(self):
    """Run the three parenthesis-style numbered-list steps on ``self.text``.

    In order: ``replace_parens_in_numbered_list()``,
    ``add_line_breaks_for_numbered_list_with_parens()``, then
    ``self.ListMarkerRule`` is applied to ``self.text``.
    """
    self.replace_parens_in_numbered_list()
    self.add_line_breaks_for_numbered_list_with_parens()

    wrapped = Text(self.text)
    self.text = wrapped.apply(self.ListMarkerRule)
def replace_double_newlines(self):
    """Apply the double-newline rules to ``self.text`` in place.

    ``cr.DoubleNewLineWithSpaceRule`` is applied before
    ``cr.DoubleNewLineRule``.
    """
    wrapped = Text(self.text)
    rules = (
        cr.DoubleNewLineWithSpaceRule,
        cr.DoubleNewLineRule,
    )
    self.text = wrapped.apply(*rules)
def remove_pdf_line_breaks(self):
    """Apply the PDF line-break rules to ``self.text`` in place.

    Rules run in this order: ``cr.NewLineFollowedByBulletRule``,
    ``PDF.NewLineInMiddleOfSentenceRule``,
    ``PDF.NewLineInMiddleOfSentenceNoSpacesRule``.
    """
    wrapped = Text(self.text)
    rules = (
        cr.NewLineFollowedByBulletRule,
        PDF.NewLineInMiddleOfSentenceRule,
        PDF.NewLineInMiddleOfSentenceNoSpacesRule,
    )
    self.text = wrapped.apply(*rules)
def replace_numbers(self):
    """Apply the language's number rules, then the Slovak-specific steps.

    Every rule in ``self.lang.Numbers.All`` is applied to ``self.text``.
    Then, in order, ``replace_period_in_slovak_dates()``,
    ``replace_period_in_ordinal_numerals()`` and
    ``replace_period_in_roman_numerals()`` are called.

    Returns:
        The value of ``self.text`` after all steps.
    """
    wrapped = Text(self.text)
    number_rules = self.lang.Numbers.All
    self.text = wrapped.apply(*number_rules)

    self.replace_period_in_slovak_dates()
    self.replace_period_in_ordinal_numerals()
    self.replace_period_in_roman_numerals()
    return self.text
def clean_table_of_contents(self):
    """Apply the table-of-contents cleanup rules to ``self.text`` in place.

    Rules run in this order: ``cr.TableOfContentsRule``,
    ``cr.ConsecutivePeriodsRule``, ``cr.ConsecutiveForwardSlashRule``.
    """
    wrapped = Text(self.text)
    rules = (
        cr.TableOfContentsRule,
        cr.ConsecutivePeriodsRule,
        cr.ConsecutiveForwardSlashRule,
    )
    self.text = wrapped.apply(*rules)
def add_line_breaks_for_numbered_list_with_periods(self):
    """Apply the first two list-item spacing rules when '♨' markers share a line.

    Nothing happens when any of these holds for ``self.text``:

    * it contains no '♨' marker;
    * two markers are already separated by a newline or carriage return
      (with at least one character on either side of it);
    * it contains "for", whitespace, one or two digits, '♨', whitespace
      and a lowercase ASCII letter.

    Otherwise ``self.SpaceBetweenListItemsFirstRule`` and
    ``self.SpaceBetweenListItemsSecondRule`` are applied, in that order.
    """
    if '♨' not in self.text:
        return
    # Markers already sit on separate lines.
    if re.search('♨.+(\n|\r).+♨', self.text):
        return
    # "for <number>♨ <lowercase>" pattern present: skip the spacing rules.
    if re.search(r'for\s\d{1,2}♨\s[a-z]', self.text):
        return
    self.text = Text(self.text).apply(self.SpaceBetweenListItemsFirstRule,
                                      self.SpaceBetweenListItemsSecondRule)