コード例 #1
0
 def replace(self):
     """Mask periods after single upper-case Cyrillic letters, then
     delegate to ``replace_multi_period_abbreviations``.

     Two rules are built and handed to ``Text.apply``: one targets a period
     after a lone upper-case Cyrillic letter at the start of the string,
     the other a period after such a letter preceded by whitespace. In
     both cases the period must be followed by whitespace, and the
     replacement is '∯'.

     Returns:
         The updated ``self.text``.
     """
     start_of_line_rule = Rule(r'(?<=^[А-ЯЁ])\.(?=\s)', '∯')
     after_whitespace_rule = Rule(r'(?<=\s[А-ЯЁ])\.(?=\s)', '∯')
     wrapped = Text(self.text)
     self.text = wrapped.apply(start_of_line_rule, after_whitespace_rule)
     self.replace_multi_period_abbreviations()
     return self.text
コード例 #2
0
ファイル: cleaner.py プロジェクト: databill86/pySBD
 def search_for_connected_sentences(self, word, txt, regex, rule):
     """Rewrite every occurrence of ``word`` inside ``txt`` using ``rule``.

     ``txt`` is returned untouched when ``word`` does not match ``regex``,
     or when ``word`` contains any entry of ``cr.URL_EMAIL_KEYWORDS`` or of
     the language's ``Abbreviation.ABBREVIATIONS`` as a substring.

     Args:
         word: Candidate token to inspect and rewrite.
         txt: Text in which occurrences of ``word`` are replaced.
         regex: Pattern that ``word`` must match (``re.search``) to qualify.
         rule: Rule passed to ``Text(word).apply`` to build the replacement.

     Returns:
         ``txt`` with each occurrence of ``word`` replaced by the rewritten
         word, or ``txt`` itself when ``word`` does not qualify.
     """
     if not re.search(regex, word):
         return txt
     if any(k in word for k in cr.URL_EMAIL_KEYWORDS):
         return txt
     if any(a in word for a in self.lang.Abbreviation.ABBREVIATIONS):
         return txt
     new_word = Text(word).apply(rule)
     # Plain substring replacement: ``word`` is matched literally and
     # ``new_word`` is inserted verbatim. Passing ``new_word`` to re.sub as
     # the replacement would expand backslash escapes / group references
     # in it (or raise re.error on an invalid one).
     return txt.replace(word, new_word)
コード例 #3
0
        def between_punctuation(self, txt):
            """Run the between-punctuation processor, then mask '?' and '!'
            that are followed by a dash.

            ``txt`` is first passed through
            ``self.between_punctuation_processor(txt).replace()``. Two rules
            are then applied to the result: a '?' or '!' followed by optional
            whitespace, a hyphen or em dash, and optional whitespace is
            replaced by '&ᓷ&' or '&ᓴ&' respectively.

            Returns:
                The result of ``Text.apply`` with both rules.
            """
            processed = self.between_punctuation_processor(txt).replace()
            # Rubular: http://rubular.com/r/WRWy56Z5zp
            question_before_dash = Rule(r'(?<=)\?(?=\s*[-—]\s*)', '&ᓷ&')
            # Rubular: http://rubular.com/r/lixxP7puSa
            exclamation_before_dash = Rule(r'(?<=)!(?=\s*[-—]\s*)', '&ᓴ&')

            wrapped = Text(processed)
            return wrapped.apply(question_before_dash, exclamation_before_dash)
コード例 #4
0
ファイル: processor.py プロジェクト: shradhit/pySBD
 def process(self):
     """Run the replacement pipeline on ``self.text`` and segment it.

     A falsy ``self.text`` is returned as-is. Otherwise the text gets list
     item line breaks, then the abbreviation, number, continuous
     punctuation and numeric-reference replacement steps run in that
     order. Three language rules are applied last before segmentation.

     Returns:
         ``self.text`` when it is falsy, else the value returned by
         ``self.split_into_segments()``.
     """
     if not self.text:
         return self.text
     self.text = ListItemReplacer(self.text).add_line_break()
     # Each step below mutates self.text in place; the order is significant.
     self.replace_abbreviations()
     self.replace_numbers()
     self.replace_continuous_punctuation()
     self.replace_periods_before_numeric_references()
     wrapped = Text(self.text)
     self.text = wrapped.apply(
         self.lang.Abbreviation.WithMultiplePeriodsAndEmailRule,
         self.lang.GeoLocationRule,
         self.lang.FileFormatRule,
     )
     return self.split_into_segments()
コード例 #5
0
ファイル: processor.py プロジェクト: databill86/pySBD
 def process(self):
     """Run the replacement pipeline on ``self.text`` and segment it.

     A falsy ``self.text`` is returned as-is. Otherwise ``self.doc`` is set
     to ``nlp(self.text)`` (``nlp`` is defined outside this method), list
     item line breaks are added, and the abbreviation, number, continuous
     punctuation and numeric-reference replacement steps run in that
     order. Three language rules are applied last before segmentation.

     Returns:
         ``self.text`` when it is falsy. Otherwise the segments from
         ``self.split_into_segments()``, passed through
         ``self.sentences_with_char_spans`` when ``self.char_span`` is truthy.
     """
     if not self.text:
         return self.text
     self.doc = nlp(self.text)
     self.text = ListItemReplacer(self.text).add_line_break()
     # Each step below mutates self.text in place; the order is significant.
     self.replace_abbreviations()
     self.replace_numbers()
     self.replace_continuous_punctuation()
     self.replace_periods_before_numeric_references()
     wrapped = Text(self.text)
     self.text = wrapped.apply(
         self.lang.Abbreviation.WithMultiplePeriodsAndEmailRule,
         self.lang.GeoLocationRule,
         self.lang.FileFormatRule,
     )
     segments = self.split_into_segments()
     if self.char_span:
         return self.sentences_with_char_spans(segments)
     return segments
コード例 #6
0
 def remove_newline_in_middle_of_word(self):
     """Apply ``cr.NewLineInMiddleOfWordRule`` to ``self.text`` and store
     the result back on ``self.text``."""
     wrapped = Text(self.text)
     self.text = wrapped.apply(cr.NewLineInMiddleOfWordRule)
コード例 #7
0
 def clean_consecutive_characters(self):
     """Apply the consecutive-periods and consecutive-forward-slash rules
     from ``cr`` to ``self.text`` and store the result."""
     wrapped = Text(self.text)
     self.text = wrapped.apply(
         cr.ConsecutivePeriodsRule,
         cr.ConsecutiveForwardSlashRule,
     )
コード例 #8
0
ファイル: processor.py プロジェクト: shradhit/pySBD
 def replace_numbers(self):
     """Apply every rule in ``self.lang.Numbers.All`` to ``self.text`` and
     store the result back on ``self.text``."""
     wrapped = Text(self.text)
     self.text = wrapped.apply(*self.lang.Numbers.All)
コード例 #9
0
 def add_line_breaks_for_numbered_list_with_parens(self):
     """Apply ``self.SpaceBetweenListItemsThirdRule`` when list markers are
     present but not already split across lines.

     Nothing happens unless ``self.text`` contains '☝'. Nothing happens
     either when two '☝' markers are already separated by a newline or
     carriage return with other characters on both sides of it.
     """
     if '☝' not in self.text:
         return
     # Markers already sit on different lines, so no breaks need adding.
     if re.search("☝.+\n.+☝|☝.+\r.+☝", self.text):
         return
     wrapped = Text(self.text)
     self.text = wrapped.apply(self.SpaceBetweenListItemsThirdRule)
コード例 #10
0
 def replace_escaped_newlines(self):
     """Apply the four escaped newline / carriage-return rules from ``cr``
     (including the two "typo" variants) to ``self.text`` and store the
     result."""
     wrapped = Text(self.text)
     self.text = wrapped.apply(
         cr.EscapedNewLineRule,
         cr.EscapedCarriageReturnRule,
         cr.TypoEscapedNewLineRule,
         cr.TypoEscapedCarriageReturnRule,
     )
コード例 #11
0
 def replace_numbers(self):
     """Apply every rule in ``self.lang.Numbers.All`` to ``self.text``,
     then call ``replace_period_in_deutsch_dates``.

     Returns:
         ``self.text`` after both steps.
     """
     wrapped = Text(self.text)
     self.text = wrapped.apply(*self.lang.Numbers.All)
     self.replace_period_in_deutsch_dates()
     return self.text
コード例 #12
0
 def remove_newline_in_middle_of_word(self):
     """Apply a rule that targets a newline directly after 'の' and directly
     before a non-whitespace character, with an empty replacement, and
     store the result on ``self.text``."""
     newline_after_no_rule = Rule(r'(?<=の)\n(?=\S)', '')
     wrapped = Text(self.text)
     self.text = wrapped.apply(newline_after_no_rule)
コード例 #13
0
 def format_numbered_list_with_periods(self):
     """Format period-style numbered lists in ``self.text``.

     Calls ``replace_periods_in_numbered_list`` and
     ``add_line_breaks_for_numbered_list_with_periods`` in that order, then
     applies ``self.SubstituteListPeriodRule`` to ``self.text``.
     """
     self.replace_periods_in_numbered_list()
     self.add_line_breaks_for_numbered_list_with_periods()
     wrapped = Text(self.text)
     self.text = wrapped.apply(self.SubstituteListPeriodRule)
コード例 #14
0
 def format_numbered_list_with_parens(self):
     """Format parenthesis-style numbered lists in ``self.text``.

     Calls ``replace_parens_in_numbered_list`` and
     ``add_line_breaks_for_numbered_list_with_parens`` in that order, then
     applies ``self.ListMarkerRule`` to ``self.text``.
     """
     self.replace_parens_in_numbered_list()
     self.add_line_breaks_for_numbered_list_with_parens()
     wrapped = Text(self.text)
     self.text = wrapped.apply(self.ListMarkerRule)
コード例 #15
0
 def replace_double_newlines(self):
     """Apply ``cr.DoubleNewLineWithSpaceRule`` and ``cr.DoubleNewLineRule``
     to ``self.text`` and store the result."""
     wrapped = Text(self.text)
     self.text = wrapped.apply(
         cr.DoubleNewLineWithSpaceRule,
         cr.DoubleNewLineRule,
     )
コード例 #16
0
 def remove_pdf_line_breaks(self):
     """Apply the bullet-newline rule from ``cr`` and the two
     newline-in-middle-of-sentence rules from ``PDF`` to ``self.text`` and
     store the result."""
     wrapped = Text(self.text)
     self.text = wrapped.apply(
         cr.NewLineFollowedByBulletRule,
         PDF.NewLineInMiddleOfSentenceRule,
         PDF.NewLineInMiddleOfSentenceNoSpacesRule,
     )
コード例 #17
0
 def replace_numbers(self):
     """Apply every rule in ``self.lang.Numbers.All`` to ``self.text``,
     then run the Slovak date, ordinal numeral and Roman numeral period
     replacements in that order.

     Returns:
         ``self.text`` after all steps.
     """
     wrapped = Text(self.text)
     self.text = wrapped.apply(*self.lang.Numbers.All)
     self.replace_period_in_slovak_dates()
     self.replace_period_in_ordinal_numerals()
     self.replace_period_in_roman_numerals()
     return self.text
コード例 #18
0
 def clean_table_of_contents(self):
     """Apply the table-of-contents, consecutive-periods and
     consecutive-forward-slash rules from ``cr`` to ``self.text`` and store
     the result."""
     wrapped = Text(self.text)
     self.text = wrapped.apply(
         cr.TableOfContentsRule,
         cr.ConsecutivePeriodsRule,
         cr.ConsecutiveForwardSlashRule,
     )
コード例 #19
0
 def add_line_breaks_for_numbered_list_with_periods(self):
     """Apply the first and second space-between-list-items rules when
     '♨' markers are present and none of the skip conditions hold.

     ``self.text`` is left untouched when it has no '♨', when two '♨'
     markers are already separated by a newline or carriage return, or
     when it contains "for", one or two digits, '♨', whitespace and a
     lower-case letter in sequence.
     """
     if '♨' not in self.text:
         return
     # Markers already sit on different lines.
     if re.search('♨.+(\n|\r).+♨', self.text):
         return
     # Skip text shaped like "for <1-2 digits>♨ <lowercase>".
     if re.search(r'for\s\d{1,2}♨\s[a-z]', self.text):
         return
     wrapped = Text(self.text)
     self.text = wrapped.apply(self.SpaceBetweenListItemsFirstRule,
                               self.SpaceBetweenListItemsSecondRule)