Beispiel #1
0
class EllipsisRules:
    """Rules that swap ellipsis patterns for placeholder symbols."""

    # These rules intentionally differ from the original pragmatic segmenter
    # rules: spaces are replaced with the same number of symbols.

    # Three dots followed by whitespace and an uppercase letter.
    # Rubular: http://rubular.com/r/i60hCK81fz
    ThreeConsecutiveRule = Rule(r'\.\.\.(?=\s+[A-Z])', '☏☏.')

    # Three dots preceded by a non-space character and followed by a fourth
    # dot, one whitespace character and an uppercase letter.
    # Rubular: http://rubular.com/r/Hdqpd90owl
    FourConsecutiveRule = Rule(r'(?<=\S)\.{3}(?=\.\s[A-Z])', 'ƪƪƪ')

    # Three "whitespace + dot" pairs followed by whitespace.
    # Rubular: http://rubular.com/r/YBG1dIHTRu
    ThreeSpaceRule = Rule(r'(\s\.){3}\s', '♟♟♟♟♟♟♟')

    # Three "dot + whitespace" pairs and a final dot after a lowercase letter.
    # Rubular: http://rubular.com/r/2VvZ8wRbd8
    FourSpaceRule = Rule(r'(?<=[a-z])(\.\s){3}\.($|\\n)', '♝♝♝♝♝♝♝')

    # Any remaining run of three dots.
    OtherThreePeriodRule = Rule(r'\.\.\.', 'ƪƪƪ')

    # The catch-all rule is listed last.
    All = [
        ThreeSpaceRule,
        FourSpaceRule,
        FourConsecutiveRule,
        ThreeConsecutiveRule,
        OtherThreePeriodRule,
    ]
Beispiel #2
0
class Numbers:
    """Rules that replace periods adjacent to digits with the '∯' placeholder."""

    # A period directly followed by a digit.
    # Rubular: http://rubular.com/r/oNyxBOqbyy
    PeriodBeforeNumberRule = Rule(r'\.(?=\d)', '∯')

    # A period between a digit and any non-whitespace character.
    # Rubular: http://rubular.com/r/EMk5MpiUzt
    NumberAfterPeriodBeforeLetterRule = Rule(r'(?<=\d)\.(?=\S)', '∯')

    # A period after "\r" + one digit, followed by either
    # "whitespace + non-whitespace" or a closing parenthesis.
    # Rubular: http://rubular.com/r/rf4l1HjtjG
    NewLineNumberPeriodSpaceLetterRule = Rule(
        r'(?<=\r\d)\.(?=(\s\S)|\))', '∯')

    # Same as above, but the single digit is anchored with ``^``.
    # Rubular: http://rubular.com/r/HPa4sdc6b9
    StartLineNumberPeriodRule = Rule(r'(?<=^\d)\.(?=(\s\S)|\))', '∯')

    # Same as above for a two-digit number anchored with ``^``.
    # Rubular: http://rubular.com/r/NuvWnKleFl
    StartLineTwoDigitNumberPeriodRule = Rule(
        r'(?<=^\d\d)\.(?=(\s\S)|\))', '∯')

    All = [
        PeriodBeforeNumberRule,
        NumberAfterPeriodBeforeLetterRule,
        NewLineNumberPeriodSpaceLetterRule,
        StartLineNumberPeriodRule,
        StartLineTwoDigitNumberPeriodRule,
    ]
Beispiel #3
0
class Persian(Common, Standard):
    """Persian ('fa') rule set built on top of ``Common`` and ``Standard``."""

    iso_code = 'fa'

    # Characters treated as sentence punctuation, including the Arabic
    # question mark.
    Punctuations = ['?', '!', ':', '.', '؟']
    SENTENCE_BOUNDARY_REGEX = r'.*?[:\.!\?؟]|.*?\Z|.*?$'

    # A colon between two digits becomes the '♭' placeholder.
    # Rubular: http://rubular.com/r/RX5HpdDIyv
    ReplaceColonBetweenNumbersRule = Rule(r'(?<=\d):(?=\d)', '♭')

    # An Arabic comma followed by whitespace and a token that itself ends in
    # an Arabic comma becomes the '♬' placeholder.
    # Rubular: http://rubular.com/r/kPRgApNHUg
    ReplaceNonSentenceBoundaryCommaRule = Rule(r'،(?=\s\S+،)', '♬')

    class AbbreviationReplacer(AbbreviationReplacer):
        """Abbreviation replacer with no sentence starters configured."""

        SENTENCE_STARTERS = []

        def __init__(self, text, lang):
            super().__init__(text, lang)

        def scan_for_replacements(self, txt, am, index, character_array):
            """Replace every period that directly follows ``am`` with '∯'.

            ``index`` and ``character_array`` are accepted but not used here.

            NOTE(review): ``am`` is interpolated into the pattern without
            ``re.escape``, so regex metacharacters in it are interpreted as
            regex syntax - confirm this is intended by the caller.
            """
            # Raw string: '\.' in a normal literal is an invalid escape
            # sequence and triggers a SyntaxWarning on recent Python versions.
            txt = re.sub(r'(?<={0})\.'.format(am), '∯', txt)
            return txt
    class Abbreviation:
        """Abbreviation word lists and the rule for periods inside tokens."""

        # All known abbreviations, stored without their trailing period.
        ABBREVIATIONS = (
            'adj adm adv al ala alta apr arc ariz '
            'ark art assn asst attys aug ave bart bld '
            'bldg blvd brig bros btw cal calif capt '
            'cl cmdr co col colo comdr con conn corp '
            'cpl cres ct d.phil dak dec del dept det '
            'dist dr dr.phil dr.philos drs e.g ens esp '
            'esq etc exp expy ext feb fed fla ft '
            'fwy fy ga gen gov hon hosp hr hway '
            'hwy i.e ia id ida ill inc ind ing '
            'insp is jan jr jul jun kan kans ken '
            'ky la lt ltd maj man mar mass may md '
            'me med messrs mex mfg mich min minn miss '
            'mlle mm mme mo mont mr mrs ms msgr '
            'mssrs mt mtn neb nebr nev no nos nov '
            'nr oct ok okla ont op ord ore p pa '
            'pd pde penn penna pfc ph ph.d pl plz '
            'pp prof pvt que rd rs ref rep reps '
            'res rev rt sask sec sen sens sep sept '
            'sfc sgt sr st supt surg tce tenn tex '
            'univ usafa u.s ut va v ver viz vs vt '
            'wash wis wisc wy wyo yuk fig'
        ).split()

        # Subset of the abbreviations listed separately as "prepositive".
        PREPOSITIVE_ABBREVIATIONS = (
            'adm attys brig capt cmdr col cpl det dr '
            'gen gov ing lt maj mr mrs ms mt '
            'messrs mssrs prof ph rep reps rev sen '
            'sens sgt st supt v vs fig'
        ).split()

        # Abbreviations listed separately as number-related.
        NUMBER_ABBREVIATIONS = 'art ext no nos p pp'.split()

        # Rubular: http://rubular.com/r/EUbZCNfgei
        # The earlier pattern was r'(\w)(\.)(\w)'. Because \w in Python also
        # matches non-English word characters, the classes are spelled out
        # to limit the rule to English alphanumerics and underscore.
        WithMultiplePeriodsAndEmailRule = Rule(
            r'([a-zA-Z0-9_])(\.)([a-zA-Z0-9_])',
            '\\1∮\\3',
        )
Beispiel #5
0
class Standard(object):
    """Punctuation marks and substitution rules shared by languages."""

    # This class holds the punctuation marks.
    # NOTE(review): '.', '!' and '?' each appear twice in this list; possibly
    # full-width variants were intended for the duplicates - confirm.
    Punctuations = ['。', '.', '.', '!', '!', '?', '?']

    # A period after "<ASCII letter>°" and before optional whitespace and
    # digits becomes '∯'.
    # The letter class was `[a-zA-z]`, whose `A-z` range also matched the
    # characters [ \ ] ^ _ ` that sit between 'Z' and 'a' in ASCII.
    # Rubular: http://rubular.com/r/G2opjedIm9
    GeoLocationRule = Rule(r'(?<=[a-zA-Z]°)\.(?=\s*\d+)', '∯')

    # A period preceded by whitespace and followed by a known file
    # extension plus whitespace becomes '∯'.
    FileFormatRule = Rule(
        r'(?<=\s)\.(?=(jpe?g|png|gif|tiff?|pdf|ps|docx?|xlsx?|svg|bmp|tga|exif|odt|html?|txt|rtf|bat|sxw|xml|zip|exe|msi|blend|wmv|mp[34]|pptx?|flac|rb|cpp|cs|js)\s)',
        '∯')

    # Every newline becomes the 'ȹ' placeholder.
    SingleNewLineRule = Rule(r'\n', 'ȹ')

    # A question mark directly before a single or double quote.
    # Rubular: http://rubular.com/r/aXPUGm6fQh
    QuestionMarkInQuotationRule = Rule(r'\?(?=(\'|\"))', '&ᓷ&')

    # Runs of three or more whitespace characters collapse to one space.
    ExtraWhiteSpaceRule = Rule(r'\s{3,}', ' ')

    # Restores a single quote from its placeholder.
    SubSingleQuoteRule = Rule(r'&⎋&', "'")
Beispiel #6
0
 class SubSymbolsRules:
     """Rules that turn placeholder symbols back into punctuation."""

     # NOTE(review): the "FullWidth"/"Special" rules below restore the same
     # characters as their plain counterparts ('.', '!', '?'); confirm
     # whether full-width characters were intended.
     Period = Rule(r'∯', '.')
     ArabicComma = Rule(r'♬', '،')
     SemiColon = Rule(r'♭', ':')
     FullWidthPeriod = Rule(r'&ᓰ&', '。')
     SpecialPeriod = Rule(r'&ᓱ&', '.')
     FullWidthExclamation = Rule(r'&ᓳ&', '!')
     ExclamationPoint = Rule(r'&ᓴ&', '!')
     QuestionMark = Rule(r'&ᓷ&', '?')
     FullWidthQuestionMark = Rule(r'&ᓸ&', '?')
     MixedDoubleQE = Rule(r'☉', '?!')
     MixedDoubleQQ = Rule(r'☇', '??')
     MixedDoubleEQ = Rule(r'☈', '!?')
     MixedDoubleEE = Rule(r'☄', '!!')
     LeftParens = Rule(r'&✂&', '(')
     RightParens = Rule(r'&⌬&', ')')
     # The temporary ending marker is replaced with an empty string.
     TemporaryEndingPunctutation = Rule(r'ȸ', '')
     Newline = Rule(r'ȹ', "\n")

     All = [
         Period,
         ArabicComma,
         SemiColon,
         FullWidthPeriod,
         SpecialPeriod,
         FullWidthExclamation,
         ExclamationPoint,
         QuestionMark,
         FullWidthQuestionMark,
         MixedDoubleQE,
         MixedDoubleQQ,
         MixedDoubleEQ,
         MixedDoubleEE,
         LeftParens,
         RightParens,
         TemporaryEndingPunctutation,
         Newline,
     ]
Beispiel #7
0
class Standard:
    """Punctuation marks, abbreviation lists and substitution rule groups."""

    # This class holds the punctuation marks.
    # NOTE(review): '.', '!' and '?' each appear twice in this list; possibly
    # full-width variants were intended for the duplicates - confirm.
    Punctuations = ['。', '.', '.', '!', '!', '?', '?']

    # A period after "<ASCII letter>°" and before optional whitespace and
    # digits becomes '∯'.
    # The letter class was `[a-zA-z]`, whose `A-z` range also matched the
    # characters [ \ ] ^ _ ` that sit between 'Z' and 'a' in ASCII.
    # Rubular: http://rubular.com/r/G2opjedIm9
    GeoLocationRule = Rule(r'(?<=[a-zA-Z]°)\.(?=\s*\d+)', '∯')

    # A period preceded by whitespace and followed by a known file
    # extension plus whitespace becomes '∯'.
    FileFormatRule = Rule(r'(?<=\s)\.(?=(jpe?g|png|gif|tiff?|pdf|ps|docx?|xlsx?|svg|bmp|tga|exif|odt|html?|txt|rtf|bat|sxw|xml|zip|exe|msi|blend|wmv|mp[34]|pptx?|flac|rb|cpp|cs|js)\s)', '∯')

    # Every newline becomes the 'ȹ' placeholder.
    SingleNewLineRule = Rule(r'\n', 'ȹ')

    # A question mark directly before a single or double quote.
    # Rubular: http://rubular.com/r/aXPUGm6fQh
    QuestionMarkInQuotationRule = Rule(r'\?(?=(\'|\"))', '&ᓷ&')

    # Runs of three or more whitespace characters collapse to one space.
    ExtraWhiteSpaceRule = Rule(r'\s{3,}', ' ')

    # Restores a single quote from its placeholder.
    SubSingleQuoteRule = Rule(r'&⎋&', "'")

    class Abbreviation(object):
        """Defines the abbreviations for each language (if available)"""
        # All known abbreviations, stored without their trailing period.
        ABBREVIATIONS = [
            'adj', 'adm', 'adv', 'al', 'ala', 'alta', 'apr', 'arc', 'ariz',
            'ark', 'art', 'assn', 'asst', 'attys', 'aug', 'ave', 'bart', 'bld',
            'bldg', 'blvd', 'brig', 'bros', 'btw', 'cal', 'calif', 'capt',
            'cl', 'cmdr', 'co', 'col', 'colo', 'comdr', 'con', 'conn', 'corp',
            'cpl', 'cres', 'ct', 'd.phil', 'dak', 'dec', 'del', 'dept', 'det',
            'dist', 'dr', 'dr.phil', 'dr.philos', 'drs', 'e.g', 'ens', 'esp',
            'esq', 'etc', 'exp', 'expy', 'ext', 'feb', 'fed', 'fla', 'ft',
            'fwy', 'fy', 'ga', 'gen', 'gov', 'hon', 'hosp', 'hr', 'hway',
            'hwy', 'i.e', 'ia', 'id', 'ida', 'ill', 'inc', 'ind', 'ing',
            'insp', 'is', 'jan', 'jr', 'jul', 'jun', 'kan', 'kans', 'ken',
            'ky', 'la', 'lt', 'ltd', 'maj', 'man', 'mar', 'mass', 'may', 'md',
            'me', 'med', 'messrs', 'mex', 'mfg', 'mich', 'min', 'minn', 'miss',
            'mlle', 'mm', 'mme', 'mo', 'mont', 'mr', 'mrs', 'ms', 'msgr',
            'mssrs', 'mt', 'mtn', 'neb', 'nebr', 'nev', 'no', 'nos', 'nov',
            'nr', 'oct', 'ok', 'okla', 'ont', 'op', 'ord', 'ore', 'p', 'pa',
            'pd', 'pde', 'penn', 'penna', 'pfc', 'ph', 'ph.d', 'pl', 'plz',
            'pp', 'prof', 'pvt', 'que', 'rd', 'rs', 'ref', 'rep', 'reps',
            'res', 'rev', 'rt', 'sask', 'sec', 'sen', 'sens', 'sep', 'sept',
            'sfc', 'sgt', 'sr', 'st', 'supt', 'surg', 'tce', 'tenn', 'tex',
            'univ', 'usafa', 'u.s', 'ut', 'va', 'v', 'ver', 'viz', 'vs', 'vt',
            'wash', 'wis', 'wisc', 'wy', 'wyo', 'yuk', 'fig'
        ]
        # Subset of the abbreviations listed separately as "prepositive".
        PREPOSITIVE_ABBREVIATIONS = [
            'adm', 'attys', 'brig', 'capt', 'cmdr', 'col', 'cpl', 'det', 'dr',
            'gen', 'gov', 'ing', 'lt', 'maj', 'mr', 'mrs', 'ms', 'mt',
            'messrs', 'mssrs', 'prof', 'ph', 'rep', 'reps', 'rev', 'sen',
            'sens', 'sgt', 'st', 'supt', 'v', 'vs', 'fig'
        ]
        # Abbreviations listed separately as number-related.
        NUMBER_ABBREVIATIONS = ['art', 'ext', 'no', 'nos', 'p', 'pp']

        # Part of "Abbreviations" ruby module
        # A period between two word characters becomes '∮'.
        # Rubular: http://rubular.com/r/EUbZCNfgei
        WithMultiplePeriodsAndEmailRule = Rule(r'(\w)(\.)(\w)', '\\1∮\\3')

    class DoublePunctuationRules(object):
        """Replace two-character '?'/'!' combinations with one placeholder."""
        FirstRule = Rule(r'\?!', '☉')
        SecondRule = Rule(r'!\?', '☈')
        ThirdRule = Rule(r'\?\?', '☇')
        ForthRule = Rule(r'!!', '☄')
        # Pattern matching any of the four combinations above.
        DoublePunctuation = r'\?!|!\?|\?\?|!!'
        All = [FirstRule, SecondRule, ThirdRule, ForthRule]

    class ExclamationPointRules(object):
        """Replace exclamation points in specific contexts with '&ᓴ&'."""
        # Directly before a single or double quote.
        # Rubular: http://rubular.com/r/XS1XXFRfM2
        InQuotationRule = Rule(r'\!(?=(\'|\"))', '&ᓴ&')

        # Before a comma, whitespace and a lowercase letter.
        # Rubular: http://rubular.com/r/sl57YI8LkA
        BeforeCommaMidSentenceRule = Rule(r'\!(?=\,\s[a-z])', '&ᓴ&')

        # Before whitespace and a lowercase letter.
        # Rubular: http://rubular.com/r/f9zTjmkIPb
        MidSentenceRule = Rule(r'\!(?=\s[a-z])', '&ᓴ&')

        All = [InQuotationRule, BeforeCommaMidSentenceRule, MidSentenceRule]

    class SubSymbolsRules(object):
        """Rules that turn placeholder symbols back into punctuation."""
        # NOTE(review): the "FullWidth"/"Special" rules restore the same
        # characters as their plain counterparts ('.', '!', '?'); confirm
        # whether full-width characters were intended.
        Period = Rule(r'∯', '.')
        ArabicComma = Rule(r'♬', '،')
        SemiColon = Rule(r'♭', ':')
        FullWidthPeriod = Rule(r'&ᓰ&', '。')
        SpecialPeriod = Rule(r'&ᓱ&', '.')
        FullWidthExclamation = Rule(r'&ᓳ&', '!')
        ExclamationPoint = Rule(r'&ᓴ&', '!')
        QuestionMark = Rule(r'&ᓷ&', '?')
        FullWidthQuestionMark = Rule(r'&ᓸ&', '?')
        MixedDoubleQE = Rule(r'☉', '?!')
        MixedDoubleQQ = Rule(r'☇', '??')
        MixedDoubleEQ = Rule(r'☈', '!?')
        MixedDoubleEE = Rule(r'☄', '!!')
        LeftParens = Rule(r'&✂&', '(')
        RightParens = Rule(r'&⌬&', ')')
        # The temporary ending marker is replaced with an empty string.
        TemporaryEndingPunctutation = Rule(r'ȸ', '')
        Newline = Rule(r'ȹ', "\n")
        All = [Period, ArabicComma, SemiColon, FullWidthPeriod, SpecialPeriod,
               FullWidthExclamation, ExclamationPoint, QuestionMark,
               FullWidthQuestionMark, MixedDoubleQE, MixedDoubleQQ, MixedDoubleEQ,
               MixedDoubleEE, LeftParens, RightParens, TemporaryEndingPunctutation,
               Newline]

    class EllipsisRules(object):
        """Rules that swap ellipsis patterns for placeholder symbols."""

        # below rules aren't similar to original rules of pragmatic segmenter
        # modification: spaces replaced with same number of symbols
        # Rubular: http://rubular.com/r/i60hCK81fz
        ThreeConsecutiveRule = Rule(r'\.\.\.(?=\s+[A-Z])', '☏☏.')

        # Rubular: http://rubular.com/r/Hdqpd90owl
        FourConsecutiveRule = Rule(r'(?<=\S)\.{3}(?=\.\s[A-Z])', 'ƪƪƪ')

        # Rubular: http://rubular.com/r/YBG1dIHTRu
        ThreeSpaceRule = Rule(r'(\s\.){3}\s', '♟♟♟♟♟♟♟')

        # Rubular: http://rubular.com/r/2VvZ8wRbd8
        FourSpaceRule = Rule(r'(?<=[a-z])(\.\s){3}\.($|\\n)', '♝♝♝♝♝♝♝')

        # Any remaining run of three dots.
        OtherThreePeriodRule = Rule(r'\.\.\.', 'ƪƪƪ')

        All = [ThreeSpaceRule, FourSpaceRule, FourConsecutiveRule,
               ThreeConsecutiveRule, OtherThreePeriodRule]

    class ReinsertEllipsisRules(object):
        """Rules that turn the ellipsis placeholders back into periods."""
        # below rules aren't similar to original rules of pragmatic segmenter
        # modification: symbols replaced with same number of ellipses
        SubThreeConsecutivePeriod = Rule(r'ƪƪƪ', '...')
        SubThreeSpacePeriod = Rule(r'♟♟♟♟♟♟♟', ' . . . ')
        SubFourSpacePeriod = Rule(r'♝♝♝♝♝♝♝', '. . . .')
        SubTwoConsecutivePeriod = Rule(r'☏☏', '..')
        SubOnePeriod = Rule(r'∮', '.')
        All = [SubThreeConsecutivePeriod, SubThreeSpacePeriod, SubFourSpacePeriod,
               SubTwoConsecutivePeriod, SubOnePeriod]

    class AbbreviationReplacer(AbbreviationReplacer):
        """Abbreviation replacer configured with English sentence starters."""
        SENTENCE_STARTERS = "A Being Did For He How However I In It Millions "\
            "More She That The There They We What When Where Who Why".split(" ")
Beispiel #8
0
class PDF:
    """Newline clean-up rules for PDF text."""

    # A newline that follows "non-newline character + whitespace" and
    # precedes a non-whitespace character is replaced with an empty string.
    # Rubular: http://rubular.com/r/UZAVcwqck8
    NewLineInMiddleOfSentenceRule = Rule(
        r'(?<=[^\n]\s)\n(?=\S)',
        '',
    )

    # A newline directly before a lowercase letter becomes a single space.
    # Rubular: http://rubular.com/r/eaNwGavmdo
    NewLineInMiddleOfSentenceNoSpacesRule = Rule(
        r"\n(?=[a-z])",
        ' ',
    )
Beispiel #9
0
class CleanRules(object):
    """Regex rules and patterns used to clean text.

    Most rules normalise newlines (often to a carriage return), remove
    formatting residue, or insert a missing space between sentences.
    """

    # NOTE: Caution: Might require \\ for special characters
    # if regex is defined with r'' then dont
    # add extra \\ for special characters

    # A newline before one or two letters that are followed by another newline.
    # Rubular: http://rubular.com/r/V57WnM9Zut
    NewLineInMiddleOfWordRule = Rule(r'\n(?=[a-zA-Z]{1,2}\n)', '')

    # Rubular: http://rubular.com/r/dMxp5MixFS
    DoubleNewLineWithSpaceRule = Rule(r'\n \n', "\r")

    # Rubular: http://rubular.com/r/H6HOJeA8bq
    DoubleNewLineRule = Rule(r'\n\n', "\r")

    # A newline before a period that is followed by whitespace.
    # Rubular: http://rubular.com/r/FseyMiiYFT
    NewLineFollowedByPeriodRule = Rule(r'\n(?=\.(\s|\n))', '')

    ReplaceNewlineWithCarriageReturnRule = Rule(r'\n', "\r")

    # Literal two-character sequences (backslash + letter) become the real
    # control character.
    EscapedNewLineRule = Rule(r'\\n', "\n")

    EscapedCarriageReturnRule = Rule(r'\\r', "\r")

    # Same as above, with a space between the backslash and the letter.
    TypoEscapedNewLineRule = Rule(r'\\\ n', "\n")

    TypoEscapedCarriageReturnRule = Rule(r'\\\ r', "\r")

    # Rubular: http://rubular.com/r/bAJrhyLNeZ
    InlineFormattingRule = Rule(r'{b\^&gt;\d*&lt;b\^}|{b\^>\d*<b\^}', '')

    # Four or more dots followed by a number (optionally a dashed range).
    # Rubular: http://rubular.com/r/8mc1ArOIGy
    TableOfContentsRule = Rule(r'\.{4,}\s*\d+-*\d*', "\r")

    # Rubular: http://rubular.com/r/DwNSuZrNtk
    ConsecutivePeriodsRule = Rule(r'\.{5,}', ' ')

    # Rubular: http://rubular.com/r/IQ4TPfsbd8
    ConsecutiveForwardSlashRule = Rule(r'\/{3}', '')

    # A period squeezed between a lowercase and an uppercase letter.
    # Rubular: http://rubular.com/r/6dt98uI76u
    NO_SPACE_BETWEEN_SENTENCES_REGEX = r'(?<=[a-z])\.(?=[A-Z])'
    # NO_SPACE_BETWEEN_SENTENCES_REGEX = r'[a-z]\.[A-Z]'
    NoSpaceBetweenSentencesRule = Rule(NO_SPACE_BETWEEN_SENTENCES_REGEX, '. ')

    # A period squeezed between a digit and an uppercase letter.
    # Rubular: http://rubular.com/r/l6KN6rH5XE
    NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX = r'(?<=\d)\.(?=[A-Z])'
    NoSpaceBetweenSentencesDigitRule = Rule(NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX, '. ')

    URL_EMAIL_KEYWORDS = ['@', 'http', '.com', 'net', 'www', '//']

    # Rubular: http://rubular.com/r/3GiRiP2IbD
    NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX = r'(?<=\s)\n(?=([a-z]|\())'

    # A newline directly before a bullet character.
    # The lookahead previously contained a stray apostrophe ("•'"), so the
    # rule only fired when the bullet was followed by a quote character.
    # Rubular: http://rubular.com/r/Gn18aAnLdZ
    NewLineFollowedByBulletRule = Rule(r"\n(?=•)", "\r")

    # Two single quotes or two backticks become one double quote.
    QuotationsFirstRule = Rule(r"''", '"')
    QuotationsSecondRule = Rule(r'``', '"')
Beispiel #10
0
class Common:
    """Regex patterns and rule groups shared across languages."""

    # Sentence boundary pattern, one alternative per line.
    # The special case "[。..!!?? ]{2,}" handles intermittent dots,
    # exclamation marks, etc.; the final single-character class handles
    # inputs that consist of just one of these symbols.
    SENTENCE_BOUNDARY_REGEX = (
        r"((?:[^)])*)(?=\s?[A-Z])"
        r"|「(?:[^」])*」(?=\s[A-Z])"
        r"|\((?:[^\)]){2,}\)(?=\s[A-Z])"
        r"|\'(?:[^\'])*[^,]\'(?=\s[A-Z])"
        r"|\"(?:[^\"])*[^,]\"(?=\s[A-Z])"
        r"|\“(?:[^\”])*[^,]\”(?=\s[A-Z])"
        r"|[。..!!?? ]{2,}"
        r"|\S.*?[。..!!??ȸȹ☉☈☇☄]"
        r"|[。..!!??]"
    )

    # Rubular: http://rubular.com/r/NqCqv372Ix
    QUOTATION_AT_END_OF_SENTENCE_REGEX = r'[!?\.-][\"\'“”]\s{1}[A-Z]'

    # Rubular: http://rubular.com/r/6flGnUMEVl
    PARENS_BETWEEN_DOUBLE_QUOTES_REGEX = r'["\”]\s\(.*\)\s["\“]'

    # Not ported (Ruby syntax kept for reference):
    # Rubular: http://rubular.com/r/TYzr4qOW1Q
    # BETWEEN_DOUBLE_QUOTES_REGEX = / "(?:[^"])*[^, ]"|“(?: [ ^”])*[^, ]”/

    # Matches only the single whitespace character between a closing quote
    # and the following uppercase letter.
    # Rubular: http://rubular.com/r/JMjlZHAT4g
    SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX = (
        r'(?<=[!?\.-][\"\'“”])\s{1}(?=[A-Z])'
    )

    # Three or more '!' / '?' after a non-space character and before
    # whitespace or the end of the text.
    # Rubular: http://rubular.com/r/mQ8Es9bxtk
    CONTINUOUS_PUNCTUATION_REGEX = r'(?<=\S)(!|\?){3,}(?=(\s|\Z|$))'

    # https://rubular.com/r/UkumQaILKbkeyc
    # https://github.com/diasks2/pragmatic_segmenter/commit/d9ec1a352aff92b91e2e572c30bb9561eb42c703
    NUMBERED_REFERENCE_REGEX = (
        r'(?<=[^\d\s])(\.|∯)((\[(\d{1,3},?\s?-?\s?)*\b\d{1,3}\])+|((\d{1,3}\s?)?\d{1,3}))(\s)(?=[A-Z])'
    )

    # A period directly before "'s".
    # Rubular: http://rubular.com/r/yqa4Rit8EY
    PossessiveAbbreviationRule = Rule(
        r"\.(?='s\s)|\.(?='s$)|\.(?='s\Z)", '∯')

    # The period in "Co. KG".
    # Rubular: http://rubular.com/r/NEv265G2X2
    KommanditgesellschaftRule = Rule(r'(?<=Co)\.(?=\sKG)', '∯')

    # Rubular: http://rubular.com/r/xDkpFZ0EgH
    MULTI_PERIOD_ABBREVIATION_REGEX = r"\b[a-z](?:\.[a-z])+[.]"

    class SingleLetterAbbreviationRules:
        """Replace the period after a single uppercase letter with '∯'."""

        # Single uppercase letter anchored with ``^`` and followed by
        # whitespace.
        # Rubular: http://rubular.com/r/e3H6kwnr6H
        SingleUpperCaseLetterAtStartOfLineRule = Rule(
            r"(?<=^[A-Z])\.(?=\s)", '∯')

        # Single uppercase letter after whitespace, followed by an optional
        # comma and whitespace.
        # Rubular: http://rubular.com/r/gitvf0YWH4
        SingleUpperCaseLetterRule = Rule(r"(?<=\s[A-Z])\.(?=,?\s)", '∯')

        All = [
            SingleUpperCaseLetterAtStartOfLineRule,
            SingleUpperCaseLetterRule,
        ]

    class AmPmRules:
        """Turn the '∯' after A∯M / P∯M back into a period before a capital."""

        # Rubular: http://rubular.com/r/Vnx3m4Spc8
        UpperCasePmRule = Rule(r'(?<= P∯M)∯(?=\s[A-Z])', '.')

        # Rubular: http://rubular.com/r/AJMCotJVbW
        UpperCaseAmRule = Rule(r'(?<=A∯M)∯(?=\s[A-Z])', '.')

        # Rubular: http://rubular.com/r/13q7SnOhgA
        LowerCasePmRule = Rule(r'(?<=p∯m)∯(?=\s[A-Z])', '.')

        # Rubular: http://rubular.com/r/DgUDq4mLz5
        LowerCaseAmRule = Rule(r'(?<=a∯m)∯(?=\s[A-Z])', '.')

        All = [
            UpperCasePmRule,
            UpperCaseAmRule,
            LowerCasePmRule,
            LowerCaseAmRule,
        ]

    class Numbers:
        """Replace periods adjacent to digits with the '∯' placeholder."""

        # A period directly followed by a digit.
        # Rubular: http://rubular.com/r/oNyxBOqbyy
        PeriodBeforeNumberRule = Rule(r'\.(?=\d)', '∯')

        # A period between a digit and any non-whitespace character.
        # Rubular: http://rubular.com/r/EMk5MpiUzt
        NumberAfterPeriodBeforeLetterRule = Rule(r'(?<=\d)\.(?=\S)', '∯')

        # A period after "\r" + one digit.
        # Rubular: http://rubular.com/r/rf4l1HjtjG
        NewLineNumberPeriodSpaceLetterRule = Rule(
            r'(?<=\r\d)\.(?=(\s\S)|\))', '∯')

        # A period after a single digit anchored with ``^``.
        # Rubular: http://rubular.com/r/HPa4sdc6b9
        StartLineNumberPeriodRule = Rule(r'(?<=^\d)\.(?=(\s\S)|\))', '∯')

        # A period after two digits anchored with ``^``.
        # Rubular: http://rubular.com/r/NuvWnKleFl
        StartLineTwoDigitNumberPeriodRule = Rule(
            r'(?<=^\d\d)\.(?=(\s\S)|\))', '∯')

        All = [
            PeriodBeforeNumberRule,
            NumberAfterPeriodBeforeLetterRule,
            NewLineNumberPeriodSpaceLetterRule,
            StartLineNumberPeriodRule,
            StartLineTwoDigitNumberPeriodRule,
        ]
 def remove_newline_in_middle_of_word(self):
     """Apply a rule to ``self.text`` that replaces a newline with ''.

     The rule targets a newline that directly follows 'の' and is followed
     by a non-whitespace character. The result is stored back on
     ``self.text``.
     """
     newline_after_no_rule = Rule(r'(?<=の)\n(?=\S)', '')
     wrapped_text = Text(self.text)
     self.text = wrapped_text.apply(newline_after_no_rule)
Beispiel #12
0
class ListItemReplacer(object):
    """Mark list items in a text and insert carriage-return breaks before them.

    Handles numbered lists written with periods ("1.") or parentheses
    ("1)"), plus alphabetical and roman-numeral lists. The methods operate
    on ``self.text``; most of them update it in place.
    """

    # Each numeral must appear exactly once: adjacency of two list markers
    # is decided with ``list.index``, which returns the first occurrence.
    # The list previously repeated "x xi xii xiii" after "xiv", which pushed
    # "xv".."xx" four positions away from "xiv".
    ROMAN_NUMERALS = "i ii iii iv v vi vii viii ix x xi xii xiii xiv xv xvi xvii xviii xix xx".split(' ')
    LATIN_NUMERALS = list(string.ascii_lowercase)

    # Rubular: http://rubular.com/r/XcpaJKH0sz
    ALPHABETICAL_LIST_WITH_PERIODS = r'(?<=^)[a-z](?=\.)|(?<=\A)[a-z](?=\.)|(?<=\s)[a-z](?=\.)'

    # Rubular: http://rubular.com/r/Gu5rQapywf
    # TODO: Make sure below regex call is case-insensitive
    ALPHABETICAL_LIST_WITH_PARENS = r'(?<=\()[a-z]+(?=\))|(?<=^)[a-z]+(?=\))|(?<=\A)[a-z]+(?=\))|(?<=\s)[a-z]+(?=\))'

    # (pattern, replacement)
    SubstituteListPeriodRule = Rule('♨', '∯')
    ListMarkerRule = Rule('☝', '')

    # Rubular: http://rubular.com/r/Wv4qLdoPx7
    # https://regex101.com/r/62YBlv/1
    SpaceBetweenListItemsFirstRule = Rule(r'(?<=\S\S)\s(?=\S\s*\d+♨)', "\r")

    # Rubular: http://rubular.com/r/AizHXC6HxK
    # https://regex101.com/r/62YBlv/2
    SpaceBetweenListItemsSecondRule = Rule(r'(?<=\S\S)\s(?=\d{1,2}♨)', "\r")

    # Rubular: http://rubular.com/r/GE5q6yID2j
    # https://regex101.com/r/62YBlv/3
    SpaceBetweenListItemsThirdRule = Rule(r'(?<=\S\S)\s(?=\d{1,2}☝)', "\r")

    # Finds the numbers of "N." list markers (the period itself is only
    # looked ahead for). One alternative per line.
    NUMBERED_LIST_REGEX_1 = (
        r'\s\d{1,2}(?=\.\s)'
        r'|^\d{1,2}(?=\.\s)'
        r'|\s\d{1,2}(?=\.\))'
        r'|^\d{1,2}(?=\.\))'
        r'|(?<=\s\-)\d{1,2}(?=\.\s)'
        r'|(?<=^\-)\d{1,2}(?=\.\s)'
        r'|(?<=\s\⁃)\d{1,2}(?=\.\s)'
        r'|(?<=^\⁃)\d{1,2}(?=\.\s)'
        # This lookbehind was "(?<=s\-)" (a literal letter "s"); it now
        # matches the corresponding alternative of NUMBERED_LIST_REGEX_2.
        r'|(?<=\s\-)\d{1,2}(?=\.\))'
        r'|(?<=^\-)\d{1,2}(?=\.\))'
        r'|(?<=\s\⁃)\d{1,2}(?=\.\))'
        r'|(?<=^\⁃)\d{1,2}(?=\.\))'
    )
    # 1. abcd
    # 2. xyz
    # Same markers as above, but the period is part of the match.
    NUMBERED_LIST_REGEX_2 = (
        r'(?<=\s)\d{1,2}\.(?=\s)'
        r'|^\d{1,2}\.(?=\s)'
        r'|(?<=\s)\d{1,2}\.(?=\))'
        r'|^\d{1,2}\.(?=\))'
        r'|(?<=\s\-)\d{1,2}\.(?=\s)'
        r'|(?<=^\-)\d{1,2}\.(?=\s)'
        r'|(?<=\s\⁃)\d{1,2}\.(?=\s)'
        r'|(?<=^\⁃)\d{1,2}\.(?=\s)'
        r'|(?<=\s\-)\d{1,2}\.(?=\))'
        r'|(?<=^\-)\d{1,2}\.(?=\))'
        r'|(?<=\s\⁃)\d{1,2}\.(?=\))'
        r'|(?<=^\⁃)\d{1,2}\.(?=\))'
    )
    # 1) abcd
    # 2) xyz
    NUMBERED_LIST_PARENS_REGEX = r'\d{1,2}(?=\)\s)'

    # Rubular: http://rubular.com/r/NsNFSqrNvJ
    # TODO: Make sure below regex call is case-insensitive
    EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX = r'\([a-z]+(?=\))|(?<=^)[a-z]+(?=\))|(?<=\A)[a-z]+(?=\))|(?<=\s)[a-z]+(?=\))'

    # Rubular: http://rubular.com/r/wMpnVedEIb
    # TODO: Make sure below regex call is case-insensitive
    ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX = r'(?<=^)[a-z]\.|(?<=\A)[a-z]\.|(?<=\s)[a-z]\.'

    # Rubular: http://rubular.com/r/GcnmQt4a3I
    ROMAN_NUMERALS_IN_PARENTHESES = r'\(((?=[mdclxvi])m*(c[md]|d?c*)(x[cl]|l?x*)(i[xv]|v?i*))\)(?=\s[A-Z])'

    def __init__(self, text):
        """Store the text to process."""
        self.text = text

    def add_line_break(self):
        """Run every list-formatting pass in order and return ``self.text``."""
        self.format_alphabetical_lists()
        self.format_roman_numeral_lists()
        self.format_numbered_list_with_periods()
        self.format_numbered_list_with_parens()
        return self.text

    def replace_parens(self):
        """Return the text with parenthesised roman numerals wrapped in placeholders.

        Only numerals followed by whitespace and an uppercase letter match.
        ``self.text`` itself is not modified.
        """
        text = re.sub(self.ROMAN_NUMERALS_IN_PARENTHESES,
                      r'&✂&\1&⌬&', self.text)
        return text

    def format_numbered_list_with_parens(self):
        """Mark "N)" list items, add line breaks, then strip the '☝' marker."""
        self.replace_parens_in_numbered_list()
        self.add_line_breaks_for_numbered_list_with_parens()
        self.text = Text(self.text).apply(self.ListMarkerRule)

    def replace_periods_in_numbered_list(self):
        """Replace the period of consecutive "N." list markers with '♨'."""
        self.scan_lists(self.NUMBERED_LIST_REGEX_1, self.NUMBERED_LIST_REGEX_2,
                        '♨', strip=True)

    def format_numbered_list_with_periods(self):
        """Mark "N." list items, add line breaks, then turn '♨' into '∯'."""
        self.replace_periods_in_numbered_list()
        self.add_line_breaks_for_numbered_list_with_periods()
        self.text = Text(self.text).apply(self.SubstituteListPeriodRule)

    def format_alphabetical_lists(self):
        """Format latin-letter lists ("a." and "a)" styles).

        The helpers already update ``self.text``; the result is additionally
        stored on ``self.txt`` and returned.
        """
        self.txt = self.add_line_breaks_for_alphabetical_list_with_periods(
            roman_numeral=False)
        self.txt = self.add_line_breaks_for_alphabetical_list_with_parens(
            roman_numeral=False)
        return self.txt

    def format_roman_numeral_lists(self):
        """Format roman-numeral lists ("i." and "i)" styles).

        The helpers already update ``self.text``; the result is additionally
        stored on ``self.txt`` and returned.
        """
        self.txt = self.add_line_breaks_for_alphabetical_list_with_periods(
            roman_numeral=True)
        self.txt = self.add_line_breaks_for_alphabetical_list_with_parens(
            roman_numeral=True)
        return self.txt

    def add_line_breaks_for_alphabetical_list_with_periods(
            self, roman_numeral=False):
        """Process "x." style markers; returns the updated ``self.text``."""
        txt = self.iterate_alphabet_array(
            self.ALPHABETICAL_LIST_WITH_PERIODS,
            roman_numeral=roman_numeral)
        return txt

    def add_line_breaks_for_alphabetical_list_with_parens(self, roman_numeral=False):
        """Process "x)" style markers; returns the updated ``self.text``."""
        txt = self.iterate_alphabet_array(
            self.ALPHABETICAL_LIST_WITH_PARENS,
            parens=True,
            roman_numeral=roman_numeral)
        return txt

    def scan_lists(self, regex1, regex2, replacement, strip=False):
        """Mark numbers that look like consecutive list items.

        ``regex1`` collects the candidate numbers from ``self.text``. A
        number is substituted (via ``regex2``) when the next candidate is
        one higher, or when the previous candidate is one lower. The 0/9
        pairs are also accepted as neighbours.
        """
        list_array = re.findall(regex1, self.text)
        list_array = list(map(int, list_array))
        for ind, item in enumerate(list_array):
            # to avoid IndexError
            # ruby returns nil if index is out of range
            if (ind < len(list_array) - 1 and item + 1 == list_array[ind + 1]):
                self.substitute_found_list_items(regex2, item, strip, replacement)
            elif ind > 0:
                if (((item - 1) == list_array[ind - 1]) or
                    ((item == 0) and (list_array[ind - 1] == 9)) or
                    ((item == 9) and (list_array[ind - 1] == 0))):
                    self.substitute_found_list_items(regex2, item, strip, replacement)

    def substitute_found_list_items(self, regex, each, strip, replacement):
        """Replace every ``regex`` match whose number equals ``each``.

        A matching marker becomes ``"<each><replacement>"``; any other match
        is returned unchanged (whitespace-stripped when ``strip`` is true).
        """

        def replace_item(match):
            found = match.group()
            if strip:
                found = found.strip()
            # Drop trailing list punctuation before comparing the number.
            chomped = found if len(found) == 1 else found.strip('.])')
            if str(each) == chomped:
                return "{}{}".format(each, replacement)
            return found

        self.text = re.sub(regex, replace_item, self.text)

    def add_line_breaks_for_numbered_list_with_periods(self):
        """Insert breaks between '♨'-marked items.

        Skipped when two markers are already separated by a newline or
        carriage return, or when the text contains "for N♨ <lowercase>".
        """
        if ('♨' in self.text) and (not re.search(
                '♨.+(\n|\r).+♨', self.text)) and (not re.search(
                    r'for\s\d{1,2}♨\s[a-z]', self.text)):
            self.text = Text(self.text).apply(self.SpaceBetweenListItemsFirstRule,
                                    self.SpaceBetweenListItemsSecondRule)

    def replace_parens_in_numbered_list(self):
        """Mark consecutive "N)" list numbers with '☝'.

        The scan runs twice. Numbers marked in the first pass no longer
        match (the marker sits between the number and the parenthesis), so
        the second pass only considers the numbers that remain.
        """
        self.scan_lists(
            self.NUMBERED_LIST_PARENS_REGEX, self.NUMBERED_LIST_PARENS_REGEX, '☝')
        self.scan_lists(self.NUMBERED_LIST_PARENS_REGEX, self.NUMBERED_LIST_PARENS_REGEX, '☝')

    def add_line_breaks_for_numbered_list_with_parens(self):
        """Insert breaks between '☝'-marked items unless already separated."""
        if '☝' in self.text and not re.search("☝.+\n.+☝|☝.+\r.+☝", self.text):
            self.text = Text(self.text).apply(
                self.SpaceBetweenListItemsThirdRule)

    def replace_alphabet_list(self, a):
        """Put a carriage return before each "a." marker and mask its period.

        Input:  'a. ffegnog b. fgegkl c.'
        Output: '<CR>a∯ ffegnog b. fgegkl c.' for ``a='a'`` (only markers
        equal to ``a`` are rewritten; the comparison is case-sensitive
        although matching is case-insensitive).

        Returns the new text; ``self.text`` is not modified here.
        """

        def replace_letter_period(match, val=None):
            match = match.group()
            match_wo_period = match.strip('.')
            if match_wo_period == val:
                return '\r{}∯'.format(match_wo_period)
            else:
                return match

        txt = re.sub(self.ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX,
                     partial(replace_letter_period, val=a),
                     self.text, flags=re.IGNORECASE)
        return txt

    def replace_alphabet_list_parens(self, a):
        """Put a carriage return before each "a)" / "(a)" marker equal to ``a``.

        Input:  'a) ffegnog (b) fgegkl c)'
        For a marker written "(b)", the opening parenthesis is replaced by
        the '&✂&' placeholder. Returns the new text; ``self.text`` is not
        modified here.
        """

        def replace_alphabet_paren(match, val=None):
            match = match.group()
            if '(' in match:
                match_wo_paren = match.strip('(')
                if match_wo_paren == val:
                    return '\r&✂&{}'.format(match_wo_paren)
                else:
                    return match
            else:
                if match == val:
                    return '\r{}'.format(match)
                else:
                    return match

        # Make it cases-insensitive
        txt = re.sub(self.EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX,
                     partial(replace_alphabet_paren, val=a),
                     self.text, flags=re.IGNORECASE)
        return txt

    def replace_correct_alphabet_list(self, a, parens):
        """Dispatch to the parenthesis or period variant for marker ``a``."""
        if parens:
            a = self.replace_alphabet_list_parens(a)
        else:
            a = self.replace_alphabet_list(a)
        return a

    def last_array_item_replacement(self, a, i, alphabet, list_array, parens):
        """Handle the last found marker: rewrite it only if the previous
        marker is its direct neighbour in ``alphabet``.

        Returns the (possibly unchanged) text.
        """
        if (not alphabet and not list_array) or (
                list_array[i - 1] not in alphabet) or (a not in alphabet):
            return self.text
        if abs(alphabet.index(list_array[i - 1]) - alphabet.index(a)) != 1:
            return self.text
        result = self.replace_correct_alphabet_list(a, parens)
        return result

    def other_items_replacement(self, a, i, alphabet, list_array, parens):
        """Handle a non-last marker: rewrite it if the next marker follows it
        in ``alphabet`` or the previous marker is its direct neighbour.

        For ``i == 0`` the "previous" marker is ``list_array[-1]``.
        Returns the (possibly unchanged) text.
        """
        if (not alphabet and not list_array) or (
                list_array[i - 1] not in alphabet) or (a not in alphabet) or (
                    list_array[i + 1] not in alphabet):
            return self.text
        if alphabet.index(list_array[i + 1]) - alphabet.index(a) != 1 and \
                abs(alphabet.index(list_array[i - 1]) - alphabet.index(a)) != 1:
            return self.text
        result = self.replace_correct_alphabet_list(a, parens)
        return result

    def iterate_alphabet_array(self, regex, parens=False, roman_numeral=False):
        """Find markers with ``regex`` and rewrite those forming a sequence.

        Only markers present in the chosen alphabet (roman or latin) are
        considered. ``self.text`` is updated after every marker and returned.
        """
        list_array = re.findall(regex, self.text)
        alphabet = self.ROMAN_NUMERALS if roman_numeral else self.LATIN_NUMERALS
        list_array = [i for i in list_array if i in alphabet]
        for ind, each in enumerate(list_array):
            if ind == len(list_array) - 1:
                self.text = self.last_array_item_replacement(each, ind, alphabet, list_array, parens)
            else:
                self.text = self.other_items_replacement(
                    each, ind, alphabet, list_array, parens)
        return self.text