Beispiel #1
0
def misc_fixes(filepath):
    """Apply assorted cleanups to one .po file, saving it only when changed.

    Per-unit fixes:
    - replace the rendering 'immersion' with 'samādhi' in msgid strings;
    - wrap the first "Evaṃ me sutaṃ" unit in a <span class="evam"> comment pair;
    - rename '-pi'/'pi-' language-code fragments to 'pli' in automatic comments.
    """
    if filepath.stem == 'info':
        return  # the info file carries no translatable units
    po = pofile(filepath.open('r'))
    changed = False
    # Track across ALL units so the evam span is only ever inserted once per
    # file (previously this flag was reset on every iteration, so the
    # "not evam_found" guard could never suppress a second insertion).
    evam_found = False
    for i, unit in enumerate(po.units):
        # Fix immersion
        for j, msgid in enumerate(unit.msgid):
            if 'immersion' in msgid:
                unit.msgid[j] = msgid.replace('immersion', 'samādhi')
                changed = True
        # fix evam
        if not evam_found and any(
                msgid.startswith('"Evaṃ me sutaṃ') for msgid in unit.msgid):
            evam_found = True
            if 'evam' not in ''.join(unit.automaticcomments):
                # automaticcomments is used as a list of strings elsewhere in
                # this function (enumerate/insert/indexing), so append the
                # opening tag as one element; `list += str` would splice in
                # the string character by character.
                unit.automaticcomments.append('#. <span class="evam">\n')
                po.units[i + 1].automaticcomments.insert(0, '#. </span>\n')
                changed = True
        for j, comment in enumerate(unit.automaticcomments):
            new_comment, n = regex.subn(r'-pi([^a-z])', r'-pli\1', comment)
            if n:
                unit.automaticcomments[j] = new_comment
                changed = True
            new_comment, n = regex.subn(r'\bpi-', 'pli-', new_comment)
            if n:
                unit.automaticcomments[j] = new_comment
                changed = True

    if changed:
        po.save()
Beispiel #2
0
def link_manuals():
    """Insert links to the user/developer manuals (with their section
    sub-menus) into the nav of docs/index.html and relativize its
    doc-local URLs."""
    index_path = Path("docs/index.html")
    html = index_path.read_text()
    manual_nav = "".join((
        '<li><a href="user_manual/index.html">User manual</a></li>',
        "<ul>",
        '<li><a href="user_manual/index.html#how-to-read-this-manual">How to read this manual</a></li>',
        '<li><a href="user_manual/index.html#preparing-your-program-collection">Preparing your program collection</a></li>',
        '<li><a href="user_manual/index.html#taxonomy">Taxonomy</a></li>',
        '<li><a href="user_manual/index.html#pipeline-tutorial">Pipeline tutorial</a></li>',
        '<li><a href="user_manual/index.html#pipeline-documentation">Pipeline documentation</a></li>',
        '<li><a href="user_manual/index.html#glossary">Glossary</a></li>',
        "</ul>",
        '<li><a href="developer_manual/index.html">Developer manual</a></li>',
        "<ul>",
        '<li><a href="developer_manual/index.html#bird-view">Bird view</a></li>',
        '<li><a href="developer_manual/index.html#helper-programs">Helper programs</a></li>',
        '<li><a href="developer_manual/index.html#tag-databases">Tag databases</a></li>',
        '<li><a href="developer_manual/index.html#implementation-notes">Implementation notes</a></li>',
        "</ul>",
    ))
    # Prepend the manual entries just before the "About" item (kept via \1).
    (html, hits) = regex.subn(
        r'(<li><a href="#about">About</a><ul>)',
        manual_nav + r"\1",
        html,
    )
    assert hits == 1
    # The page lives inside docs/: strip the now-redundant "docs/" prefix.
    (html, hits) = regex.subn(r'\b(src|href)="docs/', r'\1="', html)
    assert hits == 2
    index_path.write_text(html)
Beispiel #3
0
def remove_citations(text):
    """Collapse every span matched by the module-level ``citation_re`` into a
    single ';', re-running the substitution until a pass makes no further
    replacement (so citations revealed by earlier removals are caught too).

    Returns the cleaned text.
    """
    n_subs_made = 1
    while n_subs_made > 0:
        # One pass per iteration suffices: the loop condition already
        # re-applies the pattern until it stops matching, so the former
        # duplicated second subn call was redundant.
        text, n_subs_made = regex.subn(citation_re, ';', text)
    return text
Beispiel #4
0
async def doit(chat, match):
    """Handle an s/find/replace/flags command.

    *match* groups: 1 = search pattern, 2 = replacement, 3 = optional flag
    string (``i`` → case-insensitive, ``g`` → replace all occurrences).
    The substitution is applied to the replied-to message when there is one,
    otherwise to the most recent matching message remembered for this chat.
    """
    fr = match.group(1)
    to = match.group(2)
    to = to.replace('\\/', '/')
    try:
        fl = match.group(3)
        if fl is None:  # was `== None`; identity check is the correct idiom
            fl = ''
        fl = fl[1:]  # drop the leading separator character
    except IndexError:
        fl = ''

    # Build Python regex flags
    count = 1
    flags = 0
    for f in fl:
        if f == 'i':
            flags |= re.IGNORECASE
        elif f == 'g':
            count = 0  # count=0 means "replace every occurrence"
        else:
            await chat.reply('unknown flag: {}'.format(f))
            return

    # Handle replies
    if 'reply_to_message' in chat.message:
        # Try to find the original message text
        message = chat.message['reply_to_message']
        original = find_original(message)
        if not original:
            return

        # Substitute the text
        try:
            # Honour the parsed count/flags here too (they were previously
            # silently ignored for the reply branch).
            s, i = re.subn(fr, to, original, count=count, flags=flags)
            if i > 0:
                return (await Chat.from_message(bot,
                                                message).reply(s))['result']
        except Exception as e:
            await chat.reply('u dun goofed m8: ' + str(e))
            return

    # Try matching the last few messages
    global last_msgs
    if chat.id not in last_msgs:
        return

    for msg in reversed(last_msgs[chat.id]):
        try:
            original = find_original(msg)
            if not original:
                continue

            s, i = re.subn(fr, to, original, count=count, flags=flags)
            if i > 0:
                return (await Chat.from_message(bot, msg).reply(s))['result']
        except Exception as e:
            await chat.reply('u dun goofed m8: ' + str(e))
            return
Beispiel #5
0
 def beta_code(self, text):
     """Convert a beta-code string by applying the instance's three
     substitution pattern lists in order.  regex.subn() returns a tuple
     (new_string, number_of_subs_made); only the string part is kept.
     """
     text = text.replace('-', '')
     # pattern3 holds the punctuation run; applied last, as before.
     for pattern_list in (self.pattern1, self.pattern2, self.pattern3):
         for pattern, repl in pattern_list:
             text = regex.subn(pattern, repl, text)[0]
     return text
Beispiel #6
0
 def beta_code(self, text):
     """Apply the pattern1, pattern2 and pattern3 substitution tables to
     *text*, in that order, after stripping hyphens.  Note: regex.subn()
     returns a (new_string, number_of_subs_made) tuple; the count is
     discarded here.
     """
     result = text.replace('-', '')
     combined = list(self.pattern1) + list(self.pattern2) + list(self.pattern3)
     for (pattern, repl) in combined:
         result, _ = regex.subn(pattern, repl, result)
     return result
def update_readme_example():
    """Refresh the tagged fibonacci example table and the reported version
    number inside README.md."""
    source = Path("docs/resources/fibonacci.py").read_text().strip()
    readme_path = Path("README.md")
    contents = readme_path.read_text()
    # Replace the taxon table (up to the first blank line) with a freshly
    # tagged rendering of the example program.
    (contents, hits) = regex.subn(
        r"(?sm)^\| Taxon \| Lines \|.+?(?=\n\n)",
        tag_program(f"# {source}"),
        contents,
        count=1,
    )
    assert hits == 1
    (contents, hits) = regex.subn(r"(?<=paroxython )\S+(?= loaded)", VERSION, contents)
    assert hits == 1
    readme_path.write_text(contents)
Beispiel #8
0
def name(name):
    """Display a board; whitespace in the requested name is first
    canonicalised to underscores via a redirect."""
    normalized, n_changes = re.subn(r'\s', '_', name.strip())
    if n_changes:
        # The URL contained whitespace: bounce to the canonical spelling.
        return redirect(url_for('.name', name=normalized))

    board = Board.q.filter(Board.name == normalized).one_or_none()
    return render_template('boards_view.html', board=board)
Beispiel #9
0
    def substitute(m):
        """Apply the closed-over regex substitution to *m*'s raw text;
        return the new text, or None when there is nothing to do."""
        if not m.raw_text:
            return None

        new_text, hits = re.subn(fr, to, m.raw_text, count=count, flags=flags)
        if hits > 0:
            return new_text
Beispiel #10
0
 async def substitute(original, msg):
     """Run the closed-over substitution on *original* and, when at least
     one replacement occurred, post the result as a reply to *msg*;
     any failure is reported back to the chat."""
     try:
         replaced, hits = re.subn(fr, to, original, count=count, flags=flags)
         if hits:
             return (await Chat.from_message(bot, msg).reply(replaced))['result']
     except Exception as e:
         await chat.reply('u dun goofed m8: ' + str(e))
Beispiel #11
0
def FixError_GetProcAddress(line):
    """Strip a _T() wrapper from the last argument of a GetProcAddress
    call, e.g. GetProcAddress(h, _T("Foo")) -> GetProcAddress(h, "Foo").
    Lines without such a call are returned unchanged."""
    fixed, n_subs = re.subn(r'GetProcAddress\((.+,[ ])*_T\((.+?)\)\)',
                            r'GetProcAddress(\1\2)', line)
    return fixed if n_subs > 0 else line
    def subn(pattern, repl, string, count=0, flags=0, pos=None, endpos=None, concurrent=None, **kwargs):
        """Wrapper for subn."""

        # Pre-compile the search pattern (honouring *flags*) and the
        # replacement template against it, then delegate to regex.subn with
        # every remaining argument passed through positionally unchanged.
        pattern = compile_search(pattern, flags)
        return regex.subn(
            pattern, compile_replace(pattern, repl), string, count, flags, pos, endpos, concurrent, **kwargs
        )
Beispiel #13
0
def combiningStrip(text):
    """
    From a string, remove combining diacritics and modifiers.
    
    Parameters:
        text : string
    
    Requires regex module as re
    
    Return string with combining characters removed
    """

    assert type(text) is str

    # Each Unicode block must be its own alternative in the pattern.  The
    # original list was missing commas after the last two entries, which
    # silently concatenated the Extended and Supplement blocks into a single
    # alternative that could only match when two such marks were adjacent.
    unicodeBlockList = [
        r'\p{InCombining_Diacritical_Marks_for_Symbols}',
        r'\p{InSuperscripts_and_Subscripts}',
        r'\p{InCombining_Diacritical_Marks}',
        r'\p{InSpacing_Modifier_Letters}',
        r'\p{InCombining_Diacritical_Marks_Extended}',
        r'\p{InCombining_Diacritical_Marks_Supplement}',
    ]

    additionalChars = [r'ᴸ', r'ᵇ', r':', r'<', r'←', r'=', r"'", r"‚"]

    # One big alternation: any block character or listed extra is deleted.
    pattern = r'(' + r'|'.join(unicodeBlockList + additionalChars) + r')'
    pattern = re.compile(pattern)
    result = re.subn(pattern, '', text)

    return result[0]
Beispiel #14
0
def FixError_Overlap_T(line):
    """Collapse a doubled _T(_T("...")) wrapper down to the inner
    _T("..."); lines without the doubled wrapper come back unchanged."""
    fixed, n_subs = re.subn(r'_T\([ ]*(_T\([ ]*".*?"[ ]*\))[ ]*\)', r'\1', line)
    return fixed if n_subs > 0 else line
Beispiel #15
0
    def subn(pattern, repl, string, count=0, flags=0, pos=None, endpos=None, concurrent=None, **kwargs):
        """Wrapper for subn."""

        # Compile the search pattern with *flags*, compile the replacement
        # template against it, and delegate everything else to regex.subn
        # with the arguments forwarded positionally, unchanged.
        pattern = compile_search(pattern, flags)
        return regex.subn(
            pattern, compile_replace(pattern, repl), string,
            count, flags, pos, endpos, concurrent, **kwargs
        )
    def _normalize( cleanTxt, dirtyTxt ):
        """Lower-case, ASCII-fold and regex-clean the two corpora; return a
        dict with keys "clean" and "dirty" holding the final texts.

        NOTE(review): the regex tasks below are chained through LATE-BINDING
        closures.  Each task's ``lambda: corpus`` reads the local ``corpus``
        that the *previous* iteration of the final for-loop bound via tuple
        unpacking, so every substitution runs on the output of the one
        before it.  The chain is (re)started explicitly with
        ``lambda: cleanCorpus`` for the clean text (first task) and
        ``lambda: dirtyCorpus`` when the dirty tasks begin.  Fragile but
        deliberate — do not reorder the task list.
        """
        cleanCorpus = unicodedata.normalize( 'NFKD', cleanTxt.lower( ) ).encode( 'ascii', 'ignore' )
        cleanCorpus = cleanCorpus.decode( 'ascii' )
        dirtyCorpus = dirtyTxt.lower( )

        cleanCorpus = cleanCorpus.replace( '_', ' ' ).replace( '|', '' ).replace( '&c.', ' ' ). \
            replace( '`', ' ' ).replace( '@', ' ' ).replace( '(', '' ).replace( ')', '' ).replace( '"', '' ). \
            replace( "''", "" )

        dirtyCorpus = dirtyCorpus.replace( '&amp;c.', ' ' ). \
            replace( '&amp;', ' ' ).replace( '&lt', ' ' ). \
            replace( '&gt', ' ' ).replace( r"\\", " " ).replace( '/', ' ' ).replace( '(', '' ).replace( ')',
                                                                                                        '' ).replace(
            '"', '' )

        # Tuples are (pattern, replacement, thunk-returning-input, corpus tag).
        regexTasks = [
            (r'(\w)\^(\w)', r'\1\2', lambda: cleanCorpus, "clean"),

            (r'([^-+])--([^-+])', r'\1 \2', lambda: corpus, "clean" ),

            (r'(\w)- (\w)', r'\1-\2', lambda: corpus, "clean" ),

            (r'\+-+?|-+\+|(-\s?){2,}', r' ', lambda: corpus, "clean" ),

            (r'\[[=)\']?(\w)[.]?\]', r'\1', lambda: corpus, "clean"),

            (r'[\]\[\}\{]', r' ', lambda: corpus, "clean"),

            (r'(\s?\.\s*){2,}', r' ', lambda: corpus, "clean"),

            (r'\s+', r' ', lambda: corpus, "clean"),

            (r'\n{4}.+\n{4}(?:.+\n{4})?', r' ', lambda: dirtyCorpus, "dirty"),

            (r'(\w)-\s{2,}(\w)', r'\1\2', lambda: corpus, "dirty" ),

            (r'(\s?\.\s*){2,}', r' ', lambda: corpus, "dirty"),

            (r'-{3,}', r' ', lambda: corpus, "dirty"),

            (r'\s+', r' ', lambda: corpus, "dirty" ),

        ]

        updated_corpora = dict( )
        taskNum = 0

        # The inner map calls each task's thunk T[2]() lazily, one task per
        # loop iteration, so it observes the `corpus` bound by the previous
        # iteration's unpacking.  updated_corpora[Type] is overwritten every
        # iteration and therefore ends up holding each chain's final result.
        for ((corpus, n), Type) in map( lambda argLst: (regex.subn( *argLst[ 0 ] ), argLst[ 1 ]),
                                        map( lambda T: [ (T[ 0 ], T[ 1 ], T[ 2 ]( )), T[ 3 ] ], regexTasks ) ):
            updated_corpora[ Type ] = corpus

            if _DEBUG:
                print(
                    "substituted {0} of {1} in {2} corpus".format( n, regexTasks[ taskNum ][ 0 ], Type ) )
                taskNum += 1
                sys.stdout.flush( )

        return updated_corpora
Beispiel #17
0
def in_memory(string: str) -> str:
    """
    Determine how string would appear in memory by removing opening/closing quotes and resolving
    escaped characters.
    """
    # Each resolved escape collapses to the single placeholder 'x':
    # first literal backslashes, then escaped quotes, then \xNN sequences.
    body = string[1:-1].replace("\\\\", "x").replace('\\"', "x")
    return re.subn("\\\\x..", "x", body)[0]
Beispiel #18
0
def inject_flow_diagram_in_nav():
    """Append the flow diagram image at the end of the developer manual's
    nav element."""
    path = Path("docs/developer_manual/index.html")
    html = path.read_text()
    (html, hits) = regex.subn(
        r"(</nav>)",
        r'<p><img alt="" src="../resources/flow.png"></p>\1',
        html)
    assert hits == 1
    path.write_text(html)
Beispiel #19
0
def update_version_number():
    """Pin the paroxython GitHub blob links of the CLI modules to the
    current VERSION."""
    for filename in ["paroxython/cli_tag.py", "paroxython/cli_collect.py"]:
        module_path = Path(filename)
        text = module_path.read_text()
        (text, hits) = regex.subn(
            r"(?<=https://github\.com/laowantong/paroxython/blob/)[^/]+",
            VERSION, text)
        assert hits == 1, module_path
        module_path.write_text(text)
Beispiel #20
0
 def actually_doit(original):
     """Run the closed-over substitution against *original*'s message
     (minus any HEADER prefix).  Returns (original, new_text) on success,
     (None, error message) on failure, (None, None) when nothing matched."""
     try:
         text = original.message
         if text.startswith(HEADER):
             text = text[len(HEADER):]
         text, hits = regex.subn(fr, to, text, count=count, flags=flags)
         if hits > 0:
             return original, text
     except Exception as e:
         return None, f"u dun goofed m8: {str(e)}"
     return None, None
Beispiel #21
0
def inject_taxonomy():
    """Inject the Google charts loader and the taxonomy tree script into the
    <head> of the user manual page."""
    index_path = Path("docs/user_manual/index.html")
    text = index_path.read_text()
    tree = Path("docs/resources/tree.js").read_text()
    head = f"""
        <script type="text/javascript" src="https://www.gstatic.com/charts/loader.js"></script>
        <script type="text/javascript">{tree}</script>
    """
    # Use a callable replacement: the injected JS may contain backslashes,
    # which a template string replacement would misinterpret as escape
    # sequences (or reject outright).
    (text, n) = regex.subn("</head>", lambda m: f"{head}</head>", text)
    assert n == 1
    index_path.write_text(text)
Beispiel #22
0
def update_github_links():
    """Recompute the L<start>-L<stop> line ranges of the extract links in
    the pipeline documentation from the marker comments in the test file."""
    count = 2
    source = Path("tests/test_recommend_programs.py").read_text()
    path = Path("docs/md/pipeline_documentation.md")
    text = path.read_text()
    # Blank out all existing ranges first, then refill them one by one.
    (text, hits) = regex.subn(r"test_recommend_programs.py#L\d+-L\d+",
                              "test_recommend_programs.py#L-L", text)
    assert hits == count
    for extract_index in range(1, count + 1):
        # Line numbers are recovered from the "# extract_i (start/stop)"
        # marker comments inside the test source.
        start = source.partition(f"# extract_{extract_index} (start)")[0].count("\n") + 2
        stop = source.partition(f"# extract_{extract_index} (stop)")[0].count("\n")
        assert start < stop
        (text, hits) = regex.subn(
            r"test_recommend_programs.py#L-L",
            f"test_recommend_programs.py#L{start}-L{stop}",
            text,
            count=1,
        )
        assert hits == 1
    path.write_text(text)
Beispiel #23
0
 def actually_doit(original):
     """Apply the closed-over substitution to *original*'s message.
     Returns (original, new_text) on success, (None, error message) on
     failure, (None, None) when the pattern did not match."""
     try:
         new_text, hits = regex.subn(fr, to, original.message,
                                     count=count, flags=flags)
         if hits > 0:
             return original, new_text
     except Exception as e:
         return None, f"u dun goofed m8: {str(e)}"
     return None, None
Beispiel #24
0
def compute_stats():
    """Recompute the SLOC / spec-lines / taxonomy-mappings badges of
    README.md from the current state of the repository and rewrite the file
    in place."""
    readme_path = Path("README.md")
    readme_text = readme_path.read_text()
    cleanup = Cleanup("full")
    directories = ["paroxython", "tests", "helpers"]
    for directory in directories:
        total = 0
        for program_path in Path(directory).glob("**/*.py"):
            source = program_path.read_text()
            # Work around a weird error:
            # tokenize.TokenError: ('EOF in multi-line string', (12, 10))
            source = source.replace(
                'if __name__ == "__main__":\n    bar = foo', "pass\npass")
            source = cleanup.run(source)
            total += source.count("\n")
        print(f"{directory}: {total} SLOC")
        # Badges only show counts rounded to the nearest fifty.
        total = 50 * round(total / 50)
        (readme_text, n) = regex.subn(
            fr"(?m)(!\[{directory} SLOC\].+?)~\d+(%20SLOC)",
            fr"\1~{total}\2",
            readme_text,
        )
        assert n > 0, f"Unable to create badge for '{directory}' SLOC."
    # Spec badge: line count of the spec, rounded to the nearest fifty.
    total = 50 * round(
        Path("paroxython/resources/spec.md").read_text().count("\n") / 50)
    (readme_text, n) = regex.subn(
        fr"(?m)(!\[spec lines\].+?)~\d+(%20lines)",
        fr"\1~{total}\2",
        readme_text,
    )
    assert n == 1
    # Taxonomy badge: exact number of mapping lines before the "-- EOF" mark.
    total = Path("paroxython/resources/taxonomy.tsv").read_text().partition(
        "-- EOF")[0].count("\n")
    (readme_text, n) = regex.subn(
        fr"(?m)(!\[taxonomy mappings\].+)-\d+(%20mappings)",
        fr"\1-{total}\2",
        readme_text,
    )
    assert n == 1
    readme_path.write_text(readme_text)
Beispiel #25
0
def subn(pattern, repl, string, count=0, flags=0, pos=None, endpos=None, concurrent=None, **kwargs):
    """Wrapper for `subn`."""

    # A pre-compiled replace object built in format mode is only valid for
    # subfn; reject it up front.
    is_replace = _is_replace(repl)
    is_string = isinstance(repl, (_util.string_type, _util.binary_type))
    if is_replace and repl.use_format:
        raise ValueError("Compiled replace cannot be a format object!")

    # Compile the search pattern; compile string templates and replace
    # objects, and pass callables straight through to the regex engine.
    pattern = compile_search(pattern, flags)
    return _regex.subn(
        pattern, (compile_replace(pattern, repl) if is_replace or is_string else repl), string,
        count, flags, pos, endpos, concurrent, **kwargs
    )
Beispiel #26
0
def subfn(pattern, format, string, *args, **kwargs):  # noqa A002
    """Wrapper for `subfn`."""

    # *flags* may arrive positionally (5th positional arg) or as a keyword.
    flags = args[4] if len(args) > 4 else kwargs.get('flags', 0)
    is_replace = _is_replace(format)
    is_string = isinstance(format, (str, bytes))
    # subfn uses format-style templates, so a pre-compiled replace must have
    # been built in format mode.
    if is_replace and not format.use_format:
        raise ValueError("Compiled replace is not a format object!")

    pattern = compile_search(pattern, flags)
    # String templates are compiled with the FORMAT flag; callables and
    # pre-compiled replace objects are forwarded unchanged.
    rflags = FORMAT if is_string else 0
    return _regex.subn(pattern, (compile_replace(pattern, format, flags=rflags)
                                 if is_replace or is_string else format),
                       string, *args, **kwargs)
Beispiel #27
0
def subn(pattern, repl, string, *args, **kwargs):
    """Wrapper for `subn`."""

    # *flags* may arrive positionally (5th positional arg) or as a keyword.
    flags = args[4] if len(args) > 4 else kwargs.get('flags', 0)
    is_replace = _is_replace(repl)
    is_string = isinstance(repl, (str, bytes))
    # Format-mode replace objects belong to subfn, not subn.
    if is_replace and repl.use_format:
        raise ValueError("Compiled replace cannot be a format object!")

    # Compile search pattern and (when needed) the replacement template;
    # callables pass through untouched.
    pattern = compile_search(pattern, flags)
    return _regex.subn(
        pattern,
        (compile_replace(pattern, repl) if is_replace or is_string else repl),
        string, *args, **kwargs)
Beispiel #28
0
def subn(pattern, repl, string, *args, **kwargs):
    """Wrapper for `subn`."""

    # *flags* may arrive positionally (5th positional arg) or as a keyword.
    flags = args[4] if len(args) > 4 else kwargs.get('flags', 0)
    is_replace = _is_replace(repl)
    is_string = isinstance(repl, (str, bytes))
    # Format-mode replace objects belong to subfn, not subn.
    if is_replace and repl.use_format:
        raise ValueError("Compiled replace cannot be a format object!")

    # Compile the search pattern; compile string/replace-object templates,
    # and hand callables through unchanged.
    pattern = compile_search(pattern, flags)
    return _regex.subn(
        pattern, (compile_replace(pattern, repl) if is_replace or is_string else repl), string,
        *args, **kwargs
    )
Beispiel #29
0
def subfn(pattern, format, string, count=0, flags=0, pos=None, endpos=None, concurrent=None, **kwargs):  # noqa A002
    """Wrapper for `subfn`."""

    # subfn uses format-style templates, so a pre-compiled replace must have
    # been built in format mode.
    is_replace = _is_replace(format)
    is_string = isinstance(format, (_util.string_type, _util.binary_type))
    if is_replace and not format.use_format:
        raise ValueError("Compiled replace is not a format object!")

    pattern = compile_search(pattern, flags)
    # String templates are compiled with the FORMAT flag; callables and
    # pre-compiled replace objects are forwarded unchanged.
    rflags = FORMAT if is_string else 0
    return _regex.subn(
        pattern, (compile_replace(pattern, format, flags=rflags) if is_replace or is_string else format), string,
        count, flags, pos, endpos, concurrent, **kwargs
    )
Beispiel #30
0
    def __init__(self, in_data, timezone=None):
        """Parse a reminder request: extract mention targets, then hand the
        remaining text to the time/content matcher."""
        self.time = datetime.now(pytz.timezone(timezone))
        self.time_provided = False
        self.interval_provided = False

        self.targets = []

        self.content = 'Reminder'
        self.content_provided = False

        mentions = re.search(self.find_mentions, in_data)
        # re.subn returns a (new_string, n_subs) tuple; keep only the
        # stripped text — previously the whole tuple was forwarded to
        # try_and_match.
        in_data, _ = re.subn(self.find_mentions, '', in_data, 1)

        self.process_mentions(mentions)
        self.try_and_match(in_data)
Beispiel #31
0
def subfn(pattern, format, string, *args, **kwargs):  # noqa A002
    """Wrapper for `subfn`."""

    # *flags* may arrive positionally (5th positional arg) or as a keyword.
    flags = args[4] if len(args) > 4 else kwargs.get('flags', 0)
    is_replace = _is_replace(format)
    is_string = isinstance(format, (str, bytes))
    # subfn uses format-style templates, so a pre-compiled replace must have
    # been built in format mode.
    if is_replace and not format.use_format:
        raise ValueError("Compiled replace is not a format object!")

    pattern = compile_search(pattern, flags)
    # String templates are compiled with the FORMAT flag; callables and
    # pre-compiled replace objects are forwarded unchanged.
    rflags = FORMAT if is_string else 0
    return _regex.subn(
        pattern, (compile_replace(pattern, format, flags=rflags) if is_replace or is_string else format), string,
        *args, **kwargs
    )
Beispiel #32
0
def register_submit():
    """Validate the registration form, create the user, and log them in.

    Re-renders the form with validation errors whenever a check fails;
    on success the new User row is committed and the browser redirected
    to the landing page.
    """
    v = Validate(request)
    email = v.require('email')
    user = v.require('user')
    if not v.ok:
        return render_template("register.html", valid=v)

    # Strip every character outside [a-zA-Z0-9_-]; a non-zero substitution
    # count means the submitted name contained an invalid symbol.
    user, number = re.subn(r'[^a-zA-Z0-9_-]', '', user)
    v.expect(
        number == 0,
        "Invalid symbol in Username, Username can only contain ASCII letters, numbers, dashes, and underscores.",
        'user')
    v.expect(len(user) > 3, "Username Too short", 'user')
    v.expect(
        len(user) and user[0] != '_',
        "Username can not start with an underscore.", 'user')
    if not v.ok:
        return render_template("register.html", valid=v)

    usr = User.query.filter(User.username.ilike(user)).one_or_none()
    v.expect(usr is None, "Username taken", 'user')
    if not v.ok:
        return render_template("register.html", valid=v)

    usr = User(username=user)
    usr.email = email
    usr.nickname = v.optional('nick')
    usr.discord = v.optional('discord')
    usr.postal = v.optional('postal')

    # Cheap deliverability check: the mail domain must at least resolve.
    # (Message typo fixed: "send and email" -> "send an email".)
    msg = "Unable to find this domain to send an email to. If the email address is valid, please open an issue."
    try:
        host = email.split('@', 1)[1]
        addr = socket.gethostbyname(host)
        v.expect(addr is not None, msg, 'email')
    except (socket.gaierror, IndexError):
        # IndexError: address without '@' — previously this crashed with an
        # uncaught ValueError from tuple unpacking.
        v.expect(False, msg, 'email')

    if not v.ok:
        return render_template("register.html", valid=v)

    db.session.add(usr)
    db.session.commit()

    login_user(usr)
    return redirect("/")
def add_field(current_class, filename, field_name, field_type, config, lang,
              classes, depends):
    """Append a field description to *current_class* in *classes* and record
    the field's type dependencies in *depends*.

    The raw *field_type* is first mapped through config.EnumMap when an
    entry exists, then rewritten via the *lang* substitution table (plain
    string values are simple substitutions; dict values may also attach
    flags to the field).  Self-referencing and abstract field types are
    marked optional.
    """
    if filename in config.EnumMap and field_type in config.EnumMap[filename]:
        field_type = config.EnumMap[filename][field_type]

    field_flag = {}
    field_text = field_type

    # A field is "local" unless the super class already declares a field of
    # the same name.
    field_local = True
    if "super_class" in classes[current_class]:
        # Renamed from `super`, which shadowed the builtin.
        parent = classes[classes[current_class].super_class]
        if "fields" in parent:
            if (len(
                    list(
                        filter(lambda x: x.field_name == field_name,
                               parent.fields))) != 0):
                field_local = False

    for key, value in lang.items():
        if isinstance(value, str):  # idiomatic check, was type(value) == str
            field_text = regex.sub(key, value, field_text)
        else:
            field_text, n = regex.subn(key, value["replace"], field_text)
            if n > 0:
                if value["flag"]:
                    field_flag.update(value["flag"])

    classes[current_class].fields.append(
        util.attrdict(field_type=field_type,
                      field_name=field_name,
                      field_text=field_text))
    classes[current_class].fields[-1].update(field_flag)
    classes[current_class].fields[-1]._N = len(classes[current_class].fields)
    if current_class == field_type:
        # Self-referencing fields must be optional to avoid infinite nesting.
        classes[current_class].fields[-1].optional = True
    classes[current_class].fields[-1].field_local = field_local

    if (field_type in classes and "abstract" in classes[field_type]
            and classes[field_type].abstract):
        classes[current_class].fields[-1].optional = True

    # Every identifier appearing in the type expression (minus the class
    # itself) is a dependency of the current class.
    types = set(regex.sub("[^a-zA-Z0-9_]+", " ", field_type).split())
    types.discard(current_class)
    depends[current_class].update(types)
def test_update_docstring():
    """Regenerate the cleanup examples embedded in full_cleaning's
    docstring inside paroxython/preprocess_source.py."""
    indent = "\n            "
    chunks = []
    for (title, original, expected) in examples:
        original = original.replace("\n", indent)
        expected = expected.replace("\n", indent)
        chunks.append(f"- {title}")
        chunks.append(fr"```python{indent}{original}{indent}```")
        chunks.append(fr"```python{indent}{expected}{indent}```")
    # Drop trailing indentation left at the end of each joined section.
    body = regex.sub(f"(?m){indent}$", "\n", indent.join(chunks))
    path = Path("paroxython/preprocess_source.py")
    source = path.read_text()
    (source, hits) = regex.subn(
        r"(?sm)^(\s*def full_cleaning.+?Examples:\n).+?^\n(?= +All examples)",
        fr"\1            {body}\n\n",
        source,
    )
    assert hits == 1
    path.write_text(source)
Beispiel #35
0
def patch_prose():
    """Post-process the pdoc-generated HTML of the two prose modules (user
    and developer manuals): retitle the pages, strip pdoc's index/source
    sections, remove their entries from docs/index.html, and fix the Home
    link.  Each substitution is asserted to happen exactly once."""
    index_path = Path("docs/index.html")
    index_text = index_path.read_text()
    index_text = index_text.replace("<h1>Index</h1>\n", "")
    for title in ("User manual", "Developer manual"):
        # "User manual" -> "user_manual", the module/directory slug.
        slug = title.lower().replace(" ", "_")
        path = Path("docs") / slug / "index.html"
        text = path.read_text()
        # Human-friendly page title instead of pdoc's "Module ..." heading.
        (text, n) = regex.subn(
            f"""<h1 class="title">Module <code>paroxython.{slug}</code></h1>""",
            f"""<h1 class="title">{title}</h1>""",
            text,
        )
        assert n == 1, f"Unable to change the title of {slug}!"
        (text, n) = regex.subn(
            f"<h1>Index</h1>",
            f"<h1>{title}</h1>",
            text,
        )
        assert n == 1, f"Unable to change the title of {slug} in nav!"
        # Drop the auto-generated per-module index list from the nav.
        (text, n) = regex.subn(fr"""(?s)</div>\n<ul id="index">.+</ul>\n""",
                               "", text)
        assert n == 1, f"Unable to suppress the index section in prose {slug}'s nav!"
        # Remove the module's entries from the top-level index page.
        (index_text,
         n) = regex.subn(fr"""<li><code><a title="paroxython.{slug}".+\n""",
                         "", index_text)
        assert n == 1, f"Unable to remove nav url for {slug}!"
        (index_text, n) = regex.subn(
            fr"""(?s)<dt><code class="name"><a title="paroxython\.{slug}".+?</dd>\n""",
            "",
            index_text,
        )
        assert n == 1, f"Unable to remove module section for {slug}!"
        # Prose pages have no meaningful source view: strip the widget.
        (text,
         n) = regex.subn(fr"""(?s)<details class="source">.+</details>\n""",
                         "", text)
        assert n == 1, f"Unable to suppress the source code in prose {slug}!"
        # The prose page lives one level deeper than the Home page.
        (text, n) = regex.subn(
            """href="index.html">""",
            """href="../index.html">""",
            text,
        )
        assert n == 1, f"Unable to patch the Home url in {slug}!"
        path.write_text(text)
        index_path.write_text(index_text)
Beispiel #36
0
    def subn(pattern,
             repl,
             string,
             count=0,
             flags=0,
             pos=None,
             endpos=None,
             concurrent=None,
             **kwargs):
        """Wrapper for subn."""

        # Format-mode replace objects are only valid for subfn; reject them.
        is_replace = _is_replace(repl)
        is_string = isinstance(repl, (compat.string_type, compat.binary_type))
        if is_replace and repl.use_format:
            raise ValueError("Compiled replace cannot be a format object!")

        # Compile the search pattern; compile string/replace-object
        # templates, and hand callables through untouched.
        pattern = compile_search(pattern, flags)
        return regex.subn(pattern, (compile_replace(pattern, repl) if
                                    is_replace or is_string else repl), string,
                          count, flags, pos, endpos, concurrent, **kwargs)
Beispiel #37
0
    def _fix_hijri_gregorian_feb_mismatch(self, date_formats, languages):
        # Now, search for 29th or 30th day of 2nd month.
        # If found, reduce it by 10 days and use regular parse
        # function again, if succeeds this time, then add 10
        # days to parsed Hijri form.
        for lang_shortname in languages:
            language = default_language_loader.get_language(lang_shortname)
            # NOTE(review): `settings` is a free variable from the enclosing
            # module — presumably the parser settings; confirm at call site.
            translated = language.translate(self.source, settings=settings)

            def _sub_fn(m):
                # Shift the matched day-of-month back by ten days,
                # zero-padded to two digits.
                digit = int(m.group(0))
                return '{:02d}'.format(digit - 10)
            # Trailing positional 1 is re.subn's *count*: only the first
            # standalone 29/30 (not preceded by a digit) is shifted.
            fixed_date_string, nreplaced = re.subn(
                r'(?<!\d)(29|30)', _sub_fn, translated, 1)
            if not nreplaced:
                continue

            date_data = self._parser_get_date(fixed_date_string, date_formats, languages)
            date_obj = date_data.get('date_obj')
            if date_obj:
                # Remember that, we have subtracted 10 days.
                date_data['date_obj'] = self._hijri_to_gregorian(
                    date_obj.year, date_obj.month, date_obj.day + 10, date_obj)
                return date_data
Beispiel #38
0
def normalize_text(text, lcase=True):
    """Normalize *text*: strip surrounding whitespace, optionally lowercase,
    NFKD-decompose, drop all Unicode punctuation, and ASCII-fold."""
    normalized = str(text).strip()
    if lcase:
        normalized = normalized.lower()
    normalized = unicodedata.normalize('NFKD', normalized)
    # \p{P}+ (regex module): runs of any Unicode punctuation.
    normalized = regex.subn(r'\p{P}+', '', normalized)[0]
    return normalized.encode('ascii', 'ignore').decode()
def adjustOutputsModel( textFolder ):
    """Adjust the pickled HMM emission counts against the corpora found in
    *textFolder*, then pickle the result to outputs_FIXED1_final.pickle.

    For every 1-, 2- and 3-gram of each cleaned corpus whose self-emission
    has not been recorded yet, the emission count is re-based on the gram's
    frequency in the text, clamped at zero.
    """
    with open( 'PickledData/HMM_data/outputs_FIXED1_stage1.pickle', 'rb' ) as file:
        emissions = pickle.load( file )

    def _adjust_emission( grams, corpus ):
        # Shared update for 1/2/3-grams (previously three copy-pasted
        # blocks): only touch a gram whose self-emission was never set, and
        # re-base it on its corpus frequency, clamped at zero.
        if grams is None:
            return
        key = ''.join( grams )
        if emissions[ key ].get( key, None ) is None:
            prior_total = emissions[ key ].N( )
            emissions[ key ][ key ] += corpus.count( key ) - prior_total
            if emissions[ key ][ key ] < 0:
                emissions[ key ][ key ] = 0

    directory = "/home/jcavalie/NLPtools/wiki_dump/" + textFolder + '/'
    print( 'Directory:', directory )

    wikiFiles = os.listdir( directory )
    print( 'Files:', wikiFiles )
    count = 0
    for fileName in wikiFiles:
        print( "file count: ", count )
        count += 1

        with open( directory + fileName, 'r', encoding = "ISO-8859-15" ) as file:
            text_ = file.read( )

        # Normalisation: drop hyphens, run the shared corpus cleaner, then
        # strip unwanted punctuation, digits and redundant whitespace.
        text_ = text_.replace( "-", " " )
        text_ = parallelCorpora._normalize( text_, "" )[ 'clean' ]
        text_ = text_.strip( )
        text_ = text_.replace( "''", " " )

        pattern = r'([~`!@#$%&|*)(_+=\\^\]\[}{;:"><.,/?]+)'
        text_, num = regex.subn( pattern, ' ', text_ )
        print( "removed unwanted chars: ", num )
        text_ = regex.sub( r"(\d+)", " ", text_ )
        text_ = regex.sub( r'(\s+)', ' ', text_ )

        # Pad so grams at the edges see a space boundary.
        text_ = ' ' + text_ + ' '

        gc.collect( )

        print( "building Ngrams" )

        corporaLength = len( text_ )
        print( "CORPUS LENGTH: ", corporaLength )
        counter = 0

        print( "starting loop" )
        for one_grams, two_grams, three_grams in zip_longest( ngrams( text_, 1 ), ngrams( text_, 2 ),
                                                              ngrams( text_, 3 ) ):
            counter += 1
            if not counter % 1000:
                print( "1000 more complete", counter )

            if counter == corporaLength // 4:
                print( "~1/4 complete" )
            elif counter == corporaLength // 2:
                print( "~1/2 complete" )
            elif counter == int( corporaLength * (3 / 4) ):
                print( "~3/4 complete" )

            _adjust_emission( one_grams, text_ )
            _adjust_emission( two_grams, text_ )
            _adjust_emission( three_grams, text_ )

    with open( 'PickledData/HMM_data/outputs_FIXED1_final.pickle', 'wb' ) as file:
        pickle.dump( emissions, file, pickle.HIGHEST_PROTOCOL )
def _dump_pickle(path, obj):
    # Helper for buildNgrams: serialize one model at highest protocol.
    with open(path, 'wb') as fh:
        pickle.dump(obj, fh, pickle.HIGHEST_PROTOCOL)


def buildNgrams(textFolder):
    """Build character-level conditional bigram models from a corpus dump.

    Reads every file under /home/jcavalie/NLPtools/wiki_dump/<textFolder>/,
    normalizes the text, then counts, for every pair (i, j) with
    i, j in {1, 2, 3}, how often a j-character continuation follows an
    i-character context.  Each resulting ConditionalFreqDist is pickled to
    ./PickledData/langModels/bigrams<i>_<j><textFolder>.pickle and released
    immediately afterwards to bound peak memory.

    Args:
        textFolder: corpus subdirectory name; if it contains 'wiki' the
            files are stripped of HTML markup before normalization.

    Returns:
        None — all results are written to disk.
    """
    # One model per (context length, continuation length) pair; tuple order
    # is the on-disk dump order of the original implementation.
    suffixes = ('1_1', '2_2', '3_3', '1_2', '1_3', '2_1', '2_3', '3_1', '3_2')
    models = {suffix: ConditionalFreqDist() for suffix in suffixes}

    directory = "/home/jcavalie/NLPtools/wiki_dump/" + textFolder + '/'
    print('Directory:', directory)

    corpusFiles = os.listdir(directory)
    print('Files:', corpusFiles)

    for fileCount, fileName in enumerate(corpusFiles):
        print("file count: ", fileCount)

        with open(directory + fileName, 'r', encoding="ISO-8859-15") as file:
            raw = file.read()
        # Wikipedia dumps still carry markup; other corpora are plain text.
        text_ = cleanhtml(raw) if 'wiki' in textFolder else raw

        text_ = text_.replace("-", " ")
        text_ = parallelCorpora._normalize(text_, "")['clean']
        text_ = text_.strip()
        text_ = text_.replace("''", " ")

        # Drop punctuation and digits, then collapse whitespace runs.
        pattern = r'([~`!@#$%&|*)(_+=\\^\]\[}{;:"><.,/?]+)'
        text_, num = regex.subn(pattern, ' ', text_)
        print("removed unwanted chars: ", num)
        text_ = regex.sub(r"(\d+)", " ", text_)
        text_ = regex.sub(r'(\s+)', ' ', text_)

        # Pad with spaces so word-boundary grams exist at both ends.
        text_ = ' ' + text_ + ' '

        gc.collect()

        print("building Ngrams")
        corporaLength = len(text_)
        print("CORPUS LENGTH: ", corporaLength)
        counter = 0

        print("starting loop")
        # Slide windows of widths 1..6 in lockstep.  The 1-gram stream is the
        # longest and fixes the iteration count; zip_longest pads the wider
        # windows with None near the end of the text, hence the None guards.
        for one_g, two_g, three_g, four_g, five_g, six_g in \
                zip_longest(ngrams(text_, 1), ngrams(text_, 2),
                            ngrams(text_, 3), ngrams(text_, 4),
                            ngrams(text_, 5), ngrams(text_, 6)):
            counter += 1
            if not counter % 1000000:
                print("1000000 more complete", counter)

            if counter == corporaLength // 4:
                print("~1/4 complete")
            elif counter == corporaLength // 2:
                print("~1/2 complete")
            # Integer arithmetic instead of int(corporaLength * (3/4)):
            # exact for any length, no float rounding.
            elif counter == 3 * corporaLength // 4:
                print("~3/4 complete")

            # An (i+j)-gram feeds model i_j: the first i characters are the
            # condition, the last j the outcome.  Each gram is a tuple of
            # single characters, so joining once and slicing the string is
            # equivalent to joining each slice separately.
            if two_g is not None:
                s = ''.join(two_g)
                models['1_1'][s[:1]][s[1:]] += 1

            if three_g is not None:
                s = ''.join(three_g)
                models['1_2'][s[:1]][s[1:]] += 1
                models['2_1'][s[:2]][s[2:]] += 1

            if four_g is not None:
                s = ''.join(four_g)
                models['2_2'][s[:2]][s[2:]] += 1
                models['3_1'][s[:3]][s[3:]] += 1
                models['1_3'][s[:1]][s[1:]] += 1

            if five_g is not None:
                s = ''.join(five_g)
                models['3_2'][s[:3]][s[3:]] += 1
                models['2_3'][s[:2]][s[2:]] += 1

            if six_g is not None:
                s = ''.join(six_g)
                models['3_3'][s[:3]][s[3:]] += 1

    print("finished building, begin pickling")
    # Pickle and release one model at a time so only one large
    # ConditionalFreqDist is held alongside the remaining ones.
    # BUGFIX: the original copy-pasted stanzas printed "finished 3-2" after
    # saving model 3_1 and never announced 3-1; deriving the message from the
    # suffix guarantees the label matches the model actually saved.
    for suffix in suffixes:
        model = models.pop(suffix)
        _dump_pickle('./PickledData/langModels/bigrams' + suffix + textFolder
                     + '.pickle', model)
        del model
        gc.collect()
        print("finished " + suffix.replace('_', '-'))

    print("finished all")

    return