Code Example #1
import string
import typing
import unicodedata

# Assumption: ALPHABET is defined elsewhere in the original module; lowercase
# letters plus space and period match how the function uses it below.
ALPHABET = set(string.ascii_lowercase + " .")


def _clean_text(text: typing.AnyStr) -> str:
    # Approximate unicode characters with their closest ASCII equivalents
    text = unicodedata.normalize("NFKD", text).encode("ascii",
                                                      "ignore").decode()

    text = text.lower()  # make lowercase
    text = text.replace("?", ".").replace("!", ".")
    for c in "/-\n\r":
        text = text.replace(c, " ")
    text = "".join(filter(ALPHABET.__contains__,
                          text))  # filter to alphabet chars

    text = text.lstrip(" .")  # strip leading spaces and periods
    if text == "":
        raise ValueError("text needs to have at least one letter")

    ret = ""
    for x in text:
        # invariant: ret holds the normalized text seen so far
        # (no repeated periods or spaces)
        if x == ".":
            ret = ret.rstrip(". ") + ". "
        elif x == " ":
            ret = ret.rstrip(" ") + " "
        else:
            ret += x

    ret = ret.rstrip(" ")  # strip trailing spaces
    return ret
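
A quick sanity check, assuming the ALPHABET sketched above:

print(_clean_text("Héllo!!  World?"))  # -> "hello. world."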
Code Example #2
import datetime
import re
import typing

import requests
import tqdm
from bs4 import BeautifulSoup


def fetch_one_page_of_links(
        link: typing.AnyStr, pbar: tqdm.tqdm
) -> typing.Tuple[typing.List[str], typing.Optional[str]]:
    pbar.set_description(
        desc=link.replace(r"https://www.allrecipes.com/", ".../") +
        " (fetching page)")
    start = datetime.datetime.now()
    if "https://www.allrecipes.com/" in link:
        r = requests.get(link)
    else:
        print(
            f"Link {link} doesn't contain 'https://www.allrecipes.com/', attempting to prepend it"
        )
        r = requests.get('https://www.allrecipes.com' + link)
    duration = datetime.datetime.now() - start
    pbar.set_description(
        desc=link.replace(r"https://www.allrecipes.com/", ".../") +
        " (making soup)")
    # print(f"\tFetching {link} took {duration.total_seconds()}s")
    soup = BeautifulSoup(r.text, 'html.parser')
    pbar.set_description(
        desc=link.replace(r"https://www.allrecipes.com/", ".../") +
        " (scraping links)")

    # ====================================================================
    #   Collect all the item links on this page.
    #   They could either start with https://www.allrecipes.com/recipe/...
    #    or just with /recipe/...
    # ====================================================================

    # (comprehension variable renamed from `link` to `a` so it no longer
    #  shadows the `link` parameter)
    item_links = [
        a.get("href") for a in soup.select("a")
        if re.match(r".+www\.allrecipes\.com/recipe/.+", a.get("href", ""))
    ]
    item_links.extend([
        "https://www.allrecipes.com" + a.get("href")
        for a in soup.select("a")
        if re.match(r"^/recipe/", a.get("href", ""))
    ])

    # =================================================================
    #   Get the link for the next page.
    #   It'll either be in a 'next page' button or a 'load more' button
    # =================================================================
    next_page_link = [
        next_page.get("href", None) for next_page in soup.select(
            "a.category-page-list-related-load-more-button")
    ]
    if not next_page_link:
        next_page_link = [
            next_page.get("href", None) for next_page in soup.select(
                "a.category-page-list-related-nav-next-button")
        ]
        if not next_page_link:
            next_page_link = [None]

    # Return the found links, and the link to take us to the next page.
    return item_links, next_page_link[0]
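
A minimal driver loop under the imports above; the starting category URL is only a placeholder:

with tqdm.tqdm() as pbar:
    page = "https://www.allrecipes.com/recipes/"  # placeholder category URL
    all_links = []
    while page is not None:
        links, page = fetch_one_page_of_links(page, pbar)
        all_links.extend(links)
        pbar.update(1)
print(f"Collected {len(all_links)} recipe links")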
Code Example #3
import typing

# Assumption: the tag maps are defined elsewhere in the original module;
# the entries below are illustrative placeholders only.
single_tags = {"---": "<hr>"}
inline_tags = {"# ": "<h1>{}</h1>", "## ": "<h2>{}</h2>"}


def convert_line(line: typing.AnyStr) -> typing.AnyStr:
    """
    Converts a single line.

    :param line: the line to convert
    :return: the converted line with the required indentation
    """
    line = line.replace('<', '&lt;') \
        .replace('>', '&gt;')

    for tag, value in single_tags.items():
        line = line.replace(tag, value)

    for tag, value in inline_tags.items():
        if line.startswith(tag):
            line = value.format(line[len(tag):].strip())

    return line
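
With the placeholder tag maps above:

print(convert_line("# Title"))  # -> "<h1>Title</h1>"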
Code Example #4
import typing


def rid(value: typing.AnyStr,
        old: typing.Union[typing.AnyStr, typing.Pattern[typing.AnyStr]],
        new: typing.AnyStr) -> typing.AnyStr:
    """
    Strip out the matched content.

    :param value: the string to process
    :param old: the content to replace, either a plain string or a compiled
        regular expression
    :param new: the replacement string
    :return: the result
    """
    if hasattr(old, "sub"):
        # Compiled patterns expose .sub(); plain strings take str.replace()
        return old.sub(new, value)
    else:
        return value.replace(old, new)
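
Both call paths, with re imported for the compiled-pattern case:

import re

print(rid("foo-bar-baz", "-", " "))          # -> "foo bar baz"
print(rid("a1b2c3", re.compile(r"\d"), ""))  # -> "abc"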
Code Example #5
File: RuleParser.py, Project: umr-ds/NOREC4DNA
import typing
from re import compile  # the body calls bare compile(), so assume this import

debug = False  # assumption: module-level debug flag in the original project


def iupac_replace(sequence: typing.AnyStr):
    iupac_regex = {
        'M': '[AC]',
        'R': '[AG]',
        'W': '[AT]',
        'S': '[CG]',
        'Y': '[CT]',
        'K': '[GT]',
        'V': '[ACG]',
        'H': '[ACT]',
        'D': '[AGT]',
        'B': '[CGT]',
        'X': '[ACGT]',
        'N': '[ACGT]'
    }
    for i, j in iupac_regex.items():
        sequence = sequence.replace(i, j)
    if debug:
        print(sequence)
    return compile(sequence)
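
A quick check that ambiguity codes expand into character classes:

rule = iupac_replace("GGN")
print(bool(rule.fullmatch("GGA")))  # -> True: N matches any of A, C, G, T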
Code Example #6
    # Snippet of a class __init__; in the original module `ty` aliases
    # `typing`, `utils.maybe_fsencode` coerces literals to the pattern's
    # str/bytes type, and re_pattern_t is the compiled-pattern type.
    def __init__(self, pat: ty.AnyStr, *, period_special: bool = True):
        """
		Arguments
		---------
		pat
			The glob pattern to use for matching
		period_special
			Whether a leading period in file/directory names should be matchable by
			``*``, ``?`` and ``[…]`` – traditionally they are not, but many modern
			shells allow one to disable this behaviour
		"""
        self.period_special = period_special  # type: bool

        self._sep = utils.maybe_fsencode(os.path.sep, pat)  # type: ty.AnyStr
        dblstar = utils.maybe_fsencode("**", pat)  # type: ty.AnyStr
        dot = utils.maybe_fsencode(".", pat)  # type: ty.AnyStr
        pat_ndot = utils.maybe_fsencode(r"(?![.])", pat)  # type: ty.AnyStr

        # Normalize path separator
        if os.path.altsep:
            pat = pat.replace(utils.maybe_fsencode(os.path.altsep, pat),
                              self._sep)

        # Sanity checks for stuff that will definitely NOT EVER match
        # (there is another one in the loop below)
        assert not os.path.isabs(
            pat), "Absolute matching patterns will never match"

        # Note the extra final slash for its effect of only matching directories
        #
        # (TBH, I find it hard to see how that is useful, but everybody does it
        #  and it keeps things consistent overall – something to only match files
        #  would be nice however.)
        self._dir_only = pat.endswith(self._sep)  # type: bool

        self._pat = []  # type: ty.List[ty.Optional[re_pattern_t]]
        for label in pat.split(self._sep):
            # Skip over useless path components
            if len(label) < 1 or label == dot:
                continue

            assert label != dot + dot, 'Matching patterns containing ".." will never match'

            if label == dblstar:
                self._pat.append(None)
            elif dblstar in label:
                raise NotImplementedError(
                    "Using double-star (**) and other characters in the same glob "
                    "path label ({0}) is not currently supported – please do file "
                    "an issue if you need this!".format(os.fsdecode(label)))
            else:
                #re_expr: ty.AnyStr
                if not isinstance(label, bytes):
                    re_expr = fnmatch.translate(label)
                else:
                    re_expr = fnmatch.translate(
                        label.decode("latin-1")).encode("latin-1")

                if period_special and not label.startswith(dot):
                    re_expr = pat_ndot + re_expr
                self._pat.append(re.compile(re_expr))
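
A standalone sketch of the per-label step above, showing the same leading-period guard the loop prepends (only the standard library's fnmatch and re):

import fnmatch
import re

label = "*.txt"
re_expr = r"(?![.])" + fnmatch.translate(label)  # same guard as pat_ndot above
pat = re.compile(re_expr)
print(bool(pat.match("notes.txt")))    # -> True
print(bool(pat.match(".hidden.txt")))  # -> False: a leading dot must be literal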
Code Example #7
import typing


def _fixed_sesar_id(id: typing.AnyStr) -> typing.AnyStr:
    # Note: replaces every occurrence of "igsn", not just a leading prefix
    fixed_id = id.replace("igsn", "IGSN")
    return fixed_id
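
For example:

print(_fixed_sesar_id("igsn:ABC123"))  # -> "IGSN:ABC123"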