def _clean_text(text: typing.AnyStr) -> str:
    """Normalize *text* into a lowercase ASCII string over ``ALPHABET``.

    Unicode is approximated with ASCII, '?'/'!' become periods, separators
    become spaces, characters outside ``ALPHABET`` are dropped, and runs of
    periods/spaces are collapsed into single ". " / " " boundaries.

    :raises ValueError: if no characters survive the filtering
    """
    # Approximate unicode characters with their closest ASCII equivalents.
    cleaned = unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode()
    cleaned = cleaned.lower()
    # '?'/'!' act as sentence-ending periods; slashes, dashes and line breaks
    # act as spaces.  Single-character maps, so one translate pass suffices.
    cleaned = cleaned.translate(
        str.maketrans({"?": ".", "!": ".", "/": " ", "-": " ", "\n": " ", "\r": " "}))
    # Keep only characters that belong to the allowed alphabet.
    cleaned = "".join(ch for ch in cleaned if ch in ALPHABET)
    # Leading spaces/periods carry no information — drop them.
    cleaned = cleaned.lstrip(" .")
    if not cleaned:
        raise ValueError("text needs to have at least one letter")
    # Collapse runs: a period absorbs any periods/spaces before it and is
    # followed by one space; a space absorbs preceding spaces.  The result
    # is a valid string after every iteration.
    result = ""
    for ch in cleaned:
        if ch == ".":
            result = result.rstrip(". ") + ". "
        elif ch == " ":
            result = result.rstrip(" ") + " "
        else:
            result += ch
    # Strip any trailing space left by the last boundary.
    return result.rstrip(" ")
def fetch_one_page_of_links(link: typing.AnyStr, pbar: tqdm.tqdm) -> typing.Tuple:
    """Fetch one allrecipes.com listing page and scrape the recipe links on it.

    :param link: URL of the listing page — absolute, or site-relative as a
        best-effort fallback (the site prefix is prepended with a warning)
    :param pbar: progress bar whose description is updated with the current stage
    :return: tuple of (list of absolute recipe URLs found on the page,
        URL of the next page or ``None`` if there is no next page)
    """
    base = "https://www.allrecipes.com"
    # Compact label for the progress bar; computed once instead of per stage.
    short_desc = link.replace(r"https://www.allrecipes.com/", ".../")

    pbar.set_description(desc=short_desc + " (fetching page)")
    if "https://www.allrecipes.com/" in link:
        r = requests.get(link)
    else:
        print(
            f"Link {link} doesn't contain 'https://www.allrecipes.com/', attempting to prepend it"
        )
        r = requests.get(base + link)

    pbar.set_description(desc=short_desc + " (making soup)")
    soup = BeautifulSoup(r.text, 'html.parser')

    pbar.set_description(desc=short_desc + " (scraping links)")
    # ====================================================================
    # Collect all the item links on this page.
    # They could either start with https://www.allrecipes.com/recipe/...
    # or just with /recipe/... — normalize the latter to absolute URLs.
    # ====================================================================
    anchors = soup.select("a")  # queried once; reused for both link forms
    item_links = [
        a.get("href") for a in anchors
        if re.match(r".+www\.allrecipes\.com/recipe/.+", a.get("href", ""))
    ]
    item_links.extend(
        base + a.get("href") for a in anchors
        if re.match(r"^/recipe/", a.get("href", "")))

    # =================================================================
    # Get the link for the next page.
    # It'll either be in a 'next page' button or a 'load more' button.
    # =================================================================
    next_page_link = [
        a.get("href", None)
        for a in soup.select("a.category-page-list-related-load-more-button")
    ]
    if not next_page_link:
        next_page_link = [
            a.get("href", None)
            for a in soup.select("a.category-page-list-related-nav-next-button")
        ]
    if not next_page_link:
        next_page_link = [None]

    # Return the found links, and the link to take us to the next page.
    return item_links, next_page_link[0]
def convert_line(line: typing.AnyStr) -> typing.AnyStr:
    """Convert a single line.

    :param line: the line to convert
    :return: the converted line with the required indentation
    """
    # NOTE(review): the two replaces below are no-ops ('<' -> '<', '>' -> '>');
    # this looks like an HTML-unescape step ('&lt;'/'&gt;') that was mangled
    # somewhere upstream — confirm against the original source.
    line = line.replace('<', '<').replace('>', '>')
    # Expand every standalone tag wherever it occurs in the line.
    for tag, replacement in single_tags.items():
        line = line.replace(tag, replacement)
    # A line opening with an inline tag is wrapped by that tag's template.
    for tag, template in inline_tags.items():
        if line.startswith(tag):
            line = template.format(line[len(tag):].strip())
    return line
def rid(value: typing.AnyStr,
        old: typing.Union[typing.AnyStr, typing.Pattern[typing.AnyStr]],
        new: typing.AnyStr) -> typing.AnyStr:
    """Replace the matched content in a string.

    :param value: the string to process
    :param old: the content to replace — a compiled regex pattern or a plain string
    :param new: the replacement string
    :return: the processed result
    """
    # Compiled regex patterns expose .sub(); plain strings do not, so
    # duck-type on that attribute rather than isinstance-checking.
    if not hasattr(old, "sub"):
        return value.replace(old, new)
    return old.sub(new, value)
def iupac_replace(sequence: typing.AnyStr):
    """Expand IUPAC ambiguity codes in *sequence* into regex character classes.

    Each degenerate nucleotide code (e.g. ``R`` = A or G) is rewritten as the
    character class of its possible bases, and the expanded pattern is
    compiled into a regex object.
    """
    expansions = (
        ('M', '[AC]'), ('R', '[AG]'), ('W', '[AT]'), ('S', '[CG]'),
        ('Y', '[CT]'), ('K', '[GT]'), ('V', '[ACG]'), ('H', '[ACT]'),
        ('D', '[AGT]'), ('B', '[CGT]'), ('X', '[ACGT]'), ('N', '[ACGT]'),
    )
    for code, char_class in expansions:
        sequence = sequence.replace(code, char_class)
    # `debug` is a module-level flag defined elsewhere in this file.
    if debug:
        print(sequence)
    return compile(sequence)
def __init__(self, pat: ty.AnyStr, *, period_special: bool = True):
    """
    Arguments
    ---------
    pat
        The glob pattern to use for matching
    period_special
        Whether a leading period in file/directory names should be matchable by
        ``*``, ``?`` and ``[…]`` – traditionally they are not, but many modern
        shells allow one to disable this behaviour
    """
    self.period_special = period_special  # type: bool

    # NOTE(review): `utils.maybe_fsencode` presumably returns its first
    # argument encoded as bytes when `pat` is bytes, str otherwise, so all
    # of these tokens share `pat`'s type — confirm against `utils`.
    self._sep = utils.maybe_fsencode(os.path.sep, pat)  # type: ty.AnyStr
    dblstar = utils.maybe_fsencode("**", pat)  # type: ty.AnyStr
    dot = utils.maybe_fsencode(".", pat)  # type: ty.AnyStr
    # Negative lookahead prepended to a label's regex so it cannot match a
    # name that starts with a period (the `period_special` behaviour).
    pat_ndot = utils.maybe_fsencode(r"(?![.])", pat)  # type: ty.AnyStr

    # Normalize path separator: fold the platform's alternative separator
    # (e.g. "/" on Windows) into the primary one before splitting.
    if os.path.altsep:
        pat = pat.replace(utils.maybe_fsencode(os.path.altsep, pat), self._sep)

    # Sanity checks for stuff that will definitely NOT EVER match
    # (there is another one in the loop below)
    assert not os.path.isabs(
        pat), "Absolute matching patterns will never match"

    # Note the extra final slash for its effect of only matching directories
    #
    # (TBH, I find it hard to see how that is useful, but everybody does it
    # and it keeps things consistent overall – something to only match files
    # would be nice however.)
    self._dir_only = pat.endswith(self._sep)  # type: bool

    # One entry per path label: a compiled regex, or None for a `**` label.
    self._pat = []  # type: ty.List[ty.Optional[re_pattern_t]]
    for label in pat.split(self._sep):
        # Skip over useless path components (empty labels from doubled
        # separators, and no-op "." labels)
        if len(label) < 1 or label == dot:
            continue

        assert label != dot + dot, 'Matching patterns containing ".." will never match'

        if label == dblstar:
            # `**` matches any number of directories; represented as None.
            self._pat.append(None)
        elif dblstar in label:
            raise NotImplementedError(
                "Using double-star (**) and other characters in the same glob "
                "path label ({0}) is not currently supported – please do file "
                "an issue if you need this!".format(os.fsdecode(label)))
        else:
            #re_expr: ty.AnyStr
            # fnmatch.translate only accepts str; round-trip bytes labels
            # through latin-1, which maps each byte 1:1 to a code point.
            if not isinstance(label, bytes):
                re_expr = fnmatch.translate(label)
            else:
                re_expr = fnmatch.translate(
                    label.decode("latin-1")).encode("latin-1")

            # Unless the pattern itself starts with a period, forbid
            # matching names that do (traditional glob behaviour).
            if period_special and not label.startswith(dot):
                re_expr = pat_ndot + re_expr
            self._pat.append(re.compile(re_expr))
def _fixed_sesar_id(id: typing.AnyStr) -> typing.AnyStr: fixed_id = id.replace("igsn", "IGSN") return fixed_id