def book_href(html):
    links = [
        node.attributes.get("href")
        for node in HTMLParser(html).css("div.image_container > a")
    ]
    return links
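A minimal usage sketch for book_href(); the markup below mimics a books.toscrape.com-style catalogue page, where each cover link sits inside div.image_container. The sample fragment and href value are illustrative assumptions, not data from the original project.

# Hypothetical sample fragment; the href value is made up for illustration.
sample = (
    '<div class="image_container">'
    '<a href="catalogue/a-light-in-the-attic_1000/index.html"><img src="cover.jpg"></a>'
    '</div>'
)
print(book_href(sample))  # ['catalogue/a-light-in-the-attic_1000/index.html']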
def test_unwrap_tags():
    html_parser = HTMLParser('<div><a href="">Hello</a> <i>world</i>!</div>')
    html_parser.body.unwrap_tags(['i', 'a'])
    assert html_parser.body.html == '<body><div>Hello world!</div></body>'
def test_insert_after():
    html_parser = HTMLParser('<div>Get <img src="" alt="Laptop"></div>')
    img = html_parser.css_first('img')
    img.insert_after(img.attributes.get('alt', ''))
    assert html_parser.body.child.html == '<div>Get <img src="" alt="Laptop">Laptop</div>'
def test_get_node_id(html, expected):
    html_parser = HTMLParser(html)
    node = html_parser.css_first('div')
    assert node.id == expected
def test_text_node_returns_text_when_deep():
    html = '<div>foo bar</div>'
    html_parser = HTMLParser(html)
    node = html_parser.css_first('div').child
    assert node.text(deep=True) == 'foo bar'
import concurrent.futures
import os

import requests
from selectolax.parser import HTMLParser

base_url = 'https://pythonbytes.fm'
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 11.5; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/75.0.3629.169 Safari/537.36"
}
output_dir = 'MP3s/'

if not os.path.isdir(output_dir):
    os.mkdir(output_dir)


def scrape_podcast(link):
    print('[*] Scraping', base_url + link)
    selectolax = HTMLParser(requests.get(base_url + link, headers=headers).content)
    dl_link = base_url + str(
        selectolax.css_first('a.btn.btn-default.subscribe-btn.btn-sm').attrs['href'])
    file_name = dl_link.split('/')[-1]
    print('[+] Downloading', file_name)
    with open(output_dir + file_name, 'wb') as file:
        file.write(requests.get(dl_link, headers=headers).content)


if __name__ == '__main__':
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        executor.map(
            scrape_podcast,
            [node.attrs['href'] for node in
             HTMLParser(requests.get(base_url + '/episodes/all', headers=headers).content)
             .css('tr > :nth-child(3) > a')])
def parse_html_page(self, page) -> str:
    # The page must be rendered with requests_html so that all JavaScript is executed
    # before parsing.
    page.html.render()
    tree = HTMLParser(page.html.html)
    return tree.css_first(".posttitle").text()
for dir in range(4):
    for num in range(1, dir_nums[dir] + 1):
        print("\rProgress: {:.2f}%".format(files_checked / num_files * 100), end='')
        file_name = dir_names[dir] + "/" + dir_names[dir] + "." + str(num) + ".html"
        try:
            f = open("../input-indexing/" + file_name, 'r')
        except IOError:
            continue
        content = f.read()
        html = HTMLParser(content)
        if html.body is None:
            continue
        for tag in html.css('script'):
            tag.decompose()
        for tag in html.css('style'):
            tag.decompose()
        content = html.body.text(separator='\n')
        text = tokenize_text(content)
        content = preprocess(content)
        content, snippets = index_words(content, text)
        for word in content:
def parser(self):
    return HTMLParser(self.text)
async def get_weather(location: str) -> str:
    async with aiohttp.request('GET', domain + weather_request + quote(location),
                               headers={'User-Agent': user_agent}) as resp:
        search_text = await resp.text()
        title = HTMLParser(search_text).css_first('title').text()
        possible_href = str(resp.url)

    if title != 'Яндекс.Погода':
        # if we got rerouted straight to a weather page
        weather_text = search_text
        exact_location = ''
        for node in HTMLParser(weather_text).css('span.breadcrumbs__title'):
            exact_location += node.text() + ','
        exact_location = exact_location[:-1]
        href = possible_href
    else:
        # if we got a location list, as expected
        node = HTMLParser(search_text).css_first('div.grid__cell')
        if node is None:
            return f'По запросу "{location}" ничего не найдено'
        node = node.css_first('li.place-list__item')
        node = node.css_first('a')
        href = domain + node.attributes['href']
        exact_location = node.text()
        async with aiohttp.request('GET', href, headers={'User-Agent': user_agent}) as resp:
            weather_text = await resp.text()

    # parsing the weather card
    card = HTMLParser(weather_text).css_first('div.content__main') \
        .css_first('div.content__row').css_first('div.card')
    temp_info = card.css_first('div.fact__temp-wrap').css_first('a')
    now_temp = temp_info.css_first('div.fact__temp').css_first('span.temp__value').text()
    now_condition = temp_info.css_first('div.fact__feelings').css_first('div.link__condition').text()
    wind_info = card.css_first('div.fact__props').css_first('dl.fact__wind-speed').css_first('dd.term__value')
    now_wind = wind_info.css_first('span.wind-speed').text() + ' ' + wind_info.css_first('span.fact__unit').text()

    day_info = HTMLParser(weather_text).css_first('div.forecast-briefly').css_first('div.swiper-wrapper')
    # print(day_info.html)
    slide = None
    for day in day_info.css('div.swiper-slide'):
        text: str = day.text()
        if text.find('Сегодня') != -1:
            slide = day.css_first('a')
    day_temp = slide.css_first('div.forecast-briefly__temp_day').css_first('span.temp__value').text()
    night_temp = slide.css_first('div.forecast-briefly__temp_night').css_first('span.temp__value').text()
    condition = slide.css_first('div.forecast-briefly__condition').text()

    return f'Место: {exact_location}' \
           f'\n\nСЕЙЧАС:\nТемпература: {now_temp}\nСостояние: {now_condition}\nВетер: {now_wind}' \
           f'\n\nСЕГОДНЯ:\nТемпература днем: {day_temp}\nТемпература ночью: {night_temp}\nСостояние: {condition}' \
           f'\n\nПолный прогноз: {href}'
def cli(url, repositories, search, rows, minstar, token, output_file_name, max_repos_retrieved):
    MODE = os.environ.get("GHTOPDEP_ENV")
    REPOS_PER_FILE_SIZE_LIMIT = 3000
    if search and token:
        gh = github3.login(token=token)
        CacheControl(gh.session, cache=FileCache(CACHE_DIR), heuristic=OneDayHeuristic())
    elif search and not token:
        click.echo("Please provide token")
        sys.exit()

    destination = "repository"
    destinations = "repositories"
    if not repositories:
        destination = "package"
        destinations = "packages"

    repos = []
    more_than_zero_count = 0
    total_repos_count = 0
    # spinner = Halo(text="Fetching information about {0}".format(destinations), spinner="dots")
    # spinner.start()
    sess = requests.session()
    retries = Retry(total=15, backoff_factor=15, status_forcelist=[429])
    adapter = CacheControlAdapter(max_retries=retries, cache=FileCache(CACHE_DIR),
                                  heuristic=OneDayHeuristic())
    sess.mount("http://", adapter)
    sess.mount("https://", adapter)

    page_url = get_page_url(sess, url, destination)
    found_repos = 0
    total_found_repos = 0
    number_of_files_processed = 0

    while True:
        time.sleep(1)
        response = sess.get(page_url)
        print(page_url)
        parsed_node = HTMLParser(response.text)
        dependents = parsed_node.css(ITEM_SELECTOR)
        total_repos_count += len(dependents)
        for dep in dependents:
            repo_stars_list = dep.css(STARS_SELECTOR)
            # only for ghost or private? packages
            if repo_stars_list:
                repo_stars = repo_stars_list[0].text().strip()
                repo_stars_num = int(repo_stars.replace(",", ""))
            else:
                continue

            if repo_stars_num != 0:
                more_than_zero_count += 1
            if repo_stars_num >= minstar:
                relative_repo_url = dep.css(REPO_SELECTOR)[0].attributes["href"]
                repo_url = "{0}{1}".format(GITHUB_URL, relative_repo_url)
                # the same package can be listed more than once
                is_already_added = already_added(repo_url, repos)
                if not is_already_added and repo_url != url:
                    # print("adding repo ", repo_url)
                    found_repos += 1
                    total_found_repos += 1
                    repos.append({"url": repo_url, "stars": repo_stars_num})
                    if found_repos >= REPOS_PER_FILE_SIZE_LIMIT:
                        sorted_repos = repos
                        repos = []
                        number_of_files_processed += 1
                        found_repos = 0
                        show_result(sorted_repos, total_repos_count, more_than_zero_count,
                                    destinations, number_of_files_processed, output_file_name)
                        print("JSON output placed into file!")
                    if total_found_repos > max_repos_retrieved:
                        print(f'Collected {total_found_repos} repos.')
                        sys.exit()

        node = parsed_node.css(NEXT_BUTTON_SELECTOR)
        if len(node) == 2:
            page_url = node[1].attributes["href"]
        elif len(node) == 0 or node[0].text() == "Previous":
            # spinner.stop()
            break
        elif node[0].text() == "Next":
            page_url = node[0].attributes["href"]

    sorted_repos = repos
    if search:
        for repo in repos:
            repo_path = urlparse(repo["url"]).path[1:]
            for s in gh.search_code("{0} repo:{1}".format(search, repo_path)):
                click.echo("{0} with {1} stars".format(s.html_url, repo["stars"]))
    elif number_of_files_processed == 0:
        show_result(sorted_repos, total_repos_count, more_than_zero_count, destinations,
                    number_of_files_processed, output_file_name)
def product_desc(html):
    desc = HTMLParser(html).css_first("article > p").text()
    return desc
def stock_num(html):
    num = HTMLParser(html).css_first("p.availability").text()
    num = re.sub(r"\D", "", num)
    return num
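A quick, hedged usage sketch for stock_num(): the p.availability fragment below is an assumed example of the kind of text the regex strips down to digits.

# Hypothetical availability snippet; the count is illustrative only.
sample = '<p class="availability">In stock (22 available)</p>'
print(stock_num(sample))  # '22'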
def book_rating(html):
    ratings = [
        # str.strip() removes a set of characters, not a prefix (it would turn
        # "Four" into "Fou"), so drop the leading "star-rating" class with replace().
        node.attributes.get("class").replace("star-rating", "").strip()
        for node in HTMLParser(html).css("p.star-rating")
    ]
    return ratings
def cli(url, repositories, rows, minstar, description, token):
    if description and token:
        gh = github3.login(token=token)
        CacheControl(gh.session, cache=FileCache(".ghtopdep_cache"), heuristic=OneDayHeuristic())
        Repo = namedtuple("Repo", ["url", "stars", "description"])
    elif description and not token:
        click.echo("Please provide token")
    else:
        Repo = namedtuple("Repo", ["url", "stars"])

    destination = "repository"
    destinations = "repositories"
    if not repositories:
        destination = "package"
        destinations = "packages"
    page_url = "{0}/network/dependents?dependent_type={1}".format(url, destination.upper())

    repos = []
    more_than_zero_count = 0
    total_repos_count = 0
    spinner = Halo(text="Fetching information about {0}".format(destinations), spinner="dots")
    spinner.start()

    sess = requests.session()
    cached_sess = CacheControl(sess, cache=FileCache(".ghtopdep_cache"), heuristic=OneDayHeuristic())
    while True:
        response = cached_sess.get(page_url)
        parsed_node = HTMLParser(response.text)
        dependents = parsed_node.css(ITEM_SELECTOR)
        total_repos_count += len(dependents)
        for dep in dependents:
            repo_stars_list = dep.css(STARS_SELECTOR)
            # only for ghost or private? packages
            if repo_stars_list:
                repo_stars = dep.css(STARS_SELECTOR)[0].text().strip()
                repo_stars_num = int(repo_stars.replace(",", ""))
            else:
                continue

            if repo_stars_num != 0:
                more_than_zero_count += 1
            if repo_stars_num >= minstar:
                relative_repo_url = dep.css(REPO_SELECTOR)[0].attributes["href"]
                repo_url = "{0}{1}".format(GITHUB_URL, relative_repo_url)
                # the same package can be listed more than once
                is_already_added = already_added(repo_url, repos)
                if not is_already_added and repo_url != url:
                    if description:
                        repo_description = fetch_description(gh, relative_repo_url)
                        repos.append(Repo(repo_url, repo_stars_num, repo_description))
                    else:
                        repos.append(Repo(repo_url, repo_stars_num))

        node = parsed_node.css(NEXT_BUTTON_SELECTOR)
        if len(node) == 2:
            page_url = node[1].attributes["href"]
        elif len(node) == 0 or node[0].text() == "Previous":
            spinner.stop()
            break
        elif node[0].text() == "Next":
            page_url = node[0].attributes["href"]

    sorted_repos = sort_repos(repos, rows)
    show_result(sorted_repos, total_repos_count, more_than_zero_count, destination, destinations)
def fetch_resource_iteratively(self, ingest_type: str, base_url: str,
                               force_recrawl: bool) -> dict:
    """
    This is copypasta from process_file(), should probably refactor.
    """

    result: Dict[str, Any] = dict(hit=False)
    result["hops"] = [base_url]
    next_url = base_url

    # check against blocklist
    for block in self.base_url_blocklist:
        # NOTE: hack to not skip archive.org content
        if "archive.org" in block:
            continue
        if block in next_url:
            result["status"] = "skip-url-blocklist"
            return result

    try:
        resource = self.find_resource(next_url, force_recrawl=force_recrawl)
    except SavePageNowError as e:
        result["status"] = "spn2-error"
        result["error_message"] = str(e)[:1600]
        return result
    except PetaboxError as e:
        result["status"] = "petabox-error"
        result["error_message"] = str(e)[:1600]
        return result
    except CdxApiError as e:
        result["status"] = "cdx-error"
        result["error_message"] = str(e)[:1600]
        # add a sleep in cdx-error path as a slow-down
        time.sleep(2.0)
        return result
    except WaybackError as e:
        result["status"] = "wayback-error"
        result["error_message"] = str(e)[:1600]
        return result
    except WaybackContentError as e:
        result["status"] = "wayback-content-error"
        result["error_message"] = str(e)[:1600]
        return result
    except NotImplementedError:
        # result['status'] = 'not-implemented'
        # result['error_message'] = str(e)[:1600]
        # return result
        resource = None

    html_biblio = None
    if resource:
        if resource.terminal_url:
            result["terminal"] = {
                "terminal_url": resource.terminal_url,
                "terminal_dt": resource.terminal_dt,
                "terminal_status_code": resource.terminal_status_code,
            }
            if resource.terminal_url not in result["hops"]:
                result["hops"].append(resource.terminal_url)

        if not resource.hit:
            result["status"] = resource.status
            return result

        if resource.terminal_url:
            for pattern in self.base_url_blocklist:
                if pattern in resource.terminal_url:
                    result["status"] = "skip-url-blocklist"
                    return result

        if resource.terminal_url:
            for pattern in self.cookie_blocklist:
                if pattern in resource.terminal_url:
                    result["status"] = "blocked-cookie"
                    return result

        if not resource.body:
            result["status"] = "null-body"
            return result

        if len(resource.body) > MAX_BODY_SIZE_BYTES:
            result["status"] = "body-too-large"
            return result

        file_meta = gen_file_metadata(resource.body)
        try:
            file_meta, resource = fix_transfer_encoding(file_meta, resource)
        except Exception as e:
            result["status"] = "bad-gzip-encoding"
            result["error_message"] = str(e)
            return result

        if not resource.body or file_meta["size_bytes"] == 0:
            result["status"] = "null-body"
            return result

        # here we split based on ingest type to try and extract a next hop
        html_ish_resource = bool(
            "html" in file_meta["mimetype"]
            or "xhtml" in file_meta["mimetype"]  # matches "application/xhtml+xml"
            or "application/xml" in file_meta["mimetype"]
            or "text/xml" in file_meta["mimetype"])
        html_biblio = None
        html_doc = None
        if html_ish_resource and resource.body:
            try:
                html_doc = HTMLParser(resource.body)
                html_biblio = html_extract_biblio(resource.terminal_url, html_doc)
                if html_biblio:
                    if "html_biblio" not in result and html_biblio.title:
                        result["html_biblio"] = json.loads(
                            html_biblio.json(exclude_none=True))
                        # print(f" setting html_biblio: {result['html_biblio']}", file=sys.stderr)
            except ValueError:
                pass

    # fetch must be a hit if we got this far (though not necessarily an ingest hit!)
    assert resource
    assert resource.hit is True
    assert resource.terminal_status_code in (200, 226)

    if resource.terminal_url:
        result["terminal"] = {
            "terminal_url": resource.terminal_url,
            "terminal_dt": resource.terminal_dt,
            "terminal_status_code": resource.terminal_status_code,
            "terminal_sha1hex": file_meta["sha1hex"],
        }

    result["file_meta"] = file_meta
    result["cdx"] = cdx_to_dict(resource.cdx)
    if resource.revisit_cdx:
        result["revisit_cdx"] = cdx_to_dict(resource.revisit_cdx)

    if ingest_type == "pdf":
        if file_meta["mimetype"] != "application/pdf":
            result["status"] = "wrong-mimetype"  # formerly: "other-mimetype"
            return result
    elif ingest_type == "xml":
        if file_meta["mimetype"] not in (
                "application/xml",
                "text/xml",
                "application/jats+xml",
        ):
            result["status"] = "wrong-mimetype"
            return result
    elif ingest_type == "html":
        if file_meta["mimetype"] not in ("text/html", "application/xhtml+xml"):
            result["status"] = "wrong-mimetype"
            return result
    else:
        # raise NotImplementedError()
        pass

    result["_html_biblio"] = html_biblio
    result["_resource"] = resource
    return result
def multi_parser(self, html, link):
    # soup = BeautifulSoup(html, 'html.parser')
    if "khan" in link:
        selector = "#articleBody > p"
    elif "kmib" in link:
        selector = "#articleBody"
    elif "kookje" in link:
        selector = ".news_article"
    elif "naeil" in link:
        selector = "#contents > p"
    elif "donga" in link:
        selector = ".article_txt"  # screw Donga
    elif "dt.co.kr" in link:
        selector = ".art_txt"
    elif "mk.co.kr" in link:  # BigKinds URL error :(
        selector = "#article_body"
    elif "imaeil" in link:
        selector = ".article_area > p"
    elif "moneytoday" in link:
        selector = "#textBody"  # redirect error
    elif "munhwa" in link:
        selector = "#NewsAdContent"
    elif "sedaily" in link:
        selector = ".view_con"
    elif "segye" in link:
        selector = "#article_txt > article > p"
    elif "asiae." in link:
        selector = "#txt_area > p"
    elif "ajunews." in link:
        selector = "#articleBody"
    elif "etnews." in link:
        selector = "#articleBody > p"
    elif "chosun." in link:
        selector = "#news_body_id"
    elif "joins." in link:
        selector = "#article_body"
    elif "fnnews." in link:
        selector = "#article_content"
    elif "hani." in link:
        selector = "#contents-article .text"
    elif "hankyung." in link:
        selector = "#articletxt"
    elif "hankookilbo." in link:  # BigKinds URL error
        selector = "#article_story"
    elif "heraldcorp." in link:
        selector = "#articleText > p"
    elif "kbs." in link:
        selector = "#cont_newstext"
    elif "imbc." in link:
        selector = ".txt"
    elif "obsnews." in link:
        selector = "#CmAdContent"
    elif "sbs." in link:
        selector = ".text_area"
    elif "ytn." in link:
        selector = "#CmAdContent > span"
    elif "naver." in link:
        selector = "#articleBodyContents"
    else:
        self.error_cnt['parse'] += 1
        return "ERR"

    text = ""
    for node in HTMLParser(html).css(selector):
        text += node.text()
    result = re.sub('\xa0', '', text)
    return result.split(".")
def test_css_first_default():
    html = "<span></span><div><p class='p3'>text</p><p class='p3'>sd</p></div><p></p>"
    selector = ".s3"
    assert HTMLParser(html).css_first(selector, default='lorem ipsum') == 'lorem ipsum'
def parse_html_page(self, page):
    selector = ".markdown-body > p:nth-child(4)"
    tree = HTMLParser(page.text)
    with suppress(IndexError):
        return tree.css(selector)[0].text()
    return ""
def test_malformed_attributes():
    # the mis-encoded content attribute is intentional; the test exercises malformed attributes
    html = '<div> <meta name="description" content="ÐаÑ"Ð " /></div>'
    html_parser = HTMLParser(html)
    for tag in html_parser.tags('meta'):
        assert tag
def parse_html_page(self, page) -> str:
    tree = HTMLParser(page.text)
    return tree.css_first("h1").text()
def cli(url, repositories, search, table, rows, minstar, report, description, token):
    MODE = os.environ.get("GHTOPDEP_ENV")
    BASE_URL = 'https://437w61gcj1.execute-api.us-west-2.amazonaws.com/api'
    if MODE == "development":
        BASE_URL = 'http://127.0.0.1:8080'

    if report:
        try:
            result = requests.get('{}/repos?url={}'.format(BASE_URL, url))
            if result.status_code != 404:
                sorted_repos = sort_repos(result.json()['deps'], rows)
                repos = readable_stars(sorted_repos)
                click.echo(tabulate(repos, headers="keys", tablefmt="github"))
                sys.exit()
        except requests.exceptions.ConnectionError as e:
            click.echo(e)

    if (description or search) and token:
        gh = github3.login(token=token)
        CacheControl(gh.session, cache=FileCache(CACHE_DIR), heuristic=OneDayHeuristic())
    elif (description or search) and not token:
        click.echo("Please provide token")
        sys.exit()

    destination = "repository"
    destinations = "repositories"
    if not repositories:
        destination = "package"
        destinations = "packages"
    page_url = "{0}/network/dependents?dependent_type={1}".format(url, destination.upper())

    repos = []
    more_than_zero_count = 0
    total_repos_count = 0
    spinner = Halo(text="Fetching information about {0}".format(destinations), spinner="dots")
    spinner.start()

    sess = requests.session()
    retries = Retry(total=15, backoff_factor=15, status_forcelist=[429])
    adapter = CacheControlAdapter(max_retries=retries, cache=FileCache(CACHE_DIR),
                                  heuristic=OneDayHeuristic())
    sess.mount("http://", adapter)
    sess.mount("https://", adapter)

    while True:
        response = sess.get(page_url)
        parsed_node = HTMLParser(response.text)
        dependents = parsed_node.css(ITEM_SELECTOR)
        total_repos_count += len(dependents)
        for dep in dependents:
            repo_stars_list = dep.css(STARS_SELECTOR)
            # only for ghost or private? packages
            if repo_stars_list:
                repo_stars = repo_stars_list[0].text().strip()
                repo_stars_num = int(repo_stars.replace(",", ""))
            else:
                continue

            if repo_stars_num != 0:
                more_than_zero_count += 1
            if repo_stars_num >= minstar:
                relative_repo_url = dep.css(REPO_SELECTOR)[0].attributes["href"]
                repo_url = "{0}{1}".format(GITHUB_URL, relative_repo_url)
                # the same package can be listed more than once
                is_already_added = already_added(repo_url, repos)
                if not is_already_added and repo_url != url:
                    if description:
                        repo_description = fetch_description(gh, relative_repo_url)
                        repos.append({
                            "url": repo_url,
                            "stars": repo_stars_num,
                            "description": repo_description
                        })
                    else:
                        repos.append({
                            "url": repo_url,
                            "stars": repo_stars_num
                        })

        node = parsed_node.css(NEXT_BUTTON_SELECTOR)
        if len(node) == 2:
            page_url = node[1].attributes["href"]
        elif len(node) == 0 or node[0].text() == "Previous":
            spinner.stop()
            break
        elif node[0].text() == "Next":
            page_url = node[0].attributes["href"]

    if report:
        try:
            requests.post('{}/repos'.format(BASE_URL), json={"url": url, "deps": repos})
        except requests.exceptions.ConnectionError as e:
            click.echo(e)

    sorted_repos = sort_repos(repos, rows)
    if search:
        for repo in repos:
            repo_path = urlparse(repo["url"]).path[1:]
            for s in gh.search_code("{0} repo:{1}".format(search, repo_path)):
                click.echo("{0} with {1} stars".format(s.html_url, repo["stars"]))
    else:
        show_result(sorted_repos, total_repos_count, more_than_zero_count, destinations, table)
def test_html_attribute_works_for_text():
    html = '<div>foo bar</div>'
    html_parser = HTMLParser(html)
    node = html_parser.css_first('div').child
    assert node.html == 'foo bar'
def extract_infos(headers, content):
    data = dict()
    headers = {k.lower(): v for k, v in headers.items()}

    # check wp version
    wp_version = re.findall(
        r'wp-(?:emoji-release|embed)\.min\.js.*ver=(.*?)[\"\']', content)
    if wp_version:
        wp_version = wp_version[0]

    cms = 'Default'
    version = 'version'
    dom = HTMLParser(content)
    for tag in dom.tags('meta'):
        attrs = tag.attributes
        if 'name' in attrs:
            if 'generator' == attrs['name'].lower():
                cms = attrs['content']
                version = re.findall(r'\d+\.*\d*\.*\d*', cms)
                if version:
                    version = version[0]
                    cms = re.sub(re.escape(version), '', cms).strip()

    if cms == 'Default':
        if 'x-powered-by' in headers.keys():
            cms = headers.get('x-powered-by')
            if 'x-aspnet-version' in headers.keys():
                version = headers.get('x-aspnet-version')
        elif 'magento' in content.lower():
            cms = 'Magento'
        elif 'shopify' in content.lower():
            cms = 'Shopify'
        elif 'squarespace' in content.lower():
            cms = 'Squarespace'
        elif 'blogger.com' in content.lower():
            cms = 'Blogger'
        elif 'typo3' in content.lower():
            cms = 'TYPO3'
        elif 'opencart' in content.lower():
            cms = 'OpenCart'
        elif 'joomla' in content.lower():
            cms = 'Joomla'
        elif 'prestashop' in content.lower():
            cms = 'Prestashop'
        elif 'wordpress' in content.lower():
            cms = 'Wordpress'
        elif 'drupal' in content.lower():
            cms = 'Drupal'

    data['cms'] = cms
    if wp_version:
        data['version'] = wp_version
    else:
        data['version'] = version

    for key in headers.keys():
        if 'server' == key or 'x-server' == key:
            data['server'] = headers.get(key)
        if key.startswith('x-') and headers.get(key) not in data.values():
            data[key] = headers.get(key)

    plugins = re.findall(r'wp-content/plugins/(.*?)/.*ver=(.*?)[\s\'\"]', content)
    if plugins:
        data = append_info(plugins, data, 'Plugins')

    wp_themes = re.findall(r'/wp-content/themes/(.*)/.*?ver=(.*?)[\s\'\"]', content)
    if wp_themes:
        data = append_info(wp_themes, data, 'Themes')

    drupal_modules = re.findall(r'/modules/.*/(.*?)\.css\?v=(.*?)[\s\"\']', content)
    if drupal_modules:
        data = append_info(drupal_modules, data, 'Plugins')

    drupal_themes = re.findall(r'/themes/.*?/(.*)/css.*?v=(.*?)[\s\'\"]', content)
    if drupal_themes:
        data = append_info(drupal_themes, data, 'Themes')

    return data
def test_unwrap():
    html = '<a id="url" href="https://rushter.com/">I linked to <i>rushter.com</i></a>'
    html_parser = HTMLParser(html)
    node = html_parser.css_first('i')
    node.unwrap()
    assert html_parser.body.child.html == '<a id="url" href="https://rushter.com/">I linked to rushter.com</a>'
def leopold_sold_out_html(leopold_sold_out) -> HTMLParser:
    yield HTMLParser(leopold_sold_out.read())
def test_replace_with_multiple_nodes():
    html_parser = HTMLParser(
        '<div>Get <span alt="Laptop"><img src="/jpg"> <div></div></span></div>')
    img = html_parser.css_first('span')
    img.replace_with(img.attributes.get('alt', ''))
    assert html_parser.body.child.html == '<div>Get Laptop</div>'
def leopold_in_stock_html(leopold_in_stock) -> HTMLParser:
    yield HTMLParser(leopold_in_stock.read())
def test_attrs_sets_attribute():
    html_parser = HTMLParser('<div id="id"></div>')
    node = html_parser.css_first('div')
    node.attrs['id'] = 'new_id'
    assert node.attributes == {'id': 'new_id'}
def in_stock(html):
    stock_status = [node.text() for node in HTMLParser(html).css("p.instock")]
    stock_status = [re.sub(r"\W", "", i) for i in stock_status]
    return stock_status
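A similar hedged sketch for in_stock(): the fragment below assumes a p element carrying the instock class, as on typical catalogue listings, and re.sub(r"\W", ...) then strips whitespace from the matched text.

# Hypothetical fragment; real pages may carry extra classes and whitespace.
sample = '<p class="instock availability">In stock</p>'
print(in_stock(sample))  # ['Instock']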