def getallcheatinfo(page_request):
    cd_strain = bs4.SoupStrainer(class_="arcode-header")
    cheat_strain = bs4.SoupStrainer(class_="arcode-float")
    cd_soup = bs4.BeautifulSoup(page_request.data, 'lxml', parse_only=cd_strain).find_all('a')
    cheat_soup = bs4.BeautifulSoup(page_request.data, 'lxml', parse_only=cheat_strain)
    # Creates | delimited string containing all cheat codes
    cheatcodes = str(cheat_soup) \
        .replace("<!DOCTYPE HTML>", "") \
        .replace("</div>", "") \
        .replace('<div class="arcode-float">', "") \
        .replace("<textarea readonly=\"\">", "") \
        .replace("</textarea>", "|") \
        .replace("\r", "") \
        .replace("\n", " ") \
        .rstrip("|") \
        .strip()
    i = 0
    q = ""
    y = 0
    while i < len(cd_soup):
        p = re.search("(?<=>)(.*)(?=<)", str(cd_soup[i]))
        q += str(p.group()) + "^" + cheatcodes.split("|")[y] + "|"
        i += 2
        y += 1
    allcheatinfo = q.rstrip("|")
    return allcheatinfo
def boosted_creature_from_header(cls, content):
    """Get the boosted creature from any Tibia.com page.

    Parameters
    ----------
    content: :class:`str`
        The HTML content of a Tibia.com page.

    Returns
    -------
    :class:`CreatureEntry`
        The boosted creature of the day.

    Raises
    ------
    InvalidContent
        If content is not the HTML of a Tibia.com page.
    """
    try:
        parsed_content = bs4.BeautifulSoup(content.replace('ISO-8859-1', 'utf-8'), "lxml",
                                           parse_only=bs4.SoupStrainer("div", attrs={"id": "RightArtwork"}))
        img = parsed_content.find("img", attrs={"id": "Monster"})
        name = img["title"].replace(BOOSTED_ALT, "").strip()
        image_url = img["src"]
        identifier = image_url.split("/")[-1].replace(".gif", "")
        return CreatureEntry(name, identifier)
    except TypeError as e:
        raise InvalidContent("content is not from Tibia.com", e)
async def get_max_page(self):
    async with aiohttp.ClientSession() as session:
        task = asyncio.tasks.create_task(fetch(session, self.url))
        page = await asyncio.gather(task)
        for one in tqdm(page, desc="Fetching max page", bar_format=self.printstring):
            strainer = bs4.SoupStrainer("div", {'class': ['jix_pagination_pages']})
            soup = bs4.BeautifulSoup(one.decode('utf-8'), "lxml", parse_only=strainer)
            pages = soup.find("div", {'class': ['jix_pagination_pages']})
            hrefs = []
            for element in pages:
                if 'href' in str(element):
                    try:
                        hrefs.append(int(element.contents[0]))
                    except Exception:
                        # non-numeric pagination links (e.g. "next") are skipped
                        pass
            self.max_page = max(hrefs) + 1
async def update_menus(day):
    html = await _fetch.fetch_data_get_text(_menu_url)
    if html == -1:
        return -1
    html = bs4.BeautifulSoup(html, "html.parser",
                             parse_only=bs4.SoupStrainer(id="menu-repas"))
    for menu in html.find_all("h3"):
        embed = discord.Embed()
        embed.type = "rich"
        embed.title = "Resto’ U de l’Illberg"
        embed.url = _menu_url
        embed.colour = random.randint(0, 16581375)
        embed.description = menu.text.strip()
        menu = menu.next_element.next_element.next_element
        menu = menu.find("h4", string=re.compile("Déjeuner")).next_element.next_element.next_element
        for group in menu.find_all('span'):
            name = group.text.strip()
            value = ""
            group = group.next_element.next_element
            try:
                for field in group.find_all('li'):
                    value += field.text.strip() + "\n"
            except AttributeError:
                pass
            if value == "":
                value = "¯\\_(ツ)_/¯"
            embed.add_field(name=name, value=value, inline=False)
        _menus[embed.description] = embed
    try:
        return _menus[day]
    except KeyError:
        return -1
def find_form_request(html: str):
    soup = bs4.BeautifulSoup(html, "html.parser", parse_only=bs4.SoupStrainer("form"))
    form = soup.form
    if not form:
        raise _exception.ParseError("Could not find form to submit", data=html)

    url = form.get("action")
    if not url:
        raise _exception.ParseError("Could not find url to submit to", data=form)

    # From what I've seen, it'll always do this!
    if url.startswith("/"):
        url = "https://www.facebook.com" + url

    # It's okay to set missing values to something crap, the values are localized, and
    # hence are not available in the raw HTML
    data = {
        x["name"]: x.get("value", "[missing]")
        for x in form.find_all(["input", "button"])
    }

    return url, data
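# Hedged usage sketch, not part of the original source: it assumes the
# find_form_request helper above and shows the (url, data) pair it returns
# for a minimal hard-coded login form. The field names here are illustrative.
_example_html = """
<form action="/login/device-based/regular/login/" method="post">
    <input name="lsd" value="token123"/>
    <input name="email"/>
    <button name="login" value="1">Log In</button>
</form>
"""
_url, _data = find_form_request(_example_html)
# _url  -> "https://www.facebook.com/login/device-based/regular/login/"
# _data -> {"lsd": "token123", "email": "[missing]", "login": "1"}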
def _data_from_xml(
        fpath_xml: str) -> Dict[str, Union[str, Dict[str, np.ndarray]]]:
    ecg_data = dict()

    # define tags that we want to find and use SoupStrainer to speed up search
    tags = [
        "patientdemographics",
        "testdemographics",
        "order",
        "restingecgmeasurements",
        "originalrestingecgmeasurements",
        "diagnosis",
        "originaldiagnosis",
        "intervalmeasurementtimeresolution",
        "intervalmeasurementamplituderesolution",
        "intervalmeasurementfilter",
        "waveform",
    ]
    strainer = bs4.SoupStrainer(tags)

    # lxml parser makes all tags lower case
    with open(fpath_xml, "r") as f:
        soup = bs4.BeautifulSoup(f, "lxml", parse_only=strainer)

    for tag in tags:
        tag_suffix = ""
        if tag == "restingecgmeasurements":
            tag_suffix = "_md"
        elif tag == "originalrestingecgmeasurements":
            tag_suffix = "_pc"
        elif tag == "diagnosis":
            soup_tag = soup.find(tag)
            if soup_tag is not None:
                ecg_data["diagnosis_md"] = _parse_soup_diagnosis(soup_tag)
            continue
        elif tag == "originaldiagnosis":
            soup_tag = soup.find(tag)
            if soup_tag is not None:
                ecg_data["diagnosis_pc"] = _parse_soup_diagnosis(soup_tag)
            continue
        elif tag == "waveform":
            voltage_data = _get_voltage_from_waveform_tags(soup.find_all(tag))
            ecg_data.update(voltage_data)
            continue

        soup_tag = soup.find(tag)
        if soup_tag is not None:
            # find sub tags
            soup_sub_tags = soup_tag.find_all()

            # if there are no sub tags, use original tag
            if len(soup_sub_tags) == 0:
                soup_sub_tags = [soup_tag]

            ecg_data.update(
                {st.name + tag_suffix: st.text for st in soup_sub_tags})

    return ecg_data
def from_content(cls, content):
    """Gets the boosted creature from any Tibia.com page.

    Parameters
    ----------
    content: :class:`str`
        The HTML content of a Tibia.com page.

    Returns
    -------
    :class:`News`
        The boosted article shown.

    Raises
    ------
    InvalidContent
        If content is not the HTML of a Tibia.com page.
    """
    try:
        parsed_content = bs4.BeautifulSoup(content.replace('ISO-8859-1', 'utf-8'), "lxml",
                                           parse_only=bs4.SoupStrainer("div", attrs={"id": "RightArtwork"}))
        img = parsed_content.find("img", attrs={"id": "Monster"})
        name = img["title"].replace(BOOSTED_ALT, "").strip()
        image_url = img["src"]
        return cls(name, image_url)
    except TypeError:
        raise InvalidContent("content is not from Tibia.com")
def get_error_data(html: str) -> Optional[str]:
    """Get error message from a request."""
    soup = bs4.BeautifulSoup(
        html, "html.parser", parse_only=bs4.SoupStrainer("form", id="login_form")
    )
    # Attempt to extract and format the error string
    return " ".join(list(soup.stripped_strings)[1:3]) or None
def submitForms(url, session, value):
    html = session.get(url).text
    soup = bs4.BeautifulSoup(html, "html.parser", parse_only=bs4.SoupStrainer("form"))
    responses = list()
    for form in soup:
        if form.get('method') is None:
            method = "GET"
        else:
            method = form['method'].upper()
        if form.get('action') is None:
            submiturl = url
        else:
            submiturl = url[0:url.rfind("/")] + "/" + form['action']
        parameters = setFormParams(form, value)
        start = time.time()
        if method == "POST":
            response = session.post(url=submiturl, data=parameters)
        else:
            response = session.request(method=method, url=submiturl, params=parameters)
        end = time.time()
        responses.append([response, end - start, parameters])
    return responses
def __init__(self, address, ssl=True, testString=None):
    self.address = address.strip()
    if "http" not in self.address:
        if ssl:
            self.address = "https://%s" % (self.address)
        else:
            self.address = "http://%s" % (self.address)
    self.http = httplib2.Http()
    self.testString = testString
    self.testStatus = None
    try:
        """
        If self.status and self.response, go ahead and get the "body" part
        of the page and do runCheck
        """
        self.status, self.response = self.http.request(self.address, redirections=0)
        self.page = bs4.BeautifulSoup(self.response,
                                      parse_only=bs4.SoupStrainer("body"),
                                      features="html.parser").text
    except TimeoutError:
        """
        If there's a timeout error, indicate that in the response and testStatus.
        We want to do this instead of just setting self.testStatus to False
        because we can't even run the test to see if the string is on the page.
        """
        self.status, self.response, self.page, self.testStatus = False, "Timeout Error", False, "Timeout Error"
    self.runCheck()
def rand_page(user_id):
    link = 'https://vk.com/id'
    url = str(random.randint(1, 1000000000))
    not_created = []
    only_content = bs4.SoupStrainer("div", id='content')
    r = requests.get(link + url, stream=True, headers={'User-Agent': UserAgent().chrome})
    soup = bs4.BeautifulSoup(r.text, "lxml", parse_only=only_content)
    blocked = soup.find_all('h5', class_="profile_blocked")
    not_created = soup.find_all('div', class_='message_page page_block')
    if blocked != []:
        bot.send_message(user_id, '*User is blocked :(*\n' + link + url,
                         parse_mode='Markdown', reply_markup=main_menu)
        return
    if not_created != []:
        # page does not exist, retry with another random id
        rand_page(user_id)
        return
    try:
        name = soup.find('h1', class_='page_name').text
        bot.send_message(user_id, '*User alive!*\nName: ' + str(name) + '\n' + link + url,
                         parse_mode='Markdown', reply_markup=callback)
        print(str(user_id), 'Page found!')
        return
    except AttributeError:
        print('Error: page has no name element')
def get_soup(school_id, page_id):
    """Requests a page from CollegeData.com corresponding to the provided
    school_id and page_id and converts the response to a BeautifulSoup object.
    """

    # Build URL
    url = url_pt1 + str(page_id) + url_pt2 + str(school_id)

    # Request the url and raise exception if something strange returned.
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        msg = url + ' gave status code ' + str(response.status_code)
        logging.warning(msg)
        raise IOError

    # Limit HTML parsing to only <h1> tags or the tag <div id='tabcontwrap'>.
    strainer = bs4.SoupStrainer(
        lambda name, attrs: name == 'h1' or attrs.get('id') == 'tabcontwrap')

    # Parse response text into a BeautifulSoup object.
    soup = bs4.BeautifulSoup(markup=response.text, features="lxml", parse_only=strainer)

    # Raise an error if the <h1> tag contained the empty page string.
    # It's not really an error, as it's expected that many school_id will
    # not correspond to a page with actual school information, but this will
    # allow the scraper to skip any further attempts at more pages for this
    # school_id, saving time.
    if soup.h1.string == empty_h1_string:
        msg = 'School ID ' + str(school_id) + ' has no info.'
        logging.info(msg)
        raise LookupError

    return soup
def parse_tibiacom_content(content, *, html_class="BoxContent", tag="div", builder="lxml"):
    """Parses HTML content from Tibia.com into a BeautifulSoup object.

    Parameters
    ----------
    content: :class:`str`
        The raw HTML content from Tibia.com.
    html_class: :class:`str`
        The HTML class of the parsed element. The default value is ``BoxContent``.
    tag: :class:`str`
        The HTML tag to select. The default value is ``div``.
    builder: :class:`str`
        The builder to use. The default value is ``lxml``.

    Returns
    -------
    :class:`bs4.BeautifulSoup`, optional
        The parsed content.
    """
    strainer = bs4.SoupStrainer(tag, class_=html_class) if builder != "html5lib" else None
    return bs4.BeautifulSoup(content.replace('ISO-8859-1', 'utf-8'), builder, parse_only=strainer)
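# Hedged usage sketch, not part of the original source: it assumes the
# parse_tibiacom_content helper above plus the requests package; the URL is
# only an illustrative Tibia.com page.
import requests

_response = requests.get("https://www.tibia.com/community/")
_parsed = parse_tibiacom_content(_response.text)
# Only the <div class="BoxContent"> subtree was parsed, so lookups stay cheap.
_box = _parsed.find("div", class_="BoxContent")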
def dlp(p):
    url = "https://movie.douban.com/celebrity/" + str(
        celebrity_id) + "/photos/?type=C&start=" + str(
            p * 30) + "&sortby=like&size=a&subtype=a"
    req = requests.get(url)
    if req.status_code != 200:
        print("NETWORK ERROR")
        return 1
    content = req.text
    covers = bs4.BeautifulSoup(
        content, "html.parser",
        parse_only=bs4.SoupStrainer(class_="cover")).find_all("a")
    img_id = [cover["href"].split("/")[-2] for cover in covers]
    if len(img_id) == 0:
        print("No image in page #%s, please check!" % (str(p)))
        return 1
    img_link = [
        "https://img3.doubanio.com/view/photo/raw/public/p" + str(id) + ".jpg"
        for id in img_id
    ]
    print("========================= START Page %d: =========================" % p)
    for im, l in zip(img_id, img_link):
        img_dl(l, im)
    print("========================= FINISH Page %d: =========================" % p)
def run_atc(self):
    self.log.info('Starting ATC')
    start = get_time()

    # get product page, retrying until a successful response comes back
    item_page = self.session.get(self.url)
    while not item_page and item_page.status_code != 200:
        self.log.info("Failed to get item page %s" % item_page.status_code)
        item_page = self.session.get(self.url)
    self.log.info("Retrieved product page - %dms" % (get_time() - start))
    start = get_time()

    strain = bs4.SoupStrainer(id='product_addtocart_form')
    item_parse = bs4.BeautifulSoup(item_page.content, "lxml", parse_only=strain)
    form = item_parse.find('form')
    select = form.find('select')
    self.log.info("Page parsed - %dms" % (get_time() - start))
    start = get_time()

    # find add url
    action = form['action'].replace("checkout/cart", "ajax/index")

    # form payload
    payload = {'qty': '1', 'isAjax': '1'}
    for item in form.find_all('input'):
        payload[item['name']] = item['value']
    opts = form.find(id='options_502').contents
    if hasattr(self, 'size'):
        size_id = sizes[self.size]
        size = self.size
        for item in form.find(id='options_502').contents:
            if type(item) == element.Tag and item['data-simplesku'].split('-', 1)[-1] == self.size:
                size = item['data-simplesku'].split('-')[-1]
                size_id = item['id'].split('_')[-1]
                break
    else:
        rand = random.choice(opts[:-2])
        size = rand['data-simplesku'].split('-', 1)[-1]
        size_id = rand['id'].split('_')[-1]
    payload[select['name']] = size_id
    self.log.info('Selected size %s' % size)
    print("POST request created - {}ms {}".format((get_time() - start), str(payload)))
    # stdin.readline()
    start = get_time()
    start_atc = get_time()

    atc_resp = self.session.post(action, data=payload)
    while atc_resp.status_code != 200 and json.loads(atc_resp.content)['status'] != 'SUCCESS':
        self.log.info('POST atc failed - {} - {}'.format(atc_resp.status_code,
                                                         json.loads(atc_resp.content)['status']))
        time.sleep(1)
        start = get_time()
        atc_resp = self.session.post(action, data=payload)
    print("Added - %dms" % (get_time() - start_atc))
    self.queue.put(self.session.cookies['frontend'])
    self.log.info('Added cookie to queue')
def getpagecount(page_request):
    page_strain = bs4.SoupStrainer(onchange="$(this).closest('form').submit();")
    page_soup = bs4.BeautifulSoup(page_request.data, 'lxml', parse_only=page_strain).find_all("option")
    pagecount = len(page_soup) - 1
    return pagecount
def scrape_all_pages(self):
    for job_page in tqdm(self.html_pages, desc="Scraping:", bar_format=self.printstring):
        strainer = bs4.SoupStrainer("div", {'class': ['PaidJob']})
        soup = bs4.BeautifulSoup(job_page.decode('utf-8'), 'lxml', parse_only=strainer)
        self.scrape_page(soup)
def file_parse(filepath):
    link_list = []
    with open(filepath, 'r') as file_object:
        for link in bs4.BeautifulSoup(file_object.read(), "html.parser",
                                      parse_only=bs4.SoupStrainer('a')):
            if link.has_attr('href'):
                link_list.append(link['href'])
    return link_list
def scrape_items():
    from src.gather_files import populate_cache

    populate_cache()
    strainer = bs4.SoupStrainer("table")
    items_html = bs4.BeautifulSoup(ITEMS_LIST.read_text(), "lxml", parse_only=strainer)
    items = items_html.select("tbody tr")
    return [parse_item(item) for item in items]
async def fetch_tibia_bosses_world(world: str):
    url = f"https://www.tibiabosses.com/{world}/"
    try:
        bosses = CACHE_BOSSES[world]
        return bosses
    except KeyError:
        bosses = defaultdict(list)
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as resp:
                content = await resp.text()
    except (aiohttp.ClientError, asyncio.TimeoutError):
        raise errors.NetworkError(f"get_world_bosses({world})")
    try:
        parsed_content = bs4.BeautifulSoup(content, "lxml",
                                           parse_only=bs4.SoupStrainer("div", class_="panel-layout"))
        _sections = parsed_content.find_all('div', class_="widget_execphp")
        for section in _sections:
            heading = section.find('h3')
            if heading is None:
                continue
            title = heading.text
            section_content = section.find('div', class_="execphpwidget")
            m = boss_pattern.findall(str(section_content))
            if m:
                for (chance, link, image, expect_last, days) in m:
                    name = link.split("/")[-1].replace("-", " ").lower()
                    bosses[title].append(
                        dict(name=name, chance=chance.strip(), url=link, image=image,
                             type=expect_last, days=int(days)))
            else:
                # This regex is for bosses without prediction
                m = unpredicted_pattern.findall(str(section_content))
                for (link, image, expect_last, days) in m:
                    name = link.split("/")[-1].replace("-", " ").lower()
                    bosses[title].append(
                        dict(name=name, chance="Unpredicted", url=link, image=image,
                             type=expect_last, days=int(days)))
    except Exception:
        pass
    CACHE_BOSSES[world] = bosses
    return bosses
def init_bs(html, head_only=False):
    features = "html5lib"
    parse_only = None
    if LXML_AVAILABLE:
        features = "lxml"
        if head_only:
            parse_only = bs4.SoupStrainer("head")
    return bs4.BeautifulSoup(html, features=features, parse_only=parse_only)
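# Hedged usage sketch, not part of the original source: it assumes the init_bs
# helper above with lxml installed (LXML_AVAILABLE truthy), so the head-only
# strainer is honored by the parser.
_doc = "<html><head><title>Example</title></head><body><p>ignored</p></body></html>"
_head_soup = init_bs(_doc, head_only=True)
print(_head_soup.title.string)  # "Example"
print(_head_soup.find("p"))     # None - the <body> subtree was never parsed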
def scrape_secret():
    """
    hit the website and scrape the first page
    """
    url = "https://jobs.secrettelaviv.com/"
    req = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    page = req.text

    # jobs are in spans
    parse_only = bs4.SoupStrainer('span')
    return bs4.BeautifulSoup(page, "lxml", parse_only=parse_only)
def add_sentiment():
    url = request.args.get('a', 0, type=str)
    url = str('https://www.' + re.sub(r'https://|www\.', "", url))
    source = urllib.request.urlopen(url)
    soup = bs.BeautifulSoup(source, 'html.parser', parse_only=bs.SoupStrainer('div'))
    txt = soup.text
    txt = re.findall("[A-Za-z]+", txt)
    txt = " ".join(txt)
    return jsonify(result=txt)
def gettitleinfo(page_request):
    table_strain = bs4.SoupStrainer(style="width:753px;")
    table_soup = bs4.BeautifulSoup(page_request.data, 'lxml', parse_only=table_strain)
    titleinfo = table_soup.text \
        .replace("Publisher: ", "|") \
        .replace("Title ID: ", "|") \
        .replace("Serial: ", "|") \
        .replace("\n", "") \
        .strip()
    return titleinfo
def get_error_data(html: str, url: str) -> Tuple[Optional[int], Optional[str]]:
    """Get error code and message from a request."""
    code = None
    try:
        code = int(_util.get_url_parameter(url, "e"))
    except (TypeError, ValueError):
        pass
    soup = bs4.BeautifulSoup(
        html, "html.parser", parse_only=bs4.SoupStrainer("div", id="login_error")
    )
    return code, soup.get_text() or None
def getcheatdesc(page_request):
    hdr_strain = bs4.SoupStrainer(class_="arcode-header")
    hdr_soup = bs4.BeautifulSoup(page_request.data, 'lxml', parse_only=hdr_strain).find_all('a')
    i = 0
    q = ""
    while i < len(hdr_soup):
        p = re.search("(?<=>)(.*)(?=<)", str(hdr_soup[i]))
        q += str(p.group()) + "|"
        i += 2
    cheatdesc = q.rstrip("|")
    return cheatdesc
def get_error_data(html, url):
    """Get error code and message from a request."""
    try:
        code = _util.get_url_parameter(url, "e")
    except IndexError:
        code = None
    soup = bs4.BeautifulSoup(
        html, "html.parser", parse_only=bs4.SoupStrainer("div", id="login_error"),
    )
    return code, soup.get_text() or None
def getFormInputs(session, url):
    foundInputs = list()
    html = session.get(url).text
    soup = bs4.BeautifulSoup(html, "html.parser", parse_only=bs4.SoupStrainer('input'))
    inputLines = soup.prettify()
    for line in inputLines.splitlines(keepends=False):
        curLine = line.strip()
        # check if it is an input opening tag that is not a submit button
        if curLine.startswith("<input") and 'type="submit"' not in curLine:
            foundInputs.append(curLine)
    return foundInputs
def download(domain):
    if not path.isdir(base_dir):
        mkdir(base_dir)
    url = build_url(domain)
    soup = bs4.BeautifulSoup(requests.get(url).content, 'html.parser',
                             parse_only=bs4.SoupStrainer(['p', 'a', 'ul', 'ol', 'li']))
    text = build_text(soup)
    with open(path.join(base_dir, build_filename(url)), 'w') as f:
        f.write(text)
    return text
def getcheatcodes(page_request):
    cheat_strain = bs4.SoupStrainer(class_="arcode-float")
    cheat_soup = bs4.BeautifulSoup(page_request.data, 'lxml', parse_only=cheat_strain)
    cheatcodes = str(cheat_soup) \
        .replace("<!DOCTYPE HTML>", "") \
        .replace("</div>", "") \
        .replace('<div class="arcode-float">', "") \
        .replace("<textarea readonly=\"\">", "") \
        .replace("</textarea>", "|") \
        .replace("\r", "") \
        .replace("\n", " ") \
        .rstrip("|") \
        .strip()
    return cheatcodes