def getACMRecords(searchInput, bibs, years=''):
    searchString = getSearchString(searchInput) + years
    print("from get acm records", searchString)
    url = "https://dl.acm.org/action/doSearch?fillQuickSearch=false&expand=dl" + searchString + '&pageSize=50'
    print(url)
    try:
        x = requests.get(url)
        parsed_html = sp(x.text, "html.parser")
        totalNumber = int(
            parsed_html.find("span", {"class": "hitsLength"}).string.strip().replace(',', ''))
        totalPages = ceil(totalNumber / 50)
        print("Total Results: ", totalNumber)
        print("Total Pages: ", totalPages)
        # Extract the first page of results
        bibs += scrape(parsed_html)
        # Extract the remaining pages
        for i in range(1, totalPages):
            print("Page ", i)
            next_page = url + "&pageSize=50&startPage=" + str(i)
            print(next_page)
            x = requests.get(next_page)
            parsed_html = sp(x.text, "html.parser")
            bibs += scrape(parsed_html)
        print("total bibs from acm", len(bibs))
    except:
        print("Results are none")
    return bibs
def get_ads(category, pages_count):
    ads_links = []
    for page in range(pages_count):
        # The first results page has no ?page= parameter; later pages do.
        refurl = f'https://www.olx.com.eg/{category}/?page={page + 1}'
        if page == 0:
            refurl = f'https://www.olx.com.eg/{category}/'
        ajx_url = 'https://www.olx.com.eg/ajax/search/list/'
        print(f'=========={page + 1}==========')
        print(refurl)
        headers = {
            'accept': '*/*',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'en-US,en;q=0.9',
            'referer': refurl,
            'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'
        }
        response = get(ajx_url, headers=headers)
        soup = sp(response.content, 'lxml')
        ads = soup.find_all('div', class_='ads__item__info')
        for ad in ads:
            ads_links.append(ad.a['href'])
    return ads_links
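# Hedged usage sketch (not part of the original code): "jobs" is an assumed OLX Egypt
# category slug, and `get`/`sp` are the requests.get and BeautifulSoup aliases that
# get_ads() above already relies on.
if __name__ == '__main__':
    links = get_ads('jobs', 2)  # scrape the first two result pages
    print(len(links), 'ad links collected')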
def tc(self):
    pg = r(
        techPage.url1,
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'
        })
    pg = uo(pg)
    pg_ht = pg.read()
    pg.close()
    soup = sp(pg_ht, 'html.parser')
    logourl = '<img class="img-fluid" src="https://indianexpress.com/wp-content/themes/indianexpress/images/indian-express-logo-n.svg" title="The Indian Express" alt="The Indian Express">'
    heads = []
    links = []
    imgs = []
    logos = []
    try:
        imgsl = soup.find('ul', {'class': 'article-list'}).findAll('li')
        for i in imgsl:
            imgs.append(i.find('img').get('src'))
            links.append(i.find('a').get('href'))
            logos.append(logourl)
            heads.append(i.find('img').get('alt'))
        news = list(zip(imgs, heads, links, logos))
        return news
    except:
        news = []
        return news
def dork():  # Retrieve the URLs
    menu(1)
    print "\033[94m[INFO] Search launched\033[0m"
    a = 0
    b = 0
    c = sys.argv[2]
    page1 = open("page.html", "w")
    try:
        while a < 100:
            try:
                r = requests.get("http://www.ask.com/web?q=" + str(c) + "&page=" + str(a))
            except IOError, e:
                print str(e)
                break
            print str(b)
            m = str(r.text)
            soup = sp(m)
            for i in soup.find_all("p", {"class": "web-result-url"}):
                lien = i.text
                try:
                    page1.write(lien + "\n")
                except:
                    pass
                print "[\033[92m" + str(b) + "\033[0m]--> " + str(lien)
                b += 1
            a += 1
    except IOError, e:
        print(bcolors.FAIL + "Stopped\033[0m\n" + str(e))
def look_up(entry):
    url = "https://baike.baidu.com/item/" + quote(entry)
    req = request.Request(url, headers=header)
    html = request.urlopen(req).read()
    soup = sp(html, "html.parser")
    le2 = (soup.select('.para-title.' + 'level-2 '))
    le2 = (soup.findAll('h2', {'class': "title-text"}))
    # print(le2)
    x = 1
    msg = []
    msg1 = []
    for l in le2:
        msg.append(l.text.replace(
            '<bound method Tag.get_text of <h2 class="title-text"><span class="title-prefix">', '')
            .replace('</span>', '').replace('</h2>>', ''))
        msg1.append(url + "#" + str(x))
        x = x + 1
    le3 = (soup.select('.para-title.' + 'level-3 '))
    content = soup.findAll('div', {'class': 'para'})
    for i in content:
        i = i.get_text()
        i = i.replace('\n', '')
        i = i.replace('\r', '')
        i = re.sub(bracket, '', i)
        # print(i)
    # for k in range(len(msg) - 1):
    #     return msg[k]
    # for f in range(len(msg1) - 1):
    #     return msg1[f]
    return msg, msg1
def sc1(self):
    pg = r(
        homePage.url2,
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'
        })
    pg = uo(pg)
    pg_ht = pg.read()
    pg.close()
    soup = sp(pg_ht, 'html.parser')
    logourl = '<img alt="Times of India" class="img-fluid" src="https://static.mediawire.in/brands/profilepic/1117/TOI%20Logo%20in%20Red%20Bakcground.jpg">'
    heads = []
    links = []
    imgs = []
    logos = []
    try:
        imgs1 = soup.find('div', {'class': 'listing4 clearfix'}).find('ul').findAll('li')
        for i in imgs1:
            heads.append(i.find('span').find('a').text)
            links.append(i.find('span').find('a').get('href'))
            imgs.append(i.find('a').find('img').get('data-src'))
            logos.append(logourl)
        news = list(zip(imgs, heads, links, logos))
        return news
    except:
        news = []
        return news
def sp(self):
    # pg=r(url1,{'User-Agent':'Magic Browser'})
    pg = r(
        sportPage.url1,
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'
        })
    pg = uo(pg)
    pg_ht = pg.read()
    pg.close()
    soup = sp(pg_ht, 'html.parser')
    logourl = '<img class="img-fluid" src="https://indianexpress.com/wp-content/themes/indianexpress/images/indian-express-logo-n.svg" title="The Indian Express" alt="The Indian Express">'
    heads = []
    links = []
    imgs = []
    logos = []
    try:
        imgsl = soup.find('div', {'class': 'nation'}).findAll('div', {'class': 'snaps'})
        headsl = soup.find('div', {'class': 'nation'}).findAll('h2', {'class': 'title'})
        for i in imgsl:
            links.append(i.find('a').get('href'))
            logos.append(logourl)
            imgs.append(i.find('img').get('data-lazy-src'))
        for i in headsl:
            heads.append(i.find('a').text)
        news = list(zip(imgs, heads, links, logos))
        return news
    except:
        news = []
        return news
def tc2(self):
    pg = r(
        techPage.url3,
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'
        })
    pg = uo(pg)
    pg_ht = pg.read()
    pg.close()
    soup = sp(pg_ht, 'html.parser')
    logourl = '<img class="img-fluid" src="https://cdn.gadgets360.com/gadgets360_logo.png" alt="Technology News" title="NDTV Gadgets 360">'
    heads = []
    links = []
    imgs = []
    logos = []
    try:
        imgsl = soup.find('div', {'class': 'story_list row margin_b30'}).findAll('div', {'class': 'thumb'})
        for i in imgsl:
            # Lazy-loaded thumbnails keep the real image URL in data-original.
            if i.find('img').get('src') == "https://gadgets.ndtv.com/static/icons/img_120n.png":
                imgs.append(i.find('img').get('data-original'))
            else:
                imgs.append(i.find('img').get('src'))
            links.append(i.find('a').get('href'))
            logos.append(logourl)
            heads.append(i.find('img').get('alt'))
        news = list(zip(imgs, heads, links, logos))
        return news
    except:
        news = []
        return news
def loadImage():
    for j in range(3):
        j += 2
        urlLink = 'https://www.27270.com/tag/333'
        urlLink = urlLink + '_' + str(j) + '.html'
        print("Crawling page %d" % j)
        imLinks = getLink(urlLink)
        for imLink in imLinks:
            for i in range(40):
                i += 1
                imLink1 = imLink[0:-5]
                imLink2 = imLink1 + '_' + str(i) + '.html'
                print(imLink2)
                content = loadPage(imLink2)
                html = sp(content, 'html.parser')
                try:
                    link = html.find_all('img', attrs={'alt': True, 'height': False})[0]
                    time.sleep(3)
                    # print(link)
                    if link is None:
                        print("Crawling finished")
                        pass
                    else:
                        name1 = link.get('alt')
                        name1 = re.sub("[A-Za-z0-9\!\%\[\]\,\。]", "", name1)
                        link1 = link.get('src')
                        name1 = name1 + str(i)
                        print('Crawling ' + name1)
                        saveImage(link1, name1)
                except:
                    print("Crawling finished")
                    break
def formdata(links):
    data = []
    for i in links:
        try:
            html = requests.get(i).text
            soup = sp(html, 'lxml')
            url = i
            title = soup.find('title').text
            tag = ''
            category = ''
            try:
                tag = getTagAndCateByLink(i, 'Tags')
            except Exception:
                pass
            try:
                category = getTagAndCateByLink(i, 'Categories')
            except Exception:
                pass
            try:
                catchhtml = soup.find(id='cnblogs_post_body')
                catchhtml = str(catchhtml)
                data.append(article(url, title, catchhtml, tag, category))
            except Exception:
                print(i + ' has no main')
        except Exception as e:
            print e
    return data
def ec(self):
    # pg=r(url1,{'User-Agent':'Magic Browser'})
    pg = r(
        ecoPage.url1,
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'
        })
    pg = uo(pg)
    pg_ht = pg.read()
    pg.close()
    soup = sp(pg_ht, 'html.parser')
    logourl = '<img class="img-fluid" src="https://www.financialexpress.com/wp-content/themes/vip/financialexpress/assets/images/fe-logo-with-read-to-lead.svg" alt="Financial Express">'
    heads = []
    links = []
    imgs = []
    logos = []
    try:
        imgl1 = soup.find('div', {'class': 'leftcol'}).findAll('figure')
        titles1 = soup.find('div', {'class': 'leftcol'}).findAll('h2')
        titles2 = soup.find('div', {'class': 'leftcol'}).findAll('h3')
        for i in imgl1:
            imgs.append(i.find('img').get('data-src'))
            links.append(i.find('a').get('href'))
            logos.append(logourl)
        for i in titles1:
            heads.append(i.find('a').text)
        for i in titles2:
            heads.append(i.find('a').text)
        news = zip(imgs, heads, links, logos)
        return news
    except:
        news = []
        return news
def update():
    local_time = time.ctime(time.time())
    url = "https://rate.bot.com.tw/gold?Lang=zh-TW"
    with request.urlopen(url) as response:
        data = response.read().decode("utf-8")
    root = sp(data, "html.parser")
    goal_in = root.find_all("td")[5].text.replace("回售", "").strip()
    goal_out = root.find_all("td")[2].text.replace("買進", "").strip()
    s1 = ("\nGold" + "\n銀行買進: " + goal_in + "\n銀行賣出: " + goal_out)
    note = local_time + s1
    lb.config(text=note)
def index(request, url):
    conn = urllib.urlopen(url)
    respose = conn.read()
    cleanSoup = sp(respose, "html.parser")
    try:
        # Rewrite every link so it is routed back through the proxy view.
        for a in cleanSoup.findAll('a'):
            a['href'] = "/proxy/" + a['href']
    except:
        print "err"
    respose = str(cleanSoup)
    return HttpResponse(respose)
def getLink(url):
    imList = []  # list used to store the image page links
    content = loadPage(url)
    html = sp(content, 'html.parser')
    for link in html.find_all('ul', attrs={'id': 'Tag_list'}):
        link = link.find_all('a', attrs={'target': '_blank'})
        for link1 in link:
            link1 = link1.get('href')
            imList.append(link1)
            # print(link1)
    return imList
def getPageLink(start, end):
    links = []
    for i in range(start, end + 1):
        payload['PageIndex'] = i
        try:
            response = requests.post(next, headers=headers, data=json.dumps(payload))
            s = sp(response.text, 'lxml')
            link = s.findAll(class_='titlelnk')
            for j in link:
                links.append(j.get('href'))
        except Exception as e:
            print e
    return links
def search_process(self):
    # get the actual result of the search
    data = self.search_init()
    x = "\nSearching results.....\n"
    for i in x:
        print(i, end=" ")
        time.sleep(.200)
        sys.stdout.flush()
    # print("Searching for results....")
    soup = sp(data.content, "lxml")
    data_scrap = soup.find("a", {"class": "result__a"}).get_text()
    return data_scrap
def getLinks(articleUrl, pageNum):
    page = requests.get("http://en.wikipedia.org" + articleUrl, timeout=5)
    html = page.content
    html = html.decode('utf-8')
    bsObj = sp(html, "html.parser")
    info = bsObj.find("div", {"class": "mw-parser-output"})
    # Strip script, noscript and style tags before extracting the text.
    Filter = {'script', 'noscript', 'style'}
    for items in Filter:
        for va in info.find_all(items):
            va.decompose()
    filepath = 'C:/Users/qyjbo/Desktop/Web_English/'
    f = open(filepath + str(pageNum) + '.txt', 'w', encoding='gb18030')
    f.write(info.get_text())
    return bsObj.find("div", {"id": "bodyContent"}).findAll("a", href=re.compile("^(/wiki/)((?!:).)*$"))
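# Hedged usage sketch (not part of the original code): "/wiki/Web_scraping" is an
# arbitrary example article, and the call assumes the hard-coded output folder used
# by getLinks() above exists on disk.
if __name__ == '__main__':
    internal_links = getLinks("/wiki/Web_scraping", 0)
    print(len(internal_links), "internal /wiki/ links found")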
def save_html(result_url):
    html_folder_path = os.getcwd() + "/" + "w2" + "_folder"
    os.mkdir(html_folder_path)
    index = 1
    print("Save html Begin...")
    for url in result_url:
        print("saving num " + str(index))
        response = requests.get(url).text
        time.sleep(1)
        html = str(sp(response, "html.parser"))
        html_path = html_folder_path + "/" + str(index) + ".txt"
        html_fw = open(html_path, "w")
        html_fw.write(html)
        html_fw.close()
        index += 1
def web_crawl(url, dat):
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
    }
    try:
        url_cont = requests.get(url, headers=headers, timeout=5)
        pg_soup = sp(url_cont.content, "html.parser")
    except requests.ConnectionError as e:
        print("OOPS!! Connection Error. Make sure you are connected to Internet. Technical Details given below.\n")
        print(str(e))
        return dat
    except requests.Timeout as e:
        print("OOPS!! Timeout Error")
        print(str(e))
        return dat
    except requests.RequestException as e:
        print("OOPS!! General Error")
        print(str(e))
        return dat
    except KeyboardInterrupt:
        print("Someone closed the program")
        return dat
    try:
        pgno = pg_soup.find("span", {"pageInfo"}).text.split()
    except AttributeError:
        return dat
    a1 = int(pgno[1]) + 1
    a2 = int(pgno[3])
    p = pg_soup.findAll("div", {"class": "row review_table_row"})
    for x in p:
        review = x.find("div", {"class": "user_review"}).text
        emo = len(x.findAll("span", {"class": "glyphicon glyphicon-star"}))
        if emo == 3:
            continue
        dat.append([review, emo])
    print(len(dat))
    ind = url.find('/?')
    url2 = url[:ind] + "/?page=" + str(a1) + "&type=user"
    print(url2)
    if int(a1) < int(a2):
        # Propagate the accumulated reviews back up the recursion.
        return web_crawl(url2, dat)
    else:
        return dat
def Get_Entity_Description(entry):
    url = "https://baike.baidu.com/item/" + quote(entry)
    req = request.Request(url, headers=header)
    html = request.urlopen(req).read()
    soup = sp(html, "html.parser")
    Description_len = 10
    content = soup.findAll('div', {'class': 'para'})
    if content:
        for idx, i in enumerate(content):
            i = i.get_text()
            i = i.replace('\n', '')
            i = i.replace('\r', '')
            i = i.replace(u'\xa0', u'')
            i = re.sub(bracket, '', i)
            # Stop at the first paragraph long enough to serve as a description.
            if (len(i) > Description_len):
                break
        return i if len(i) < Max_len else i[:Max_len]
    else:
        return 'NIL'
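# Hedged usage sketch (not part of the original code): it assumes the module-level
# `header`, `bracket` and `Max_len` globals referenced by Get_Entity_Description()
# are defined elsewhere in this file; the entry string is an arbitrary example.
if __name__ == '__main__':
    print(Get_Entity_Description("Python"))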
def getTagAndCateByLink(link, k):
    try:
        # from the link, get blogApp
        p1 = '.com/[\s\S]*/p/'
        a = re.search(p1, link)
        blogApp = a.group()[5:-3]
        # from the link, get postId
        p2 = '/p/[\s\S]*.html'
        b = re.search(p2, link)
        postId = b.group()[3:-5]
        html = requests.get(link).text
        soup = sp(html, 'lxml')
        l = soup.findAll('script')
        p3 = 'cb_blogId=\d*,'
        for i in l:
            s = str(i)
            c = re.search(p3, s)
            if c is not None:
                blogId = c.group()[10:-1]
        findTagParam['blogApp'] = blogApp
        findTagParam['blogId'] = blogId
        findTagParam['postId'] = postId
        json_ = requests.get(findTagUrl, params=findTagParam).json()
        p4 = '>(\w*|\W*|[\u4e00-\u9fa5])+</a>'
        text = json_[k]
        t2 = text.split(',')
        # print(d.group()[1:-4])
        for t in t2:
            a = re.search(p4, t)
            b = a.group()[1:-4]
            k = k + b + ','
        # k now holds the 'Tags'/'Categories' prefix followed by the collected values;
        # strip the prefix and the trailing comma before returning.
        if k.startswith('Tags'):
            return k[4:-1]
        else:
            return k[10:-1]
    except Exception:
        return ''
def get_detail(word):
    value_dict = {}
    url = url_head + word
    req = Request(url, headers=header)
    html = urllib2.urlopen(req).read()
    soup = sp(html, "html.parser")
    list_same = re.findall(pattern_name, html)
    if list_same:
        value_dict["同义词"] = list_same[0]
    list_title = soup.find_all("dt", class_="basicInfo-item name")
    list_value = soup.find_all("dd", class_="basicInfo-item value")
    for index, item in enumerate(list_title):
        title = re.findall(pattern_title, str(item))[0]
        value_list = re.findall(pattern_value, str(list_value[index]))
        if value_list and value_list[0].find("target") <= 0:
            value_dict[title] = value_list[0]
            continue
        elif value_list and value_list[0].find("target") > 0:
            value = re.findall(p_2, str(value_list[0]))
            if value:
                value_dict[title] = value[0]
                continue
        value_list = re.findall(p_2, str(list_value[index]))
        if value_list:
            value = ""
            for item in value_list:
                value += item + ";"
            value = value.strip(";")
            value_dict[title] = value
            continue
        value_list = re.findall(p_3, str(list_value[index]))
        if value_list:
            value_dict[title] = value_list[0]
            continue
    if 0:
        for key, value in value_dict.items():
            print key, value
    return word + "\001" + json.dumps(value_dict)
def GetImage(img_url, rawurl):
    root = tk.Tk()
    root.title("FASTAF Music Downloader")
    root.iconbitmap('ytb.ico')
    response = requests.get(img_url)
    img_data = response.content
    image1 = Image.open(BytesIO(img_data))
    image2 = image1.resize((400, 230), Image.ANTIALIAS)
    image3 = ImageTk.PhotoImage(image2)
    canvas = tk.Canvas(root)
    canvas.pack()
    soup = sp(urlopen(rawurl), "lxml")
    Title = soup.title.string
    STitle = Title.replace("- YouTube", "")
    text1 = "Downloaded " + STitle + " !"
    canvas.create_text(180, 250, fill="darkblue", font="Times 10 ", text=text1)
    canvas.create_image(0, 0, anchor=tk.NW, image=image3)
    canvas.update()
    # Pass the function and its argument to partial so the download runs only on click.
    BtnSubmit = tk.Button(root, text="Submit", command=partial(down, rawurl))
    BtnSubmit.pack()
    root.mainloop()
def sc2(self):
    pg = r(
        homePage.url3,
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'
        })
    pg = uo(pg)
    pg_ht = pg.read()
    pg.close()
    soup = sp(pg_ht, 'html.parser')
    logourl = '<img src="https://www.cs.utah.edu/~deb/assets/images/media/logo_it.png" alt="India Today" class="img-fluid">'
    heads = []
    links = []
    imgs = []
    logos = []
    try:
        imgs1 = soup.find('div', {'class': 'view-content'}).findAll('div', {'class': 'catagory-listing'})
        for i in imgs1:
            imgs.append(i.find('div', {'class': 'pic'}).find('img').get('src'))
            heads.append(i.find('div', {'class': 'detail'}).find('a').text)
            links.append(i.find('div', {'class': 'detail'}).find('a').get('href'))
            logos.append(logourl)
        news = list(zip(imgs, heads, links, logos))
        return news
    except:
        news = []
        return news
def tc1(self):
    pg = r(
        techPage.url2,
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'
        })
    pg = uo(pg)
    pg_ht = pg.read()
    pg.close()
    soup = sp(pg_ht, 'html.parser')
    logourl = '<img src="https://akm-img-a-in.tosshub.com/indiatoday/../sites/all/themes/itg/logo.png?v=1.3" alt="India Today" class="img-fluid">'
    heads = []
    links = []
    imgs = []
    logos = []
    try:
        imgs1 = soup.find('div', {'class': 'view-content'}).findAll('div', {'class': 'catagory-listing'})
        for i in imgs1:
            imgs.append(i.find('div', {'class': 'pic'}).find('img').get('src'))
            heads.append(i.find('div', {'class': 'detail'}).find('a').text)
            links.append(i.find('div', {'class': 'detail'}).find('a').get('href'))
            logos.append(logourl)
        news = list(zip(imgs, heads, links, logos))
        return news
    except:
        news = []
        return news
def scraper():
    # Starting link
    link = "https://www.olx.ua/poltava"
    chrome_options = webdriver.ChromeOptions()
    chrome_options.binary_location = os.environ.get("GOOGLE_CHROME_BIN")
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--no-sandbox")
    driver = webdriver.Chrome(
        executable_path=os.environ.get("CHROMEDRIVER_PATH"),
        chrome_options=chrome_options)
    driver.get(link)
    jobs_list = []
    # Number of ads the user wants to parse
    jobs_num = int(100)
    # Counter of ads that have already been parsed
    jobs_counter = 0
    # Page counter (we start from page 1)
    page_counter = 1
    price_range = [0, 999999999]
    from_price, to_price = price_range
    # Iterating through pages
    while link != None:
        # Get the job listings on the page
        page_tree = sp(driver.page_source, 'html.parser')
        jobs = page_tree.find_all('tr', {'class': 'wrap'})
        # Iterating through the job listings
        for job in jobs:
            if jobs_counter == jobs_num:
                break
            # Title
            title = job.find('h3').text.strip()
            # Price parsing
            try:
                price = job.find('p', {'class': 'price'}).text.strip()
                price_int = float(''.join(x for x in price if x.isdigit() or x == '.'))
            except:
                price = 'Не указана.'
            if ((from_price or to_price) != 0):
                if price == 'Не указана.' or from_price > price_int or price_int > to_price:
                    continue
            # Link to the details page
            job_link = job.find('a')["href"]
            driver.get(job_link)
            # Find the button that has to be clicked to reveal the telephone number
            try:
                phone_btn = driver.find_element_by_class_name('spoiler')
            except:
                continue
            # Wait until the telephone number becomes visible
            driver.execute_script("arguments[0].click();", phone_btn)
            wait = WebDriverWait(driver, 10)
            try:
                wait.until_not(
                    ec.text_to_be_present_in_element(
                        (By.CLASS_NAME, 'contactitem'), 'x'))
            except:
                continue
            # Parse the job detail page
            job_page = sp(driver.page_source, 'html.parser')
            user_since = job_page.find('div', {'class': 'quickcontact__user-since'}).text
            # Parse heading
            try:
                heading = job_page.select('td.middle > ul > li')[1].text
            except:
                heading = 'Недобавленная рубрика.'
            # Parse phone number
            try:
                phones = job_page.select('div.contactitem')[0].text
                print(phones)
            except:
                # If there is no phone number, skip this ad
                continue
            # Parse username
            try:
                name = driver.find_element_by_class_name('quickcontact__user-name').text
            except NoSuchElementException:
                name = 'Имя не указано.'
            jobs_list.append({
                'title': title.strip(),
                'phone': phones.strip(),
                'name': name.strip(),
                'heading': heading.strip(),
                'user_since': user_since.strip(),
                'price': price,
                'link': job_link.strip(),
            })
            # If the ad's phone number is not already stored (uniquely), add it to the database
            try:
                JobAdds.objects.get(phone=phones.strip())
            except (JobAdds.MultipleObjectsReturned, JobAdds.DoesNotExist):
                JobAdds.objects.create(title=title.strip(),
                                       link=job_link.strip(),
                                       phone=phones.strip(),
                                       name=name.strip(),
                                       heading=heading.strip(),
                                       price=price,
                                       user_since=user_since.strip())
            jobs_counter += 1
        # Link to the next page
        try:
            if jobs_counter == jobs_num:
                break
            page_counter += 1
            link = page_tree.find(
                'a', {'class': '{page:' + str(page_counter) + '}'})['href']
            driver.get(link)
            driver.implicitly_wait(0.3)
        except (NoSuchElementException, IndexError, InvalidArgumentException, TypeError):
            link = None
    driver.close()
import pandas as pd
import requests
from bs4 import BeautifulSoup as sp

response = requests.get("https://www.flipkart.com/mobiles/pr?sid=tyy,4io&marketplace=FLIPKART")
print(response)
soup = sp(response.content, "html.parser")

Name_list = []
Price_list = []
Rating_list = []

for i in range(1, 51):
    # Build the URL of page i from the "Next" link on the first results page.
    link = soup.find("a", text="Next").get("href")
    home_page_url = "https://www.flipkart.com"
    next_page_link = home_page_url + link[:-1] + str(i)
    response2 = requests.get(next_page_link)
    soup2 = sp(response2.content, "html.parser")
    cards = soup2.find_all("div", attrs={"class": "_1UoZlX"})
    for card in cards:
        name = card.find("div", attrs={"class": "_3wU53n"})
        price = card.find("div", attrs={"class": "_2rQ-NK"})
        rating = card.find("div", attrs={"class": "hGSR34"})
        if name:
            name_text = name.text
        else:
            name_text = None
import requests
from bs4 import BeautifulSoup as sp

url = "https://www.vulnerability-lab.com/list-of-bug-bounty-programs.php"  # the page to parse
webpage = requests.get(url=url)  # fetch the page
soup = sp(webpage.content, 'html.parser')  # build a soup object from the page content
tables = soup.find_all('table')  # collect the tables on the page
a_tags = tables[4].find_all('a')  # collect the anchor tags in the relevant table

# Write the list of program URLs to a file, skipping mailto links.
with open("bug_bounty-sites.txt", "w") as sites_list:
    for a in a_tags:
        if "mailto" in a.get("href"):
            pass
        elif "http" not in a.get("href"):
            sites_list.write("http://" + a.get("href") + "\n")
        else:
            sites_list.write(a.get('href') + "\n")
def getReply(opener, url, *args):
    pg = sp(getPg(opener, url, *args), 'html.parser')
    tds = pg.findAll('td', {'class': 't_msgfont'})
    replys = [d.text.strip() for d in tds]
    return replys
# -*- coding: utf-8 -*-
"""
Created on Sun Dec 20 13:52:30 2020

@author: konstnar
"""
from bs4 import BeautifulSoup as sp
import os, re

file = open('page.html', encoding='utf-8')
file = file.readline()
page = sp(file, 'lxml')

qstn = page.findAll(
    'div',
    class_='freebirdFormviewerViewItemsItemItemTitle exportItemTitle freebirdCustomFont'
)
for q in qstn:
    print(re.sub("\s\*", "", q.text))

# if the answer is in a radio button
# (wrong answers will not be retrieved; they have to be added manually)
ans = page.findAll(
    ['label', 'span'],
    class_='docssharedWizToggleLabeledContainer freebirdFormviewerViewItemsRadioChoice freebirdLabeledControlDarkerDisabled isChecked freebirdFormviewerViewItemsRadioGraded freebirdFormviewerViewItemsRadioCorrect isDisabled'
)
# if the answer is in a text box
urllib2.install_opener(opener)
# first request: log in
request2 = urllib2.Request(url, params)
response2 = opener.open(request2)
pp = response2.read()
# second request: qiandao (daily check-in)
qiandao_rq = urllib2.Request(qiandaourl, urllib.urlencode(qd_info))
rsp3 = opener.open(qiandao_rq)
qdao_page = rsp3.read()
# regular expression: extract the check-in result
rx = re.compile(r'class="postbox">\r\n(.+?)</div')
qdresult = re.findall(rx, qdao_page)
# print info
#print str(qdresult[0]).decode('utf-8').encode('gbk')
#print qdao_page
print qdresult[0].decode('utf-8')
##formhash = re.findall('name=\"formhash\" value=\"([0-9a-f]*)\" ',s)

uy = 'http://130.211.8.178/'
su1 = 'http://130.211.8.178/forum-19-1.html'  # 88-6
pg = sp(getPg(opener, su1), 'html.parser')
ths = pg.findAll('th', {'class': 'subject common'})
#rs = [d.span.a['href'] for d in ths]
for d in ths[0:-1:2]:
    print d.span.text
    h = uy + d.span.a['href']
    doReply(opener, h)
    sleep(20)
def openFile():
    file = open("corpus_berita.txt", encoding="utf8")
    soup = sp(file, 'html.parser')
    doc_berita = soup.find_all("doc")
    return doc_berita
import urllib2 as ur
from bs4 import BeautifulSoup as sp

#url = "http://gucky.uni-muenster.de/cgi-bin/rgbtab-en"
url = "http://cvsweb.xfree86.org/cvsweb/*checkout*/xc/programs/rgb/rgb.txt?rev=1.1"
page = ur.urlopen(url).read()
soup = sp(page)
colors = soup.body.string

import re
pattern = re.compile('(\d+ \d+ \d+)')
rgb_colors = pattern.findall(colors)

# Deduplicate the RGB triples while preserving their order.
rgb_uniq = []
for color in rgb_colors:
    if color not in rgb_uniq:
        rgb_uniq.append(color)