def go_desc():
    BASE_URL = 'http://www.svatibor.ru/internet_magazin/product/'
    import urllib2
    from bs4 import BeautifulSoup
    for i in Item.objects.all():
        print BASE_URL + str(i.product_id)
        c = urllib2.urlopen(BASE_URL + str(i.product_id))
        soup = BeautifulSoup(c.read())
        i.text = str(soup.findAll('div', attrs={'class': 'full'})[0]).replace('\n', '<br />')
        i.stock = 100
        i.art = soup.findAll('div', attrs={'id': 'tovar_card'})[0].findAll('ul', attrs={'id': 'p_list'})[0].findAll('span')[0].string
        i.is_novelty = len(soup.findAll('li', attrs={'class': 'new'})) > 0
        image = 'http://www.svatibor.ru' + soup.findAll('div', attrs={'id': 'tovar_card'})[0].findAll('a')[0]['href']
        if image.endswith('.jpg'):
            f = open('media/uploads/items/%s.jpg' % i.id, 'wb')
            f.write(urllib2.urlopen(image).read())
            f.close()
            i.image = 'uploads/items/%s.jpg' % i.id
        elif image.endswith('.png'):
            f = open('media/uploads/items/%s.png' % i.id, 'wb')
            f.write(urllib2.urlopen(image).read())
            f.close()
            i.image = 'uploads/items/%s.png' % i.id
        print image
        i.save()
def sanitize_html(value, valid_tags=VALID_TAGS):
    soup = BeautifulSoup(value)
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    [comment.extract() for comment in comments]
    # Some markup can be crafted to slip through BeautifulSoup's parser, so
    # we run this repeatedly until it generates the same output twice.
    newoutput = soup.renderContents()
    while 1:
        oldoutput = newoutput
        soup = BeautifulSoup(newoutput)
        for tag in soup.findAll(True):
            if tag.name not in valid_tags:
                tag.hidden = True
            else:
                print tag.name, '***', tag.attrs
                print tag.name, '###', [x for x in tag.attrs]
                m = {}
                for k in tag.attrs.keys():
                    if k in valid_tags[tag.name]:
                        m[k] = tag.attrs[k]
                tag.attrs = m
                print tag.name, '===', m
                # tag.attrs = [(attr, value) for attr, value in tag.attrs if attr in valid_tags[tag.name]]
        newoutput = soup.renderContents()
        if oldoutput == newoutput:
            break
    return newoutput
def scrape_song_metadata(soup, verbose = False): result = {} first_soup = soup.find("div", {"class":"song_header-primary_info"}) first_soup = BeautifulSoup(soup.prettify(), "html.parser") artist = first_soup.find("a", {"class":"song_header-primary_info-primary_artist"}) artist = clean_text(artist.string) if verbose: print "Artist : " + artist.encode('utf-8') result["artist"] = artist song = first_soup.find("h1", {"class":"song_header-primary_info-title"}) song = clean_text(song.string) if verbose: print "Song : " + song.encode('utf-8') result["song"] = song labels = first_soup.findAll("span", {"class":"song_info-label"}) labels = [clean_text(l.string) for l in labels] contents = first_soup.findAll("span", {"class":"song_info-info"}) contents = [BeautifulSoup(c.prettify(), "html.parser") for c in contents] contents = [c.a for c in contents] for i in range(len(labels)): if contents[i]: if verbose: print labels[i] + " :" print " " + clean_text(contents[i].string).encode('utf-8') print " " + contents[i]['href'].encode('utf-8') result[labels[i]] = {"name" : clean_text(contents[i].string), "link" : geniusify(contents[i]['href'])} return result
def extract_content(self, task, url, title, hashcode):
    c = download.Crawler(url)
    d = c.get_data()
    if d is None:
        return None
    res = {
        'title': title.replace(u'【', u'[').replace(u'】', u']').replace(u'喷嚏图卦', u'小兔子图说'),
        'src': url,
        'account_id': task['account_id'],
        'category': task['category'],
        'hashcode': hashcode,
        'username': task['username'],
        'head_image': None,
    }
    d = d.decode('gbk', 'ignore').encode('utf8', 'ignore')
    start_pos = d.find(task['start_text'].encode('utf8'))
    end_pos = d.find(task['end_text'].encode('utf8'))
    soup = BeautifulSoup(d[start_pos:end_pos], "lxml")
    del_tags = ['script']
    for tag in del_tags:
        for match in soup.findAll(tag):
            match.decompose()
    invalid_tags = ['a', 'b', 'i', 'u']
    for tag in invalid_tags:
        for match in soup.findAll(tag):
            match.replaceWithChildren()
    self.extract_image(task, soup, res)
    message_queue.put(res, False)
def parseresultpage(page, search, order, sort, regex):
    logger.info(" [+] Pulling results from page " + str(page))
    githubbase = "https://github.com/search?"
    githubsearchurl = {"o": order, "p": page, "q": search, "s": sort, "type": "Code", "ref": "searchresults"}
    searchurl = githubbase + str(urlencode(githubsearchurl))
    pagehtml = urlopen(searchurl).read()
    soup = BeautifulSoup(pagehtml, "html.parser")
    # Find GitHub div with code results
    results = soup.findAll("div", attrs={"class": "code-list-item"})
    # Pull urls from results and hit each of them
    soup1 = BeautifulSoup(str(results), "html.parser")
    for item in soup1.findAll("p", attrs={"class": "full-path"}):
        soup2 = BeautifulSoup(str(item), "html.parser")
        for link in soup2.findAll("a"):
            individualresult = "https://github.com" + str(link["href"])
            individualresultpage = urlopen(individualresult).read()
            soup3 = BeautifulSoup(str(individualresultpage), "html.parser")
            for rawlink in soup3.findAll("a", attrs={"id": "raw-url"}):
                rawurl = "https://github.com" + str(rawlink["href"])
                if args.custom_regex:
                    searchcode(rawurl, regex)
                else:
                    wpsearchcode(rawurl, regex)
def createSpreadsheet():
    wb = xlwt.Workbook()
    sheet = wb.add_sheet("Google Alerts")
    style = xlwt.easyxf('font: bold 1')
    sheet.write(0, 3, 'Headline', style)
    sheet.write(0, 1, 'Company', style)
    sheet.write(0, 4, 'URL', style)
    sheet.write(0, 0, 'Date', style)
    cur_row = 1
    for url in LA_HONDA_ALERTS_URLS:
        print 'Processing google alerts for ' + LA_HONDA_ALERTS_URLS[url] + '...'
        r = requests.get(url)
        xml = r.text
        soup = BeautifulSoup(xml)
        for title, link, date in zip(soup.findAll('title')[1:], soup.findAll('link')[1:], soup.findAll('published')):
            title = cleanTitle(title)
            link = cleanLink(link)
            date = cleanDate(date)
            writeToSheet(sheet, title, LA_HONDA_ALERTS_URLS[url], link, date, cur_row)
            cur_row = cur_row + 1
    processSheet(sheet)
    savewb(wb)
def search_handler(): """Return DBLP Author Search results in JSON Format""" results = "5" author = None try: author = request.values['author'] except: print "author parameter not found" url = "http://www.dblp.org/autocomplete-php/autocomplete/ajax.php?"\ "query=%s&"\ "name=dblpmirror&"\ "path=/search/&"\ "page=index.php&"\ "log=/var/log/dblp/error.login&"\ "qid=34&navigation_mode=user&"\ "language=en&mcsr=40&mcc=0&mcl=80&"\ "hppwt=20&hppoc=1000&eph=1&er=20&dm=3&"\ "bnm=R&ll=2&mo=100&accc=:&syn=0&deb=0&"\ "hrd=1a&hrw=1d&qi=1&fh=1&fhs=1&mcs=%s&"\ "rid=44&qt=F" % (author, results) page = urlopen(url, data="Void").read() soup = BeautifulSoup(page) authors_span = soup.findAll("span", {"class": "\\\"completion\\\""})[:-1] pubs_count_span = soup.findAll( "span", {"class": "\\\"hits_number\\\""})[:-1] authors = [{'id':-1,'name' :a.string} for a in authors_span] pubs_count = [int(c.string[1:-1]) for c in pubs_count_span] response = {'authors': authors, 'pubs_count': pubs_count} json_response = jsonify(response) return json_response
def get_plugins(self, url):
    plugins = []
    headers = {'User-Agent': self.get_user_agent()}
    page_req = self.req.get(url, headers=headers)
    soup = BeautifulSoup(page_req.text, "html.parser")
    # Search plugins in css
    plugin_paths = soup.findAll("link", {"rel": "stylesheet"})
    for plugin_path in plugin_paths:
        if 'wp-content/plugins/' in plugin_path['href']:
            regex = re.compile("wp-content/plugins/([a-zA-Z0-9-_]+)/", re.IGNORECASE)
            r = regex.findall(plugin_path['href'])
            for plugin_name in r:
                plugins.append(plugin_name)
    # Search plugins in javascript
    plugin_paths = soup.findAll("script", {"type": "text/javascript"})
    for plugin_path in plugin_paths:
        try:
            if 'wp-content/plugins/' in plugin_path['src']:
                regex = re.compile("wp-content/plugins/([a-zA-Z0-9-_]+)/", re.IGNORECASE)
                r = regex.findall(plugin_path['src'])
                for plugin_name in r:
                    plugins.append(plugin_name)
        except KeyError:
            # Script tags without a 'src' attribute raise KeyError; skip them silently
            pass
    return list(set(plugins))
def scrape_coins_ccc():
    print 'Scraping CCC'
    url = 'http://www.cryptocoincharts.info/coins/info/1001'
    html = urllib2.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    trlist = soup.findAll('tr')
    coins = {}
    for i, tr in enumerate(trlist):
        print str(int(i + 1))
        try:
            a = tr.find('td').a
            name = tr.findAll('td')[1].text
            coin_url = str('http://www.cryptocoincharts.info' + a['href'])
            coin_html = urllib2.urlopen(coin_url).read()
            coin_soup = BeautifulSoup(coin_html, 'html.parser')
            coin_table = coin_soup.findAll('table', {'class': 'table table-striped'})
            coin_tr = coin_table[1].find('tbody').findAll('tr')
            coin_markets = []
            for ctr in coin_tr:
                market_td = ctr.findAll('td')
                coin_markets.append({'market': market_td[0].text,
                                     'pair': market_td[1].text,
                                     'link': str('http://www.cryptocoincharts.info' + market_td[1].a['href'])})
            coins[a.text] = {'markets': coin_markets, 'name': name}
        except AttributeError:
            pass
    return coins
def getkaomoji():
    headers = {"User-Agent": "007"}
    requests.get = functools.partial(requests.get, headers=headers)
    url = "http://dongerlist.com"
    r = requests.get(url)
    soup = BS(r.text, "html.parser")
    anchors = soup.findAll("a", attrs={"class": "list-2-anchor"})[1:]
    category_urls = {a.text.lower(): a["href"] for a in anchors}
    kaomoji = {category: [] for category in category_urls}
    kaomoji["all"] = {}
    for category, url in category_urls.items():
        print("Scraping %s ..." % url)
        page_number = 1
        while True:
            print("Is there a page %d? owo" % page_number)
            r = requests.get("%s/page/%d" % (url, page_number))
            if r.status_code == 200:
                print("Yeah! ;D")
                soup = BS(r.text, "html.parser")
                textareas = soup.findAll("textarea", attrs={"class": "donger"})
                kaomoji_list = [textarea.text for textarea in textareas]
                kaomoji[category].extend(kaomoji_list)
                kaomoji["all"].update({k: True for k in kaomoji_list})
            else:
                print("No. D;")
                break
            page_number += 1
    print("Success.")
    return kaomoji
def getStatusList(self, page): try: if(re.search('\<a href=status(\?top\=.+?)\>', page.text)): self.nextStatusUrl = self.statusUrl + str(re.findall('\<a href=status(\?top\=.+?)\>', page.text)[0]) else: return [] soup = BeautifulSoup(page.text, "html.parser") table = soup.findAll('table', {'class':'a'}) soup = BeautifulSoup(str(table), "html.parser") allStatus = soup.findAll('tr', {'align':'center'}) statusList = [] for status in allStatus: model = "(?<=\>)(.*?)(?=\<)" tempList = re.findall(model, str(status)) resuList = [] if(len(tempList) == 27): for i in self.unusalList: resuList.append(tempList[int(i)]) else: for i in self.usallyList: resuList.append(tempList[int(i)]) statusList.append(resuList) print("get statusList: " + str(len(statusList)) +" records") return statusList except: print("get statusList failure") return []
def search(self, domain):
    dnsdumpster_url = 'https://dnsdumpster.com/'
    s = requests.session()
    req = s.get(dnsdumpster_url)
    soup = BeautifulSoup(req.content, 'html.parser')
    csrf_middleware = soup.findAll('input', attrs={'name': 'csrfmiddlewaretoken'})[0]['value']
    self.display_message('Retrieved token: %s' % csrf_middleware)
    cookies = {'csrftoken': csrf_middleware}
    headers = {'Referer': dnsdumpster_url}
    data = {'csrfmiddlewaretoken': csrf_middleware, 'targetip': domain}
    req = s.post(dnsdumpster_url, cookies=cookies, data=data, headers=headers)
    if req.status_code != 200:
        print(
            u"Unexpected status code from {url}: {code}".format(
                url=dnsdumpster_url, code=req.status_code),
            file=sys.stderr,
        )
        return []
    if 'error getting results' in req.content.decode('utf-8'):
        print("There was an error getting results", file=sys.stderr)
        return []
    soup = BeautifulSoup(req.content, 'html.parser')
    tables = soup.findAll('table')
    res = {'domain': domain, 'dns_records': {}}
    res['dns_records']['dns'] = self.retrieve_results(tables[0])
    res['dns_records']['mx'] = self.retrieve_results(tables[1])
    res['dns_records']['txt'] = self.retrieve_txt_record(tables[2])
    res['dns_records']['host'] = self.retrieve_results(tables[3])
    return res
def google_image_results_parser(code):
    soup = BeautifulSoup(code)
    # initialize the result dict of lists
    whole_array = {'links': [], 'description': [], 'title': [], 'result_qty': []}
    # Links for all the search results
    for div in soup.findAll('div', attrs={'class': 'g'}):
        sLink = div.find('a')
        whole_array['links'].append(sLink['href'])
    # Search result description
    for desc in soup.findAll('span', attrs={'class': 'st'}):
        whole_array['description'].append(desc.get_text())
    # Search result title
    for title in soup.findAll('h3', attrs={'class': 'r'}):
        whole_array['title'].append(title.get_text())
    # Number of results
    for result_qty in soup.findAll('div', attrs={'id': 'resultStats'}):
        whole_array['result_qty'].append(result_qty.get_text())
    return build_json_return(whole_array)
def soupIt(self): http = urllib3.PoolManager() r = http.request("GET", self.url) soup = BeautifulSoup(r.data.decode('ISO-8859-1'), "lxml") self.title = soup.title.string # remove unused header parts # in comments because of firefox # for p in soup(["meta"]): # p.extract() # remove comments for element in soup(text=lambda text: isinstance(text, Comment)): element.extract() # remove some images unused_images = soup.find_all('img', {'alt': 'bullet'}) \ + soup.find_all('img', {'src': '../../images/ilmulislam.gif'}) \ + soup.find_all('img', {'src': '../../images/enzykopf.gif'}) for i in soup.find_all('img'): if i in unused_images: i.extract() # remove all links, but keep text # don't keep text for navigation links that don't lead to "begriffe" or "manuskripte" for l in soup.findAll('a'): if "begriffe" in urljoin(self.url, l['href']) or "manuskripte" in urljoin(self.url, l['href']): l.replaceWith(l.text) else: l.extract() # remove top blocks topBlocks = soup.findAll('td', {'width': '50%'}) for block in topBlocks: if len(block.findChildren('img')): self.images += block.findChildren('img') block.extract() # remove trash tags and empty tags for tag in soup.findAll(): if tag.name == "meta": continue if tag.name in ("td", "tr", "table", "center", "div", "font", "strong", "b"): tag.unwrap() if len(tag.text) == 0 or tag.text == '\n' or re.match(r'^\s*$', tag.text) or tag.is_empty_element or tag.isSelfClosing: tag.extract() for l in soup.find_all(text=re.compile('^\n')): l.extract() for l in soup.find_all(text=re.compile('\r\n')): l.replaceWith(" ") # append immages for i in self.images: soup.body.insert(0, i) return soup.prettify()
def getGoogleLinks(url):
    url = url.replace(" ", "+")
    url = "https://www.google.com/search?q=" + url + "&num=100&filter=0"
    results_arr = []
    br = mechanize.Browser()
    br.set_handle_robots(False)
    br.addheaders = [('User-agent', 'Firefox')]
    htmltext = br.open(url).read()
    soup = BeautifulSoup(htmltext)
    searchres = soup.findAll('div', attrs={"id": "search"})
    searchtext = str(searchres[0])
    soup1 = BeautifulSoup(searchtext)
    lis = soup1.findAll("li")
    regex = "q(?!.*q).*?&"
    pattern = re.compile(regex)
    for li in lis:
        soup3 = BeautifulSoup(str(li))
        links = soup3.findAll('a')
        rep = links[0]
        results = re.findall(pattern, str(rep))
        if len(results) > 0:
            if "http" in str(results[0]):
                results_arr.append(str(results[0].replace("q=htt", "htt").replace("&", "")))
        # print links[0]
    return results_arr
def getFaculty_det(reg_no = "", pwd = "", emp_id = ""): br = login(reg_no,pwd) print br.geturl() if br.geturl() == ("https://academics.vit.ac.in/student/stud_home.asp") or br.geturl() == ("https://academics.vit.ac.in/student/home.asp"): print "SUCCESS" br.open("https://academics.vit.ac.in/student/official_detail_view.asp?empid=%(id)s" % {"id" : emp_id }) response = br.open("https://academics.vit.ac.in/student/official_detail_view.asp?empid=%(id)s" % {"id" : emp_id }) soup = BeautifulSoup(response.get_data()) img = soup.findAll('img') #fac_img = "https://academics.vit.ac.in/student/"+img[0]['src']+"?" tables = soup.findAll('table') myTable = tables[1] rows = myTable.findChildren(['th','tr']) rows = rows[1:10] data = [] facDet = {} for row in rows: cells = row.findChildren('td') cells = cells[1] value = cells.string data.append(value) try: myTable = tables[2] except IndexError: facDet = {"name" : data[0], "school" : data[1], "destination" : data[1], "venue" : data[1], "intercom" : data[1], "email" : data[1], "division" : data[1], "additional_role" : data[1]} else: rows = myTable.findChildren(['th','tr']) rows = rows[1:4] openhr = [] for row in rows: rowdata = [] cells = row.findChildren('td') for cell in cells: value = cell.string rowdata.append(value) openhr.append(rowdata) facDet = {"name" : data[0], "school" : data[1], "destination" : data[1], "venue" : data[1], "intercom" : data[1], "email" : data[1], "division" : data[1], "additional_role" : data[1], "openhr_details" : openhr} return {"status" : "Success" ,"details" : facDet} else : print "FAIL" return {"status" : "Failure"}
def exclaim_album_scrape(num_pages=2, section_url='Album_EP/Page/'):
    BASE_URL = "http://exclaim.ca/music/Reviews/"
    linklist = []
    artistalbumlist = []
    for page in range(1, num_pages + 1):
        url = BASE_URL + section_url + str(page)
        req = urllib2.Request(url, headers=hdr)
        html = urllib2.urlopen(req).read()
        soup = BeautifulSoup(html, "lxml")
        heads = soup.findAll('h4')
        artists = [s.contents[0] for s in heads]
        albums = [s.contents[0].strip() for s in soup.findAll('span', {'class': 'streamSingle-item-details'})]
        info = zip(artists, albums)
        artistalbumlist.extend(info)
        links = [s.findAll('a') for s in soup.findAll('ul', {'class': 'streamSingle'})]
        links = [s['href'] for s in links[0] if s]
        if set(links) <= set(linklist):
            return artistalbumlist, linklist
        linklist.extend(links)
    return artistalbumlist, linklist
def get_movie_list(self, winner=True):
    if winner == True:
        url = "http://www.imdb.com/search/title?count=10000&groups=oscar_winners&title_type=feature&sort=year,desc&view=simple"
    else:
        url = "http://www.imdb.com/search/title?count=10000&groups=oscar_nominees&title_type=feature&sort=year,desc&view=simple"
    # url = "https://en.wikipedia.org/wiki/List_of_Academy_Award-winning_films"
    html = self.get_url(url)
    soup = BeautifulSoup(html, "lxml")
    span_list = soup.findAll("span", {"class": "lister-item-header"})
    title_list = soup.find_all(href=re.compile("/title/tt.*\?ref_=adv_li_tt"))
    year_list = soup.findAll("span", {"class": "lister-item-year text-muted unbold"})
    oscar_list = list()
    for i in range(0, len(title_list)):
        title = re.sub('<.*?>', "", str(title_list[i]))
        year = re.sub('<.*?>', "", str(year_list[i]))
        year = re.sub('[a-zA-Z]|\s|\(|\)', '', year)
        d = {'title': title, 'year': year}
        oscar_list.append(d)
    if winner == True:
        self.winner_list = oscar_list
    else:
        self.nominated_list = oscar_list
def scrape_lyrics_from_url(url):
    """Scrape lyrics from a URL. If no lyrics can be found, return None
    instead.
    """
    from bs4 import BeautifulSoup, Comment
    html = fetch_url(url)
    if not html:
        return None
    soup = BeautifulSoup(html)
    for tag in soup.findAll('br'):
        tag.replaceWith('\n')
    # Remove non relevant html parts
    [s.extract() for s in soup(['head', 'script'])]
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    [s.extract() for s in comments]
    try:
        for tag in soup.findAll(True):
            tag.name = 'p'  # keep tag contents
    except Exception, e:
        log.debug('Error %s when replacing containing marker by p marker' % e, exc_info=True)
def scrape(url):
    home = 'http://www.moray.gov.uk/'
    datePattern = r'[0-9][0-9]-[0-9][0-9]-20[0-9][0-9]'
    departments = r'(Chief Executive\'s Office|Corporate Services|Education and Social Care|Environmental Services|Multiple Services)'
    html = urllib2.urlopen(url).read()
    soup = BeautifulSoup(html)
    links = soup.findAll('a', href=True)
    for l in links:
        if l.string is not None:
            # print l.string
            if re.search(departments, l.string) is not None:
                page = urllib2.urlopen(home + l['href']).read()
                pSoup = BeautifulSoup(page)
                pLinks = pSoup.findAll('a', href=True)
                for pl in pLinks:
                    if pl.string is not None:
                        try:
                            if re.search(datePattern, pl.string):
                                # print pl.string + ' : ' + pl['href']
                                foi = urllib2.urlopen(home + pl['href']).read()
                                foiSoup = BeautifulSoup(foi)
                                bill = foiSoup.find('div', {'class': 'boxj_txt_ara'})
                                if bill is not None:
                                    print bill.p
                        except UnicodeEncodeError:
                            pass
def analyze_iframe(self,iframe_ana): iframe_child_ans=[] iframe_size=[] object_cnt=0 embed_cnt=0 h = iframe_ana.get('height') w = iframe_ana.get('width') try: if((h.isdigit()) and (w.isdigit())): iframe_size.append(h) iframe_size.append(w) elif((len(h)>0) and (len(w)>0)): iframe_escaped = True except Exception: pass new_url = iframe_ana.get('src') file_type = self.getContentType(new_url) #checking if the url points to an html page if('html' in file_type): iframe_child_ans.append(iframe_size) child_get = urllib2.urlopen(new_url).read() child_dom = BeautifulSoup(child_get) object_data = child_dom.findAll('object') embed_data = child_dom.findAll('embed') for i in object_data: object_cnt = int(object_cnt)+1 for i in embed_data: embed_cnt = int(embed_cnt)+1 iframe_child_ans.append(object_cnt) iframe_child_ans.append(embed_cnt) iframe_src = iframe_ana.get('src') iframe_child_ans.append(iframe_src) return iframe_child_ans else: return "not_dynamic"
def _getAllHrefsFromPage(self, url, pageSource):
    '''Parse the HTML source and collect every link on the page. Returns a list of links.'''
    # print 'ok3'
    hrefs = []
    soup = BeautifulSoup(pageSource)
    # print 'soup=', soup
    # print results
    # 1. as <a href=http://www.example.com></a>
    results = soup.findAll('a', href=True)
    for a in results:
        # The link must be encoded as utf8: Chinese file links such as
        # http://aa.com/文件.pdf are not URL-encoded automatically by bs4,
        # which would otherwise cause an encode exception.
        href = a.get('href').encode('utf8')
        if not href.startswith('http'):
            href = urljoin(url, href)  # resolve relative links
        if href not in hrefs:
            hrefs.append(href)
    # 2. as <form action=http://www.example.com></form>
    results = soup.findAll('form', action=True)
    for form in results:
        href = form.get('action').encode('utf8')
        if not href.startswith('http'):
            href = urljoin(url, href)  # resolve relative links
        if href not in hrefs:
            hrefs.append(href)
    return hrefs
def mouthsnap_spider(max_pages):
    page = 1
    while page <= max_pages:
        # page = 1
        url = 'http://www.mouthshut.com/product-reviews/Snapdeal-com-reviews-925602969-sort-MsDate-order-d-page-' + str(page)
        print(url)
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text)
        # The review links on each page use the ids ...rptreviews_ctl01_lnkTitle
        # through ...rptreviews_ctl10_lnkTitle, so iterate over the numeric suffix
        # instead of repeating the same block ten times.
        id_template = ('ctl00_ctl00_ctl00_ContentPlaceHolderHeader_ContentPlaceHolderFooter_'
                       'ContentPlaceHolderBody_rptreviews_ctl%02d_lnkTitle')
        for n in range(1, 11):
            for link in soup.findAll('a', {'id': id_template % n}):
                href = link.get('href')
                title = link.string
                # print(href)
                print(title)
        page += 1
def parseHTML(html):
    # cooking some soup
    bsObj = BeautifulSoup(html, "html.parser")
    # the main results table is the second on the page
    resultsTable = bsObj.findAll("table")[1]
    # the results stats table is the third on the page
    statsTable = bsObj.findAll("table")[2]
    # get rows from the various tables in a format that's easy to iterate on
    resultsRows = scrapeTable(resultsTable)
    analysisRows = scrapeTable(statsTable)
    # outputting the main results table
    print "\n\n"
    for row in resultsRows:
        if len(row) > 0:
            print "%50s%5s%5s%5s" % (row[1], row[2], row[3], row[4])
    print "----------------------------------------------------------------------\n"
    print "\n\n"
    # outputting the results analysis table
    print "%40s%15s%15s" % (" ", "SEMESTER", "CUMULATIVE")
    for row in analysisRows:
        if len(row) > 0:
            print "%40s%15s%15s" % (row[1], row[2], row[3])
def make_quiz(source, destination):
    """ Extracting from bjc file """
    filename = source.rsplit('/', 1)[1]
    test_path = source
    soup = BeautifulSoup(open(test_path))
    """ make sure this is a multiple choice quiz """
    if soup.find("div", {"class": "prompt"}) == None:
        return
    prompt = ((soup.find("div", {"class": "prompt"}).get_text()).encode('utf-8', "ignore")).strip()
    correct_answer_tag = soup.find("div", {"class": "correctResponse"})
    correct_answer = ((soup.find(identifier=correct_answer_tag['identifier']).find("div", {"class": "text"}).get_text()).encode('utf-8', "ignore")).strip()
    answer_list_unf = soup.findAll("div", {"class": "text"})
    answer_list = []
    for a in answer_list_unf:
        answer_list.append(((a.get_text()).encode('utf-8', "ignore")).strip())
    feedback_list_unf = soup.findAll("div", {"class": "feedback"})
    feedback_list = []
    for f in feedback_list_unf:
        feedback_list.append(((f.get_text()).encode('utf-8', "ignore")).strip())
    """ Formatting for xml """
    xml_mul = ""
    for answer in answer_list:
        if answer == correct_answer:
            xml_mul += "<choice correct=\"true\">" + str(answer) + "</choice>\n"
        else:
            xml_mul += "<choice correct=\"false\">" + str(answer) + "</choice>\n"
    xml_out = "<problem>\n" + \
        "<p>" + str(prompt) + "</p>\n" + \
        "<multiplechoiceresponse>\n" + \
        " <choicegroup type=\"MultipleChoice\">\n" + \
        str(xml_mul) + \
        " </choicegroup>\n" + \
        "</multiplechoiceresponse>\n\n" + \
        "<solution>\n" + \
        "<div class=\"detailed-solution\">\n" + \
        "<p>Explanation</p>\n" + \
        "<p>" + str(feedback_list[answer_list.index(correct_answer)]) + "</p>\n" + \
        "</div>\n" + \
        "</solution>\n" + \
        "</problem>\n"
    output = destination + '/problem/' + filename[:-5] + ".xml"
    # print(output)
    with open(output, 'w+') as xml_file:
        xml_file.write(xml_out)
def TSTVShows(params): try: html = BeautifulSoup(re.sub('\s+', ' ', HTML(params['url']))) try: mode = params['search'] tvshows = html.findAll('li') if len(tvshows) > 0: for tvshow in tvshows: XBMCItemAdd({'title':tvshow.a.string.encode('utf-8')}, { 'func' : 'TSSeasons', 'title': tvshow.a.string.encode('utf-8'), 'url' : tvshow.a['href'] }) XBMCEnd() else: Noty('TS.KG', 'Видео не найдено', ImagePath('noty-tskg.png')) except: tvshows = html.findAll('div', attrs={'class':'categoryblocks'}) if len(tvshows) > 0: for tvshow in tvshows: XBMCItemAdd({'title':tvshow.a.img['title'].encode('utf-8'), 'thumb':tvshow.a.img['src']}, { 'func' : 'TSSeasons', 'title': tvshow.a.img['title'].encode('utf-8'), 'url' : tvshow.a['href'], 'thumb': tvshow.a.img['src'] }) XBMCEnd() else: Noty('TS.KG', 'Видео не найдено', ImagePath('noty-tskg.png')) except: Noty('TS.KG', 'Сервер недоступен', ImagePath('noty-tskg.png'))
def replaceURL(URL, OUTPUT):
    # Provide user feedback
    print("[+] Replacing URLs...")
    print("[+] URLs that will be replaced:")
    # Open source, read lines, and begin parsing to replace all URLs inside <a> tags with href
    try:
        # Print href URLs that will be replaced
        print("\n".join(re.findall('<a href="?\'?([^"\'>]*)', open(OUTPUT).read())))
        with open(OUTPUT, "r") as html:
            # Read in the source html and parse with BeautifulSoup
            soup = BeautifulSoup(html)
            # Find all links and replace URLs with our new text/URL
            for link in soup.findAll('a', href=True):
                link['href'] = '{{links.phishgate}}'
            for link in soup.findAll('link', href=True):
                link['href'] = urllib.parse.urljoin(URL, link['href'])
            for link in soup.findAll('script', src=True):
                link['src'] = urllib.parse.urljoin(URL, link['src'])
            source = soup.prettify()
            source = xml.sax.saxutils.unescape(source)
            # Write the updated URLs to the output file while removing the [' and ']
            output = open(OUTPUT, "w")
            output.write(source.replace('[', '').replace(']', ''))
            output.close()
            print("[+] URL parsing successful. URLs replaced.")
    except:
        print("[-] URL parsing failed. Make sure the html file exists and is readable.")
def validate_jobposting(url):
    content = requests.get(url, verify=False).content
    soup = BeautifulSoup(content)
    # Look for any of the 3 types of JobPosting markups
    job_posting_found = []
    # Case 1: Microdata
    job_posting_found.append(
        soup.findAll('div', {'itemtype': 'http://schema.org/JobPosting'})
    )
    # Case 2: RDFa
    job_posting_found.append(
        soup.findAll('div', {
            'vocab': 'http://schema.org/',
            'typeof': 'JobPosting',
        })
    )
    # Case 3: JSON-LD
    ld_jsons = soup.findAll('script', {
        'type': 'application/ld+json',
    })
    for ld in ld_jsons:
        ld_json = json.loads(ld.string)
        job_posting_found.append(ld_json.get("@type", '') == "JobPosting")
    return any(job_posting_found)
def btdigg_page(query, sort, page):
    from bs4 import BeautifulSoup
    from xbmctorrent.utils import url_get
    html_data = url_get("%s/search" % BASE_URL, headers=HEADERS, params={
        "order": sort,
        "q": query,
        "p": page,
    })
    soup = BeautifulSoup(html_data, "html5lib")
    name_nodes = soup.findAll("td", "torrent_name")
    attr_nodes = soup.findAll("table", "torrent_name_tbl")[1::2]
    for name_node, attr_node in zip(name_nodes, attr_nodes):
        attrs = attr_node.findAll("span", "attr_val")
        title = "%s (%s, DLs:%s)" % (name_node.find("a").text, attrs[0].text, attrs[2].text)
        yield {
            "label": title,
            "path": plugin.url_for("play", uri=attr_node.find("a")["href"]),
            "is_playable": True,
        }
    yield {
        "label": ">> Next page",
        "path": plugin.url_for("btdigg_page", query=query, sort=sort, page=int(page) + 1),
        "is_playable": False,
    }
def hores(direccio):
    llista_hores = []
    # The flight listing is split into four 6-hour blocks (tp=0, 6, 12, 18),
    # so fetch each block in turn instead of repeating the same code four times.
    for tp in (0, 6, 12, 18):
        url = "http://www.barcelona-airport.com/cat/" + direccio + ".php?tp=" + str(tp)
        sock = urllib.urlopen(url)
        pagina = BeautifulSoup(sock.read(), "lxml")
        linies_vol = pagina.findAll("div", {"id": "flight_detail"})
        for linea_vol in linies_vol:
            hores = linea_vol.findAll("div", {"id": "fhour"})
            for hora in hores:
                llista_hores.append(hora.text.strip())
    return llista_hores
IDX.append(index) A.append(team) B.append(abbr) url = "https://www.reddit.com/r/CFB/wiki/abbreviations" print("Scrape Abbreviations Tool") print("**************************") print("data is from {0}".format(url)) print("Directory Location: {0}".format(settings.data_path)) print("**************************") with contextlib.closing(urlopen(url)) as page: soup = BeautifulSoup(page, "html5lib") tables = soup.findAll("table") IDX = [] A = [] B = [] # Add any Missing Teams Here AddSchool("ALABAMA-BIRMINGHAM", "UAB") AddSchool("ALABAMA A&M", "AAMU") AddSchool("ALBANY-NY", "ALBY") AddSchool("WESTERN KENTUCKY", "WKU") # Add any Missing Teams Here for row in tables[0].findAll("tr"): col = row.findAll('td') if len(col) > 0: tag = str(col[0].find(text=True)).strip()
def get_rent_perregion(city, district): baseUrl = u"http://%s.lianjia.com/" % (city) url = baseUrl + u"zufang/%s/" % district # logging.info("checking url: %s", url) source_code = misc.get_source_code(url) soup = BeautifulSoup(source_code, 'lxml') if check_block(soup): return total_pages = misc.get_total_pages(url) if total_pages == None: row = model.Rentinfo.select().count() raise RuntimeError("Finish at %s because total_pages is None" % row) for page in range(total_pages): if page > 0: url_page = baseUrl + u"zufang/%s/pg%d/" % (district, page) source_code = misc.get_source_code(url_page) soup = BeautifulSoup(source_code, 'lxml') i = 0 log_progress("GetRentByRegionlist", district, page + 1, total_pages) data_source = [] for ultag in soup.findAll("ul", {"class": "house-lst"}): for name in ultag.find_all('li'): i = i + 1 info_dict = {} try: housetitle = name.find("div", {"class": "info-panel"}) info_dict.update( {u'title': housetitle.h2.a.get_text().strip()}) info_dict.update({u'link': housetitle.a.get("href")}) houseID = name.get("data-housecode") info_dict.update({u'houseID': houseID}) region = name.find("span", {"class": "region"}) info_dict.update({u'region': region.get_text().strip()}) zone = name.find("span", {"class": "zone"}) info_dict.update({u'zone': zone.get_text().strip()}) meters = name.find("span", {"class": "meters"}) info_dict.update({u'meters': meters.get_text().strip()}) other = name.find("div", {"class": "con"}) info_dict.update({u'other': other.get_text().strip()}) subway = name.find("span", {"class": "fang-subway-ex"}) if subway == None: info_dict.update({u'subway': ""}) else: info_dict.update( {u'subway': subway.span.get_text().strip()}) decoration = name.find("span", {"class": "decoration-ex"}) if decoration == None: info_dict.update({u'decoration': ""}) else: info_dict.update({ u'decoration': decoration.span.get_text().strip() }) heating = name.find("span", {"class": "heating-ex"}) if decoration == None: info_dict.update({u'heating': ""}) else: info_dict.update( {u'heating': heating.span.get_text().strip()}) price = name.find("div", {"class": "price"}) info_dict.update( {u'price': int(price.span.get_text().strip())}) pricepre = name.find("div", {"class": "price-pre"}) info_dict.update( {u'pricepre': pricepre.get_text().strip()}) except Exception as e: traceback.print_exc() continue # Rentinfo insert into mysql data_source.append(info_dict) # model.Rentinfo.insert(**info_dict).upsert().execute() with model.database.atomic(): if data_source: logging.info("checking rent info: %s", ''.join(data_source)) logging.info("inserting rent info into db") model.Rentinfo.insert_many(data_source).upsert().execute() time.sleep(1)
def get_house_perregion(city, district): baseUrl = u"http://%s.lianjia.com/" % (city) url = baseUrl + u"ershoufang/%s/" % district source_code = misc.get_source_code(url) soup = BeautifulSoup(source_code, 'lxml') if check_block(soup): return total_pages = misc.get_total_pages(url) if total_pages == None: row = model.Houseinfo.select().count() raise RuntimeError("Finish at %s because total_pages is None" % row) for page in range(total_pages): if page > 0: url_page = baseUrl + u"ershoufang/%s/pg%d/" % (district, page) source_code = misc.get_source_code(url_page) soup = BeautifulSoup(source_code, 'lxml') i = 0 log_progress("GetHouseByRegionlist", district, page + 1, total_pages) data_source = [] hisprice_data_source = [] for ultag in soup.findAll("ul", {"class": "sellListContent"}): for name in ultag.find_all('li'): i = i + 1 info_dict = {} try: housetitle = name.find("div", {"class": "title"}) info_dict.update( {u'title': housetitle.a.get_text().strip()}) info_dict.update({u'link': housetitle.a.get('href')}) houseID = housetitle.a.get('data-housecode') info_dict.update({u'houseID': houseID}) houseinfo = name.find("div", {"class": "houseInfo"}) if city == 'bj': info = houseinfo.get_text().split('/') else: info = houseinfo.get_text().split('|') info_dict.update({u'community': info[0]}) info_dict.update({u'housetype': info[1]}) info_dict.update({u'square': info[2]}) info_dict.update({u'direction': info[3]}) info_dict.update({u'decoration': info[4]}) housefloor = name.find("div", {"class": "positionInfo"}) info_dict.update({u'years': housefloor.get_text().strip()}) info_dict.update({u'floor': housefloor.get_text().strip()}) followInfo = name.find("div", {"class": "followInfo"}) info_dict.update( {u'followInfo': followInfo.get_text().strip()}) taxfree = name.find("span", {"class": "taxfree"}) if taxfree == None: info_dict.update({u"taxtype": ""}) else: info_dict.update( {u"taxtype": taxfree.get_text().strip()}) totalPrice = name.find("div", {"class": "totalPrice"}) info_dict.update( {u'totalPrice': totalPrice.span.get_text()}) unitPrice = name.find("div", {"class": "unitPrice"}) info_dict.update( {u'unitPrice': unitPrice.get("data-price")}) except Exception as e: traceback.print_exc() continue # Houseinfo insert into mysql data_source.append(info_dict) hisprice_data_source.append({ "houseID": info_dict["houseID"], "totalPrice": info_dict["totalPrice"] }) # model.Houseinfo.insert(**info_dict).upsert().execute() #model.Hisprice.insert(houseID=info_dict['houseID'], totalPrice=info_dict['totalPrice']).upsert().execute() with model.database.atomic(): if data_source: model.Houseinfo.insert_many(data_source).upsert().execute() if hisprice_data_source: model.Hisprice.insert_many( hisprice_data_source).upsert().execute() time.sleep(1)
def get_community_perregion(city, regionname=u'xicheng'): baseUrl = u"http://%s.lianjia.com/" % (city) url = baseUrl + u"xiaoqu/" + regionname + "/" source_code = misc.get_source_code(url) # logging.info('checking raw response') # print(source_code) soup = BeautifulSoup(source_code, 'lxml') if check_block(soup): return total_pages = misc.get_total_pages(url) if total_pages == None: row = model.Community.select().count() raise RuntimeError("Finish at %s because total_pages is None" % row) for page in range(total_pages): if page > 0: url_page = baseUrl + u"xiaoqu/" + regionname + "/pg%d/" % page logging.info("fetching from %s", url_page) source_code = misc.get_source_code(url_page) soup = BeautifulSoup(source_code, 'lxml') # logging.info("querying for page %d content", page) nameList = soup.findAll("li", {"class": "xiaoquListItem"}) # logging.info("checking community list length: %d", len(nameList)) i = 0 log_progress("GetCommunityByRegionlist", regionname, page + 1, total_pages) data_source = [] for name in nameList: # Per house loop i = i + 1 info_dict = {} try: communitytitle = name.find("div", {"class": "title"}) title = communitytitle.get_text().strip('\n') link = communitytitle.a.get('href') info_dict.update({u'title': title}) info_dict.update({u'link': link}) district = name.find("a", {"class": "district"}) info_dict.update({u'district': district.get_text()}) bizcircle = name.find("a", {"class": "bizcircle"}) info_dict.update({u'bizcircle': bizcircle.get_text()}) tagList = name.find("div", {"class": "tagList"}) info_dict.update({u'tagList': tagList.get_text().strip('\n')}) onsale = name.find("a", {"class": "totalSellCount"}) info_dict.update( {u'onsale': onsale.span.get_text().strip('\n')}) onrent = name.find("a", {"title": title + u"租房"}) info_dict.update( {u'onrent': onrent.get_text().strip('\n').split(u'套')[0]}) info_dict.update({u'id': name.get('data-housecode')}) price = name.find("div", {"class": "totalPrice"}) info_dict.update({u'price': price.span.get_text().strip('\n')}) communityinfo = get_communityinfo_by_url(link) for key, value in communityinfo.items(): info_dict.update({key: value}) info_dict.update({u'city': city}) # logging.info('community info: %s', json.dumps(info_dict)) except Exception as e: traceback.print_exc() continue # communityinfo insert into mysql data_source.append(info_dict) # model.Community.insert(**info_dict).upsert().execute() with model.database.atomic(): if data_source: # logging.info("checking data: %s", ''.join(data_source)) # logging.info("inserting community info into db") model.Community.insert_many(data_source).upsert().execute() # logging.info("insertion succeeds") time.sleep(1)
def get_sell_percommunity(city, communityname): baseUrl = u"http://%s.lianjia.com/" % (city) url = baseUrl + u"chengjiao/rs" + \ urllib2.quote(communityname.encode('utf8')) + "/" source_code = misc.get_source_code(url) soup = BeautifulSoup(source_code, 'lxml') if check_block(soup): return total_pages = misc.get_total_pages(url) if total_pages == None: row = model.Sellinfo.select().count() raise RuntimeError("Finish at %s because total_pages is None" % row) for page in range(total_pages): if page > 0: url_page = baseUrl + \ u"chengjiao/pg%drs%s/" % (page, urllib2.quote(communityname.encode('utf8'))) source_code = misc.get_source_code(url_page) soup = BeautifulSoup(source_code, 'lxml') log_progress("GetSellByCommunitylist", communityname, page + 1, total_pages) data_source = [] for ultag in soup.findAll("ul", {"class": "listContent"}): for name in ultag.find_all('li'): info_dict = {} try: housetitle = name.find("div", {"class": "title"}) info_dict.update({u'title': housetitle.get_text().strip()}) info_dict.update({u'link': housetitle.a.get('href')}) houseID = housetitle.a.get('href').split("/")[-1].split( ".")[0] info_dict.update({u'houseID': houseID.strip()}) house = housetitle.get_text().strip().split(' ') info_dict.update({u'community': communityname}) info_dict.update({ u'housetype': house[1].strip() if 1 < len(house) else '' }) info_dict.update({ u'square': house[2].strip() if 2 < len(house) else '' }) houseinfo = name.find("div", {"class": "houseInfo"}) info = houseinfo.get_text().split('|') info_dict.update({u'direction': info[0].strip()}) info_dict.update( {u'status': info[1].strip() if 1 < len(info) else ''}) housefloor = name.find("div", {"class": "positionInfo"}) floor_all = housefloor.get_text().strip().split(' ') info_dict.update({u'floor': floor_all[0].strip()}) info_dict.update({u'years': floor_all[-1].strip()}) followInfo = name.find("div", {"class": "source"}) info_dict.update( {u'source': followInfo.get_text().strip()}) totalPrice = name.find("div", {"class": "totalPrice"}) if totalPrice.span is None: info_dict.update( {u'totalPrice': totalPrice.get_text().strip()}) else: info_dict.update({ u'totalPrice': totalPrice.span.get_text().strip() }) unitPrice = name.find("div", {"class": "unitPrice"}) if unitPrice.span is None: info_dict.update( {u'unitPrice': unitPrice.get_text().strip()}) else: info_dict.update( {u'unitPrice': unitPrice.span.get_text().strip()}) dealDate = name.find("div", {"class": "dealDate"}) info_dict.update({ u'dealdate': dealDate.get_text().strip().replace('.', '-') }) except Exception as e: traceback.print_exc() continue # Sellinfo insert into mysql data_source.append(info_dict) # model.Sellinfo.insert(**info_dict).upsert().execute() with model.database.atomic(): if data_source: model.Sellinfo.insert_many(data_source).upsert().execute() time.sleep(1)
for _ in range(10000): afterstring = htmlstring actions.send_keys(Keys.PAGE_DOWN).perform() htmlstring = browser.page_source if afterstring == htmlstring: print ("ended scraping crack test one") actions.send_keys(Keys.PAGE_DOWN).perform() htmlstring = browser.page_source if afterstring == htmlstring: print ("--Scrapping End--") break time.sleep(3) #print(htmlstring) textdoc = io.open("gmapreview.txt", "w", encoding="utf-8") soup = BeautifulSoup(htmlstring,"lxml") mydivs = soup.findAll("div", {"class": "section-review-content"}) counter = 0 for a in mydivs: textdoc.write(str("\nReviewer name: "+a.find("div", class_="section-review-title").text)+" \n||Reviewer Detail: " + str(a.find("div", class_="section-review-subtitle").text) +" \n||Reviewerer Profile URL:"+ str(a.find("a").get('href'))) textdoc.write(" \n||" + a.find("span", class_="section-review-text").text+" \n|| " + a.find("span", class_="section-review-publish-date").text) textdoc.write("=========================================\n") counter = counter + 1 print ("Total reviews scraped:"+str(counter)) textdoc.close() #actions.send_keys(Keys.PAGE_DOWN).perform() #browser.execute_script('')
def change_label_number(): strLabel = tk.Label(window, text='處理中...') strLabel.pack(anchor='center') window.update() global url global zipfileName global comboExample comboExampleget = fileTypeListbox.get(fileTypeListbox.curselection()) url = 'https://www.fda.gov/MedicalDevices/ProductsandMedicalProcedures/DeviceApprovalsandClearances/510kClearances/ucm089428.htm' req = requests.get(url) soup = BeautifulSoup(req.text, 'html5lib') OBDataUrl = { i.a.text: i.a['href'] for i in soup.findAll('p') if i.find(text=re.compile('-')) } strLabel2 = tk.Label(window, text='Downloads 510K Data.') strLabel2.pack(anchor='center') window.update() for K, v in tqdm(OBDataUrl.items(), total=len(OBDataUrl), ascii=True, desc='Downloads 510K Data.'): urllib.request.urlretrieve(v, K) strLabel3 = tk.Label(window, text='Downloads 510K Data Done.') strLabel3.pack(anchor='center') window.update() all510kdatalist = [] for j in tqdm(OBDataUrl, ascii=True, desc='Loading 510K Data'): with zipfile.ZipFile(j, 'r') as zipFile: txtfile = j.lower().replace('.zip', '.txt') fileio = io.StringIO(zipFile.read(txtfile).decode('cp1252')) test01 = pd.read_csv(fileio, sep='|', encoding='utf8') all510kdatalist.extend(test01.to_dict('records')) strLabel4 = tk.Label( window, text='Loading 510K Data to {}'.format(comboExampleget)) strLabel4.pack(anchor='center') window.update() all510kDF = pd.DataFrame(all510kdatalist) all510kDf = all510kDF.rename(dict( zip(all510kDF.columns, [i.title() for i in all510kDF.columns])), axis=1) with open('{}.txt'.format(str(len(all510kDF))), 'w') as txt: pass # print('510K 額外資訊merge') # # 510K 額外資訊merge # urllib.request.urlretrieve('http://www.accessdata.fda.gov/premarket/ftparea/foiclass.zip', 'foiclass.zip') # with zipfile.ZipFile('foiclass.zip', 'r') as zipFile: # fileio = io.StringIO(zipFile.read('foiclass.txt').decode('cp1252')) # test01 = pd.read_csv(fileio, sep='|', encoding='utf8') # test01.rename( # {'REVIEW_PANEL': 'Reviewadvisecomm', 'PRODUCTCODE': 'Productcode', 'DEVICENAME': 'DEVICENAME_ADJ'}, # axis=1, inplace=True) # full510k = pd.merge(all510kDf, test01, how='left', on=['Reviewadvisecomm', 'Productcode']) try: filetypesSelect(all510kDf, '510k', comboExampleget, DateTimeSTR) window.quit() except Exception: window2 = tk.Tk() window2.title('錯誤提示') window2.geometry('400x300') error_Text = '' e_type, e_value, e_traceback = sys.exc_info() error_Text += f'''錯誤訊息如下: Errortype ==> {e_type.__name__} ErrorInfo ==> {e_value} ErrorFileName ==> {e_traceback.tb_frame.f_code.co_filename} ErrorLineOn ==> {e_traceback.tb_lineno} ErrorFunctionName ==> {e_traceback.tb_frame.f_code.co_name}''' with open('errorFileLog.log', 'w+') as errorFileLog: errorFileLog.write(error_Text) strLabel2 = tk.Label(window2, text='{}\n{}\n{}'.format(e_type, e_value, e_traceback)) strLabel2.pack(anchor='center') window2.mainloop() finally: pass
# Create an iterator that will cycle through the first 16 articles and skip a few
listIterator = []
listIterator[:] = range(2, 16)

# Print out the results to screen
for i in listIterator:
    print '<h3>' + findPatTitle[i] + '</h3><br />'  # The title
    print "<a href ='" + findPatLink[i] + "'>Original Article</a><br />"  # The link to the original article
    print '\n'
    articlePage = urlopen(findPatLink[i]).read()  # Grab all of the content from the original article
    divBegin = articlePage.find('<div>')  # Locate the div provided
    article = articlePage[divBegin:(divBegin + 1000)]  # Copy the first 1000 characters after the div
    # Pass the article to the Beautiful Soup module
    soup = BeautifulSoup(article)
    # Tell Beautiful Soup to locate all of the p tags and store them in a list
    paragList = soup.findAll('p')
    # Print all of the paragraphs to screen
    for i in paragList:
        # i = cleanHtml(i)
        i = cleanHtmlRegex(i)
        print i + '<br />'

print '<br /></body></html>'
# encoding=utf8
import urllib
from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.request import Request

User_Agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0'
header = {}
header['User-Agent'] = User_Agent

url = 'http://www.kxdaili.com/ipList/1.html#ip'
req = Request(url, headers=header)
res = urlopen(req).read()
soup = BeautifulSoup(res)
ips = soup.findAll('tr')
f = open("../src/proxy.txt", "w")
for x in range(1, len(ips)):
    ip = ips[x]
    tds = ip.findAll("td")
    ip_temp = tds[0].contents[0] + "\t" + tds[4].contents[0] + "\n"
    # print(tds[2].contents[0] + "\t" + tds[3].contents[0])
    f.write(ip_temp)
f.close()
class Page(): def __init__(self): self.url = '' self.html = '' self.Ps = [] self.Spans = [] self.H1s = [] self.H2s = [] self.H3s = [] self.H4s = [] self.H5s = [] self.H6s = [] self.getalldone = False self.links = [] self.textlist = [] #run all the main methods in the required order def process(self, url): self.seturl(url) self.extract_html_page() self.getAllText() self.getLinks() self.cleantext() #allow to set the url to parse def seturl(self, url): #add url analysis to pre check self.url = url #extract HTML content using request and BeautifulSoup def extract_html_page(self): response = requests.get(self.url) self.html = BeautifulSoup(response.content, "html.parser") #extract all the <p> paragraphe def getP(self): AllP = self.html.findAll('p') for P in AllP: self.Ps.append(P.get_text()) #extract all the <span> paragraphe def getSpan(self): Allspan = self.html.findAll('span') for span in Allspan: self.Spans.append(span.get_text()) #extract all the <hn> titles def getH(self): for n in [n + 1 for n in range(6)]: AllH = self.html.findAll('h' + str(n)) for H in AllH: exec(f"self.H{str(n)}s.append(H.get_text())") #extract all P, span and Hn def getAllText(self): if self.getalldone == False: self.getP() self.getSpan() self.getH() self.getLinks() self.getalldone = True #combine all the text in one list def getAllTextCombined(self): self.getAllText() Alltext = [] for n in [n + 1 for n in range(6)]: AllH = [] exec(f"AllH = self.H{str(n)}s") for H in AllH: Alltext.append(H) for span in self.Spans: Alltext.append(span) for P in self.Ps: Alltext.append(P) return Alltext #extract all the links in the page def getLinks(self): for link in self.html.findAll( 'a'): #attrs={'href': re.compile("^http://")}): #print(link.get('href')) self.links.append(link.get('href')) #put the text into a list of sentence def cleantext(self): self.textlist = [ ' '.join(x.split()) for x in self.getAllTextCombined() ]
soup=BeautifulSoup(page.content,'html.parser') find_links=soup.find_all('a') finalarray=[] finallinkdict={} for link in find_links: if len(link.text)>0: if "INTRODUCING AWS" in str(link.text).upper() or "INTRODUCING AMAZON" in str(link.text).upper(): finallinkdict["https:"+str(link.attrs['href'])]={"title":link.text,"summary":""} #we want summarization as well. Take the links, iterate through them, get the text on the page, put it #in to the summarizer, and spit out a dictionary. for key in finallinkdict: temppage=requests.get(key) tempsoup=BeautifulSoup(temppage.content,'html.parser') paralist=tempsoup.findAll('p') tempsummary="" for paragraph in paralist: if paragraph.text.count('.')>0: tempsummary+=paragraph.text+" " fullsummary=summarizer(tempsummary)[0]['summary_text'] finallinkdict[key]['summary']=fullsummary finalarray.append(["Title","Summary","URL"]) for key in finallinkdict: temparray=[finallinkdict[key]['title'].replace(","," "),finallinkdict[key]['summary'].replace(","," "),key] finalarray.append(temparray) with open("AWS_"+year+"_NEW_SERVICES.csv","a") as f: writer=csv.writer(f)
    if(len(email) > 1):
        print(email[1])
        return email[1]
    return ""

# driver = webdriver.Firefox()
# driver.get('https://hls.harvard.edu/faculty/directory/?l=l')
# print(driver.find_element_by_class_name("faculty-detail-link"))
# content = driver.page_source

f = open("polis.txt", "a")
res = requests.get("https://www.polis.cam.ac.uk/Staff_and_Students/academic-staff")
content = res.content
# print(content)
soup = BeautifulSoup(content)
divs = soup.findAll('div', attrs={'class': 'emailAddress'})
emails = []
# print(divs)
for div in divs:
    # name = a.find('div', attrs={'class':'sfljd'})
    try:
        text = div.find('a').get('href')
        email = print_mail(text)
        emails.append(email)
        f.write(email + "\n")
    except:
        print("cant read")
# print(emails)
# loop = asyncio.get_event_loop()
## Alex Gagliano
## 10/24/2016
## Script for scraping the major OSS projects on OpenHub and pulling their ActivityFacts objects
from bs4 import BeautifulSoup
import urllib

wf = open('OpenDuckProjects_ActivityScrape.txt', 'a')
projectNames = list()

r = urllib.urlopen('https://www.openhub.net/').read()
soup = BeautifulSoup(r, "lxml")
Soup1 = soup.findAll("div", {"class": "top_ten_link"})[11:20]
for ana in Soup1:
    projectNames.append(str(ana.a.get('href')).replace("/p/", ""))

for name in projectNames:
    # pull data from webpage
    r = urllib.urlopen(
        'https://www.openhub.net/projects/' + name +
        '/analyses/latest/activity_facts.xml?api_key=d32768dd2ec65efd004d19a9f3c7262d7f30cd8959d9009ce4f9b8e7e19ff0ef&v=1'
    ).read()
    soup = BeautifulSoup(r, "lxml")
    for item in soup.findAll('activity_fact'):
        tempDate = str(item('month')[0].text)
        tempCommentsA = str(item('comments_added')[0].text)
        tempCommentsR = str(item('comments_removed')[0].text)
        tempCodeA = str(item('code_added')[0].text)
        tempCodeR = str(item('code_removed')[0].text)
        tempCommits = str(item('commits')[0].text)
print("Opened queue: %s" % queue_url) while True: print("Attempting to receive messages") response = sqs.receive_message(QueueUrl=queue_url, MaxNumberOfMessages=1, WaitTimeSeconds=1) if not 'Messages' in response: print("No messages") continue message = response['Messages'][0] receipt_handle = message['ReceiptHandle'] url = message['Body'] # parse the page html = requests.get(url) bsobj = BeautifulSoup(html.text, "lxml") # now find the planet name and albedo info planet = bsobj.findAll("h1", {"id": "firstHeading"})[0].text albedo_node = bsobj.findAll("a", {"href": "/wiki/Geometric_albedo"})[0] root_albedo = albedo_node.parent albedo = root_albedo.text.strip() # delete the message from the queue sqs.delete_message(QueueUrl=queue_url, ReceiptHandle=receipt_handle) # print the planets name and albedo info print("%s: %s" % (planet, albedo))
import requests
from bs4 import BeautifulSoup

sach = {'q': 'Python', 'users': '1000'}
url = 'http://b.hatena.ne.jp/search/text'
req = requests.get(url, params=sach, timeout=15)
print(req)

soup = BeautifulSoup(req.text, 'html.parser')
bookmarks = []
for b in soup.findAll('h3', {'class': ''}):
    title = b.find('a').get('title')
    url = b.find('a').get('href')
    bookmarks.append([title, url])
print(bookmarks)
kw = 'pokemon'  # search keyword; change it here
limit = 3  # number of images to keep low (the lower-ranked wallpapers are poor quality)
root_path = r'.\imgout\\' + str(kw)
j = 1  # global counter

# Main search loop over result pages, sorted by favourites
for i in range(1, 50):
    # URL parameters: sort by favourites, descending
    url = 'https://wallhaven.cc/search?q=' + kw + '&categories=111&purity=100&sorting=favorites&order=desc&page=' + str(i)
    html = requests.get(url, headers=header)
    html.encoding = chardet.detect(html.content)['encoding']
    text = html.text
    soup = BeautifulSoup(html.text, "html.parser")
    # First pass: collect the links to every image's detail page
    data = soup.findAll(name='a', attrs={"href": re.compile(r'^https://.*(w\/).*')})
    if data == '[]':
        continue  # to save time, skip this iteration if the search returned nothing
    for sn in data:
        time.sleep(random.randint(1, 5))  # random delay
        url1 = str(sn['href'])
        html1 = requests.get(url1, headers=header)
        html1.encoding = chardet.detect(html1.content)['encoding']
        text1 = html1.text
        soup1 = BeautifulSoup(html1.text, "html.parser")
        # Follow the detail-page URL built above to get the full image address
        data1 = soup1.findAll(name='img', attrs={"src": re.compile(r'^https://.*jpg$')})
        if data1 == '[]':
            continue
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = 'http://www.runoob.com/html/html-intro.html'
response = requests.get(url)
html = response.text.encode(response.encoding).decode()
# print(html)
soup = BeautifulSoup(html, 'lxml')
l = [x.text for x in soup.findAll('h2')]
# print(l)
df = pd.DataFrame(l, columns=[url])

# link = soup.findAll('a')[1]
# link.has_attr('href')
# print(link)
# print(link.attrs['href'])
links = [i for i in soup.findAll('a') if i.has_attr('href') and i.attrs['href'][0:5] == '/html']
# print(links)
relative_urls = set([i.attrs['href'] for i in links])
# print(relative_urls)
# relative_urls.to_excel('url.xlsx')
absolute_urls = {'http://www.runoob.com' + i for i in relative_urls}
absolute_urls.discard(url)
# print(absolute_urls)
for i in absolute_urls:
    ri = requests.get(i)
    soupi = BeautifulSoup(ri.text.encode(ri.encoding), "lxml")
    li = [x.text for x in soupi.findAll('h2')]
    dfi = pd.DataFrame(li, columns=[i])
    df = df.join(dfi, how='outer')
def xiaoqu_page_search(db_xq, url_page=u"https://gz.lianjia.com/zufang/tianhe/pg1/"):
    trytimes = 0
    while 1:
        try:
            req = urllib2.Request(url_page, headers=hds[random.randint(0, len(hds) - 1)])
            source_code = urllib2.urlopen(req, timeout=10).read()
            plain_text = unicode(source_code)  # ,errors='ignore')
            soup = BeautifulSoup(plain_text)
        except socket.timeout as e:
            if trytimes < 5:
                time.sleep(3)
                trytimes += 1
                continue
            else:
                print e
                exception_write(e, 'xiaoqu_page_search', url_page)
                return
        except (urllib2.HTTPError, urllib2.URLError) as e:
            print e
            exception_write(e, 'xiaoqu_page_search', url_page)
            return
        except Exception as e:
            print e
            exception_write(e, 'xiaoqu_page_search', url_page)
            return
        human = soup.find('div', {'class': 'human'})
        if not human:
            break
        else:
            print "block && wait"
            time.sleep(600)
            trytimes = 0
    xiaoqu_list = soup.findAll('li', {'data-el': 'zufang'})
    j = 0
    for j in range(len(xiaoqu_list)):
        xq = xiaoqu_list[j]
        try:
            info_dict = {}
            where = xq.find('div', {'class': 'where'})
            href = unicode(where.a['href'])
            longname = where.a.text
            info_dict[u'href'] = href
            info_dict[u'name'] = longname.strip()
        except Exception as e:
            print e
            exception_write(e, 'xiaoqu_page_search', str(j))
            continue
        print j
        try:
            command = gen_xiaoqu_insert_command(info_dict)
            db_xq.execute(command)
        except Exception as e:
            print e
            exception_write(e, 'xiaoqu_page_search_db', str(j))
            continue
def crawlSearchPage(nextPage, db):
    foundItems = set()
    while True:
        pageNumberMatch = PATTERN_URL_PAGE_NUMBER.search(nextPage)
        if pageNumberMatch:
            print("Mining: {}".format(pageNumberMatch.group(1)))
        else:
            print("Mining: 1 (initial)")
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:48.0) Gecko/20100101 Firefox/48.0'
            }
            html = requests.get(nextPage, headers=headers)
            if isRobotDetected(html.content.decode('utf-8')):
                return
        except Exception as e:
            print("Failed to urlopen with {}".format(e))
            return
        htmlContent = html.content
        hexDigestOfNextPage = hashlib.md5(nextPage.encode('utf-8')).hexdigest()
        print("{}: {}".format(hexDigestOfNextPage, nextPage))
        with open(os.environ['DIR-WORKING'] + "/{}.html".format(hexDigestOfNextPage), 'w') as outfile:
            outfile.write(htmlContent.decode("utf-8"))
        bsObj = BeautifulSoup(htmlContent, "html5lib")

        # iterate inside the single search page, over the items on the page
        for itemDiv in bsObj.findAll("div", {"class": re.compile(".*x-body--resultitem.*")}):
            # TODO: check with md5 for changes in the block
            summary = itemDiv.findAll("div", {"class": re.compile(".*g-col-9.*")})[0].get_text()
            aList = itemDiv.findAll("a", {"class": re.compile(".*result-item.*")})
            try:
                a = aList.pop()
            except Exception as e:
                print("Failed on URL: {} with {}".format(nextPage, e))
                continue
            id_ = a.attrs["data-ad-id"]
            if db.find_one({"id": id_}):
                # we assume that the items on the search page are ordered by creation time;
                # as a consequence, break out as soon as we see a known item.
                return foundItems
                # continue
            item = {
                "id": id_,
                "isMined": False,
                "summary": summary,
                "firstSeenOn": datetime.now(),
                "uri": a.attrs["href"]
            }
            db.insert(item)
            foundItems.add(a)

        # get the url of the next page of search results
        nextPageSpan = bsObj.findAll("span", {"class": re.compile(".*next-resultitems-page.*")})
        if len(nextPageSpan) <= 0:
            return foundItems
        nextPage = nextPageSpan[0].attrs["data-href"]
    return foundItems
    def get_data(self):
        return ''.join(self.fed)


def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()


r = requests.get("https://marvelcinematicuniverse.fandom.com/wiki/J.A.R.V.I.S./Quote")
soup = BeautifulSoup(r.text, 'html.parser')
quotes = soup.findAll("div", {"class": "quote"})

quotes_dict = {"quotes": []}
with open("jarvis_quotes.json", "w") as file:
    for quote in quotes:
        inner_quote = quote.dl.dd.span.i
        quotes_list = str(inner_quote).split("<br/>")
        for q in quotes_list:
            quotes_dict["quotes"].append({
                "quote": strip_tags(q).replace("\"", ""),
                "type": 0
            })
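    # Hedged completion (not in the original snippet): presumably the collected
    # dict is serialized into the file opened above; assumes `import json` at the
    # top of the script.
    json.dump(quotes_dict, file, indent=2)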
    def openFolder(self, folder_id, path='', parent='root'):
        response = self.br.open('%s%s' % (self.url['folder'], folder_id)).read()
        soup = BeautifulSoup(response, 'html.parser')
        data = soup.findAll('a', href=re.compile('^/(Folder|File|note|essay|LearningToolElement)/'))
        course_num = self.current_course_num
        course_data = self.courses[course_num]
        items = {}
        for item in data:
            item_link = item.get('href')
            item_name = item.text.strip()
            item_id = re.findall(r'\d+', item_link)[0]
            if item_id == parent:
                continue
            item_type = 'unknown'
            if 'Folder' in item_link:
                item_type = 'folder'
            if 'File' in item_link:
                item_type = 'file'
            if 'note' in item_link:
                item_type = 'note'
            if 'essay' in item_link:
                item_type = 'essay'
            if 'LearningToolElement' in item_link:
                item_type = 'tool'
            more_info = False
            if item_type == 'folder':
                course_data['num_folders'] = course_data['num_folders'] + 1
                new_path = path + '/' + item_name
                more_info = self.openFolder(item_id, new_path, folder_id)
            elif item_type == 'file':
                more_info = self.getFile(item_id)
            elif item_type == 'tool':
                more_info = self.getTool(item_id)
            elif item_type == 'note':
                more_info = self.getNote(item_id)
            elif item_type == 'essay':
                more_info = self.getEssay(item_id)
            act = u'Åpner' if item_type == 'folder' else 'Leser'
            stat = '%s \'%s\' (%s)' % (act, item_name, item_type)
            if not more_info:
                self.printStatus(stat + ' [' + color.yellow + 'uten innhold' + color.end + ']')
                continue
            self.printStatus(stat)
            items[item_id] = {
                'data': [item_name, item_type],
                'path': path
            }
            if len(more_info) > 0:
                items[item_id] = dict(items[item_id].items() + more_info.items())
            course_data['max_item_len'] = max(course_data['max_item_len'], len(u'' + item_name))
            course_data['tree_depth'] = max(course_data['tree_depth'], path.count('/'))
            if item_type in ['file', 'note', 'essay', 'tool']:
                course_data['num_items'] = course_data['num_items'] + 1
            time.sleep(self.cfg['loop_delay'])
        self.courses[course_num] = course_data
        return items
# Naver realtime search term crawling program
from bs4 import BeautifulSoup
import requests
from datetime import datetime

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}
url = "https://datalab.naver.com/keyword/realtimeList.naver?age=20s"
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
results = soup.findAll('span', 'item_title')
print(response.text)

search_rank_file = open("rankresult.txt", "a")

print(datetime.today().strftime("%Y년 %m월 %d일의 실시간 검색어 순위입니다.\n"))

rank = 1
for result in results:
    search_rank_file.write(str(rank) + "위:" + result.get_text() + "\n")
    print(rank, "위 : ", result.get_text(), "\n")
    rank += 1

search_rank_file.close()  # close the output file
import urllib.request
from bs4 import BeautifulSoup

print("Collecting data from IMDb charts....\n\n\n")
print("The current top 15 IMDb movies are the following: \n\n")

response = urllib.request.urlopen("http://www.imdb.com/chart/top")
html = response.read()
soup = BeautifulSoup(html, 'html.parser')
mytd = soup.findAll("td", {"class": "titleColumn"})
for titles in mytd[:15]:
    print(titles.find('a').text)
print("\n\nThank you for using the IMDb script ...")
# twitter crawling
import urllib.request
from bs4 import BeautifulSoup

theurl = "https://twitter.com/realDonaldTrump"
thepage = urllib.request.urlopen(theurl)
soup = BeautifulSoup(thepage, "html.parser")

# find the <title> tag and print it
# print(soup.title.text)  # this works as well
print(soup.find("title").text)

# find the <a> tags
# print(soup.findAll('a'))
"""
# print the value of the href attribute of each <a> tag
for link in soup.find_all('a'):
    # print(link.get('href'))
    print(link.text)  # .text seems to return the text part
"""

# when the markup is <div class="ProfileHeaderCard">
# print(soup.find('div', {"class": "ProfileHeaderCard"}))  # written like this when the div tag's class attribute is ProfileHeaderCard
# print(soup.find('div', {"class": "ProfileHeaderCard"}).find('p'))  # .find('p') finds the <p> tag inside it
# print(soup.find('div', {"class": "ProfileHeaderCard"}).find('p').text)  # .text returns the text part of the <p> tag

i = 1
for tweets in soup.findAll('div', {"class": "content"}):
    print(i, ": ", tweets.find('p'))
    # print(i, ": ", tweets.find('p').text)
    i = i + 1
    def sources(self, url, hostDict, hostprDict):
        try:
            sources = []
            if url == None or len(url) == 0:
                # if no link was returned by the movie and tvshow searches, nothing to do here, return out.
                log_utils.log('inif?')
                return sources

            # Grab title and year (can't use the imdb code here)
            url = urlparse.parse_qs(url)
            title = url['title'][0]
            year = url['year'][0]

            # Create search link
            searchlink = self.search_link = self.search_link + title + ' ' + year
            url = urlparse.urljoin(self.base_link, searchlink)

            # Get the HTML for the page
            html = self.scraper.get(url).content
            soup = BeautifulSoup(html, "html.parser")

            # Find all search results and add them to an array
            results = soup.findAll("div", {"class": "result-item"})
            result_links = []
            for result in results:
                result_links.append(result.find("a", href=True)['href'])

            # Go over the search results and find their sources
            for result_link in result_links:
                html = self.scraper.get(result_link).content
                soup = BeautifulSoup(html, "html.parser")
                javascripts = soup.findAll("script", {"type": "text/javascript"})

                # Keep going until we find the one we need
                ids_b64s = []
                for javascript in javascripts:
                    javascript = str(javascript)
                    if "var Player" in javascript and "LoadPlayer" in javascript:
                        # This is the right script
                        # Get the jwplayer-id
                        jw_id = 'jwplayer-' + re.search('jwplayer-(\d+)', javascript).groups(0)[0]
                        # Get the weird b64 string
                        b64_string = re.search('(?<=jwplayer)(.*)(?="\);)', javascript).groups(0)[0]
                        # Parse out just the b64
                        b64_string = b64_string.split('","')[1]
                        ids_b64s.append([jw_id, b64_string])
                        break

                # Go get the video links
                for id_b64 in ids_b64s:
                    the_id = id_b64[0]
                    the_b64 = id_b64[1]
                    post = {'id': the_id, 'data': the_b64}
                    html = self.scraper.post(
                        urlparse.urljoin(self.base_link, '/wp-content/plugins/apiplayer/load.php'),
                        data=post).content
                    soup = BeautifulSoup(html, 'html.parser')
                    javascripts = soup.findAll("script", {"type": "text/javascript"})

                    links_qual = []
                    for javascript in javascripts:
                        javascript = str(javascript)
                        if ").setup({" in javascript:
                            # This script contains the stuff
                            files = re.search('(?<=sources: \[)(.*)(?=])', javascript).groups()[0]
                            files = "[" + files + "]"
                            files = json.loads(files)
                            for f in files:
                                quality = f['label']
                                link = f['file']
                                links_qual.append([link, quality])

                    for l_q in links_qual:
                        link = l_q[0]
                        quality = l_q[1]
                        host = link.split('//')[1].replace('www.', '').split('/')[0]
                        info = ''
                        sources.append({
                            'source': host,
                            'quality': quality,
                            'language': 'en',
                            'url': link,
                            'info': info,
                            'direct': True,
                            'debridonly': False
                        })

            return sources
        except Exception as e:
            log_utils.log('EXCEPTION MSG: ' + str(e))
            return sources
def init_global_variables(website):
    r = requests.get(website)
    # r = requests.get("https://www.autocarindia.com")
    output = r.text
    soup = BeautifulSoup(output, 'lxml')

    # The two lines below extract the comments out of the parsed markup
    for element in soup(text=lambda text: isinstance(text, Comment)):
        element.extract()

    result = soup.findAll("html")  # result points to the top node <html>

    G = nx.DiGraph()  # empty graph with no nodes and no edges
    G.add_node(result[0].name)  # result[0].name --> html

    parent = result[0]      # <html><head></head><body><div></div><p></p></body></html>
    parents = [parent]      # [<html><head></head><body><div></div><p></p></body></html>]
    labels = [parent.name]  # ['html']
    edges = []
    i = 0
    for parent in parents:
        if hasattr(parent, 'contents'):
            for child in parent.contents:
                # These two lines take out the extra strings that would otherwise become nodes
                if isinstance(child, NavigableString):
                    continue
                if child.name != None:
                    node_name = child.name + str(i)
                else:
                    node_name = 'string' + str(i)
                i = i + 1
                G.add_node(node_name)
                G.add_edge(parent.name, node_name)
                x = (parent.name, node_name)
                # print(parent.name, node_name)
                # print(str(child.name) + " ---> " + str(child.contents))
                toadd = ""
                for abc in child.contents:
                    # print(str(type(abc)) + " -----> " + str(abc.string))
                    if isinstance(abc, NavigableString):
                        # print("TOADD ----> " + str(abc.string))
                        toadd = toadd + str(abc.string)
                if child.name != None:
                    element1 = str(child.name) + ': ' + node_name
                    child.name = node_name
                else:
                    element1 = toadd
                if hasattr(child, 'attrs'):
                    for item in child.attrs:
                        # print(item, child.attrs[item])
                        element1 = element1 + '<br>' + ' ' + item + ':' + ' ' + str(child.attrs[item])
                if child.string != None:
                    element1 = element1 + '<br>' + ' ' + 'string' + ':' + ' ' + str(child.string)
                elif toadd != "":
                    element1 = element1 + '<br>' + ' ' + 'string' + ':' + ' ' + toadd
                labels.append(element1)
                edges.append(x)
                parents.append(child)

    pos = nx.spiral_layout(G)
    # nx.draw(G, pos, with_labels=True, font_weight='bold')
    # print(parents)
    # plt.show()

    g = nx.Graph()
    g.add_nodes_from(parents)
    g.add_edges_from(edges)  # edges is the list of edges
    pos = nx.fruchterman_reingold_layout(g)

    # This part of the code eliminates the extra nodes that are present in the graph
    # I didn't know if you want those extra nodes or not..
    N = len(parents)  # ?
    counter = 0
    Xv = []
    Yv = []
    for k in pos.keys():
        if counter >= N:
            Xv.append(pos[k][0])
            Yv.append(pos[k][1])
        counter += 1

    Xed = []
    Yed = []
    for edge in edges:
        Xed += [pos[edge[0]][0], pos[edge[1]][0], None]
        Yed += [pos[edge[0]][1], pos[edge[1]][1], None]

    trace3 = Scatter(x=Xed, y=Yed,
                     mode='lines',
                     line=dict(color='rgb(210,210,210)', width=1),
                     hoverinfo='text')
    trace4 = Scatter(x=Xv, y=Yv,
                     mode='markers',
                     name='net',
                     marker=dict(symbol='circle-dot',
                                 size=5,
                                 color='#6959CD',
                                 line=dict(color='rgb(50,50,50)', width=0.5)),
                     text=labels,
                     hoverinfo='text')

    globals()['trace3'] = trace3
    globals()['trace4'] = trace4
    globals()['G'] = G
    globals()['g'] = g
    globals()['pos'] = pos
    globals()['parents'] = parents
    globals()['labels'] = labels
    globals()['edges'] = edges
    globals()['Xv'] = Xv
    globals()['Yv'] = Yv
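# A hedged sketch (not part of the original) of how the two traces built above
# could be rendered once init_global_variables() has run; the title and the
# hidden-axis layout are assumptions.
from plotly.graph_objs import Figure, Layout

def plot_dom_graph():
    layout = Layout(title='DOM tree of the scraped page',
                    showlegend=False,
                    xaxis=dict(visible=False),
                    yaxis=dict(visible=False))
    fig = Figure(data=[trace3, trace4], layout=layout)  # trace3 = edges, trace4 = nodes
    fig.show()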
class AbstractScraper:

    class Decorators:
        """
        Define decorators for AbstractScraper methods here.
        """

        @staticmethod
        def schema_org_priority(decorated):
            """
            Use the SchemaOrg parser with priority (if there's data in it).
            On exception raised - continue by default.
            If there's no data (no schema implemented on the site) - continue by default.
            """
            @functools.wraps(decorated)
            def schema_org_priority_wrapper(self, *args, **kwargs):
                function = getattr(self.schema, decorated.__name__)
                if not function:
                    raise SchemaOrgException(
                        "Function '{}' not found in schema".format(decorated.__name__))
                if not self.schema.data:
                    return decorated(self, *args, **kwargs)
                try:
                    value = function(*args, **kwargs)
                except SchemaOrgException:
                    return decorated(self, *args, **kwargs)
                return value or decorated(self, *args, **kwargs)
            return schema_org_priority_wrapper

        @staticmethod
        def bcp47_validate(decorated):
            @functools.wraps(decorated)
            def bcp47_validate_wrapper(self, *args, **kwargs):
                tag = tags.tag(decorated(self, *args, **kwargs))
                return str(tag) if tag.valid else None
            return bcp47_validate_wrapper

        @staticmethod
        def default_exception_handling(decorated):
            """
            As web scraping is unpredictable in nature, handle whatever
            exceptions may arise by returning default values.

            If you wish to handle exceptions on your own, you can pass the
            default_exception_handling=False flag.

            Example:
            from recipe_scrapers import scrape_me
            scraper = scrape_me('<recipe_url>', default_exception_handling=False)
            scraper.total_time()  # and etc.
            """
            @functools.wraps(decorated)
            def default_exception_handling_wrapper(self, *args, **kwargs):
                if self.default_exception_handling:
                    try:
                        return decorated(self, *args, **kwargs)
                    except:
                        on_exception_return = {
                            'title': '',
                            'total_time': 0,
                            'yields': '',
                            'image': '',
                            'ingredients': [],
                            'instructions': '',
                            'ratings': -1,
                            'reviews': None,
                            'links': [],
                            'language': 'en',
                        }
                        return on_exception_return.get(decorated.__name__)
                else:
                    return decorated(self, *args, **kwargs)
            return default_exception_handling_wrapper

    def __init__(self, url, test=False, meta_http_equiv=False, default_exception_handling=True):
        if test:  # when testing, we load a file passed in as url
            page_data = url.read()
        else:
            page_data = requests.get(url, headers=HEADERS).content
        self.default_exception_handling = default_exception_handling
        self.meta_http_equiv = meta_http_equiv
        self.soup = BeautifulSoup(page_data, "html.parser")
        self.schema = SchemaOrg(page_data)
        self.url = url
        # if self.schema.data:
        #     print("Class: %s has schema." % (
        #         self.__class__.__name__
        #     ))

    def url(self):
        return self.url

    def host(self):
        """ get the host of the url, so we can use the correct scraper """
        raise NotImplementedError("This should be implemented.")

    @Decorators.default_exception_handling
    @Decorators.schema_org_priority
    def title(self):
        raise NotImplementedError("This should be implemented.")

    @Decorators.default_exception_handling
    @Decorators.schema_org_priority
    def total_time(self):
        """ total time it takes to prepare the recipe, in minutes """
        raise NotImplementedError("This should be implemented.")

    @Decorators.default_exception_handling
    @Decorators.schema_org_priority
    def yields(self):
        """ the number of servings or items in the recipe """
        raise NotImplementedError("This should be implemented.")

    @Decorators.default_exception_handling
    @Decorators.schema_org_priority
    def image(self):
        """
        Image of the recipe.
        Try to fetch it from og:image if not implemented.
        """
        try:
            image = self.soup.find('meta', {
                'property': 'og:image',
                'content': True
            })
            return image.get('content')
        except AttributeError:  # image not found
            raise NotImplementedError("This should be implemented.")

    @Decorators.default_exception_handling
    @Decorators.bcp47_validate
    @Decorators.schema_org_priority
    def language(self):
        """
        Human language the recipe is written in.

        May be overridden by individual scrapers.
        """
        candidate_languages = set()
        html = self.soup.find('html', {'lang': True})
        candidate_languages.add(html.get('lang'))

        # Deprecated: check for a meta http-equiv header
        # See: https://www.w3.org/International/questions/qa-http-and-lang
        meta_language = self.soup.find(
            'meta', {
                'http-equiv': lambda x: x and x.lower() == 'content-language',
                'content': True
            }) if self.meta_http_equiv else None
        if meta_language:
            for language in meta_language.get('content').split(','):
                candidate_languages.add(language)
                break

        # If other languages exist, remove the 'en' commonly generated by HTML editors
        if len(candidate_languages) > 1 and 'en' in candidate_languages:
            candidate_languages.remove('en')

        # Return the first candidate language
        for language in candidate_languages:
            return language

    @Decorators.default_exception_handling
    @Decorators.schema_org_priority
    def ingredients(self):
        raise NotImplementedError("This should be implemented.")

    @Decorators.default_exception_handling
    @Decorators.schema_org_priority
    def instructions(self):
        raise NotImplementedError("This should be implemented.")

    @Decorators.default_exception_handling
    @Decorators.schema_org_priority
    def ratings(self):
        raise NotImplementedError("This should be implemented.")

    @Decorators.default_exception_handling
    def reviews(self):
        raise NotImplementedError("This should be implemented.")

    @Decorators.default_exception_handling
    def links(self):
        invalid_href = ('#', '')
        links_html = self.soup.findAll('a', href=True)
        return [link.attrs for link in links_html if link['href'] not in invalid_href]
else:
    colors.error("Please enter the correct URL ")
    sys.exit(0)

soup1 = BeautifulSoup(html, "lxml")  # Parsing the html data using BeautifulSoup
title = getting_header(soup1)        # Getting the title of the page
data.header = title                  # Storing title of the page as Project Title
colors.success("Repository Title : " + title)
time.sleep(1)

star_value = 0
watch_value = 0
fork_value = 0
a_tags = soup1.findAll("a")  # Finding all the 'a' tags in response html data.
for a_tag in a_tags:         # Finding total stargazers of the repository
    string = a_tag.get("href")
    if string.endswith("/watchers"):
        watch_value = (a_tag.get_text()).strip()
        watch_value = formated(watch_value)
        colors.success("Total watchers : " + watch_value)
        time.sleep(1)
        watch_value = int(watch_value)
    if string.endswith("/stargazers"):
        star_value = (a_tag.get_text()).strip()
        star_value = formated(star_value)
        colors.success("Total stargazers : " + star_value)
        time.sleep(1)
        star_value = int(star_value)
    if string.endswith("/members"):
# url = "https://www.youracclaim.com/user/robert-mapstead" print("\nThis program lists your badges from the Acclaim Badges Platform \n") name = input("Enter user name. Example: robert-mapstead >>> ") url = "https://www.youracclaim.com/user/" + name # Getting the webpage, creating a Response object. response = requests.get(url) # Extracting the source code of the page. data = response.text # Passing the source code to Beautiful Soup to create a BeautifulSoup object for it. soup = BeautifulSoup(data, 'lxml') # Extracting all the <a> tags whose class name is 'result-title' into a list. badges = soup.findAll('div', {'class': 'cr-standard-grid-item-content__title'}) # Extracting text from the the <a> tags, i.e. class badges. print() print( "Copy all of this text below OR copy the text from the badges.txt file on the left sidebar for use in your resume, CV, or Social Media Profiles: \n" ) fobj = open('badges.txt', 'a') #fobj.write(url+"\n") fobj.write(url) print("\nmy Badges:") for badge in badges: #print(badge.text) print(badge.text.rstrip()) with open('badges.txt', 'a') as f:
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 9 22:24:08 2018

@author: TS
"""
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup

path = '/home/tushar/Desktop/BTP/allparagraphs.txt'  #### Enter the path to your file here
wiki_url = 'https://en.wikipedia.org/wiki/Narendra_Modi'  ##### Enter the wiki url here

source_code = requests.get(wiki_url).text
soup = BeautifulSoup(source_code, 'html.parser')
a = soup.findAll('p')

allparagraphs = ''
for i in a:
    allparagraphs = allparagraphs + i.text + '\n'
    # print(i.text)
# print(allparagraphs)

allparagraphsFile = open(path, 'w')
allparagraphsFile.write(wiki_url[30:] + '\n')
allparagraphsFile.write(allparagraphs + '\n\n--------------------------------------')
allparagraphsFile.close()