def addSection(link, title): if not 'http' in link: page = urllib2.urlopen('http://www.paulgraham.com/'+link).read() soup = BeautifulSoup(page) soup.prettify() else: page = urllib2.urlopen(link).read() section = ez_epub.Section() try: section.title = title print section.title if not 'http' in link: font = str(soup.findAll('table', {'width':'435'})[0].findAll('font')[0]) if not 'Get funded by' in font and not 'Watch how this essay was' in font and not 'Like to build things?' in font and not len(font)<100: content = font else: content = '' for par in soup.findAll('table', {'width':'435'})[0].findAll('p'): content += str(par) for p in content.split("<br /><br />"): section.text.append(genshi.core.Markup(p)) #exception for Subject: Airbnb for pre in soup.findAll('pre'): section.text.append(genshi.core.Markup(pre)) else: for p in str(page).replace("\n","<br />").split("<br /><br />"): section.text.append(genshi.core.Markup(p)) except: pass return section
def convert_to_lca_style(filename):
    """Strip styling markup from <filename>.htm and write the cleaned
    document to <filename>.html.

    Removes <style>/<head> entirely, unwraps b/div/span/p (content kept),
    drops inline style attributes on <td>, and unwraps html/body.
    """
    with open(filename + '.htm', encoding='utf-8') as infile:
        soup = BeautifulSoup(infile)
    soup.prettify()
    # remove style and head tags entirely
    soup.style.decompose()
    soup.head.decompose()
    # unwrap b, div, span, p (content is kept).
    # BUG FIX: the original called soup.b.unwrap() etc. inside the loop,
    # which re-searches the tree each pass and raises AttributeError on None
    # if the counts drift; unwrap the tag the loop already holds.
    for tag in soup.find_all("b"):
        tag.unwrap()
    for tag in soup.find_all("div"):
        tag.unwrap()
    for tag in soup.find_all("span"):
        tag.unwrap()
    for tag in soup.find_all("p"):
        tag.unwrap()
    # td: drop inline style attributes
    for td in soup.find_all("td"):
        del td['style']
    # unwrap html and body (content is kept)
    soup.html.unwrap()
    soup.body.unwrap()
    # Files are now closed deterministically via context managers.
    with open(filename + '.html', mode='w', encoding='utf-8') as outfile:
        outfile.write(soup.prettify())
    return
def get_urls(url): data = get_page(url) soup = BeautifulSoup(data) soup.prettify() basic_url = "http://news.donga.com" contents = soup.findAll("p", { "class" : "title" }) for content in contents: if str(content) != '\n': data1 = str(content) soup1 = BeautifulSoup(data1) for link in soup1.findAll('p'): url = basic_url + str(link.contents[1]['href']) final_content = get_articles(url)[0] # 기사내용 final_content_length = get_articles(url)[1] # 기사길이 if len(link.contents[1].contents) != 0: # 기사제목이 없을때 에러방지 final_title = str(link.contents[1].contents[0]) else: final_title = "" for p in list(punctuation): final_title = final_title.replace(p,' ') final_title = final_title.replace("“",' ').replace("”",' ').replace("·",' ').replace("△",' ').replace("■",' ').replace("‘",' ').replace("’",' ').replace("…",' ').replace("▲",' ').replace("⊙",' ').replace("◇",' ').replace("▶",' ').replace("◆",' ') final_title = final_title.strip() final_title = re.sub(' ',' ',final_title) year = str(link.contents[3])[7:11] month = str(link.contents[3])[12:14] day = str(link.contents[3])[15:17] print year + "-" + month + "-" + day '''
def analyze(page): log("Analyzing, extracting ...") soup = BeautifulSoup(page) soup.prettify() word_item = MWordItem() # url url_tag = soup.find("meta", {"property": "og:url"}) word_item.source_url = url_tag["content"] # 单词 word_tag = soup.find("strong", class_="main_entry_word") word_item.word = word_tag.string # 词性 func_tag = soup.find("p", class_="word_function") word_item.func = func_tag.text # 释义 sense_tag_list = soup.find_all("span", class_="ssens") word_item.sense_list = [sense_tag.text[2:] for sense_tag in sense_tag_list] # example & do you know example_dyn_tag_list = soup.find_all("p", class_="word_example_didu") if len(example_dyn_tag_list) != 2: log('Should Contain both example & "do you know" parts, Please Check') else: word_item.example = example_dyn_tag_list[0].text story = example_dyn_tag_list[1].text word_item.story = re.sub(r"((Test Your Memory)|(Name That Synonym)).+$", r"", story) print word_item.story return word_item
def download_json_files():
    """Download upcoming-schedule JSON files listed on json.xmltv.se into
    /tmp/xmltv_convert/json, filtered by the global `channels` list.
    """
    if not os.path.exists('/tmp/xmltv_convert/json'):
        os.makedirs('/tmp/xmltv_convert/json')
    page = urllib2.urlopen('http://json.xmltv.se/')
    soup = BeautifulSoup(page)
    soup.prettify()
    for anchor in soup.findAll('a', href=True):
        if anchor['href'] != '../':
            try:
                anchor_list = anchor['href'].split("_")
                channel = anchor_list[0]
                filedate = datetime.datetime.strptime(anchor_list[1][0:10], "%Y-%m-%d").date()
            except IndexError:
                # Filenames without a date part (e.g. channels.js.gz) count as today.
                filedate = datetime.datetime.today().date()
            if filedate >= datetime.datetime.today().date():
                if len(channels) == 0 or channel in channels or channel == "channels.js.gz":
                    stdout.write("Downloading http://xmltv.tvtab.la/json/%s " % anchor['href'])
                    f = urllib2.urlopen('http://xmltv.tvtab.la/json/%s' % anchor['href'])
                    data = f.read()
                    # BUG FIX: mode was 'w+ ' (trailing space), which raises
                    # ValueError("invalid mode"); the payload is raw bytes,
                    # so open in binary write mode.
                    with open('/tmp/xmltv_convert/json/%s' % anchor['href'].replace('.gz', ''), 'wb') as outfile:
                        outfile.write(data)
                    stdout.write("Done!\n")
                    stdout.flush()
def getImg(requestUrl):
    """Scrape one comment page and download every image whose vote count
    exceeds the global `votelimit`.
    """
    theHtml = rs.get(requestUrl).text
    # BeautifulSoup to get the page html
    soup = BeautifulSoup(theHtml, 'lxml')
    soup.prettify()
    theCommentList = soup.find('ol', class_='commentlist')
    for li in theCommentList.find_all('li'):
        if li.get('id') == 'adsense':
            continue  # skip advertisement entries
        vote = int(li.find('div', class_='vote').find_all('span')[1].string)
        if vote > votelimit:
            try:
                originalLink = li.find('span', class_='righttext').find('a').get('href')
                dirName = getDirFrom(originalLink)
                name = getPreNameFrom(originalLink)
                imageUrl = li.find('a', class_='view_img_link').get('href')
                vote = li.find('div', class_='vote').find_all('span')[1].string
                prefixName = rename(name, vote)
                extraName = getFileExt(imageUrl)
                downloadImg('http:' + imageUrl, dirName, prefixName, extraName, originalLink)
                print(vote)
            except Exception:
                # BUG FIX: was a bare "except:" that also swallowed
                # SystemExit/KeyboardInterrupt; keep best-effort per item
                # but only for real errors. (Also removed an unused local
                # that shadowed the builtin `id`.)
                continue
def bsoup(): r = requests.get("https://answers.yahoo.com/question/index?qid=20080613085817AAqvcNW") soup = BeautifulSoup (open(r.content)) string = soup.findall("div", {"class":"group"}) print soup.finalall(re.compile("^[A-Z]")) print soup.prettify() print string
def hello_world(): ''' BeautifulSoup ''' html_doc = """ <html><head><title>The Dormouse's story</title></head> <body> <p class="title"><b>The Dormouse's story</b></p> <p class="story">Once upon a time there were three little sisters; and their names were <a href="http://example.com/elsie/chen/hong" class="sister" id="link1">Elsie</a>, <a href="http://example.com/lacie/huan/jiang" class="sister" id="link2">Lacie</a> and <a href="http://example.com/tillie/pang/guai" class="sister" id="link3">Tillie</a>; and they lived at the bottom of a well.</p> <p class="story">...</p> """ soup = BeautifulSoup(html_doc, 'html.parser') print soup.prettify() # 下面两种方式一样 print soup.find_all('a') links = soup('a') for link in links: print link.attrs print link.string print link.contents
def get_data_apple(self):
    """
    Grabs official releases from Apple site.
    The code uses BeautifulSoup to grab the first table. The code looks for
    a row with the right Software.
    In case the Apple's format has changed a KeyError is thrown.
    :return:
    """
    markup = urllib2.urlopen(self.data_dic[self.name]["link"]).read()
    soup = BeautifulSoup(markup)
    soup.prettify()
    for row in soup.find('table').find_all('tr'):
        # First cell holds the software name/version text to match against.
        match = re.search(self.data_dic[self.name]["format"], row.contents[0].get_text())
        if match:
            self.version = match.group(1)
            self.date = datetime.datetime.strptime(
                row.contents[4].get_text(),
                self.data_dic[self.name]["date_format"])
            self.generate_table()
            return
    raise KeyError('ERROR: No new releases in the Apple site, please change url or matching pattern')
def walmartSingleItemCrawler(link): soup = BeautifulSoup(urllib2.urlopen(link), 'html.parser') print soup.prettify() # name name = soup(class_ = re.compile("^js-product-heading"))[0].span print "1.Name:" + name.text price = soup(class_ = re.compile("^js-price-display"))[0].text print "2.Price" + price unitPrice = soup(class_ = re.compile("^unit-price-display"))[0].text print "3.unit price" + unitPrice img_src = soup(class_ = "product-image js-product-image js-product-primary-image")[0]['src'] print "4.img_src:" + img_src print "5.About this item ===========" itemInfo = soup(class_ ="product-description-disclaimer-mweb")[0].text print itemInfo liItemInfo = soup(class_= "about-item-preview-text js-about-item-preview-text")[0] # print liItemInfo for li in liItemInfo: print "*" + li.text print "6. Rank ======" itemRank = soup(class_="Grid-col item-ranks")[0] print itemRank starRated = soup(class_ = "Grid mweb-snippet-stars")[0].find_all("i", class_="star star-rated") print "7. star review:" print "star " + str(len(starRated)) + " out of 5"
def download_json_files():
    """Download schedule JSON files no older than one week from
    json.xmltv.se into /tmp/xmltv_convert/json, filtered by `channels`.
    """
    if not os.path.exists("/tmp/xmltv_convert/json"):
        os.makedirs("/tmp/xmltv_convert/json")
    page = urllib2.urlopen("http://json.xmltv.se/")
    soup = BeautifulSoup(page)
    soup.prettify()
    for anchor in soup.findAll("a", href=True):
        if anchor["href"] != "../":
            aweekago = datetime.datetime.now() - datetime.timedelta(days=7)
            try:
                anchor_list = anchor["href"].split("_")
                channel = anchor_list[0]
                filedate = datetime.datetime.strptime(anchor_list[1][0:10], "%Y-%m-%d").date()
            except IndexError:
                # Filenames without a date part (e.g. channels.js.gz)
                # are treated as exactly one week old, i.e. still wanted.
                filedate = aweekago.date()
            if filedate >= aweekago.date():
                if len(channels) == 0 or channel in channels or channel == "channels.js.gz":
                    stdout.write("Downloading http://json.xmltv.se/%s " % anchor["href"])
                    f = urllib2.urlopen("http://json.xmltv.se/%s" % anchor["href"])
                    data = f.read()
                    # BUG FIX: mode was "w+ " (trailing space), which raises
                    # ValueError("invalid mode"); the payload is raw bytes,
                    # so open in binary write mode.
                    with open("/tmp/xmltv_convert/json/%s" % anchor["href"].replace(".gz", ""), "wb") as outfile:
                        outfile.write(data)
                    stdout.write("Done!\n")
                    stdout.flush()
def get_parteredmenyek(content):
    """Extract party-list vote statistics from a Hungarian election page.

    Returns a list of dicts with statistics_code / statistics_name / value,
    starting with a (currently zeroed) non-voters entry.
    """
    soup = BeautifulSoup(content, from_encoding='utf-8')
    soup.prettify(formatter=lambda s: s.replace(u'\xa0', ' '))
    results = []
    # "Jegyzőkönyv" (protocol) section holds the voter-count table.
    voter_data = soup.find(text='Jegyzőkönyv').find_next('table').find_all('td')
    total = voter_data[0].text
    voters = list(voter_data[1])
    #non_voters = int(total) - int(voters[0].replace(' ', ''))
    non_voters = 0
    results.append({
        'statistics_code': 'non-voters',
        'statistics_name': 'Non voters',
        'value': non_voters,
    })
    # Per-party vote counts follow the "votes per party list" heading.
    jelolt_table = soup.find('p', text='A szavazatok száma pártlistánként').find_next('table')
    for row in jelolt_table.find_all('tr')[1:]:
        cells = row.find_all('td')
        results.append({
            'statistics_code': slugify(cells[1].text),
            'statistics_name': cells[1].text,
            'value': cells[2].text,
        })
    return results
def get_data_ios(self):
    """
    Grabs official releases Data from Wikipedia page regarding iOS releases.
    The code uses BeautifulSoup to grab the first table. The code looks for
    the entry which is colored in green (this color is defined in the XML
    file and can be easily changed without opening the code).
    In case the Wikipedia format is changed a KeyError is thrown.
    :return:
    """
    markup = urllib2.urlopen(self.link).read()
    soup = BeautifulSoup(markup)
    soup.prettify()
    for row in soup.find('table').find_all('tr'):
        # The current release row is the one highlighted with self.color.
        highlighted = row.find('td', {'style': 'background:' + self.color + ';'})
        if highlighted:
            cells = row.find_all('td')
            self.version = highlighted.get_text()
            # Release date is published in the microformat span.
            self.date = datetime.datetime.strptime(
                row.find('span', {'class': 'bday dtstart published updated'}).get_text(),
                '%Y-%m-%d')
            self.TableObj[self.soft_name][1] = self
            self.TableStr[self.soft_name][1] = self.return_date()
            return
    raise KeyError('ERROR: The Wikipedia format has probably changed.')
def get_data_selenium(self):
    """
    Grabs official releases from Selenium site.
    The code uses BeautifulSoup to grab the first table and looks for the
    "Python" row, storing its version and release date.
    In case the Selenium's format has changed a KeyError is thrown.
    :return:
    """
    try:
        self.link = self.data_dic[self.name]["link"]
        html = urllib2.urlopen(self.link).read()
        soup = BeautifulSoup(html)
        soup.prettify()
        for row in soup.find('table').find_all('tr'):
            if row.contents[1].get_text() == "Python":
                self.version = row.contents[3].get_text()
                self.date = datetime.datetime.strptime(
                    row.contents[5].get_text(),
                    self.data_dic[self.name]["date_format"])
                self.generate_table(self.link, self.data_dic[self.name]["link2"])
                return
        # No "Python" row found: fall through to the unified error below.
        raise KeyError()
    except Exception:
        # BUG FIX: was a bare "except:" which also swallowed
        # SystemExit/KeyboardInterrupt; only real errors should be
        # converted to the unified KeyError.
        raise KeyError('ERROR: Error reading version or date: ' + self.name)
def _get_out_links(article, doc):
    """Append every outbound link of the article HTML, made absolute
    against article.url, to article.out_links.

    NOTE(review): needs to focus on only relevant links (is it an actual
    article?) -- inherited TODO from the original.
    """
    soup = BS(article.html)
    soup.prettify()
    anchors = soup.findAll('a')
    for anchor in anchors:
        article.out_links.append(urljoin(article.url, anchor.get('href')))
def parse_members_info(pages):
    """Parse member rows out of a list of HTML pages.

    Returns a list of dicts with username, nickname, login_id, points,
    days, rate and whether the member has checked in today.
    """
    members = []
    for page in pages:
        soup = BeautifulSoup(page)
        soup.prettify()
        for row in soup.find_all('tr', {'class': 'member'}):
            cells = row.find_all('td')
            nickname_tag = cells[0].find_all('a', {'class': 'nickname'})[0]
            # A green "label-success" badge marks today's check-in.
            badge_classes = cells[4].find_all('span')[0].get('class')
            members.append({
                'username': cells[0].find_all('img')[0].get('alt'),
                'nickname': nickname_tag.get_text(),
                'login_id': _get_number_out(nickname_tag.get('href')),
                'points': cells[1].get_text(),
                'days': _get_number_out(cells[2].get_text()),
                'rate': cells[3].get_text(),
                'checked_today': 'label-success' in badge_classes,
            })
    return members
def __processFile(self, input_file, output_file, recursing):
    """Process a single file at path input_file and write processed file
    to path output_file. May be called recursively depending on template
    structure.
    """
    f_in = open(input_file, "r", encoding="utf-8")
    soup = BeautifulSoup(f_in)
    # find all comments in the document
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    # parse out the template filepath from each comment, recursively obtain
    # the content, and replace the comment with the content to be inserted
    for comment in comments:
        # TODO: fix regex to include 'file' keyword
        match = re.match('^#include.+?virtual=\"(.+?)\"', comment)
        if match:
            inc_path = match.group(1)
            if inc_path:
                inc_content = BeautifulSoup(self.__processFile(self.input_dir + '/' + inc_path, "", True))
                # BUG FIX: insert_before() detaches each tag from
                # inc_content, so iterating .contents directly mutates the
                # list mid-iteration and skips every other node; iterate a
                # snapshot instead.
                for tag in list(inc_content.contents):
                    comment.insert_before(tag)
                comment.extract()
    f_in.close()
    # if we are recursing, return our result as a string so it can be
    # included recursively
    if recursing:
        return soup.prettify(formatter=None)
    # if not, this must be the original SHTML file, so write our result to
    # output_file as an HTML document
    else:
        f_out = open(output_file, "w", encoding="utf-8")
        f_out.write(soup.prettify(formatter=None))
        f_out.close()
def parse_checkin(checkin_page):
    """Parse up to seven recent check-ins from a profile page.

    Returns a list of [date, counts] pairs where counts holds the word /
    article / sentence / listening totals for that day (missing -> 0).
    """
    soup = BeautifulSoup(checkin_page)
    soup.prettify()
    notes, dates = [], []
    for div in soup.find_all('div', {'class': 'checkin span8'}):
        notes.append(div.find_all('div', {'class': 'note'})[0].get_text().strip())
        dates.append(div.find_all('div', {'class': 'span4'})[0].get_text().strip())
    notes, dates = notes[:7], dates[:7]  # only get 7 items
    # Pull each counter out of the free-text note via lookahead regexes.
    words = [_regex_search(r'\d+(?= 个单词)', note) for note in notes]
    reads = [_regex_search(r'\d+(?= 篇文章)', note) for note in notes]
    sents = [_regex_search(r'\d+(?= 个句子)', note) for note in notes]
    lstns = [_regex_search(r'\d+(?= 句听力)', note) for note in notes]
    dates = [_parse_chinese_date(date) for date in dates]
    checkin_list = []
    for date, word, read, sent, lstn in zip(dates, words, reads, sents, lstns):
        counts = {
            'words': 0 if word == '' else int(word),
            'reads': 0 if read == '' else int(read),
            'sents': 0 if sent == '' else int(sent),
            'lstns': 0 if lstn == '' else int(lstn),
        }
        checkin_list.append([date, counts])
    return checkin_list
def parse_total_checkin(page):
    """Return the total check-in day count shown in the stacked sidebar nav."""
    soup = BeautifulSoup(page)
    soup.prettify()
    sidebar = soup.find_all('ul', {'class': 'nav-stacked'})[0]
    # The second link's text contains the number of check-in days.
    return _get_number_out(sidebar.find_all('a')[1].get_text())
def getPageItem(self, content): if not content: print '页面加载失败' #最大页码 page_pattern = re.compile('<a href=.*?"pageTo" action-data="page=([0-9].*?)">.*?</a>') pages = re.findall(page_pattern, content) # 更新最大页码 pages_int = [int(x) for x in pages] self.maxIdx = max(pages_int) # 两个链接,时间,微博内容 # 不使用正则表达式 # img_pattern = re.compile('<li>.*?<dl class="m_photoItem m_photoItem_a phtItem_hv">.*?<a href="(.*?)">.*?' + # '<img src="(.*?)".*?/>.*?</a>.*?<span node-type="time">(.*?)</span>.*?' + # '<p title="(.*?)" class.*?</p>.*?</dd>.*?</dl></li>',re.S) # items = re.findall(img_pattern, content) html_tree = BeautifulSoup(content) html_tree.prettify() photo_list = html_tree.find_all("dl", class_="m_photoItem m_photoItem_a phtItem_hv") items = [] for photo_item in photo_list: item = {} item["detail_page"] = photo_item.find('dt').find('a')["href"] item["small_img_link"] = photo_item.find('dt').find('a').find('img')["src"] item["time"] = photo_item.find('dd').find("span", attrs={"node-type":"time"}).string try: item["description"] = photo_item.find('dd').find_all("p")[-1].get("title") except: item["description"] = None items.append(item) return items
def extract_data(code):
    """Scrape NASDAQ historical quotes for a ticker symbol.

    Returns {"data": values, "date": dates} taken from the tables in the
    historical-quotes container (each table resets the lists, so the last
    table wins -- behavior preserved from the original).
    """
    markup = urllib2.urlopen('http://www.nasdaq.com/symbol/' + code + '/historical').read()
    page = BeautifulSoup(markup)
    page.prettify()
    container = page.find(id="historicalContainer")
    for table in container.find_all('table'):
        val = []
        date = []
        # Skip the first two rows (headers).
        for tr_tag in table.find_all('tr')[2:]:
            for col, td_tag in enumerate(tr_tag.find_all('td')):
                if col == 4:
                    # Fifth column: closing value.
                    val.append(str(td_tag.string.strip()))
                elif col == 0:
                    # First column: the date.
                    date.append(str(td_tag.string.strip()))
    return {"data": val, "date": date}
def test_complete_download_and_mobilization(self): mobilizer = InstapaperMobilizer() u = urllib.urlopen(mobilizer.url("http://m.onet.pl/wiadomosci/4986708,detal.html")) soup = BeautifulSoup(u.read()) self.assertTrue(mobilizer.is_correctly_mobilized(soup), "Correctly mobilized") soup = mobilizer.post_process_html(soup) print soup.prettify()
def from_html(file_path):
    """Read an HTML file and return (visible_text, title).

    Comments, <script> and <style> elements are removed; the remaining
    text is stripped line by line with blank chunks dropped.
    """
    with open(file_path, "r") as html_file:
        html_obj = BeautifulSoup(html_file.read(), 'lxml')
    html_title = html_obj.title.string.strip()
    # Remove all comment elements
    for comment in html_obj.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()
    # print and reparse html or we get an error for some reason
    html_obj = BeautifulSoup(html_obj.prettify(), 'lxml')
    # remove all script and style elements
    for unwanted in html_obj(["script", "style"]):
        unwanted.extract()
    # print and reparse html or we get an error for some reason
    html_obj = BeautifulSoup(html_obj.prettify(), 'lxml')
    text = html_obj.get_text()
    # strip each line, split multi-headline lines, then drop blank chunks
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
    text = '\n'.join(chunk for chunk in chunks if chunk)
    return text, html_obj.title.string
def get_content(self): print 'fetching from {}'.format(self.url) req = urllib2.Request(self.url,headers=headers) page = urllib2.urlopen(req).read() soup = BeautifulSoup(page) soup.prettify() print soup.html
def scrapeTickers(self): # create list of tickers url2scrape = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies" page = urllib2.urlopen(url2scrape).read() soup = BeautifulSoup(page) soup.prettify() table = soup.find("table", { "class" : "wikitable sortable" }) for row in table.findAll("tr")[1:]: self.__tickerList.append(row.a.string) for ticker in self.__tickerList: tableCreate = 'CREATE TABLE IF NOT EXISTS tickers (' \ 'ID int(11) PRIMARY KEY NOT NULL AUTO_INCREMENT, ' \ 'TICKER VARCHAR(5));' addTicker = "INSERT INTO tickers (ticker) VALUE ('{}');".format(ticker) try: #self.__cursor.execute(tableCreate) self.__cursor.execute(addTicker) self.__cnxn.commit() except Exception as e: self.tlog.warning(e) print e pass else: self.__cnxn.close
def getval():
    """Extract per-student marks from saved result pages into a CSV.

    Reads results/<usn>.html for every USN in the global `usnl` list and
    appends one comma-separated row per student to output<arg1><arg2>.csv.
    """
    import codecs
    fl = codecs.open('output' + sys.argv[1] + sys.argv[2] + '.csv', 'ab', encoding="Utf-8")
    try:
        for usn in usnl:
            with open("results/" + usn + ".html", 'rb') as page_html:
                soup = BeautifulSoup(page_html)
            soup.prettify()
            record = [cell.text for cell in soup.findAll('td', {"align": "center"})]
            del record[0:4]  # drop the leading header cells
            # BUG FIX: the original called record.remove(...) while iterating
            # `record`, which skips elements (and raised ValueError when a
            # cell merely contained the letter); filter the grade-letter
            # cells out in one pass instead.
            record = [cell for cell in record if cell not in ("P", "F", "A")]
            if len(record) > 24:
                del record[24:]  # cap at 24 mark columns
            if "Total" in record:
                del record[:]  # summary rows are not student rows
            if record:
                fl.write("\n" + usn + ",")
                for cell in record:
                    fl.write(cell)
                    fl.write(",")
    finally:
        # BUG FIX: the file is now closed even if a page fails to parse.
        fl.close()
def openURL(url): print ("yooooo") r = br.open(url) # open our browser object to the comic page page = urllib2.urlopen(url).read() soup = BeautifulSoup(page) soup.prettify() html = r.read() # manga is licensed if "has been licensed, it is not available" in html: print ("Sorry, the series has been licensed") reprompt() elif "searchform_name" in html: # deal with search br.select_form(nr=1) br.submit() # print(br.response().read()) else: # does not work for half chapters at the moment chapter_num = raw_input("Chapter number: ") zero_pad_num = chapter_num.zfill(3) for chapter in soup.find_all("a", {"class": "tips"}): chapterURL = chapter.get("href") print chapterURL query = "c" + zero_pad_num if query in chapterURL: print ("found query at " + chapterURL) getFiles(chapterURL, chapter_num) # rememeber to add name of title break
def find_champ():
    """Scrape champion.gg's index and append a champions() entry to the
    global `champs` list for every champion image div found.
    """
    url = "http://champion.gg"
    markup = opener.open(url).read()
    soup = BeautifulSoup(markup)
    soup.prettify()
    for div in soup.findAll('div', {"class": "champ-index-img"}):
        champs.append(champions(div))
def extract_text(filename):
    """Extract the text of common content tags from an HTML file and write
    it to example.properties as one "tag = text ..." line per tag type.
    """
    # data extraction from the html file
    with open(filename) as source:
        data = source.read()
    # represent the html document as a nested data structure
    soup = BeautifulSoup(data, "html.parser")
    soup.prettify()
    # extract the data between <p>, <div>, <b>, <i>, <td>, <h1>...<h5>,
    # and <span> tags
    tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'p', 'div', 'b', 'i', 'td', 'span']
    # BUG FIX: the original used str.strip('<'+tag+'>') / strip('</'),
    # which strips a *character set* from both ends and therefore corrupted
    # content beginning or ending with those letters; take the element text
    # directly instead.
    vari = [[elem.get_text() for elem in soup.find_all(tag)] for tag in tags]
    # write the data into 'example.properties' (created or truncated)
    with open('example.properties', 'w+') as f:
        for tag, values in zip(tags, vari):
            f.write("" + tag + " = ")
            for value in values:
                f.write("%s " % value)
            f.write("\n")
def main():
    """Entry point for this script.

    Prettifies the HTML file named on the command line, writing it back
    in place with --write-changes or printing it otherwise.
    Returns 0 on success, 1 on any I/O error.
    """
    args = parse_command_line_arguments()
    try:
        # BUG FIX: the original leaked the file handle from
        # open(args.file); close it deterministically.
        with open(args.file) as source:
            soup = BeautifulSoup(source)
    except IOError as exception:
        print("ERROR: File '%s' could not be parsed: %s" % (args.file, exception))
        return 1
    prettified = soup.prettify(encoding="utf8")
    if args.write_changes:
        try:
            with open(args.file, 'wb') as output:
                output.write(prettified)
        except IOError as exception:
            print("ERROR: File '%s' could not be written: %s" % (args.file, exception))
            return 1
    else:
        print(prettified)
    return 0
import requests
from bs4 import BeautifulSoup

# Fetch a simple demo page and show both the raw response object and body,
# then the pretty-printed parse tree.
page = requests.get(
    "https://dataquestio.github.io/web-scraping-pages/simple.html")
print(page)
print("--------------")
print(page.content)
soup1 = BeautifulSoup(page.content, 'html.parser')
print(soup1.prettify())
def make_new_html(self):
    """Rebuild a minimal standalone HTML page from the scraped thread source.

    Wraps the post body in a fixed frame (title linked back to self.url)
    and rewrites <img> tags to point at locally saved image filenames.
    Returns the new HTML string, or None for an unknown self.html_type.
    """
    # Fixed page skeleton; title/url/body are spliced in by string slicing
    # on the '</title>', '" style', '</h1>' and '</div>' markers below.
    html_frame = dedent('''\
        <html>
        <head>
        <title>
        </title>
        </head>
        <body bgcolor="#00FFFF">
        <div style="margin:40px ; text-align:center;">
        <a href="" style="color:red;text-decoration:none;">
        <h1>
        </h1>
        </a>
        </div>
        </body>
        </html>
        ''')
    if self.html_type == 1:
        soup = BeautifulSoup(self.html_source_code, 'lxml')
        # Post body lives in the first <td class="t_f">; drop <i> decorations.
        content = soup.find('td', class_='t_f')
        [s.extract() for s in content('i')]
        img_names = []
        wrong_tag = 0
        # Images saved locally carry a 'file' attribute; others (e.g. the ad
        # smiley described in the note below) are counted as wrong tags.
        for img_name in content.find_all('img'):
            if img_name.has_attr('file'):
                img_names.append(img_name['file'].split('/')[-1])
            else:
                wrong_tag += 1
        '''
        发现了一个毒瘤(790614):
        根据<img>提取出来的图片,还有这种情况,是广告图片,要加个判断
        <img src="static/image/smiley/default/handshake.gif" smilieid="23" border="0" alt="" />
        '''
        pre_img_formats = re.findall(r'<img.+?/>', str(content))
        # Strip the surrounding <td> wrapper, keeping its inner markup.
        self.new_html_source_code = str(content)
        self.new_html_source_code = re.sub(r'<td.+?>', '', self.new_html_source_code)
        self.new_html_source_code = re.sub(r'</td>', '', self.new_html_source_code)
        # Rewrite each original <img> tag to reference the local filename.
        for i in range(len(pre_img_formats) - wrong_tag):
            after_img_formot = r'<img src="' + img_names[i] + r'"/>'
            self.new_html_source_code = self.new_html_source_code.replace(
                pre_img_formats[i], after_img_formot)
        # Splice title, url, title again, and the body into the frame.
        self.new_html_source_code = html_frame[:html_frame.index('</title>')] + self.html_title + \
            html_frame[html_frame.index('</title>'):html_frame.index('" style')] + self.url + \
            html_frame[html_frame.index('" style'):html_frame.index('</h1>')] + self.html_title + \
            html_frame[html_frame.index('</h1>'):html_frame.index('</div>')] + self.new_html_source_code + \
            html_frame[html_frame.index('</div>'):]
        new_html_soup = BeautifulSoup(self.new_html_source_code, 'lxml')
        self.new_html_source_code = str(new_html_soup.prettify())
        return self.new_html_source_code
    elif self.html_type == 2:
        '''
        干扰项太多了,直接把<div class="pattl">和第一个<td class="t_f"...>
        标签的内容“粘贴”到新的html框架中,再将图片标签修改<img src="...(本地路径)...">
        这样网页源代码会很复杂、不简洁,但干扰项的一个个删除太繁琐了
        '''
        soup = BeautifulSoup(self.html_source_code, 'lxml')
        # NOTE(review): this first assignment is a dead store -- it is
        # immediately overwritten by the fuller splice below.
        self.new_html_source_code = html_frame[:html_frame.index(
            '</title>')] + self.html_title + html_frame[html_frame.
            index('</title>'):]
        self.new_html_source_code = html_frame[:html_frame.index('</title>')] + self.html_title + \
            html_frame[html_frame.index('</title>'):html_frame.index('" style')] + self.url + \
            html_frame[html_frame.index('" style'):html_frame.index('</h1>')] + self.html_title + \
            html_frame[html_frame.index('</h1>'):]
        # Paste the first post body before the closing </div>.
        first_part_soup = soup.find('td', class_='t_f')
        [s.extract() for s in first_part_soup('i')]
        self.new_html_source_code = self.new_html_source_code[:self.new_html_source_code.index(
            '</div>')] + str(first_part_soup) + self.new_html_source_code[
            self.new_html_source_code.index('</div>'):]
        # Paste the attachment block after it, same insertion point.
        other_part_soup = soup.find('div', class_='pattl')
        self.new_html_source_code = self.new_html_source_code[:self.new_html_source_code.index(
            '</div>')] + str(other_part_soup) + self.new_html_source_code[
            self.new_html_source_code.index('</div>'):]
        # Saved photos carry a 'zoomfile' attribute with the local name.
        img_list_soup = soup.select('div.mbn.savephotop img')
        img_names = []
        for img_name in img_list_soup:
            if img_name.has_attr('zoomfile'):
                img_names.append(img_name['zoomfile'].split('/')[-1])
        pre_img_formats = [str(img_name) for img_name in img_list_soup]
        for i in range(len(pre_img_formats)):
            after_img_formot = r'<img src="' + img_names[i] + r'"/>'
            self.new_html_source_code = self.new_html_source_code.replace(
                pre_img_formats[i], after_img_formot)
        new_html_soup = BeautifulSoup(self.new_html_source_code, 'lxml')
        self.new_html_source_code = str(new_html_soup.prettify())
        return self.new_html_source_code
    else:
        print('网页类型并非1或2,,对于本网页的任务中止\n')
        return None
file.close() return data # Read the file html_file = read_file() # For parsing html we can use lxml or html.parser soup = BeautifulSoup(html_file,'html.parser') # soup = BeautifulSoup(html_file,'lxml') # soup prettify - prints html file with correct indentation print(soup.prettify()) """ Let see google.com html structure """ ua = UserAgent() header = {'user-agent':ua.chrome} # Get the response google_page = requests.get('https://www.google.com',headers=header)
import requests
from bs4 import BeautifulSoup

# Log in to douguo.com inside one session, then fetch and dump an
# activity listing page.
with requests.session() as c:
    url = 'https://passport.douguo.com/login/?next=/'
    username = '******'
    password = ''
    c.get(url)
    login_data = dict(username=username, password=password, next='/')
    c.post(url, data=login_data)
    page = c.get('http://m.douguo.com/activity/fotilebraize/index/lists/507')
    print(page.content)
    soup = BeautifulSoup(page.text, 'html.parser')
    # BUG FIX: was print(soup.prettify()).encode('gb18030') -- print()
    # returns None, so .encode raised AttributeError. Encode the markup,
    # not print's return value.
    print(soup.prettify().encode('gb18030'))
def create_html():
    """Generate the FIXM 4.2.0 -> AIRM 1.0.0 developer mapping index page.

    Creates the per-class output directory, loads the FIXM mapping records,
    renders one table row per record into the concept-list template, and
    writes the index HTML file.
    """
    # Directory that the per-class mapping pages will live in.
    mapping_pages_directory = "docs/developers/fixm-4.2.0-to-airm-1.0.0"
    path = mapping_pages_directory
    try:
        os.mkdir(path)
    except OSError:
        print("Creation of the directory %s failed" % path)
    else:
        print("Successfully created the directory %s " % path)
    import fixm
    fixm = fixm.Fixm()
    fixm_mapping_dict = fixm.fixm_mapping_dataframe.to_dict('records')
    # Index page is built from the concept-list template.
    html = open("data/html/templates/concept-list-template.html").read()
    soup = BeautifulSoup(html, "lxml")
    soup.title.string = "FIXM 4.2.0 to AIRM 1.0.0 | AIRM.aero"
    # One table row per mapping record.
    for record in fixm_mapping_dict:
        tr = soup.new_tag("tr")
        td_ic_name = soup.new_tag("td")
        td_ic_name.string = str(record["Information Concept"])
        tr.insert(1, td_ic_name)
        if record["Data Concept"] != "":
            # Link the data concept to its anchor on the per-class page.
            td_dc_name = soup.new_tag("td")
            url = "fixm-4.2.0-to-airm-1.0.0/" + record[
                "Information Concept"] + ".html" + "#" + record["Data Concept"]
            text = record["Data Concept"]
            print(text)
            new_link = soup.new_tag("a")
            new_link['href'] = url
            new_link['target'] = "_blank"
            new_link.string = text
            td_dc_name.insert(1, new_link)
            tr.insert(2, td_dc_name)
        if record["Definition"] != "":
            td_def = soup.new_tag("td")
            td_def.string = str(record["Definition"])
            tr.insert(3, td_def)
        if record["Type"] != "":
            td_dc_type = soup.new_tag("td")
            # Strip any namespace prefix ("ns:Type" -> "Type").
            parts = str(record["Type"]).split(":")
            clean_type = parts[-1]
            # BUG FIX: removed a stale print(text) here -- it echoed the
            # Data Concept from the previous branch and raised NameError
            # whenever "Data Concept" was empty for the first record.
            td_dc_type.string = clean_type
            tr.insert(4, td_dc_type)
        soup.find('tbody').insert(1, tr)
    f = open("docs/developers/fixm-4.2.0-to-airm-1.0.0.html", "w+")
    f.write(soup.prettify())
    f.close()
def create_html_pages():
    """Generate one HTML detail page per FIXM information concept.

    For each concept returned by the project's ``fixm`` module (skipping the
    "missing data" placeholder), fills the concept-template with the class
    name/identifier/definition, a summary table of its data-concept traces,
    and a detail <div> per trace (semantic correspondences, additional
    traces, rationale, notes), then writes the page under
    ``docs/developers/fixm-4.2.0-to-airm-1.0.0/``.

    NOTE(review): relies on module-level helpers ``create_url``/``create_name``
    and on ``BeautifulSoup`` being imported at file level — not visible here.
    """
    import fixm
    import airm
    fixm = fixm.Fixm()
    airm = airm.Airm()
    fixm_info_concepts_dict = fixm.get_information_concepts()
    for info_concept in fixm_info_concepts_dict:
        if info_concept['Information Concept'] != "missing data":
            print(info_concept['Information Concept'])
            # Creates soup for the concept page using concept-template.html.
            html = open("data/html/templates/concept-template.html").read()
            soup = BeautifulSoup(html, "lxml")
            soup.title.string = str(
                info_concept['Information Concept']
            ) + " - FIXM 4.2.0 to AIRM 1.0.0 | AIRM.aero"
            # Replace the breadcrumb placeholder with the class name.
            soup.find(text="FIXM_CLASS_NAME_BC").replace_with(
                str(info_concept['Information Concept']))
            h2 = soup.new_tag("h2")
            h2.string = str(info_concept['Information Concept'])
            soup.find(id="INFO_CONCEPT_NAME").insert(0, h2)
            # Show the concept identifier (first two ':'-separated parts).
            code = soup.new_tag("code")
            datac_identifier = info_concept['Identifier']
            parts = datac_identifier.split(":")
            identifier = parts[0] + ":" + parts[1]
            code.string = identifier
            code["class"] = "text-secondary"
            soup.find(id="INFO_CONCEPT_NAME").insert(1, code)
            definition = fixm.get_fixm_class_definition(
                info_concept['Information Concept'])
            soup.find(text="FIXM_CLASS_DEFINITION").replace_with(
                str(definition))
            traces = fixm.get_traces_by_info_concept(
                info_concept['Information Concept'])
            # First pass: summary table (one row per trace).
            for trace in traces:
                print('\t' + trace['Data Concept'])
                tr = soup.new_tag("tr")
                if trace["Data Concept"] != "":
                    # Name cell links to the in-page anchor of the detail div.
                    td_dc_name = soup.new_tag("td")
                    url = "#" + trace["Data Concept"]
                    text = trace["Data Concept"]
                    new_link = soup.new_tag("a")
                    new_link['href'] = url
                    new_link.string = text
                    td_dc_name.insert(1, new_link)
                    tr.insert(1, td_dc_name)
                if trace["Definition"] != "":
                    td_def = soup.new_tag("td")
                    td_def.string = str(trace["Definition"])
                    tr.insert(2, td_def)
                if trace["Type"] != "":
                    td_type = soup.new_tag("td")
                    if trace["Type"] != "enum value":
                        # Non-enum types link to that type's own page.
                        parts = str(trace["Type"]).split(":")
                        clean_type = parts[-1]
                        url = clean_type + ".html"
                        text = clean_type
                        print(text)
                        new_link = soup.new_tag("a")
                        new_link['href'] = url
                        new_link['target'] = "_blank"
                        new_link.string = text
                        td_type.insert(1, new_link)
                    else:
                        td_type.string = str(trace["Type"])
                    tr.insert(3, td_type)
                soup.find(id="DATA_CONCEPTS_LIST").insert(1, tr)
            # Second pass: one boxed detail <div> per trace.
            for trace in traces:
                property_div = soup.new_tag("div")
                property_div[
                    "style"] = "border: 0.5px solid #b2b2b2;border-radius: 4px;box-shadow: 2px 2px #b2b2b2;padding: 15px;padding-bottom: 0px; margin-bottom: 30px"
                # Heading doubles as the anchor target for the summary table.
                h3 = soup.new_tag("h3")
                h3.string = str(trace["Data Concept"])
                h3["id"] = str(trace["Data Concept"])
                h3["style"] = "padding-top: 120px; margin-top: -120px;"
                property_div.insert(0, h3)
                code = soup.new_tag("code")
                identifier = trace['Identifier']
                code.string = identifier
                code["class"] = "text-secondary"
                property_div.insert(1, code)
                p = soup.new_tag("p")
                p.string = str(trace["Definition"])
                br = soup.new_tag("br")
                p.insert(2, br)
                property_div.insert(2, p)
                # "Type:" line — same link-vs-plain-text logic as the table.
                p = soup.new_tag("p")
                p.string = "Type: "
                span = soup.new_tag("span")
                if trace["Type"] != "enum value":
                    parts = str(trace["Type"]).split(":")
                    clean_type = parts[-1]
                    url = clean_type + ".html"
                    text = clean_type
                    print(text)
                    new_link = soup.new_tag("a")
                    new_link['href'] = url
                    new_link['target'] = "_blank"
                    new_link.string = text
                    span.insert(1, new_link)
                else:
                    span.string = str(trace["Type"])
                p.insert(2, span)
                property_div.insert(3, p)
                # Semantic Correspondence table.
                sc_h5 = soup.new_tag("h5")
                sc_h5.string = "Semantic Correspondence"
                sc_h5['style'] = "margin-top: 40px;"
                property_div.insert(4, sc_h5)
                sc_div = soup.new_tag("div")
                sc_div["class"] = "table-responsive"
                sc_table = soup.new_tag("table")
                sc_table["class"] = "table"
                sc_thead = soup.new_tag("thead")
                tr = soup.new_tag("tr")
                th = soup.new_tag("th")
                th.string = "AIRM Concept"
                tr.insert(1, th)
                th = soup.new_tag("th")
                th.string = "Definition"
                tr.insert(2, th)
                sc_thead.insert(1, tr)
                sc_table.insert(1, sc_thead)
                tbody = soup.new_tag("tbody")
                # One row per newline-separated correspondence URN.
                print('\t\tSemantic Corresponce:')
                sem_correspondences = str(
                    trace['Semantic Correspondence']).split('\n')
                for line in sem_correspondences:
                    print('\t\t\t' + line)
                    tr = soup.new_tag("tr")
                    td = soup.new_tag("td")
                    url = create_url(line)
                    text = create_name(line)
                    a = soup.new_tag("a")
                    a['href'] = url
                    a['target'] = "_blank"
                    a.string = text
                    a["data-toggle"] = "tooltip"
                    a["data-placement"] = "right"
                    a["title"] = line
                    td.insert(1, a)
                    tr.insert(1, td)
                    td = soup.new_tag("td")
                    airm_entry = airm.load_and_find_urn(line)
                    td.string = airm_entry["definition"]
                    tr.insert(2, td)
                    tbody.insert(1, tr)
                sc_table.insert(2, tbody)
                sc_div.insert(1, sc_table)
                property_div.insert(5, sc_div)
                # Optional "Additional Traces" table (same shape as above).
                add_correspondences = str(
                    trace['Additional Traces']).split('\n')
                if len(add_correspondences) > 0:
                    if add_correspondences[0] != "missing data":
                        h5 = soup.new_tag("h5")
                        h5.string = "Additional Traces"
                        property_div.insert(6, h5)
                        add_div = soup.new_tag("div")
                        add_div["class"] = "table-responsive"
                        add_table = soup.new_tag("table")
                        add_table["class"] = "table"
                        add_thead = soup.new_tag("thead")
                        tr = soup.new_tag("tr")
                        th = soup.new_tag("th")
                        th.string = "AIRM Concept"
                        tr.insert(1, th)
                        th = soup.new_tag("th")
                        th.string = "Definition"
                        tr.insert(2, th)
                        add_thead.insert(1, tr)
                        add_table.insert(1, add_thead)
                        tbody = soup.new_tag("tbody")
                        print('\t\tAdditional Traces:')
                        for line in add_correspondences:
                            print('\t\t\t' + line)
                            tr = soup.new_tag("tr")
                            td = soup.new_tag("td")
                            url = create_url(line)
                            text = create_name(line)
                            a = soup.new_tag("a")
                            a['href'] = url
                            a['target'] = "_blank"
                            a.string = text
                            a["data-toggle"] = "tooltip"
                            a["data-placement"] = "right"
                            a["title"] = line
                            td.insert(1, a)
                            tr.insert(1, td)
                            td = soup.new_tag("td")
                            airm_entry = airm.load_and_find_urn(line)
                            td.string = airm_entry["definition"]
                            tr.insert(2, td)
                            tbody.insert(1, tr)
                        add_table.insert(2, tbody)
                        add_div.insert(1, add_table)
                        property_div.insert(7, add_div)
                # Optional Rationale / Notes paragraphs.
                if str(trace["Rationale"]) != "missing data":
                    h5 = soup.new_tag("h5")
                    h5.string = "Rationale"
                    property_div.insert(8, h5)
                    p = soup.new_tag("p")
                    p.string = str(trace["Rationale"])
                    print('Rationale:' + str(trace["Rationale"]))
                    property_div.insert(9, p)
                if str(trace["Notes"]) != "missing data":
                    notes_h5 = soup.new_tag("h5")
                    notes_h5.string = "Notes"
                    property_div.insert(10, notes_h5)
                    p = soup.new_tag("p")
                    p.string = str(trace["Notes"])
                    print('NOTES:' + str(trace["Notes"]))
                    property_div.insert(11, p)
                # "Back to top" arrow, right-aligned at the bottom of the box.
                top_link_p = soup.new_tag("p")
                new_link = soup.new_tag("a")
                new_link['href'] = "#top"
                new_icon = soup.new_tag("i")
                new_icon['class'] = "fa fa-arrow-circle-up"
                new_icon["data-toggle"] = "tooltip"
                new_icon["data-placement"] = "left"
                new_icon["title"] = "Top of page"
                new_link.insert(1, new_icon)
                top_link_p.insert(1, new_link)
                top_link_p['class'] = "text-right"
                property_div.insert(12, top_link_p)
                soup.find(id="DATA_CONCEPTS_DETAIL").insert(1, property_div)
            # Write the finished page for this concept.
            f = open(
                "docs/developers/fixm-4.2.0-to-airm-1.0.0/" +
                str(info_concept['Information Concept']) + ".html", "w+")
            f.write(soup.prettify())
            f.close()
from selenium import webdriver
from bs4 import BeautifulSoup
import sys
import time

# Render the AWS EC2 on-demand pricing page in a real browser and dump it,
# then print the text of every table <caption>.
url = 'https://aws.amazon.com/tw/ec2/pricing/on-demand/'
driver = webdriver.Chrome('chromedriver.exe')
driver.get(url)
pageSource = driver.page_source
# print(pageSource.encode(sys.stdin.encoding, "replace").decode(sys.stdin.encoding))
soup = BeautifulSoup(pageSource, 'lxml')
# NOTE(review): prettify(encoding='utf-8') returns bytes, so this prints the
# bytes repr (b'...'); presumably chosen to dodge a console UnicodeEncodeError
# on Windows (cf. the commented-out line above) — confirm before changing.
print(soup.prettify(encoding='utf-8'))
for caption in soup.find_all('caption'):
    print('caption', caption.get_text())
driver.quit()
stop_words[:10] #remove all stop_words from list words_ns = [] for word in words: if word not in stop_words: words_ns.append(word) page.status_code #anything satrting w/ a 2 is good page.content #loads HTML content from site soup = BeautifulSoup(page.content, 'html.parser') print(soup.prettify()) #formats everything in a semi readable format list(soup.children) [type(item) for item in list(soup.children)] #gets all text #h2 = heades, p = body text/equations, ol = bullets headers = soup.find_all('h2') bullets = soup.find_all('ol') text = soup.find_all('p') list(text.children) white_paper = pd.DataFrame(text) header_frame = pd.DataFrame(headers) text.dispersion_plot('bitcoin')
def test_soup():
    # Parse a local HTML file with the lxml parser and dump its pretty-printed
    # form. NOTE(review): Python 2 print statement; path is machine-specific.
    with open('D:\\work\\MyPYProject\\HTML\\1.html') as f:
        soup = BeautifulSoup(f, 'lxml')
        print soup.prettify()
# Formats raw html using soup.prettify() for better analysis.
# Reads each gradesheet, pretty-prints it, and writes it back in place.
from bs4 import BeautifulSoup

for i in range(1007):
    # Sheet ids run 190001..191000 for i < 1000, then 191075..191081 —
    # computed once here instead of the original's four duplicated branches.
    sheet_id = 190001 + i if i < 1000 else 190075 + i
    path = './Gradesheets/' + str(sheet_id) + '.html'
    try:
        # BUG FIX: the original open(...).read() / open(..., 'w') pairs were
        # never closed (leaked file handles); context managers close them.
        with open(path) as src:
            raw = src.read()
        soup = BeautifulSoup(raw, 'html.parser')
        with open(path, 'w') as dst:
            dst.write(soup.prettify())
        print(str(sheet_id) + ' successful!')
    except Exception:
        # Narrowed from a bare `except:` (which also swallowed KeyboardInterrupt);
        # still best-effort so one bad sheet doesn't stop the batch.
        print(str(sheet_id) + ' failed!')
from bs4 import BeautifulSoup

# This text has already been decoded as UTF-8 — hence the encoding
# experiments below.
html = "<html><body><div>Olá Mundo</div></body></html>"
soup = BeautifulSoup(html, "html5lib")

# Show the encoding BeautifulSoup detected for the input.
print(soup.original_encoding)

# prettify() with no argument returns str.
print(soup.prettify())
print("\n")

# prettify() with an explicit encoding returns bytes.
print(soup.prettify("utf-8"))
print("\n")

# encode() on a single tag, with an explicit encoding.
print(soup.div.encode("utf-8"))
print("\n")
#soup = BeautifulSoup(data) #soup.prettify() #html=soup.get_text() urls = open('urls_extracted.txt','r') # db = sqlite3.connect("webScraping.db") # urls = ['https://www.ticketmaster.co.uk/member?tm_link=tm_homeA_header_name','http://www.ticketmaster.co.uk/'] y = '1' page_name='page' + y + '.txt' for Iurl in urls: try: r=requests.get(Iurl) data=r.content soup= BeautifulSoup(data) soup.prettify() #html=soup.get_text() words_list= [] for link in soup.find_all('p'): content = link.text words = content.lower().split() cleaned_words= re.sub("[^A-Za-z]+"," ",str(words)) words_list.append(cleaned_words) words_list.append(soup.find('title').text) # file = open ( "docs2\\\\" + page_name , 'w') file= open(join('docs',page_name),'w') file.write(str(words_list))
def authenticate(self):
    """Log in to 32P, scrape auth/user/pass keys, and collect notification feeds.

    Returns "disable" on a rejected login, a search result when
    self.searchterm is set, None on connection failure, or the collected
    feedinfo list (also stored in mylar.FEEDINFO_32P) on success.
    """
    feedinfo = []
    try:
        with requests.session() as s:
            # Honour the configured SSL-verification setting.
            if mylar.VERIFY_32P == 1 or mylar.VERIFY_32P == True:
                verify = True
            else:
                verify = False
            logger.fdebug('[32P] Verify SSL set to : ' + str(verify))
            if not verify:
                # 32P throws back an insecure warning because it can't validate
                # against the CA. The below suppresses the message just for 32P
                # instead of being displayed globally.
                from lib.requests.packages.urllib3.exceptions import InsecureRequestWarning
                requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

            # fetch the login page
            s.headers = self.headers
            try:
                s.get(self.url, verify=verify, timeout=30)
            except (requests.exceptions.SSLError, requests.exceptions.Timeout) as e:
                logger.error(self.module + ' Unable to establish connection to 32P: ' + str(e))
                return

            # post to the login form
            r = s.post(self.url, data=self.payload, verify=verify)
            # need a way to find response code (200=OK), but returns 200 for
            # everything even failed signons (returns a blank page)
            #logger.info('[32P] response: ' + str(r.content))
            soup = BeautifulSoup(r.content)
            soup.prettify()
            # check for invalid username/password and if it's invalid - disable
            # provider so we don't autoban (manual intervention is required after).
            chk_login = soup.find_all("form", {"id":"loginform"})
            for ck in chk_login:
                errorlog = ck.find("span", {"id":"formerror"})
                loginerror = " ".join(list(errorlog.stripped_strings)) #login_error.findNext(text=True)
                errornot = ck.find("span", {"class":"notice"})
                noticeerror = " ".join(list(errornot.stripped_strings)) #notice_error.findNext(text=True)
                logger.error(self.module + ' Error: ' + loginerror)
                if noticeerror:
                    logger.error(self.module + ' Warning: ' + noticeerror)
                logger.error(self.module + ' Disabling 32P provider until username/password can be corrected / verified.')
                return "disable"

            if not self.searchterm:
                logger.info('[32P] Successfully authenticated. Verifying authentication & passkeys for usage.')
            else:
                logger.info('[32P] Successfully authenticated. Initiating search for : ' + self.searchterm)
                return self.search32p(s)

            # Scan inline <script> blocks for the authkey/userid tokens: a
            # token word flags the state, the following non-'=' word is the value.
            all_script = soup.find_all("script", {"src": False})
            all_script2 = soup.find_all("link", {"rel": "alternate"})
            for ind_s in all_script:
                all_value = str(ind_s)
                all_items = all_value.split()
                auth_found = False
                user_found = False
                for al in all_items:
                    if al == 'authkey':
                        auth_found = True
                    elif auth_found == True and al != '=':
                        authkey = re.sub('["/;]', '', al).strip()
                        auth_found = False
                        logger.fdebug(self.module + ' Authkey found: ' + str(authkey))
                    if al == 'userid':
                        user_found = True
                    elif user_found == True and al != '=':
                        userid = re.sub('["/;]', '', al).strip()
                        user_found = False
                        logger.fdebug(self.module + ' Userid found: ' + str(userid))

            # Walk the alternate-feed <link>s: pull auth/passkey from the first
            # notify URL, then collect every named notification feed.
            authfound = False
            logger.info(self.module + ' Atttempting to integrate with all of your 32P Notification feeds.')
            for al in all_script2:
                alurl = al['href']
                if 'auth=' in alurl and 'torrents_notify' in alurl and not authfound:
                    f1 = alurl.find('auth=')
                    f2 = alurl.find('&', f1 + 1)
                    auth = alurl[f1 +5:f2]
                    logger.fdebug(self.module + ' Auth:' + str(auth))
                    authfound = True
                    p1 = alurl.find('passkey=')
                    p2 = alurl.find('&', p1 + 1)
                    passkey = alurl[p1 +8:p2]
                    logger.fdebug(self.module + ' Passkey:' + str(passkey))
                    if self.reauthenticate:
                        # Keys are all that's needed on a re-auth; skip feeds.
                        break
                if 'torrents_notify' in alurl and ('torrents_notify_' + str(passkey)) not in alurl:
                    notifyname_st = alurl.find('name=')
                    notifyname_en = alurl.find('&', notifyname_st +1)
                    if notifyname_en == -1:
                        notifyname_en = len(alurl)
                    notifyname = alurl[notifyname_st +5:notifyname_en]
                    notifynumber_st = alurl.find('torrents_notify_')
                    notifynumber_en = alurl.find('_', notifynumber_st +17)
                    notifynumber = alurl[notifynumber_st:notifynumber_en]
                    logger.fdebug(self.module + ' [NOTIFICATION: ' + str(notifyname) + '] Notification ID: ' + str(notifynumber))
                    #generate the rss-url here
                    feedinfo.append({'feed': notifynumber + '_' + str(passkey),
                                     'feedname': notifyname,
                                     'user': userid,
                                     'auth': auth,
                                     'passkey': passkey,
                                     'authkey': authkey})
    except (requests.exceptions.Timeout, EnvironmentError):
        logger.warn('Unable to retrieve information from 32Pages - either it is not responding/is down or something else is happening that is stopping me.')
        return

    # set the keys here that will be used to download.
    # NameError here means the scrape above never found the tokens.
    try:
        mylar.PASSKEY_32P = passkey
        mylar.AUTHKEY_32P = authkey  # probably not needed here.
        mylar.KEYS_32P = {}
        mylar.KEYS_32P = {"user": userid, "auth": auth, "passkey": passkey, "authkey": authkey}
    except NameError:
        logger.warn('Unable to retrieve information from 32Pages - either it is not responding/is down or something else is happening that is stopping me.')
        return

    if self.reauthenticate:
        return
    else:
        mylar.FEEDINFO_32P = feedinfo
        return feedinfo
def search(self, search_string, **kwargs):
    """
    Searches ehentai for the provided string or list of hashes,
    returns a dict with search_string:[list of title & url tuples] of
    hits found or empty dict if no hits are found.

    Keyword args: 'cookies' (session cookies dict), 'color' (if present,
    search by uploading the file at each given path instead of by hash).
    Returns the string 'error' when the HTTP response is rejected by
    self.handle_error.
    """
    assert isinstance(search_string, (str, list))
    if isinstance(search_string, str):
        search_string = [search_string]

    cookies = kwargs.pop('cookies', None)

    def no_hits_found_check(soup):
        "return true if hits are found"
        f_div = soup.body.find_all('div')
        for d in f_div:
            if 'No hits found' in d.text:
                return False
        return True

    found_galleries = {}
    log_i('Initiating hash search on ehentai')
    for h in search_string:
        log_d('Hash search: {}'.format(h))
        self.begin_lock()
        if 'color' in kwargs:
            # File-upload search: submit the file through the site's form
            # via the embedded browser.
            file_search = self.e_url_o + '?filesearch=1'
            if cookies:
                self.check_cookie(cookies)
                self._browser.session.cookies.update(self.COOKIES)
            self._browser.open(file_search)
            file_form = self._browser.get_forms()[1]
            f_obj = open(h, 'rb')
            file_form['sfile'].value = f_obj
            self._browser.submit_form(file_form)
            f_obj.close()
            soup = self._browser.parsed
        else:
            # Plain hash search over HTTP.
            hash_url = self.e_url_o + '?f_shash='
            hash_search = hash_url + h + '&fs_exp=1'  # to enable expunged.. maybe make this an option?
            if cookies:
                self.check_cookie(cookies)
                r = requests.get(hash_search, timeout=30, headers=self.HEADERS, cookies=self.COOKIES)
            else:
                r = requests.get(hash_search, timeout=30, headers=self.HEADERS)
            if not self.handle_error(r):
                return 'error'
            soup = BeautifulSoup(r.text, "html.parser")
        self.end_lock()

        if not no_hits_found_check(soup):
            log_e('No hits found with hash: {}'.format(h))
            continue

        log_i('Parsing html')
        try:
            if soup.body:
                found_galleries[h] = []
                # list view or grid view — the container tag decides which
                # per-gallery element class to collect.
                type = soup.find(attrs={'class': 'itg'}).name
                if type == 'div':
                    visible_galleries = soup.find_all(
                        'div', attrs={'class': 'id1'})
                elif type == 'table':
                    visible_galleries = soup.find_all(
                        'div', attrs={'class': 'it5'})
                log_i('Found {} visible galleries'.format(
                    len(visible_galleries)))
                for gallery in visible_galleries:
                    title = gallery.text
                    g_url = gallery.a.attrs['href']
                    found_galleries[h].append((title, g_url))
        except AttributeError:
            log.exception('Unparseable html')
            log_d("\n{}\n".format(soup.prettify()))
            continue

    if found_galleries:
        log_i('Found {} out of {} galleries'.format(
            len(found_galleries), len(search_string)))
        return found_galleries
    else:
        log_w('Could not find any galleries')
        return {}
# -*- coding:utf-8 -*- import urllib import re import xlwt from bs4 import BeautifulSoup import sys reload(sys) sys.setdefaultencoding('utf8') html = open('e:/test/index.html').read() soup = BeautifulSoup(html, "html.parser") text = soup.prettify() txt = open('e:/test/index.htmlll.txt', 'w') txt.write(text) txt.close() print 'ok'
# Course-notes script: each section demonstrates one requests/bs4 idiom.
import requests
url = 'https://wikipedia.org/'
r = requests.get(url)  # package, send, and catch in a single function
text = r.text

### Scraping the Web ###

# BeatifulSoup package
# parse and extract structured data from HTML
from bs4 import BeautifulSoup
import requests
url = 'https://www.crummy.com/software/BeautifulSoup/'
r = requests.get(url)
html_doc = r.text
# the prettified Soup is indented
soup = BeautifulSoup(html_doc)
pretty_soup = soup.prettify()
print(soup.title)
print(soup.get_text())
for link in soup.find_all('a'):
    print(link.get('href'))

# =============================================================================
# API
# =============================================================================
# Application Programming Interface: allows two software programs to communicate with each other
# OMDb: Open Movie Database API
# Tweepy: Twitter API
# JSONs: JavaScript Object Notation

### Import from local directory
import time
from matplotlib import pyplot as plt
from scipy.interpolate import griddata
import cv2
# from Adafruit_AMG88xx import Adafruit_AMG88xx
import requests
from bs4 import BeautifulSoup

# sensor = Adafruit_AMG88xx()
# num_requests = 20
# while(num_requests > 0):
# for i in range(1,20):

# Poll the device's embedded web server once and dump its XML payload.
r = requests.get("http://192.168.1.101/xml")
soup = BeautifulSoup(r.content)  # NOTE(review): no parser specified; bs4 warns and picks one
data_sensor = soup.prettify()
print("------------------------------------------")
print(data_sensor)
print("------------------------------------------")
# pass

# Access an instance of Configuration
# config = channel.config()

# Start sensor
active = True
while (1):
    if active == True:
        # Read pixels, convert them to values between 0 and 1, map them to an 8x8 grid
        # pixels = sensor.readPixels()
        # NOTE(review): the loop body beyond these comments is truncated in
        # this chunk — as written the `if` has no statements.
# Fragment: `headers`, `today`, and `url` are defined earlier in the original
# file (not visible in this chunk). Queries the Nasdaq Nordic trade feed for
# one Danish bond and keeps only primary/loan-payment OTC trades.
headers[
    'User-Agent'] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.101 Safari/537.36"
data = {}
data['FromDate'] = "2020-09-01"
data['ToDate'] = today
data['SubSystem'] = "Prices"
data['Action'] = "GetTrades"
data['Exchange'] = "NMF"
data['ext_xslt'] = "/nordicV3/t_table_simple.xsl"
data['ext_xslt_lang'] = "en"
data['ext_xslt_tableId'] = "danish-bond-trade-history-table"
data['t__a'] = "1,2,4,6,7,8,18"
data[
    'Instrument'] = "XCSE0:5NDASDRO50"  # Need to change this param to change bonds
data['showall'] = "1"
data['app'] = "/bonds/denmark/microsite"
# Encode the query and fetch the XML response.
data = urllib.parse.urlencode(data)
full_url = url + "?" + data
response = requests.get(full_url, headers=headers)
soup = BeautifulSoup(response.content, 'xml')
# Let pandas parse the embedded HTML table out of the prettified markup.
raw_df = pd.read_html(soup.prettify())[0]
transactions = ["OTC-Primary Transaction", "OTC-Loan Payment"]
df = raw_df.loc[raw_df['Trade type'].isin(transactions)]
df.set_index('Time', inplace=True)
df = df[['Volume', 'Trade type']]
class Model():
    """Fetches one target page, saves a prettified copy, and extracts the
    configured content items into a list of dicts.

    ``target_contents`` is a list of {'selector', 'type', 'elem'} dicts;
    ``target_params`` is a list of single-key dicts merged into every
    extracted object.
    """

    def __init__(self, name, root_url, target_url, target_save, target_params, target_contents):
        self.name = name
        self.root_url = root_url
        self.target_url = target_url
        self.target_save = target_save
        self.target_params = target_params
        self.target_contents = target_contents
        self.soup = object
        self.content_list = []

    def run(self):
        """Scrape the target URL and return the extracted content list."""
        # Fetch the page over HTTP.
        res = requests.get(self.target_url)
        # Parse the document.
        self.soup = BeautifulSoup(res.content, 'lxml')
        # Save a prettified copy as an HTML file.
        with open('files/' + self.name + '.html', mode='w', encoding='utf-8') as fw:
            fw.write(self.soup.prettify())
        # Extract every configured selector.
        for target_content in self.target_contents:
            content = target_content['selector']
            elems = self.soup.select(content)
            # First (largest) match set: allocate one dict per element.
            if len(self.content_list) < len(elems):
                self.content_list = [{} for i in range(len(elems))]
            self.setContents(elems, target_content['type'], target_content['elem'])
        # Attach the static params to every extracted object.
        for i in range(len(self.target_params)):
            param = self.target_params[i]
            key = [key for key in param][0]
            for j in range(len(self.content_list)):
                self.content_list[j].update({key: param[key]})
        return self.content_list

    def setContents(self, elems, _type, prop):
        # Store each element's text (or a normalised attribute) under `prop`.
        for i in range(len(elems)):
            elem = elems[i].getText() if _type == "text" else self.replace(
                elems[i][_type])
            print(elem)
            self.content_list[i].update({prop: elem})

    def replace(self, item):
        """Return `item` as an absolute URL, prefixing root_url if relative."""
        # BUG FIX: the original pattern '^http(s)' only matched "https..."
        # URLs — '(s)' is a mandatory group, not optional — so absolute
        # "http://..." links were treated as relative and wrongly got
        # root_url prepended. 'https?' matches both schemes.
        pattern = '^https?'
        if not re.match(pattern, item):
            if not re.match('^/', item):
                item = '/' + item
            item = self.root_url + item
        return item
from selenium import webdriver
from bs4 import BeautifulSoup

# Render python.org with headless PhantomJS and pretty-print the DOM
# (Python 2 print statement; executable path is machine-specific).
driver = webdriver.PhantomJS(executable_path = r'C:\Users\deepti.waddin\Desktop\phython\phantomjs.exe')
driver.get('http://python.org')
html_doc = driver.page_source
soup = BeautifulSoup(html_doc, 'lxml')
print soup.prettify()
driver.quit()
# Replaces local pdf URL's with an external reference. import os import sys import csv from bs4 import BeautifulSoup count = 0 script_path = os.path.abspath(os.path.dirname(sys.argv[0])) infile_path = os.path.join(script_path, './URLReplaceTest.csv') with open(infile_path) as csvfile: reader = csv.reader(csvfile) for row in reader: html_file = row[0] orig_url = row[1] replace_url = row[2] html_file_path = os.path.abspath( os.path.join(script_path, *html_file.split('/'))) prettyHTML = None with open(html_file_path, mode='r', encoding='utf-8') as f: soup = BeautifulSoup(f, 'html.parser') # Replacing URL target = soup.find('a', href=orig_url) if target: count += 1 print('replacing', count) replacement = target.replace_with(replace_url) prettyHTML = soup.prettify() if prettyHTML: with open(html_file_path, mode='w', encoding='utf-8') as of: of.write(prettyHTML)
from bs4 import BeautifulSoup
import urllib.request
import re

# Scrape post titles and links from the Brainfeeder front page.
url = 'http://www.brainfeedersite.com/'
req = urllib.request.Request(url)
response = urllib.request.urlopen(req)
html = response.read()
soup = BeautifulSoup(html, "lxml")
p = soup.prettify()

artist = []
title = []
url = []   # NOTE(review): rebinds `url` from the page address to a result list
pre = []
'''
for sl in soup.find_all(class_="slideshow"):
    for h2 in sl.find_all("h2"):
        pre.append(h2.text)
        for a in h2.find_all("a"):
            url.append(a['href'])
'''
# Collect every post title and href from the main loop container.
for pc in soup.find_all(class_='list clear', id='loop'):
    for h2 in pc.find_all('h2'):
        for a in h2.find_all('a'):
            title.append(a.text)
            url.append(a['href'])
from bs4 import BeautifulSoup
import urllib

# Fetch the AskReddit front page (a custom User-Agent avoids reddit blocking
# the default urllib agent) and walk the listing entries.
req = urllib.request.Request(url="https://www.reddit.com/r/AskReddit/",
                             headers={'User-agent': 'tester 0.2'})
page = urllib.request.urlopen(req).read()
soup = BeautifulSoup(page, "lxml")
conteudo = soup.prettify()
lista = soup.find("div", {"id": "siteTable"})
dados_reddit = {}
lista_dados = []
for l in lista:
    #print(l)
    score = l.find("div", {"class": "score likes"})
    # Entries without a score div are spacers/ads — skip them.
    if score is not None:
        subreddit = l["data-subreddit"]
        comments_link = l.find(
            "a", {"class": "bylink comments may-blank"})['data-href-url']
        thread_link = l.find("a", {"class": "bylink comments may-blank"})['href']
        title = l.a.text
        # '•' means the score is hidden; treat as zero.
        if score.text == '•':
            upvote = 0
        else:
            # NOTE(review): snippet is truncated here — the else branch body
            # (and the rest of the loop) is missing from this chunk.
from bs4 import BeautifulSoup
from urllib import request

# Fetch baidu.com and demonstrate basic soup tag accessors.
url = "http://www.baidu.com"
rsp = request.urlopen(url)
content = rsp.read()
soup = BeautifulSoup(content, 'lxml')  # bs decodes the response bytes automatically
content = soup.prettify()
print(content)
print("==" * 36)
print(soup.head)            # first <head> element
print("==" * 36)
print(soup.meta)            # first <meta> element
print("==" * 36)
print(soup.link)            # first <link> element
print("==" * 36)
print(soup.link.name)       # its tag name
print("==" * 36)
print(soup.link.attrs)      # its attribute dict
print("==" * 36)
print(soup.link.attrs['type'])
# Fragment: `html_files`, `messagebox`, and `OpenFile` are defined earlier in
# the original file (not visible in this chunk). Copies the #menu element of
# a chosen input file into every other .html file in the directory.
for file in os.listdir("."):
    if file.endswith(".html"):
        html_files.append(file)

# Find which is the input file:
messagebox.showinfo('File Input Helper', "Please select the file to copy from.")
in_file = OpenFile()

# Find the menu of input file
in_soup = BeautifulSoup(open(in_file), 'html.parser')
in_menu = in_soup.find(id="menu")

# Remove input file from the html_files thing -- working 3/25/19
# NOTE(review): removes by substring match while iterating the same list;
# safe only because at most one entry should match — confirm.
for file in html_files:
    if file in in_file:
        html_files.remove(file)

# Go through all files and update the menu
for out_file in html_files:
    # Open the file as soup
    out_soup = BeautifulSoup(open(out_file), 'html.parser')
    # Overwrite menu of the output file
    out_soup.find(id="menu").replace_with(in_menu)
    # Save the file
    with open(out_file, "w") as file:
        file.write(str(out_soup.prettify()))
def get_all_claims(criteria):
    """Crawl africacheck.org fact-check articles and return them as a DataFrame.

    Pages through the latest-reports listing collecting article URLs (bounded
    by criteria.maxClaims and filtered by criteria.avoid_url), then scrapes
    each article into a project ``Claim`` object.
    """
    headers = {
        'user-agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
    }

    # performing a search by each letter, and adding each article to a urls_ var.
    urls_ = {}
    last_page = []
    for page_number in range(1, 500):
        if 0 < criteria.maxClaims <= len(urls_):
            break
        url = "https://africacheck.org/latest-reports/page/" + str(
            page_number) + "/"
        try:
            page = requests.get(url, headers=headers, timeout=5)
            soup = BeautifulSoup(page.text, "lxml")
            soup.prettify()
            links = soup.findAll("div", {"class": "article-content"})
            # Keep paging until the listing repeats or comes back empty.
            if (len(links) != 0) or (links != last_page):
                for anchor in links:
                    anchor = anchor.find('a', href=True)
                    ind_ = str(anchor['href'])
                    if ind_ not in list(urls_.keys()):
                        if 0 < criteria.maxClaims <= len(urls_):
                            break
                        if ind_ not in criteria.avoid_url:
                            urls_[ind_] = ind_
                            print("adding " + str(ind_))
                last_page = links
            else:
                print("break!")
                break
        except:
            # Best-effort crawl: a failed page is logged and skipped.
            print("error=>" + str(url))

    claims = []
    index = 0
    # visiting each article's dictionary and extract the content.
    for url, conclusion in urls_.items():
        print(
            str(index) + "/" + str(len(list(urls_.keys()))) + " extracting " +
            str(url))
        index += 1
        url_complete = str(url)
        # print url_complete
        # try:
        page = requests.get(url_complete, headers=headers, timeout=5)
        soup = BeautifulSoup(page.text, "lxml")
        soup.prettify("utf-8")
        claim_ = Claim()
        claim_.set_url(url_complete)
        claim_.set_source("africacheck")
        # title (strip any "| site name" suffix)
        title = soup.find("meta", {"property": "og:title"})
        title_content = title['content']
        if "|" in title_content:
            title_content = title_content.split("|")[-1]
        claim_.set_title(title_content)
        # date
        date_ = soup.find('time')
        # print date_["content"]
        if date_:
            date_str = search_dates(
                date_['datetime'].split(" ")[0])[0][1].strftime("%Y-%m-%d")
            # print date_str
            claim_.set_date(date_str)
            # print claim_.date
        # rating — later selectors override earlier ones when present.
        truth_rating = ""
        if soup.find("div", {"class": "verdict-stamp"}):
            truth_rating = soup.find("div", {
                "class": "verdict-stamp"
            }).get_text()
        if soup.find("div", {"class": "verdict"}):
            truth_rating = soup.find("div", {"class": "verdict"}).get_text()
        if soup.find("div", {"class": "indicator"}):
            truth_rating = soup.find("div", {"class": "indicator"}).get_text()
            if soup.find("div", {"class": "indicator"}).find('span'):
                truth_rating = soup.find("div", {
                    "class": "indicator"
                }).find('span').get_text()
        claim_.set_rating(
            str(re.sub('[^A-Za-z0-9 -]+', '', truth_rating)).lower().strip())
        # when there is no json
        date_ = soup.find("time", {"class": "datetime"})
        if date_:
            claim_.set_date(date_.get_text())
        # body
        body = soup.find("div", {"id": "main"})
        claim_.set_body(body.get_text())
        # author
        author = soup.find("div", {"class": "sharethefacts-speaker-name"})
        if author:
            claim_.set_author(author.get_text())
        # related links
        divTag = soup.find("div", {"id": "main"})
        related_links = []
        for link in divTag.findAll('a', href=True):
            related_links.append(link['href'])
        claim_.set_refered_links(related_links)
        # claim text — fall back to the article title when no report-claim div.
        if soup.find("div", {"class": "report-claim"}):
            claim_.set_claim(
                soup.find("div", {
                    "class": "report-claim"
                }).find("strong").get_text())
        else:
            claim_.set_claim(claim_.title)
        # tags
        tags = []
        for tag in soup.findAll('meta', {"property": "article:tag"}):
            tags.append(tag["content"])
        claim_.set_tags(", ".join(tags))
        claims.append(claim_.generate_dictionary())

    # creating a pandas dataframe
    pdf = pd.DataFrame(claims)
    return pdf
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
<b><!--Hey, buddy. Want to buy a used parser?--></b>
"""
# (The text above is the tail of the html_doc string literal opened earlier
# in the original file — it is data, not code.)
# Available parsers: html.parser, lxml, ["lxml", "xml"], xml, html5lib
soup = BeautifulSoup(html_doc, "html.parser")
# Output the document in the standard indented structure.
logging.info(soup.prettify())
'''几个浏览结构化数据的方法'''
logging.info(soup.title)
# INFO:root:<title>The Dormouse's story</title>
logging.info(soup.title.name)
# INFO:root:title
logging.info(soup.title.string)
logging.info(soup.title.text)
logging.info(soup.title.get_text())
# INFO:root:The Dormouse's story
logging.info(soup.title.parent.name)
# INFO:root:head
    return obj.__dict__  # NOTE(review): tail of a function whose def line is outside this chunk

# Disabled branch: re-fetch the listing page instead of using the cached copy.
if False:
    contents = do_request(
        "https://homezz.ro/anunturi_apartamente_de-vanzare_in-timisoara-tm.html"
    )
    with open('result_homezz.html', 'wb') as file:
        file.write(contents)

with open('result_homezz.html', 'r', encoding='utf-8') as file:
    contents = file.read()

soup = BeautifulSoup(contents, 'html.parser')
pretty_html = soup.prettify()
# Each listing is an <a class="main_items"> anchor.
offerNodes = soup.body.find_all('a', attrs={'class': 'main_items'})
i = 0
offers = []
for offerNode in offerNodes:
    price = offerNode.find('span', attrs={
        "class": "price"
    }).text.strip().replace("€", "EUR")
    titleNode = offerNode.find('span', attrs={'class': 'title'})
    title = titleNode.text.strip()
    url = offerNode['href']
    imgNode = offerNode.find('div', attrs={'class': 'overflow_image'})
    if imgNode.img:
        # NOTE(review): snippet is truncated here — the body of this if is
        # missing from this chunk.
- To view a specific part: "Inspect Element"
- Safari users: Safari menu, Preferences, Advanced, Show Develop menu in menu bar
- Let's inspect example.html
'''
# (The text above is the tail of a tutorial docstring opened earlier in the
# original file. The code below is Python 2 — note the print statements.)

# read the HTML code for a web page and save as a string
with open('example.html', 'rU') as f:
    html = f.read()

# convert HTML into a structured Soup object
from bs4 import BeautifulSoup
b = BeautifulSoup(html)

# print out the object
print b
print b.prettify()

# 'find' method returns the first matching Tag (and everything inside of it)
b.find(name='body')
b.find(name='h1')

# Tags allow you to access the 'inside text'
b.find(name='h1').text

# Tags also allow you to access their attributes
b.find(name='h1')['id']

# 'find_all' method is useful for finding all matching Tags
b.find(name='p')       # returns a Tag
b.find_all(name='p')   # returns a ResultSet (like a list of Tags)
import requests
from bs4 import BeautifulSoup

# Scrape the product title from a Bed Bath & Beyond product page.
URL = 'https://www.bedbathandbeyond.com/store/product/breville-reg-the-barista-express-trade-espresso-machine/3244573?opbthead=true&ta=typeahead&keyword=breville-espresso'
headers = {
    "User-Agent":
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
}
page = requests.get(URL, headers=headers)
soup1 = BeautifulSoup(page.content, "html.parser")
# Re-parse the prettified markup (normalises the raw page source).
soup2 = BeautifulSoup(soup1.prettify(), "html.parser")
title = soup2.find(id="productTitle")
converted_price = 500
# BUG FIX: `title` is a bs4 Tag (or None) — Tag has no .strip(), so the
# original print(title.strip()) raised AttributeError. Extract the text with
# get_text(strip=True), and guard against the element being absent.
if title is not None:
    print(title.get_text(strip=True))
else:
    print('productTitle element not found')