def parse(self, response):
    # from_encoding tells BeautifulSoup how to decode the raw response bytes.
    soup = BeautifulSoup(response.body, 'lxml', from_encoding='utf-8')
    db_value = []
    for item in soup.select('#Columns')[0].select('li'):
        new_time = item.span.text
        link = item.a['href']
        title = item.a.text
        content = self.get_content(self.domain + link)
        doc = {
            'title': title,
            'content': content,
            'new_time': new_time,
            'origin': response.url,
        }
        db_value.append(doc)
    collection.insert_many(db_value)
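# Note: assigning to `soup.encoding` after parsing, as several snippets below do,
# is a no-op: BeautifulSoup decodes the markup at construction time, and setting
# an attribute on the finished soup changes nothing. A minimal sketch of the two
# supported knobs (the URL here is a placeholder, not from the source):
import requests
from bs4 import BeautifulSoup

resp = requests.get('https://example.com')          # hypothetical URL
soup_a = BeautifulSoup(resp.content, 'lxml',
                       from_encoding='utf-8')       # decode the bytes explicitly
resp.encoding = resp.apparent_encoding              # or fix the Response first,
soup_b = BeautifulSoup(resp.text, 'lxml')           # so .text decodes correctly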
def request_category_urls(self, url):
    """Return all the book URLs from a category."""
    self.url = url
    response = requests.get(self.url)
    if response.ok:
        response.encoding = "utf-8"
        self.content = BeautifulSoup(response.text, features="html.parser")
        self.retrieve_urls_from_content(self.content)
        # Walk the "next page" links; requests.get raises MissingSchema once
        # load_next_page_url no longer yields a usable URL.
        try:
            page_content = self.content
            while True:
                next_page_html = requests.get(self.load_next_page_url(page_content))
                next_page_html.encoding = "utf-8"
                page_content = BeautifulSoup(next_page_html.text, features="html.parser")
                self.retrieve_urls_from_content(page_content)
                if not self.has_next_page(page_content):
                    break
        except MissingSchema:
            pass
    else:
        print("The request returned an error:")
        print(response.status_code)
def get_url(url):
    kv = {'user-agent': 'Mozilla/5.0'}
    f = requests.get(url, headers=kv)
    # Parse the raw bytes; bs4 sniffs the encoding from the document itself.
    bf = BeautifulSoup(f.content, 'lxml')
    print(bf.text)
def data_(driver):
    time.sleep(0.5)  # give the page a moment to finish rendering
    html = driver.page_source
    data = str(pq(html))
    data = BeautifulSoup(data, "lxml")
    return data
def download_img(self):
    ls = []
    if self.start_page == self.end_page:
        html = requests.get(self.html, headers=self.headers)
        html_source = BeautifulSoup(html.content, 'lxml')
        meizi = html_source.select('.commentlist .view_img_link')
        for each_meizi in meizi:
            ls.append(each_meizi)
    else:
        for i in range(int(self.end_page) - int(self.start_page)):
            if self.start_page <= self.end_page:
                # NOTE: self.html is requested unchanged on every pass; the URL
                # would need to advance with i to actually walk the page range.
                html_source = requests.get(self.html, headers=self.headers)
                page_source = BeautifulSoup(html_source.content, 'lxml')
                meizi = page_source.select('.commentlist .view_img_link')
                for each_meizi in meizi:
                    ls.append(each_meizi)
            else:
                print('Start page is greater than the end page!')
                break
    count = 0
    meizi_all_img = len(ls)
    for meizi_picture in ls:
        count += 1
        meizi_picture = meizi_picture['href']
        download_img = requests.get('http:' + meizi_picture, headers=self.headers, stream=True)
        filename = meizi_picture.split('/')[-1]
        with open(filename, 'wb') as file:
            print('Downloading image {0}/{1}'.format(count, meizi_all_img))
            file.write(download_img.content)
def update_queue():
    global authenticated
    global queueSoup
    if not authenticated:
        if queueSoup:
            print(color.YELLOW + 'Error: Could not update queue. You are not authenticated' + color.END)
        else:
            print(color.RED + 'Warning: Could not load queue. You are not authenticated' + color.END)
        return
    if queueSoup:
        print_overridable('Updating queue...')
        resultStr = 'Queue updated'
    else:
        print_overridable('Loading queue...')
        resultStr = 'Queue loaded'
    data = {
        'session_id': cookies['sess_id'],
        'fields': 'last_watched_media,last_watched_media_playhead,most_likely_media,most_likely_media_playhead,media.media_id,media.series_id,media.name,media.episode_number,media.available_time,media.duration,media.collection_name,media.url,series,series.name'
    }
    queueSoup = BeautifulSoup(requests.get('http://api.crunchyroll.com/queue.0.xml', headers=api_headers, params=data, cookies=cookies).text, 'xml')
    if queueSoup.response.error.text == "true":
        if queueSoup.response.code.text == "bad_session":
            msg = "Your session has expired. You are no longer authenticated"
            unset_cache("session_id")
            authenticated = False
        else:
            msg = "{} ({})".format(queueSoup.response.message.text, queueSoup.response.code.text)
        print_overridable(color.RED + 'Error: Could not fetch queue. ' + msg + color.END, True)
    else:
        print_overridable(color.GREEN + resultStr + color.END, True)
def get_year_month_data(year, month, url):
    pageRequest = requests.get("https://stock.wearn.com/" + url + ".asp?Year=" + str(year) + "&month=" + str(month) + "&kind=2330")
    soup = BeautifulSoup(pageRequest.content, 'html.parser')
    return soup
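# A minimal variant of the fetch above (function name is hypothetical): requests
# can assemble and escape the query string itself via `params`, which avoids the
# manual str() concatenation.
import requests
from bs4 import BeautifulSoup

def get_year_month_data_params(year, month, url):
    pageRequest = requests.get("https://stock.wearn.com/" + url + ".asp",
                               params={'Year': year, 'month': month, 'kind': '2330'})
    return BeautifulSoup(pageRequest.content, 'html.parser')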
def get_url(url):
    kv = {'user-agent': 'Mozilla/5.0'}
    f = requests.get(url, headers=kv)
    bf = BeautifulSoup(f.content, 'lxml')
    bcontent = bf.find_all('div', class_='content')
    for k in bcontent:
        print(k.text)
def get_paper(url):
    kv = {'user-agent': 'Mozilla/5.0'}
    f = requests.get(url, headers=kv, verify=True)
    bf = BeautifulSoup(f.text, 'lxml')
    a1 = bf.find_all('div', class_='zm-invite-pager')
    # Re-parse the matched divs so find_all('span') only searches inside them.
    a_bf = BeautifulSoup(str(a1), 'lxml')
    a2 = a_bf.find_all('span')
    return int(a2[-2].text)
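# The str()-and-reparse round trip above works but is unnecessary: a bs4 Tag is
# itself searchable. A sketch of the same lookup done inside the original tree
# (function name is hypothetical):
import requests
from bs4 import BeautifulSoup

def get_paper_direct(url):
    f = requests.get(url, headers={'user-agent': 'Mozilla/5.0'}, verify=True)
    pager = BeautifulSoup(f.text, 'lxml').find('div', class_='zm-invite-pager')
    return int(pager.find_all('span')[-2].text)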
def commited_id(self):
    checklist_url = 'http://113.196.57.124/playerEmail/playerEmail_chklist.php'
    checklist_respone = requests.get(checklist_url, cookies=self.cookie)
    checklist_respone.encoding = 'utf-8'
    soup = BeautifulSoup(checklist_respone.text, "html.parser")
    # Second row of the table, third cell: the committed player name.
    names = soup.table.find_all('tr')[1].find_all('td')[2].text.strip()
    return names
def get_url(url):
    kv = {'user-agent': 'Mozilla/5.0'}
    f = requests.get(url, headers=kv)
    bf = BeautifulSoup(f.content, 'lxml')
    no1 = bf.find_all('div', class_="list_box")
    bf2 = BeautifulSoup(str(no1), 'lxml')
    no2 = bf2.find_all('span', class_="fh_bt")
    for k in no2:
        print(k.text)
def get_price(link):
    html_doc = urllib.request.urlopen(link)
    soup = BeautifulSoup(html_doc, 'html.parser')
    parse_result = soup.find('a', class_='offers-description__link offers-description__link_subsidiary offers-description__link_nodecor')
    if parse_result is None:
        price = 'Net v nalichii'  # transliterated Russian: "not in stock"
    else:
        price = parse_result.text.strip()
    return price
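# Note on multi-class matching: find(..., class_='a b c') only matches when the
# class attribute is exactly that string in that order. A CSS selector matches
# the three classes regardless of order, which is usually what is meant.
# A sketch (function name is hypothetical):
import urllib.request
from bs4 import BeautifulSoup

def get_price_css(link):
    soup = BeautifulSoup(urllib.request.urlopen(link), 'html.parser')
    offer = soup.select_one('a.offers-description__link'
                            '.offers-description__link_subsidiary'
                            '.offers-description__link_nodecor')
    return 'Net v nalichii' if offer is None else offer.text.strip()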
def get_title(url):
    kv = {'user-agent': 'Mozilla/5.0'}
    f = requests.get(url, headers=kv, verify=True)
    bf = BeautifulSoup(f.text, 'lxml')
    h2 = bf.find_all('h2', class_='zm-item-title')
    a_bf = BeautifulSoup(str(h2), 'lxml')
    a = a_bf.find_all('a')
    for i in a:
        print(i.text)
        with open(cur_path + '\\spider_data.txt', 'a+') as out:
            out.write(i.text + '\n')
def get6vtext(url):
    respone = requests.get('http://www.6vhao.tv/')
    soup = BeautifulSoup(respone.text, 'html.parser')
    result = soup.select('body > div:nth-of-type(4) > div.tjlist > ul > li:nth-of-type(1) > a')
    infohtml = requests.get('http://www.6vhao.tv/dy6/2018-03-16/33676.html')
    infohtml.encoding = "gbk"  # set on the response, before .text decodes the body
    info_soup = BeautifulSoup(infohtml.text, 'html.parser')
    text = info_soup.find(id='text').find_all('a')
    for t in text:
        print(t['href'])
def contrib(username):
    """contributions"""
    result = []
    url = URL.format(author=username)
    if '.' in username and RE_IP.match(username):
        url = URL_IP.format(author=username)
    req = requests.get(url)
    soup = BeautifulSoup(req.text, 'lxml')
    rows = soup.select('article table tbody tr')
    item = None
    hasdetail = False
    for i, row in enumerate(rows):
        if not item and not hasdetail:
            info = row.select('td')[0]
            document = info.select('a')[0].string
            try:
                qs = {k: ''.join(v)
                      for k, v in urllib.parse.parse_qs(
                          info.select('a')[2].attrs['href'].split('?')[-1]).items()}
            except IndexError:
                revision = 1
            else:
                try:
                    revision = int(qs['rev'])
                except KeyError:
                    revision = 1
            changes = int(info.select('span')[-1].string)
            when = row.select('td')[2].string.strip()
            item = NamuContrib(document=document, revision=revision, changes=changes, when=when)
            # Find reverts
            revert = info.select('a + i')
            if revert:
                revert = extint(revert[0].string)
                item.revert = revert
        elif item and hasdetail:
            desc = row.select('td')[0].string
            item.desc = desc
            hasdetail = False
        if 'no-line' in row.attrs.get('class', []):
            hasdetail = True
        if item and not hasdetail:
            result.append(item)
            item = None
    return result
def login():
    while True:
        response = session.get(config["URL"]["login"], headers=headers, verify=False)
        if response.status_code == requests.codes.ok:
            soup = BeautifulSoup(response.content, features='html.parser')
            token = soup.select(
                'body > div > div > div > div > div > div > form > input[type=hidden]'
            )[0].get('value')
            response = session.get(config["URL"]["captcha"], verify=False)
            with open('img.png', 'wb') as img:
                img.write(response.content)
            captcha = verifycode('img.png')
            data_login["__RequestVerificationToken"] = token
            data_login["UserName"] = config["stud_info"]["studentno"]
            data_login["Password"] = config["stud_info"]["password"]
            data_login["VerifyCode"] = captcha
            r_temp = session.post(config["URL"]["login"], data=data_login)
            if r_temp.status_code == requests.codes.ok:
                soup = BeautifulSoup(r_temp.content, features='html.parser')
                try:
                    if token != soup.select(
                        'body > div > div > div > div > div > div > form > input[type=hidden]'
                    )[0].get('value'):
                        print(datetime.datetime.now().strftime("%H:%M:%S") + " Wrong Captcha")
                except IndexError:
                    # No hidden token on the returned page: the login succeeded.
                    print("login complete")
                    break
def contrib(username):
    """contributions"""
    result = []
    url = URL.format(author=username)
    if '.' in username and RE_IP.match(username):
        url = URL_IP.format(author=username)
    req = requests.get(url)
    soup = BeautifulSoup(req.text, 'lxml')
    rows = soup.select('article table tbody tr')
    item = None
    hasdetail = False
    for i, row in enumerate(rows):
        if not item and not hasdetail:
            info = row.select('td')[0]
            document = info.select('a')[0].string
            try:
                qs = {k: ''.join(v)
                      for k, v in urllib.parse.parse_qs(
                          info.select('a')[2].attrs['href'].split('?')[-1]).items()}
            except IndexError:
                revision = 1
            else:
                revision = int(qs['rev'])
            changes = int(info.select('span')[-1].string)
            when = row.select('td')[2].string.strip()
            item = NamuContrib(document=document, revision=revision, changes=changes, when=when)
            # Find reverts
            revert = info.select('a + i')
            if revert:
                revert = extint(revert[0].string)
                item.revert = revert
        elif item and hasdetail:
            desc = row.select('td')[0].string
            item.desc = desc
            hasdetail = False
        if 'no-line' in row.attrs.get('class', []):
            hasdetail = True
        if item and not hasdetail:
            result.append(item)
            item = None
    return result
def loadHaier(self, url):
    session = requests.Session()
    print("loadHaier url=" + url)
    haierMain = session.get(url)
    if haierMain.status_code == 200:
        soap = BeautifulSoup(haierMain.text, features="lxml")
        # The page body only returns three JS bundles
        # (polyfills.c38c86ad444630494a92.bundle.js, main.4b3d8dea306811e889d6.bundle.js,
        # http://cdkaz.rrs.com/inline.1557c7584b9dbbbbbcec.bundle.js),
        # so authenticate and fetch the menu instead of parsing this page.
        return self.authAndgetMenu(url)
        # haierUrl = soap.find('a', text=re.compile('服务处理'))['href']  # "service handling" link
        # orderMain = loadHaier(session, baseurl + haierUrl)
    else:
        return False
def crawl():
    today = date.today()
    d1 = today.strftime("%Y%m%d")  # YYYYMMDD date stamp
    URL = 'https://www.myprotein.tw/voucher-codes.list'
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
    }
    page = requests.get(URL, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')
    title = soup.findAll("h2", {"class": "voucher-title"})
    day = soup.findAll("div", {"class": "voucher-end-date"})
    msg = soup.findAll("div", {"class": "voucher-message"})
    numbers = [d.string for d in title]
    if len(title) != len(day) or len(title) != len(msg) or len(day) != len(msg):
        return 1
    discounts = []
    for t in title:
        tmp = t.text.split(u' ')
        for i in range(len(tmp)):
            if tmp[i][0] == u'\u6298':  # '折' marks a "percent off" figure
                # '折' sometimes is not describing what percent off
                try:
                    if int(tmp[i - 1]) < 10:
                        discounts.append(100 - int(tmp[i - 1]) * 10)
                    else:
                        discounts.append(100 - int(tmp[i - 1]))
                except ValueError:
                    continue
    fields = [today, max(discounts)]
    with open('log.csv', 'a') as f:
        writer = csv.writer(f)
        writer.writerow(fields)
    # saveEventsToHtml(title, day, msg, numbers, d1)
    return 0
def get_coordinate():
    with open('loupan_url.txt', 'r', encoding='utf-8') as fin:
        for line in fin.readlines():
            line = line.strip().split('\t')
            url = line[0]
            print(url)
            name = line[1]
            price = line[2]
            la = ''
            lo = ''
            try:
                data = requests.get(url, headers=headers)
                data.encoding = 'gb18030'
                data = BeautifulSoup(data.text, "lxml").body
                try:
                    map_src = data.find('div', {'class': 'mapbox'}).iframe.get('src').strip()
                    map_src = 'http:' + map_src
                    data = requests.get(map_src, headers=headers)
                    data.encoding = 'gb18030'
                    data = BeautifulSoup(data.text, "lxml").body.script.text
                    if 'coord' in data:
                        meta = data.strip().split(',')
                        for item in meta:
                            if 'coordx' in item or 'baidu_coord_x' in item:
                                la = item.strip().split(':')[1].replace('"', '')
                            if 'coordy' in item or 'baidu_coord_y' in item:
                                lo = item.strip().split(':')[1].replace('"', '')
                except Exception:
                    print('error\t', url)
                    with open('error_lalo.txt', 'a') as f:
                        f.write(url + '\n')
            except Exception:
                print('error\t', url)
                with open('error_lalo.txt', 'a') as f:
                    f.write(url + '\n')
            if la != '' or lo != '':
                with open('loupan_url_lalo.txt', 'a', encoding='utf-8') as f:
                    f.write(url + '\t' + name + '\t' + price + '\t' + la + ' ' + lo + '\n')
            time.sleep(3)
def crawl(self):
    ls = []
    super(Webspider, self).__init__()
    if self.start_page == self.end_page:
        html = requests.get(self.html, headers=self.headers)
        html_source = BeautifulSoup(html.content, 'lxml')
        meizi = html_source.select('.commentlist .view_img_link')
        ls.append(meizi)
    else:
        for i in range(int(self.end_page) - int(self.start_page)):
            if self.start_page <= self.end_page:
                # NOTE: self.html never changes here, so every pass re-fetches
                # the same page; the URL would need to advance with i.
                html_source = requests.get(self.html, headers=self.headers)
                page_source = BeautifulSoup(html_source.content, 'lxml')
                meizi = page_source.select('.commentlist .view_img_link')
                ls.append(meizi)
            else:
                print('Start page is greater than the end page!')
                break
    return ls
def get_heritageLink():
    heritageLinks = []
    for eachEntry in get_ancestryLink():
        res = requests.get("https://2e.aonprd.com/" + eachEntry)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, 'lxml')
        # Remove <br> tags so sibling traversal is not interrupted by line breaks.
        for linebreak in soup.find_all('br'):
            linebreak.extract()
        subNav = soup.find("span", {'id': 'ctl00_MainContent_SubNavigation'})
        try:
            heritageLink = subNav.find_all("a")
            heritageLinks.append(heritageLink[2])
        except (AttributeError, IndexError):
            # No sub-navigation (or too few links): no heritages on this page.
            pass
    return heritageLinks
def get_content(self, url):
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    body = response.read()
    soup = BeautifulSoup(body, 'lxml', from_encoding="utf-8")
    content = []
    for p in soup.select(".topic")[0].select('p'):
        content.append({'value': p.text})
    return content
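# urllib2 exists only on Python 2. A sketch of the same fetch under Python 3,
# where the module moved to urllib.request (function name is hypothetical; the
# parsing logic is unchanged):
import urllib.request
from bs4 import BeautifulSoup

def get_content_py3(url):
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    request = urllib.request.Request(url, headers=headers)
    body = urllib.request.urlopen(request).read()
    soup = BeautifulSoup(body, 'lxml', from_encoding="utf-8")
    return [{'value': p.text} for p in soup.select(".topic")[0].select('p')]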
def get_details():
    details = []
    for eachEntry in get_ancestryLink():
        detail = {}
        res = requests.get("https://2e.aonprd.com/" + eachEntry)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, 'lxml')
        # Remove <br> tags so next_sibling traversal skips line breaks.
        for linebreak in soup.find_all('br'):
            linebreak.extract()
        main = soup.find("div", {'id': 'main'})
        pfsLegal = bool(main.find("img", {'title': 'PFS Standard'}))
        # Temporary workaround for an error in the HTML source.
        for finder in main.find_all("a", {'href': eachEntry}):
            name = finder.text
        print('Start with: ', name)
        traitsarray = main.find_all("span", {"class": lambda L: L and L.startswith('trai')})
        traitHolder = [trait.text for trait in traitsarray]
        textHolder = []
        # First we grab the content from the meta tag, then split it into groups
        # of strings separated by < > to pull out any links, join every other
        # group (passing over the link groups), and strip whitespace.
        textRaw = soup.find("meta", {'name': 'description'})['content']
        textSplit = re.split('<(.*?)>', textRaw)
        textClean = (''.join(textSplit[::2])).strip()
        textHolder.append(encoder(textClean))
        description = []
        # The first <i> tags; what we want should be in descRaw[1].
        descRaw = main.find_all("i")
        descSplit = re.split('<(.*?)>', ''.join(descRaw[1]))
        descClean = ''.join(descSplit[::2])
        description.append(encoder(descClean))
        detail['name'] = name
        # The source is the text of the first <a class="external-link"> tag.
        detail['source'] = encoder(soup.find("a", {'class': 'external-link'}).text)
        detail['traits'] = traitHolder
        detail['description'] = description
        detail['text'] = " ".join(textHolder)
        physPoint = soup.find("h2", string='Physical Description').next_sibling
        detail['physical'] = cleanSplit(physPoint)  # cleanSplit cleans up the list
        socPoint = soup.find("h2", string='Society').next_sibling
        detail['society'] = cleanSplit(socPoint)
        alignPoint = soup.find("h2", string='Alignment and Religion').next_sibling
        detail['alignment'] = cleanSplit(alignPoint)
        namesTypePoint = soup.find("h2", string='Names').next_sibling
        detail['namesType'] = cleanSplit(namesTypePoint)
        detail['abilityBoosts'] = titleContent(soup.find("h2", string='Ability Boosts'))
        if soup.find("h2", string='Ability Flaw(s)'):  # not all ancestries have flaws
            detail['abilityFlaws'] = titleContent(soup.find("h2", string='Ability Flaw(s)'))
        detail['hp'] = str(soup.find("h2", string='Hit Points').next_sibling)
        detail['size'] = str(soup.find("h2", string='Size').next_sibling)
        detail['speed'] = str(soup.find("h2", string='Speed').next_sibling)
        langPoint = soup.find("h2", string='Languages').next_sibling
        detail['languages'] = cleanSplit(langPoint)
        # Split so anyone can draw out a single sample name; strip each entry.
        nameList = (encoder(str(soup.find("h3", string='Sample Names').next_sibling))).split(', ')
        nameList = [nameItem.strip() for nameItem in nameList]
        detail['nameList'] = nameList
        mightList = []
        mightRaw = soup.find("h2", string='You Might...').next_sibling
        for mightItem in mightRaw.find_all("li"):
            mightList.append(encoder(encoder(mightItem.text)))
        detail['might'] = mightList
        probablyList = []
        probablyRaw = soup.find("h2", string='Others Probably...').next_sibling
        for probablyItem in probablyRaw.find_all("li"):
            probablyList.append(encoder(encoder(probablyItem.text)))
        detail['probably'] = probablyList
        detail['pfsLegal'] = pfsLegal
        details.append(detail)
    return details
def get_details():
    details = []
    for eachEntry in get_heritageLink():
        nameAncestry = eachEntry.text.replace(' Heritages', '')
        linkAncestry = eachEntry.get('href')
        res = requests.get("https://2e.aonprd.com/" + linkAncestry)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, 'lxml')
        # Remove <br> and <hr> tags so next_sibling traversal is not interrupted.
        for linebreak in soup.find_all('br'):
            linebreak.extract()
        for linebreak2 in soup.find_all('hr'):
            linebreak2.extract()
        main = soup.find("div", {'id': 'main'})
        print('start with: ', nameAncestry)
        # H1 problem on the HTML page: both branches below pick up the element.
        for finder in main.find_all("a", {'href': lambda L: L and L.startswith('Heritages.aspx')}):
            detail = {}
            if not finder.text.endswith('eritages'):
                prevElement = finder.previous_sibling
                pfsLegal = bool(prevElement.find("img", {'title': 'PFS Standard'}))
                name = finder.text
                detail['heritageName'] = name
                th1 = finder.next_sibling
                if th1 is not None:
                    th2 = th1.next_sibling
                    sourceHref = th2.next_sibling
                else:
                    th1 = finder.parent.next_sibling
                    th2 = th1.next_sibling
                    sourceHref = th2.next_sibling
                detail['source'] = sourceHref.text
                detail['sourceLink'] = sourceHref.get('href')
                description = sourceHref.next_sibling
                detail['desc'] = encoder(description)
                ah1 = description.next_sibling
                if ah1 is not None and ah1.name == 'h3':
                    detailAbility = {}
                    detailAbility['abilityName'] = ah1.text
                    detailAbility['abilityActions'] = ah1.find("img").get('alt')
                    traitsFinder = ah1.next_sibling
                    if traitsFinder.get('class'):
                        traitsHolder = []
                        while traitsFinder.get('class'):
                            traitsFinder = traitsFinder.next_sibling
                            traitsHolder.append(traitsFinder.text)
                        detailAbility['abilityTraits'] = traitsHolder
                    ak = traitsFinder.next_sibling
                    abiSource = ak.next_sibling
                    detailAbility['abilitySource'] = abiSource.text
                    detailAbility['abilitySourceLink'] = abiSource.get('href')
                    # Trailing <b>label</b> + value pairs become extra dict entries.
                    extraEleAbi = abiSource.next_sibling
                    while extraEleAbi.name == 'b':
                        detailAbility[extraEleAbi.text] = encoder(extraEleAbi.next_sibling)
                        ar = extraEleAbi.next_sibling
                        extraEleAbi = ar.next_sibling
                    detail['ability'] = detailAbility
                detail['pfsLegal'] = pfsLegal
                detail['ancestryName'] = nameAncestry
                details.append(detail)
    return details
def run_media(pageurl):
    global queueSoup
    while True:
        mediaid = re.search(r'[^\d](\d{6})(?:[^\d]|$)', pageurl).group(1)
        data = {
            'req': 'RpcApiVideoPlayer_GetStandardConfig',
            'media_id': mediaid,
            'video_format': '108',
            'video_quality': '80',
            'current_page': pageurl
        }
        print_overridable('Fetching media information...')
        config = requests.get('http://www.crunchyroll.com/xml/', headers=rpc_headers, params=data, cookies=cookies)
        config.encoding = 'utf-8'
        print_overridable()
        if config.status_code != 200:
            print(color.RED + 'Error: ' + config.text + color.END)
            return
        # What is this even? Does it catch some specific media or 404 pages?
        if len(config.text) < 100:
            print(config.url)
            print(config.text)
            return
        config = BeautifulSoup(config.text, 'lxml-xml')
        # Check for errors
        error = config.find('error')
        if error:
            print(color.RED + 'Error: ' + error.msg.text + color.END)
            return
        # Check if media is unavailable
        error = config.find('upsell')
        if error:
            print(color.RED + 'Error: Media is only available for premium members' + color.END)
            return
        nextEpisode = config.find('nextUrl').text
        series = config.series_title.text
        epnum = config.episode_number.text
        episode = config.episode_title.text
        duration = config.duration.text
        print('{} - E{}'.format(series, epnum))
        print(episode)
        print('Duration: {}'.format(mmss(duration)))
        sub = config.find('subtitle', attrs={'link': None})
        if sub:
            print_overridable('Preparing subtitles...')
            _id = int(sub['id'])
            _iv = sub.iv.text
            _subdata = sub.data.text
            with open(SUBTITLE_TEMP_PATH, 'w') as subfile:
                subfile.write(convert(decode_subtitles(_id, _iv, _subdata).decode('utf-8')))
        print_overridable('Fetching stream information...')
        data['req'] = 'RpcApiVideoEncode_GetStreamInfo'
        streamconfig = BeautifulSoup(requests.post('http://www.crunchyroll.com/xml', headers=rpc_headers, data=data, cookies=cookies).text, 'lxml-xml')
        print_overridable('Starting stream...')
        playhead = 0
        if not streamconfig.host.text:
            url = streamconfig.file.text
            subprocess.call(['mpv', url])
        else:
            host = streamconfig.host.text
            file = streamconfig.file.text
            if re.search(r'fplive\.net', host):
                url1, = re.findall(r'.+/c[0-9]+', host)
                url2, = re.findall(r'c[0-9]+\?.+', host)
            else:
                url1, = re.findall('.+/ondemand/', host)
                url2, = re.findall('ondemand/.+', host)
            subarg = ""
            if sub:
                subarg = " --sub-file " + SUBTITLE_TEMP_PATH
            proc = subprocess.Popen(
                ["rtmpdump -a '" + url2 + "' --flashVer 'WIN 11,8,800,50' -m 15 --pageUrl '" + pageurl + "' --rtmp '" + url1 + "' --swfVfy http://www.crunchyroll.com/vendor/ChromelessPlayerApp-c0d121b.swf -y '" + file + "' | mpv --force-seekable=yes" + subarg + " -"],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.PIPE,
                bufsize=1,
                shell=True
            )
            # Pick up stderr for playhead information
            while True:
                line = proc.stderr.readline().decode("utf-8")
                if line == '' and proc.poll() is not None:
                    break
                timestamp = re.search('AV: ([0-9]{2}:[0-9]{2}:[0-9]{2}) / ([0-9]{2}:[0-9]{2}:[0-9]{2})', line)
                if timestamp:
                    current = [int(i) for i in timestamp.group(1).split(":")]
                    playhead = (current[0] * 60 + current[1]) * 60 + current[2]
                    print_overridable('Playhead: {}'.format(mmss(playhead)))
            print_under()
        if sub:
            os.remove(SUBTITLE_TEMP_PATH)
        if authenticated and input_yes('Do you want to update seen duration to {}/{}'.format(mmss(playhead), mmss(duration))):
            print_overridable('Updating seen duration...')
            data = {
                'req': 'RpcApiVideo_VideoView',
                'media_id': mediaid,
                'cbcallcount': 0,
                'cbelapsed': 30,
                'playhead': config.duration  # NOTE: sends the <duration> Tag, not the watched playhead
            }
            resp = requests.get('http://www.crunchyroll.com/xml/', headers=rpc_headers, params=data, cookies=cookies)
            if resp.status_code != 200:
                print_overridable(color.RED + 'Error: ' + resp.text + color.END, True)
            else:
                print_overridable(color.GREEN + 'Seen duration was saved' + color.END, True)
                update_queue()  # We update the queue after marking episode as seen!
        if nextEpisode != "":
            if input_yes('Another episode is available, do you want to watch it'):
                pageurl = nextEpisode
            else:
                break
        else:
            print(color.RED + 'No more episodes available' + color.END)
            break
)[0].font.font.next_sibling.next_sibling.next_sibling.next_sibling.next_sibling.next_sibling.text
num_pages = int(re.sub(r'\W', '', num_pages))

### Get the data
data_list = defaultdict(list)
for i in range(1, num_pages + 1):
    start_time = time.time()
    ### Get web page
    page = requests.get('http://www.baibai.com.tw/temple.asp?Page=' + str(i)
                        + '&name=&keyword=&morder=')
    page.encoding = 'big5'
    page = BeautifulSoup(page.text)
    num_items_per_page = int(
        len(page.tr.next_sibling.next_sibling.find_all(
            'a', {"href": re.compile(r'^view-temple\.asp\?com_ser=')})) / 3)
    step = int(
        len(page.tr.next_sibling.next_sibling.find_all(
            'a', {"href": re.compile(r'^view-temple\.asp\?com_ser=')})) / num_items_per_page)
    ### Get data
    ## Get name of the temple
from bs4 import BeautifulSoup
from urllib.request import urlopen
import sys

# Read the list of article paths to fetch.
f = open(r'content_page_list_tichong_tichu.txt', 'r')
a = []
for i in f.readlines():
    lst = i.strip('\n')
    a.append(lst)
f.close()

for x1 in a:
    try:
        html = urlopen("http://www.langji520.com" + x1)
        ctObj = BeautifulSoup(html, "html.parser")
        content = ctObj.find(name='div', attrs={'class': "article_con"}).find("p").get_text()
        print(content)
        # `with` guarantees the output file is closed even if the write fails.
        with open("langji520.txt", "a") as fo:
            fo.write(content + '\r\n')
        print("write Successful!" + str(x1) + "Still remaining:" + str(len(a) - a.index(x1)))
    except AttributeError as reason:
        print("AttributeError:" + str(reason) + str(x1))
    except UnicodeEncodeError as reason:
        print(str(reason) + str(x1))
def get_details():
    details = []
    for eachEntry in get_ancestryLink():
        detail = {}
        res = requests.get("https://2e.aonprd.com/" + eachEntry)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, 'lxml')
        # Remove <br> tags so sibling traversal skips line breaks.
        for linebreak in soup.find_all('br'):
            linebreak.extract()
        main = soup.find("div", {'id': 'main'})
        pfsLegal = bool(main.find("img", {'title': 'PFS Standard'}))
        # Temporary workaround for an error in the HTML source.
        for finder in main.find_all("a", {'href': eachEntry}):
            name = finder.text
        print('Start with: ', name)
        traitsarray = main.find_all("span", {"class": lambda L: L and L.startswith('trai')})
        traitHolder = [trait.text for trait in traitsarray]
        description = []
        children = main.contents
        detailHolder = []
        for child in children:
            stringContents = str(child)  # str, not bytes, so startswith('<') works
            if stringContents.startswith("<"):
                if child.name == "a":
                    try:
                        if child['class'][0] == "external-link":
                            detail['source'] = child.text
                    except (KeyError, TypeError):
                        pass
                if child.name == "b":
                    if child.text != "Source":
                        tagType = child.text.lower()
                if child.name == "a":
                    try:
                        if child['class'][0] == "external-link":
                            nextchild = child.next_sibling
                            if nextchild.name == "i":
                                child = nextchild
                                description.append(encoder(child.text))
                                description.extend(titleContent(child))
                    except (KeyError, TypeError):
                        pass
                if child.name == "h2" and child.text == "You Might...":
                    ul = child.next_sibling
                    liList = []
                    for li in ul.findAll('li'):
                        liList.append(encoder(encoder(li.text)))
                    detail['might'] = liList
                if child.name == "h2" and child.text == "Others Probably...":
                    ul = child.next_sibling
                    liList = []
                    for li in ul.findAll('li'):
                        liList.append(encoder(encoder(li.text)))
                    detail['probably'] = liList
                if child.name == "h2" and child.text == "Physical Description":
                    detail['physical'] = titleContent(child)
                if child.name == "h2" and child.text == "Society":
                    detail['society'] = titleContent(child)
                if child.name == "h2" and child.text == "Alignment and Religion":
                    detail['alignment'] = titleContent(child)
                if child.name == "h2" and child.text == "Names":
                    detail['namesType'] = titleContent(child)
                if child.name == "h3" and child.text == "Sample Names":
                    nameList = []
                    for word in child.next_sibling.split(', '):
                        nameList.append(encoder(word))
                    detail['nameList'] = nameList
                if child.name == "h2" and child.text == "Hit Points":
                    detail['hp'] = child.next_sibling
                if child.name == "h2" and child.text == "Size":
                    detail['size'] = child.next_sibling
                if child.name == "h2" and child.text == "Speed":
                    detail['speed'] = child.next_sibling
                if child.name == "h2" and child.text == "Ability Boosts":
                    detail['abilityBoosts'] = titleContent(child)
                if child.name == "h2" and child.text == "Ability Flaw(s)":
                    detail['abilityFlaws'] = titleContent(child)
                if child.name == "h2" and child.text == "Languages":
                    languagesHolder = []
                    nextChild = child.next_sibling
                    while nextChild is not None and nextChild.name != "h2" and nextChild.name != "h3":
                        if nextChild.name == "a":
                            languagesHolder.append(encoder(nextChild.text))
                            nextChild = nextChild.next_sibling
                        else:
                            languagesHolder.append(encoder(nextChild))
                            break
                    detail['languages'] = languagesHolder
                if child.name == "h2" and child.text == "Languages":
                    # Every heading after Languages goes into numbered 'other' entries.
                    t = 0
                    while child.find_next("h2"):
                        otherHolder = []
                        titolo = child.find_next("h2")
                        t += 1
                        otherHolder.append(titolo.text)
                        otherHolder.append(titolo.next_sibling)
                        detail['other' + str(t)] = otherHolder
                        child = child.find_next("h2")
            else:
                if not stringContents.isspace():
                    detailHolder.append(encoder(child.text))
        detail['pfsLegal'] = pfsLegal
        detail['name'] = name
        detail['traits'] = traitHolder
        detail['description'] = description
        detail['text'] = " ".join(detailHolder)
        details.append(detail)
    return details
def get_text(self, URL):
    # Each outlet keeps its article body under a different CSS class; the first
    # URL substring that matches selects the class to scrape.
    site_classes = [
        ('news.na', '_article_body_contents'),
        ('enter', 'end_body_wrp'),
        ('chosun', 'par'),
        ('daum', 'news_view'),
        ('joins', 'article_body fs1 mg'),
        ('hani', 'text'),
        ('kmib', 'tx'),
        ('hankookilbo', 'article-story'),
        ('seoul', 'user-snb-wrapper'),
        ('asiatoday', 'news_bm'),
    ]
    text = ''
    for key, css_class in site_classes:
        if key in URL:
            source_code_from_URL = urllib.request.urlopen(URL)
            soup = BeautifulSoup(source_code_from_URL, 'html.parser')
            for item in soup.find_all(class_=css_class):
                text = text + str(item.find_all(text=True))
            break
    # Strip whitespace entities, scripts, escaped \xa0, punctuation, and Latin
    # letters. Every substitution is idempotent, so a single pass is enough.
    text = re.sub(' | |\t|\r|\n', ' ', text)
    text = re.sub('<script.*?>.*?</script>', '', text)
    text = text.replace(r"\xa0", "")
    text = re.sub(r'[\{\}\[\]\/?.,;:|\)*~`!^\-_+<>@\#$%&\\\=\(\'\"]', "", text)
    text = re.sub('[a-zA-Z]', '', text)
    text = text.replace(" ", "")
    text = text.strip()
    return text
import requests
import time
from bs4 import BeautifulSoup
from flask import Flask, request, abort
from selenium import webdriver
import sys
import datetime
import json
import os

# Poll the daily pig-sale page every five minutes.
while True:
    pageRequest = requests.get('https://www.naif.org.tw/infoPigSellDaily.aspx')
    soup = BeautifulSoup(pageRequest.content, 'html.parser')
    pigpig = soup.find(attrs={"class": "ScrollForm"}).text
    print(pigpig)
    time.sleep(60 * 5)