def spide(self):
    self.subject = ''
    self.content = ''
    try:
        request = urllib2.Request(self.spiderUrl)
        response = urllib2.urlopen(request)
        soup = BeautifulSoup(response.read().decode('utf-8'))
        self.subject = soup.title.string.output_ready()
        self.subject = self.subject.encode('utf-8')
        self.content = '[b]' + soup.find("div", class_="article-experpt explain").string.output_ready() + '[/b]' + '\n'
        main_body = soup.find("div", class_="js-article-body")
        for child in main_body.children:
            if child.string:
                self.content = self.content + '\t' + child.string.output_ready() + '\n'
            elif child.contents:
                # if child.contents[0].name == 'strong':
                #     self.content = self.content + '[color=Sienna]' + child.string.output_ready() + '[/color]' + '\n'
                if child.contents[0].name == 'img':
                    self.content = self.content + '[align=center][img=660,440]' + child.contents[0]['src'] + '[/img][/align]' + '\n'
        self.content = self.content.encode('utf-8')
        self.content = ('[font=微软雅黑]' + self.content + '[/font]' + '\n\n\n\n ' +
                        '本文转自' + self.spiderUrl + '\n' +
                        '\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t-----自动转贴')
        print (self.content)
        return True
    except urllib2.URLError, e:
        if hasattr(e, "code"):
            print e.code
        if hasattr(e, "reason"):
            print e.reason
        return False
def post_von_wagner():
    ''' check if pickle file is already there, else get it '''
    today = datetime.date.today().isoformat()
    if os.path.exists(today + '.p'):
        temp_file = open(today + '.p', 'rb')
        trash = pickle.load(temp_file)
    else:
        temp_file = open(today + '.p', 'wb+')
        ''' get the link to the newest post '''
        URL = "http://www.bild.de/themen/personen/franz-josef-wagner/kolumne-17304844.bild.html"
        r = requests.get(URL)
        soup = BeautifulSoup(r.text)
        URL = 'http://www.bild.de' + soup.find('div', 'tr').find('a').get('href')
        ''' get the text out of the article '''
        r = requests.get(URL)
        soup = BeautifulSoup(r.text)
        trash = soup.find('div', 'txt clearfix').text
        ''' clean everything up and create a list '''
        trash = re.sub(r'[\;\,\(\).\"\@\:\?]', ' ', trash)
        trash = trash.split()
        ''' save trash to pickle file '''
        pickle.dump(trash, temp_file)
    temp_file.close()
    return trash[:-24]
def recognise_eHentai(link, path):
    url = str(link)
    page = urllib2.urlopen(url).read()
    soup = BeautifulSoup(page)
    name = soup.findAll('title')
    name = name[0].get_text().encode('utf-8')
    name = str(name)
    path = path + '\\' + name
    download_eHentai(link, path)
    pages = soup.find_all('span')
    pages = pages[1].get_text()
    pages = int(pages)
    z = 0
    while (pages > z):
        z = z + 1
        sopa = soup.find('div', 'sn')
        sopa = sopa.find_all('a')
        sopa = sopa[2].get('href')
        url = str(sopa)
        download_eHentai(url, path)
        page = urllib2.urlopen(url).read()
        soup = BeautifulSoup(page)
    sopa = soup.find('div', 'sn')
    sopa = sopa.find_all('a')
    sopa = sopa[2].get('href')
    download_eHentai(sopa, path)
def getSCLeg(partyDict):
    houseSoup = BeautifulSoup(urllib2.urlopen('http://www.scstatehouse.gov/member.php?chamber=H&order=D').read())
    senateSoup = BeautifulSoup(urllib2.urlopen('http://www.scstatehouse.gov/member.php?chamber=S&order=D').read())
    houseTable = houseSoup.find('div', {'class': 'mainwidepanel'}).find_all('div', {'style': 'width: 325px; height: 135px; margin: 0 0 0 20px; text-align: left; float: left;'})
    senateTable = senateSoup.find('div', {'class': 'mainwidepanel'}).find_all('div', {'style': 'width: 325px; height: 135px; margin: 0 0 0 20px; text-align: left; float: left;'})
    dictList = []
    for item in houseTable:
        repInfo = {}
        link = item.find('a')
        if link is not None:
            repInfo['Website'] = 'http://www.scstatehouse.gov' + link.get('href')
            repInfo['Name'] = re.sub(r'\[.*$', '', link.string.strip()).strip().replace('  ', ' ').replace('  ', ' ')
            repInfo['Party'] = partyDict[str(re.sub(r'^.*\[(.*)\].*$', r'\1', link.string.strip()))]
        else:
            repInfo['Name'] = 'VACANT'
        repInfo['District'] = 'SC State House ' + re.sub(r'^.*(District [0-9]*).*$', r'\1', item.get_text())
        dictList.append(repInfo)
    for item in senateTable:
        repInfo = {}
        link = item.find('a')
        if link is not None:
            repInfo['Website'] = 'http://www.scstatehouse.gov' + link.get('href')
            repInfo['Name'] = re.sub(r'\[.*$', '', link.string.strip()).strip().replace('  ', ' ').replace('  ', ' ')
            repInfo['Party'] = partyDict[str(re.sub(r'^.*\[(.*)\].*$', r'\1', link.string.strip()))]
        else:
            repInfo['Name'] = 'VACANT'
        repInfo['District'] = 'SC State Senate ' + re.sub(r'^.*(District [0-9]*).*$', r'\1', item.get_text())
        dictList.append(repInfo)
    return dictList
def write(self, caption_set):
    caption_set = deepcopy(caption_set)
    sami = BeautifulSoup(SAMI_BASE_MARKUP, u"lxml-xml")
    caption_set.layout_info = self._relativize_and_fit_to_screen(caption_set.layout_info)
    primary = None
    for lang in caption_set.get_languages():
        self.last_time = None
        if primary is None:
            primary = lang
        caption_set.set_layout_info(lang, self._relativize_and_fit_to_screen(caption_set.get_layout_info(lang)))
        for caption in caption_set.get_captions(lang):
            # Loop through all captions/nodes and apply transformations to
            # layout in function of the provided or default settings
            caption.layout_info = self._relativize_and_fit_to_screen(caption.layout_info)
            for node in caption.nodes:
                node.layout_info = self._relativize_and_fit_to_screen(node.layout_info)
            sami = self._recreate_p_tag(caption, sami, lang, primary, caption_set)
    stylesheet = self._recreate_stylesheet(caption_set)
    sami.find(u"style").append(stylesheet)
    a = sami.prettify(formatter=None).split(u"\n")
    caption_content = u"\n".join(a[1:])
    return caption_content
def get_li(doc):
    soup = BeautifulSoup(doc, 'html.parser')
    ol = soup.find('ol', class_='grid_view')
    name = []       # movie titles
    star_con = []   # number of ratings
    score = []      # scores
    info_list = []  # short reviews
    for i in ol.find_all('li'):
        detail = i.find('div', attrs={'class': 'hd'})
        movie_name = detail.find('span', attrs={'class': 'title'}).get_text()  # movie title
        level_star = i.find('span', attrs={'class': 'rating_num'}).get_text()  # score
        star = i.find('div', attrs={'class': 'star'})
        star_num = star.find(text=re.compile('评价'))  # number of ratings
        info = i.find('span', attrs={'class': 'inq'})  # short review
        if info:  # check whether the movie has a short review
            info_list.append(info.get_text())
        else:
            info_list.append('无')
        score.append(level_star)
        name.append(movie_name)
        star_con.append(star_num)
    page = soup.find('span', attrs={'class': 'next'}).find('a')  # get the next page
    if page:
        return name, star_con, score, info_list, DOWNLOAD_URL + page['href']
    return name, star_con, score, info_list, None
def sources(self, url, hostDict, hostprDict):
    try:
        sources = []
        if url == None:
            return sources
        year = url['year']
        h = {'User-Agent': client.randomagent()}
        title = cleantitle.geturl(url['title']).replace('-', '+')
        url = urlparse.urljoin(self.base_link, self.search_link % title)
        r = requests.get(url, headers=h)
        r = BeautifulSoup(r.text, 'html.parser').find('div', {'class': 'item'})
        r = r.find('a')['href']
        r = requests.get(r, headers=h)
        r = BeautifulSoup(r.content, 'html.parser')
        quality = r.find('span', {'class': 'calidad2'}).text
        url = r.find('div', {'class': 'movieplay'}).find('iframe')['src']
        if not quality in ['1080p', '720p']:
            quality = 'SD'
        valid, host = source_utils.is_host_valid(url, hostDict)
        sources.append({'source': host, 'quality': quality, 'language': 'en',
                        'url': url, 'direct': False, 'debridonly': False})
        return sources
    except:
        print("Unexpected error in Furk Script: check_api", sys.exc_info()[0])
        exc_type, exc_obj, exc_tb = sys.exc_info()
        print(exc_type, exc_tb.tb_lineno)
        return sources
def get_links_from(channel, pages):
    # http://bj.ganji.com/jiaju/a3o11/
    # http://bj.ganji.com/wupinjiaohuan/o3/   # two different URL patterns
    if channel in ['http://bj.ganji.com/xuniwupin/', 'http://bj.ganji.com/qitawupin/',
                   'http://bj.ganji.com/ershoufree/', 'http://bj.ganji.com/wupinjiaohuan/']:
        list_view = '{}o{}/'.format(channel, str(pages))
        wb_data = requests.get(list_view, headers=headers)
        # time.sleep(1)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        if soup.find('ul', 'pageLink clearfix'):
            for link in soup.select('#wrapper > div.leftBox > div.layoutlist > dl > dt > div > a'):
                item_link = link.get('href')
                url_list.insert_one({'url': item_link})
                print(item_link)
        else:
            # pass
            print('重复页面')  # duplicate page
    else:
        list_view = '{}a3o{}/'.format(channel, str(pages))
        wb_data = requests.get(list_view, headers=headers)
        # time.sleep(1)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        if soup.find('ul', 'pageLink clearfix'):
            for link in soup.select('#wrapper > div.leftBox > div.layoutlist > dl > dd.feature > div > ul > li > a'):
                item_link = link.get('href')
                url_list.insert_one({'url': item_link})
                print(item_link)
        else:
            # pass
            print('重复页面')  # duplicate page
def parse_round(bsoup, rnd, gid, airdate):
    """Parses and inserts the list of clues from a whole round"""
    round_id = "jeopardy_round" if rnd == 1 else "double_jeopardy_round"
    r = bsoup.find(id=round_id)
    # the game may not have all the rounds
    if not r:
        return False
    # the list of categories for this round
    categories = [c.get_text() for c in r.find_all("td", class_="category_name")]
    # the x_coord determines which category a clue is in
    # because the categories come before the clues, we will
    # have to match them up with the clues later on
    x = 0
    for a in r.find_all("td", class_="clue"):
        if not a.get_text().strip():
            continue
        value = a.find("td", class_=re.compile("clue_value")).get_text().lstrip("D: $")
        value = re.sub('[:$,]', '', value)
        text = a.find("td", class_="clue_text").get_text()
        answerDiv = BeautifulSoup(a.find("div", onmouseover=True).get("onmouseover"), "lxml")
        answer = answerDiv.find("em", class_="correct_response").get_text()
        right = answerDiv.find("td", class_="right")
        if right == None:
            right = "Triple Stumper"
        else:
            right = right.get_text()
        insert([gid, airdate, rnd, categories[x], value, text, answer, right])
        x = 0 if x == 5 else x + 1
    return True
def scrape_song_metadata(soup, verbose=False):
    result = {}
    first_soup = soup.find("div", {"class": "song_header-primary_info"})
    first_soup = BeautifulSoup(soup.prettify(), "html.parser")
    artist = first_soup.find("a", {"class": "song_header-primary_info-primary_artist"})
    artist = clean_text(artist.string)
    if verbose:
        print "Artist : " + artist.encode('utf-8')
    result["artist"] = artist
    song = first_soup.find("h1", {"class": "song_header-primary_info-title"})
    song = clean_text(song.string)
    if verbose:
        print "Song : " + song.encode('utf-8')
    result["song"] = song
    labels = first_soup.findAll("span", {"class": "song_info-label"})
    labels = [clean_text(l.string) for l in labels]
    contents = first_soup.findAll("span", {"class": "song_info-info"})
    contents = [BeautifulSoup(c.prettify(), "html.parser") for c in contents]
    contents = [c.a for c in contents]
    for i in range(len(labels)):
        if contents[i]:
            if verbose:
                print labels[i] + " :"
                print " " + clean_text(contents[i].string).encode('utf-8')
                print " " + contents[i]['href'].encode('utf-8')
            result[labels[i]] = {"name": clean_text(contents[i].string),
                                 "link": geniusify(contents[i]['href'])}
    return result
def process_html(self, url, html):
    soup = BeautifulSoup(html)
    details_url = re.sub(r'#.+', '', url, re.I | re.S) + '?utm_source=Muzei&utm_campaign=Muzei'
    title = soup.find(itemprop='name').get_text()
    author = soup.find(itemprop='author').get_text()
    completion_year_el = soup.find(itemprop='dateCreated')
    byline = author + ((', ' + completion_year_el.get_text()) if completion_year_el else '')
    image_url = soup.find(id='paintingImage')['href']
    if not title or not author or not image_url:
        self.response.out.write('Could not parse HTML')
        self.response.set_status(500)
        return
    publish_date = (datetime.datetime
                    .utcfromtimestamp(int(self.request.get('publishDate')) / 1000)
                    .date())
    image_url, thumb_url = maybe_process_image(
        image_url, NO_CROP_TUPLE,
        publish_date.strftime('%Y%m%d') + ' ' + title + ' ' + byline)
    # create the artwork entry
    new_artwork = FeaturedArtwork(
        title=title,
        byline=byline,
        image_url=image_url,
        thumb_url=thumb_url,
        details_url=details_url,
        publish_date=publish_date)
    new_artwork.save()
    self.response.set_status(200)
def getLinksFromWooyun(html):
    soup = BeautifulSoup(html)
    soup = soup.find('div', class_="content")
    soup = soup.find('table', class_="listTable")
    html = soup.find('tbody')
    if not html:
        now = time.strftime('%H:%M:%S', time.localtime(time.time()))
        print "[" + str(now) + "] [WARNING] failed to crawl"
    else:
        html_doc = html.find_all('tr')
        if not html_doc:
            now = time.strftime('%H:%M:%S', time.localtime(time.time()))
            print "[" + str(now) + "] [WARNING] failed to crawl"
        else:
            for doc in html_doc:
                try:
                    td = doc.find_all('td')[2]
                    atag = td.find('a')
                    link = atag.get('href').strip()
                    if not isExisted(link, 'wooyun.txt'):
                        logfile(link, 'wooyun.txt')
                        now = time.strftime('%H:%M:%S', time.localtime(time.time()))
                        print "[" + str(now) + "] [INFO] " + link
                    else:
                        now = time.strftime('%H:%M:%S', time.localtime(time.time()))
                        print "[" + str(now) + "] [WARNING] url is duplicate [" + link + "]"
                except Exception:
                    pass
def get_parteredmenyek(content):
    results = list()
    soup = BeautifulSoup(content)
    registry_table = soup.find('p', text='a) A választók nyilvántartása').find_next('table')
    row = registry_table.find_all('tr')[3]
    total = row.find_all('td')[4].text.replace(' ', '')
    voter_table = registry_table.find_next('table')
    row = voter_table.find_all('tr')[3]
    # voters = row.find_all('td')[4].text.replace(' ', '')
    # non_voters = int(total) - int(voters[0].replace(' ', ''))
    non_voters = 0
    nonvoter = dict()
    nonvoter['statistics_code'] = 'non-voters'
    nonvoter['statistics_name'] = 'Non voters'
    nonvoter['value'] = non_voters
    results.append(nonvoter)
    lista_table = soup.find('p', text='Érvényes szavazatok száma:').find_next('table')
    rows = lista_table.find_all('tr')
    for row in rows[1:]:
        result = {}
        cells = row.find_all('td')
        result['statistics_code'] = slugify(cells[1].text)
        result['statistics_name'] = cells[1].text
        result['value'] = cells[2].text.replace(' ', '')
        # result['percent'] = cells[3].text.replace('.', '')
        results.append(result)
    return results
def attachTranslateAndAppreciation(dataSession, postId, shangxiUrls, fanyiUrls):
    # First, add the translation ("fanyi") records
    if len(fanyiUrls) is not 0:
        for i, translateUrl in enumerate(fanyiUrls):
            html = requests.get(translateUrl).content
            soup = BeautifulSoup(html, "lxml")
            contentList = soup.find('div', attrs={'class': 'shangxicont'}).find_all('p')[1:-1]
            translateText = ''
            for contentElement in contentList:
                translateText += contentElement.get_text()
            if len(translateText) is not 0:
                translatePost = translatePosts(postId=postId, translateUrl=translateUrl,
                                               translateText=translateText, rate=i + 1)
                dataSession.add(translatePost)
                dataSession.commit()
    # Then, add the appreciation ("shangxi") records
    if len(shangxiUrls) is not 0:
        for i, appreciationUrl in enumerate(shangxiUrls):
            html = requests.get(appreciationUrl).content
            soup = BeautifulSoup(html, "lxml")
            contentList = soup.find('div', attrs={'class': 'shangxicont'}).find_all('p')[1:-1]
            appreciationText = ''
            for contentElement in contentList:
                appreciationText += contentElement.get_text()
            if len(appreciationText) is not 0:
                appreciationPost = appreciationPosts(postId=postId, appreciationUrl=appreciationUrl,
                                                     appreciationText=appreciationText, rate=i + 1)
                dataSession.add(appreciationPost)
                dataSession.commit()
def get_article(article_id, abs_file_path):
    '''Fetch an article; return True on success, or False if it is not within the last week'''
    article_url = 'http://www.tuicool.com/articles/{}'.format(article_id)
    try:
        print article_url
        common.rand_sleep(5, 10)
        res = l.session.get(article_url)
        logging.info('return url {} success'.format(res.url))
        soup = BeautifulSoup(res.text, 'html.parser')
        title = str(soup.find('div', class_='article_detail_bg').find('h1')
                    .get_text())
        print title
        pub_time = re.sub(re.compile('时间[\s\S]{2}'), '',
                          str(soup.find('span', class_='timestamp').get_text()).strip())
        keywords = [str(item.get_text())
                    for item in soup.find_all('span', class_='new-label')]
        content = str(soup.find('div', class_='article_body'))
        # only keep articles published within the last week
        timedelta = datetime.date.today() - datetime.datetime\
            .strptime(pub_time, '%Y-%m-%d %H:%M:%S').date()
        if timedelta.days > 7:
            return False
        with open(abs_file_path, 'w') as f:
            f.write('标题:' + title + '\n')
            f.write('发布时间:' + pub_time + '\n')
            f.write('关键字:' + ', '.join(keywords) + '\n')
            f.write('内容:' + content + '\n')
        return True
    except Exception, e:
        print Exception, e
        logging.error('run error', exc_info=True)
        return False
def sort_articles(path="../elife-articles/"):
    # Get all xml files from path
    path_files = [f for f in listdir(path) if isfile(join(path, f))] or []
    xml_files = filter(lambda x: '.xml' in x, path_files)
    if len(xml_files) > 0:
        print "Total number of articles: %d" % len(xml_files)
        counter, errors = 0, 0
        for article in xml_files:
            # For each xml file add info to the ARTICLES dictionary
            try:
                soup = BeautifulSoup(open(path + article), ["lxml", "xml"])
                journal_id = soup.find("journal-id").string
                pub_id = soup.find("article-id").string
                subjects = soup.find_all("subj-group")
                auth_sn = soup.find("surname").string
                auth_fn = soup.find("given-names").string
                if len(subjects) == 2:
                    counter += 1
                    unique_subj = subjects[1].string
                    if not unique_subj in ARTICLES.keys():
                        ARTICLES[unique_subj] = [{"file": article,
                                                  "journal_id": journal_id,
                                                  "pub_id": pub_id,
                                                  "first_auth": "%s, %s" % (auth_sn, auth_fn)}]
                    else:
                        ARTICLES[unique_subj].append({"file": article,
                                                      "journal_id": journal_id,
                                                      "pub_id": pub_id,
                                                      "first_auth": "%s, %s" % (auth_sn, auth_fn)})
            except:
                errors += 1
                pass
        print "There are %d articles out of %s with a single subject." % (counter, len(xml_files))
        print "There is/are %d exception(s). \n" % errors
def saveNovelInfo(novelInfo):
    html = urlopen(novelInfo.website + novelInfo.novelId)
    bsObj = BeautifulSoup(html, "lxml")
    novelInfo.name = bsObj.h2.get_text()
    cover = bsObj.find("div", {"class": "cover"})
    novelInfo.avatar = cover.find("img").attrs['src']
    small = bsObj.find("div", {"class": "small"})
    for child in small.children:
        label = child.get_text().split(":")[0]
        value = child.get_text().split(":")[1]
        if label == "作者":
            novelInfo.author = value
        elif label == "分类":
            novelInfo.category = value
        elif label == "状态":
            novelInfo.state = value
        elif label == "字数":
            novelInfo.words = value
        elif label == "更新时间":
            novelInfo.latest_updatetime = value
        elif label == "最新章节":
            novelInfo.latest_chapter = value
    intro = bsObj.find("div", {"class": "intro"})
    introArray = intro.get_text().split(":")
    novelInfo.intro = introArray[1].replace("\u3000", "")
def manga_parse(html, source):
    if not html:
        Logger.info('html is empty!')
        return None
    soup = BeautifulSoup(html, from_encoding='gbk')
    # Logger.debug(soup.prettify())
    intro = soup.find(id='intro_l')
    title = intro.find('h1').string.decode('utf-8').encode('utf-8')
    search = soup.find_all('p', attrs={'class': 'w260'})
    up_time = search[0].find('span').string
    tmp = search[1].contents
    if (len(tmp) > 1):
        author = tmp[1]
    else:
        author = ''
    added_time = search[2].contents[1]
    cover = soup.find('div', attrs={'class': 'info_cover'}).p.img['src']
    intro = soup.find('div', id='intro1').p.string
    if (intro):
        intro = intro.strip()
    up_time = datetime.strptime(up_time, '%Y-%m-%d').date()
    added_time = datetime.strptime(added_time, '%Y-%m-%d').date()
    plot = plot_parse(soup, source)
    return Manga(
        added_at=added_time, update_at=up_time, name=title, author=author,
        introduction=intro, poster=cover, source=source, plot=plot)
def write(self, captions, force=''):
    dfxp = BeautifulSoup(dfxp_base, 'xml')
    dfxp.find('tt')['xml:lang'] = "en"
    for style, content in captions['styles'].items():
        if content != {}:
            dfxp = self._recreate_styling_tag(style, content, dfxp)
    body = dfxp.find('body')
    if force:
        captions['captions'] = self._force_language(force, captions['captions'])
    for lang in captions['captions']:
        div = dfxp.new_tag('div')
        div['xml:lang'] = '%s' % lang
        for sub in captions['captions'][lang]:
            p = self._recreate_p_tag(sub, dfxp)
            div.append(p)
        body.append(div)
    return unicode(dfxp.prettify(formatter=None))
def create_post_data(self, username, password, flag):
    if flag == "no_captcha":
        try:
            r = session.get(BASE_URL)
        except requests.exceptions.ConnectionError as err:
            print("无网络连接,程序退出")  # no network connection, exiting
            sys.exit()
        base = r.content.decode('utf-8')
        base_soup = BeautifulSoup(base, 'lxml')
        lt = base_soup.find('input', attrs={'type': 'hidden', 'name': 'lt'})['value']  # timestamp token
        self.__POST_DATA['lt'] = lt
        self.__POST_DATA['username'] = username
        self.__POST_DATA['password'] = password
    elif flag == "captcha":
        r = session.get(BASE_URL)
        base = r.content.decode('utf-8')
        base_soup = BeautifulSoup(base, 'lxml')
        lt = base_soup.find('input', attrs={'type': 'hidden', 'name': 'lt'})['value']
        payload = collections.OrderedDict()
        payload['username'] = username
        payload['_'] = lt
        self.get_captcha(payload)
        captcha = get_input("请输入验证码: ", "captcha")  # prompt: enter the captcha
        self.__POST_DATA_WITH_CAPTCHA['captcha'] = captcha
        self.__POST_DATA_WITH_CAPTCHA['lt'] = lt
        self.__POST_DATA_WITH_CAPTCHA['username'] = username
        self.__POST_DATA_WITH_CAPTCHA['password'] = password
def get_bible_hub_verse(self, verse):
    """ Retrieves the text for a user-supplied verse selection that can be found on Bible Hub. """
    url = ("http://biblehub.com/%s/%s/%d.htm"
           % (verse.translation.lower(), verse.book.lower().replace(" ", "_"), verse.chapter))
    page = urlopen(url)
    soup = BeautifulSoup(page.read())
    verses = soup.find("div", {"class": "chap"})
    if len(verses) < 1:
        return None, None, None
    for cur_verse in verses.findAll("b"):
        cur_verse.decompose()
    text = verses.get_text()
    trans_title = soup.find("div", {"class": "vheading"}).get_text()
    verse_list = text.splitlines()
    contents = ""
    for i, val in enumerate(verse_list):
        verse_num = i + 1
        if verse.start_verse == 0:
            contents += ("[**%d**] %s " % (verse_num, val))
        else:
            if (verse_num >= verse.start_verse
                    and (verse.end_verse == 0 or verse_num <= verse.end_verse)):
                contents += ("[**%d**] %s " % (verse_num, val))
    return contents, trans_title, url
def add_post(self, pid):
    """ View a post (extract its date and tags) """
    if (self.get_post(pid)):
        return
    print('-' * 10, 'http://habrahabr.ru/post/' + str(pid), '-' * 10)
    cur = self.con.execute("select pid from %s where %s=%d" % ('post_tags', 'pid', pid))
    res = cur.fetchone()
    if res == None:
        try:
            soup = BeautifulSoup(urllib.request.urlopen('http://habrahabr.ru/post/' + str(pid)).read())
        except (urllib.request.HTTPError):
            self.add_tag(pid, "parse_error_404", "")
            print("error 404")
        else:
            published = soup.find("div", {"class": "published"})
            tags = soup.find("ul", {"class": "tags"})
            if tags:
                for tag in tags.findAll("a"):
                    self.add_tag(pid, tag.string, get_date(published.string))
            else:
                self.add_tag(pid, "parse_access_denied", "")
                print("access denied")
    else:
        print("post has already been added")
def review_info_from_url_review(url):
    # output review as a dict: review_id, review_title, review_body, review_rate, review_restaurantID
    # input is url of one review: g*+d*+r*
    html = urllib2.urlopen(url)
    soup = BeautifulSoup(html)
    review_info = {'review_body': '', 'review_title': '', 'review_id': '',
                   'review_restaurantID': '', 'review_rate': ''}
    url_review_split = url.split('-')
    review_attID_part = url_review_split[2]
    review_attrID = review_attID_part.replace('d', '')
    review_info['review_restaurantID'] = int(review_attrID.encode('utf8'))
    review_id_part = url_review_split[3]
    review_id = review_id_part.replace('r', '')
    review_info['review_id'] = int(review_id.encode('utf8'))
    title_node = soup.find('title')
    review_info['review_title'] = ((((title_node.text).encode('utf8')).split('-'))[0]).strip()
    # rate_node = soup.find_all('img', {'class': 'sprite-rating_no_fill rating_no_fill no50'})
    # review_info['review_rate'] = float((rate_node[0]['alt']).encode('utf8'))
    try:
        rate_node = soup.find('img', {'property': 'v:rating'})
        review_rate = float(rate_node['content'])
        review_info['review_rate'] = review_rate
    except:
        review_info['review_rate'] = 0
    try:
        review_node = soup.find('p', id=True, property=True)
        review_info['review_body'] = (review_node.text).encode('utf8')
    except AttributeError:
        review_info['review_body'] = 'NA'
    return review_info
def get_image_info(client, my_massage):
    """ Parse HTML page and extract featured image name and link """
    # Get Javascript updated HTML page
    response = client.commands.getPageText()
    # print 'type(response): ', type(response)
    # print 'response: ', response
    ## fff = open('te.txt', 'w+')
    ## fff.write(str(response))
    ## fff.close()
    ## assert response['status']
    ## assert response['result']
    # Create soup from HTML page and get desired information
    # soup = BeautifulSoup(response['result'], markupMassage=my_massage)
    # soup = BeautifulSoup(response['result'], markup="")
    soup = BeautifulSoup(response['result'])
    name = soup.find(id='caption_region').h3.string
    link = urlparse.urljoin('http://www.nasa.gov',
                            soup.find(attrs='Full_Size')['href'])
    print name
    print link
    image_info = {'name': name, 'link': link}
    return image_info
def parse_module_detail_page(self, url):
    """ Parse a module detail page and return its extracted properties. """
    try:
        r = requests.get(url)
        soup = BeautifulSoup(r.content)

        def get_course(row):
            return re.match('(.*)\(', row.find('strong').text).group(1).strip()

        table = soup.find('tbody', id=re.compile('^modul'))
        course_table = soup.find('tbody', id=re.compile('^kategorieZuordnungen'))
        course_rows = course_table.find_all('div', {'class': 'katZuordnung'})
        courses = {get_course(row) for row in course_rows}
        ects_points_row = table.find('tr', id=re.compile('^Kreditpunkte'))
        objectives_row = table.find('tr', id=re.compile('^Lernziele'))
        lecturer_row = table.find('tr', id=re.compile('^dozent'))
        return {
            'ects_points': int(ects_points_row.find_all('td')[1].text),
            'objectives': objectives_row.find_all('td')[1].string,
            'lecturer': lecturer_row.find_all('td')[1].text,
            # Skip all courses that are only specialisations and not real courses
            'courses': {c for c in courses if c not in course_specialisations},
        }
    except KeyboardInterrupt:
        self.stderr.write('Abort.')
        sys.exit(1)
    except:
        self.stderr.write("Could not parse {0}: {1}".format(url, sys.exc_info()[0]))
def main():
    venues_file_name = 'venues%s.csv' % datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    venue_team_file_name = 'venue_team%s.csv' % datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    with open(venues_file_name, 'wb') as f, open(venue_team_file_name, 'wb') as f2:
        writer = csv.writer(f)
        writer.writerow(['venue_id', 'name', 'country', 'address', 'zipcode', 'city', 'fax', 'email',
                         'website', 'phone', 'openend', 'architect', 'capacity', 'surface', 'facts'])
        writer2 = csv.writer(f2)
        writer2.writerow(['venue_id', 'team_id'])
        for x in range(0, 20000):
            print x
            if requests.head(BASE_URL % x).status_code == requests.codes.ok:
                r = requests.get(BASE_URL % x)
                soup = BS(r.text, 'html.parser')
                venue_data = []
                venue_data.append(x)  # venue_id
                name_node = soup.select('#subheading > h1')
                venue_data.append(name_node[0].text.encode('utf-8') if name_node else '')
                venue_data.append(get_detail('Founded', soup))
                venue_data.append(get_detail('Address', soup))
                venue_data.append(get_detail('Country', soup))
                venue_data.append(get_detail('Phone', soup))
                venue_data.append(get_detail('Fax', soup))
                venue_data.append(get_detail('E-mail', soup))
                website_node = soup.find('a', text='Official website')
                venue_data.append(website_node['href'].encode('utf-8') if website_node else '')
                venue_data.append(soup.find('div', 'logo').img['src'].encode('utf-8'))
                venue_data.append(get_venue_id(x))
                print name_node[0].text
                writer.writerow(venue_data)
async def _timein_country(self, country_code):
    """Get time using country code

    country_code is a 2 letter country code from this list
    https://timezonedb.com/country-codes or a custom shortcut code

    Preset shortcuts:
    UK  - United Kingdom (converts to GB)
    USE - United States East (New York)
    USW - United States West (Los Angeles)
    """
    apiKey = self.settings['api_key']
    if ".com" in apiKey:
        await self.bot.say("You have to set your API key, see data/timein/settings.json for details")
        return
    url = 'http://api.timezonedb.com/v2/list-time-zone?key=' + apiKey + '&format=xml'
    flag = ':flag_'
    if country_code.lower() == 'use':
        url += '&country=US&zone=*New_York*'
        flag += 'us: EAST '
    elif country_code.lower() == 'usw':
        url += '&country=US&zone=*Los_Angeles*'
        flag += 'us: WEST '
    elif country_code.lower() == 'test':
        url += '&zone=*auckland*'
        flag += 'nz: '
    elif len(country_code) != 2 or ' ' in country_code:
        await self.bot.say("Country code must be 2 letters and from this list https://timezonedb.com/country-codes")
        return
    else:
        if country_code == 'UK' or country_code == 'uk':
            country_code = 'GB'
        url += '&country=' + country_code
        flag += country_code.lower() + ': '
    async with aiohttp.get(url) as response:
        soupObject = BeautifulSoup(await response.text(), "html.parser")
    message = ''
    status = soupObject.find('status').get_text()
    if status != 'OK':
        message += 'Request failed. Details:\n```'
        message += status + '\n'
        message += soupObject.find('message').get_text()
        message += '```\nMake sure country code is from the list at https://timezonedb.com/country-codes'
    else:
        zones = soupObject.find_all('zone')
        for zone in zones:
            newmessage = ''
            newmessage += flag
            newmessage += zone.find('countryname').get_text() + '\n'
            newmessage += zone.find('zonename').get_text() + '\n'
            unixtime = zone.find('timestamp').get_text()
            prettyTime = datetime.datetime.fromtimestamp(int(unixtime)).strftime('%Y-%m-%d %H:%M:%S')
            newmessage += prettyTime + '\n'
            message += newmessage + '\n'
    await self.bot.say(message)
def http(url):
    html = requests.get(url).text
    soup_main = BeautifulSoup(html)
    # the daily quote text from "ONE"
    div = soup_main.find_all("div", {"class": "fp-one-cita"})
    text = div[0].a.text
    # print(text)
    # the image URL from "ONE"
    img_list = soup_main.find_all("img", {"class": "fp-one-imagen"})
    imgUrl = img_list[0].get('src')
    # print(imgUrl)
    # the issue title from "ONE"
    title_list = soup_main.find_all("p", {"class": "titulo"})
    title = str(title_list[0].text)
    print(title)
    # the article page of this "ONE" issue, e.g. vol.1132#articulo
    url_stroy = 'http://wufazhuce.com/one/' + title + '#articulo'
    soup_stroy = BeautifulSoup(requests.get(url_stroy).text)
    stroy_content = str(soup_stroy.find("div", {"class": "articulo-contenido"}))
    stroy_title = str(soup_stroy.find("h2", {"class": "articulo-titulo"}))
    stroy = stroy_title + stroy_content
    for addr in to_addr:
        sendEmail(text, imgUrl, title, stroy, addr)
def submission(csv_filename, compress):
    global competition_name
    filename = csv_filename
    if compress == True:
        # zip the csv before uploading
        filename = csv_filename + ".zip"
        with ZipFile(filename, 'w') as myzip:
            myzip.write(csv_filename)
    r_pre = session.get('https://www.kaggle.com/c/%s/submissions/attach' % competition_name)
    soup = BeautifulSoup(r_pre.content, 'html.parser')
    token = soup.find('input', {'name': '__RequestVerificationToken'})['value']
    competition_id = soup.find('input', {'name': 'CompetitionId'})['value']
    payload = {
        'CompetitionId': competition_id,
        '__RequestVerificationToken': token,
        'IsScriptVersionSubmission': 'False',
        'SubmissionDescription': 'This-is-description!'
    }
    files = {
        'SubmissionUpload': open(filename, 'rb')
    }
    r = session.post('https://www.kaggle.com/competitions/submissions/accept',
                     data=payload, files=files)
    if r.status_code == 200:
        print("Submission Succeed")
        return True
    print("Submission Failed")
    return False
def getallview():
    nums = 27  # number of people I follow
    followees_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36',
        'Referer': 'https://www.zhihu.com/people/GitSmile/followees',
        'Origin': 'https://www.zhihu.com',
        'Accept-Encoding': 'gzip, deflate, br',
        'CG - Sid': '57226ad5 - 793b - 4a9d - 9791 - 2a9a17e682ef',
        'Accept': '* / *'
    }
    count = 0
    for index in range(0, nums):
        fo_url = 'https://www.zhihu.com/node/ProfileFolloweesListV2'
        m_data = {
            'method': 'next',
            'params': '{"offset":' + str(index) + ',"order_by":"created","hash_id":"de2cb64bc1afe59cf8a6e456ee5eaebc"}',
            '_xsrf': str(getxsrf())
        }
        result = session.post(fo_url, data=m_data, headers=followees_headers)
        dic = json.loads(result.content.decode('utf-8'))
        li = dic['msg'][0]
        mysoup = BeautifulSoup(li, 'html.parser')
        for result in mysoup.findAll('a', attrs={'class': 'zm-item-link-avatar'}):
            print(index + 1)
            print(result.get('title'))
            href = str(result.get('href'))
            print(mysoup.find('a', attrs={'href': href + '/followers'}).text)
            print(mysoup.find('a', attrs={'href': href + '/asks'}).text)
            print(mysoup.find('a', attrs={'href': href + '/answers'}).text)
            print(mysoup.find('a', attrs={'href': href, 'class': 'zg-link-gray-normal'}).text + '\n')
            count += 1
    print('一共关注了 %d人' % count)  # total followee count
roic = x.get_ROIC()
mcap = x.get_marketCap_B()
cashflow = (x.get_totalCashFromOperatingActivities())
priceSales = x.get_pricetoSales()
s = {ticker: {'market cap': mcap, 'value': price[0], 'price': stickerPrice, 'current EPS': cEPS,
              'book value anual': bv, 'book value growth': growth, 'book value quater': bvq,
              'book value quater growth': qgrowth[1], 'roic': roic, 'cashflow': cashflow,
              'priceSales': priceSales}}
# https://finance.yahoo.com/quote/DDD/profile?p=DDD
url = 'https://finance.yahoo.com/quote/' + ticker + '/profile?p=' + ticker
data = urllib2.urlopen(url)
soup = BeautifulSoup(data, features='lxml')
summarys = soup.find('p', class_={'Mt(15px) Lh(1.6)'})
summary = summarys.get_text()
s[ticker]['business summary'] = summary
url = 'https://finance.yahoo.com/quote/' + ticker + '?p=' + ticker + '.tsrc=fin-srch'
data = urllib2.urlopen(url)
time.sleep(0.5)
soup = BeautifulSoup(data, features='lxml')
divs = soup.find('div', attrs={'id': 'quoteNewsStream-0-Stream-Proxy'})
div = divs.find('div', attrs={'id': 'quoteNewsStream-0-Stream'})
ul = div.find('ul')
lis = ul.findAll('li')
hls = []
count = 0
news = {'news': {}}
for li in lis:
# for i in range(1, 10):
#     url = f'http://www.oaontc.ru/services/registers/lnk/?&page={i}'
#     print(url)

url = "http://www.oaontc.ru/services/registers/lnk/"
req = requests.get(url, headers=headers)
src = req.text
print(src)

with open('index.html', 'w') as file:  # save the page to a file
    file.write(src)

with open('index.html') as file:  # read the saved page back
    src = file.read()

soup = BeautifulSoup(src, 'lxml')
all_hrefs = soup.find(class_="textpage docs").find_all('a')
print(all_hrefs)

all_dict = {}
for item in all_hrefs:
    item_text = item.text
    item_href = 'http://www.oaontc.ru' + item.get('href')
    all_dict[item_text] = item_href

with open('all_dict.json', 'w') as file:
    json.dump(all_dict, file, indent=4, ensure_ascii=False)

with open('all_dict.json') as file:
    all_categories = json.load(file)
print(all_categories)
from typing import List

from bs4 import BeautifulSoup


def get_links(soup: BeautifulSoup) -> List[str]:
    list_href = []
    articles = soup.find('div', class_='view-content').find_all('a')
    for article in articles:
        list_href.append('https://www.ffa-assurance.fr' + article.get('href'))
    return list_href
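# A minimal usage sketch for get_links (assumptions: `requests` is installed and the
# listing URL below is hypothetical, not taken from the original snippet; the page must
# expose a div.view-content block of <a> links as the function above expects).
import requests

if __name__ == '__main__':
    response = requests.get('https://www.ffa-assurance.fr/actualites')  # hypothetical listing URL
    listing_soup = BeautifulSoup(response.text, 'html.parser')
    for href in get_links(listing_soup):
        print(href)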
from youtube_dl import YoutubeDL
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pyexcel

url = "https://www.apple.com/itunes/charts/songs/"
conn = urlopen(url)
raw_data = conn.read()
html_page = raw_data.decode('utf-8')

# cache the raw page locally
f_conn = open('itunecharts.html', 'wb')
f_conn.write(raw_data)
f_conn.close()

soup = BeautifulSoup(html_page, "html.parser")
section = soup.find('section', 'section chart-grid')
li_list = section.find_all('li')

rank_list = []
for li in li_list:
    rank = li.strong.string
    song = li.h3.a.string
    artist = li.h4.a.string
    ranking = {
        "Rank": rank,
        "Song": song,
        "Artist": artist,
    }
    rank_list.append(ranking)

print(rank_list)
pyexcel.save_as(records=rank_list, dest_file_name='itune_rank.xlsx')
soup2 = BeautifulSoup(page2, "html.parser")
for table in soup2.findAll("table")[0:10]:  # 2014-2005
    tbody = table.find("tbody")
    # object that tells me which team, what year,
    # how many players were picked that year
    i = 0  # which column in the row is selected
    playerList = []
    for row in tbody.findAll("td"):
        i += 1
        if (i % 5 == 1):
            player = {}
            title = table.find("tr", {'class': 'thd1'})
            player['year'] = title.find("td").contents[0][:4]
            player['team'] = soup2.find('option', selected=True).getText()
            player['teamAbbr'] = teamExtraInfos[player['team']]["abbr"]
            player['division'] = teamExtraInfos[player['team']]["division"]
            player['status'] = 'N/A'
            draftsJson.append(player)
        switcher = {
            1: roundNum,
            2: selNum,
            3: name,
            4: position,
            0: school
        }
        switcher[i % 5](player, row)

for urlR in urlListRoster:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

html = urlopen("http://en.wikipedia.org/wiki/Kevin_Bacon")
bsObj = BeautifulSoup(html, "html.parser")
# for link in bsObj.findAll("a"):
for link in bsObj.find("div", {"id": "bodyContent"}).findAll("a", href=re.compile("^(/wiki/)((?!:).)*$")):
    if 'href' in link.attrs:
        print(link.attrs['href'])
from bs4 import BeautifulSoup
import urllib2
import urllib
import os
import getxml

# create connection with main page
url = 'http://www.heart.org/HEARTORG/General/State-by-State-NIH-Allocations_UCM_440585_Article.jsp'
page = urllib2.urlopen(url)
soup = BeautifulSoup(page.read(), 'lxml')

# get all the links to states
div = soup.find('div', class_='content')
table = div.find('table', width=400)
for row in table.findChildren('tr'):
    for cell in row.findChildren('td'):
        link = cell.find('a').get('href')
        url = 'http://www.heart.org/' + link
        state = cell.text
        urllib.urlretrieve(url, 'pdf/' + state + '.pdf')
        getxml.getxml('pdf/' + state)
        print state + ' done'
scraping = True
# We use a while loop because switching pages might be necessary to collect all the
# data on one craigslist search.
while scraping:
    # Requesting the server to let us retrieve data from the site
    src = requests.get(url)
    # Getting html of the webpage
    results = src.content
    # Converting html to more readable format with BeautifulSoup
    soup = BeautifulSoup(results, 'lxml')
    # Finding all classes that match 'result-info'; this contains all the basic item info
    front_page_info = soup.find_all(class_='result-info')
    # Gets the number of total results. 120 results max per page.
    num_of_total_results = soup.find(class_='totalcount').get_text()
    # Gets the number of results on the page. Fewer than 120 results indicates the last page.
    range_to = soup.find(class_='rangeTo').get_text()
    range_from = soup.find(class_='rangeFrom').get_text()
    num_of_items = int(range_to) - int(range_from) + 1
    print('Total number of listings: ' + str(num_of_total_results))
    print('number of items on page: ' + str(num_of_items))
    url = 'https://seattle.craigslist.org/search/sss?query={}&sort=rel'.format(input_1 + '&s=' + str(range_to))
    # Using the find method to get title, price, location, product page, and date for all items
    for item in range(num_of_items):
        title = front_page_info[item].find(class_='result-title hdrlnk').get_text()
        title_list.append(title)
        price = front_page_info[item].find(class_='result-price').get_text()
        price_list.append(price)
        date = front_page_info[item].find(class_='result-date').get_text()
class BRefMatch:
    """ Generates a match information from basketball reference """

    def __init__(self, country, league, season, code, match_type):
        self.country = country
        self.league = league
        self.season = season
        self.code = code
        self.type = match_type

    def is_crawled(self):
        """ returns whether match is already crawled """
        return '{0}.json'.format(self.code) in os.listdir(
            './matches/{0}/{1}/{2}'.format(self.country, self.league, self.season))

    @timeout
    def crawl(self):
        """ generate all stats for a nba match """
        match_url = self.uri_base.format(self.code)
        headers = {'User-agent': random.choice(USER_AGENTS)}
        rv = requests.get(match_url, headers=headers)
        self.soup_ = BeautifulSoup(rv.text)
        self.match_ = defaultdict(dict)
        self._gen_teams_stats()
        self._gen_match_basic_info()
        self._gen_teams_basic_info()
        self._gen_scoring()
        self._gen_extra_info()
        self._write_match()

    def _gen_teams_stats(self):
        """ generate and add statistics related to teams and players to match dict """
        for team in ['home', 'away']:
            self.match_[team]['players'] = defaultdict(dict)
            self.match_[team]['totals'] = defaultdict(dict)
        stats_tables = self.soup_.find_all('table', {'class': 'stats_table'})
        bas_stats_tables = stats_tables[0], stats_tables[2]
        adv_stats_tables = stats_tables[1], stats_tables[3]
        self._read_table(bas_stats_tables, last_col=False)
        self._read_table(adv_stats_tables, last_col=True)
        self._gen_derived_stats()
        self.match_['home']['totals']['+/-'] = self.match_['home']['totals']['PTS'] - self.match_['away']['totals']['PTS']
        self.match_['away']['totals']['+/-'] = self.match_['away']['totals']['PTS'] - self.match_['home']['totals']['PTS']

    def _gen_match_basic_info(self):
        """ generate and add basic information related to match to match dict """
        self.match_['code'] = self.code
        self.match_['type'] = self.type
        self.match_['league'] = self.league
        self.match_['season'] = self.season
        self.match_['country'] = " ".join(
            map(lambda x: x.capitalize(), self.country.split('_')))
        loc_time = [
            el.text for el in self.soup_.find('div', {'class': 'scorebox_meta'}).find_all('div')
        ]
        if len(loc_time) >= 1:
            date = loc_time[0]
            if 'AM' in date or 'PM' in date:
                date, time = gen_date_with_mins(date)
                self.match_['date'] = str(date)
                self.match_['time'] = str(time)
            else:
                self.match_['date'] = str(gen_date(date))
        if len(loc_time) == 2:
            self.match_['stadium'] = " ".join(
                map(lambda x: x.capitalize(), loc_time[1].split(',')[0].split(' ')))

    def _gen_teams_basic_info(self):
        """ generates teams (and their players) basic information """
        teams = [
            team.find_all('a')[-1] for team in self.soup_.find(
                'div', {'scorebox'}).find_all('div', {'itemprop': 'performer'})
        ]
        away, home = [team.text for team in teams]
        away_page, home_page = [team['href'] for team in teams]
        for team, team_name, team_page in zip(['away', 'home'], [away, home], [away_page, home_page]):
            self.match_[team]['name'] = team_name
            self._team_pls_basic_info(team, team_name, team_page)

    def _team_pls_basic_info(self, team_cond, team_name, team_page):
        """ generate and add basic information related to players to match dict """
        team_info = BRefTeam(team_name, team_page)
        team_info.gen_players_info()
        pls = self.match_[team_cond]['players']
        for pl, info in pls.items():
            pl_basic_info = PlayerBasicInfo(pl, team_info)
            info.update(pl_basic_info.get())

    def _gen_scoring(self):
        """ generate and add scoring information to match dict """
        raise NotImplementedError

    def _gen_extra_info(self):
        """ generate and add attendance, duration and officials info to match dict """
        raise NotImplementedError

    def _read_table(self, table, last_col):
        """ reads given table and updates relevant stats in match dict """
        raise NotImplementedError

    def _gen_derived_stats(self):
        for team in ['home', 'away']:
            team_stats = self.match_[team]['totals']

            def add_derivated_stats_to_dict(d, type_):
                d['FG%'] = gen_derived_var(d['FG'], d['FGA'])
                d['FT%'] = gen_derived_var(d['FT'], d['FTA'])
                d['3P%'] = gen_derived_var(d['3P'], d['3PA'])
                d['eFG%'] = gen_derived_var((d['FG'] + 0.5 * d['3P']), d['FGA'])
                d['TSA'] = d['FGA'] + 0.44 * d['FTA']
                d['TS%'] = gen_derived_var(d['PTS'], 2 * d['TSA'])
                d['3PAr'] = gen_derived_var(d['3PA'], d['FGA'])
                d['FTAr'] = gen_derived_var(d['FTA'], d['FGA'])
                d['2P'] = d['FG'] - d['3P']
                d['2PA'] = d['FGA'] - d['3PA']
                d['2P%'] = gen_derived_var(d['2P'], d['2PA'])
                d['2PAr'] = gen_derived_var(d['2PA'], d['FGA'])
                d['DRB'] = d['TRB'] - d['ORB']
                d['ORBr'] = gen_derived_var(d['ORB'], d['TRB'])
                d['DRBr'] = gen_derived_var(d['DRB'], d['TRB'])
                d['AST/TOV'] = gen_derived_var(d['AST'], d['TOV'])
                d['STL/TOV'] = gen_derived_var(d['STL'], d['TOV'])
                d['FIC'] = (d['PTS'] + d['ORB'] + 0.75 * d['DRB'] + d['AST'] + d['STL'] + d['BLK']
                            - 0.75 * d['FGA'] - 0.375 * d['FTA'] - d['TOV'] - 0.5 * d['PF'])
                d['FT/FGA'] = gen_derived_var(d['FT'], d['FGA'])
                d['HOB'] = gen_derived_var(d['FG'] + d['AST'], team_stats['FG'])

            # derive players and teams stats
            for player_stats in self.match_[team]['players'].values():
                if player_stats['MP']:
                    add_derivated_stats_to_dict(player_stats, 'player')
            add_derivated_stats_to_dict(team_stats, 'team')

    def _write_match(self):
        filename = './matches/{0}/{1}/{2}/{3}.json'.format(
            self.country, self.league, self.season, self.code)
        with open(filename, 'w') as f:
            f.write(json.dumps(self.match_))
class foxExtractor(object):
    """docstring for foxExtractor"""

    def __init__(self, url, testMode):
        print("Detected FOX NOW\nProcessing....\n")
        self.loginRequired = False
        self.urlName = url
        self.debug = True
        self.testMode = testMode
        self.requestsFileName = "iDoNotExistDefinitelyOnThisComputerFolder.html"
        self.showId = ""
        self.showName = ""
        self.videoGuid = ""
        self.subtitleServer = "http://static-media.fox.com/cc/"
        self.fileExtension = [".srt", ".dfxp"]
        pass

    def getSubtitles(self):
        """
        The main function which uses helper functions to get the subtitles
        """
        self.createSoupObject()
        self.getTitle()
        if self.debug:
            print(self.title)
        self.contentID = self.getContentID1(self.urlName)  # Method-1
        try:
            self.contentID = int(self.contentID)
        except:
            print("Trying an alternative method to fetch Content ID")
            self.contentID = self.getContentID2()  # Method-2
            try:
                self.contentID = int(self.contentID)
            except:
                print("Unable to fetch the contentID.")
                self.deleteUnnecessaryfiles()
                return 0
        if self.debug:
            print(self.contentID)
        jsonString = self.getShowJson()
        if self.debug:
            print(jsonString)
        if not self.standardCheck(jsonString):
            return 0
        self.getShowDetails(jsonString)
        if self.debug:
            print(self.showId)
            print(self.showName)
            print(self.videoGuid)
        if not self.standardCheck(self.showId, self.showName, self.videoGuid):
            return 0
        CaptionList = self.getSubtitleUrl()
        if self.debug:
            print(CaptionList)
        for link in CaptionList:
            returnValue = self.downloadTranscript(link)
            if returnValue:
                break
        self.deleteUnnecessaryfiles()
        return returnValue

    def createSoupObject(self):
        requestObject = requests.get(self.urlName)
        # fileHandler = open("requests.txt", "w")
        # fileHandler.write(requestObject.text)
        # fileHandler.close()
        self.soupObject = BeautifulSoup(requestObject.text, "lxml", from_encoding="utf8")
        # soupObject1 = BeautifulSoup(requestObject.text, "lxml")
        # print(self.soupObject.original_encoding)
        fh = open(self.requestsFileName, "w")
        fh.write(str(self.soupObject))
        fh.close()
        pass

    def getContentID1(self, url):
        """This is one of the methodologies to get the content ID.
        If this fails the alternative method will be called.

        The URL follows a specific standard throughout.
        http://www.fox.com/watch/684171331973/7684520448
        We need to split and return "684171331973"
        """
        contentId = ''
        try:
            searchStringList = ["watch/"]
            junkData, episodeName, IDContainer = url.partition(searchStringList[0])
            contentId, Slash, Junk = IDContainer.partition("/")
        except:
            pass
        return contentId

    def getContentID2(self):
        """
        This is the alternative method to obtain the contentID.
        <meta content="http://www.fox.com/watch/681382467805/7683748608" property="og:url"/>
        Obtained from the SOUP.
        """
        contentId = ''
        try:
            UrlObj = self.soupObject.find("meta", attrs={"property": "og:url"})
            Url = UrlObj['content']
            contentId = self.getContentID1(Url)
        except:
            pass
        return contentId
        pass

    def getShowJson(self):
        """
        The required script content looks like this-
        jQuery.extend(Drupal.settings, {"":...............});
        1) We add everything to a new string after encountering the first "{".
        2) Remove the last parentheses and the semi-colon to create a valid JSON. ---- ');'
        """
        scripts = self.soupObject.findAll("script")
        rawScript = ""
        for strs in scripts:
            if strs.string is not None:
                if "showid" in strs.string:
                    rawScript = strs.string
        addState = False
        jsonString = ''
        for i in rawScript:
            if i == "{" and addState is False:
                addState = True
            if addState is True:
                jsonString += i
        jsonString = jsonString.replace(");", "")
        return jsonString
        pass

    def getShowDetails(self, jsonString):
        """
        The json content looks like this -
        {"foxProfileContinueWatching":{"showid":"empire","showname":"Empire"},..............
        "foxAdobePassProvider": {......,"videoGUID":"2AYB18"}}
        """
        try:
            IndexingParameters = [
                ["foxProfileContinueWatching", "showid", "showname"],
                ["foxAdobePassProvider", "videoGUID"],
            ]
            parsedJsonObject = json.loads(jsonString)
            self.showId = parsedJsonObject[IndexingParameters[0][0]][IndexingParameters[0][1]]
            self.showName = parsedJsonObject[IndexingParameters[0][0]][IndexingParameters[0][2]]
            self.videoGuid = parsedJsonObject[IndexingParameters[1][0]][IndexingParameters[1][1]]
        except:
            print("Unable to parse Json. Please report.")
            pass
        pass

    def getSubtitleUrl(self):
        """
        Sample Subtitle Link -
        http://static-media.fox.com/cc/sleepy-hollow/SleepyHollow_3AWL18_660599363942.srt
        http://static-media.fox.com/cc/sleepy-hollow/SleepyHollow_3AWL18_660599363942.dfxp

        The standard followed is -
        http://static-media.fox.com/cc/[showid]/showname_videoGUID_contentID.srt
        OR
        http://static-media.fox.com/cc/[showid]/showname_videoGUID_contentID.dfxp

        Some Subtitle URL's follow this standard -
        http://static-media.fox.com/cc/[showid]/showname_videoGUID.dfxp
        http://static-media.fox.com/cc/[showid]/showname_videoGUID.srt

        So we store both URL's and check both
        """
        SubsUrl = self.subtitleServer
        SecondarySubsUrl = ''
        self.showName = self.processShowName(self.showName)
        SubsUrl += str(self.showId)
        SubsUrl += "/"
        SubsUrl += str(self.showName)
        SubsUrl += "_"
        SubsUrl += str(self.videoGuid)
        SecondarySubsUrl = SubsUrl
        SubsUrl += "_"
        SubsUrl += str(self.contentID)
        SubsUrl += self.fileExtension[0]
        SecondarySubsUrl += self.fileExtension[0]
        return [SubsUrl, SecondarySubsUrl]

    def processShowName(self, name):
        """
        Removes white spaces
        """
        name = name.replace(" ", "")
        return name

    def downloadTranscript(self, SubsLink):
        """
        This function fetches the captions and writes them into a file in VTT format
        """
        try:
            subRequestObject = requests.get(SubsLink)
            subRequestObject.encoding = 'utf-8'
            # print(subRequestObject.text)
            if subRequestObject.status_code >= 400:
                # Deliberate error to exit.
                s = int("deliberateError")
            subsFileHandler = open(self.title + self.fileExtension[0], "w")
            print("Creating ~ '%s%s' ..." % (self.title, self.fileExtension[0]))
            subsFileHandler.write(subRequestObject.text)
            subsFileHandler.close()
            return 1
        except:
            return 0
        pass

    def getTitle(self):
        """
        This function returns the title of the video. This is also used for naming the file.
        <title>Watch New Girl Online: Episode 21, Season 5 on FOX</title> --> Extracting the value from here
        """
        # print(self.soupObject.title.string)
        try:
            self.title = self.soupObject.title.string.strip()
            if not self.title:
                s = int("deliberateError")
        except:
            self.title = "DownloadedFOXNowSubtitles"
        pass

    def deleteUnnecessaryfiles(self):
        if not self.debug:
            try:
                os.remove(self.requestsFileName)
            except:
                pass

    def standardCheck(self, *variablesToCheck):
        for variables in variablesToCheck:
            if not variables:
                print("Unable to fetch the subtitles.")
                self.deleteUnnecessaryfiles()
                return 0
        return 1
rows = []
for page in range(1, 12):
    page_url = f'https://www.udemy.com/courses/free/?lang=en&p={page}&sort=highest-rated'
    driver.get(page_url)
    time.sleep(5)
    try:
        WebDriverWait(driver, delay).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'course-list--container--3zXPS')))
    except TimeoutException:
        print('Loading exceeds delay time')
        # break
    else:
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        course_list = soup.find('div', {'class': 'course-list--container--3zXPS'})
        courses = course_list.find_all('a', {'class': "udlite-custom-focus-visible browse-course-card--link--3KIkQ"})
        # total_res = driver.find_element(By.CLASS_NAME, "udlite-heading-md filter-panel--item-count--2JGx3")
        # print("----------------- FOUND :", total_res, " RESULTS !-----------------")
        for course in courses:
            course_url = '{}{}'.format("https://www.udemy.com", course['href'])
            course_title = course.select('div[class*="course-card--course-title"]')[0].text
            course_details = course.find_all('span', {'class': 'course-card--row--1OMjg'})
            course_len = course_details[0].text
            number_of_lectures = course_details[1].text
            difficulty = course_details[2].text
            course_rating = extract_text(course, "span", 'data-purpose', 'rating-number')
H1 = '<h1>{0}</h1>'

# Load and parse the RSS feed
d = feedparser.parse(RSS_FEED)
print d.feed.title
print d.feed.description

for idx, item in enumerate(d.entries):
    print 'Processing...' + item.title + ' : ' + item.link
    try:
        page = urllib2.urlopen(item.link)
        soup = BeautifulSoup(page)
        final = ''
        c = soup.find('td', width="455")
        if c is None:
            c = soup.find('td', width="375")
        cc = soup.findAll(cellspacing="0", width="100%")
        [comment.extract() for comment in cc]
        if c is not None:
            s = c.prettify()
            s = re.sub('<br>\s*\s<br>', '</p><p>', s)
            s = re.sub('<font(\s*|.*|\s)>|</font>|<br>|</br>|<br/>|<td(\s*|.*|\s)>|</td>|</img>', '', s)
            s = re.sub('<img(\s*|.*|\s)>\s*</p>', str.format(H1, item.title), s)
            s = re.sub('<p>\s*\s<b>', '<h3>', s)
            s = re.sub('</b>\s*\s</p>', '</h3>', s)
            s = TEMPLATE.format(item.title, s, item.link)
import requests
from bs4 import BeautifulSoup

URL = 'https://www.amazon.in/BATA-Jorah-Formal-Shoes-8-8216017/dp/B079R8JWH8/ref=sr_1_1_sspa?crid=1SMS0M4ZJAB8K&keywords=formal+shoes+for+men&qid=1573122191&sprefix=formal+sho%2Caps%2C371&sr=8-1-spons&psc=1&spLa=ZW5jcnlwdGVkUXVhbGlmaWVyPUEzVVNOWjg2S0xRQTNGJmVuY3J5cHRlZElkPUEwNTE0NjkwUVo0SThUV0MxMVZNJmVuY3J5cHRlZEFkSWQ9QTAwMjMyODMzVVBGVEw3Qko2Qk1KJndpZGdldE5hbWU9c3BfYXRmJmFjdGlvbj1jbGlja1JlZGlyZWN0JmRvTm90TG9nQ2xpY2s9dHJ1ZQ=='

headers = {
    "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'
}

page = requests.get(URL, headers=headers)
soup = BeautifulSoup(page.content, "html.parser")
title = soup.find(id="productTitle")
print(title)
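# Hedged follow-up (not in the original snippet): soup.find returns None when the element
# is missing or Amazon serves a bot-check page, so a guarded print avoids an AttributeError
# when extracting the title text.
if title is not None:
    print(title.get_text().strip())
else:
    print("productTitle not found - the page may be a captcha/robot check")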
soup = BeautifulSoup(open(filename), 'html.parser')
n = 0
c = 0
for b in soup.table():
    if (str(b.get('id')) != "None"):
        n = n + 1
        x = str(b.get('id'))
for b in soup.table():
    if (str(b.get('id')) != "None"):
        c = c + 1
        if (c == n - 1):
            x = str(b.get('id'))
id_selector = x[3:5]
print(id_selector)
rollnumber = str(soup.find(id='lblRollNo').text)
name = str(soup.find(id='lblFullName').text)
fathername = str(soup.find(id='lblFatherName').text)
marks = str(soup.find(id='ctl' + id_selector + '_ctl01_lblSemesterTotalMarksObtained').text)
cp = str(soup.find(id='ctl' + id_selector + '_ctl01_lblResultStatus').text)
cop = str(soup.find(id='ctl' + id_selector + '_lblCOP').text)
i = soup.find(id='ctl' + id_selector + '_ctl01_ctl00_grdViewSubjectMarksheet')
print(rollnumber + " \n" + name + " \n" + fathername + "\n" + marks + "\n" + cp + "\n" + cop)

subjects = [
def parseTime(time):
    return time


from toi import toi

headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:80.0) Gecko/20100101 Firefox/80.0",
}
listSite = "https://timesofindia.indiatimes.com"
baseURL = "https://timesofindia.indiatimes.com"

page = requests.get(listSite, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
listN = soup.find(class_="list9").find_all('li')

topNews = []
for site in listN:
    try:
        heading = site.find('a').text
        heading = heading.replace('\xa0', ' ').encode('utf-8')
        link = baseURL + site.a["href"]
        # news = {
        #     "href": link,
        #     "headline": heading,
        #     # "time": time
        # }
        topNews.append(toi(link))
    except:
        print("Non list item ")
def parse_list(self, response):
    soup = BeautifulSoup(response.body, "lxml")
    tr_list = soup.find('table', attrs={'class': 'table table-bordered table-striped'}).find_all('tr')
    for i in tr_list:
        yield scrapy.Request(url=response.url.split('?')[0] + i.td.next_sibling.a['href'],
                             callback=self.parse_notice)
def scrape_info():
    browser = init_browser()

    # Visit https://mars.nasa.gov/news/
    url1 = 'https://mars.nasa.gov/news/'
    browser.visit(url1)
    time.sleep(3)

    # Scrape page into Soup
    html = browser.html
    soup = BeautifulSoup(html, "html.parser")
    news_titles = soup.find('div', class_="content_title")
    news_title = news_titles.text
    print(news_title)
    time.sleep(3)
    news_ps = soup.find('div', class_="article_teaser_body")
    news_p = news_ps.text
    print(news_p)

    # Find the src for the featured image
    url2 = 'http://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url2)
    time.sleep(2)
    html2 = browser.html
    soup = BeautifulSoup(html2, 'html.parser')
    img = soup.find_all('a', class_="button fancybox")
    for a in img:
        print(a["data-fancybox-href"])
        url9 = "http://www.jpl.nasa.gov/"
        featured_image_url = url9 + a["data-fancybox-href"]

    url3 = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(url3)
    time.sleep(3)
    soup = BeautifulSoup(browser.html, 'html.parser')
    mars_weather = soup.find(class_='tweet-text').text

    url4 = 'https://space-facts.com/mars/'
    browser.visit(url4)
    time.sleep(10)
    html4 = browser.html
    soup = BeautifulSoup(html4, 'html.parser')
    marsfacts = soup.find_all('table', class_="tablepress tablepress-id-p-mars")
    marsfacts

    url5 = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url5)
    time.sleep(5)
    html5 = browser.html
    soup = BeautifulSoup(html5, 'html.parser')
    hemis_search = soup.find_all('a', class_="itemLink product-item")
    url10 = "https://astrogeology.usgs.gov"
    img_url = []
    for a in hemis_search:
        print(a['href'])
        img_url.append(a['href'])
    url11 = url10 + img_url[0]
    url12 = url10 + img_url[2]
    url13 = url10 + img_url[4]
    url14 = url10 + img_url[6]

    browser.visit(url11)
    html11 = browser.html
    time.sleep(5)
    soup = BeautifulSoup(html11, 'html.parser')
    hemis_search2 = soup.find_all('img', class_="wide-image")
    for a in hemis_search2:
        print(a['src'])
        url15 = url10 + (a['src'])
        print(url15)

    browser.visit(url12)
    html12 = browser.html
    time.sleep(5)
    soup = BeautifulSoup(html12, 'html.parser')
    hemis_search3 = soup.find_all('img', class_="wide-image")
    for a in hemis_search3:
        print(a['src'])
        url16 = url10 + (a['src'])
        print(url16)

    browser.visit(url13)
    html13 = browser.html
    time.sleep(5)
    soup = BeautifulSoup(html13, 'html.parser')
    hemis_search4 = soup.find_all('img', class_="wide-image")
    for a in hemis_search4:
        print(a['src'])
        url17 = url10 + (a['src'])
        print(url17)

    browser.visit(url14)
    html14 = browser.html
    time.sleep(5)
    soup = BeautifulSoup(html14, 'html.parser')
    hemis_search4 = soup.find_all('img', class_="wide-image")
    for a in hemis_search4:
        print(a['src'])
        url18 = url10 + (a['src'])
        print(url18)

    hemisphere_image_url = [
        {"title": "Cerberus Hemisphere", "img_url": url15},
        {"title": "Schiaparelli Hemisphere", "img_url": url16},
        {"title": "Syrtis Major Hemisphere", "img_url": url17},
        {"title": "Valles Marineris Hemisphere", "img_url": url18}
    ]

    # Store data in a dictionary
    mars_data = {
        "news_title": news_title,
        "news_p": news_p,
        "featured_image_url": featured_image_url,
        "mars_weather": mars_weather,
        "url15": url15,
        "url16": url16,
        "url17": url17,
        "url18": url18
    }

    # Close the browser after scraping
    browser.quit()

    # Return results
    return mars_data
import urllib.request
from bs4 import BeautifulSoup
import json

url = "https://old.reddit.com/r/ProgrammerHumor/"
request = urllib.request.Request(
    url,
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0'
    })
html = urllib.request.urlopen(request).read()
soup = BeautifulSoup(html, 'html.parser')

# First let's get the HTML of the table called siteTable where all the links are displayed
main_table = soup.find("div", attrs={'id': 'siteTable'})

# Now we go into main_table and get every a element in it which has a class "title"
links = main_table.find_all("a", class_="title")

# List to store a dict of the data we extracted
extracted_records = []
for link in links:
    title = link.text
    url = link['href']
    # There are better ways to check if a URL is absolute in Python. For the sake of
    # simplicity we'll just stick to the .startswith method of a string.
    # https://stackoverflow.com/questions/8357098/how-can-i-check-if-a-url-is-absolute-using-python
    if not url.startswith('http'):
        url = "https://old.reddit.com/" + url
        # You can join URLs better using the urllib.parse library of Python.
        # https://docs.python.org/3/library/urllib.parse.html#urllib.parse.urljoin
    # Let's just print it
    print("%s - %s" % (title, url))
    record = {'title': title, 'url': url}
    extracted_records.append(record)
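# The comment above points at urllib.parse.urljoin as the cleaner way to make
# relative reddit links absolute. A minimal sketch of that approach, using the
# same base URL as above; the helper name resolve_url is illustrative, not from
# the original.
from urllib.parse import urljoin


def resolve_url(base, href):
    # urljoin leaves absolute hrefs untouched and resolves relative ones against
    # the base, so no startswith('http') check is needed.
    return urljoin(base, href)


# e.g. resolve_url("https://old.reddit.com/", "/r/ProgrammerHumor/comments/abc/")
# -> "https://old.reddit.com/r/ProgrammerHumor/comments/abc/"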
def fillUnivList(ulist, html):
    soup = BeautifulSoup(html, "html.parser")
    for tr in soup.find('tbody').children:
        if isinstance(tr, bs4.element.Tag):
            tds = tr('td')
            ulist.append([tds[0].string, tds[1].string, tds[3].string])
""" import bs4 from bs4 import BeautifulSoup import requests import csv url = requests.get('https://ziare.com/economie/analiza-economica/cum-ne-pregatim-pentru-economia-post-covid-1606027').text soup = BeautifulSoup(url, 'html.parser') #csv_file = open('articole.csv', 'w', encoding ='utf-16') #csv_writer = csv.writer(csv_file) #csv_writer.writerow(['Titlu', 'Articol']) print(soup.prettify()) titlu = soup.find('h1').text print(titlu) articolul = soup.find('div', class_='descriere_main').text print(articolul) #csv_writer.writerow([titlu, articolul]) #csv_file.close() #csv.field_size_limit() with open('articole1.csv', 'a',newline='', encoding="utf-16") as csvfile: fieldnames = ['Titlu', 'Articol'] writer = csv.DictWriter(csvfile, fieldnames = fieldnames, delimiter = '\t') writer.writerow({"Titlu": titlu.replace("\n", " ").strip(), "Articol": articolul.replace("\n", " ").strip()})
import urllib.parse
import urllib.request
import pymysql
import db
from bs4 import BeautifulSoup

params = urllib.parse.urlencode({'page': 1})
url = 'https://movie.naver.com/movie/point/af/list.nhn?&%s' % params
print(url)

response = urllib.request.urlopen(url)
navigator = BeautifulSoup(response, 'html.parser')
table = navigator.find('table', class_='list_netizen')
print(table)

list_records = []
for i, r in enumerate(table.find_all('tr')):
    for j, c in enumerate(r.find_all('td')):
        if j == 0:
            record = int(c.text.strip())
        elif j == 2:
            record1 = int(c.text.strip())
        elif j == 3:
            record2 = str(c.find('a', class_='movie').text.strip())
            record3 = str(c.text).split('\n')[2]
        elif j == 4:
            record4 = str(c.find('a', class_='author').text.strip())
            record5 = str(c.text).split('****')[1]
    try:
        record_t = tuple([record, record1, record2, record3, record4, record5])
        list_records.append(record_t)
    except NameError:
        # rows with no <td> cells (e.g. the header row) leave the record
        # variables unset, so skip them
        continue
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen('http://www.pythonscraping.com/pages/page3.html')
soup = BeautifulSoup(html, 'html.parser')

gifts = soup.find('table', {'id': 'giftList'}).children
# print(len(gifts))
for gift in gifts:
    print(gift)

siblings = soup.find('table', {'id': 'giftList'}).tr.next_siblings
for sibling in siblings:
    print(sibling)
def _parse(self, html):
    soup = BeautifulSoup(html, from_encoding='utf-8')
    self.meta = soup.find_all('meta')
    try:
        seo_title = soup.find('meta', property='og:title').get('content')
    except AttributeError:
        # find returned None, None has no get method
        self.real_article = False
        return
    try:
        self.title = soup.find('div', 'art_header_title').get_text()
    except AttributeError:
        self.title = seo_title
    try:
        byline_and_date = (soup.find('span', 'art_header_footer_author')
                           .get_text())
    except AttributeError:
        self.real_article = False
        return
    byline, _, date = re.split(u'(פורסם|עדכון אחרון):', byline_and_date)
    self.byline = byline.strip()
    self.date = date.strip()

    sub_title_elt = soup.find('div', 'art_header_sub_title')
    sub_title = u'' if sub_title_elt is None else sub_title_elt.get_text()

    body_elt = soup.find('div', 'art_body')
    if body_elt is None:
        self.real_article = False
        return

    def replace_with_para(tag, para_text, dividers=True):
        para = soup.new_tag('p')
        para.string = para_text
        tag.replace_with(para)
        if dividers:
            pre_divider = soup.new_tag('p')
            pre_divider.string = NBSP
            para.insert_before(pre_divider)
            post_divider = soup.new_tag('p')
            post_divider.string = NBSP
            para.insert_after(post_divider)

    # TODO: handle these better, maybe mention the caption
    for video in body_elt.find_all('div', 'art_video'):
        replace_with_para(video, u'(סרטון)')
    for img in body_elt('div', 'citv_image'):
        replace_with_para(img, u'(תמונה)')
    for sidething in body_elt('div', 'arttvgenlink'):
        sidething.decompose()
    for ad in body_elt('div', 'CAATVcompAdvertiseTv'):
        # these are floated left
        replace_with_para(ad, u' (פרסומת) ', dividers=False)

    def body_part_to_text(part):
        if part.name == 'p':
            t = part.get_text()
            if t == NBSP:
                # p contains just nbsp => this is the paragraph division
                return u'\n\n'
            else:
                return re.sub(r'\s+', ' ', t.strip())
        if part.name.startswith('h') and part.name[1:].isdigit():
            return part.get_text().strip() + u'\n\n'
        if part.name == 'ul':
            return u'\n' + u'\n'.join(
                li.get_text().strip() for li in part('li'))

    # join with ' ' so that adjacent p tags get a space between them.
    # we'll later remove extra spaces.
    self.body = u' '.join(itertools.chain(
        [sub_title, u'\n\n'],
        itertools.ifilter(None, itertools.imap(body_part_to_text,
                                               body_elt(['p', 'ul', 'h3'])))))
    # remove double spaces created by joining paragraphs with ' '
    self.body = re.sub(r' +', ' ', self.body)
    # remove spaces adjacent to dividers, created by joining paragraphs
    # with ' '
    self.body = re.sub(r'\n\n ', '\n\n', self.body)
    self.body = re.sub(r' \n\n', '\n\n', self.body)
    # also remove double dividers (note that this is done
    # after the adjacent spaces are removed)
    self.body = re.sub(r'\n{3,}', r'\n\n', self.body)
#!/usr/bin/env python2.7
from bs4 import BeautifulSoup
import requests
import smtplib

server = smtplib.SMTP('smtp.gmail.com', 587)
server.starttls()
server.login("*****@*****.**", "D0llareur0")

finsurl = 'http://fins.az/bank'
response = requests.get(finsurl)
respData = BeautifulSoup(response.content, "html.parser")
usd = respData.find('div', {'class': 'value'}).get_text()
eur = respData.find('div', {'class': 'value'}).find_next('div', {'class': 'value'}).get_text()


def printer_func(site, dollar, euro):
    return ("%s saytindan istinad edilmishdir: " % str(site),
            "1 USD = %s AZN" % str(dollar),
            "1 EURO: %s AZN" % str(euro))


cbarurl = 'http://www.cbar.az/'
respcbar = requests.get(cbarurl)
cbrespData = BeautifulSoup(respcbar.content, "html.parser")
cbusd = cbrespData.find('span', {'class': 'item item_4'}).get_text()
cbeur = cbrespData.find('span', {'class': 'item item_4'}).find_next('span', {'class': 'item item_4'}).get_text()

faz = printer_func(str(finsurl), str(usd), str(eur))
caz = printer_func(str(cbarurl), str(cbusd), str(cbeur))

message = """From: Euro Dollar <*****@*****.**>
To: Email Author <*****@*****.**>
"""
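# The script above logs in and starts building the message headers but is cut
# off before anything is sent. A minimal sketch of finishing the send with the
# smtplib API, keeping the masked addresses above; the subject line and the use
# of faz/caz as the body are illustrative assumptions, not from the original.
sender = "*****@*****.**"
recipient = "*****@*****.**"
body = "\n".join(list(faz) + list(caz))  # printer_func returns a tuple of lines
full_message = message + "Subject: AZN exchange rates\n\n" + body
server.sendmail(sender, recipient, full_message)
server.quit()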
import requests
from bs4 import BeautifulSoup
from urllib.error import HTTPError
from urllib.error import URLError

try:
    site_url = requests.get(
        'https://www.ibge.gov.br/estatisticas/economicas/industria/9294-pesquisa-industrial-mensal-producao-fisica-brasil.html?=&t=resultados'
    ).text
except HTTPError as e:
    print(e)
except URLError as e:
    print("o Servidor não foi encontrado!")

soup = BeautifulSoup(site_url, 'lxml')
print(soup.prettify())

minha_tabela = soup.find('table', {'class': 'pvtTable'})
print(minha_tabela)

# indices_mensais = []
# links = minha

minha_tabela.find_all('div')
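# One caveat with the snippet above: requests raises its own exception types,
# so the urllib.error handlers never fire. A minimal sketch of the same fetch
# using the requests exception hierarchy; IBGE_URL is just an illustrative name
# for the same address used above.
import requests
from requests.exceptions import HTTPError, ConnectionError

IBGE_URL = ('https://www.ibge.gov.br/estatisticas/economicas/industria/'
            '9294-pesquisa-industrial-mensal-producao-fisica-brasil.html?=&t=resultados')
try:
    response = requests.get(IBGE_URL)
    response.raise_for_status()  # turn 4xx/5xx responses into HTTPError
    site_url = response.text
except HTTPError as e:
    print(e)
except ConnectionError:
    print("Server not found / connection failed")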
def getFormMethod(text, eid):
    soup = BeautifulSoup(text, "html.parser")
    return soup.find("form", id=eid).get("method")
def main():
    merchantFilePath = os.path.dirname(
        os.path.abspath(__file__)) + "/merchants.json"
    if os.path.exists(merchantFilePath):
        json_open = open(merchantFilePath, "r", encoding="utf8")
        merchants = json.load(json_open)
    else:
        merchants = {"data": [], "names": []}

    findMerchants = []
    page = 0
    while True:
        page += 1
        print("----- Page {page} -----".format(page=page))
        html = requests.get(
            "https://www.gotoeat-tochigi.jp/merchant/index.php?word=&sort=2&page={page}"
            .format(page=page))
        html.encoding = html.apparent_encoding
        soup = BeautifulSoup(html.content, "html.parser")

        lists = soup.find("ul", {
            "class": "serch_result"
        }).findChildren("li", recursive=False)
        if (len(lists) == 0):
            break

        for merchant in lists:
            merchant_name = merchant.find("p", {"class": "name"}).text
            merchant_type = merchant.find("p", {
                "class": "name"
            }).find("span").text
            merchant_name = re.sub(
                r"{merchant_type}$".format(merchant_type=merchant_type), "",
                merchant_name)

            _merchant_address = merchant.find("div", {
                "class": "add"
            }).findAll("p")[0].text
            merchant_postal_code = re.sub(r"所在地〒([0-9\-]+) (.+)", r"\1",
                                          _merchant_address)
            merchant_address = re.sub(r"所在地〒([0-9\-]+) (.+)", r"\2",
                                      _merchant_address)

            if len(merchant.find("div", {"class": "add"}).findAll("p")) >= 2:
                merchant_tel = merchant.find("div", {
                    "class": "add"
                }).findAll("p")[1].text
                merchant_tel = re.sub(r"TEL(.+)", r"\1", merchant_tel)

            print(merchant_name + " - " + merchant_address)
            findMerchants.append(merchant_name)

            if merchant_name in merchants["names"]:
                continue

            lat, lng = getLatLng(merchant_address)
            print(str(lat) + " " + str(lng))

            merchants["data"].append({
                "name": merchant_name,
                "type": merchant_type,
                "address": merchant_address,
                "postal_code": merchant_postal_code,
                "tel": merchant_tel,
                "lat": lat,
                "lng": lng
            })
            merchants["names"].append(merchant_name)

            with open(merchantFilePath, mode="w", encoding="utf8") as f:
                f.write(json.dumps(merchants, indent=4, ensure_ascii=False))

        if (soup.find("li", {"class": "next"}) == None):
            break
        else:
            time.sleep(1)

    merchants = checkRemovedMerchant(merchants, findMerchants)
    with open(merchantFilePath, mode="w", encoding="utf8") as f:
        f.write(json.dumps(merchants, indent=4, ensure_ascii=False))
from bs4 import BeautifulSoup
import requests, pprint, random, time, string

url = requests.get(
    "https://www.imdb.com/india/top-rated-indian-movies/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=8a7876cd-2844-4017-846a-2c0876945b7b&pf_rd_r=C6ZKX5N78115F6BM14Y3&pf_rd_s=right-5&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_india_tr_rhs_1"
)
soup = BeautifulSoup(url.text, 'lxml')

table = soup.find('tbody', class_='lister-list')
body = table.find_all('tr')

random_var = random.randint(1, 5)
time.sleep(random_var)

_list = []
for i in body:
    _dict = {}
    data = i.find('td', class_="titleColumn")
    no = ''
    for j in data.text:
        no += j
        if j == '.':
            break
    _dict['No'] = no.strip()
    _dict['Movie'] = data.find('a').text
    _dict['Year'] = int(data.find('span').text.strip('(').strip(')'))
    _dict['Rating'] = i.find('strong').text
    _dict['Link'] = "https://www.imdb.com" + i.find('a')['href']
    _list.append(_dict)

pprint.pprint(_list)


def scrapped_movie(mov_link):
    new_url = requests.get(mov_link).text
    soup = BeautifulSoup(new_url, 'lxml')
def ret_header(site):
    response = requests.get(site)
    soup = BS(response.content, 'html.parser')
    titles = soup.find('title')
    return titles.text
def getElementById(text, eid):
    soup = BeautifulSoup(text, "html.parser")
    result = soup.find(id=eid)
    return result