def parse_one_page(html):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
        'Upgrade-Insecure-Requests': '1',
        'Referer': 'http://search.people.com.cn/cnpeople/news/getNewsResult.jsp',
        'Host': 'search.people.com.cn',
        'Origin': 'http://search.people.com.cn',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0'
    }
    requests.adapters.DEFAULT_RETRIES = 5
    s.keep_alive = False  # `s` is assumed to be a module-level requests.Session
    # Pass the headers defined above (they were previously unused).
    page = s.get(html, headers=headers).content
    soup = bfs(page, 'lxml')
    data = soup.select('p')
    head = soup.select('h1')
    # print(head.get_text())
    # write_to_file(head.get_text())
    for i in data:
        if i.select('a') == []:
            print(i.get_text())
def get_maoyan_ten_movie(url_name):
    """Get the links for ten movies."""
    # The headers parameter helps mimic a real browser.
    response = requests.get(url_name, headers=header)
    # Verify that the content can be fetched.
    # print(response.text)
    print(f"Status code: {response.status_code}")
    # Parse the content with BeautifulSoup.
    soup = bfs(response.text, 'html.parser')
    # Can't iterate when using a generator???
    # for divtag in soup.find_all('div', attrs={'class': 'movie-item-hover'}):
    #     for atag in divtag.find_all('a'):
    #         yield f"https://maoyan.com{atag.get('href')}"
    # Plain-loop version:
    # ten_movie_url = []
    # for divtag in soup.find_all('div', attrs={'class': 'movie-item-hover'}):
    #     for atag in divtag.find_all('a'):
    #         ten_movie_url.append(f"https://maoyan.com{atag.get('href')}")
    # List-comprehension version:
    ten_movie_url = [
        f"https://maoyan.com{atag.get('href')}"
        for divtag in soup.find_all('div', attrs={'class': 'movie-item-hover'})[0:10]
        for atag in divtag.find_all('a')
    ]
    # print(ten_movie_url)
    return ten_movie_url
def getLinks(articleUrl):
    html = urlopen("http://en.wikipedia.org" + articleUrl)
    data = bfs(html, "html.parser")
    title = data.find("h1").get_text()
    content = data.find("div", {"id": "mw-content-text"}).find("p").get_text()
    store(title, content)
    return data.find("div", {"id": "bodyContent"}).findAll("a", href=reMatch)
def getContent(html):
    global pics
    soup = bfs(html)
    # Fall back through h1..h4 for a title.
    title = soup.h1
    if title is None:
        title = soup.h2
    if title is None:
        title = soup.h3
    if title is None:
        title = soup.h4
    # Use the heading text if one was found, otherwise the string 'None'.
    title = 'None' if title is None else title.text
    for script in soup.findAll('script'):
        script.extract()
    for style in soup.findAll('style'):
        style.extract()
    soup.prettify()
    content = soup.get_text().replace('\n', '').replace('\u3000', '').replace('\xa0', '')
    pic_urls = pics.findall(html)
    return content, title, pic_urls
def get_information(url_name):
    for url_value in get_maoyan_ten_movie(url_name):
        # print(f"URL: {url_value}")
        response_info = requests.get(url_value, headers=header)
        soup_info = bfs(response_info.text, 'html.parser')
        pd_list = []
        for tag in soup_info.find_all('div', attrs={'class': 'movie-brief-container'}):
            # Movie title
            movie_name = tag.find('h1', attrs={'class': 'name'}).text
            print(f"Movie title: {movie_name}")
            pd_list.append(f"Movie title: {movie_name}\n")
            movie_tag = tag.find_all('li', attrs={'class': 'ellipsis'})
            # Movie genre
            # print(type(movie_tag[0].text))
            movie_type = " ".join(movie_tag[0].text.split("\n")).strip(" ")
            print(f'Movie genre: {movie_type}')
            pd_list.append(f'Movie genre: {movie_type}\n')
            # Release date
            movie_time = movie_tag[2].text
            print(f'Release date: {movie_time}')
            pd_list.append(f'Release date: {movie_time}\n')
        with open('./homework1.csv', 'a+', encoding='utf-8') as movieinfo:
            movieinfo.write("".join(pd_list))
def quotify(word):
    word_list = list(word.split(' '))
    if len(word_list) <= 2:
        pass
    else:
        word = word_list[-1]
    page = requests.get('https://www.brainyquote.com/search_results?q=' + word)
    soup = bfs(page.text, 'html.parser')
    try:
        word = word.lower()
        quotelist = soup.find(id='quotesList')
        quotes = quotelist.find_all(title='view quote')
        authors = quotelist.find_all(title='view author')
        choice = int(len(quotes) * (random.random())) - 1
        quote = quotes[choice].contents[0]
        author = authors[choice].contents[0]
        if len(word_list) <= 2 and (word not in quote):
            return quotify(word_list[-1])
        first = quote[:quote.lower().find(word)]
        keyword = quote[quote.lower().find(word):quote.lower().find(word) + len(word)]
        second = quote[quote.lower().find(word) + len(word):]
        result = [first, keyword, second, author]
        # print('"%s"\n-"%s"' % (result['quote'], result['author']))
    except Exception as e:
        print(str(e))
        result = 0
    return result
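A minimal usage sketch for quotify, assuming requests, random, and BeautifulSoup (imported as bfs) are available as in the function above; the search phrase is only an illustrative example.

if __name__ == '__main__':
    # quotify searches on the last word of a multi-word phrase.
    result = quotify('never give up')
    if result:
        before, keyword, after, author = result
        print('"%s%s%s" - %s' % (before, keyword, after, author))
    else:
        print('No quote found.')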
def get_activity_datetime(self, activity_id):
    """
    :param activity_id: String
    :return: datetime object.
    """
    url = "{site}/user/{profile}/activity/{activity_id}".format(
        site=self._runkeeper.site,
        profile=self._runkeeper.profile_username,
        activity_id=activity_id)
    try:
        activity_datetime_session = self.session.get(url)
    except:
        raise EndpointConnectionError
    soup = bfs(activity_datetime_session.text, "html.parser")
    form = soup.find('div', {'class': 'micro-text activitySubTitle'})
    activity_datetime = [
        date_params.split('-')[0].rstrip() for date_params in form
    ]
    activity_datetime = ''.join(activity_datetime)
    activity_datetime = datetime.strptime(activity_datetime,
                                          '%a %b %d %H:%M:%S %Z %Y')
    return activity_datetime
def getHtml(uri):
    updateTime = []
    result = []
    url = HOST + uri
    res = requests.get(url, timeout=TIMEOUT)
    soup = bfs(res.text, "lxml")
    titleRes = soup.find_all(class_="detail_title")
    btnRes = soup.find_all(class_="dict_dl_btn")
    updateRes = soup.find_all(class_="show_content")
    pageNums = soup.find_all(["span", "default"])
    maxPage = 0
    for nums in pageNums:
        if not nums.text.isdigit():
            continue
        if int(nums.text) > maxPage:
            maxPage = int(nums.text)
    for upt in updateRes:
        if upt.contents[0].find('-') < 0:
            continue
        updateTime.append(upt.contents)
    for tit, btn, upt in zip(titleRes, btnRes, updateTime):
        for tc, bc, uc in zip(tit.children, btn.children, upt):
            result.append({
                "title": tc.contents[0].strip(),
                "link": bc.attrs["href"],
                "updateTime": uc
            })
    return result
def __init__(self, url, page, signInUrl, email, password):
    self.url = url
    self.targetPage = urljoin(url, page)
    # Start session
    self.session = rq.Session()
    # Get page
    self.getRequest = self.session.get(self.targetPage)
    # Check OK response
    self.getRequest.raise_for_status()
    self.bfs = bfs(self.getRequest.text, features='html5lib')
    # Sign in
    loginUrl = urljoin(url, signInUrl)
    self.session.get(loginUrl)
    csrfToken = self.session.cookies.get_dict()['csrftoken']
    form_data = {
        'email': email,
        'password': password,
        'csrfmiddlewaretoken': csrfToken
    }
    self.session.post(loginUrl, data=form_data, headers={'Referer': loginUrl})
def get_citation_needed_count(url):
    """
    This gets the count of citations needed on a wiki page

    Args:
        url (str): Needs to be a valid url

    Returns:
        str: the number of citations needed
    """
    res = requests.get(url)
    content = res.content
    soup = bfs(content, 'html.parser')
    first_el = soup.find(id='mw-content-text')
    find_cites = first_el.find_all(class_='noprint Inline-Template Template-Fact')
    citations = len(find_cites)
    print(f'Number of citations needed are {citations}\n')
    return f'Number of citations needed are {citations}'
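A short usage sketch for get_citation_needed_count; the Wikipedia article URL is only an example, and the requests/bfs imports are assumed as above.

if __name__ == '__main__':
    example_url = 'https://en.wikipedia.org/wiki/Internet'  # example article
    get_citation_needed_count(example_url)  # prints and returns the citation-count message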
def getLinks(pageUrl):
    global page
    global reTest1
    try:
        html = urlopen(r"http://en.wikipedia.org" + pageUrl)
        soup = bfs(html, "html.parser")
    except Exception as e:
        print(e)
        return None
    try:
        print(soup.h1.get_text())  # print the page's <h1> heading
    except Exception as e:
        print(e)
    try:
        result = soup.find("div", {"id": "bodyContent"}).findAll("a", href=reTest1)
    except Exception as e:
        print(e)
        return None
    for link in result:
        if 'href' in link.attrs:
            if link.attrs['href'] not in page:
                newPage = link.attrs['href']
                print(newPage)
                page.add(newPage)
                getLinks(newPage)
    return None
def getPhotoComments(self):
    """
    :return: the photo's comment text, or None if there are no comments.
    """
    comment_str = ''
    comments_rest = self.flickr.photos.comments.getList(
        photo_id=self.photoId, format='rest')
    comments_lxml = bfs(comments_rest, 'lxml', from_encoding='utf8')
    comments = comments_lxml.find_all('comment')
    for comment in comments:
        try:
            # commentSearch = re.search(r'>.+?<', str(comment)).group(0)
            commentS = '{' + re.sub(
                r'\[.+?\]|<[^>]+>|\s|\n|&.+?;|www\..+?;', ' ',
                str(comment)) + '}'
            comment_str = comment_str + commentS
        except:
            continue
    if comment_str == '':
        return None
    else:
        return comment_str
def categoryChoice(response, url):
    soup = bfs(response.text, features="html.parser")
    catList = makeCategoryList(response, url)
    print("You can choose to scrape a single category.")
    print("If you answer 'no' to the question below, the whole site will be scraped.")
    choice = input("Do you want to select a category (yes/no): ").lower()
    if choice == "yes":
        print(50 * "-")
        print("Choose the category to scrape from the following categories:")
        print(50 * "-")
        for catNum, catName in enumerate(catList.keys()):
            print(catNum + 1, ":", catName)
        print(50 * "-")
        catChoice = input("Name of the category you want to scrape: ").capitalize()
        if catChoice in catList.keys():
            print(f"The {catChoice} category is going to be scraped!")
            return catList[catChoice]
        else:
            print("Unknown category!")
            return False
    elif choice == "no":
        print("The whole site is going to be scraped!")
        return catList["Books"]
    else:
        print("Invalid choice")
        return False
def get_citations_needed_report(url: str) -> str:
    """
    Return the paragraphs on the page that need citations.

    Args:
        url (str): Valid Wikipedia URL

    Returns:
        str: All the paragraphs that need citations.
    """
    res = requests.get(url)
    content = res.content
    soup = bfs(content, 'html.parser')
    first_el = soup.find(id='mw-content-text')
    p_tag = first_el.find_all('p')
    show_which = ''
    for p in p_tag:
        if 'citation needed' in p.text:
            show_which += p.text + '\n'
    print(show_which.strip())
    return show_which
def ipvoid():
    url = "https://www.ipvoid.com/ip-blacklist-check/"
    session = requests.Session()
    for ip in range(len(df.index)):
        current_ip = df.loc[ip, 'IP'].strip()
        try:
            pay_load = {"ip": current_ip}
            request = session.post(url, data=pay_load)
            soup = bfs(request.content, "html5lib")
            # print('checking {}'.format(df.loc[ip, 'IP']))
            if len(soup.select('span.label.label-danger')) != 0:
                result = soup.select('span.label.label-danger')[0].get_text()
            elif len(soup.select('span.label.label-warning')) != 0:
                result = soup.select('span.label.label-warning')[0].get_text()
            else:
                result = soup.select('span.label.label-success')[-1].get_text()
            df.loc[ip, 'IPVOID'] = result
        except:
            df.loc[ip, 'IPVOID'] = 'NA'
        time.sleep(10)
def parse_one_page(html):
    page = s.get(html).content
    soup = bfs(page, 'lxml')
    data = soup.select('p')
    for i in data:
        if i.select('a') == []:
            print(i.get_text())
            write_to_file(i.get_text())
def walkto(URL, find):
    r = requests.get(URL)
    data = r.text
    soup = bfs(data, "lxml")
    texts = []
    # texts
    for txts in soup.find_all(find):
        texts.append(txts.getText())
    return texts
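A brief usage sketch for walkto; the URL and tag name are placeholders, and the requests/bfs imports are assumed as above.

paragraph_texts = walkto('https://example.com', 'p')  # collect the text of every <p> tag
for text in paragraph_texts:
    print(text)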
def get_links():
    html = requests.get('http://myjorney.tistory.com/category/%EC%BD%94%EB%94%A9/PYTHON%20%EA%B8%B0%EB%B3%B8%EB%AC%B8%EB%B2%95').text
    soup = bfs(html, 'html.parser')
    data = []
    for url in soup.select('#body > ul > li:nth-child(1) > a'):
        data.append(url['href'])
    return data
def retrieve_project_urls(url, css_sel='a.list-proj-name'):
    """
    Retrieve the list of URLs for projects or proposals, based on the URL
    of the page that lists them.
    """
    r = requests.get(url)
    soup = bfs(r.text, 'lxml')
    link_tags = soup.select(css_sel)
    return [x.attrs['href'][3:] for x in link_tags]
def getUrls(originUrl, html):
    data = bfs(html, features="html5lib")
    urls = []
    for elem in data.find_all('a', href=re.compile('.+')):
        href = elem['href']
        url = validateHref(href, originUrl)
        if url and url.geturl() not in urls:
            urls.append(url)
    return urls
def bfs_process(path, file_name):
    split_file = split_file_part(path, file_name)
    content = []
    for part in split_file:
        soup = bfs(part)
        cnt = ''.join(soup.content.string)
        content.append(cnt)
    return content
def crawl_document(link):
    response = urllib2.urlopen(link)
    out = response.read()
    soup = bfs(out)
    buff = []
    paragraphs = soup.find_all("p", class_=False)
    for paragraph in paragraphs:
        content = str(paragraph).strip("<p>").strip("</p>")
        buff.append(content)
    text = "".join(buff)
    print(text)
def scraper_securite_routiere(urlPage):
    hdr = {'User-Agent': 'Mozilla/5.0'}
    req = urllib2.Request(urlPage, headers=hdr)
    page1 = urllib2.urlopen(req)
    # Use BeautifulSoup to parse the page
    soup = bfs(page1, 'html.parser')
    indicateurs = soup.find('div', attrs={'class': 'cadre_avec_fleches'})
    for i, link in enumerate(indicateurs.findAll('a')):
        downloadUrl = homepageUrl + link.get('href')
        if downloadUrl.endswith('.pdf'):
            # print(downloadUrl)
            lesUrls.append(downloadUrl)
            filenames.append(indicateurs.select('a')[i].attrs['href'])
        else:
            # Scrape the second page
            req2 = urllib2.Request(downloadUrl, headers=hdr)
            page2 = urllib2.urlopen(req2)
            soup2 = bfs(page2, 'html.parser')
            # print(soup2)
            for cpt, lien in enumerate(soup2.find_all('a')):
                if lien.get('href').endswith('pdf'):
                    downloadlink = homepageUrl + lien.get('href')
                    lesUrls.append(downloadlink)
                    filenames.append(downloadlink)
    names_urls = zip(filenames, lesUrls)
    for name, url in names_urls:
        url = urllib2.quote(url.encode('utf8'), ':/')
        rq = urllib2.Request(url, headers=hdr)
        res = urllib2.urlopen(rq)
        # rfind returns the last occurrence of a character in a string
        pdf = open(name[name.rfind('/') + 1:], 'wb')
        pdf.write(res.read())
        pdf.close()
def parse_youtube():
    url = 'https://www.youtube.com/results?search_query=개발자'  # search query: "developer"
    html = req.get(url).text
    soup = bfs(html, 'html.parser')
    data = {}
    for tag in soup.select('li > div > div > div.yt-lockup-content > h3 > a'):
        data[tag.text] = 'https://www.youtube.com/' + tag['href']
    return data
def map_image(detail_url):
    # Parse the latitude and longitude from the DiningCode restaurant detail page.
    html = req.get(detail_url).text
    soup = bfs(html, 'html.parser')
    soup_lat = soup.select('#hdn_lat')  # latitude
    soup_lng = soup.select('#hdn_lng')  # longitude
    if soup_lat is not None and len(soup_lat) > 0 and soup_lng is not None and len(soup_lng) > 0:
        latitude = soup_lat[0]['value']
        longitude = soup_lng[0]['value']
        real_latitude = float(latitude)
        real_longitude = float(longitude)
        # Use the folium library to create a map HTML file with a marker on the restaurant.
        food_location = [real_latitude, real_longitude]
        map = folium.Map(location=food_location, zoom_start=25)
        folium.Marker(food_location, popup='destination').add_to(map)
        map.save('./location.html')
        map
        # Use the selenium library to screenshot the map HTML file into a static image.
        browser = webdriver.Chrome('C:/Users/yurim/Desktop/chromedriver.exe')  # chromedriver path must be set
        browser.get('C:/Users/yurim/Documents/GitHub/capstone-capyou/code/complete_code/location.html')  # path to the map HTML
        browser.save_screenshot('restaurant_location.png')
        # time.sleep(2)
        # browser.quit()  # commented out so the interactive map window is not closed;
        # the HTML window disappears automatically when another restaurant is searched or "end" is pressed,
        # and closing it manually before that raises an error.
        # Reply on Slack with the image file saved above.
        map_image_file = {
            'file': ('restaurant_location.png', open('restaurant_location.png', 'rb'), 'png')
        }
        map_image_file_detail = {
            "filename": "restaurant_location.png",
            "token": token,
            "channels": ['#general']
        }
        r = req.post("https://slack.com/api/files.upload",
                     params=map_image_file_detail,
                     files=map_image_file)
    else:
        return
def create_new_activity(self, activity_type, activity_file=None):
    activity_type = activity_type.upper()
    url = '{site}/new/activity'.format(site=self.site)
    with open(activity_file, 'r') as myfile:
        data_str = myfile.read().replace('\n', '')
    files = {
        'trackFile': (activity_file, open(activity_file, 'rb'),
                      'multipart/form-data')
    }
    try:
        new_activity_form = self.session.get(url)
    except:
        raise EndpointConnectionError
    soup = bfs(new_activity_form.text, "html.parser")
    activities_form = soup.find_all('li', {'class': 'activityTypeItem'})
    activity_types = [
        act_type.attrs['data-value'] for act_type in activities_form
    ]
    hidden_elements = self.__get_hidden_elements('new/activity')
    if not activity_types:
        raise NoActivityTypesFound
    if activity_type not in activity_types:
        raise ActivityTypeUnknown
    hidden_elements['activityType'] = activity_type
    hidden_elements.update(self.__populate_activity_gpx(activity_file))
    file_hidden_elements = {k: v for k, v in hidden_elements.items()}
    file_hidden_elements['trackFile'] = data_str
    file_hidden_elements['heartRateGraphJson'] = ''
    file_hidden_elements['route'] = ''
    file_hidden_elements['averageHeartRate'] = ''
    file_hidden_elements['hrmFile'] = ''
    file_hidden_elements['activityViewableBy'] = ''
    file_hidden_elements['calories'] = ''
    file_hidden_elements['notes'] = ''
    if activity_file.endswith('.gpx'):
        file_hidden_elements['uploadType'] = '.gpx'
    else:
        raise UnknownFileType
    try:
        if self.upload_activity(activity_file):
            new_activity_post = self.session.post(
                url, data=file_hidden_elements, files=files)
            return new_activity_post
    except Exception as e:
        raise ErrorUploadingTrack(e)
def getEndPage(uri):
    url = HOST + uri
    res = requests.get(url, timeout=TIMEOUT)
    soup = bfs(res.text, "lxml")
    pageNums = soup.find_all(["span", "default"])
    maxPage = 0
    for nums in pageNums:
        if not nums.text.isdigit():
            continue
        if int(nums.text) > maxPage:
            maxPage = int(nums.text)
    return maxPage + 1
def walkto2(URL):
    r = requests.get(URL)
    data = r.text
    soup = bfs(data, "lxml")
    doors = []
    texts = []
    # links
    for link in soup.find_all('a'):
        doors.append(link.get('href'))
    # texts
    for txts in soup.find_all('p'):
        texts.append(txts.getText())
    return doors, texts
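walkto2 returns both the link targets and the paragraph texts; a hedged usage sketch with a placeholder URL:

links, texts = walkto2('https://example.com')
print('found %d links and %d paragraphs' % (len(links), len(texts)))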
def get_url():
    arr = {}
    for n in range(59, 1000):
        soup = bfs(requests.get(url + format(n) + "/").content,
                   "html.parser").find("div", {"id": "wallwindow"}).findChildren()
        for img in soup:
            if "WP_" not in img.get('src'):
                pass
            else:
                arr[n] = img.get('src')
    for value in arr:
        print(value, arr[value])
def get_docs(address):
    tr = tar.open(address, "r:gz", encoding="latin-1")
    for member in tr.getmembers():
        tr_file = tr.extractfile(member)
        if tr_file is not None:
            content = tr_file.read()
            text = content.decode('utf-8', 'ignore')
            docs = text.split("</REUTERS>")
            for doc in docs:
                filtered = bfs(doc, features="html.parser").get_text()
                yield filtered
    return 3
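get_docs is a generator, so the documents can be consumed lazily; a sketch assuming tarfile is imported as tar and that 'reuters.tar.gz' is a local archive of Reuters SGML files (both are assumptions):

for i, doc_text in enumerate(get_docs('reuters.tar.gz')):
    print(doc_text[:80])  # preview the first 80 characters of each document
    if i >= 4:
        break  # stop after five documents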
def getting_html(site, headers):
    try:
        session = requests.Session()
        request = session.get(site, headers=headers)
        status = request.status_code
        page_html = bfs(request.text, "lxml")
        if status == 200:
            return getting_url(page_html)
    except Exception as name_error:
        print("Error!", name_error)
        print(sys.exc_info()[1])
def get_content(link):
    request_headers = {
        'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 '
                       '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'),
        'Referer': 'http://myjorney.tistory.com',  # blog home
    }
    abs_link = 'http://myjorney.tistory.com' + link
    html = req.get(abs_link, headers=request_headers).text
    soup = bfs(html, 'html.parser')
    # We could do more with the fetched data,
    # but for now just check the heading.
    print(soup.select('#head > h2 > a')[0].text)  # look at the first heading tag
def crawl_index(filename="sampleB.html"):
    soup = bfs(open(filename))
    mainContent = soup.find("div", class_="main-inner")
    dates = mainContent.find_all("p")
    links = mainContent.find_all("ul")
    for idx, date in enumerate(dates):
        # bs4 will also extract the "\n" from the text
        string = date.find("strong").string.split(" / ")[0]
        day, year = string.split(", ")
        print("start crawling for date %s year %s...." % (day, year))
        currentLinks = links[idx]
        a = currentLinks.find_all('a')
        for link in a:
            link = "http://www.cdc.gov/mmwr/preview/" + link.get("href")
            print("link %s for the %s year %s" % (link, day, year))
def getLinks(historicalIndexLink):
    try:
        response = urllib2.urlopen(historicalIndexLink)
        content = response.read()
        soup = bfs(content)
        mainContent = soup.find("div", class_="main-inner")
        links = mainContent.find_all("a")  # traverse the tree and find all <a> tags
        hrefs = [link.get("href") for link in links if link.get("href") is not None]
        hrefs = map(lambda x: "http://www.cdc.gov" + x, hrefs)
        return hrefs
    except urllib2.HTTPError as e1:
        print("please check your url for %s" % (historicalIndexLink,), e1.reason)
        return []
    except urllib2.URLError as e2:
        print("please check your link for %s" % (historicalIndexLink,), e2.reason)
        return []
def profile_username(self):
    """
    Get profile username or ID once logged in by using Session object
    :return: str
    """
    if not self.__profile_username:
        url = "{site}/home".format(site=self.site)
        try:
            home = self.session.get(url)
        except:
            raise EndpointConnectionError
        soup = bfs(home.text, "html.parser")
        profile_url = soup.find('a', {'href': re.compile('/user/[a-zA-Z]|[0-9]/profile')})
        try:
            self.__profile_username = profile_url.attrs['href'].split('/')[2]
        except IndexError:
            raise ProfileNotFound
    return self.__profile_username
def __get_hidden_elements(self, endpoint):
    """
    Retrieve all <hidden> parameters from requested form
    :return: dict
    """
    url = "{site}/{endpoint}".format(site=self.site, endpoint=endpoint)
    try:
        endpoint_form = self.session.get(url)
    except:
        raise EndpointConnectionError
    soup = bfs(endpoint_form.text, "html.parser")
    try:
        form = soup.find_all('input', {'type': 'hidden'})
    except:
        raise HiddenElementsNotFound
    hidden_elements = {element.attrs['name']: element.attrs['value'] for element in form}
    return hidden_elements
__Date__ = '11/10/2015'
"""
From "http://www.practicepython.org/":
16) Use the BeautifulSoup and requests Python packages to print out a list of
all the article titles on the New York Times homepage: http://www.nytimes.com/
"""
from bs4 import BeautifulSoup as bfs
import requests

nyurl = 'http://www.nytimes.com/'
html = requests.get(nyurl)
soup = bfs(html.text, "html.parser")
for story_heading in soup.find_all(class_="story-heading"):
    if story_heading.a:
        print(story_heading.a.text.replace("\n", " ").strip())
    else:
        print(story_heading.contents[0].strip())
def get_page_links(volumePage):
    soup = bfs(volumePage)
    mainContent = soup.find("div", class_="mSyndicate")
    links = mainContent.find("a")
    return links