async def _traverse_gallery(
    worker_id: int,
    session: aiohttp.ClientSession,
    traverse_queue: URLQueue,
    gallery_queue: GalleryQueue,
) -> None:
    with logger.contextualize(
            worker_id=worker_id,
            task='traverse_gallery',
    ):
        while True:
            root_url = await traverse_queue.get()
            domain_url = yarl.URL(
                f'{root_url.scheme}://{root_url.parent.host}')

            logger.info(
                'Looking for downloadable video files at {root_url}',
                root_url=root_url,
            )

            soup = BS(
                await (await session.get(root_url)).text(),
                'html.parser',
                parse_only=SS('a'),
            )

            video_anchors = list(
                map(
                    lambda h: VideoPage(
                        title=h.img.get('alt').strip(),
                        url=domain_url.with_path(h.get('href')),
                    ),
                    soup.find_all(
                        'a',
                        class_='thumbnail-link',
                        href=re.compile('video'),
                    ),
                ),
            )

            logger.info(
                'Located {c} videos to download via {u}',
                c=len(video_anchors),
                u=root_url,
            )

            for va in video_anchors:
                await gallery_queue.put(va)

            next_anchor = soup.find('a', rel='next')
            if next_anchor:
                _, page_number = next_anchor['href'].replace('?', '').split('=')
                await traverse_queue.put(
                    root_url.with_query(page=int(page_number)))

            traverse_queue.task_done()
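A minimal sketch (not part of the original module) of how a worker like _traverse_gallery could be launched, assuming URLQueue and GalleryQueue are plain asyncio.Queue aliases and using a placeholder start URL:

# Hypothetical wiring for the traversal worker above; the queue types and
# start URL are assumptions, not taken from the original project.
import asyncio

import aiohttp
import yarl


async def run_crawl(start_url: str, n_workers: int = 2) -> None:
    traverse_queue: asyncio.Queue = asyncio.Queue()
    gallery_queue: asyncio.Queue = asyncio.Queue()
    await traverse_queue.put(yarl.URL(start_url))

    async with aiohttp.ClientSession() as session:
        workers = [
            asyncio.create_task(
                _traverse_gallery(i, session, traverse_queue, gallery_queue))
            for i in range(n_workers)
        ]
        # join() returns once every queued page (including pages the workers
        # enqueue themselves) has been marked done with task_done()
        await traverse_queue.join()
        for w in workers:
            w.cancel()


# asyncio.run(run_crawl('https://example.com/gallery'))  # placeholder URL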
def getArticles(url_addr: str):
    from bs4 import SoupStrainer as SS, BeautifulSoup as BS
    import urllib.request

    # parseArticle, sortListOfDicts and ID are assumed to be defined
    # elsewhere in the module
    html = urllib.request.urlopen(url_addr)
    articles = BS(html, 'html.parser', parse_only=SS('article'))

    parsed_articles = []
    for i in articles:
        parsing_res = parseArticle(i)
        if parsing_res:
            parsed_articles.append(parsing_res)

    return sortListOfDicts(parsed_articles, ID, True)
def main():
    s = requests.Session()

    # get cookies
    r = s.post(url + login, data=logindata)

    r = s.get(url + ws_api)
    main_strain = SS(id='region-main')
    main_soup = BS(r.text, 'html.parser', parse_only=main_strain)
    parse_api(main_soup)
    return main_soup
from bs4 import BeautifulSoup as BS, SoupStrainer as SS
import requests

url = "http://www.imdb.com/search/title"
# genres=action&languages=en&release_date=2015,2016&user_rating=7.5,
params = dict()
params['genres'] = 'action'
params['user_rating'] = '8.0,'
params['release_date'] = '2016-11-01,'

resp = requests.get(url, params=params)
print(resp.url)  # , resp.content
print()

ss = SS('a')
soup = BS(resp.content, 'html.parser', parse_only=ss)
resp.close()
# print(soup)
# print(soup.find_all(title="Inside Out (2015)"))  # Mission: Impossible - Rogue Nation (2015)

# fl = open('movie_titles', 'w')
for movie_title in soup.find_all('a'):
    print(movie_title.string)
# fl.close()
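A small self-contained illustration, unrelated to IMDb, of what parse_only buys you: only tags matched by the SoupStrainer make it into the tree.

# Standalone example with an inline HTML string; nothing here comes from
# the snippet above except the SoupStrainer pattern itself.
from bs4 import BeautifulSoup as BS, SoupStrainer as SS

html = '<div><a href="/a">A</a><p>ignored</p><a href="/b">B</a></div>'
only_links = SS('a')
soup = BS(html, 'html.parser', parse_only=only_links)

print([a['href'] for a in soup.find_all('a')])  # ['/a', '/b']
print(soup.find('p'))                           # None - the <p> was never parsed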
def getData(company, journal, entry, response=None):
    """Get the data. Starts from the data contained in the RSS page
    and, if necessary, parses the website for additional information"""

    url = refineUrl(company, journal, entry)

    # If the journal is edited by the RSC
    if company == 'RSC':
        """Graphical abstract present in RSS. Abstract incomplete
        and w/out html. Title w/out html"""

        title = entry.title
        date = arrow.get(entry.updated).format('YYYY-MM-DD')

        abstract = None
        graphical_abstract = None
        author = None

        soup = BS(entry.summary, "html.parser")
        r = soup("img", align="center")
        if r:
            graphical_abstract = r[0]['src']

        if response.status_code == requests.codes.ok:
            # Get the title (w/ html)
            # Strainer: get a soup with only the interesting part.
            # Don't load the complete tree in memory. Saves RAM
            strainer = SS("h2", attrs={"class": "capsule__title fixpadv--m"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            title = soup.h2

            if title is not None:
                title = title.renderContents().decode().strip()

            # Get the abstract (w/ html)
            strainer = SS("p", xmlns="http://www.rsc.org/schema/rscart38")
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.p

            if r is not None:
                abstract = r.renderContents().decode()
                if abstract == "":
                    abstract = None

            strainer = SS("meta", attrs={"name": "citation_author"})
            soup = BS(response.text, "html.parser", parse_only=strainer)

            # Here, multiple tags (results) are expected, so perform
            # the search, even if the tree contains only the result
            r = soup("meta", attrs={"name": "citation_author"})
            if r:
                author = [tag['content'] for tag in r]
                author = ", ".join(author)

    elif company == 'Wiley':
        title, date, author, abstract, graphical_abstract = parseWiley(
            entry, response)

    elif company == 'ACS':
        """Feed only contains graphical abstract"""

        title = entry.title.rstrip()
        date = arrow.get(mktime(entry.published_parsed)).format('YYYY-MM-DD')

        abstract = None

        author = entry.author
        author = entry.author.split(" and ")
        if len(author) > 1:
            author = ", ".join(author)
        else:
            author = author[0]

        graphical_abstract = None

        soup = BS(entry.summary, "html.parser")
        r = soup("img", alt="TOC Graphic")
        if r:
            graphical_abstract = r[0]['src']

        # If the dl went wrong, print an error
        if response.status_code == requests.codes.ok:
            strainer = SS("p", attrs={"class": "articleBody_abstractText"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.p
            if r is not None:
                abstract = r.renderContents().decode()

            strainer = SS("h1", attrs={"class": "articleTitle"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.h1
            if r is not None:
                title = r.renderContents().decode()

    elif company == 'Nature':
        title = entry.title
        date = entry.date

        abstract = None
        graphical_abstract = None
        author = None

        try:
            if entry.authors:
                author = []
                for element in entry.authors:
                    author.append(element['name'])
                author = ", ".join(author)
        except AttributeError:
            pass

        if entry.summary:
            abstract = BS(entry.summary, "html.parser")

            while abstract.find_all('p'):
                _ = abstract.p.extract()

            try:
                _ = abstract.img.extract()
            except AttributeError:
                pass

            abstract = abstract.renderContents().decode()

        if (response.status_code == requests.codes.ok
                or response.status_code == 401):
            strainer = SS("div",
                          attrs={"class": "article__body serif cleared"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.div
            try:
                abstract = r.text
            except AttributeError:
                pass

            strainer = SS("figure")
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.find_all("img", attrs={"class": "figure__image"})
            if r:
                # Additional verification to correctly forge the URL
                graphical_abstract = "http:" + r[0]["src"]

    elif company == 'Science':
        title = entry.title
        date = entry.date

        graphical_abstract = None

        if entry.author:
            author = entry.author
        else:
            author = None

        abstract = entry.summary
        if not abstract:
            abstract = None

    elif company == 'PNAS':
        title = entry.title
        date = entry.prism_publicationdate

        graphical_abstract = None
        author = None
        abstract = None

        if response.status_code == requests.codes.ok:
            # Get the correct title, not the one in the RSS
            strainer = SS("h1", id="article-title-1")
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.find_all("h1", id="article-title-1")
            if r:
                title = r[0].renderContents().decode()

            # Get the authors
            strainer = SS("a", attrs={"class": "name-search"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.find_all("a", attrs={"class": "name-search"})
            if r:
                author = [tag.text for tag in r]
                author = ", ".join(author)

            # Try to get the complete abstract. Sometimes it's available,
            # sometimes the article only contains an extract
            strainer = SS("div", attrs={"class": "section abstract"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            if soup.p is not None:
                abstract = soup.p.renderContents().decode()
            else:
                abstract = entry.summary

    elif company == 'Elsevier':
        title = entry.title
        date = arrow.get(mktime(entry.updated_parsed)).format('YYYY-MM-DD')

        graphical_abstract = None
        author = None

        abstract = entry.summary

        if abstract:
            try:
                author = abstract.split("Author(s): ")[1].split(
                    "<br")[0].split("<")[0]
                author = author.replace(" , ", ", ")
                author = author.replace("  ", " ")
            except IndexError:
                author = None

            soup = BS(abstract, "html.parser")

            try:
                # First type of abstract formatting
                abstract = soup("simple-para")[0].renderContents().decode()
            except IndexError:
                try:
                    # Second type of abstract formatting
                    abstract = abstract.split("<br />")[3].lstrip()
                except IndexError:
                    abstract = None

            r = soup.find_all("img")
            if r:
                graphical_abstract = r[0]['src']

        # NOTE: javascript embedded, impossible
        # if response.status_code == requests.codes.ok:
        #     url = response.url
        #     print(response.url)
        #     # Get the abstract
        #     soup = BS(response.text)
        #     # Get the correct title, not the one in the RSS
        #     r = soup.find_all("li", attrs={"class": "originalArticleName"})
        #     print(r)
        #     if r:
        #         title = r[0].renderContents().decode()

    elif company == 'Thieme':
        title = entry.title
        date = arrow.get(entry.updated).format('YYYY-MM-DD')

        abstract = None
        graphical_abstract = None
        author = None

        try:
            if entry.authors:
                author = []
                for element in entry.authors:
                    # Reverse Family name/first name
                    field = reversed(element['name'].split(', '))
                    name = " ".join(field)
                    author.append(name)
                author = ", ".join(author)
        except AttributeError:
            pass

        try:
            if entry.summary:
                abstract = entry.summary
        except AttributeError:
            pass

    elif company == 'Beilstein':
        title = entry.title
        date = arrow.get(mktime(entry.published_parsed)).format('YYYY-MM-DD')

        abstract = None
        graphical_abstract = None

        author = entry.author
        author = entry.author.split(" and ")
        if len(author) > 1:
            author = ", ".join(author)
        else:
            author = author[0]

        if entry.summary != "":
            soup = BS(entry.summary, "html.parser")
            r = soup.find_all("p")
            if r:
                abstract = r[1].renderContents().decode()

            r = soup.find_all("img")
            if r:
                # This company can change the background of the GA through
                # the url. If nothing is done, the bg is black, so turn it
                # to white. Doesn't affect images with unchangeable bg
                graphical_abstract = r[0]['src'] + '&background=FFFFFF'

    elif company == 'Nature2':
        title = entry.title
        date = entry.date
        abstract = entry.summary
        graphical_abstract = None

        try:
            author = [dic['name'] for dic in entry.authors]
            if author:
                if len(author) > 1:
                    author = ", ".join(author)
                else:
                    author = author[0]
            else:
                author = None
        except AttributeError:
            author = None

        if (response.status_code == requests.codes.ok
                or response.status_code == 401):
            strainer = SS(
                "h1",
                attrs={"class": "tighten-line-height small-space-below"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.h1
            if r is not None:
                title = r.renderContents().decode()

            strainer = SS("div", attrs={"id": "abstract-content"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.p
            if r is not None:
                abstract = r.renderContents().decode()

            strainer = SS("img")
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.find_all("img", attrs={"alt": "Figure 1"})
            if r:
                if "f1.jpg" in r[0]["src"]:
                    graphical_abstract = "http://www.nature.com" + r[0]["src"]

    elif company == 'PLOS':
        title = entry.title
        date = arrow.get(mktime(entry.published_parsed)).format('YYYY-MM-DD')

        author = None
        try:
            if entry.authors:
                author = []
                for element in entry.authors:
                    author.append(element['name'])
                author = ", ".join(author)
        except AttributeError:
            pass

        abstract = BS(entry.summary, "html.parser")

        # Clean the authors' names from the abstract
        r = abstract.find_all("p")
        if r and str(r[0]).startswith("<p>by "):
            abstract("p")[0].extract()

        try:
            abstract("img")[0].extract()
        except IndexError:
            pass

        abstract = abstract.renderContents().decode().strip()

        base = "http://journals.plos.org/plosone/article/figure/image?size=medium&id=info:doi/{}.g001"
        graphical_abstract = base.format(getDoi(company, journal, entry))

    elif company == 'Springer':
        title = entry.title
        date = arrow.get(mktime(entry.published_parsed)).format('YYYY-MM-DD')
        graphical_abstract = None
        author = None

        abstract = BS(entry.summary, "html.parser")

        try:
            _ = abstract("h3")[0].extract()
            # Remove the graphical abstract part from the abstract
            _ = abstract(
                "span", attrs={
                    "class":
                    "a-plus-plus figure category-standard float-no id-figa"
                })[0].extract()
        except IndexError:
            pass

        abstract = abstract.renderContents().decode().strip()

        if response.status_code == requests.codes.ok:
            strainer = SS("div", attrs={"class": "MediaObject"})
            soup = BS(response.text, "html.parser", parse_only=strainer)

            # For now, it's one shot: if the dl fails for the GA, there
            # won't be a retry. That's because too few articles have a GA
            r = soup.find_all("img")
            if r:
                graphical_abstract = r[0]['src']

            strainer = SS("ul", attrs={"class": "AuthorNames"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.find_all("span", attrs={"class": "AuthorName"})
            if r:
                author = [tag.text for tag in r]
                author = ", ".join(author)

            strainer = SS("h1", attrs={"class": "ArticleTitle"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.h1
            if r is not None:
                title = r.renderContents().decode()

    elif company == 'Springer_open':
        title = entry.title
        date = arrow.get(mktime(entry.published_parsed)).format('YYYY-MM-DD')
        graphical_abstract = None
        author = None

        abstract = BS(entry.summary, "html.parser")

        try:
            _ = abstract("h3")[0].extract()
            # Remove the graphical abstract part from the abstract
            _ = abstract(
                "span", attrs={
                    "class":
                    "a-plus-plus figure category-standard float-no id-figa"
                })[0].extract()
        except IndexError:
            pass

        abstract = abstract.renderContents().decode().strip()

        if response.status_code == requests.codes.ok:
            strainer = SS("div", attrs={"class": "MediaObject"})
            soup = BS(response.text, "html.parser", parse_only=strainer)

            # For now, it's one shot: if the dl fails for the GA, there
            # won't be a retry. That's because too few articles have a GA
            r = soup.find_all("img")
            if r:
                graphical_abstract = r[0]['src']

            strainer = SS("ul", attrs={"class": "u-listReset"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.find_all("span", attrs={"class": "AuthorName"})
            if r:
                author = [tag.text for tag in r]
                author = ", ".join(author)

            strainer = SS("h1", attrs={"class": "ArticleTitle"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.h1
            if r is not None:
                title = r.renderContents().decode()

    elif company == 'Taylor':
        title = entry.title
        date = arrow.get(mktime(entry.updated_parsed)).format('YYYY-MM-DD')

        graphical_abstract = None
        author = None
        abstract = None

        try:
            author = []
            for element in entry.authors:
                author.append(element['name'])
            author = ", ".join(author)
        except AttributeError:
            author = None

        if response.status_code == requests.codes.ok:
            strainer = SS("div", attrs={"class": "col-md-2-3 "})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.span
            if r is not None:
                # Remove all tags attributes
                for tag in r.findAll(True):
                    tag.attrs = None
                title = r.renderContents().decode()

            strainer = SS("div",
                          attrs={"class": "abstractSection abstractInFull"})
            soup = BS(response.text, "html.parser", parse_only=strainer)

            # Erase the title 'Abstract', useless
            if soup("p") and soup("p")[0].text == "Abstract":
                soup("p")[0].extract()

            r = soup.p
            if r is not None:
                abstract = r.renderContents().decode()

            r = soup.find_all("img")
            if r:
                base = "http://www.tandfonline.com{}"
                graphical_abstract = base.format(r[0]['src'])

    elif company == 'ChemArxiv':
        title = entry.title
        date = arrow.get(mktime(entry.published_parsed)).format('YYYY-MM-DD')

        graphical_abstract = None
        author = None
        abstract = None

        try:
            if entry.authors:
                author = []
                for element in entry.authors:
                    author.append(element['name'])
                author = ", ".join(author)
        except AttributeError:
            pass

        try:
            abstract = entry.summary
        except AttributeError:
            # I once saw a conference poster entry w/ no abstract.
            # Filter these entries if it becomes common
            pass

    elif company == 'ChemRxiv':
        title = entry.title
        date = arrow.get(mktime(entry.published_parsed)).format('YYYY-MM-DD')

        graphical_abstract = None
        author = None
        abstract = None

        try:
            abstract = entry.summary
        except AttributeError:
            # I once saw a conference poster entry w/ no abstract.
            # Filter these entries if it becomes common
            pass

        if response.status_code == requests.codes.ok:
            strainer = SS("span", attrs={"class": "authors-holder"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.find_all("a", attrs={"class": "normal-link author"})
            if r:
                author = [tag.text.strip() for tag in r]
                author = ", ".join(author)

    else:
        return None

    if title is None:
        return None

    topic_simple = forgeTopicSimple(title, abstract)

    if abstract is None or abstract == '':
        abstract = "Empty"
    if graphical_abstract is None:
        graphical_abstract = "Empty"

    if author is None or author == '':
        author = "Empty"
        author_simple = None
    else:
        # Clean author field
        author = author.replace('  ', ' ')
        author = author.replace(' ,', ',')
        author_simple = " " + fct.simpleChar(author) + " "

    return title, date, author, abstract, graphical_abstract, url, topic_simple, author_simple
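A self-contained illustration of the strainer-then-single-tag pattern that getData uses repeatedly; the HTML string here is invented for the example.

# Invented page fragment; only the strained <h1> is ever parsed,
# then renderContents() returns its inner markup as bytes.
from bs4 import BeautifulSoup as BS, SoupStrainer as SS

page = ('<html><body><h1 class="ArticleTitle">A <i>short</i> title</h1>'
        '<p>lots of other markup that never gets parsed</p></body></html>')

strainer = SS("h1", attrs={"class": "ArticleTitle"})
soup = BS(page, "html.parser", parse_only=strainer)

r = soup.h1
if r is not None:
    title = r.renderContents().decode()
    print(title)  # A <i>short</i> title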
async def _find_video_link(
    worker_id: int,
    session: aiohttp.ClientSession,
    gallery_queue: GalleryQueue,
    video_dl_queue: VideoDownloadQueue,
) -> None:
    from selenium import webdriver

    options = webdriver.FirefoxOptions()
    options.headless = True

    with logger.contextualize(
            worker_id=worker_id,
            task='find_video_dl_link',
    ):
        while True:
            # in here we will deal with selenium and firefox driver
            video_page_url = await gallery_queue.get()
            logger.info('Examining video url={u}', u=video_page_url)

            soup = BS(
                await (await session.get(video_page_url.url)).text(),
                'html.parser',
                parse_only=SS('a'),
            )
            logger.info(
                'Successfully downloaded video page {u}',
                u=video_page_url,
            )

            highest_quality = sorted(
                map(
                    operator.attrgetter('text'),
                    soup.find_all('a', class_='quality-btn'),
                ),
                key=lambda q: int(q.replace('p', '')),
            )[-1]
            logger.info(
                'Highest quality is {hq}',
                hq=highest_quality,
            )

            driver = webdriver.Firefox(options=options)
            driver.get(
                str(video_page_url.url.with_query(wersja=highest_quality)))
            await asyncio.sleep(5)
            video_src = driver.find_element_by_tag_name(
                'video').get_attribute('src')
            driver.quit()

            video_dl_url = yarl.URL(video_src)
            logger.info(
                'Obtained downloadable video URL={u}',
                u=video_dl_url,
            )

            await video_dl_queue.put(
                VideoURL(
                    title=video_page_url.title,
                    quality=highest_quality,
                    url=video_dl_url,
                    ext=video_dl_url.path.split('/')[-1].rsplit('.', 1)[-1],
                ),
            )

            gallery_queue.task_done()
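The worker above uses the Selenium 3 API; under Selenium 4.x, find_element_by_tag_name no longer exists. A rough equivalent of just the driver part, written against the newer API with a placeholder URL, would look like this:

# Sketch only; the page URL is a placeholder and the headless flag is
# passed as a Firefox argument since the options.headless setter was
# removed in recent Selenium releases.
from selenium import webdriver
from selenium.webdriver.common.by import By

options = webdriver.FirefoxOptions()
options.add_argument('-headless')

driver = webdriver.Firefox(options=options)
try:
    driver.get('https://example.com/video-page')  # placeholder URL
    video_src = driver.find_element(By.TAG_NAME, 'video').get_attribute('src')
finally:
    driver.quit()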
import youtube_dl
from bs4 import BeautifulSoup as BS, SoupStrainer as SS
import requests

course_code = 'RDQMrrM3s_cosug'  # 'PL385A53B00B8B158E'
url = 'https://www.youtube.com/watch?v=Kd57YHWqrsI&list=' + course_code
resp = requests.get(url)
print(resp.url)
html = resp.content
resp.close()

ss = SS('tr')
soup = BS(html, 'html.parser', parse_only=ss)
base_url = 'https://www.youtube.com/watch?v={0}&index={1}&list=' + course_code

fn1 = lambda tag: tag.has_attr('data-video-id') and tag.has_attr(
    'class') and "yt-uix-tile" in tag.attrs['class']

cnt = 1
for tag in soup.find_all(fn1):
    # print(tag.attrs)  # .tr.attrs
    url = base_url.format(str(tag.attrs['data-video-id']), str(cnt))
    print(url)
    cnt += 1
    youtube_dl.YoutubeDL().download([url])
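youtube_dl can also be handed the playlist URL directly, which avoids scraping the watch page for video ids; a short sketch reusing the same course_code:

# Assumes course_code is defined as above; 'ignoreerrors' lets the download
# continue past unavailable videos in the playlist.
import youtube_dl

playlist_url = 'https://www.youtube.com/playlist?list=' + course_code
youtube_dl.YoutubeDL({'ignoreerrors': True}).download([playlist_url])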
# loop through pages - 25 results per page - allow for 10 pages starting at 1
for i in range(1, 11):
    # search URL
    URL = "https://www.yell.com/ucs/UcsSearchAction.do?find=Y&keywords={0}&location={1}&pageNum={2}".format(busType, busLoc, i)

    # Define browser driver
    driver = webdriver.Chrome()

    # Collect webpage contents
    driver.get(URL)
    page_source = driver.page_source

    # Define listings part of page
    divs = SS(class_="col-sm-15 col-md-14 col-lg-15 businessCapsule--mainContent")

    # parse only the parts of the page listed above
    soup = BS(page_source, 'html.parser', parse_only=divs)

    print('Collecting data...')

    # Loop through retrieved data extracting information - try/except those that don't always have data
    for each in soup:
        busName = each.find('span', class_="businessCapsule--name").text
        busNo = each.find('span', class_="business--telephoneNumber").text
        try:
            busDesc = each.find(attrs={'itemprop': 'description'}).text
        except:
            busDesc = ''
        try:
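A sketch of the same loop reusing a single Chrome instance instead of launching a new one per page; busType and busLoc are assumed to be defined earlier, as in the original snippet.

# Not the original code: one driver for all ten pages, closed in finally.
from selenium import webdriver
from bs4 import BeautifulSoup as BS, SoupStrainer as SS

driver = webdriver.Chrome()
try:
    for i in range(1, 11):
        URL = ("https://www.yell.com/ucs/UcsSearchAction.do"
               "?find=Y&keywords={0}&location={1}&pageNum={2}").format(busType, busLoc, i)
        driver.get(URL)

        divs = SS(class_="col-sm-15 col-md-14 col-lg-15 businessCapsule--mainContent")
        soup = BS(driver.page_source, 'html.parser', parse_only=divs)
        # ... extract busName / busNo / busDesc exactly as in the loop above ...
finally:
    driver.quit()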