Example #1
async def _traverse_gallery(
    worker_id: int,
    session: aiohttp.ClientSession,
    traverse_queue: URLQueue,
    gallery_queue: GalleryQueue,
) -> None:
    with logger.contextualize(
            worker_id=worker_id,
            task='traverse_gallery',
    ):
        while True:
            root_url = await traverse_queue.get()
            domain_url = yarl.URL(
                f'{root_url.scheme}://{root_url.parent.host}')

            logger.info(
                'Looking for downloadable video files at {root_url}',
                root_url=root_url,
            )

            soup = BS(
                await (await session.get(root_url)).text(),
                'html.parser',
                parse_only=SS('a'),
            )

            video_anchors = [
                VideoPage(
                    title=anchor.img.get('alt').strip(),
                    url=domain_url.with_path(anchor.get('href')),
                )
                for anchor in soup.find_all(
                    'a',
                    class_='thumbnail-link',
                    href=re.compile('video'),
                )
            ]
            logger.info(
                'Located {c} videos to download via {u}',
                c=len(video_anchors),
                u=root_url,
            )

            for va in video_anchors:
                await gallery_queue.put(va)

            next_anchor = soup.find('a', rel='next')
            if next_anchor:
                _, page_number = next_anchor['href'].replace('?', '').split('=')
                await traverse_queue.put(
                    root_url.with_query(page=int(page_number)))
            traverse_queue.task_done()
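
Example #1 (and Example #6 further down) pulls VideoPage, VideoURL, URLQueue, GalleryQueue and VideoDownloadQueue from elsewhere in its module, together with a loguru-style logger. A minimal sketch of what those definitions might look like; the field names and the choice of NamedTuple/asyncio.Queue here are assumptions, not the original code:

import asyncio
from typing import NamedTuple

import yarl


class VideoPage(NamedTuple):
    """One gallery thumbnail pointing at a video page (assumed shape)."""
    title: str
    url: yarl.URL


class VideoURL(NamedTuple):
    """A resolved, directly downloadable video link (assumed shape)."""
    title: str
    quality: str
    url: yarl.URL
    ext: str


# The queues are presumably plain asyncio queues typed by their payload.
URLQueue = asyncio.Queue            # holds yarl.URL instances
GalleryQueue = asyncio.Queue        # holds VideoPage instances
VideoDownloadQueue = asyncio.Queue  # holds VideoURL instances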
Example #2
def getArticles(url_addr: str):
    from bs4 import SoupStrainer as SS, BeautifulSoup as BS
    import urllib.request

    # parseArticle, sortListOfDicts and the ID sort key are defined elsewhere
    # in the module
    html = urllib.request.urlopen(url_addr)
    articles = BS(html, 'html.parser', parse_only=SS('article'))
    parsed_articles = []
    for article in articles:
        parsing_res = parseArticle(article)
        if parsing_res:
            parsed_articles.append(parsing_res)
    return sortListOfDicts(parsed_articles, ID, True)
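
Example #2 calls parseArticle, sortListOfDicts and ID from its own module, none of which are shown. A hedged guess at sortListOfDicts, assuming ID names the dictionary key to sort on; the signature and behaviour below are assumptions:

import operator

ID = 'id'  # hypothetical: the dictionary key the parsed articles are sorted by


def sortListOfDicts(items, key, reverse=False):
    # Sort a list of dicts by one key, e.g. each article's id (assumed behaviour)
    return sorted(items, key=operator.itemgetter(key), reverse=reverse)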
Example #3
def main():

    s = requests.Session()

    # Log in first so the session holds the auth cookies, then fetch the page
    r = s.post(url + login, data=logindata)
    r = s.get(url + ws_api)

    main_strain = SS(id='region-main')
    main_soup = BS(r.text, 'html.parser', parse_only=main_strain)

    parse_api(main_soup)
    return main_soup
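
Example #3 reads url, login, ws_api, logindata, parse_api and the BS/SS aliases from module scope. A minimal sketch of that assumed setup; every endpoint and credential below is a placeholder:

import requests
from bs4 import BeautifulSoup as BS, SoupStrainer as SS

# Hypothetical module-level configuration assumed by main()
url = 'https://example.org'
login = '/login/index.php'
ws_api = '/webservice/api'
logindata = {'username': 'user', 'password': 'secret'}


def parse_api(soup):
    # Placeholder: the real parse_api is defined elsewhere in the module
    print(soup.get_text(' ', strip=True)[:200])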
Example #4
from bs4 import BeautifulSoup as BS, SoupStrainer as SS
import requests

url = "http://www.imdb.com/search/title"

# genres=action&languages=en&release_date=2015,2016&user_rating=7.5,

params = dict()
params['genres'] = 'action'
params['user_rating'] = '8.0,'
params['release_date'] = '2016-11-01,'

resp = requests.get(url, params=params)

print(resp.url)  # , resp.content
print()
ss = SS('a')
soup = BS(resp.content, 'html.parser', parse_only=ss)
resp.close()

# print(soup)
# print(soup.find_all(title="Inside Out (2015)"))  # or "Mission: Impossible - Rogue Nation (2015)"

#fl = open('movie_titles','w')
for movie_title in soup.find_all('a'):
    print(movie_title.string)

#fl.close()
Example #5
def getData(company, journal, entry, response=None):
    """Get the data. Starts from the data contained in the RSS page and, if
    necessary, parses the website for additional information"""

    url = refineUrl(company, journal, entry)

    # If the journal is published by the RSC
    if company == 'RSC':
        """Graphical abstract present in RSS. Abstract incomplete
        and w/out html. Title w/out html"""

        title = entry.title
        date = arrow.get(entry.updated).format('YYYY-MM-DD')

        abstract = None
        graphical_abstract = None
        author = None

        soup = BS(entry.summary, "html.parser")

        r = soup("img", align="center")
        if r:
            graphical_abstract = r[0]['src']

        if response.status_code == requests.codes.ok:

            # Get the title (w/ html)
            # Strainer: get a soup with only the interesting part.
            # Don't load the complete tree in memory. Saves RAM
            strainer = SS("h2", attrs={"class": "capsule__title fixpadv--m"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            title = soup.h2

            if title is not None:
                title = title.renderContents().decode().strip()

            # Get the abstract (w/ html)
            strainer = SS("p", xmlns="http://www.rsc.org/schema/rscart38")
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.p

            if r is not None:
                abstract = r.renderContents().decode()
                if abstract == "":
                    abstract = None

            strainer = SS("meta", attrs={"name": "citation_author"})
            soup = BS(response.text, "html.parser", parse_only=strainer)

            # Here, multiple tags (results) are expected, so perform the
            # search even though the strained tree contains only those tags
            r = soup("meta", attrs={"name": "citation_author"})
            if r:
                author = [tag['content'] for tag in r]
                author = ", ".join(author)

    elif company == 'Wiley':

        title, date, author, abstract, graphical_abstract = parseWiley(
            entry, response)

    elif company == 'ACS':
        """Feed only contains graphical abstract"""

        title = entry.title.rstrip()
        date = arrow.get(mktime(entry.published_parsed)).format('YYYY-MM-DD')
        abstract = None

        author = entry.author.split(" and ")
        if len(author) > 1:
            author = ", ".join(author)
        else:
            author = author[0]

        graphical_abstract = None

        soup = BS(entry.summary, "html.parser")
        r = soup("img", alt="TOC Graphic")
        if r:
            graphical_abstract = r[0]['src']

        # Only parse the article page if the download went fine
        if response.status_code == requests.codes.ok:

            strainer = SS("p", attrs={"class": "articleBody_abstractText"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.p
            if r is not None:
                abstract = r.renderContents().decode()

            strainer = SS("h1", attrs={"class": "articleTitle"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.h1
            if r is not None:
                title = r.renderContents().decode()

    elif company == 'Nature':

        title = entry.title
        date = entry.date
        abstract = None
        graphical_abstract = None
        author = None

        try:
            if entry.authors:
                author = []
                for element in entry.authors:
                    author.append(element['name'])
                author = ", ".join(author)
        except AttributeError:
            pass

        if entry.summary:
            abstract = BS(entry.summary, "html.parser")

            while abstract.find_all('p'):
                _ = abstract.p.extract()

            try:
                _ = abstract.img.extract()
            except AttributeError:
                pass

            abstract = abstract.renderContents().decode()

        if (response.status_code == requests.codes.ok
                or response.status_code == 401):

            strainer = SS("div",
                          attrs={"class": "article__body serif cleared"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.div
            try:
                abstract = r.text
            except AttributeError:
                pass

            strainer = SS("figure")
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.find_all("img", attrs={"class": "figure__image"})

            if r:
                # Additional verification to correctly forge the URL
                graphical_abstract = "http:" + r[0]["src"]

    elif company == 'Science':

        title = entry.title
        date = entry.date

        graphical_abstract = None

        if entry.author:
            author = entry.author
        else:
            author = None

        abstract = entry.summary
        if not abstract:
            abstract = None

    elif company == 'PNAS':

        title = entry.title
        date = entry.prism_publicationdate

        graphical_abstract = None
        author = None

        abstract = None

        if response.status_code == requests.codes.ok:

            # Get the correct title, not the one in the RSS
            strainer = SS("h1", id="article-title-1")
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.find_all("h1", id="article-title-1")
            if r:
                title = r[0].renderContents().decode()

            # Get the authors
            strainer = SS("a", attrs={"class": "name-search"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.find_all("a", attrs={"class": "name-search"})
            if r:
                author = [tag.text for tag in r]
                author = ", ".join(author)

            # Try to get the complete abstract. Sometimes it's available,
            # sometimes the article only contains an extract
            strainer = SS("div", attrs={"class": "section abstract"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            if soup.p is not None:
                abstract = soup.p.renderContents().decode()
            else:
                abstract = entry.summary

    elif company == 'Elsevier':

        title = entry.title
        date = arrow.get(mktime(entry.updated_parsed)).format('YYYY-MM-DD')

        graphical_abstract = None
        author = None

        abstract = entry.summary

        if abstract:
            try:
                author = abstract.split("Author(s): ")[1].split(
                    "<br")[0].split("<")[0]
                author = author.replace(" , ", ", ")
                author = author.replace("  ", " ")
            except IndexError:
                author = None

            soup = BS(abstract, "html.parser")

            try:
                # First type of abstract formatting
                abstract = soup("simple-para")[0].renderContents().decode()
            except IndexError:
                try:
                    # Second type of abstract formatting
                    abstract = abstract.split("<br />")[3].lstrip()
                except IndexError:
                    abstract = None

            r = soup.find_all("img")
            if r:
                graphical_abstract = r[0]['src']

        # NOTE: the page embeds JavaScript, so it cannot be parsed here
        # if response.status_code is requests.codes.ok:
        # url = response.url
        # print(response.url)
        # # Get the abstract
        # soup = BS(response.text)

        # Get the correct title, not the one in the RSS
        # r = soup.find_all("li", attrs={"class": "originalArticleName"})
        # print(r)
        # if r:
        # title = r[0].renderContents().decode()

    elif company == 'Thieme':

        title = entry.title
        date = arrow.get(entry.updated).format('YYYY-MM-DD')

        abstract = None
        graphical_abstract = None
        author = None

        try:
            if entry.authors:
                author = []
                for element in entry.authors:
                    # Reverse Family name/first name
                    field = reversed(element['name'].split(', '))
                    name = " ".join(field)
                    author.append(name)
                author = ", ".join(author)
        except AttributeError:
            pass

        try:
            if entry.summary:
                abstract = entry.summary
        except AttributeError:
            pass

    elif company == 'Beilstein':

        title = entry.title
        date = arrow.get(mktime(entry.published_parsed)).format('YYYY-MM-DD')

        abstract = None
        graphical_abstract = None

        author = entry.author.split(" and ")
        if len(author) > 1:
            author = ", ".join(author)
        else:
            author = author[0]

        if entry.summary != "":
            soup = BS(entry.summary, "html.parser")
            r = soup.find_all("p")

            if r:
                abstract = r[1].renderContents().decode()

            r = soup.find_all("img")
            if r:
                # This publisher lets the URL control the background of the
                # graphical abstract. By default it is black, so force it to
                # white; images with a fixed background are unaffected
                graphical_abstract = r[0]['src'] + '&background=FFFFFF'

    elif company == 'Nature2':

        title = entry.title
        date = entry.date
        abstract = entry.summary
        graphical_abstract = None

        try:
            author = [dic['name'] for dic in entry.authors]
            if author:
                if len(author) > 1:
                    author = ", ".join(author)
                else:
                    author = author[0]
            else:
                author = None
        except AttributeError:
            author = None

        if (response.status_code == requests.codes.ok
                or response.status_code == 401):

            strainer = SS(
                "h1", attrs={"class": "tighten-line-height small-space-below"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.h1
            if r is not None:
                title = r.renderContents().decode()

            strainer = SS("div", attrs={"id": "abstract-content"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.p
            if r is not None:
                abstract = r.renderContents().decode()

            strainer = SS("img")
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.find_all("img", attrs={"alt": "Figure 1"})
            if r:
                if "f1.jpg" in r[0]["src"]:
                    graphical_abstract = "http://www.nature.com" + r[0]["src"]

    elif company == 'PLOS':

        title = entry.title
        date = arrow.get(mktime(entry.published_parsed)).format('YYYY-MM-DD')

        author = None

        try:
            if entry.authors:
                author = []
                for element in entry.authors:
                    author.append(element['name'])
                author = ", ".join(author)
        except AttributeError:
            pass

        abstract = BS(entry.summary, "html.parser")

        # Clean the authors' names from the abstract
        r = abstract.find_all("p")
        if r and str(r[0]).startswith("<p>by "):
            abstract("p")[0].extract()

        try:
            abstract("img")[0].extract()
        except IndexError:
            pass

        abstract = abstract.renderContents().decode().strip()

        base = "http://journals.plos.org/plosone/article/figure/image?size=medium&id=info:doi/{}.g001"
        graphical_abstract = base.format(getDoi(company, journal, entry))

    elif company == 'Springer':

        title = entry.title
        date = arrow.get(mktime(entry.published_parsed)).format('YYYY-MM-DD')
        graphical_abstract = None
        author = None

        abstract = BS(entry.summary, "html.parser")

        try:
            _ = abstract("h3")[0].extract()
            # Remove the graphical abstract part from the abstract
            _ = abstract(
                "span",
                attrs={
                    "class":
                    "a-plus-plus figure category-standard float-no id-figa"
                })[0].extract()
        except IndexError:
            pass

        abstract = abstract.renderContents().decode().strip()

        if response.status_code == requests.codes.ok:

            strainer = SS("div", attrs={"class": "MediaObject"})
            soup = BS(response.text, "html.parser", parse_only=strainer)

            # For now it's one shot: if the download of the graphical
            # abstract fails there is no retry, because too few articles
            # have one
            r = soup.find_all("img")
            if r:
                graphical_abstract = r[0]['src']

            strainer = SS("ul", attrs={"class": "AuthorNames"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.find_all("span", attrs={"class": "AuthorName"})
            if r:
                author = [tag.text for tag in r]
                author = ", ".join(author)

            strainer = SS("h1", attrs={"class": "ArticleTitle"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.h1
            if r is not None:
                title = r.renderContents().decode()

    elif company == 'Springer_open':

        title = entry.title
        date = arrow.get(mktime(entry.published_parsed)).format('YYYY-MM-DD')
        graphical_abstract = None
        author = None

        abstract = BS(entry.summary, "html.parser")

        try:
            _ = abstract("h3")[0].extract()
            # Remove the graphical abstract part from the abstract
            _ = abstract(
                "span",
                attrs={
                    "class":
                    "a-plus-plus figure category-standard float-no id-figa"
                })[0].extract()
        except IndexError:
            pass

        abstract = abstract.renderContents().decode().strip()

        if response.status_code == requests.codes.ok:

            strainer = SS("div", attrs={"class": "MediaObject"})
            soup = BS(response.text, "html.parser", parse_only=strainer)

            # For now it's one shot: if the download of the graphical
            # abstract fails there is no retry, because too few articles
            # have one
            r = soup.find_all("img")
            if r:
                graphical_abstract = r[0]['src']

            strainer = SS("ul", attrs={"class": "u-listReset"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.find_all("span", attrs={"class": "AuthorName"})
            if r:
                author = [tag.text for tag in r]
                author = ", ".join(author)

            strainer = SS("h1", attrs={"class": "ArticleTitle"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.h1
            if r is not None:
                title = r.renderContents().decode()

    elif company == 'Taylor':

        title = entry.title
        date = arrow.get(mktime(entry.updated_parsed)).format('YYYY-MM-DD')
        graphical_abstract = None
        author = None
        abstract = None

        try:
            author = []
            for element in entry.authors:
                author.append(element['name'])
            author = ", ".join(author)
        except AttributeError:
            author = None

        if response.status_code == requests.codes.ok:

            strainer = SS("div", attrs={"class": "col-md-2-3 "})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.span
            if r is not None:
                # Remove all tag attributes
                for tag in r.findAll(True):
                    tag.attrs = None
                title = r.renderContents().decode()

            strainer = SS("div",
                          attrs={"class": "abstractSection abstractInFull"})
            soup = BS(response.text, "html.parser", parse_only=strainer)

            # Remove the useless 'Abstract' heading
            if soup("p") and soup("p")[0].text == "Abstract":
                soup("p")[0].extract()

            r = soup.p
            if r is not None:
                abstract = r.renderContents().decode()

            r = soup.find_all("img")
            if r:
                base = "http://www.tandfonline.com{}"
                graphical_abstract = base.format(r[0]['src'])

    elif company == 'ChemArxiv':

        title = entry.title
        date = arrow.get(mktime(entry.published_parsed)).format('YYYY-MM-DD')
        graphical_abstract = None
        author = None
        abstract = None

        try:
            if entry.authors:
                author = []
                for element in entry.authors:
                    author.append(element['name'])
                author = ", ".join(author)
        except AttributeError:
            pass

        try:
            abstract = entry.summary
        except AttributeError:
            # Once saw a conference poster with no abstract.
            # Filter these entries out if that becomes common
            pass

    elif company == 'ChemRxiv':

        title = entry.title
        date = arrow.get(mktime(entry.published_parsed)).format('YYYY-MM-DD')
        graphical_abstract = None
        author = None
        abstract = None

        try:
            abstract = entry.summary
        except AttributeError:
            # Once saw a conference poster with no abstract.
            # Filter these entries out if that becomes common
            pass

        if response.status_code == requests.codes.ok:

            strainer = SS("span", attrs={"class": "authors-holder"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.find_all("a", attrs={"class": "normal-link author"})
            if r:
                author = [tag.text.strip() for tag in r]
                author = ", ".join(author)

    else:
        return None

    if title is None:
        return None

    topic_simple = forgeTopicSimple(title, abstract)

    if abstract is None or abstract == '':
        abstract = "Empty"
    if graphical_abstract is None:
        graphical_abstract = "Empty"

    if author is None or author == '':
        author = "Empty"
        author_simple = None
    else:
        # Clean author field
        author = author.replace('  ', ' ')
        author = author.replace(' ,', ',')
        author_simple = " " + fct.simpleChar(author) + " "

    return title, date, author, abstract, graphical_abstract, url, topic_simple, author_simple
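
Example #5 repeats one pattern per publisher: build a SoupStrainer for the single tag of interest, re-parse response.text with parse_only, then decode the first match. A small helper in that spirit; the name and signature are mine, not part of the original:

from bs4 import BeautifulSoup as BS, SoupStrainer as SS


def first_tag_contents(html, name, **attrs):
    # Parse only the tags we care about, then return the inner HTML of the
    # first match, or None if the page does not contain it.
    soup = BS(html, 'html.parser', parse_only=SS(name, attrs=attrs))
    tag = soup.find(name, attrs=attrs)
    if tag is None:
        return None
    return tag.renderContents().decode().strip()


# e.g. title = first_tag_contents(response.text, 'h1', **{'class': 'articleTitle'})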
Example #6
async def _find_video_link(
    worker_id: int,
    session: aiohttp.ClientSession,
    gallery_queue: GalleryQueue,
    video_dl_queue: VideoDownloadQueue,
) -> None:
    from selenium import webdriver

    options = webdriver.FirefoxOptions()
    options.headless = True

    with logger.contextualize(
            worker_id=worker_id,
            task='find_video_dl_link',
    ):
        while True:
            # Here we drive a headless Firefox via Selenium to resolve the video source
            video_page_url = await gallery_queue.get()
            logger.info('Examining video url={u}', u=video_page_url)

            soup = BS(
                await (await session.get(video_page_url.url)).text(),
                'html.parser',
                parse_only=SS('a'),
            )
            logger.info(
                'Successfully downloaded video page {u}',
                u=video_page_url,
            )

            highest_quality = sorted(
                map(
                    operator.attrgetter('text'),
                    soup.find_all('a', class_='quality-btn'),
                ),
                key=lambda q: int(q.replace('p', '')),
            )[-1]
            logger.info(
                'Highest quality is {hq}',
                hq=highest_quality,
            )

            driver = webdriver.Firefox(options=options)
            driver.get(
                str(video_page_url.url.with_query(wersja=highest_quality)))
            # Give the embedded player a moment to load the <video> element
            await asyncio.sleep(5)
            video_src = driver.find_element_by_tag_name('video').get_attribute(
                'src')
            driver.quit()

            video_dl_url = yarl.URL(video_src)
            logger.info(
                'Obtained downloadable video URL={u}',
                u=video_dl_url,
            )

            await video_dl_queue.put(
                VideoURL(
                    title=video_page_url.title,
                    quality=highest_quality,
                    url=video_dl_url,
                    ext=video_dl_url.path.split('/')[-1].rsplit('.', 1)[-1],
                ))
            gallery_queue.task_done()
Example #7
import youtube_dl
from bs4 import BeautifulSoup as BS, SoupStrainer as SS
import requests

course_code = 'RDQMrrM3s_cosug'  #'PL385A53B00B8B158E'
url = 'https://www.youtube.com/watch?v=Kd57YHWqrsI&list=' + course_code
resp = requests.get(url)
print(resp.url)
html = resp.content
resp.close()

ss = SS('tr')
soup = BS(html, 'html.parser', parse_only=ss)

base_url = 'https://www.youtube.com/watch?v={0}&index={1}&list=' + course_code
def fn1(tag):
    # Match playlist entry tiles that carry a video id
    return (tag.has_attr('data-video-id') and tag.has_attr('class')
            and "yt-uix-tile" in tag.attrs['class'])
cnt = 1
for tag in soup.find_all(fn1):
    # print(tag.attrs)
    url = base_url.format(str(tag.attrs['data-video-id']), str(cnt))
    print(url)
    cnt += 1
    youtube_dl.YoutubeDL().download([url])
Example #8
#  loop through pages - 25 results per page - allow for 10 pages starting at 1
for i in range(1, 11):

	#  search URL
	URL = "https://www.yell.com/ucs/UcsSearchAction.do?find=Y&keywords={0}&location={1}&pageNum={2}".format(busType,
																											busLoc, i)
	#  Define browser driver
	driver = webdriver.Chrome()

	#  Collect webpage contents
	driver.get(URL)
	page_source = driver.page_source

	#  Define listings part of page
	divs = SS(class_="col-sm-15 col-md-14 col-lg-15 businessCapsule--mainContent")

	#  parse only the parts of the page listed above
	soup = BS(page_source, 'html.parser', parse_only=divs)
	print('Collecting data...')
	#  Loop through retrieved data extracting information - try/except those that don't always have data
	for each in soup:
		busName = each.find('span', class_="businessCapsule--name").text
		busNo = each.find('span', class_="business--telephoneNumber").text

		try:
			busDesc = each.find(attrs={'itemprop': 'description'}).text
		except AttributeError:
			busDesc = ''

		try: