Code example #1
File: worker.py Project: RKBK/ChemBrows
    def pictureDownloaded(self, doi, entry_url, future):

        """Callback to handle the response of the futures
        downloading a picture"""

        if not self.parent.parsing:
            return

        query = QtSql.QSqlQuery(self.bdd)

        try:
            response = future.result()
        except concurrent.futures._base.CancelledError:
            self.l.error("future cancelled for {}".format(entry_url))
            # Set params instead of returning: the finally block below needs it
            params = ("Empty", doi)
        except Exception as e:
            self.l.error("Exception raised in pictureDownloaded:\n{}".
                         format(e))
            self.l.error(traceback.format_exc())
            params = ("Empty", doi)
        else:
            # If the picture was dled correctly
            if response.status_code == requests.codes.ok:
                try:
                    # Save the page
                    io = BytesIO(response.content)
                    Image.open(io).convert('RGB').save(
                        self.DATA_PATH + functions.simpleChar(response.url),
                        format='JPEG')
                    self.l.debug("Image ok")
                except Exception as e:
                    self.l.error("An error occurred in pictureDownloaded:\n{}".
                                 format(e))
                    self.l.error(traceback.format_exc())
                    params = ("Empty", doi)
                else:
                    params = (functions.simpleChar(response.url), doi)
            else:
                self.l.debug("Bad return code: {} DOI: {}".
                             format(response.status_code, doi))
                params = ("Empty", doi)

        finally:
            query.prepare("UPDATE papers SET graphical_abstract=? WHERE doi=?")

            for value in params:
                query.addBindValue(value)

            self.new_entries_worker += 1
            query.exec_()

        self.count_futures_images += 1
Code example #2
    def pictureDownloaded(self, doi, entry_url, future):
        """Callback to handle the response of the futures
        downloading a picture"""

        if not self.parent.parsing:
            return

        query = QtSql.QSqlQuery(self.bdd)

        try:
            response = future.result()
        except concurrent.futures._base.CancelledError:
            self.l.error("future cancelled for {}".format(entry_url))
            self.parent.counter_images_failed += 1
            params = ("Empty", doi)
        except Exception as e:
            self.parent.counter_images_failed += 1
            self.l.error("pictureDownloaded: {}".format(e), exc_info=True)
            params = ("Empty", doi)
        else:
            # If the picture was dled correctly
            if response.status_code == requests.codes.ok:
                try:
                    # Save the page
                    io = BytesIO(response.content)
                    Image.open(io).convert('RGB').save(
                        self.PATH + functions.simpleChar(response.url),
                        format='JPEG')
                    self.l.debug("Image ok")
                except Exception as e:
                    self.l.error(
                        "An error occurred in pictureDownloaded:\n{}".format(e),
                        exc_info=True)
                    params = ("Empty", doi)
                else:
                    params = (functions.simpleChar(response.url), doi)
            else:
                self.l.debug("Bad return code: {} DOI: {}".format(
                    response.status_code, doi))
                params = ("Empty", doi)

        finally:
            query.prepare("UPDATE papers SET graphical_abstract=? WHERE doi=?")

            for value in params:
                query.addBindValue(value)

            self.new_entries_worker += 1
            query.exec_()

        self.counter_futures_images += 1
Code example #3
def forgeTopicSimple(title: str, abstract: str) -> str:
    """
    Forge topic_simple, a simplified version of the title and abstract,
    used for SQLite queries
    """

    simple_title = fct.simpleChar(title)

    if abstract is not None:
        simple_abstract = fct.simpleChar(BS(abstract, "html.parser").text)
        topic_simple = " " + simple_abstract + " " + simple_title + " "
    else:
        topic_simple = " " + simple_title + " "

    return topic_simple
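
For context, a minimal usage sketch of forgeTopicSimple (the title and abstract strings below are invented, and fct.simpleChar is assumed to normalize the text, e.g. lowercasing and stripping accents, as elsewhere in ChemBrows):

# Hypothetical call: build the searchable text stored in topic_simple
title = "A New Catalyst for CO2 Reduction"
abstract = "<p>We report a <b>copper</b>-based catalyst for CO2 reduction</p>"
topic_simple = forgeTopicSimple(title, abstract)
# topic_simple is padded with spaces, presumably so whole words can be
# matched in SQLite with queries like: WHERE topic_simple LIKE '% catalyst %'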
Code example #4
def getData(company, journal, entry, response=None):
    """Get the data. Starts from the data contained in the RSS page and, if
    necessary, parses the website for additional information"""

    url = refineUrl(company, journal, entry)

    # If the journal is edited by the RSC
    if company == 'RSC':
        """Graphical abstract present in RSS. Abstract incomplete
        and w/out html. Title w/out html"""

        title = entry.title
        date = arrow.get(entry.updated).format('YYYY-MM-DD')

        abstract = None
        graphical_abstract = None
        author = None

        soup = BS(entry.summary, "html.parser")

        r = soup("img", align="center")
        if r:
            graphical_abstract = r[0]['src']

        if response.status_code == requests.codes.ok:

            # Get the title (w/ html)
            # Strainer: get a soup with only the interesting part.
            # Don't load the complete tree in memory. Saves RAM
            strainer = SS("h2", attrs={"class": "capsule__title fixpadv--m"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            title = soup.h2

            if title is not None:
                title = title.renderContents().decode().strip()

            # Get the abstract (w/ html)
            strainer = SS("p", xmlns="http://www.rsc.org/schema/rscart38")
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.p

            if r is not None:
                abstract = r.renderContents().decode()
                if abstract == "":
                    abstract = None

            strainer = SS("meta", attrs={"name": "citation_author"})
            soup = BS(response.text, "html.parser", parse_only=strainer)

            # Here, multiple tags (results) are expected, so perform
            # the search, even if the tree contains only the result
            r = soup("meta", attrs={"name": "citation_author"})
            if r:
                author = [tag['content'] for tag in r]
                author = ", ".join(author)

    elif company == 'Wiley':

        title, date, author, abstract, graphical_abstract = parseWiley(
            entry, response)

    elif company == 'ACS':
        """Feed only contains graphical abstract"""

        title = entry.title.rstrip()
        date = arrow.get(mktime(entry.published_parsed)).format('YYYY-MM-DD')
        abstract = None

        author = entry.author.split(" and ")
        if len(author) > 1:
            author = ", ".join(author)
        else:
            author = author[0]

        graphical_abstract = None

        soup = BS(entry.summary, "html.parser")
        r = soup("img", alt="TOC Graphic")
        if r:
            graphical_abstract = r[0]['src']

        # If the dl went wrong, print an error
        if response.status_code == requests.codes.ok:

            strainer = SS("p", attrs={"class": "articleBody_abstractText"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.p
            if r is not None:
                abstract = r.renderContents().decode()

            strainer = SS("h1", attrs={"class": "articleTitle"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.h1
            if r is not None:
                title = r.renderContents().decode()

    elif company == 'Nature':

        title = entry.title
        date = entry.date
        abstract = None
        graphical_abstract = None
        author = None

        try:
            if entry.authors:
                author = []
                for element in entry.authors:
                    author.append(element['name'])
                author = ", ".join(author)
        except AttributeError:
            pass

        if entry.summary:
            abstract = BS(entry.summary, "html.parser")

            while abstract.find_all('p'):
                _ = abstract.p.extract()

            try:
                _ = abstract.img.extract()
            except AttributeError:
                pass

            abstract = abstract.renderContents().decode()

        if (response.status_code == requests.codes.ok
                or response.status_code == 401):

            strainer = SS("div",
                          attrs={"class": "article__body serif cleared"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.div
            try:
                abstract = r.text
            except AttributeError:
                pass

            strainer = SS("figure")
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.find_all("img", attrs={"class": "figure__image"})

            if r:
                # Additional verification to correctly forge the URL
                graphical_abstract = "http:" + r[0]["src"]

    elif company == 'Science':

        title = entry.title
        date = entry.date

        graphical_abstract = None

        if entry.author:
            author = entry.author
        else:
            author = None

        abstract = entry.summary
        if not abstract:
            abstract = None

    elif company == 'PNAS':

        title = entry.title
        date = entry.prism_publicationdate

        graphical_abstract = None
        author = None

        abstract = None

        if response.status_code == requests.codes.ok:

            # Get the correct title, not the one in the RSS
            strainer = SS("h1", id="article-title-1")
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.find_all("h1", id="article-title-1")
            if r:
                title = r[0].renderContents().decode()

            # Get the authors
            strainer = SS("a", attrs={"class": "name-search"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.find_all("a", attrs={"class": "name-search"})
            if r:
                author = [tag.text for tag in r]
                author = ", ".join(author)

            # Try to get the complete abstract. Sometimes it's available,
            # sometimes the article only contains an extract
            strainer = SS("div", attrs={"class": "section abstract"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            if soup.p is not None:
                abstract = soup.p.renderContents().decode()
            else:
                abstract = entry.summary

    elif company == 'Elsevier':

        title = entry.title
        date = arrow.get(mktime(entry.updated_parsed)).format('YYYY-MM-DD')

        graphical_abstract = None
        author = None

        abstract = entry.summary

        if abstract:
            try:
                author = abstract.split("Author(s): ")[1].split(
                    "<br")[0].split("<")[0]
                author = author.replace(" , ", ", ")
                author = author.replace("  ", " ")
            except IndexError:
                author = None

            soup = BS(abstract, "html.parser")

            try:
                # First type of abstract formatting
                abstract = soup("simple-para")[0].renderContents().decode()
            except IndexError:
                try:
                    # Second type of abstract formatting
                    abstract = abstract.split("<br />")[3].lstrip()
                except IndexError:
                    abstract = None

            r = soup.find_all("img")
            if r:
                graphical_abstract = r[0]['src']

        # NOTE: javascript embedded, impossible
        # if response.status_code is requests.codes.ok:
        # url = response.url
        # print(response.url)
        # # Get the abstract
        # soup = BS(response.text)

        # Get the correct title, not the one in the RSS
        # r = soup.find_all("li", attrs={"class": "originalArticleName"})
        # print(r)
        # if r:
        # title = r[0].renderContents().decode()

    elif company == 'Thieme':

        title = entry.title
        date = arrow.get(entry.updated).format('YYYY-MM-DD')

        abstract = None
        graphical_abstract = None
        author = None

        try:
            if entry.authors:
                author = []
                for element in entry.authors:
                    # Reverse Family name/first name
                    field = reversed(element['name'].split(', '))
                    name = " ".join(field)
                    author.append(name)
                author = ", ".join(author)
        except AttributeError:
            pass

        try:
            if entry.summary:
                abstract = entry.summary
        except AttributeError:
            pass

    elif company == 'Beilstein':

        title = entry.title
        date = arrow.get(mktime(entry.published_parsed)).format('YYYY-MM-DD')

        abstract = None
        graphical_abstract = None

        author = entry.author.split(" and ")
        if len(author) > 1:
            author = ", ".join(author)
        else:
            author = author[0]

        if entry.summary != "":
            soup = BS(entry.summary, "html.parser")
            r = soup.find_all("p")

            if r:
                abstract = r[1].renderContents().decode()

            r = soup.find_all("img")
            if r:
                # This company can change the background of the GA through
                # the url. If nothing is done, the bg is black, so turn it
                # to white. Doesn't affect images with unchangeable bg
                graphical_abstract = r[0]['src'] + '&background=FFFFFF'

    elif company == 'Nature2':

        title = entry.title
        date = entry.date
        abstract = entry.summary
        graphical_abstract = None

        try:
            author = [dic['name'] for dic in entry.authors]
            if author:
                if len(author) > 1:
                    author = ", ".join(author)
                else:
                    author = author[0]
            else:
                author = None
        except AttributeError:
            author = None

        if response.status_code == requests.codes.ok or response.status_code == 401:

            strainer = SS(
                "h1", attrs={"class": "tighten-line-height small-space-below"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.h1
            if r is not None:
                title = r.renderContents().decode()

            strainer = SS("div", attrs={"id": "abstract-content"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.p
            if r is not None:
                abstract = r.renderContents().decode()

            strainer = SS("img")
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.find_all("img", attrs={"alt": "Figure 1"})
            if r:
                if "f1.jpg" in r[0]["src"]:
                    graphical_abstract = "http://www.nature.com" + r[0]["src"]

    elif company == 'PLOS':

        title = entry.title
        date = arrow.get(mktime(entry.published_parsed)).format('YYYY-MM-DD')

        author = None

        try:
            if entry.authors:
                author = []
                for element in entry.authors:
                    author.append(element['name'])
                author = ", ".join(author)
        except AttributeError:
            pass

        abstract = BS(entry.summary, "html.parser")

        # Clean the authors' names from the abstract
        r = abstract.find_all("p")
        if r and str(r[0]).startswith("<p>by "):
            abstract("p")[0].extract()

        try:
            abstract("img")[0].extract()
        except IndexError:
            pass

        abstract = abstract.renderContents().decode().strip()

        base = "http://journals.plos.org/plosone/article/figure/image?size=medium&id=info:doi/{}.g001"
        graphical_abstract = base.format(getDoi(company, journal, entry))

    elif company == 'Springer':

        title = entry.title
        date = arrow.get(mktime(entry.published_parsed)).format('YYYY-MM-DD')
        graphical_abstract = None
        author = None

        abstract = BS(entry.summary, "html.parser")

        try:
            _ = abstract("h3")[0].extract()
            # Remove the graphical abstract part from the abstract
            _ = abstract(
                "span",
                attrs={
                    "class":
                    "a-plus-plus figure category-standard float-no id-figa"
                })[0].extract()
        except IndexError:
            pass

        abstract = abstract.renderContents().decode().strip()

        if response.status_code == requests.codes.ok:

            strainer = SS("div", attrs={"class": "MediaObject"})
            soup = BS(response.text, "html.parser", parse_only=strainer)

            # For now, it's one shot: if the dl fails for the GA, there
            # won't be a retry. That's bc too few articles have a GA
            r = soup.find_all("img")
            if r:
                graphical_abstract = r[0]['src']

            strainer = SS("ul", attrs={"class": "AuthorNames"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.find_all("span", attrs={"class": "AuthorName"})
            if r:
                author = [tag.text for tag in r]
                author = ", ".join(author)

            strainer = SS("h1", attrs={"class": "ArticleTitle"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.h1
            if r is not None:
                title = r.renderContents().decode()

    elif company == 'Springer_open':

        title = entry.title
        date = arrow.get(mktime(entry.published_parsed)).format('YYYY-MM-DD')
        graphical_abstract = None
        author = None

        abstract = BS(entry.summary, "html.parser")

        try:
            _ = abstract("h3")[0].extract()
            # Remove the graphical abstract part from the abstract
            _ = abstract(
                "span",
                attrs={
                    "class":
                    "a-plus-plus figure category-standard float-no id-figa"
                })[0].extract()
        except IndexError:
            pass

        abstract = abstract.renderContents().decode().strip()

        if response.status_code == requests.codes.ok:

            strainer = SS("div", attrs={"class": "MediaObject"})
            soup = BS(response.text, "html.parser", parse_only=strainer)

            # For now, it's one shot: if the dl fails for the GA, there
            # won't be a retry. That's bc too few articles have a GA
            r = soup.find_all("img")
            if r:
                graphical_abstract = r[0]['src']

            strainer = SS("ul", attrs={"class": "u-listReset"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.find_all("span", attrs={"class": "AuthorName"})
            if r:
                author = [tag.text for tag in r]
                author = ", ".join(author)

            strainer = SS("h1", attrs={"class": "ArticleTitle"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.h1
            if r is not None:
                title = r.renderContents().decode()

    elif company == 'Taylor':

        title = entry.title
        date = arrow.get(mktime(entry.updated_parsed)).format('YYYY-MM-DD')
        graphical_abstract = None
        author = None
        abstract = None

        try:
            author = []
            for element in entry.authors:
                author.append(element['name'])
            author = ", ".join(author)
        except AttributeError:
            author = None

        if response.status_code == requests.codes.ok:

            strainer = SS("div", attrs={"class": "col-md-2-3 "})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.span
            if r is not None:
                # Remove all tags attributes
                for tag in r.findAll(True):
                    tag.attrs = None
                title = r.renderContents().decode()

            strainer = SS("div",
                          attrs={"class": "abstractSection abstractInFull"})
            soup = BS(response.text, "html.parser", parse_only=strainer)

            # Erase the title 'Abstract', useless
            if soup("p") and soup("p")[0].text == "Abstract":
                soup("p")[0].extract()

            r = soup.p
            if r is not None:
                abstract = r.renderContents().decode()

            r = soup.find_all("img")
            if r:
                base = "http://www.tandfonline.com{}"
                graphical_abstract = base.format(r[0]['src'])

    elif company == 'ChemArxiv':

        title = entry.title
        date = arrow.get(mktime(entry.published_parsed)).format('YYYY-MM-DD')
        graphical_abstract = None
        author = None
        abstract = None

        try:
            if entry.authors:
                author = []
                for element in entry.authors:
                    author.append(element['name'])
                author = ", ".join(author)
        except AttributeError:
            pass

        try:
            abstract = entry.summary
        except AttributeError:
            # I once saw a conference poster, w/ no abstract.
            # Filter these entries if it becomes common
            pass

    elif company == 'ChemRxiv':

        title = entry.title
        date = arrow.get(mktime(entry.published_parsed)).format('YYYY-MM-DD')
        graphical_abstract = None
        author = None
        abstract = None

        try:
            abstract = entry.summary
        except AttributeError:
            # I once saw a conference poster, w/ no abstract.
            # Filter these entries if it becomes common
            pass

        if response.status_code == requests.codes.ok:

            strainer = SS("span", attrs={"class": "authors-holder"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.find_all("a", attrs={"class": "normal-link author"})
            if r:
                author = [tag.text.strip() for tag in r]
                author = ", ".join(author)

    else:
        return None

    if title is None:
        return None

    topic_simple = forgeTopicSimple(title, abstract)

    if abstract is None or abstract == '':
        abstract = "Empty"
    if graphical_abstract is None:
        graphical_abstract = "Empty"

    if author is None or author == '':
        author = "Empty"
        author_simple = None
    else:
        # Clean author field
        author = author.replace('  ', ' ')
        author = author.replace(' ,', ',')
        author_simple = " " + fct.simpleChar(author) + " "

    return title, date, author, abstract, graphical_abstract, url, topic_simple, author_simple
Code example #5
File: worker.py Project: RKBK/ChemBrows
    def run(self):

        """Main function. Starts the real business"""

        self.l.debug("Entering worker")
        self.l.debug(self.url_feed)

        # Get the RSS page of the url provided
        try:
            self.feed = feedparser.parse(self.url_feed)
            self.l.debug("RSS page successfully dled")
        except OSError:
            self.l.error("Too many files open, could not start the thread !")
            return

        # Get the journal name
        try:
            journal = self.feed['feed']['title']
        except KeyError:
            self.l.critical("No title for the journal ! Aborting")
            self.l.critical(self.url_feed)
            return

        self.l.info("{0}: {1}".format(journal, len(self.feed.entries)))

        # Lists to check if the post is in the db, and if
        # it has all the infos
        self.session_images = FuturesSession(max_workers=20)

        # Get the company and the journal_abb by scrolling the dictionary
        # containing all the data regarding the journals implemented in the
        # program. This dictionary is built in gui.py, to avoid multiple calls
        # to hosts.getJournals
        # care_image determines if the Worker will try to dl the graphical
        # abstracts
        for key, tuple_data in self.dict_journals.items():
            if journal in tuple_data[0]:
                company = key
                index = tuple_data[0].index(journal)
                journal_abb = tuple_data[1][index]
                care_image = tuple_data[3][index]
                break

        try:
            self.dico_doi = self.listDoi(journal_abb)
        except UnboundLocalError:
            self.l.error("Journal not recognized ! Aborting")
            return

        # Create a list for the journals for which a dl of the article
        # page is not required. All the data are in the rss page
        company_no_dl = ['science', 'elsevier', 'beilstein', 'plos']

        query = QtSql.QSqlQuery(self.bdd)

        self.bdd.transaction()

        # The feeds of these journals are complete
        # if journal in wiley + science + elsevier:
        if company in company_no_dl:

            self.count_futures_urls += len(self.feed.entries)

            for entry in self.feed.entries:

                # Get the DOI, a unique number for a publication
                doi = hosts.getDoi(company, journal, entry)
                url = getattr(entry, 'feedburner_origlink', entry.link)

                # Reject crappy entries: corrigendum, erratum, etc
                if hosts.reject(entry.title):
                    title = entry.title
                    self.count_futures_images += 1
                    self.parent.counter_rejected += 1
                    self.l.debug("Rejecting {0}".format(doi))

                    # Insert the crappy articles in a rescue database
                    if self.parent.debug_mod and doi not in self.dico_doi:
                        query.prepare("INSERT INTO debug (doi, title, \
                                      journal, url) VALUES(?, ?, ?, ?)")
                        params = (doi, title, journal_abb, url)
                        self.l.debug("Inserting {0} in table debug".
                                     format(doi))
                        for value in params:
                            query.addBindValue(value)
                        query.exec_()
                    else:
                        continue

                # Article complete, skip it
                elif doi in self.dico_doi and self.dico_doi[doi]:
                    self.count_futures_images += 1
                    self.l.debug("Skipping {}".format(doi))
                    continue

                # Article not complete, try to complete it
                elif doi in self.dico_doi and not self.dico_doi[doi]:

                    # How to update the entry
                    dl_page, dl_image, data = hosts.updateData(company,
                                                               journal,
                                                               entry,
                                                               care_image)

                    # For these journals, all the infos are in the RSS.
                    # Only care about the image
                    if dl_image:
                        self.parent.counter_updates += 1

                        graphical_abstract = data['graphical_abstract']

                        if os.path.exists(self.DATA_PATH +
                                          functions.simpleChar(
                                              graphical_abstract)):
                            self.count_futures_images += 1
                        else:
                            headers = {'User-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                                       'Connection': 'close',
                                       'Referer': url}

                            future_image = self.session_images.get(
                                graphical_abstract, headers=headers,
                                timeout=self.TIMEOUT)
                            future_image.add_done_callback(functools.partial(
                                self.pictureDownloaded, doi, url))
                            self.list_futures.append(future_image)

                    else:
                        self.count_futures_images += 1
                        continue

                else:
                    try:
                        title, date, authors, abstract, graphical_abstract, url, topic_simple, author_simple = hosts.getData(company, journal, entry)
                    except TypeError:
                        self.l.error("getData returned None for {}".
                                     format(journal))
                        self.count_futures_images += 1
                        return

                    # Rejecting article if no author
                    if authors == "Empty":
                        self.count_futures_images += 1
                        self.parent.counter_rejected += 1
                        self.l.debug("Rejecting article {}, no author".
                                     format(title))
                        continue

                    query.prepare("INSERT INTO papers (doi, title, date, \
                                  journal, authors, abstract, \
                                  graphical_abstract, url, new, topic_simple, \
                                  author_simple) \
                                   VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")

                    # Set new to 1 and not to true
                    params = (doi, title, date, journal_abb, authors, abstract,
                              graphical_abstract, url, 1, topic_simple, author_simple)

                    self.l.debug("Adding {0} to the database".format(doi))
                    self.parent.counter += 1
                    self.new_entries_worker += 1

                    for value in params:
                        query.addBindValue(value)
                    query.exec_()

                    if graphical_abstract == "Empty" or os.path.exists(
                            self.DATA_PATH +
                            functions.simpleChar(graphical_abstract)):

                        self.count_futures_images += 1

                        # This block is executed when you delete the db, but
                        # not the images. Allows to update the
                        # graphical_abstract in db accordingly
                        if os.path.exists(self.DATA_PATH +
                                          functions.simpleChar(
                                              graphical_abstract)):

                            query.prepare("UPDATE papers SET \
                                          graphical_abstract=? WHERE doi=?")

                            params = (functions.simpleChar(graphical_abstract),
                                      doi)

                            for value in params:
                                query.addBindValue(value)
                            query.exec_()
                    else:
                        headers = {'User-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                                   'Connection': 'close',
                                   'Referer': url}

                        future_image = self.session_images.get(
                            graphical_abstract, headers=headers,
                            timeout=self.TIMEOUT)
                        future_image.add_done_callback(
                            functools.partial(self.pictureDownloaded,
                                              doi, url))
                        self.list_futures.append(future_image)

        else:

            headers = {'User-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                       'Connection': 'close'}

            self.session_pages = FuturesSession(max_workers=20)

            for entry in self.feed.entries:

                doi = hosts.getDoi(company, journal, entry)

                if company == 'acs':
                    url = getattr(entry, 'feedburner_origlink',
                                  entry.link).split('/')[-1]
                    url = "http://pubs.acs.org/doi/abs/10.1021/" + url

                elif company == 'npg':
                    url = getattr(entry, 'feedburner_origlink',
                                  entry.link).split('/')[-1]
                    url = "http://www.nature.com/nature/journal/vaop/ncurrent/abs/" + url + ".html"
                else:
                    url = getattr(entry, 'feedburner_origlink', entry.link)

                # Reject crappy entries: corrigendum, erratum, etc
                if hosts.reject(entry.title):
                    title = entry.title
                    self.count_futures_images += 1
                    self.count_futures_urls += 1
                    self.parent.counter_rejected += 1
                    self.l.debug("Rejecting {0}".format(doi))

                    if self.parent.debug_mod and doi not in self.dico_doi:
                        query.prepare("INSERT INTO debug (doi, title, \
                                      journal, url) VALUES(?, ?, ?, ?)")
                        params = (doi, title, journal_abb, url)

                        for value in params:
                            query.addBindValue(value)
                        query.exec_()

                        self.l.debug("Inserting {0} in table debug".
                                     format(doi))
                    continue


                # Article complete, skip it
                elif doi in self.dico_doi and self.dico_doi[doi]:
                    self.count_futures_images += 1
                    self.count_futures_urls += 1
                    self.l.debug("Skipping {}".format(doi))
                    continue


                # Article not complete, try to complete it
                elif doi in self.dico_doi and not self.dico_doi[doi]:


                    dl_page, dl_image, data = hosts.updateData(company,
                                                               journal,
                                                               entry,
                                                               care_image)

                    if dl_page:
                        self.parent.counter_updates += 1

                        future = self.session_pages.get(url,
                                                        timeout=self.TIMEOUT,
                                                        headers=headers)
                        future.add_done_callback(functools.partial(
                            self.completeData, doi, company, journal,
                            journal_abb, entry))
                        self.list_futures.append(future)

                        # Continue just to be sure. If dl_page is True,
                        # dl_image is likely True too
                        continue

                    elif dl_image:
                        self.parent.counter_updates += 1
                        self.count_futures_urls += 1

                        graphical_abstract = data['graphical_abstract']

                        if os.path.exists(self.DATA_PATH +
                                          functions.simpleChar(
                                              graphical_abstract)):
                            self.count_futures_images += 1
                        else:
                            headers = {'User-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                                       'Connection': 'close',
                                       'Referer': url}

                            future_image = self.session_images.get(
                                graphical_abstract, headers=headers,
                                timeout=self.TIMEOUT)
                            future_image.add_done_callback(functools.partial(
                                self.pictureDownloaded, doi, url))
                            self.list_futures.append(future_image)

                    else:
                        self.count_futures_urls += 1
                        self.count_futures_images += 1
                        continue

                else:

                    self.l.debug("Starting adding new entry")

                    future = self.session_pages.get(url, timeout=self.TIMEOUT,
                                                    headers=headers)
                    future.add_done_callback(functools.partial(
                        self.completeData, doi, company, journal, journal_abb,
                        entry))
                    self.list_futures.append(future)


        # Check if the counters are full
        while ((self.count_futures_images + self.count_futures_urls) !=
                len(self.feed.entries) * 2 and self.parent.parsing):
            self.sleep(1)

        if self.parent.parsing:
            if not self.bdd.commit():
                self.l.error(self.bdd.lastError().text())
                self.l.debug("db insertions/modifications: {}".
                             format(self.new_entries_worker))
                self.l.error("Problem when committing data for {}".
                             format(journal))

        # Free the memory, and clean the remaining futures
        try:
            self.session_pages.executor.shutdown()
        except AttributeError:
            self.l.error("No session_pages to shut down")

        self.session_images.executor.shutdown()
        self.l.debug("Exiting thread for {}".format(journal))
Code example #6
File: worker.py Project: RKBK/ChemBrows
    def completeData(self, doi, company, journal, journal_abb, entry, future):

        """Callback to handle the response of the futures trying to
        download the page of the articles"""

        self.l.debug("Page dled")
        self.count_futures_urls += 1

        if not self.parent.parsing:
            return

        try:
            response = future.result()
        except requests.exceptions.ReadTimeout:
            self.l.error("ReadTimeout for {}".format(journal))
            self.count_futures_images += 1
            return
        except requests.exceptions.ConnectionError:
            self.l.error("ConnectionError for {}".format(journal))
            self.count_futures_images += 1
            return
        except ConnectionResetError:
            self.l.error("ConnectionResetError for {}".format(journal))
            self.count_futures_images += 1
            return
        except socket.timeout:
            self.l.error("socket.timeout for {}".format(journal))
            self.count_futures_images += 1
            return
        except concurrent.futures._base.CancelledError:
            self.l.error("future cancelled for {}".format(journal))
            self.count_futures_images += 1
            return
        except Exception as e:
            self.l.error("Unknown exception {} for {}".format(e, journal))
            self.l.error(traceback.format_exc())
            self.count_futures_images += 1
            return

        query = QtSql.QSqlQuery(self.bdd)

        try:
            title, date, authors, abstract, graphical_abstract, url, topic_simple, author_simple = hosts.getData(company, journal, entry, response)
        except TypeError:
            self.l.error("getData returned None for {}".format(journal))
            self.count_futures_images += 1
            return
        except Exception as e:
            self.l.error("Unknown exception completeData {}".format(e))
            self.l.error(traceback.format_exc())
            self.count_futures_images += 1
            return

        # Rejecting the article if no authors
        if authors == "Empty":
            self.count_futures_images += 1
            self.parent.counter_rejected += 1
            self.l.debug("Rejecting article {}, no author".format(title))
            return

        # Check if the DOI is already in the db. Mandatory, bc sometimes
        # updateData will tell the worker to dl the page before downloading
        # the picture
        if doi not in self.dico_doi:
            query.prepare("INSERT INTO papers (doi, title, date, journal, \
                          authors, abstract, graphical_abstract, url, new, \
                          topic_simple, author_simple) VALUES(?, ?, ?, ?, ?, \
                          ?, ?, ?, ?, ?, ?)")

            params = (doi, title, date, journal_abb, authors, abstract,
                      graphical_abstract, url, 1, topic_simple, author_simple)

            self.l.debug("Adding {0} to the database".format(doi))
            self.parent.counter += 1

            for value in params:
                query.addBindValue(value)

            query.exec_()

        self.new_entries_worker += 1

        # Don't try to dl the image if its url is 'Empty', or if the image
        # already exists
        if (graphical_abstract == "Empty" or
                os.path.exists(self.DATA_PATH +
                               functions.simpleChar(graphical_abstract))):
            self.count_futures_images += 1
            self.l.debug("Image already dled or Empty")

            # This block is executed when you delete the db, but not the
            # images. Allows to update the graphical_abstract in db accordingly
            if os.path.exists(self.DATA_PATH +
                              functions.simpleChar(graphical_abstract)):
                query.prepare("UPDATE papers SET graphical_abstract=? WHERE \
                              doi=?")
                params = (functions.simpleChar(graphical_abstract), doi)
                for value in params:
                    query.addBindValue(value)
                query.exec_()
        else:
            self.l.debug("Page dled, adding future image")
            headers = {'User-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                       'Connection': 'close',
                       'Referer': url}

            future_image = self.session_images.get(graphical_abstract,
                                                   headers=headers,
                                                   timeout=self.TIMEOUT)
            future_image.add_done_callback(functools.partial(
                self.pictureDownloaded, doi, url))
            self.list_futures.append(future_image)
Code example #7
    def run(self):
        """Main function. Starts the real business"""

        self.l.debug("Entering worker")

        feed = self._getFeed(timeout=self.TIMEOUT)

        if feed is None:
            self.l.error("Exiting worker, problem w/ the feed")
            self.parent.list_failed_rss.append(self.url_feed)
            return

        # Get the journal name
        journal = feed['feed']['title']

        self.l.info("{}: {}".format(journal, len(feed.entries)))

        # Lists to check if the post is in the db, and if
        # it has all the info
        self.session_images = FuturesSession(
            max_workers=self.MAX_WORKERS, session=self.parent.browsing_session)

        # Get the company and the journal_abb by scrolling the dictionary
        # containing all the data regarding the journals implemented in the
        # program. This dictionary is built in gui.py, to avoid multiple calls
        # to hosts.getJournals
        # care_image determines if the Worker will try to dl the graphical
        # abstracts
        for key, tuple_data in self.dict_journals.items():
            if journal in tuple_data[0]:
                company = key
                index = tuple_data[0].index(journal)
                journal_abb = tuple_data[1][index]
                care_image = tuple_data[3][index]
                break

        try:
            self.dico_doi = self.listDoi(journal_abb)
        except UnboundLocalError:
            self.l.error("Journal not recognized ! Aborting")
            self.parent.list_failed_rss.append(self.url_feed)
            return

        # Create a list for the journals for which a dl of the article
        # page is not required. All the data are in the rss page
        company_no_dl = [
            'Science', 'Elsevier', 'Beilstein', 'PLOS', 'ChemArxiv', 'Wiley'
        ]

        query = QtSql.QSqlQuery(self.bdd)

        self.bdd.transaction()

        # The feeds of these journals are complete
        if company in company_no_dl:

            self.counter_futures_urls += len(feed.entries)

            for entry in feed.entries:

                # Get the DOI, a unique number for a publication
                try:
                    doi = hosts.getDoi(company, journal, entry)
                except Exception as e:
                    self.l.error("getDoi failed for: {}".format(journal),
                                 exc_info=True)
                    self.counter_futures_urls += 1
                    continue

                try:
                    url = hosts.refineUrl(company, journal, entry)
                except Exception as e:
                    self.l.error("refineUrl failed for: {}".format(journal),
                                 exc_info=True)
                    self.counter_futures_urls += 1
                    continue

                # Reject crappy entries: corrigendum, erratum, etc
                if hosts.reject(entry.title):
                    title = entry.title
                    self.counter_futures_images += 1
                    self.parent.counter_rejected += 1
                    self.l.debug("Rejecting {0}".format(doi))

                    # Insert the crappy articles in a rescue database
                    if self.parent.debug_mod and doi not in self.dico_doi:
                        query.prepare("INSERT INTO debug (doi, title, \
                                      journal, url) VALUES(?, ?, ?, ?)")
                        params = (doi, title, journal_abb, url)
                        self.l.debug(
                            "Inserting {0} in table debug".format(doi))
                        for value in params:
                            query.addBindValue(value)
                        query.exec_()
                    else:
                        continue

                # Article complete, skip it
                elif doi in self.dico_doi and self.dico_doi[doi]:
                    self.counter_futures_images += 1
                    self.l.debug("Article complete, skipping {}".format(doi))
                    continue

                # Article not complete, try to complete it
                elif doi in self.dico_doi and not self.dico_doi[doi]:
                    self.l.debug("Trying to update {}".format(doi))

                    # How to update the entry
                    dl_page, dl_image, data = hosts.updateData(
                        company, journal, entry, care_image)

                    # For these journals, all the infos are in the RSS.
                    # Only care about the image
                    if dl_image:
                        self.parent.counter_updates += 1

                        graphical_abstract = data['graphical_abstract']

                        if os.path.exists(
                                self.PATH +
                                functions.simpleChar(graphical_abstract)):
                            self.counter_futures_images += 1
                        else:
                            headers = {
                                'User-agent':
                                'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                                'Connection': 'close',
                                'Referer': url
                            }

                            future_image = self.session_images.get(
                                graphical_abstract,
                                headers=headers,
                                timeout=self.TIMEOUT)
                            future_image.add_done_callback(
                                functools.partial(self.pictureDownloaded, doi,
                                                  url))
                            self.list_futures.append(future_image)

                    else:
                        self.counter_futures_images += 1
                        continue

                # New article, treat it
                else:
                    try:
                        title, date, authors, abstract, graphical_abstract, url, topic_simple, author_simple = hosts.getData(
                            company, journal, entry)
                    except Exception as e:
                        self.l.error(
                            "Problem with getData: {}".format(journal),
                            exc_info=True)
                        self.counter_futures_images += 1
                        self.parent.counter_articles_failed += 1
                        return

                    # Rejecting article if no author
                    if authors == "Empty":
                        self.counter_futures_images += 1
                        self.parent.counter_rejected += 1
                        self.l.debug(
                            "Rejecting article {}, no author".format(title))
                        continue

                    query.prepare("INSERT INTO papers (doi, title, date, \
                                  journal, authors, abstract, \
                                  graphical_abstract, url, new, topic_simple, \
                                  author_simple) \
                                  VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")

                    # Set new to 1 and not to true
                    params = (doi, title, date, journal_abb, authors, abstract,
                              graphical_abstract, url, 1, topic_simple,
                              author_simple)

                    for value in params:
                        query.addBindValue(value)

                    # Test that query worked
                    if not query.exec_():
                        self.l.error(
                            "SQL ERROR in run(): {}, company_no_dl".format(
                                query.lastError().text()))
                        self.parent.counter_articles_failed += 1
                        continue
                    else:
                        self.l.debug("{} added to the database".format(doi))
                        self.new_entries_worker += 1
                        self.parent.counter_added += 1

                    # If article has no graphical abstract or if it has been
                    # dled
                    if graphical_abstract == "Empty" or os.path.exists(
                            self.PATH +
                            functions.simpleChar(graphical_abstract)):

                        self.counter_futures_images += 1

                        # This block is executed when you delete the db, but
                        # not the images. Allows to update the
                        # graphical_abstract in db accordingly
                        if os.path.exists(
                                self.PATH +
                                functions.simpleChar(graphical_abstract)):

                            query.prepare("UPDATE papers SET \
                                          graphical_abstract=? WHERE doi=?")

                            params = (functions.simpleChar(graphical_abstract),
                                      doi)

                            for value in params:
                                query.addBindValue(value)
                            query.exec_()
                    else:
                        headers = {
                            'User-agent':
                            'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                            'Connection': 'close',
                            'Referer': url
                        }

                        future_image = self.session_images.get(
                            graphical_abstract,
                            headers=headers,
                            timeout=self.TIMEOUT)

                        future_image.add_done_callback(
                            functools.partial(self.pictureDownloaded, doi,
                                              url))

                        self.list_futures.append(future_image)

        # The company requires to download the article's web page
        else:

            headers = {
                'User-agent':
                'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                'Connection': 'close'
            }

            self.session_pages = FuturesSession(
                max_workers=self.MAX_WORKERS,
                session=self.parent.browsing_session)

            for entry in feed.entries:

                # Get the DOI, a unique number for a publication
                try:
                    doi = hosts.getDoi(company, journal, entry)
                except Exception as e:
                    self.l.error("getDoi failed for: {}".format(journal),
                                 exc_info=True)
                    self.counter_futures_urls += 1
                    self.counter_futures_images += 1
                    continue

                # Try to refine the url
                try:
                    url = hosts.refineUrl(company, journal, entry)
                except Exception as e:
                    self.l.error("refineUrl failed for: {}".format(journal),
                                 exc_info=True)
                    self.counter_futures_urls += 1
                    self.counter_futures_images += 1
                    continue

                # Make sure the entry has a title
                try:
                    title = entry.title
                except AttributeError:
                    self.l.error("No title for {}".format(doi), exc_info=True)
                    self.counter_futures_urls += 1
                    self.counter_futures_images += 1
                    continue

                # Reject crappy entries: corrigendum, erratum, etc
                if hosts.reject(title):
                    self.counter_futures_images += 1
                    self.counter_futures_urls += 1
                    self.parent.counter_rejected += 1
                    self.l.debug("Rejecting {0}".format(doi))

                    if self.parent.debug_mod and doi not in self.dico_doi:
                        query.prepare("INSERT INTO debug (doi, title, \
                                      journal, url) VALUES(?, ?, ?, ?)")
                        params = (doi, title, journal_abb, url)

                        for value in params:
                            query.addBindValue(value)
                        query.exec_()

                        self.l.debug(
                            "Inserting {0} in table debug".format(doi))
                    continue

                # Article complete, skip it
                elif doi in self.dico_doi and self.dico_doi[doi]:
                    self.counter_futures_images += 1
                    self.counter_futures_urls += 1
                    self.l.debug("Article complete, skipping {}".format(doi))
                    continue

                # Article not complete, try to complete it
                elif doi in self.dico_doi and not self.dico_doi[doi]:

                    url = hosts.refineUrl(company, journal, entry)

                    dl_page, dl_image, data = hosts.updateData(
                        company, journal, entry, care_image)

                    if dl_page:
                        self.parent.counter_updates += 1

                        future = self.session_pages.get(url,
                                                        timeout=self.TIMEOUT,
                                                        headers=headers)
                        future.add_done_callback(
                            functools.partial(self.completeData, doi, company,
                                              journal, journal_abb, entry))
                        self.list_futures.append(future)

                        # Continue just to be sure. If dl_page is True,
                        # dl_image is likely True too
                        continue

                    elif dl_image:
                        self.parent.counter_updates += 1
                        self.counter_futures_urls += 1

                        graphical_abstract = data['graphical_abstract']

                        if os.path.exists(
                                self.PATH +
                                functions.simpleChar(graphical_abstract)):
                            self.counter_futures_images += 1
                        else:
                            headers = {
                                'User-agent':
                                'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                                'Connection': 'close',
                                'Referer': url
                            }

                            future_image = self.session_images.get(
                                graphical_abstract,
                                headers=headers,
                                timeout=self.TIMEOUT)
                            future_image.add_done_callback(
                                functools.partial(self.pictureDownloaded, doi,
                                                  url))
                            self.list_futures.append(future_image)

                    else:
                        self.counter_futures_urls += 1
                        self.counter_futures_images += 1
                        continue

                # New article, treat it
                else:

                    url = hosts.refineUrl(company, journal, entry)
                    self.l.debug("Starting adding new entry")

                    future = self.session_pages.get(url,
                                                    timeout=self.TIMEOUT,
                                                    headers=headers)
                    future.add_done_callback(
                        functools.partial(self.completeData, doi, company,
                                          journal, journal_abb, entry))
                    self.list_futures.append(future)

        # Check if the counters are full
        while ((self.counter_futures_images + self.counter_futures_urls) !=
               len(feed.entries) * 2 and self.parent.parsing):
            self.sleep(0.5)

        if self.parent.parsing:
            if not self.bdd.commit():
                self.l.error(self.bdd.lastError().text())
                self.l.debug("db insertions/modifications: {}".format(
                    self.new_entries_worker))
                self.l.error(
                    "Problem when comitting data for {}".format(journal))

        # Free the memory, and clean the remaining futures
        try:
            self.session_pages.executor.shutdown()
        except AttributeError:
            self.l.error("No session_pages to shut down")

        self.session_images.executor.shutdown()
        self.l.debug("Exiting thread for {}".format(journal))
Code example #8
    def completeData(self, doi, company, journal, journal_abb, entry, future):
        """Callback to handle the response of the futures trying to
        download the page of the articles"""

        self.l.debug("Page dled")
        self.counter_futures_urls += 1

        if not self.parent.parsing:
            return

        try:
            response = future.result()
        except (requests.exceptions.ReadTimeout,
                requests.exceptions.ConnectionError, ConnectionResetError,
                socket.timeout, concurrent.futures._base.CancelledError) as e:

            self.l.error("{} raised for {}. Handled".format(journal, e))
            self.counter_futures_images += 1
            self.parent.counter_articles_failed += 1
            return
        except Exception as e:
            self.l.error("Unknown exception {} for {}".format(e, journal),
                         exc_info=True)
            self.counter_futures_images += 1
            self.parent.counter_articles_failed += 1
            return

        query = QtSql.QSqlQuery(self.bdd)

        try:
            title, date, authors, abstract, graphical_abstract, url, topic_simple, author_simple = hosts.getData(
                company, journal, entry, response)
        except TypeError:
            self.l.error("getData returned None for {}".format(journal),
                         exc_info=True)
            self.counter_futures_images += 1
            self.parent.counter_articles_failed += 1
            return
        except Exception as e:
            self.l.error("Unknown exception completeData {}".format(e),
                         exc_info=True)
            self.counter_futures_images += 1
            self.parent.counter_articles_failed += 1
            return

        # Rejecting the article if no authors
        if authors == "Empty":
            self.counter_futures_images += 1
            self.parent.counter_rejected += 1
            self.l.debug("Rejecting article {}, no author".format(title))
            return

        # Check if the DOI is already in the db. Mandatory, because sometimes
        # updateData will tell the worker to dl the page before downloading
        # the picture
        if doi not in self.dico_doi:
            query.prepare("INSERT INTO papers (doi, title, date, journal, \
                          authors, abstract, graphical_abstract, url, new, \
                          topic_simple, author_simple) VALUES(?, \
                          ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")

            params = (doi, title, date, journal_abb, authors, abstract,
                      graphical_abstract, url, 1, topic_simple, author_simple)

            self.l.debug("Adding {} to the database".format(doi))
            self.parent.counter_added += 1

            for value in params:
                query.addBindValue(value)

            # Test that query worked
            if not query.exec_():
                self.l.error("SQL ERROR in completeData(): {}".format(
                    query.lastError().text()))
                self.parent.counter_articles_failed += 1
                return
            else:
                self.new_entries_worker += 1

        # Don't try to dl the image if its url is 'Empty', or if the image
        # already exists
        if (graphical_abstract == "Empty"
                or os.path.exists(self.PATH +
                                  functions.simpleChar(graphical_abstract))):
            self.counter_futures_images += 1
            self.l.debug("Image already dled or Empty")

            # This block is executed when you delete the db, but not the
            # images. Allows updating the graphical_abstract in the db accordingly
            if os.path.exists(self.PATH +
                              functions.simpleChar(graphical_abstract)):
                query.prepare("UPDATE papers SET graphical_abstract=? WHERE \
                              doi=?")
                params = (functions.simpleChar(graphical_abstract), doi)
                for value in params:
                    query.addBindValue(value)
                query.exec_()
        else:
            self.l.debug("Page dled, adding future image")
            headers = {
                'User-agent':
                'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                'Connection': 'close',
                'Referer': url
            }

            future_image = self.session_images.get(graphical_abstract,
                                                   headers=headers,
                                                   timeout=self.TIMEOUT)
            future_image.add_done_callback(
                functools.partial(self.pictureDownloaded, doi, url))
            self.list_futures.append(future_image)
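completeData writes each new paper through a prepared statement: placeholders in the SQL, one addBindValue() per column, then exec_() whose return value is checked before committing. A standalone sketch of that pattern, assuming PyQt5 and an in-memory SQLite database (table and values are illustrative only):

import sys

from PyQt5 import QtCore, QtSql

# QtSql needs a Qt application object before the driver plugin can load
app = QtCore.QCoreApplication(sys.argv)

bdd = QtSql.QSqlDatabase.addDatabase("QSQLITE")
bdd.setDatabaseName(":memory:")
bdd.open()

query = QtSql.QSqlQuery(bdd)
query.exec_("CREATE TABLE papers (doi TEXT PRIMARY KEY, title TEXT)")

# Same pattern as completeData: prepare with placeholders, bind each
# value in order, execute, and check the result before committing
query.prepare("INSERT INTO papers (doi, title) VALUES(?, ?)")
for value in ("10.1000/demo", "A demo title"):
    query.addBindValue(value)

if not query.exec_():
    print("SQL ERROR:", query.lastError().text())
else:
    bdd.commit()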
Code example #9
File: hosts.py Project: RKBK/ChemBrows
def getData(company, journal, entry, response=None):

    """Get the data. Starts from the data contained in the RSS flux, and if
    necessary, parse the website for supplementary infos. Download the
    graphical abstract"""

    # If the journal is edited by the RSC
    if company == "rsc":

        """Graphical abstract present in RSS. Abstract incomplete
        and w/out html. Title w/out html"""

        title = entry.title
        date = arrow.get(entry.updated).format("YYYY-MM-DD")

        url = getattr(entry, "feedburner_origlink", entry.link)

        abstract = None
        graphical_abstract = None
        author = None

        soup = BeautifulSoup(entry.summary)

        r = soup("img", align="center")
        if r:
            graphical_abstract = r[0]["src"]

        if response.status_code == requests.codes.ok:

            # # Get the title (w/ html)
            # Strainer: get a soup with only the interesting part.
            # Don't load the complete tree in memory. Saves RAM
            strainer = SoupStrainer("h2", attrs={"class": "alpH1"})
            soup = BeautifulSoup(response.text, parse_only=strainer)
            title = soup.h2

            if title is not None:
                title = title.renderContents().decode().strip()

            # # Get the abstract (w/ html)
            strainer = SoupStrainer("p", xmlns="http://www.rsc.org/schema/rscart38")
            soup = BeautifulSoup(response.text, parse_only=strainer)
            r = soup.p

            if r is not None:
                abstract = r.renderContents().decode()
                if abstract == "":
                    abstract = None

            strainer = SoupStrainer("meta", attrs={"name": "citation_author"})
            soup = BeautifulSoup(response.text, parse_only=strainer)

            # Here, multiple tags (results) are expected, so perform
            # the search, even if the tree contains only the result
            r = soup("meta", attrs={"name": "citation_author"})
            if r:
                author = [tag["content"] for tag in r]
                author = ", ".join(author)

    elif company == "wiley":

        """Feed compltete. Abstract w/ html. Title w/out html"""

        title = entry.title
        date = arrow.get(entry.updated).format("YYYY-MM-DD")

        author = entry.author

        url = entry.prism_url

        graphical_abstract = None

        abstract = None

        soup = BeautifulSoup(entry.summary)
        try:
            # Remove the title "Abstract" from the abstract
            soup("h3")[0].extract()
        except IndexError:
            pass
        r = soup("a", attrs={"class": "figZoom"})
        if r:
            # Define the graphical abstract by extracting it
            # (and deleting it) from the abstract
            graphical_abstract = r[0].extract()
            graphical_abstract = graphical_abstract["href"]

        abstract = soup.renderContents().decode()

        if abstract == "":
            abstract = None

        if response.status_code == requests.codes.ok:

            # # Get the title (w/ html)
            strainer = SoupStrainer("span", attrs={"class": "mainTitle"})
            soup = BeautifulSoup(response.text, parse_only=strainer)
            r = soup.span
            if r is not None:
                try:
                    # Remove the sign for the supplementary infos
                    r("a", href="#nss")[0].extract()
                except IndexError:
                    pass

                # Remove the image representing a bond
                try:
                    r("img", alt="[BOND]")[0].replaceWith("-")
                    title = r.renderContents().decode().strip()
                except IndexError:
                    title = r.renderContents().decode().strip()

    elif company == "acs":

        """Feed only contains graphical abstract"""

        title = entry.title.rstrip()
        date = arrow.get(mktime(entry.published_parsed)).format("YYYY-MM-DD")
        abstract = None

        author = entry.author.split(" and ")
        if len(author) > 1:
            author = ", ".join(author)
        else:
            author = author[0]

        url = getattr(entry, "feedburner_origlink", entry.link).split("/")[-1]
        url = "http://pubs.acs.org/doi/abs/10.1021/" + url

        graphical_abstract = None

        soup = BeautifulSoup(entry.summary)
        r = soup("img", alt="TOC Graphic")
        if r:
            graphical_abstract = r[0]["src"]

        # If the dl went wrong, print an error
        if response.status_code == requests.codes.ok:

            strainer = SoupStrainer("p", attrs={"class": "articleBody_abstractText"})
            soup = BeautifulSoup(response.text, parse_only=strainer)
            r = soup.p
            if r is not None:
                abstract = r.renderContents().decode()

            strainer = SoupStrainer("h1", attrs={"class": "articleTitle"})
            soup = BeautifulSoup(response.text, parse_only=strainer)
            r = soup.h1
            if r is not None:
                title = r.renderContents().decode()

    elif company == "npg":

        title = entry.title
        date = entry.date
        abstract = entry.summary
        graphical_abstract = None

        url = getattr(entry, "feedburner_origlink", entry.link).split("/")[-1]
        url = "http://www.nature.com/nature/journal/vaop/ncurrent/abs/" + url + ".html"

        try:
            author = [dic["name"] for dic in entry.authors]
            if author:
                if len(author) > 1:
                    author = ", ".join(author)
                else:
                    author = author[0]
            else:
                author = None
        except AttributeError:
            author = None

        if response.status_code == requests.codes.ok or response.status_code == 401:

            strainer = SoupStrainer("h1", attrs={"class": "article-heading"})
            soup = BeautifulSoup(response.text, parse_only=strainer)
            r = soup.h1
            if r is not None:
                title = r.renderContents().decode()

            strainer = SoupStrainer("div", attrs={"id": "first-paragraph"})
            soup = BeautifulSoup(response.text, parse_only=strainer)
            r = soup.div
            if r is not None:
                abstract = r.renderContents().decode()

            strainer = SoupStrainer("figure")
            soup = BeautifulSoup(response.text, parse_only=strainer)
            r = soup.find_all("img")
            if r:
                graphical_abstract = "http://www.nature.com" + r[0]["src"]

                if "carousel" in graphical_abstract:
                    graphical_abstract = graphical_abstract.replace("carousel", "images_article")

    elif company == "science":

        title = entry.title
        date = entry.date
        url = entry.id

        graphical_abstract = None
        author = None

        abstract = entry.summary

        if not abstract:
            abstract = None
        else:
            if "Author:" in entry.summary:
                abstract = entry.summary.split("Author: ")[0]

                try:
                    author = entry.summary.split("Author: ")[1]
                except IndexError:
                    pass
            elif "Authors:" in entry.summary:
                abstract = entry.summary.split("Authors: ")[0]
                author = entry.summary.split("Authors: ")[1].split(", ")
                author = ", ".join(author)  # To comment if formatName

    elif company == "nas":

        title = entry.title
        date = entry.prism_publicationdate
        url = entry.id

        graphical_abstract = None
        author = None

        abstract = None

        if response.status_code == requests.codes.ok:

            # Get the correct title, not the one in the RSS
            strainer = SoupStrainer("h1", id="article-title-1")
            soup = BeautifulSoup(response.text, parse_only=strainer)
            r = soup.find_all("h1", id="article-title-1")
            if r:
                title = r[0].renderContents().decode()

            # Get the authors
            strainer = SoupStrainer("a", attrs={"class": "name-search"})
            soup = BeautifulSoup(response.text, parse_only=strainer)
            r = soup.find_all("a", attrs={"class": "name-search"})
            if r:
                author = [tag.text for tag in r]
                author = ", ".join(author)

            # Try to get the complete abstract. Sometimes it's available, sometimes
            # the article only contains an extract
            strainer = SoupStrainer("div", attrs={"class": "section abstract"})
            soup = BeautifulSoup(response.text, parse_only=strainer)
            if soup.p is not None:
                abstract = soup.p.renderContents().decode()
            else:
                abstract = entry.summary

    elif company == "elsevier":

        title = entry.title
        date = arrow.get(mktime(entry.updated_parsed)).format("YYYY-MM-DD")

        url = entry.id

        graphical_abstract = None
        author = None

        abstract = entry.summary

        if abstract:
            try:
                author = abstract.split("Author(s): ")[1].split("<br")[0].split("<")[0]
                author = author.replace(" , ", ", ")
                author = author.replace("  ", " ")
            except IndexError:
                author = None

            soup = BeautifulSoup(abstract)

            r = soup.find_all("img")
            if r:
                graphical_abstract = r[0]["src"]

            try:
                abstract = abstract.split("<br />")[3].lstrip()
            except IndexError:
                abstract = ""

            if abstract == "":
                abstract = None

        # NOTE: javascript embedded, impossible
        # if response.status_code is requests.codes.ok:
        # url = response.url
        # print(response.url)
        # # Get the abstract
        # soup = BeautifulSoup(response.text)

        # Get the correct title, not the one in the RSS
        # r = soup.find_all("li", attrs={"class": "originalArticleName"})
        # print(r)
        # if r:
        # title = r[0].renderContents().decode()

    elif company == "thieme":

        title = entry.title
        date = arrow.get(entry.updated).format("YYYY-MM-DD")
        url = entry.id

        abstract = None
        graphical_abstract = None
        author = None

        if response.status_code == requests.codes.ok:

            if entry.summary != "":

                # Get the abstract, and clean it
                strainer = SoupStrainer("section", id="abstract")
                soup = BeautifulSoup(response.text, parse_only=strainer)
                abstract = soup.section

                abstract("div", attrs={"class": "articleFunctions"})[0].extract()
                [tag.extract() for tag in abstract("a", attrs={"name": True})]
                [tag.extract() for tag in abstract("h3")]
                [tag.extract() for tag in abstract("ul", attrs={"class": "linkList"})]
                [tag.extract() for tag in abstract("a", attrs={"class": "gotolink"})]

                try:
                    abstract("div", attrs={"class": "articleKeywords"})[0].extract()
                except IndexError:
                    pass

                abstract = abstract.renderContents().decode()

            strainer = SoupStrainer("span", id="authorlist")
            soup = BeautifulSoup(response.text, parse_only=strainer)
            r = soup.find_all("span", id="authorlist")
            if r:
                author = r[0].text
                author = author.replace("*a, b", "")
                author = author.replace("*a", "")
                author = author.replace("*", "")

    elif company == "beilstein":

        title = entry.title
        date = arrow.get(mktime(entry.published_parsed)).format("YYYY-MM-DD")
        url = entry.link

        abstract = None
        graphical_abstract = None

        author = entry.author.split(" and ")
        if len(author) > 1:
            author = ", ".join(author)
        else:
            author = author[0]

        if entry.summary != "":
            soup = BeautifulSoup(entry.summary)
            r = soup.find_all("p")

            if r:
                abstract = r[1].renderContents().decode()

            r = soup.find_all("img")
            if r:
                # This company can change the background of the GA through
                # the url. If nothing is done, the bg is black, so turn it
                # to white. Doesn't affect images with unchangeable bg
                graphical_abstract = r[0]["src"] + "&background=FFFFFF"

    elif company == "npg2":

        title = entry.title
        date = entry.date
        abstract = entry.summary
        graphical_abstract = None

        url = entry.links[0]["href"]

        try:
            author = [dic["name"] for dic in entry.authors]
            if author:
                if len(author) > 1:
                    author = ", ".join(author)
                else:
                    author = author[0]
            else:
                author = None
        except AttributeError:
            author = None

        if response.status_code == requests.codes.ok or response.status_code == 401:

            strainer = SoupStrainer("h1", attrs={"class": "tighten-line-height small-space-below"})
            soup = BeautifulSoup(response.text, parse_only=strainer)
            r = soup.h1
            if r is not None:
                title = r.renderContents().decode()

            strainer = SoupStrainer("div", attrs={"id": "abstract-content"})
            soup = BeautifulSoup(response.text, parse_only=strainer)
            r = soup.p
            if r is not None:
                abstract = r.renderContents().decode()

            strainer = SoupStrainer("img")
            soup = BeautifulSoup(response.text, parse_only=strainer)
            r = soup.find_all("img", attrs={"alt": "Figure 1"})
            if r:
                if "f1.jpg" in r[0]["src"]:
                    graphical_abstract = "http://www.nature.com" + r[0]["src"]

    elif company == "plos":

        title = entry.title
        url = entry.link
        date = arrow.get(mktime(entry.published_parsed)).format("YYYY-MM-DD")

        if entry.authors:
            author = []
            for element in entry.authors:
                author.append(element["name"])
            author = ", ".join(author)
        else:
            author = None

        abstract = BeautifulSoup(entry.summary)

        # Clean the authors' names from the abstract
        r = abstract.find_all("p")
        if r and str(r[0]).startswith("<p>by "):
            abstract("p")[0].extract()

        try:
            abstract("img")[0].extract()
        except IndexError:
            pass

        abstract = abstract.renderContents().decode().strip()

        base = "http://journals.plos.org/plosone/article/figure/image?size=medium&id=info:doi/{}.g001"
        graphical_abstract = base.format(getDoi(company, journal, entry))

    elif company == "springer":

        title = entry.title
        url = entry.link
        date = arrow.get(mktime(entry.published_parsed)).format("YYYY-MM-DD")
        graphical_abstract = None
        author = None

        abstract = BeautifulSoup(entry.summary)

        try:
            _ = abstract("h3")[0].extract()
            # Remove the graphical abstract part from the abstract
            _ = abstract("span", attrs={"class": "a-plus-plus figure category-standard float-no id-figa"})[0].extract()
        except IndexError:
            pass

        abstract = abstract.renderContents().decode().strip()

        if response.status_code == requests.codes.ok:

            strainer = SoupStrainer("div", attrs={"class": "MediaObject"})
            soup = BeautifulSoup(response.text, parse_only=strainer)

            # For now, it's one shot: if the dl fails for the GA, there
            # won't be a retry. That's because too few articles have a GA
            r = soup.find_all("img")
            if r:
                graphical_abstract = r[0]["src"]

            strainer = SoupStrainer("ul", attrs={"class": "AuthorNames"})
            soup = BeautifulSoup(response.text, parse_only=strainer)
            r = soup.find_all("span", attrs={"class": "AuthorName"})
            if r:
                author = [tag.text for tag in r]
                author = ", ".join(author)

            strainer = SoupStrainer("h1", attrs={"class": "ArticleTitle"})
            soup = BeautifulSoup(response.text, parse_only=strainer)
            r = soup.h1
            if r is not None:
                title = r.renderContents().decode()

    else:
        return None

    if abstract is not None:

        topic_simple = (
            " " + functions.simpleChar(BeautifulSoup(abstract).text) + " " + functions.simpleChar(title) + " "
        )
    else:
        topic_simple = " " + functions.simpleChar(title) + " "

    if abstract is None or abstract == "":
        abstract = "Empty"
    if graphical_abstract is None:
        graphical_abstract = "Empty"

    if author is None or author == "":
        author = "Empty"
        author_simple = None
    else:
        author_simple = " " + functions.simpleChar(author) + " "

    return title, date, author, abstract, graphical_abstract, url, topic_simple, author_simple
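getData returns an 8-tuple that the worker unpacks directly into its INSERT. A minimal sketch of how it might be driven outside the worker, assuming an RSS feed whose entries carry the fields the chosen parser expects (the feed URL, company and journal names here are placeholders):

import feedparser
import requests

import hosts  # the module above

feed = feedparser.parse("https://example.org/rss/journal.xml")  # placeholder

with requests.Session() as session:
    for entry in feed.entries:
        page_url = getattr(entry, "feedburner_origlink", entry.link)
        response = session.get(page_url, timeout=20)

        data = hosts.getData("rsc", "Chem. Commun.", entry, response)
        if data is None:
            continue  # unknown publisher

        (title, date, authors, abstract, graphical_abstract, url,
         topic_simple, author_simple) = data
        print(date, "-", title)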