Example 1
def test_getDoi(journalsUrls):
    """Tests if the function getDoi gets the DOI correctly"""

    l.info("Function getDoi")

    start_time = datetime.datetime.now()

    list_sites = journalsUrls

    # Build a dict mapping each company to its list of journal names
    dict_journals = {}
    for company in hosts.getCompanies():
        dict_journals[company] = hosts.getJournals(company)[0]

    for site in list_sites:

        try:
            # feedparser.parse() takes no timeout kwarg; set a global
            # socket timeout beforehand if one is needed
            feed = feedparser.parse(site)
            l.debug("RSS page successfully downloaded")
        except Exception as e:
            l.error("RSS page could not be downloaded: {}".format(e),
                    exc_info=True)
            continue

        try:
            journal = feed['feed']['title']
        except KeyError:
            l.error("Failed to get title for: {}".format(site))
            pytest.fail("Failed to get title for: {}".format(site))

        # Get the company name
        for publisher, data in dict_journals.items():
            if journal in data:
                company = publisher

        l.info("{}: {}".format(site, len(feed.entries)))

        if len(feed.entries) < LENGTH_SAMPLE:
            samples = feed.entries
        else:
            samples = random.sample(feed.entries, LENGTH_SAMPLE)

        # Tests LENGTH_SAMPLE entries for a journal, not all of them
        for entry in samples:

            doi = hosts.getDoi(company, journal, entry)
            l.info(doi)

            logAssert(
                isinstance(doi, str) and doi.startswith('10.'),
                "DOI is not a string or is not a DOI: {}".format(doi))

    l.debug("Time spent in test_getDoi: {}".format(datetime.datetime.now() -
                                                   start_time))
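
Note: logAssert is not defined in this example. A minimal sketch of such a helper, assuming it simply logs the failure and then fails the test (a hypothetical reconstruction, not the original implementation):

def logAssert(condition, msg):
    """Hypothetical helper: log msg, then fail the current test when
    condition is false"""
    if not condition:
        l.error(msg)
        pytest.fail(msg)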
Example 2
def test_getDoi(journalsUrls):

    """Tests if the function getDoi gets the DOI correctly"""

    print("\n")
    print("Starting test getDoi")

    list_sites = journalsUrls

    # Build a dict mapping each company to its list of journal names
    dict_journals = {}
    for company in os.listdir("journals"):
        company = company.split(".")[0]
        dict_journals[company] = hosts.getJournals(company)[0]

    for site in list_sites:
        feed = feedparser.parse(site)
        journal = feed["feed"]["title"]

        # Get the company name
        for publisher, data in dict_journals.items():
            if journal in data:
                company = publisher

        print("{}: {}".format(site, len(feed.entries)))

        if len(feed.entries) < LENGTH_SAMPLE:
            samples = feed.entries
        else:
            samples = random.sample(feed.entries, LENGTH_SAMPLE)

        # Tests LENGTH_SAMPLE entries for a journal, not all of them
        for entry in samples:

            doi = hosts.getDoi(company, journal, entry)
            print(doi)

            assert isinstance(doi, str)
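
Both tests receive a journalsUrls fixture that is not shown here. A minimal sketch of such a pytest fixture; the assumption that hosts.getJournals(company) exposes the feed URLs at index 2 is unverified and purely illustrative:

import pytest

import hosts


@pytest.fixture
def journalsUrls():
    """Hypothetical fixture: collect every RSS feed URL, assuming the
    tuple returned by hosts.getJournals holds the URLs at index 2"""
    urls = []
    for company in hosts.getCompanies():
        urls += hosts.getJournals(company)[2]
    return urls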
Example 3
    def _checkIsFeed(self, url: str, company: str,
                     feed: feedparser.util.FeedParserDict) -> bool:

        self.l.debug("Entering _checkIsFeed")

        # Check if the feed has a title
        try:
            journal = feed['feed']['title']
        except Exception as e:
            self.l.critical("verifyInput, can't access title {}".format(url),
                            exc_info=True)
            return False

        nbr_ok = 0
        for entry in feed.entries:

            try:
                doi = hosts.getDoi(company, journal, entry)
                url = hosts.refineUrl(company, journal, entry)
                self.l.debug("{}, {}".format(doi, url))
            except Exception as e:
                self.l.error(
                    "verifyInput, entry has no doi or no url: {}".format(url),
                    exc_info=True)
                continue

            # Check that both the DOI and the URL are valid
            if ((doi.startswith('10.') and validators.url(url)) or
                    (validators.url(doi) and validators.url(url))):
                nbr_ok += 1

            # If 3 entries are OK, the feed is considered valid
            if nbr_ok == 3:
                self.l.debug("3 entries ok, valid feed")
                return True

        # If still here, the feed is NOT considered valid
        return False
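
The acceptance test above can be factored into a standalone predicate. A sketch assuming the same validators package (the function name is illustrative):

import validators


def entry_identifiers_ok(doi: str, url: str) -> bool:
    """Hypothetical helper: the article URL must be valid, and the DOI
    must either carry the classic '10.' prefix or itself be a URL"""
    if not validators.url(url):
        return False
    return doi.startswith('10.') or bool(validators.url(doi))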
Example 4
    def run(self):

        """Main function. Starts the real business"""

        self.l.debug("Entering worker")
        self.l.debug(self.url_feed)

        # Get the RSS page of the url provided
        try:
            self.feed = feedparser.parse(self.url_feed)
            self.l.debug("RSS page successfully dled")
        except OSError:
            self.l.error("Too many files open, could not start the thread !")
            return

        # Get the journal name
        try:
            journal = self.feed['feed']['title']
        except KeyError:
            self.l.critical("No title for the journal ! Aborting")
            self.l.critical(self.url_feed)
            return

        self.l.info("{0}: {1}".format(journal, len(self.feed.entries)))

        # Async session used to download the graphical abstracts
        self.session_images = FuturesSession(max_workers=20)

        # Get the company and the journal_abb by iterating over the
        # dictionary containing all the data regarding the journals
        # implemented in the program. This dictionary is built in gui.py,
        # to avoid multiple calls to hosts.getJournals.
        # care_image determines if the Worker will try to download the
        # graphical abstracts
        for key, tuple_data in self.dict_journals.items():
            if journal in tuple_data[0]:
                company = key
                index = tuple_data[0].index(journal)
                journal_abb = tuple_data[1][index]
                care_image = tuple_data[3][index]
                break

        try:
            self.dico_doi = self.listDoi(journal_abb)
        except UnboundLocalError:
            self.l.error("Journal not recognized ! Aborting")
            return

        # List the companies for which downloading the article page is
        # not required: all the data are in the RSS page
        company_no_dl = ['science', 'elsevier', 'beilstein', 'plos']

        query = QtSql.QSqlQuery(self.bdd)

        self.bdd.transaction()

        # The feeds of these companies are complete
        if company in company_no_dl:

            self.count_futures_urls += len(self.feed.entries)

            for entry in self.feed.entries:

                # Get the DOI, a unique identifier for a publication
                doi = hosts.getDoi(company, journal, entry)
                url = getattr(entry, 'feedburner_origlink', entry.link)

                # Reject crappy entries: corrigendum, erratum, etc
                if hosts.reject(entry.title):
                    title = entry.title
                    self.count_futures_images += 1
                    self.parent.counter_rejected += 1
                    self.l.debug("Rejecting {0}".format(doi))

                    # Insert the crappy articles in a rescue database
                    if self.parent.debug_mod and doi not in self.dico_doi:
                        query.prepare("INSERT INTO debug (doi, title, \
                                      journal, url) VALUES(?, ?, ?, ?)")
                        params = (doi, title, journal_abb, url)
                        self.l.debug("Inserting {0} in table debug".
                                     format(doi))
                        for value in params:
                            query.addBindValue(value)
                        query.exec_()
                    else:
                        continue

                # Article complete, skip it
                elif doi in self.dico_doi and self.dico_doi[doi]:
                    self.count_futures_images += 1
                    self.l.debug("Skipping {}".format(doi))
                    continue

                # Article not complete, try to complete it
                elif doi in self.dico_doi and not self.dico_doi[doi]:

                    # How to update the entry
                    dl_page, dl_image, data = hosts.updateData(company,
                                                               journal,
                                                               entry,
                                                               care_image)

                    # For these journals, all the info is in the RSS.
                    # Only care about the image
                    if dl_image:
                        self.parent.counter_updates += 1

                        graphical_abstract = data['graphical_abstract']

                        if os.path.exists(self.DATA_PATH +
                                          functions.simpleChar(
                                              graphical_abstract)):
                            self.count_futures_images += 1
                        else:
                            headers = {'User-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                                       'Connection': 'close',
                                       'Referer': url}

                            future_image = self.session_images.get(
                                graphical_abstract, headers=headers,
                                timeout=self.TIMEOUT)
                            future_image.add_done_callback(functools.partial(
                                self.pictureDownloaded, doi, url))
                            self.list_futures.append(future_image)

                    else:
                        self.count_futures_images += 1
                        continue

                else:
                    try:
                        (title, date, authors, abstract, graphical_abstract,
                         url, topic_simple,
                         author_simple) = hosts.getData(company, journal, entry)
                    except TypeError:
                        self.l.error("getData returned None for {}".
                                     format(journal))
                        self.count_futures_images += 1
                        return

                    # Rejecting article if no author
                    if authors == "Empty":
                        self.count_futures_images += 1
                        self.parent.counter_rejected += 1
                        self.l.debug("Rejecting article {}, no author".
                                     format(title))
                        continue

                    query.prepare("INSERT INTO papers (doi, title, date, \
                                  journal, authors, abstract, \
                                  graphical_abstract, url, new, topic_simple, \
                                  author_simple) \
                                   VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")

                    # Set new to 1, not True (stored as an integer)
                    params = (doi, title, date, journal_abb, authors, abstract,
                              graphical_abstract, url, 1, topic_simple, author_simple)

                    self.l.debug("Adding {0} to the database".format(doi))
                    self.parent.counter += 1
                    self.new_entries_worker += 1

                    for value in params:
                        query.addBindValue(value)
                    query.exec_()

                    if graphical_abstract == "Empty" or os.path.exists(
                            self.DATA_PATH +
                            functions.simpleChar(graphical_abstract)):

                        self.count_futures_images += 1

                        # This block is executed when you delete the db but
                        # not the images, allowing the graphical_abstract
                        # field in the db to be updated accordingly
                        if os.path.exists(self.DATA_PATH +
                                          functions.simpleChar(
                                              graphical_abstract)):

                            query.prepare("UPDATE papers SET \
                                          graphical_abstract=? WHERE doi=?")

                            params = (functions.simpleChar(graphical_abstract),
                                      doi)

                            for value in params:
                                query.addBindValue(value)
                            query.exec_()
                    else:
                        headers = {'User-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                                   'Connection': 'close',
                                   'Referer': url}

                        future_image = self.session_images.get(
                            graphical_abstract, headers=headers,
                            timeout=self.TIMEOUT)
                        future_image.add_done_callback(
                            functools.partial(self.pictureDownloaded,
                                              doi, url))
                        self.list_futures.append(future_image)

        else:

            headers = {'User-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                       'Connection': 'close'}

            self.session_pages = FuturesSession(max_workers=20)

            for entry in self.feed.entries:

                doi = hosts.getDoi(company, journal, entry)

                if company == 'acs':
                    url = getattr(entry, 'feedburner_origlink',
                                  entry.link).split('/')[-1]
                    url = "http://pubs.acs.org/doi/abs/10.1021/" + url

                elif company == 'npg':
                    url = getattr(entry, 'feedburner_origlink',
                                  entry.link).split('/')[-1]
                    url = "http://www.nature.com/nature/journal/vaop/ncurrent/abs/" + url + ".html"
                else:
                    url = getattr(entry, 'feedburner_origlink', entry.link)

                # Reject crappy entries: corrigendum, erratum, etc
                if hosts.reject(entry.title):
                    title = entry.title
                    self.count_futures_images += 1
                    self.count_futures_urls += 1
                    self.parent.counter_rejected += 1
                    self.l.debug("Rejecting {0}".format(doi))

                    if self.parent.debug_mod and doi not in self.dico_doi:
                        query.prepare("INSERT INTO debug (doi, title, \
                                      journal, url) VALUES(?, ?, ?, ?)")
                        params = (doi, title, journal_abb, url)

                        for value in params:
                            query.addBindValue(value)
                        query.exec_()

                        self.l.debug("Inserting {0} in table debug".
                                     format(doi))
                    continue

                # Article complete, skip it
                elif doi in self.dico_doi and self.dico_doi[doi]:
                    self.count_futures_images += 1
                    self.count_futures_urls += 1
                    self.l.debug("Skipping {}".format(doi))
                    continue

                # Article not complete, try to complete it
                elif doi in self.dico_doi and not self.dico_doi[doi]:

                    dl_page, dl_image, data = hosts.updateData(company,
                                                               journal,
                                                               entry,
                                                               care_image)

                    if dl_page:
                        self.parent.counter_updates += 1

                        future = self.session_pages.get(url,
                                                        timeout=self.TIMEOUT,
                                                        headers=headers)
                        future.add_done_callback(functools.partial(
                            self.completeData, doi, company, journal,
                            journal_abb, entry))
                        self.list_futures.append(future)

                        # Continue just to be sure. If dl_page is True,
                        # dl_image is likely True too
                        continue

                    elif dl_image:
                        self.parent.counter_updates += 1
                        self.count_futures_urls += 1

                        graphical_abstract = data['graphical_abstract']

                        if os.path.exists(self.DATA_PATH +
                                          functions.simpleChar(
                                              graphical_abstract)):
                            self.count_futures_images += 1
                        else:
                            headers = {'User-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                                       'Connection': 'close',
                                       'Referer': url}

                            future_image = self.session_images.get(
                                graphical_abstract, headers=headers,
                                timeout=self.TIMEOUT)
                            future_image.add_done_callback(functools.partial(
                                self.pictureDownloaded, doi, url))
                            self.list_futures.append(future_image)

                    else:
                        self.count_futures_urls += 1
                        self.count_futures_images += 1
                        continue

                else:

                    self.l.debug("Starting adding new entry")

                    future = self.session_pages.get(url, timeout=self.TIMEOUT,
                                                    headers=headers)
                    future.add_done_callback(functools.partial(
                        self.completeData, doi, company, journal, journal_abb,
                        entry))
                    self.list_futures.append(future)

        # Wait until every future has completed (or parsing was stopped)
        while ((self.count_futures_images + self.count_futures_urls) !=
                len(self.feed.entries) * 2 and self.parent.parsing):
            self.sleep(1)

        if self.parent.parsing:
            if not self.bdd.commit():
                self.l.error(self.bdd.lastError().text())
                self.l.debug("db insertions/modifications: {}".
                             format(self.new_entries_worker))
                self.l.error("Problem when comitting data for {}".
                             format(journal))

        # Free the memory, and clean the remaining futures
        try:
            self.session_pages.executor.shutdown()
        except AttributeError:
            self.l.error("No session_pages to shut down")

        self.session_images.executor.shutdown()
        self.l.debug("Exiting thread for {}".format(journal))
Example 5
    def run(self):
        """Main function. Starts the real business"""

        self.l.debug("Entering worker")

        feed = self._getFeed(timeout=self.TIMEOUT)

        if feed is None:
            self.l.error("Exiting worker, problem w/ the feed")
            self.parent.list_failed_rss.append(self.url_feed)
            return

        # Get the journal name
        journal = feed['feed']['title']

        self.l.info("{}: {}".format(journal, len(feed.entries)))

        # Async session used to download the graphical abstracts
        self.session_images = FuturesSession(
            max_workers=self.MAX_WORKERS, session=self.parent.browsing_session)

        # Get the company and the journal_abb by iterating over the
        # dictionary containing all the data regarding the journals
        # implemented in the program. This dictionary is built in gui.py,
        # to avoid multiple calls to hosts.getJournals.
        # care_image determines if the Worker will try to download the
        # graphical abstracts
        for key, tuple_data in self.dict_journals.items():
            if journal in tuple_data[0]:
                company = key
                index = tuple_data[0].index(journal)
                journal_abb = tuple_data[1][index]
                care_image = tuple_data[3][index]
                break

        try:
            self.dico_doi = self.listDoi(journal_abb)
        except UnboundLocalError:
            self.l.error("Journal not recognized ! Aborting")
            self.parent.list_failed_rss.append(self.url_feed)
            return

        # List the companies for which downloading the article page is
        # not required: all the data are in the RSS page
        company_no_dl = [
            'Science', 'Elsevier', 'Beilstein', 'PLOS', 'ChemArxiv', 'Wiley'
        ]

        query = QtSql.QSqlQuery(self.bdd)

        self.bdd.transaction()

        # The feeds of these companies are complete
        if company in company_no_dl:

            self.counter_futures_urls += len(feed.entries)

            for entry in feed.entries:

                # Get the DOI, a unique identifier for a publication
                try:
                    doi = hosts.getDoi(company, journal, entry)
                except Exception as e:
                    self.l.error("getDoi failed for: {}".format(journal),
                                 exc_info=True)
                    self.counter_futures_urls += 1
                    continue

                try:
                    url = hosts.refineUrl(company, journal, entry)
                except Exception as e:
                    self.l.error("refineUrl failed for: {}".format(journal),
                                 exc_info=True)
                    self.counter_futures_urls += 1
                    continue

                # Reject crappy entries: corrigendum, erratum, etc
                if hosts.reject(entry.title):
                    title = entry.title
                    self.counter_futures_images += 1
                    self.parent.counter_rejected += 1
                    self.l.debug("Rejecting {0}".format(doi))

                    # Insert the crappy articles in a rescue database
                    if self.parent.debug_mod and doi not in self.dico_doi:
                        query.prepare("INSERT INTO debug (doi, title, \
                                      journal, url) VALUES(?, ?, ?, ?)")
                        params = (doi, title, journal_abb, url)
                        self.l.debug(
                            "Inserting {0} in table debug".format(doi))
                        for value in params:
                            query.addBindValue(value)
                        query.exec_()
                    else:
                        continue

                # Article complete, skip it
                elif doi in self.dico_doi and self.dico_doi[doi]:
                    self.counter_futures_images += 1
                    self.l.debug("Article complete, skipping {}".format(doi))
                    continue

                # Article not complete, try to complete it
                elif doi in self.dico_doi and not self.dico_doi[doi]:
                    self.l.debug("Trying to update {}".format(doi))

                    # How to update the entry
                    dl_page, dl_image, data = hosts.updateData(
                        company, journal, entry, care_image)

                    # For these journals, all the info is in the RSS.
                    # Only care about the image
                    if dl_image:
                        self.parent.counter_updates += 1

                        graphical_abstract = data['graphical_abstract']

                        if os.path.exists(
                                self.PATH +
                                functions.simpleChar(graphical_abstract)):
                            self.counter_futures_images += 1
                        else:
                            headers = {
                                'User-agent':
                                'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                                'Connection': 'close',
                                'Referer': url
                            }

                            future_image = self.session_images.get(
                                graphical_abstract,
                                headers=headers,
                                timeout=self.TIMEOUT)
                            future_image.add_done_callback(
                                functools.partial(self.pictureDownloaded, doi,
                                                  url))
                            self.list_futures.append(future_image)

                    else:
                        self.counter_futures_images += 1
                        continue

                # New article, treat it
                else:
                    try:
                        (title, date, authors, abstract, graphical_abstract,
                         url, topic_simple,
                         author_simple) = hosts.getData(company, journal, entry)
                    except Exception as e:
                        self.l.error(
                            "Problem with getData: {}".format(journal),
                            exc_info=True)
                        self.counter_futures_images += 1
                        self.parent.counter_articles_failed += 1
                        return

                    # Rejecting article if no author
                    if authors == "Empty":
                        self.counter_futures_images += 1
                        self.parent.counter_rejected += 1
                        self.l.debug(
                            "Rejecting article {}, no author".format(title))
                        continue

                    query.prepare("INSERT INTO papers (doi, title, date, \
                                  journal, authors, abstract, \
                                  graphical_abstract, url, new, topic_simple, \
                                  author_simple) \
                                  VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")

                    # Set new to 1, not True (stored as an integer)
                    params = (doi, title, date, journal_abb, authors, abstract,
                              graphical_abstract, url, 1, topic_simple,
                              author_simple)

                    for value in params:
                        query.addBindValue(value)

                    # Test that query worked
                    if not query.exec_():
                        self.l.error(
                            "SQL ERROR in run(): {}, company_no_dl".format(
                                query.lastError().text()))
                        self.parent.counter_articles_failed += 1
                        continue
                    else:
                        self.l.debug("{} added to the database".format(doi))
                        self.new_entries_worker += 1
                        self.parent.counter_added += 1

                    # If the article has no graphical abstract or if it has
                    # already been downloaded
                    if graphical_abstract == "Empty" or os.path.exists(
                            self.PATH +
                            functions.simpleChar(graphical_abstract)):

                        self.counter_futures_images += 1

                        # This block is executed when you delete the db but
                        # not the images, allowing the graphical_abstract
                        # field in the db to be updated accordingly
                        if os.path.exists(
                                self.PATH +
                                functions.simpleChar(graphical_abstract)):

                            query.prepare("UPDATE papers SET \
                                          graphical_abstract=? WHERE doi=?")

                            params = (functions.simpleChar(graphical_abstract),
                                      doi)

                            for value in params:
                                query.addBindValue(value)
                            query.exec_()
                    else:
                        headers = {
                            'User-agent':
                            'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                            'Connection': 'close',
                            'Referer': url
                        }

                        future_image = self.session_images.get(
                            graphical_abstract,
                            headers=headers,
                            timeout=self.TIMEOUT)

                        future_image.add_done_callback(
                            functools.partial(self.pictureDownloaded, doi,
                                              url))

                        self.list_futures.append(future_image)

        # This company requires downloading the article's web page
        else:

            headers = {
                'User-agent':
                'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                'Connection': 'close'
            }

            self.session_pages = FuturesSession(
                max_workers=self.MAX_WORKERS,
                session=self.parent.browsing_session)

            for entry in feed.entries:

                # Get the DOI, a unique number for a publication
                try:
                    doi = hosts.getDoi(company, journal, entry)
                except Exception as e:
                    self.l.error("getDoi failed for: {}".format(journal),
                                 exc_info=True)
                    self.counter_futures_urls += 1
                    self.counter_futures_images += 1
                    continue

                # Try to refine the url
                try:
                    url = hosts.refineUrl(company, journal, entry)
                except Exception as e:
                    self.l.error("refineUrl failed for: {}".format(journal),
                                 exc_info=True)
                    self.counter_futures_urls += 1
                    self.counter_futures_images += 1
                    continue

                # Make sure the entry has a title
                try:
                    title = entry.title
                except AttributeError:
                    self.l.error("No title for {}".format(doi), exc_info=True)
                    self.counter_futures_urls += 1
                    self.counter_futures_images += 1
                    continue

                # Reject crappy entries: corrigendum, erratum, etc
                if hosts.reject(title):
                    self.counter_futures_images += 1
                    self.counter_futures_urls += 1
                    self.parent.counter_rejected += 1
                    self.l.debug("Rejecting {0}".format(doi))

                    if self.parent.debug_mod and doi not in self.dico_doi:
                        query.prepare("INSERT INTO debug (doi, title, \
                                      journal, url) VALUES(?, ?, ?, ?)")
                        params = (doi, title, journal_abb, url)

                        for value in params:
                            query.addBindValue(value)
                        query.exec_()

                        self.l.debug(
                            "Inserting {0} in table debug".format(doi))
                    continue

                # Article complete, skip it
                elif doi in self.dico_doi and self.dico_doi[doi]:
                    self.counter_futures_images += 1
                    self.counter_futures_urls += 1
                    self.l.debug("Article complete, skipping {}".format(doi))
                    continue

                # Article not complete, try to complete it
                elif doi in self.dico_doi and not self.dico_doi[doi]:

                    dl_page, dl_image, data = hosts.updateData(
                        company, journal, entry, care_image)

                    if dl_page:
                        self.parent.counter_updates += 1

                        future = self.session_pages.get(url,
                                                        timeout=self.TIMEOUT,
                                                        headers=headers)
                        future.add_done_callback(
                            functools.partial(self.completeData, doi, company,
                                              journal, journal_abb, entry))
                        self.list_futures.append(future)

                        # Continue just to be sure. If dl_page is True,
                        # dl_image is likely True too
                        continue

                    elif dl_image:
                        self.parent.counter_updates += 1
                        self.counter_futures_urls += 1

                        graphical_abstract = data['graphical_abstract']

                        if os.path.exists(
                                self.PATH +
                                functions.simpleChar(graphical_abstract)):
                            self.counter_futures_images += 1
                        else:
                            headers = {
                                'User-agent':
                                'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                                'Connection': 'close',
                                'Referer': url
                            }

                            future_image = self.session_images.get(
                                graphical_abstract,
                                headers=headers,
                                timeout=self.TIMEOUT)
                            future_image.add_done_callback(
                                functools.partial(self.pictureDownloaded, doi,
                                                  url))
                            self.list_futures.append(future_image)

                    else:
                        self.counter_futures_urls += 1
                        self.counter_futures_images += 1
                        continue

                # New article, treat it
                else:

                    self.l.debug("Starting adding new entry")

                    future = self.session_pages.get(url,
                                                    timeout=self.TIMEOUT,
                                                    headers=headers)
                    future.add_done_callback(
                        functools.partial(self.completeData, doi, company,
                                          journal, journal_abb, entry))
                    self.list_futures.append(future)

        # Wait until every future has completed (or parsing was stopped)
        while ((self.counter_futures_images + self.counter_futures_urls) !=
               len(feed.entries) * 2 and self.parent.parsing):
            self.msleep(500)  # QThread.sleep() takes whole seconds

        if self.parent.parsing:
            if not self.bdd.commit():
                self.l.error(self.bdd.lastError().text())
                self.l.debug("db insertions/modifications: {}".format(
                    self.new_entries_worker))
                self.l.error(
                    "Problem when committing data for {}".format(journal))

        # Free the memory, and clean the remaining futures
        try:
            self.session_pages.executor.shutdown()
        except AttributeError:
            self.l.error("No session_pages to shut down")

        self.session_images.executor.shutdown()
        self.l.debug("Exiting thread for {}".format(journal))