Example #1
def test_getData(journalsUrls):
    """Tests the function getData. For each journal of each company,
    tests LENGTH_SAMPLE entries"""

    print("\n")
    print("Starting test getData")

    # Returns a list of the urls of the feed pages
    list_urls_feed = journalsUrls

    # TODO: comment or uncomment
    # Bypass all companies but one
    # list_urls_feed = hosts.getJournals("acs")[2]

    # Build a dict with key: company,
    # value: list of journal names
    dict_journals = {}
    for company in os.listdir("journals"):
        company = company.split(".")[0]
        dict_journals[company] = hosts.getJournals(company)[0]

    # All the journals are tested
    for site in list_urls_feed:

        print("Site {} of {}".format(list_urls_feed.index(site) + 1, len(list_urls_feed)))

        feed = feedparser.parse(site)
        journal = feed["feed"]["title"]

        # Get the company name
        for publisher, data in dict_journals.items():
            if journal in data:
                company = publisher

        print("\n")
        print(journal)

        if len(feed.entries) < LENGTH_SAMPLE:
            samples = feed.entries
        else:
            samples = random.sample(feed.entries, LENGTH_SAMPLE)

        # Tests LENGTH_SAMPLE entries for a journal, not all of them
        for entry in samples:

            if company in ["science", "elsevier", "beilstein"]:
                title, date, authors, abstract, graphical_abstract, url, topic_simple, author_simple = hosts.getData(
                    company, journal, entry
                )
            else:
                if company == "acs":
                    url = getattr(entry, "feedburner_origlink", entry.link).split("/")[-1]
                    url = "http://pubs.acs.org/doi/abs/10.1021/" + url
                else:
                    url = getattr(entry, "feedburner_origlink", entry.link)

                try:
                    response = requests.get(url, timeout=10)
                    title, date, authors, abstract, graphical_abstract, url, topic_simple, author_simple = hosts.getData(
                        company, journal, entry, response
                    )
                except requests.exceptions.ReadTimeout:
                    print("A ReadTimeout occurred, skipping to next entry")
                    continue

            print(title)
            print(url)
            print(graphical_abstract)
            print(date)
            print("\n")

            assert type(abstract) == str and abstract

            assert type(url) == str and url
            if url != "Empty":
                # Test if url is valid
                assert validators.url(url) is True

            assert type(graphical_abstract) == str and graphical_abstract
            if graphical_abstract != "Empty":
                assert validators.url(graphical_abstract) is True

            assert type(arrow.get(date)) == arrow.arrow.Arrow

            assert topic_simple.startswith(" ") is True
            assert topic_simple.endswith(" ") is True

            if author_simple is not None:
                assert author_simple.startswith(" ") is True
                assert author_simple.endswith(" ") is True
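
Note: both tests receive journalsUrls as a pytest fixture that is not part of this snippet. A minimal sketch of what it could look like, assuming hosts.getCompanies() lists the companies and hosts.getJournals(company)[2] is the list of RSS feed URLs (the same convention the bypass lines above rely on):

import pytest

import hosts


@pytest.fixture
def journalsUrls():
    """Hypothetical fixture: collect the feed URLs of every company.
    Assumes hosts.getJournals(company)[2] is the list of feed URLs."""
    urls = []
    for company in hosts.getCompanies():
        urls += hosts.getJournals(company)[2]
    return urls
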
Example #2
def test_getData(journalsUrls):
    """Tests the function getData. For each journal of each company,
    tests LENGTH_SAMPLE entries"""

    l.info("Starting test getData")

    start_time = datetime.datetime.now()

    # Returns a list of the urls of the feed pages
    list_urls_feed = journalsUrls

    # TODO: comment or uncomment
    # Bypass all companies but one
    # list_urls_feed = hosts.getJournals("ChemRxiv")[2]

    # Build a dict with key: company,
    # value: list of journal names
    dict_journals = {}

    # Build a dictionary to store the results of the tests, by company
    dict_res_by_company = {}

    for company in hosts.getCompanies():
        dict_journals[company] = hosts.getJournals(company)[0]

        res = {
            'count_abs_empty': 0,
            'count_image_empty': 0,
            'count_articles_tested': 0,
            'count_articles_untested': 0,
            'count_journals_untested': 0,
            'count_redirections': 0,
        }

        dict_res_by_company[company] = res

    s = requests.session()

    # All the journals are tested
    for site in list_urls_feed:

        l.info("Site {} of {} \n".format(
            list_urls_feed.index(site) + 1, len(list_urls_feed)))

        # Get the RSS page of the url provided
        try:
            feed = feedparser.parse(site, timeout=20)
            journal = feed['feed']['title']
            l.debug("RSS page successfully dled")
        except Exception as e:
            # NOTE: company still holds its value from the previous
            # iteration here, so the failure may be counted against the
            # wrong company
            dict_res_by_company[company]['count_journals_untested'] += 1
            l.error("RSS page could not be downloaded: {}".format(e),
                    exc_info=True)
            continue

        # Get the company name
        for publisher, data in dict_journals.items():
            if journal in data:
                company = publisher

        l.info(journal)

        if len(feed.entries) < LENGTH_SAMPLE:
            samples = feed.entries
        else:
            samples = random.sample(feed.entries, LENGTH_SAMPLE)

        # Tests LENGTH_SAMPLE entries for a journal, not all of them
        for entry in samples:

            if company in ['Science', 'Elsevier', 'Beilstein', 'PLOS']:
                title, date, authors, abstract, graphical_abstract, url, topic_simple, author_simple = hosts.getData(
                    company, journal, entry)
            else:
                url = hosts.refineUrl(company, journal, entry)

                try:
                    response = s.get(url, timeout=10, headers=HEADERS)
                    title, date, authors, abstract, graphical_abstract, url, topic_simple, author_simple = hosts.getData(
                        company, journal, entry, response)
                except Exception as e:
                    dict_res_by_company[company][
                        'count_articles_untested'] += 1
                    l.error(
                        "A problem occured: {}, continue to next entry".format(
                            e),
                        exc_info=True)
                    continue

            dict_res_by_company[company]['count_articles_tested'] += 1

            l.info("Title: {}".format(title))
            l.info("URL: {}".format(url))
            l.info("Image: {}".format(graphical_abstract))
            l.info("Date: {}".format(date))

            # Count and try to detect suspiciously high numbers of
            # empty results
            if abstract == "Empty":
                dict_res_by_company[company]['count_abs_empty'] += 1
            if graphical_abstract == "Empty":
                dict_res_by_company[company]['count_image_empty'] += 1

            try:
                if response.history:
                    dict_res_by_company[company]['count_redirections'] += 1
                    l.debug("Request was redirected")
                    for resp in response.history:
                        l.debug("Status code, URL: {}, {}".format(
                            resp.status_code, resp.url))
                    l.debug("Final destination:")
                    l.debug("Status code, URL: {}, {} \n".format(
                        resp.status_code, response.url))
                else:
                    l.debug("Request was not redirected \n")
            except UnboundLocalError:
                pass

            # ------------------------ ASSERT SECTION -------------------------

            logAssert(
                type(abstract) == str and abstract,
                "Abstract missing or not a string {}".format(abstract))

            logAssert(
                type(url) == str and url,
                "URL is missing or is not a string {}".format(url))

            # Test if url is valid
            if url != 'Empty':
                logAssert(
                    validators.url(url) is True,
                    "URL is a string but is not a URL {}".format(url))

            # For ACS and Nature, check if the URL is the abstract page's URL
            if company in ['ACS', 'Nature']:
                logAssert(
                    'abs' in url,
                    "company is {}, but URL doesn't contain 'abs' {}".format(
                        company, url))

            logAssert(
                type(graphical_abstract) == str and graphical_abstract,
                "graphical_abstract is missing or not a string {}".format(
                    graphical_abstract))

            if graphical_abstract != 'Empty':
                logAssert(
                    validators.url(graphical_abstract) is True,
                    "graphical_abstract is a string but is not a URL {}".
                    format(graphical_abstract))

            logAssert(
                type(arrow.get(date)) == arrow.arrow.Arrow,
                "The date is not really a date {}".format(date))

            logAssert(
                topic_simple.startswith(' ') is True,
                "Topic doesn't start with space {}".format(topic_simple))

            logAssert(
                topic_simple.endswith(' ') is True,
                "Topic doesn't end with space {}".format(topic_simple))

            if author_simple is not None:
                logAssert(
                    author_simple.startswith(' ') is True,
                    "author_simple doesn't start with space {}".format(
                        author_simple))
                logAssert(
                    author_simple.endswith(' ') is True,
                    "author_simple doesn't end with space {}".format(
                        author_simple))

    pprint(dict_res_by_company)

    # Count results
    count_abs_empty = 0
    count_image_empty = 0
    count_articles_tested = 0
    count_articles_untested = 0
    count_journals_untested = 0
    count_redirections = 0

    for company in dict_res_by_company:
        count_abs_empty += dict_res_by_company[company]['count_abs_empty']
        count_image_empty += dict_res_by_company[company]['count_image_empty']
        count_articles_tested += dict_res_by_company[company][
            'count_articles_tested']
        count_articles_untested += dict_res_by_company[company][
            'count_articles_untested']
        count_journals_untested += dict_res_by_company[company][
            'count_journals_untested']
        count_redirections += dict_res_by_company[company][
            'count_redirections']

    l.debug("Number of untested jounals: {} / {}".format(
        count_journals_untested, len(list_urls_feed)))

    l.debug("Number of test/untested articles: {} / {}".format(
        count_articles_tested, count_articles_untested))

    l.debug("Number of Empty abstracts: {}".format(count_abs_empty))

    l.debug(
        "Number of Empty graphical_abstracts: {}".format(count_image_empty))

    l.debug("Number of redirections: {}".format(count_redirections))

    l.debug("Time spent in test_getData: {}".format(datetime.datetime.now() -
                                                    start_time))
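
Note: logAssert is not defined in this snippet. A minimal sketch, assuming it behaves like a plain assert that first routes the failure message through the logger l, so failed checks also show up in the log file:

def logAssert(test, msg):
    """Hypothetical helper: log msg and fail if test is falsy."""
    if not test:
        l.error(msg)
        raise AssertionError(msg)
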
Example #3
    def run(self):
        """Main function. Starts the real business"""

        self.l.debug("Entering worker")
        self.l.debug(self.url_feed)

        # Get the RSS page of the url provided
        try:
            self.feed = feedparser.parse(self.url_feed)
            self.l.debug("RSS page successfully dled")
        except OSError:
            self.l.error("Too many files open, could not start the thread !")
            return

        # Get the journal name
        try:
            journal = self.feed['feed']['title']
        except KeyError:
            self.l.critical("No title for the journal ! Aborting")
            self.l.critical(self.url_feed)
            return

        self.l.info("{0}: {1}".format(journal, len(self.feed.entries)))

        # Lists to check if the post is in the db, and if
        # it has all the info
        self.session_images = FuturesSession(max_workers=20)

        # Get the company and the journal_abb by scanning the dictionary
        # containing all the data regarding the journals implemented in the
        # program. This dictionary is built in gui.py, to avoid multiple calls
        # to hosts.getJournals.
        # care_image determines if the Worker will try to dl the graphical
        # abstracts
        for key, tuple_data in self.dict_journals.items():
            if journal in tuple_data[0]:
                company = key
                index = tuple_data[0].index(journal)
                journal_abb = tuple_data[1][index]
                care_image = tuple_data[3][index]
                break

        try:
            self.dico_doi = self.listDoi(journal_abb)
        except UnboundLocalError:
            self.l.error("Journal not recognized ! Aborting")
            return

        # Create a list of the journals for which a dl of the article
        # page is not required. All the data are in the rss page
        company_no_dl = ['science', 'elsevier', 'beilstein', 'plos']

        query = QtSql.QSqlQuery(self.bdd)

        self.bdd.transaction()

        # The feeds of these journals are complete
        if company in company_no_dl:

            self.count_futures_urls += len(self.feed.entries)

            for entry in self.feed.entries:

                # Get the DOI, a unique number for a publication
                doi = hosts.getDoi(company, journal, entry)
                url = getattr(entry, 'feedburner_origlink', entry.link)

                # Reject crappy entries: corrigendum, erratum, etc
                if hosts.reject(entry.title):
                    title = entry.title
                    self.count_futures_images += 1
                    self.parent.counter_rejected += 1
                    self.l.debug("Rejecting {0}".format(doi))

                    # Insert the crappy articles in a rescue database
                    if self.parent.debug_mod and doi not in self.dico_doi:
                        query.prepare("INSERT INTO debug (doi, title, \
                                      journal, url) VALUES(?, ?, ?, ?)")
                        params = (doi, title, journal_abb, url)
                        self.l.debug("Inserting {0} in table debug".
                                     format(doi))
                        for value in params:
                            query.addBindValue(value)
                        query.exec_()
                    else:
                        continue

                # Article complete, skip it
                elif doi in self.dico_doi and self.dico_doi[doi]:
                    self.count_futures_images += 1
                    self.l.debug("Skipping {}".format(doi))
                    continue

                # Article not complete, try to complete it
                elif doi in self.dico_doi and not self.dico_doi[doi]:

                    # How to update the entry
                    dl_page, dl_image, data = hosts.updateData(company,
                                                               journal,
                                                               entry,
                                                               care_image)

                    # For these journals, all the info is in the RSS.
                    # Only care about the image
                    if dl_image:
                        self.parent.counter_updates += 1

                        graphical_abstract = data['graphical_abstract']

                        if os.path.exists(self.DATA_PATH +
                                          functions.simpleChar(
                                              graphical_abstract)):
                            self.count_futures_images += 1
                        else:
                            headers = {'User-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                                       'Connection': 'close',
                                       'Referer': url}

                            future_image = self.session_images.get(
                                graphical_abstract, headers=headers,
                                timeout=self.TIMEOUT)
                            future_image.add_done_callback(functools.partial(
                                self.pictureDownloaded, doi, url))
                            self.list_futures.append(future_image)

                    else:
                        self.count_futures_images += 1
                        continue

                else:
                    try:
                        title, date, authors, abstract, graphical_abstract, url, topic_simple, author_simple = hosts.getData(company, journal, entry)
                    except TypeError:
                        self.l.error("getData returned None for {}".
                                     format(journal))
                        self.count_futures_images += 1
                        return

                    # Rejecting article if no author
                    if authors == "Empty":
                        self.count_futures_images += 1
                        self.parent.counter_rejected += 1
                        self.l.debug("Rejecting article {}, no author".
                                     format(title))
                        continue

                    query.prepare("INSERT INTO papers (doi, title, date, \
                                  journal, authors, abstract, \
                                  graphical_abstract, url, new, topic_simple, \
                                  author_simple) \
                                   VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")

                    # Set new to 1 rather than True
                    params = (doi, title, date, journal_abb, authors, abstract,
                              graphical_abstract, url, 1, topic_simple, author_simple)

                    self.l.debug("Adding {0} to the database".format(doi))
                    self.parent.counter += 1
                    self.new_entries_worker += 1

                    for value in params:
                        query.addBindValue(value)
                    query.exec_()

                    if graphical_abstract == "Empty" or os.path.exists(
                            self.DATA_PATH +
                            functions.simpleChar(graphical_abstract)):

                        self.count_futures_images += 1

                        # This block is executed when you delete the db, but
                        # not the images. It allows the graphical_abstract
                        # in db to be updated accordingly
                        if os.path.exists(self.DATA_PATH +
                                          functions.simpleChar(
                                              graphical_abstract)):

                            query.prepare("UPDATE papers SET \
                                          graphical_abstract=? WHERE doi=?")

                            params = (functions.simpleChar(graphical_abstract),
                                      doi)

                            for value in params:
                                query.addBindValue(value)
                            query.exec_()
                    else:
                        headers = {'User-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                                   'Connection': 'close',
                                   'Referer': url}

                        future_image = self.session_images.get(
                            graphical_abstract, headers=headers,
                            timeout=self.TIMEOUT)
                        future_image.add_done_callback(
                            functools.partial(self.pictureDownloaded,
                                              doi, url))
                        self.list_futures.append(future_image)

        else:

            headers = {'User-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                       'Connection': 'close'}

            self.session_pages = FuturesSession(max_workers=20)

            for entry in self.feed.entries:

                doi = hosts.getDoi(company, journal, entry)

                if company == 'acs':
                    url = getattr(entry, 'feedburner_origlink',
                                  entry.link).split('/')[-1]
                    url = "http://pubs.acs.org/doi/abs/10.1021/" + url

                elif company == 'npg':
                    url = getattr(entry, 'feedburner_origlink',
                                  entry.link).split('/')[-1]
                    url = "http://www.nature.com/nature/journal/vaop/ncurrent/abs/" + url + ".html"
                else:
                    url = getattr(entry, 'feedburner_origlink', entry.link)

                # Reject crappy entries: corrigendum, erratum, etc
                if hosts.reject(entry.title):
                    title = entry.title
                    self.count_futures_images += 1
                    self.count_futures_urls += 1
                    self.parent.counter_rejected += 1
                    self.l.debug("Rejecting {0}".format(doi))

                    if self.parent.debug_mod and doi not in self.dico_doi:
                        query.prepare("INSERT INTO debug (doi, title, \
                                      journal, url) VALUES(?, ?, ?, ?)")
                        params = (doi, title, journal_abb, url)

                        for value in params:
                            query.addBindValue(value)
                        query.exec_()

                        self.l.debug("Inserting {0} in table debug".
                                     format(doi))
                    continue

                # Article complete, skip it
                elif doi in self.dico_doi and self.dico_doi[doi]:
                    self.count_futures_images += 1
                    self.count_futures_urls += 1
                    self.l.debug("Skipping {}".format(doi))
                    continue

                # Article not complete, try to complete it
                elif doi in self.dico_doi and not self.dico_doi[doi]:

                    dl_page, dl_image, data = hosts.updateData(company,
                                                               journal,
                                                               entry,
                                                               care_image)

                    if dl_page:
                        self.parent.counter_updates += 1

                        future = self.session_pages.get(url,
                                                        timeout=self.TIMEOUT,
                                                        headers=headers)
                        future.add_done_callback(functools.partial(
                            self.completeData, doi, company, journal,
                            journal_abb, entry))
                        self.list_futures.append(future)

                        # Continue just to be sure. If dl_page is True,
                        # dl_image is likely True too
                        continue

                    elif dl_image:
                        self.parent.counter_updates += 1
                        self.count_futures_urls += 1

                        graphical_abstract = data['graphical_abstract']

                        if os.path.exists(self.DATA_PATH +
                                          functions.simpleChar(
                                              graphical_abstract)):
                            self.count_futures_images += 1
                        else:
                            headers = {'User-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                                       'Connection': 'close',
                                       'Referer': url}

                            future_image = self.session_images.get(
                                graphical_abstract, headers=headers,
                                timeout=self.TIMEOUT)
                            future_image.add_done_callback(functools.partial(
                                self.pictureDownloaded, doi, url))
                            self.list_futures.append(future_image)

                    else:
                        self.count_futures_urls += 1
                        self.count_futures_images += 1
                        continue

                else:

                    self.l.debug("Starting adding new entry")

                    future = self.session_pages.get(url, timeout=self.TIMEOUT,
                                                    headers=headers)
                    future.add_done_callback(functools.partial(
                        self.completeData, doi, company, journal, journal_abb,
                        entry))
                    self.list_futures.append(future)

        # Check if the counters are full
        while ((self.count_futures_images + self.count_futures_urls) !=
                len(self.feed.entries) * 2 and self.parent.parsing):
            self.sleep(1)

        if self.parent.parsing:
            if not self.bdd.commit():
                self.l.error(self.bdd.lastError().text())
                self.l.debug("db insertions/modifications: {}".
                             format(self.new_entries_worker))
                self.l.error("Problem when comitting data for {}".
                             format(journal))

        # Free the memory, and clean the remaining futures
        try:
            self.session_pages.executor.shutdown()
        except AttributeError:
            self.l.error("No session_pages to shut down")

        self.session_images.executor.shutdown()
        self.l.debug("Exiting thread for {}".format(journal))
Example #4
    def completeData(self, doi, company, journal, journal_abb, entry, future):
        """Callback to handle the response of the futures trying to
        download the page of the articles"""

        self.l.debug("Page dled")
        self.count_futures_urls += 1

        if not self.parent.parsing:
            return

        try:
            response = future.result()
        except requests.exceptions.ReadTimeout:
            self.l.error("ReadTimeout for {}".format(journal))
            self.count_futures_images += 1
            return
        except requests.exceptions.ConnectionError:
            self.l.error("ConnectionError for {}".format(journal))
            self.count_futures_images += 1
            return
        except ConnectionResetError:
            self.l.error("ConnectionResetError for {}".format(journal))
            self.count_futures_images += 1
            return
        except socket.timeout:
            self.l.error("socket.timeout for {}".format(journal))
            self.count_futures_images += 1
            return
        except concurrent.futures._base.CancelledError:
            self.l.error("future cancelled for {}".format(journal))
            self.count_futures_images += 1
            return
        except Exception as e:
            self.l.error("Unknown exception {} for {}".format(e, journal))
            self.l.error(traceback.format_exc())
            self.count_futures_images += 1
            return

        query = QtSql.QSqlQuery(self.bdd)

        try:
            title, date, authors, abstract, graphical_abstract, url, topic_simple, author_simple = hosts.getData(company, journal, entry, response)
        except TypeError:
            self.l.error("getData returned None for {}".format(journal))
            self.count_futures_images += 1
            return
        except Exception as e:
            self.l.error("Unknown exception completeData {}".format(e))
            self.l.error(traceback.format_exc())
            self.count_futures_images += 1
            return

        # Rejecting the article if no authors
        if authors == "Empty":
            self.count_futures_images += 1
            self.parent.counter_rejected += 1
            self.l.debug("Rejecting article {}, no author".format(title))
            return

        # Check if the DOI is already in the db. Mandatory, because sometimes
        # updateData will tell the worker to dl the page before downloading
        # the picture
        if doi not in self.dico_doi:
            query.prepare("INSERT INTO papers (doi, title, date, journal, \
                          authors, abstract, graphical_abstract, url, new, \
                          topic_simple, author_simple) VALUES(?, ?, ?, ?, ?, \
                          ?, ?, ?, ?, ?, ?)")

            params = (doi, title, date, journal_abb, authors, abstract,
                      graphical_abstract, url, 1, topic_simple, author_simple)

            self.l.debug("Adding {0} to the database".format(doi))
            self.parent.counter += 1

            for value in params:
                query.addBindValue(value)

            query.exec_()

        self.new_entries_worker += 1

        # Don't try to dl the image if its url is 'Empty', or if the image
        # already exists
        if (graphical_abstract == "Empty" or
                os.path.exists(self.DATA_PATH +
                               functions.simpleChar(graphical_abstract))):
            self.count_futures_images += 1
            self.l.debug("Image already dled or Empty")

            # This block is executed when you delete the db, but not the
            # images. It allows the graphical_abstract in db to be
            # updated accordingly
            if os.path.exists(self.DATA_PATH +
                              functions.simpleChar(graphical_abstract)):
                query.prepare("UPDATE papers SET graphical_abstract=? WHERE \
                              doi=?")
                params = (functions.simpleChar(graphical_abstract), doi)
                for value in params:
                    query.addBindValue(value)
                query.exec_()
        else:
            self.l.debug("Page dled, adding future image")
            headers = {'User-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                       'Connection': 'close',
                       'Referer': url}

            future_image = self.session_images.get(graphical_abstract,
                                                   headers=headers,
                                                   timeout=self.TIMEOUT)
            future_image.add_done_callback(functools.partial(
                self.pictureDownloaded, doi, url))
            self.list_futures.append(future_image)
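
Note on the wiring: add_done_callback calls its argument with the finished future as the only parameter, so functools.partial(self.completeData, doi, company, journal, journal_abb, entry) makes future arrive as the last argument of completeData above. A self-contained sketch of that ordering:

import functools
from concurrent.futures import ThreadPoolExecutor

def callback(tag, future):
    # tag was bound by functools.partial; future is appended by
    # add_done_callback once the task finishes
    print(tag, future.result())

with ThreadPoolExecutor(max_workers=1) as executor:
    future = executor.submit(lambda: 42)
    future.add_done_callback(functools.partial(callback, "done:"))
# prints: done: 42
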
Example #5
    def run(self):
        """Main function. Starts the real business"""

        self.l.debug("Entering worker")

        feed = self._getFeed(timeout=self.TIMEOUT)

        if feed is None:
            self.l.error("Exiting worker, problem w/ the feed")
            self.parent.list_failed_rss.append(self.url_feed)
            return

        # Get the journal name
        journal = feed['feed']['title']

        self.l.info("{}: {}".format(journal, len(feed.entries)))

        # Lists to check if the post is in the db, and if
        # it has all the info
        self.session_images = FuturesSession(
            max_workers=self.MAX_WORKERS, session=self.parent.browsing_session)

        # Get the company and the journal_abb by scanning the dictionary
        # containing all the data regarding the journals implemented in the
        # program. This dictionary is built in gui.py, to avoid multiple calls
        # to hosts.getJournals
        # care_image determines if the Worker will try to dl the graphical
        # abstracts
        for key, tuple_data in self.dict_journals.items():
            if journal in tuple_data[0]:
                company = key
                index = tuple_data[0].index(journal)
                journal_abb = tuple_data[1][index]
                care_image = tuple_data[3][index]
                break

        try:
            self.dico_doi = self.listDoi(journal_abb)
        except UnboundLocalError:
            self.l.error("Journal not recognized ! Aborting")
            self.parent.list_failed_rss.append(self.url_feed)
            return

        # Create a list of the journals for which a dl of the article
        # page is not required. All the data are in the rss page
        company_no_dl = [
            'Science', 'Elsevier', 'Beilstein', 'PLOS', 'ChemArxiv', 'Wiley'
        ]

        query = QtSql.QSqlQuery(self.bdd)

        self.bdd.transaction()

        # The feeds of these journals are complete
        if company in company_no_dl:

            self.counter_futures_urls += len(feed.entries)

            for entry in feed.entries:

                # Get the DOI, a unique number for a publication
                try:
                    doi = hosts.getDoi(company, journal, entry)
                except Exception as e:
                    self.l.error("getDoi failed for: {}".format(journal),
                                 exc_info=True)
                    self.counter_futures_urls += 1
                    continue

                try:
                    url = hosts.refineUrl(company, journal, entry)
                except Exception as e:
                    self.l.error("refineUrl failed for: {}".format(journal),
                                 exc_info=True)
                    self.counter_futures_urls += 1
                    continue

                # Reject crappy entries: corrigendum, erratum, etc
                if hosts.reject(entry.title):
                    title = entry.title
                    self.counter_futures_images += 1
                    self.parent.counter_rejected += 1
                    self.l.debug("Rejecting {0}".format(doi))

                    # Insert the crappy articles in a rescue database
                    if self.parent.debug_mod and doi not in self.dico_doi:
                        query.prepare("INSERT INTO debug (doi, title, \
                                      journal, url) VALUES(?, ?, ?, ?)")
                        params = (doi, title, journal_abb, url)
                        self.l.debug(
                            "Inserting {0} in table debug".format(doi))
                        for value in params:
                            query.addBindValue(value)
                        query.exec_()
                    else:
                        continue

                # Article complete, skip it
                elif doi in self.dico_doi and self.dico_doi[doi]:
                    self.counter_futures_images += 1
                    self.l.debug("Article complete, skipping {}".format(doi))
                    continue

                # Article not complete, try to complete it
                elif doi in self.dico_doi and not self.dico_doi[doi]:
                    self.l.debug("Trying to update {}".format(doi))

                    # How to update the entry
                    dl_page, dl_image, data = hosts.updateData(
                        company, journal, entry, care_image)

                    # For these journals, all the info is in the RSS.
                    # Only care about the image
                    if dl_image:
                        self.parent.counter_updates += 1

                        graphical_abstract = data['graphical_abstract']

                        if os.path.exists(
                                self.PATH +
                                functions.simpleChar(graphical_abstract)):
                            self.counter_futures_images += 1
                        else:
                            headers = {
                                'User-agent':
                                'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                                'Connection': 'close',
                                'Referer': url
                            }

                            future_image = self.session_images.get(
                                graphical_abstract,
                                headers=headers,
                                timeout=self.TIMEOUT)
                            future_image.add_done_callback(
                                functools.partial(self.pictureDownloaded, doi,
                                                  url))
                            self.list_futures.append(future_image)

                    else:
                        self.counter_futures_images += 1
                        continue

                # New article, treat it
                else:
                    try:
                        title, date, authors, abstract, graphical_abstract, url, topic_simple, author_simple = hosts.getData(
                            company, journal, entry)
                    except Exception as e:
                        self.l.error(
                            "Problem with getData: {}".format(journal),
                            exc_info=True)
                        self.counter_futures_images += 1
                        self.parent.counter_articles_failed += 1
                        return

                    # Rejecting article if no author
                    if authors == "Empty":
                        self.counter_futures_images += 1
                        self.parent.counter_rejected += 1
                        self.l.debug(
                            "Rejecting article {}, no author".format(title))
                        continue

                    query.prepare("INSERT INTO papers (doi, title, date, \
                                  journal, authors, abstract, \
                                  graphical_abstract, url, new, topic_simple, \
                                  author_simple) \
                                  VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")

                    # Set new to 1 rather than True
                    params = (doi, title, date, journal_abb, authors, abstract,
                              graphical_abstract, url, 1, topic_simple,
                              author_simple)

                    for value in params:
                        query.addBindValue(value)

                    # Test that query worked
                    if not query.exec_():
                        self.l.error(
                            "SQL ERROR in run(): {}, company_no_dl".format(
                                query.lastError().text()))
                        self.parent.counter_articles_failed += 1
                        continue
                    else:
                        self.l.debug("{} added to the database".format(doi))
                        self.new_entries_worker += 1
                        self.parent.counter_added += 1

                    # If article has no graphical abstract or if it has
                    # been dled
                    if graphical_abstract == "Empty" or os.path.exists(
                            self.PATH +
                            functions.simpleChar(graphical_abstract)):

                        self.counter_futures_images += 1

                        # This block is executed when you delete the db, but
                        # not the images. It allows the graphical_abstract
                        # in db to be updated accordingly
                        if os.path.exists(
                                self.PATH +
                                functions.simpleChar(graphical_abstract)):

                            query.prepare("UPDATE papers SET \
                                          graphical_abstract=? WHERE doi=?")

                            params = (functions.simpleChar(graphical_abstract),
                                      doi)

                            for value in params:
                                query.addBindValue(value)
                            query.exec_()
                    else:
                        headers = {
                            'User-agent':
                            'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                            'Connection': 'close',
                            'Referer': url
                        }

                        future_image = self.session_images.get(
                            graphical_abstract,
                            headers=headers,
                            timeout=self.TIMEOUT)

                        future_image.add_done_callback(
                            functools.partial(self.pictureDownloaded, doi,
                                              url))

                        self.list_futures.append(future_image)

        # The company requires to download the article's web page
        else:

            headers = {
                'User-agent':
                'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                'Connection': 'close'
            }

            self.session_pages = FuturesSession(
                max_workers=self.MAX_WORKERS,
                session=self.parent.browsing_session)

            for entry in feed.entries:

                # Get the DOI, a unique number for a publication
                try:
                    doi = hosts.getDoi(company, journal, entry)
                except Exception as e:
                    self.l.error("getDoi failed for: {}".format(journal),
                                 exc_info=True)
                    self.counter_futures_urls += 1
                    self.counter_futures_images += 1
                    continue

                # Try to refine the url
                try:
                    url = hosts.refineUrl(company, journal, entry)
                except Exception as e:
                    self.l.error("refineUrl failed for: {}".format(journal),
                                 exc_info=True)
                    self.counter_futures_urls += 1
                    self.counter_futures_images += 1
                    continue

                # Make sure the entry has a title
                try:
                    title = entry.title
                except AttributeError:
                    self.l.error("No title for {}".format(doi), exc_info=True)
                    self.counter_futures_urls += 1
                    self.counter_futures_images += 1
                    continue

                # Reject crappy entries: corrigendum, erratum, etc
                if hosts.reject(title):
                    self.counter_futures_images += 1
                    self.counter_futures_urls += 1
                    self.parent.counter_rejected += 1
                    self.l.debug("Rejecting {0}".format(doi))

                    if self.parent.debug_mod and doi not in self.dico_doi:
                        query.prepare("INSERT INTO debug (doi, title, \
                                      journal, url) VALUES(?, ?, ?, ?)")
                        params = (doi, title, journal_abb, url)

                        for value in params:
                            query.addBindValue(value)
                        query.exec_()

                        self.l.debug(
                            "Inserting {0} in table debug".format(doi))
                    continue

                # Article complete, skip it
                elif doi in self.dico_doi and self.dico_doi[doi]:
                    self.counter_futures_images += 1
                    self.counter_futures_urls += 1
                    self.l.debug("Article complete, skipping {}".format(doi))
                    continue

                # Article not complete, try to complete it
                elif doi in self.dico_doi and not self.dico_doi[doi]:

                    dl_page, dl_image, data = hosts.updateData(
                        company, journal, entry, care_image)

                    if dl_page:
                        self.parent.counter_updates += 1

                        future = self.session_pages.get(url,
                                                        timeout=self.TIMEOUT,
                                                        headers=headers)
                        future.add_done_callback(
                            functools.partial(self.completeData, doi, company,
                                              journal, journal_abb, entry))
                        self.list_futures.append(future)

                        # Continue just to be sure. If dl_page is True,
                        # dl_image is likely True too
                        continue

                    elif dl_image:
                        self.parent.counter_updates += 1
                        self.counter_futures_urls += 1

                        graphical_abstract = data['graphical_abstract']

                        if os.path.exists(
                                self.PATH +
                                functions.simpleChar(graphical_abstract)):
                            self.counter_futures_images += 1
                        else:
                            headers = {
                                'User-agent':
                                'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                                'Connection': 'close',
                                'Referer': url
                            }

                            future_image = self.session_images.get(
                                graphical_abstract,
                                headers=headers,
                                timeout=self.TIMEOUT)
                            future_image.add_done_callback(
                                functools.partial(self.pictureDownloaded, doi,
                                                  url))
                            self.list_futures.append(future_image)

                    else:
                        self.counter_futures_urls += 1
                        self.counter_futures_images += 1
                        continue

                # New article, treat it
                else:

                    self.l.debug("Starting adding new entry")

                    future = self.session_pages.get(url,
                                                    timeout=self.TIMEOUT,
                                                    headers=headers)
                    future.add_done_callback(
                        functools.partial(self.completeData, doi, company,
                                          journal, journal_abb, entry))
                    self.list_futures.append(future)

        # Check if the counters are full
        while ((self.counter_futures_images + self.counter_futures_urls) !=
               len(feed.entries) * 2 and self.parent.parsing):
            self.msleep(500)

        if self.parent.parsing:
            if not self.bdd.commit():
                self.l.error(self.bdd.lastError().text())
                self.l.debug("db insertions/modifications: {}".format(
                    self.new_entries_worker))
                self.l.error(
                    "Problem when committing data for {}".format(journal))

        # Free the memory, and clean the remaining futures
        try:
            self.session_pages.executor.shutdown()
        except AttributeError:
            self.l.error("No session_pages to shut down")

        self.session_images.executor.shutdown()
        self.l.debug("Exiting thread for {}".format(journal))
Example #6
    def completeData(self, doi, company, journal, journal_abb, entry, future):
        """Callback to handle the response of the futures trying to
        download the page of the articles"""

        self.l.debug("Page dled")
        self.counter_futures_urls += 1

        if not self.parent.parsing:
            return

        try:
            response = future.result()
        except (requests.exceptions.ReadTimeout,
                requests.exceptions.ConnectionError, ConnectionResetError,
                socket.timeout, concurrent.futures._base.CancelledError) as e:

            self.l.error("{} raised for {}. Handled".format(journal, e))
            self.counter_futures_images += 1
            self.parent.counter_articles_failed += 1
            return
        except Exception as e:
            self.l.error("Unknown exception {} for {}".format(e, journal),
                         exc_info=True)
            self.counter_futures_images += 1
            self.parent.counter_articles_failed += 1
            return

        query = QtSql.QSqlQuery(self.bdd)

        try:
            title, date, authors, abstract, graphical_abstract, url, topic_simple, author_simple = hosts.getData(
                company, journal, entry, response)
        except TypeError:
            self.l.error("getData returned None for {}".format(journal),
                         exc_info=True)
            self.counter_futures_images += 1
            self.parent.counter_articles_failed += 1
            return
        except Exception as e:
            self.l.error("Unknown exception completeData {}".format(e),
                         exc_info=True)
            self.counter_futures_images += 1
            self.parent.counter_articles_failed += 1
            return

        # Rejecting the article if no authors
        if authors == "Empty":
            self.counter_futures_images += 1
            self.parent.counter_rejected += 1
            self.l.debug("Rejecting article {}, no author".format(title))
            return

        # Check if the DOI is already in the db. Mandatory, because sometimes
        # updateData will tell the worker to dl the page before downloading
        # the picture
        if doi not in self.dico_doi:
            query.prepare("INSERT INTO papers (doi, title, date, journal, \
                          authors, abstract, graphical_abstract, url, new, \
                          topic_simple, author_simple) VALUES(?, \
                          ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")

            params = (doi, title, date, journal_abb, authors, abstract,
                      graphical_abstract, url, 1, topic_simple, author_simple)

            self.l.debug("Adding {} to the database".format(doi))
            self.parent.counter_added += 1

            for value in params:
                query.addBindValue(value)

            # Test that query worked
            if not query.exec_():
                self.l.error("SQL ERROR in completeData(): {}".format(
                    query.lastError().text()))
                self.parent.counter_articles_failed += 1
                return
            else:
                self.new_entries_worker += 1

        # Don't try to dl the image if its url is 'Empty', or if the image
        # already exists
        if (graphical_abstract == "Empty"
                or os.path.exists(self.PATH +
                                  functions.simpleChar(graphical_abstract))):
            self.counter_futures_images += 1
            self.l.debug("Image already dled or Empty")

            # This block is executed when you delete the db, but not the
            # images. It allows the graphical_abstract in db to be
            # updated accordingly
            if os.path.exists(self.PATH +
                              functions.simpleChar(graphical_abstract)):
                query.prepare("UPDATE papers SET graphical_abstract=? WHERE \
                              doi=?")
                params = (functions.simpleChar(graphical_abstract), doi)
                for value in params:
                    query.addBindValue(value)
                query.exec_()
        else:
            self.l.debug("Page dled, adding future image")
            headers = {
                'User-agent':
                'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                'Connection': 'close',
                'Referer': url
            }

            future_image = self.session_images.get(graphical_abstract,
                                                   headers=headers,
                                                   timeout=self.TIMEOUT)
            future_image.add_done_callback(
                functools.partial(self.pictureDownloaded, doi, url))
            self.list_futures.append(future_image)
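
Note: every call site in these examples unpacks the same eight values from hosts.getData. The real implementation is not shown; below is a sketch of the return contract implied by the assertions in Examples #1 and #2, with placeholder values only:

def getData(company, journal, entry, response=None):
    """Hypothetical signature sketch. Missing fields are the string
    "Empty"; topic_simple and author_simple are padded with spaces so
    they can be matched as whole words; date must be arrow-parsable."""
    title = entry.title
    date = "2013-06-15"
    authors = abstract = "Empty"
    graphical_abstract = url = "Empty"
    topic_simple = " " + title.lower() + " "
    author_simple = None  # or a space-padded author string
    return (title, date, authors, abstract, graphical_abstract, url,
            topic_simple, author_simple)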