Example #1
def test_getData(journalsUrls):
    """Tests the function getData. For each journal of each company,
    tests LENGTH_SAMPLE entries"""

    print("\n")
    print("Starting test getData")

    # Returns a list of the urls of the feed pages
    list_urls_feed = journalsUrls

    # TODO: comment or uncomment
    # Bypass all companies but one
    # list_urls_feed = hosts.getJournals("acs")[2]

    # Build a dict with key: company,
    # value: list of journal names
    dict_journals = {}
    for company in os.listdir("journals"):
        company = company.split(".")[0]
        dict_journals[company] = hosts.getJournals(company)[0]

    # All the journals are tested
    for site in list_urls_feed:

        print("Site {} of {}".format(list_urls_feed.index(site) + 1, len(list_urls_feed)))

        feed = feedparser.parse(site)
        journal = feed["feed"]["title"]

        # Get the company name
        for publisher, data in dict_journals.items():
            if journal in data:
                company = publisher

        print("\n")
        print(journal)

        if len(feed.entries) < LENGTH_SAMPLE:
            samples = feed.entries
        else:
            samples = random.sample(feed.entries, LENGTH_SAMPLE)

        # Tests LENGTH_SAMPLE entries for a journal, not all of them
        for entry in samples:

            if company in ["science", "elsevier", "beilstein"]:
                title, date, authors, abstract, graphical_abstract, url, topic_simple, author_simple = hosts.getData(
                    company, journal, entry
                )
            else:
                if company == "acs":
                    url = getattr(entry, "feedburner_origlink", entry.link).split("/")[-1]
                    url = "http://pubs.acs.org/doi/abs/10.1021/" + url
                else:
                    url = getattr(entry, "feedburner_origlink", entry.link)

                try:
                    response = requests.get(url, timeout=10)
                    title, date, authors, abstract, graphical_abstract, url, topic_simple, author_simple = hosts.getData(
                        company, journal, entry, response
                    )
                except requests.exceptions.ReadTimeout:
                    print("A ReadTimeout occurred, skipping to next entry")
                    continue

            print(title)
            print(url)
            print(graphical_abstract)
            print(date)
            print("\n")

            assert type(abstract) == str and abstract

            assert type(url) == str and url
            if url != "Empty":
                # Test if url is valid
                assert validators.url(url) is True

            assert type(graphical_abstract) == str and graphical_abstract
            if graphical_abstract != "Empty":
                assert validators.url(graphical_abstract) is True

            assert type(arrow.get(date)) == arrow.arrow.Arrow

            assert topic_simple.startswith(" ") is True
            assert topic_simple.endswith(" ") is True

            if author_simple is not None:
                assert author_simple.startswith(" ") is True
                assert author_simple.endswith(" ") is True
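
Note: both tests receive journalsUrls as a pytest fixture that is not part of this snippet. A minimal sketch of what it could look like, assuming hosts.getCompanies() lists the companies and hosts.getJournals(company)[2] is the list of RSS feed URLs (the same convention the bypass lines above rely on):

import pytest

import hosts


@pytest.fixture
def journalsUrls():
    """Hypothetical fixture: collect the feed URLs of every company.
    Assumes hosts.getJournals(company)[2] is the list of feed URLs."""
    urls = []
    for company in hosts.getCompanies():
        urls += hosts.getJournals(company)[2]
    return urls
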
Example #2
def test_getData(journalsUrls):
    """Tests the function getData. For each journal of each company,
    tests LENGTH_SAMPLE entries"""

    l.info("Starting test getData")

    start_time = datetime.datetime.now()

    # Returns a list of the urls of the feed pages
    list_urls_feed = journalsUrls

    # TODO: comment or uncomment
    # Bypass all companies but one
    # list_urls_feed = hosts.getJournals("ChemRxiv")[2]

    # Build a dict with key: company,
    # value: list of journal names
    dict_journals = {}

    # Build a dictionary to store the results of the tests, by company
    dict_res_by_company = {}

    for company in hosts.getCompanies():
        dict_journals[company] = hosts.getJournals(company)[0]

        res = {
            'count_abs_empty': 0,
            'count_image_empty': 0,
            'count_articles_tested': 0,
            'count_articles_untested': 0,
            'count_journals_untested': 0,
            'count_redirections': 0,
        }

        dict_res_by_company[company] = res

    s = requests.session()

    # All the journals are tested
    for site in list_urls_feed:

        l.info("Site {} of {} \n".format(
            list_urls_feed.index(site) + 1, len(list_urls_feed)))

        # Get the RSS page of the url provided
        try:
            feed = feedparser.parse(site, timeout=20)
            journal = feed['feed']['title']
            l.debug("RSS page successfully dled")
        except Exception as e:
            # NOTE: company still holds its value from the previous
            # iteration here, so the failure may be counted against the
            # wrong company
            dict_res_by_company[company]['count_journals_untested'] += 1
            l.error("RSS page could not be downloaded: {}".format(e),
                    exc_info=True)
            continue

        # Get the company name
        for publisher, data in dict_journals.items():
            if journal in data:
                company = publisher

        l.info(journal)

        if len(feed.entries) < LENGTH_SAMPLE:
            samples = feed.entries
        else:
            samples = random.sample(feed.entries, LENGTH_SAMPLE)

        # Tests LENGTH_SAMPLE entries for a journal, not all of them
        for entry in samples:

            if company in ['Science', 'Elsevier', 'Beilstein', 'PLOS']:
                title, date, authors, abstract, graphical_abstract, url, topic_simple, author_simple = hosts.getData(
                    company, journal, entry)
            else:
                url = hosts.refineUrl(company, journal, entry)

                try:
                    response = s.get(url, timeout=10, headers=HEADERS)
                    title, date, authors, abstract, graphical_abstract, url, topic_simple, author_simple = hosts.getData(
                        company, journal, entry, response)
                except Exception as e:
                    dict_res_by_company[company][
                        'count_articles_untested'] += 1
                    l.error(
                        "A problem occured: {}, continue to next entry".format(
                            e),
                        exc_info=True)
                    continue

            dict_res_by_company[company]['count_articles_tested'] += 1

            l.info("Title: {}".format(title))
            l.info("URL: {}".format(url))
            l.info("Image: {}".format(graphical_abstract))
            l.info("Date: {}".format(date))

            # Count and try to detect suspiciously high numbers of
            # empty results
            if abstract == "Empty":
                dict_res_by_company[company]['count_abs_empty'] += 1
            if graphical_abstract == "Empty":
                dict_res_by_company[company]['count_image_empty'] += 1

            try:
                if response.history:
                    dict_res_by_company[company]['count_redirections'] += 1
                    l.debug("Request was redirected")
                    for resp in response.history:
                        l.debug("Status code, URL: {}, {}".format(
                            resp.status_code, resp.url))
                    l.debug("Final destination:")
                    l.debug("Status code, URL: {}, {} \n".format(
                        resp.status_code, response.url))
                else:
                    l.debug("Request was not redirected \n")
            except UnboundLocalError:
                pass

            # ------------------------ ASSERT SECTION -------------------------

            logAssert(
                type(abstract) == str and abstract,
                "Abstract missing or not a string {}".format(abstract))

            logAssert(
                type(url) == str and url,
                "URL is missing or is not a string {}".format(url))

            # Test if url is valid
            if url != 'Empty':
                logAssert(
                    validators.url(url) is True,
                    "URL is a string but is not a URL {}".format(url))

            # For ACS and Nature, check if the URL is the abstract page's URL
            if company in ['ACS', 'Nature']:
                logAssert(
                    'abs' in url,
                    "company is {}, but URL doesn't contain 'abs' {}".format(
                        company, url))

            logAssert(
                type(graphical_abstract) == str and graphical_abstract,
                "graphical_abstract is missing or not a string {}".format(
                    graphical_abstract))

            if graphical_abstract != 'Empty':
                logAssert(
                    validators.url(graphical_abstract) is True,
                    "graphical_abstract is a string but is not a URL {}".
                    format(graphical_abstract))

            logAssert(
                type(arrow.get(date)) == arrow.arrow.Arrow,
                "The date is not really a date {}".format(date))

            logAssert(
                topic_simple.startswith(' ') is True,
                "Topic doesn't start with space {}".format(topic_simple))

            logAssert(
                topic_simple.endswith(' ') is True,
                "Topic doesn't end with space {}".format(topic_simple))

            if author_simple is not None:
                logAssert(
                    author_simple.startswith(' ') is True,
                    "author_simple doesn't start with space {}".format(
                        author_simple))
                logAssert(
                    author_simple.endswith(' ') is True,
                    "author_simple doesn't end with space {}".format(
                        author_simple))

    pprint(dict_res_by_company)

    # Count results
    count_abs_empty = 0
    count_image_empty = 0
    count_articles_tested = 0
    count_articles_untested = 0
    count_journals_untested = 0
    count_redirections = 0

    for company in dict_res_by_company:
        count_abs_empty += dict_res_by_company[company]['count_abs_empty']
        count_image_empty += dict_res_by_company[company]['count_image_empty']
        count_articles_tested += dict_res_by_company[company][
            'count_articles_tested']
        count_articles_untested += dict_res_by_company[company][
            'count_articles_untested']
        count_journals_untested += dict_res_by_company[company][
            'count_journals_untested']
        count_redirections += dict_res_by_company[company][
            'count_redirections']

    l.debug("Number of untested jounals: {} / {}".format(
        count_journals_untested, len(list_urls_feed)))

    l.debug("Number of test/untested articles: {} / {}".format(
        count_articles_tested, count_articles_untested))

    l.debug("Number of Empty abstracts: {}".format(count_abs_empty))

    l.debug(
        "Number of Empty graphical_abstracts: {}".format(count_image_empty))

    l.debug("Number of redirections: {}".format(count_redirections))

    l.debug("Time spent in test_getData: {}".format(datetime.datetime.now() -
                                                    start_time))
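
Note: logAssert is not defined in this snippet. A minimal sketch, assuming it behaves like a plain assert that first routes the failure message through the logger l, so failed checks also show up in the log file:

def logAssert(test, msg):
    """Hypothetical helper: log msg and fail if test is falsy."""
    if not test:
        l.error(msg)
        raise AssertionError(msg)
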
Example #3
    def run(self):
        """Main function. Starts the real business"""

        self.l.debug("Entering worker")
        self.l.debug(self.url_feed)

        # Get the RSS page of the url provided
        try:
            self.feed = feedparser.parse(self.url_feed)
            self.l.debug("RSS page successfully dled")
        except OSError:
            self.l.error("Too many files open, could not start the thread !")
            return

        # Get the journal name
        try:
            journal = self.feed['feed']['title']
        except KeyError:
            self.l.critical("No title for the journal ! Aborting")
            self.l.critical(self.url_feed)
            return

        self.l.info("{0}: {1}".format(journal, len(self.feed.entries)))

        # Lists to check if the post is in the db, and if
        # it has all the info
        self.session_images = FuturesSession(max_workers=20)

        # Get the company and the journal_abb by scanning the dictionary
        # containing all the data regarding the journals implemented in the
        # program. This dictionary is built in gui.py, to avoid multiple calls
        # to hosts.getJournals.
        # care_image determines if the Worker will try to dl the graphical
        # abstracts
        for key, tuple_data in self.dict_journals.items():
            if journal in tuple_data[0]:
                company = key
                index = tuple_data[0].index(journal)
                journal_abb = tuple_data[1][index]
                care_image = tuple_data[3][index]
                break

        try:
            self.dico_doi = self.listDoi(journal_abb)
        except UnboundLocalError:
            self.l.error("Journal not recognized ! Aborting")
            return

        # Create a list of the journals for which a dl of the article
        # page is not required. All the data are in the rss page
        company_no_dl = ['science', 'elsevier', 'beilstein', 'plos']

        query = QtSql.QSqlQuery(self.bdd)

        self.bdd.transaction()

        # The feeds of these journals are complete
        if company in company_no_dl:

            self.count_futures_urls += len(self.feed.entries)

            for entry in self.feed.entries:

                # Get the DOI, a unique number for a publication
                doi = hosts.getDoi(company, journal, entry)
                url = getattr(entry, 'feedburner_origlink', entry.link)

                # Reject crappy entries: corrigendum, erratum, etc
                if hosts.reject(entry.title):
                    title = entry.title
                    self.count_futures_images += 1
                    self.parent.counter_rejected += 1
                    self.l.debug("Rejecting {0}".format(doi))

                    # Insert the crappy articles in a rescue database
                    if self.parent.debug_mod and doi not in self.dico_doi:
                        query.prepare("INSERT INTO debug (doi, title, \
                                      journal, url) VALUES(?, ?, ?, ?)")
                        params = (doi, title, journal_abb, url)
                        self.l.debug("Inserting {0} in table debug".
                                     format(doi))
                        for value in params:
                            query.addBindValue(value)
                        query.exec_()
                    else:
                        continue

                # Article complete, skip it
                elif doi in self.dico_doi and self.dico_doi[doi]:
                    self.count_futures_images += 1
                    self.l.debug("Skipping {}".format(doi))
                    continue

                # Article not complete, try to complete it
                elif doi in self.dico_doi and not self.dico_doi[doi]:

                    # How to update the entry
                    dl_page, dl_image, data = hosts.updateData(company,
                                                               journal,
                                                               entry,
                                                               care_image)

                    # For these journals, all the info is in the RSS.
                    # Only care about the image
                    if dl_image:
                        self.parent.counter_updates += 1

                        graphical_abstract = data['graphical_abstract']

                        if os.path.exists(self.DATA_PATH +
                                          functions.simpleChar(
                                              graphical_abstract)):
                            self.count_futures_images += 1
                        else:
                            headers = {'User-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                                       'Connection': 'close',
                                       'Referer': url}

                            future_image = self.session_images.get(
                                graphical_abstract, headers=headers,
                                timeout=self.TIMEOUT)
                            future_image.add_done_callback(functools.partial(
                                self.pictureDownloaded, doi, url))
                            self.list_futures.append(future_image)

                    else:
                        self.count_futures_images += 1
                        continue

                else:
                    try:
                        title, date, authors, abstract, graphical_abstract, url, topic_simple, author_simple = hosts.getData(company, journal, entry)
                    except TypeError:
                        self.l.error("getData returned None for {}".
                                     format(journal))
                        self.count_futures_images += 1
                        return

                    # Rejecting article if no author
                    if authors == "Empty":
                        self.count_futures_images += 1
                        self.parent.counter_rejected += 1
                        self.l.debug("Rejecting article {}, no author".
                                     format(title))
                        continue

                    query.prepare("INSERT INTO papers (doi, title, date, \
                                  journal, authors, abstract, \
                                  graphical_abstract, url, new, topic_simple, \
                                  author_simple) \
                                   VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")

                    # Set new to 1 rather than True
                    params = (doi, title, date, journal_abb, authors, abstract,
                              graphical_abstract, url, 1, topic_simple, author_simple)

                    self.l.debug("Adding {0} to the database".format(doi))
                    self.parent.counter += 1
                    self.new_entries_worker += 1

                    for value in params:
                        query.addBindValue(value)
                    query.exec_()

                    if graphical_abstract == "Empty" or os.path.exists(
                            self.DATA_PATH +
                            functions.simpleChar(graphical_abstract)):

                        self.count_futures_images += 1

                        # This block is executed when you delete the db, but
                        # not the images. It allows the graphical_abstract
                        # in db to be updated accordingly
                        if os.path.exists(self.DATA_PATH +
                                          functions.simpleChar(
                                              graphical_abstract)):

                            query.prepare("UPDATE papers SET \
                                          graphical_abstract=? WHERE doi=?")

                            params = (functions.simpleChar(graphical_abstract),
                                      doi)

                            for value in params:
                                query.addBindValue(value)
                            query.exec_()
                    else:
                        headers = {'User-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                                   'Connection': 'close',
                                   'Referer': url}

                        future_image = self.session_images.get(
                            graphical_abstract, headers=headers,
                            timeout=self.TIMEOUT)
                        future_image.add_done_callback(
                            functools.partial(self.pictureDownloaded,
                                              doi, url))
                        self.list_futures.append(future_image)

        else:

            headers = {'User-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                       'Connection': 'close'}

            self.session_pages = FuturesSession(max_workers=20)

            for entry in self.feed.entries:

                doi = hosts.getDoi(company, journal, entry)

                if company == 'acs':
                    url = getattr(entry, 'feedburner_origlink',
                                  entry.link).split('/')[-1]
                    url = "http://pubs.acs.org/doi/abs/10.1021/" + url

                elif company == 'npg':
                    url = getattr(entry, 'feedburner_origlink',
                                  entry.link).split('/')[-1]
                    url = "http://www.nature.com/nature/journal/vaop/ncurrent/abs/" + url + ".html"
                else:
                    url = getattr(entry, 'feedburner_origlink', entry.link)

                # Reject crappy entries: corrigendum, erratum, etc
                if hosts.reject(entry.title):
                    title = entry.title
                    self.count_futures_images += 1
                    self.count_futures_urls += 1
                    self.parent.counter_rejected += 1
                    self.l.debug("Rejecting {0}".format(doi))

                    if self.parent.debug_mod and doi not in self.dico_doi:
                        query.prepare("INSERT INTO debug (doi, title, \
                                      journal, url) VALUES(?, ?, ?, ?)")
                        params = (doi, title, journal_abb, url)

                        for value in params:
                            query.addBindValue(value)
                        query.exec_()

                        self.l.debug("Inserting {0} in table debug".
                                     format(doi))
                    continue

                # Article complete, skip it
                elif doi in self.dico_doi and self.dico_doi[doi]:
                    self.count_futures_images += 1
                    self.count_futures_urls += 1
                    self.l.debug("Skipping {}".format(doi))
                    continue

                # Article not complete, try to complete it
                elif doi in self.dico_doi and not self.dico_doi[doi]:

                    dl_page, dl_image, data = hosts.updateData(company,
                                                               journal,
                                                               entry,
                                                               care_image)

                    if dl_page:
                        self.parent.counter_updates += 1

                        future = self.session_pages.get(url,
                                                        timeout=self.TIMEOUT,
                                                        headers=headers)
                        future.add_done_callback(functools.partial(
                            self.completeData, doi, company, journal,
                            journal_abb, entry))
                        self.list_futures.append(future)

                        # Continue just to be sure. If dl_page is True,
                        # dl_image is likely True too
                        continue

                    elif dl_image:
                        self.parent.counter_updates += 1
                        self.count_futures_urls += 1

                        graphical_abstract = data['graphical_abstract']

                        if os.path.exists(self.DATA_PATH +
                                          functions.simpleChar(
                                              graphical_abstract)):
                            self.count_futures_images += 1
                        else:
                            headers = {'User-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                                       'Connection': 'close',
                                       'Referer': url}

                            future_image = self.session_images.get(
                                graphical_abstract, headers=headers,
                                timeout=self.TIMEOUT)
                            future_image.add_done_callback(functools.partial(
                                self.pictureDownloaded, doi, url))
                            self.list_futures.append(future_image)

                    else:
                        self.count_futures_urls += 1
                        self.count_futures_images += 1
                        continue

                else:

                    self.l.debug("Starting adding new entry")

                    future = self.session_pages.get(url, timeout=self.TIMEOUT,
                                                    headers=headers)
                    future.add_done_callback(functools.partial(
                        self.completeData, doi, company, journal, journal_abb,
                        entry))
                    self.list_futures.append(future)

        # Check if the counters are full
        while ((self.count_futures_images + self.count_futures_urls) !=
                len(self.feed.entries) * 2 and self.parent.parsing):
            self.sleep(1)

        if self.parent.parsing:
            if not self.bdd.commit():
                self.l.error(self.bdd.lastError().text())
                self.l.debug("db insertions/modifications: {}".
                             format(self.new_entries_worker))
                self.l.error("Problem when comitting data for {}".
                             format(journal))

        # Free the memory, and clean the remaining futures
        try:
            self.session_pages.executor.shutdown()
        except AttributeError:
            self.l.error("No session_pages to shut down")

        self.session_images.executor.shutdown()
        self.l.debug("Exiting thread for {}".format(journal))
Example #4
    def completeData(self, doi, company, journal, journal_abb, entry, future):
        """Callback to handle the response of the futures trying to
        download the page of the articles"""

        self.l.debug("Page dled")
        self.count_futures_urls += 1

        if not self.parent.parsing:
            return

        try:
            response = future.result()
        except requests.exceptions.ReadTimeout:
            self.l.error("ReadTimeout for {}".format(journal))
            self.count_futures_images += 1
            return
        except requests.exceptions.ConnectionError:
            self.l.error("ConnectionError for {}".format(journal))
            self.count_futures_images += 1
            return
        except ConnectionResetError:
            self.l.error("ConnectionResetError for {}".format(journal))
            self.count_futures_images += 1
            return
        except socket.timeout:
            self.l.error("socket.timeout for {}".format(journal))
            self.count_futures_images += 1
            return
        except concurrent.futures._base.CancelledError:
            self.l.error("future cancelled for {}".format(journal))
            self.count_futures_images += 1
            return
        except Exception as e:
            self.l.error("Unknown exception {} for {}".format(e, journal))
            self.l.error(traceback.format_exc())
            self.count_futures_images += 1
            return

        query = QtSql.QSqlQuery(self.bdd)

        try:
            title, date, authors, abstract, graphical_abstract, url, topic_simple, author_simple = hosts.getData(company, journal, entry, response)
        except TypeError:
            self.l.error("getData returned None for {}".format(journal))
            self.count_futures_images += 1
            return
        except Exception as e:
            self.l.error("Unknown exception completeData {}".format(e))
            self.l.error(traceback.format_exc())
            self.count_futures_images += 1
            return

        # Rejecting the article if no authors
        if authors == "Empty":
            self.count_futures_images += 1
            self.parent.counter_rejected += 1
            self.l.debug("Rejecting article {}, no author".format(title))
            return

        # Check if the DOI is already in the db. Mandatory, because sometimes
        # updateData will tell the worker to dl the page before downloading
        # the picture
        if doi not in self.dico_doi:
            query.prepare("INSERT INTO papers (doi, title, date, journal, \
                          authors, abstract, graphical_abstract, url, new, \
                          topic_simple, author_simple) VALUES(?, ?, ?, ?, ?, \
                          ?, ?, ?, ?, ?, ?)")

            params = (doi, title, date, journal_abb, authors, abstract,
                      graphical_abstract, url, 1, topic_simple, author_simple)

            self.l.debug("Adding {0} to the database".format(doi))
            self.parent.counter += 1

            for value in params:
                query.addBindValue(value)

            query.exec_()

        self.new_entries_worker += 1

        # Don't try to dl the image if its url is 'Empty', or if the image
        # already exists
        if (graphical_abstract == "Empty" or
                os.path.exists(self.DATA_PATH +
                               functions.simpleChar(graphical_abstract))):
            self.count_futures_images += 1
            self.l.debug("Image already dled or Empty")

            # This block is executed when you delete the db, but not the
            # images. It allows the graphical_abstract in db to be
            # updated accordingly
            if os.path.exists(self.DATA_PATH +
                              functions.simpleChar(graphical_abstract)):
                query.prepare("UPDATE papers SET graphical_abstract=? WHERE \
                              doi=?")
                params = (functions.simpleChar(graphical_abstract), doi)
                for value in params:
                    query.addBindValue(value)
                query.exec_()
        else:
            self.l.debug("Page dled, adding future image")
            headers = {'User-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                       'Connection': 'close',
                       'Referer': url}

            future_image = self.session_images.get(graphical_abstract,
                                                   headers=headers,
                                                   timeout=self.TIMEOUT)
            future_image.add_done_callback(functools.partial(
                self.pictureDownloaded, doi, url))
            self.list_futures.append(future_image)
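
Note on the wiring: add_done_callback calls its argument with the finished future as the only parameter, so functools.partial(self.completeData, doi, company, journal, journal_abb, entry) makes future arrive as the last argument of completeData above. A self-contained sketch of that ordering:

import functools
from concurrent.futures import ThreadPoolExecutor

def callback(tag, future):
    # tag was bound by functools.partial; future is appended by
    # add_done_callback once the task finishes
    print(tag, future.result())

with ThreadPoolExecutor(max_workers=1) as executor:
    future = executor.submit(lambda: 42)
    future.add_done_callback(functools.partial(callback, "done:"))
# prints: done: 42
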
Example #5
    def run(self):
        """Main function. Starts the real business"""

        self.l.debug("Entering worker")

        feed = self._getFeed(timeout=self.TIMEOUT)

        if feed is None:
            self.l.error("Exiting worker, problem w/ the feed")
            self.parent.list_failed_rss.append(self.url_feed)
            return

        # Get the journal name
        journal = feed['feed']['title']

        self.l.info("{}: {}".format(journal, len(feed.entries)))

        # Lists to check if the post is in the db, and if
        # it has all the info
        self.session_images = FuturesSession(
            max_workers=self.MAX_WORKERS, session=self.parent.browsing_session)

        # Get the company and the journal_abb by scanning the dictionary
        # containing all the data regarding the journals implemented in the
        # program. This dictionary is built in gui.py, to avoid multiple calls
        # to hosts.getJournals
        # care_image determines if the Worker will try to dl the graphical
        # abstracts
        for key, tuple_data in self.dict_journals.items():
            if journal in tuple_data[0]:
                company = key
                index = tuple_data[0].index(journal)
                journal_abb = tuple_data[1][index]
                care_image = tuple_data[3][index]
                break

        try:
            self.dico_doi = self.listDoi(journal_abb)
        except UnboundLocalError:
            self.l.error("Journal not recognized ! Aborting")
            self.parent.list_failed_rss.append(self.url_feed)
            return

        # Create a list of the journals for which a dl of the article
        # page is not required. All the data are in the rss page
        company_no_dl = [
            'Science', 'Elsevier', 'Beilstein', 'PLOS', 'ChemArxiv', 'Wiley'
        ]

        query = QtSql.QSqlQuery(self.bdd)

        self.bdd.transaction()

        # The feeds of these journals are complete
        if company in company_no_dl:

            self.counter_futures_urls += len(feed.entries)

            for entry in feed.entries:

                # Get the DOI, a unique number for a publication
                try:
                    doi = hosts.getDoi(company, journal, entry)
                except Exception as e:
                    self.l.error("getDoi failed for: {}".format(journal),
                                 exc_info=True)
                    self.counter_futures_urls += 1
                    continue

                try:
                    url = hosts.refineUrl(company, journal, entry)
                except Exception as e:
                    self.l.error("refineUrl failed for: {}".format(journal),
                                 exc_info=True)
                    self.counter_futures_urls += 1
                    continue

                # Reject crappy entries: corrigendum, erratum, etc
                if hosts.reject(entry.title):
                    title = entry.title
                    self.counter_futures_images += 1
                    self.parent.counter_rejected += 1
                    self.l.debug("Rejecting {0}".format(doi))

                    # Insert the crappy articles in a rescue database
                    if self.parent.debug_mod and doi not in self.dico_doi:
                        query.prepare("INSERT INTO debug (doi, title, \
                                      journal, url) VALUES(?, ?, ?, ?)")
                        params = (doi, title, journal_abb, url)
                        self.l.debug(
                            "Inserting {0} in table debug".format(doi))
                        for value in params:
                            query.addBindValue(value)
                        query.exec_()
                    else:
                        continue

                # Article complete, skip it
                elif doi in self.dico_doi and self.dico_doi[doi]:
                    self.counter_futures_images += 1
                    self.l.debug("Article complete, skipping {}".format(doi))
                    continue

                # Article not complete, try to complete it
                elif doi in self.dico_doi and not self.dico_doi[doi]:
                    self.l.debug("Trying to update {}".format(doi))

                    # How to update the entry
                    dl_page, dl_image, data = hosts.updateData(
                        company, journal, entry, care_image)

                    # For these journals, all the info is in the RSS.
                    # Only care about the image
                    if dl_image:
                        self.parent.counter_updates += 1

                        graphical_abstract = data['graphical_abstract']

                        if os.path.exists(
                                self.PATH +
                                functions.simpleChar(graphical_abstract)):
                            self.counter_futures_images += 1
                        else:
                            headers = {
                                'User-agent':
                                'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                                'Connection': 'close',
                                'Referer': url
                            }

                            future_image = self.session_images.get(
                                graphical_abstract,
                                headers=headers,
                                timeout=self.TIMEOUT)
                            future_image.add_done_callback(
                                functools.partial(self.pictureDownloaded, doi,
                                                  url))
                            self.list_futures.append(future_image)

                    else:
                        self.counter_futures_images += 1
                        continue

                # New article, treat it
                else:
                    try:
                        title, date, authors, abstract, graphical_abstract, url, topic_simple, author_simple = hosts.getData(
                            company, journal, entry)
                    except Exception as e:
                        self.l.error(
                            "Problem with getData: {}".format(journal),
                            exc_info=True)
                        self.counter_futures_images += 1
                        self.parent.counter_articles_failed += 1
                        return

                    # Rejecting article if no author
                    if authors == "Empty":
                        self.counter_futures_images += 1
                        self.parent.counter_rejected += 1
                        self.l.debug(
                            "Rejecting article {}, no author".format(title))
                        continue

                    query.prepare("INSERT INTO papers (doi, title, date, \
                                  journal, authors, abstract, \
                                  graphical_abstract, url, new, topic_simple, \
                                  author_simple) \
                                  VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")

                    # Set new to 1 rather than True
                    params = (doi, title, date, journal_abb, authors, abstract,
                              graphical_abstract, url, 1, topic_simple,
                              author_simple)

                    for value in params:
                        query.addBindValue(value)

                    # Test that query worked
                    if not query.exec_():
                        self.l.error(
                            "SQL ERROR in run(): {}, company_no_dl".format(
                                query.lastError().text()))
                        self.parent.counter_articles_failed += 1
                        continue
                    else:
                        self.l.debug("{} added to the database".format(doi))
                        self.new_entries_worker += 1
                        self.parent.counter_added += 1

                    # If article has no graphical abstract or if it has
                    # been dled
                    if graphical_abstract == "Empty" or os.path.exists(
                            self.PATH +
                            functions.simpleChar(graphical_abstract)):

                        self.counter_futures_images += 1

                        # This block is executed when you delete the db, but
                        # not the images. It allows the graphical_abstract
                        # in db to be updated accordingly
                        if os.path.exists(
                                self.PATH +
                                functions.simpleChar(graphical_abstract)):

                            query.prepare("UPDATE papers SET \
                                          graphical_abstract=? WHERE doi=?")

                            params = (functions.simpleChar(graphical_abstract),
                                      doi)

                            for value in params:
                                query.addBindValue(value)
                            query.exec_()
                    else:
                        headers = {
                            'User-agent':
                            'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                            'Connection': 'close',
                            'Referer': url
                        }

                        future_image = self.session_images.get(
                            graphical_abstract,
                            headers=headers,
                            timeout=self.TIMEOUT)

                        future_image.add_done_callback(
                            functools.partial(self.pictureDownloaded, doi,
                                              url))

                        self.list_futures.append(future_image)

        # The company requires to download the article's web page
        else:

            headers = {
                'User-agent':
                'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                'Connection': 'close'
            }

            self.session_pages = FuturesSession(
                max_workers=self.MAX_WORKERS,
                session=self.parent.browsing_session)

            for entry in feed.entries:

                # Get the DOI, a unique number for a publication
                try:
                    doi = hosts.getDoi(company, journal, entry)
                except Exception as e:
                    self.l.error("getDoi failed for: {}".format(journal),
                                 exc_info=True)
                    self.counter_futures_urls += 1
                    self.counter_futures_images += 1
                    continue

                # Try to refine the url
                try:
                    url = hosts.refineUrl(company, journal, entry)
                except Exception as e:
                    self.l.error("refineUrl failed for: {}".format(journal),
                                 exc_info=True)
                    self.counter_futures_urls += 1
                    self.counter_futures_images += 1
                    continue

                # Make sure the entry has a title
                try:
                    title = entry.title
                except AttributeError:
                    self.l.error("No title for {}".format(doi), exc_info=True)
                    self.counter_futures_urls += 1
                    self.counter_futures_images += 1
                    continue

                # Reject crappy entries: corrigendum, erratum, etc
                if hosts.reject(title):
                    self.counter_futures_images += 1
                    self.counter_futures_urls += 1
                    self.parent.counter_rejected += 1
                    self.l.debug("Rejecting {0}".format(doi))

                    if self.parent.debug_mod and doi not in self.dico_doi:
                        query.prepare("INSERT INTO debug (doi, title, \
                                      journal, url) VALUES(?, ?, ?, ?)")
                        params = (doi, title, journal_abb, url)

                        for value in params:
                            query.addBindValue(value)
                        query.exec_()

                        self.l.debug(
                            "Inserting {0} in table debug".format(doi))
                    continue

                # Article complete, skip it
                elif doi in self.dico_doi and self.dico_doi[doi]:
                    self.counter_futures_images += 1
                    self.counter_futures_urls += 1
                    self.l.debug("Article complete, skipping {}".format(doi))
                    continue

                # Article not complete, try to complete it
                elif doi in self.dico_doi and not self.dico_doi[doi]:

                    dl_page, dl_image, data = hosts.updateData(
                        company, journal, entry, care_image)

                    if dl_page:
                        self.parent.counter_updates += 1

                        future = self.session_pages.get(url,
                                                        timeout=self.TIMEOUT,
                                                        headers=headers)
                        future.add_done_callback(
                            functools.partial(self.completeData, doi, company,
                                              journal, journal_abb, entry))
                        self.list_futures.append(future)

                        # Continue just to be sure. If dl_page is True,
                        # dl_image is likely True too
                        continue

                    elif dl_image:
                        self.parent.counter_updates += 1
                        self.counter_futures_urls += 1

                        graphical_abstract = data['graphical_abstract']

                        if os.path.exists(
                                self.PATH +
                                functions.simpleChar(graphical_abstract)):
                            self.counter_futures_images += 1
                        else:
                            headers = {
                                'User-agent':
                                'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                                'Connection': 'close',
                                'Referer': url
                            }

                            future_image = self.session_images.get(
                                graphical_abstract,
                                headers=headers,
                                timeout=self.TIMEOUT)
                            future_image.add_done_callback(
                                functools.partial(self.pictureDownloaded, doi,
                                                  url))
                            self.list_futures.append(future_image)

                    else:
                        self.counter_futures_urls += 1
                        self.counter_futures_images += 1
                        continue

                # New article, treat it
                else:

                    self.l.debug("Starting adding new entry")

                    future = self.session_pages.get(url,
                                                    timeout=self.TIMEOUT,
                                                    headers=headers)
                    future.add_done_callback(
                        functools.partial(self.completeData, doi, company,
                                          journal, journal_abb, entry))
                    self.list_futures.append(future)

        # Check if the counters are full
        while ((self.counter_futures_images + self.counter_futures_urls) !=
               len(feed.entries) * 2 and self.parent.parsing):
            self.msleep(500)

        if self.parent.parsing:
            if not self.bdd.commit():
                self.l.error(self.bdd.lastError().text())
                self.l.debug("db insertions/modifications: {}".format(
                    self.new_entries_worker))
                self.l.error(
                    "Problem when committing data for {}".format(journal))

        # Free the memory, and clean the remaining futures
        try:
            self.session_pages.executor.shutdown()
        except AttributeError:
            self.l.error("No session_pages to shut down")

        self.session_images.executor.shutdown()
        self.l.debug("Exiting thread for {}".format(journal))
Example #6
    def completeData(self, doi, company, journal, journal_abb, entry, future):
        """Callback to handle the response of the futures trying to
        download the page of the articles"""

        self.l.debug("Page dled")
        self.counter_futures_urls += 1

        if not self.parent.parsing:
            return

        try:
            response = future.result()
        except (requests.exceptions.ReadTimeout,
                requests.exceptions.ConnectionError, ConnectionResetError,
                socket.timeout, concurrent.futures._base.CancelledError) as e:

            self.l.error("{} raised for {}. Handled".format(journal, e))
            self.counter_futures_images += 1
            self.parent.counter_articles_failed += 1
            return
        except Exception as e:
            self.l.error("Unknown exception {} for {}".format(e, journal),
                         exc_info=True)
            self.counter_futures_images += 1
            self.parent.counter_articles_failed += 1
            return

        query = QtSql.QSqlQuery(self.bdd)

        try:
            title, date, authors, abstract, graphical_abstract, url, topic_simple, author_simple = hosts.getData(
                company, journal, entry, response)
        except TypeError:
            self.l.error("getData returned None for {}".format(journal),
                         exc_info=True)
            self.counter_futures_images += 1
            self.parent.counter_articles_failed += 1
            return
        except Exception as e:
            self.l.error("Unknown exception completeData {}".format(e),
                         exc_info=True)
            self.counter_futures_images += 1
            self.parent.counter_articles_failed += 1
            return

        # Rejecting the article if no authors
        if authors == "Empty":
            self.counter_futures_images += 1
            self.parent.counter_rejected += 1
            self.l.debug("Rejecting article {}, no author".format(title))
            return

        # Check if the DOI is already in the db. Mandatory, because sometimes
        # updateData will tell the worker to dl the page before downloading
        # the picture
        if doi not in self.dico_doi:
            query.prepare("INSERT INTO papers (doi, title, date, journal, \
                          authors, abstract, graphical_abstract, url, new, \
                          topic_simple, author_simple) VALUES(?, \
                          ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")

            params = (doi, title, date, journal_abb, authors, abstract,
                      graphical_abstract, url, 1, topic_simple, author_simple)

            self.l.debug("Adding {} to the database".format(doi))
            self.parent.counter_added += 1

            for value in params:
                query.addBindValue(value)

            # Test that query worked
            if not query.exec_():
                self.l.error("SQL ERROR in completeData(): {}".format(
                    query.lastError().text()))
                self.parent.counter_articles_failed += 1
                return
            else:
                self.new_entries_worker += 1

        # Don't try to dl the image if its url is 'Empty', or if the image
        # already exists
        if (graphical_abstract == "Empty"
                or os.path.exists(self.PATH +
                                  functions.simpleChar(graphical_abstract))):
            self.counter_futures_images += 1
            self.l.debug("Image already dled or Empty")

            # This block is executed when you delete the db, but not the
            # images. It allows the graphical_abstract in db to be
            # updated accordingly
            if os.path.exists(self.PATH +
                              functions.simpleChar(graphical_abstract)):
                query.prepare("UPDATE papers SET graphical_abstract=? WHERE \
                              doi=?")
                params = (functions.simpleChar(graphical_abstract), doi)
                for value in params:
                    query.addBindValue(value)
                query.exec_()
        else:
            self.l.debug("Page dled, adding future image")
            headers = {
                'User-agent':
                'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                'Connection': 'close',
                'Referer': url
            }

            future_image = self.session_images.get(graphical_abstract,
                                                   headers=headers,
                                                   timeout=self.TIMEOUT)
            future_image.add_done_callback(
                functools.partial(self.pictureDownloaded, doi, url))
            self.list_futures.append(future_image)
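
Note: every call site in these examples unpacks the same eight values from hosts.getData. The real implementation is not shown; below is a sketch of the return contract implied by the assertions in Examples #1 and #2, with placeholder values only:

def getData(company, journal, entry, response=None):
    """Hypothetical signature sketch. Missing fields are the string
    "Empty"; topic_simple and author_simple are padded with spaces so
    they can be matched as whole words; date must be arrow-parsable."""
    title = entry.title
    date = "2013-06-15"
    authors = abstract = "Empty"
    graphical_abstract = url = "Empty"
    topic_simple = " " + title.lower() + " "
    author_simple = None  # or a space-padded author string
    return (title, date, authors, abstract, graphical_abstract, url,
            topic_simple, author_simple)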