def test_getData(journalsUrls): """Tests the function getData. For each journal of each company, tests LENGTH_SAMPLE entries""" print("\n") print("Starting test getData") # Returns a list of the urls of the feed pages list_urls_feed = journalsUrls # Bypass all companies but one list_urls_feed = hosts.getJournals("acs")[2] # Build a dic with key: company # value: journal name dict_journals = {} for company in os.listdir("journals"): company = company.split(".")[0] dict_journals[company] = hosts.getJournals(company)[0] # All the journals are tested for site in list_urls_feed: print("Site {} of {}".format(list_urls_feed.index(site) + 1, len(list_urls_feed))) feed = feedparser.parse(site) journal = feed["feed"]["title"] # Get the company name for publisher, data in dict_journals.items(): if journal in data: company = publisher print("\n") print(journal) if len(feed.entries) < LENGTH_SAMPLE: samples = feed.entries else: samples = random.sample(feed.entries, LENGTH_SAMPLE) # Tests LENGTH_SAMPLE entries for a journal, not all of them for entry in samples: if company in ["science", "elsevier", "beilstein"]: title, date, authors, abstract, graphical_abstract, url, topic_simple, author_simple = hosts.getData( company, journal, entry ) else: if company == "acs": url = getattr(entry, "feedburner_origlink", entry.link).split("/")[-1] url = "http://pubs.acs.org/doi/abs/10.1021/" + url else: url = getattr(entry, "feedburner_origlink", entry.link) try: response = requests.get(url, timeout=10) title, date, authors, abstract, graphical_abstract, url, topic_simple, author_simple = hosts.getData( company, journal, entry, response ) except requests.exceptions.ReadTimeout: print("A ReadTimeout occured, continue to next entry") print(title) print(url) print(graphical_abstract) print(date) print("\n") assert type(abstract) == str and abstract assert type(url) == str and url if url != "Empty": # Test if url is valid assert validators.url(url) is True assert type(graphical_abstract) == str and graphical_abstract if graphical_abstract != "Empty": assert validators.url(graphical_abstract) is True assert type(arrow.get(date)) == arrow.arrow.Arrow assert topic_simple.startswith(" ") is True assert topic_simple.endswith(" ") is True if author_simple is not None: assert author_simple.startswith(" ") is True assert author_simple.endswith(" ") is True
def test_getData(journalsUrls): """Tests the function getData. For each journal of each company, tests LENGTH_SAMPLE entries""" l.info("Starting test getData") start_time = datetime.datetime.now() # Returns a list of the urls of the feed pages list_urls_feed = journalsUrls # TODO: comment or uncomment # Bypass all companies but one # list_urls_feed = hosts.getJournals("ChemRxiv")[2] # Build a dic with key: company # value: journal name dict_journals = {} # Build a dictionnary to store the results of the tests, by company dict_res_by_company = {} for company in hosts.getCompanies(): dict_journals[company] = hosts.getJournals(company)[0] res = { 'count_abs_empty': 0, 'count_image_empty': 0, 'count_articles_tested': 0, 'count_articles_untested': 0, 'count_journals_untested': 0, 'count_redirections': 0, } dict_res_by_company[company] = res s = requests.session() # All the journals are tested for site in list_urls_feed: l.info("Site {} of {} \n".format( list_urls_feed.index(site) + 1, len(list_urls_feed))) # Get the RSS page of the url provided try: feed = feedparser.parse(site, timeout=20) journal = feed['feed']['title'] l.debug("RSS page successfully dled") except Exception as e: dict_res_by_company[company]['count_journals_untested'] += 1 l.error("RSS page could not be downloaded: {}".format(e), exc_info=True) continue # Get the company name for publisher, data in dict_journals.items(): if journal in data: company = publisher l.info(journal) if len(feed.entries) < LENGTH_SAMPLE: samples = feed.entries else: samples = random.sample(feed.entries, LENGTH_SAMPLE) # Tests LENGTH_SAMPLE entries for a journal, not all of them for entry in samples: if company in ['Science', 'Elsevier', 'Beilstein', 'PLOS']: title, date, authors, abstract, graphical_abstract, url, topic_simple, author_simple = hosts.getData( company, journal, entry) else: url = hosts.refineUrl(company, journal, entry) try: response = s.get(url, timeout=10, headers=HEADERS) title, date, authors, abstract, graphical_abstract, url, topic_simple, author_simple = hosts.getData( company, journal, entry, response) except Exception as e: dict_res_by_company[company][ 'count_articles_untested'] += 1 l.error( "A problem occured: {}, continue to next entry".format( e), exc_info=True) continue dict_res_by_company[company]['count_articles_tested'] += 1 l.info("Title: {}".format(title)) l.info("URL: {}".format(url)) l.info("Image: {}".format(graphical_abstract)) l.info("Date: {}".format(date)) # Count and try do detect suspiciously high numbers of # empty results if abstract == "Empty": dict_res_by_company[company]['count_abs_empty'] += 1 if graphical_abstract == "Empty": dict_res_by_company[company]['count_image_empty'] += 1 try: if response.history: dict_res_by_company[company]['count_redirections'] += 1 l.debug("Request was redirected") for resp in response.history: l.debug("Status code, URL: {}, {}".format( resp.status_code, resp.url)) l.debug("Final destination:") l.debug("Status code, URL: {}, {} \n".format( resp.status_code, response.url)) else: l.debug("Request was not redirected \n") except UnboundLocalError: pass # ------------------------ ASSERT SECTION ------------------------- logAssert( type(abstract) == str and abstract, "Abstract missing or not a string {}".format(abstract)) logAssert( type(url) == str and url, "URL is missing or is not a string {}".format(url)) # Test if url is valid if url != 'Empty': logAssert( validators.url(url) is True, "URL is a string but is not a URL {}".format(url)) # For ACS and Nature, check if the URL 
is the abstract page's URL if company in ['ACS', 'Nature']: logAssert( 'abs' in url, "company is {}, but URL doesn't contain 'abs' {}".format( company, url)) logAssert( type(graphical_abstract) == str and graphical_abstract, "graphical_abstract is missing or not a string {}".format( graphical_abstract)) if graphical_abstract != 'Empty': logAssert( validators.url(graphical_abstract) is True, "graphical_abstract is a string but is not a URL {}". format(graphical_abstract)) logAssert( type(arrow.get(date)) == arrow.arrow.Arrow, "The date is not really a date {}".format(date)) logAssert( topic_simple.startswith(' ') is True, "Topic doesn't start with space {}".format(topic_simple)) logAssert( topic_simple.endswith(' ') is True, "Topic doesn't end with space {}".format(topic_simple)) if author_simple is not None: logAssert( author_simple.startswith(' ') is True, "author_simple doesn't start with space {}".format( author_simple)) logAssert( author_simple.endswith(' ') is True, "author_simple doesn't end with space {}".format( author_simple)) pprint(dict_res_by_company) # Count results count_abs_empty = 0 count_image_empty = 0 count_articles_tested = 0 count_articles_untested = 0 count_journals_untested = 0 count_redirections = 0 for company in dict_res_by_company: count_abs_empty += dict_res_by_company[company]['count_abs_empty'] count_image_empty += dict_res_by_company[company]['count_image_empty'] count_articles_tested += dict_res_by_company[company][ 'count_articles_tested'] count_articles_untested += dict_res_by_company[company][ 'count_articles_untested'] count_journals_untested += dict_res_by_company[company][ 'count_journals_untested'] count_redirections += dict_res_by_company[company][ 'count_redirections'] l.debug("Number of untested jounals: {} / {}".format( count_journals_untested, len(list_urls_feed))) l.debug("Number of test/untested articles: {} / {}".format( count_articles_tested, count_articles_untested)) l.debug("Number of Empty abstracts: {}".format(count_abs_empty)) l.debug( "Number of Empty graphical_abstracts: {}".format(count_image_empty)) l.debug("Number of redirections: {}".format(count_redirections)) l.debug("Time spent in test_getData: {}".format(datetime.datetime.now() - start_time))
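# --- Hypothetical sketch: logAssert and HEADERS are used by the test above
# --- but are not defined in this section. Assuming logAssert simply logs
# --- the failure message through the module logger before failing the
# --- test, they could look like this (the project's real implementations
# --- may differ):

HEADERS = {'User-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) '
                         'Gecko/20100101 Firefox/21.0'}


def logAssert(test, msg):
    """Log 'msg' before failing, so broken feeds leave a trace in the
    log file as well as in the pytest report"""
    if not test:
        l.error(msg)
        raise AssertionError(msg)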
def run(self):

    """Main function. Starts the real business"""

    self.l.debug("Entering worker")
    self.l.debug(self.url_feed)

    # Get the RSS page of the url provided
    try:
        self.feed = feedparser.parse(self.url_feed)
        self.l.debug("RSS page successfully dled")
    except OSError:
        self.l.error("Too many files open, could not start the thread !")
        return

    # Get the journal name
    try:
        journal = self.feed['feed']['title']
    except KeyError:
        self.l.critical("No title for the journal ! Aborting")
        self.l.critical(self.url_feed)
        return

    self.l.info("{0}: {1}".format(journal, len(self.feed.entries)))

    # Lists to check if the post is in the db, and if
    # it has all the info
    self.session_images = FuturesSession(max_workers=20)

    # Get the company and the journal_abb by scrolling the dictionary
    # containing all the data regarding the journals implemented in the
    # program. This dictionary is built in gui.py, to avoid multiple calls
    # to hosts.getJournals
    # care_image determines if the Worker will try to dl the graphical
    # abstracts
    for key, tuple_data in self.dict_journals.items():
        if journal in tuple_data[0]:
            company = key
            index = tuple_data[0].index(journal)
            journal_abb = tuple_data[1][index]
            care_image = tuple_data[3][index]
            break

    try:
        self.dico_doi = self.listDoi(journal_abb)
    except UnboundLocalError:
        self.l.error("Journal not recognized ! Aborting")
        return

    # List of the companies for which a dl of the article page is not
    # required. All the data are in the rss page
    company_no_dl = ['science', 'elsevier', 'beilstein', 'plos']

    query = QtSql.QSqlQuery(self.bdd)

    self.bdd.transaction()

    # The feeds of these journals are complete
    # if journal in wiley + science + elsevier:
    if company in company_no_dl:

        self.count_futures_urls += len(self.feed.entries)

        for entry in self.feed.entries:

            # Get the DOI, a unique number for a publication
            doi = hosts.getDoi(company, journal, entry)
            url = getattr(entry, 'feedburner_origlink', entry.link)

            # Reject crappy entries: corrigendum, erratum, etc
            if hosts.reject(entry.title):
                title = entry.title
                self.count_futures_images += 1
                self.parent.counter_rejected += 1
                self.l.debug("Rejecting {0}".format(doi))

                # Insert the crappy articles in a rescue database
                if self.parent.debug_mod and doi not in self.dico_doi:
                    query.prepare("INSERT INTO debug (doi, title, \
                                  journal, url) VALUES(?, ?, ?, ?)")
                    params = (doi, title, journal_abb, url)
                    self.l.debug("Inserting {0} in table debug".format(doi))
                    for value in params:
                        query.addBindValue(value)
                    query.exec_()
                else:
                    continue

            # Article complete, skip it
            elif doi in self.dico_doi and self.dico_doi[doi]:
                self.count_futures_images += 1
                self.l.debug("Skipping {}".format(doi))
                continue

            # Article not complete, try to complete it
            elif doi in self.dico_doi and not self.dico_doi[doi]:

                # How to update the entry
                dl_page, dl_image, data = hosts.updateData(company,
                                                           journal,
                                                           entry,
                                                           care_image)

                # For these journals, all the info is in the RSS.
                # Only care about the image
                if dl_image:
                    self.parent.counter_updates += 1

                    graphical_abstract = data['graphical_abstract']

                    if os.path.exists(self.DATA_PATH + functions.simpleChar(
                            graphical_abstract)):
                        self.count_futures_images += 1
                    else:
                        headers = {'User-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                                   'Connection': 'close',
                                   'Referer': url}

                        future_image = self.session_images.get(
                            graphical_abstract, headers=headers,
                            timeout=self.TIMEOUT)
                        future_image.add_done_callback(functools.partial(
                            self.pictureDownloaded, doi, url))
                        self.list_futures.append(future_image)
                else:
                    self.count_futures_images += 1
                    continue

            # New article, treat it
            else:
                try:
                    title, date, authors, abstract, graphical_abstract, \
                        url, topic_simple, author_simple = hosts.getData(
                            company, journal, entry)
                except TypeError:
                    self.l.error("getData returned None for {}".
                                 format(journal))
                    self.count_futures_images += 1
                    return

                # Rejecting article if no author
                if authors == "Empty":
                    self.count_futures_images += 1
                    self.parent.counter_rejected += 1
                    self.l.debug("Rejecting article {}, no author".
                                 format(title))
                    continue

                query.prepare("INSERT INTO papers (doi, title, date, \
                              journal, authors, abstract, \
                              graphical_abstract, url, new, topic_simple, \
                              author_simple) \
                              VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")

                # Set new to 1 and not to True
                params = (doi, title, date, journal_abb, authors, abstract,
                          graphical_abstract, url, 1, topic_simple,
                          author_simple)

                self.l.debug("Adding {0} to the database".format(doi))
                self.parent.counter += 1
                self.new_entries_worker += 1

                for value in params:
                    query.addBindValue(value)
                query.exec_()

                if graphical_abstract == "Empty" or os.path.exists(
                        self.DATA_PATH +
                        functions.simpleChar(graphical_abstract)):
                    self.count_futures_images += 1

                    # This block is executed when you delete the db, but
                    # not the images. Allows to update the
                    # graphical_abstract in db accordingly
                    if os.path.exists(self.DATA_PATH +
                                      functions.simpleChar(
                                          graphical_abstract)):
                        query.prepare("UPDATE papers SET \
                                      graphical_abstract=? WHERE doi=?")
                        params = (functions.simpleChar(graphical_abstract),
                                  doi)
                        for value in params:
                            query.addBindValue(value)
                        query.exec_()
                else:
                    headers = {'User-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                               'Connection': 'close',
                               'Referer': url}

                    future_image = self.session_images.get(
                        graphical_abstract, headers=headers,
                        timeout=self.TIMEOUT)
                    future_image.add_done_callback(functools.partial(
                        self.pictureDownloaded, doi, url))
                    self.list_futures.append(future_image)

    # The company requires downloading the article's web page
    else:

        headers = {'User-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                   'Connection': 'close'}

        self.session_pages = FuturesSession(max_workers=20)

        for entry in self.feed.entries:

            doi = hosts.getDoi(company, journal, entry)

            if company == 'acs':
                url = getattr(entry, 'feedburner_origlink',
                              entry.link).split('/')[-1]
                url = "http://pubs.acs.org/doi/abs/10.1021/" + url

            elif company == 'npg':
                url = getattr(entry, 'feedburner_origlink',
                              entry.link).split('/')[-1]
                url = ("http://www.nature.com/nature/journal/vaop/"
                       "ncurrent/abs/" + url + ".html")
            else:
                url = getattr(entry, 'feedburner_origlink', entry.link)

            # Reject crappy entries: corrigendum, erratum, etc
            if hosts.reject(entry.title):
                title = entry.title
                self.count_futures_images += 1
                self.count_futures_urls += 1
                self.parent.counter_rejected += 1
                self.l.debug("Rejecting {0}".format(doi))

                if self.parent.debug_mod and doi not in self.dico_doi:
                    query.prepare("INSERT INTO debug (doi, title, \
                                  journal, url) VALUES(?, ?, ?, ?)")
                    params = (doi, title, journal_abb, url)
                    for value in params:
                        query.addBindValue(value)
                    query.exec_()
                    self.l.debug("Inserting {0} in table debug".format(doi))
                continue

            # Article complete, skip it
            elif doi in self.dico_doi and self.dico_doi[doi]:
                self.count_futures_images += 1
                self.count_futures_urls += 1
                self.l.debug("Skipping {}".format(doi))
                continue

            # Article not complete, try to complete it
            elif doi in self.dico_doi and not self.dico_doi[doi]:

                dl_page, dl_image, data = hosts.updateData(company,
                                                           journal,
                                                           entry,
                                                           care_image)

                if dl_page:
                    self.parent.counter_updates += 1

                    future = self.session_pages.get(url,
                                                    timeout=self.TIMEOUT,
                                                    headers=headers)
                    future.add_done_callback(functools.partial(
                        self.completeData, doi, company, journal,
                        journal_abb, entry))
                    self.list_futures.append(future)

                    # Continue just to be sure. If dl_page is True,
                    # dl_image is likely True too
                    continue

                elif dl_image:
                    self.parent.counter_updates += 1
                    self.count_futures_urls += 1

                    graphical_abstract = data['graphical_abstract']

                    if os.path.exists(self.DATA_PATH + functions.simpleChar(
                            graphical_abstract)):
                        self.count_futures_images += 1
                    else:
                        headers = {'User-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                                   'Connection': 'close',
                                   'Referer': url}

                        future_image = self.session_images.get(
                            graphical_abstract, headers=headers,
                            timeout=self.TIMEOUT)
                        future_image.add_done_callback(functools.partial(
                            self.pictureDownloaded, doi, url))
                        self.list_futures.append(future_image)
                else:
                    self.count_futures_urls += 1
                    self.count_futures_images += 1
                    continue

            # New article, treat it
            else:
                self.l.debug("Starting adding new entry")

                future = self.session_pages.get(url, timeout=self.TIMEOUT,
                                                headers=headers)
                future.add_done_callback(functools.partial(
                    self.completeData, doi, company, journal, journal_abb,
                    entry))
                self.list_futures.append(future)

    # Check if the counters are full
    while ((self.count_futures_images + self.count_futures_urls) !=
            len(self.feed.entries) * 2 and self.parent.parsing):
        self.sleep(1)

    if self.parent.parsing:
        if not self.bdd.commit():
            self.l.error(self.bdd.lastError().text())
            self.l.debug("db insertions/modifications: {}".
                         format(self.new_entries_worker))
            self.l.error("Problem when committing data for {}".
                         format(journal))

    # Free the memory, and clean the remaining futures
    try:
        self.session_pages.executor.shutdown()
    except AttributeError:
        self.l.error("No session_pages to shut down")
    self.session_images.executor.shutdown()
    self.l.debug("Exiting thread for {}".format(journal))
def completeData(self, doi, company, journal, journal_abb, entry, future):

    """Callback to handle the response of the futures trying to
    download the page of the articles"""

    self.l.debug("Page dled")
    self.count_futures_urls += 1

    if not self.parent.parsing:
        return

    try:
        response = future.result()
    except requests.exceptions.ReadTimeout:
        self.l.error("ReadTimeout for {}".format(journal))
        self.count_futures_images += 1
        return
    except requests.exceptions.ConnectionError:
        self.l.error("ConnectionError for {}".format(journal))
        self.count_futures_images += 1
        return
    except ConnectionResetError:
        self.l.error("ConnectionResetError for {}".format(journal))
        self.count_futures_images += 1
        return
    except socket.timeout:
        self.l.error("socket.timeout for {}".format(journal))
        self.count_futures_images += 1
        return
    except concurrent.futures._base.CancelledError:
        self.l.error("future cancelled for {}".format(journal))
        self.count_futures_images += 1
        return
    except Exception as e:
        self.l.error("Unknown exception {} for {}".format(e, journal))
        self.l.error(traceback.format_exc())
        self.count_futures_images += 1
        return

    query = QtSql.QSqlQuery(self.bdd)

    try:
        title, date, authors, abstract, graphical_abstract, url, \
            topic_simple, author_simple = hosts.getData(company, journal,
                                                        entry, response)
    except TypeError:
        self.l.error("getData returned None for {}".format(journal))
        self.count_futures_images += 1
        return
    except Exception as e:
        self.l.error("Unknown exception completeData {}".format(e))
        self.l.error(traceback.format_exc())
        self.count_futures_images += 1
        return

    # Rejecting the article if no authors
    if authors == "Empty":
        self.count_futures_images += 1
        self.parent.counter_rejected += 1
        self.l.debug("Rejecting article {}, no author".format(title))
        return

    # Check if the DOI is already in the db. Mandatory, bc sometimes
    # updateData will tell the worker to dl the page before downloading
    # the picture
    if doi not in self.dico_doi:
        query.prepare("INSERT INTO papers (doi, title, date, journal, \
                      authors, abstract, graphical_abstract, url, new, \
                      topic_simple, author_simple) VALUES(?, ?, ?, ?, ?, \
                      ?, ?, ?, ?, ?, ?)")
        params = (doi, title, date, journal_abb, authors, abstract,
                  graphical_abstract, url, 1, topic_simple, author_simple)
        self.l.debug("Adding {0} to the database".format(doi))
        self.parent.counter += 1
        for value in params:
            query.addBindValue(value)
        query.exec_()
        self.new_entries_worker += 1

    # Don't try to dl the image if its url is 'Empty', or if the image
    # already exists
    if (graphical_abstract == "Empty" or
            os.path.exists(self.DATA_PATH +
                           functions.simpleChar(graphical_abstract))):
        self.count_futures_images += 1
        self.l.debug("Image already dled or Empty")

        # This block is executed when you delete the db, but not the
        # images. Allows to update the graphical_abstract in db accordingly
        if os.path.exists(self.DATA_PATH +
                          functions.simpleChar(graphical_abstract)):
            query.prepare("UPDATE papers SET graphical_abstract=? WHERE \
                          doi=?")
            params = (functions.simpleChar(graphical_abstract), doi)
            for value in params:
                query.addBindValue(value)
            query.exec_()
    else:
        self.l.debug("Page dled, adding future image")
        headers = {'User-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                   'Connection': 'close',
                   'Referer': url}
        future_image = self.session_images.get(graphical_abstract,
                                               headers=headers,
                                               timeout=self.TIMEOUT)
        future_image.add_done_callback(functools.partial(
            self.pictureDownloaded, doi, url))
        self.list_futures.append(future_image)
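# --- Illustrative sketch (not part of the worker): both versions of run()
# --- and completeData() rely on the same requests_futures pattern. The
# --- session returns a Future immediately, and functools.partial pre-binds
# --- the identifying arguments (doi, url) so the callback receives them
# --- ahead of the Future, which is always passed last. The names and URL
# --- below are placeholders.

import functools

from requests_futures.sessions import FuturesSession


def on_done(doi, url, future):
    """Callback: the pre-bound doi/url arrive first, the Future last"""
    try:
        response = future.result()
        print(doi, url, response.status_code)
    except Exception as e:
        print(doi, "failed:", e)


session = FuturesSession(max_workers=20)
future = session.get("https://example.com/article", timeout=10)
future.add_done_callback(functools.partial(on_done, "10.1000/xyz123",
                                           "https://example.com/article"))
# shutdown() blocks until the pending futures have completed
session.executor.shutdown()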
def run(self):

    """Main function. Starts the real business"""

    self.l.debug("Entering worker")

    feed = self._getFeed(timeout=self.TIMEOUT)

    if feed is None:
        self.l.error("Exiting worker, problem w/ the feed")
        self.parent.list_failed_rss.append(self.url_feed)
        return

    # Get the journal name
    journal = feed['feed']['title']

    self.l.info("{}: {}".format(journal, len(feed.entries)))

    # Lists to check if the post is in the db, and if
    # it has all the info
    self.session_images = FuturesSession(
        max_workers=self.MAX_WORKERS, session=self.parent.browsing_session)

    # Get the company and the journal_abb by scrolling the dictionary
    # containing all the data regarding the journals implemented in the
    # program. This dictionary is built in gui.py, to avoid multiple calls
    # to hosts.getJournals
    # care_image determines if the Worker will try to dl the graphical
    # abstracts
    for key, tuple_data in self.dict_journals.items():
        if journal in tuple_data[0]:
            company = key
            index = tuple_data[0].index(journal)
            journal_abb = tuple_data[1][index]
            care_image = tuple_data[3][index]
            break

    try:
        self.dico_doi = self.listDoi(journal_abb)
    except UnboundLocalError:
        self.l.error("Journal not recognized ! Aborting")
        self.parent.list_failed_rss.append(self.url_feed)
        return

    # List of the companies for which a dl of the article page is not
    # required. All the data are in the rss page
    company_no_dl = [
        'Science', 'Elsevier', 'Beilstein', 'PLOS', 'ChemRxiv', 'Wiley'
    ]

    query = QtSql.QSqlQuery(self.bdd)

    self.bdd.transaction()

    # The feeds of these journals are complete
    if company in company_no_dl:

        self.counter_futures_urls += len(feed.entries)

        for entry in feed.entries:

            # Get the DOI, a unique number for a publication
            try:
                doi = hosts.getDoi(company, journal, entry)
            except Exception as e:
                self.l.error("getDoi failed for: {}".format(journal),
                             exc_info=True)
                self.counter_futures_urls += 1
                continue

            try:
                url = hosts.refineUrl(company, journal, entry)
            except Exception as e:
                self.l.error("refineUrl failed for: {}".format(journal),
                             exc_info=True)
                self.counter_futures_urls += 1
                continue

            # Reject crappy entries: corrigendum, erratum, etc
            if hosts.reject(entry.title):
                title = entry.title
                self.counter_futures_images += 1
                self.parent.counter_rejected += 1
                self.l.debug("Rejecting {0}".format(doi))

                # Insert the crappy articles in a rescue database
                if self.parent.debug_mod and doi not in self.dico_doi:
                    query.prepare("INSERT INTO debug (doi, title, \
                                  journal, url) VALUES(?, ?, ?, ?)")
                    params = (doi, title, journal_abb, url)
                    self.l.debug("Inserting {0} in table debug".format(doi))
                    for value in params:
                        query.addBindValue(value)
                    query.exec_()
                else:
                    continue

            # Article complete, skip it
            elif doi in self.dico_doi and self.dico_doi[doi]:
                self.counter_futures_images += 1
                self.l.debug("Article complete, skipping {}".format(doi))
                continue

            # Article not complete, try to complete it
            elif doi in self.dico_doi and not self.dico_doi[doi]:

                self.l.debug("Trying to update {}".format(doi))

                # How to update the entry
                dl_page, dl_image, data = hosts.updateData(company,
                                                           journal,
                                                           entry,
                                                           care_image)

                # For these journals, all the info is in the RSS.
                # Only care about the image
                if dl_image:
                    self.parent.counter_updates += 1

                    graphical_abstract = data['graphical_abstract']

                    if os.path.exists(
                            self.PATH +
                            functions.simpleChar(graphical_abstract)):
                        self.counter_futures_images += 1
                    else:
                        headers = {
                            'User-agent':
                            'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                            'Connection': 'close',
                            'Referer': url
                        }

                        future_image = self.session_images.get(
                            graphical_abstract, headers=headers,
                            timeout=self.TIMEOUT)
                        future_image.add_done_callback(
                            functools.partial(self.pictureDownloaded,
                                              doi, url))
                        self.list_futures.append(future_image)
                else:
                    self.counter_futures_images += 1
                    continue

            # New article, treat it
            else:
                try:
                    title, date, authors, abstract, graphical_abstract, \
                        url, topic_simple, author_simple = hosts.getData(
                            company, journal, entry)
                except Exception as e:
                    self.l.error("Problem with getData: {}".format(journal),
                                 exc_info=True)
                    self.counter_futures_images += 1
                    self.parent.counter_articles_failed += 1
                    return

                # Rejecting article if no author
                if authors == "Empty":
                    self.counter_futures_images += 1
                    self.parent.counter_rejected += 1
                    self.l.debug("Rejecting article {}, no author".
                                 format(title))
                    continue

                query.prepare("INSERT INTO papers (doi, title, date, \
                              journal, authors, abstract, \
                              graphical_abstract, url, new, topic_simple, \
                              author_simple) \
                              VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")

                # Set new to 1 and not to True
                params = (doi, title, date, journal_abb, authors, abstract,
                          graphical_abstract, url, 1, topic_simple,
                          author_simple)

                for value in params:
                    query.addBindValue(value)

                # Test that query worked
                if not query.exec_():
                    self.l.error("SQL ERROR in run(): {}, company_no_dl".
                                 format(query.lastError().text()))
                    self.parent.counter_articles_failed += 1
                    continue
                else:
                    self.l.debug("{} added to the database".format(doi))
                    self.new_entries_worker += 1
                    self.parent.counter_added += 1

                # If the article has no graphical abstract or if it has
                # already been dled
                if graphical_abstract == "Empty" or os.path.exists(
                        self.PATH +
                        functions.simpleChar(graphical_abstract)):
                    self.counter_futures_images += 1

                    # This block is executed when you delete the db, but
                    # not the images. Allows to update the
                    # graphical_abstract in db accordingly
                    if os.path.exists(
                            self.PATH +
                            functions.simpleChar(graphical_abstract)):
                        query.prepare("UPDATE papers SET \
                                      graphical_abstract=? WHERE doi=?")
                        params = (functions.simpleChar(graphical_abstract),
                                  doi)
                        for value in params:
                            query.addBindValue(value)
                        query.exec_()
                else:
                    headers = {
                        'User-agent':
                        'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                        'Connection': 'close',
                        'Referer': url
                    }

                    future_image = self.session_images.get(
                        graphical_abstract, headers=headers,
                        timeout=self.TIMEOUT)
                    future_image.add_done_callback(
                        functools.partial(self.pictureDownloaded, doi, url))
                    self.list_futures.append(future_image)

    # The company requires downloading the article's web page
    else:

        headers = {
            'User-agent':
            'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
            'Connection': 'close'
        }

        self.session_pages = FuturesSession(
            max_workers=self.MAX_WORKERS,
            session=self.parent.browsing_session)

        for entry in feed.entries:

            # Get the DOI, a unique number for a publication
            try:
                doi = hosts.getDoi(company, journal, entry)
            except Exception as e:
                self.l.error("getDoi failed for: {}".format(journal),
                             exc_info=True)
                self.counter_futures_urls += 1
                self.counter_futures_images += 1
                continue

            # Try to refine the url
            try:
                url = hosts.refineUrl(company, journal, entry)
            except Exception as e:
                self.l.error("refineUrl failed for: {}".format(journal),
                             exc_info=True)
                self.counter_futures_urls += 1
                self.counter_futures_images += 1
                continue

            # Make sure the entry has a title
            try:
                title = entry.title
            except AttributeError:
                self.l.error("No title for {}".format(doi), exc_info=True)
                self.counter_futures_urls += 1
                self.counter_futures_images += 1
                continue

            # Reject crappy entries: corrigendum, erratum, etc
            if hosts.reject(title):
                self.counter_futures_images += 1
                self.counter_futures_urls += 1
                self.parent.counter_rejected += 1
                self.l.debug("Rejecting {0}".format(doi))

                if self.parent.debug_mod and doi not in self.dico_doi:
                    query.prepare("INSERT INTO debug (doi, title, \
                                  journal, url) VALUES(?, ?, ?, ?)")
                    params = (doi, title, journal_abb, url)
                    for value in params:
                        query.addBindValue(value)
                    query.exec_()
                    self.l.debug("Inserting {0} in table debug".format(doi))
                continue

            # Article complete, skip it
            elif doi in self.dico_doi and self.dico_doi[doi]:
                self.counter_futures_images += 1
                self.counter_futures_urls += 1
                self.l.debug("Article complete, skipping {}".format(doi))
                continue

            # Article not complete, try to complete it
            elif doi in self.dico_doi and not self.dico_doi[doi]:

                url = hosts.refineUrl(company, journal, entry)

                dl_page, dl_image, data = hosts.updateData(company,
                                                           journal,
                                                           entry,
                                                           care_image)

                if dl_page:
                    self.parent.counter_updates += 1

                    future = self.session_pages.get(url,
                                                    timeout=self.TIMEOUT,
                                                    headers=headers)
                    future.add_done_callback(
                        functools.partial(self.completeData, doi, company,
                                          journal, journal_abb, entry))
                    self.list_futures.append(future)

                    # Continue just to be sure. If dl_page is True,
                    # dl_image is likely True too
                    continue

                elif dl_image:
                    self.parent.counter_updates += 1
                    self.counter_futures_urls += 1

                    graphical_abstract = data['graphical_abstract']

                    if os.path.exists(
                            self.PATH +
                            functions.simpleChar(graphical_abstract)):
                        self.counter_futures_images += 1
                    else:
                        headers = {
                            'User-agent':
                            'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                            'Connection': 'close',
                            'Referer': url
                        }

                        future_image = self.session_images.get(
                            graphical_abstract, headers=headers,
                            timeout=self.TIMEOUT)
                        future_image.add_done_callback(
                            functools.partial(self.pictureDownloaded,
                                              doi, url))
                        self.list_futures.append(future_image)
                else:
                    self.counter_futures_urls += 1
                    self.counter_futures_images += 1
                    continue

            # New article, treat it
            else:
                url = hosts.refineUrl(company, journal, entry)
                self.l.debug("Starting adding new entry")

                future = self.session_pages.get(url, timeout=self.TIMEOUT,
                                                headers=headers)
                future.add_done_callback(
                    functools.partial(self.completeData, doi, company,
                                      journal, journal_abb, entry))
                self.list_futures.append(future)

    # Check if the counters are full
    while ((self.counter_futures_images + self.counter_futures_urls) !=
            len(feed.entries) * 2 and self.parent.parsing):
        # QThread.sleep() only takes whole seconds; msleep() allows 0.5 s
        self.msleep(500)

    if self.parent.parsing:
        if not self.bdd.commit():
            self.l.error(self.bdd.lastError().text())
            self.l.debug("db insertions/modifications: {}".format(
                self.new_entries_worker))
            self.l.error("Problem when committing data for {}".
                         format(journal))

    # Free the memory, and clean the remaining futures
    try:
        self.session_pages.executor.shutdown()
    except AttributeError:
        self.l.error("No session_pages to shut down")
    self.session_images.executor.shutdown()
    self.l.debug("Exiting thread for {}".format(journal))
def completeData(self, doi, company, journal, journal_abb, entry, future):

    """Callback to handle the response of the futures trying to
    download the page of the articles"""

    self.l.debug("Page dled")
    self.counter_futures_urls += 1

    if not self.parent.parsing:
        return

    try:
        response = future.result()
    except (requests.exceptions.ReadTimeout,
            requests.exceptions.ConnectionError, ConnectionResetError,
            socket.timeout, concurrent.futures._base.CancelledError) as e:
        self.l.error("{} raised for {}. Handled".format(e, journal))
        self.counter_futures_images += 1
        self.parent.counter_articles_failed += 1
        return
    except Exception as e:
        self.l.error("Unknown exception {} for {}".format(e, journal),
                     exc_info=True)
        self.counter_futures_images += 1
        self.parent.counter_articles_failed += 1
        return

    query = QtSql.QSqlQuery(self.bdd)

    try:
        title, date, authors, abstract, graphical_abstract, url, \
            topic_simple, author_simple = hosts.getData(company, journal,
                                                        entry, response)
    except TypeError:
        self.l.error("getData returned None for {}".format(journal),
                     exc_info=True)
        self.counter_futures_images += 1
        self.parent.counter_articles_failed += 1
        return
    except Exception as e:
        self.l.error("Unknown exception completeData {}".format(e),
                     exc_info=True)
        self.counter_futures_images += 1
        self.parent.counter_articles_failed += 1
        return

    # Rejecting the article if no authors
    if authors == "Empty":
        self.counter_futures_images += 1
        self.parent.counter_rejected += 1
        self.l.debug("Rejecting article {}, no author".format(title))
        return

    # Check if the DOI is already in the db. Mandatory, bc sometimes
    # updateData will tell the worker to dl the page before downloading
    # the picture
    if doi not in self.dico_doi:
        query.prepare("INSERT INTO papers (doi, title, date, journal, \
                      authors, abstract, graphical_abstract, url, new, \
                      topic_simple, author_simple) VALUES(?, \
                      ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")
        params = (doi, title, date, journal_abb, authors, abstract,
                  graphical_abstract, url, 1, topic_simple, author_simple)

        self.l.debug("Adding {} to the database".format(doi))
        self.parent.counter_added += 1

        for value in params:
            query.addBindValue(value)

        # Test that query worked
        if not query.exec_():
            self.l.error("SQL ERROR in completeData(): {}".format(
                query.lastError().text()))
            self.parent.counter_articles_failed += 1
            return
        else:
            self.new_entries_worker += 1

    # Don't try to dl the image if its url is 'Empty', or if the image
    # already exists
    if (graphical_abstract == "Empty" or
            os.path.exists(self.PATH +
                           functions.simpleChar(graphical_abstract))):
        self.counter_futures_images += 1
        self.l.debug("Image already dled or Empty")

        # This block is executed when you delete the db, but not the
        # images. Allows to update the graphical_abstract in db accordingly
        if os.path.exists(self.PATH +
                          functions.simpleChar(graphical_abstract)):
            query.prepare("UPDATE papers SET graphical_abstract=? WHERE \
                          doi=?")
            params = (functions.simpleChar(graphical_abstract), doi)
            for value in params:
                query.addBindValue(value)
            query.exec_()
    else:
        self.l.debug("Page dled, adding future image")
        headers = {
            'User-agent':
            'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
            'Connection': 'close',
            'Referer': url
        }
        future_image = self.session_images.get(graphical_abstract,
                                               headers=headers,
                                               timeout=self.TIMEOUT)
        future_image.add_done_callback(
            functools.partial(self.pictureDownloaded, doi, url))
        self.list_futures.append(future_image)
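# --- Illustrative sketch (assumptions: a PyQt5 binding and an already-open
# --- QSqlDatabase connection named 'bdd'; the project may bind Qt
# --- differently): every INSERT/UPDATE above uses the same QSqlQuery
# --- prepared-statement pattern, with positional '?' placeholders filled
# --- in order by addBindValue(). A stripped-down version:

from PyQt5 import QtSql


def insert_paper(bdd, doi, title):
    """Insert one row, reporting the SQL error on failure"""
    query = QtSql.QSqlQuery(bdd)
    query.prepare("INSERT INTO papers (doi, title) VALUES(?, ?)")
    for value in (doi, title):
        query.addBindValue(value)
    # exec_() returns False on failure; lastError() carries the reason
    if not query.exec_():
        print(query.lastError().text())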