def test_getDoi(journalsUrls):
    """Tests if the function getDoi gets the DOI correctly.

    For every RSS feed URL in the ``journalsUrls`` fixture: parse the
    feed, resolve the publisher, then check that ``hosts.getDoi`` returns
    a well-formed DOI string for a random sample of entries.
    """

    l.info("Function getDoi")

    start_time = datetime.datetime.now()

    list_sites = journalsUrls

    # Build a dic with key: company
    # value: journal names
    dict_journals = {}
    for company in hosts.getCompanies():
        dict_journals[company] = hosts.getJournals(company)[0]

    for site in list_sites:
        try:
            # Bug fix: feedparser.parse() accepts no 'timeout' kwarg.
            # The old call raised TypeError, which the broad except below
            # swallowed, so every site was skipped and the test was vacuous
            feed = feedparser.parse(site)
            l.debug("RSS page successfully dled")
        except Exception as e:
            l.error("RSS page could not be downloaded: {}".format(e),
                    exc_info=True)
            continue

        # Get the journal title; fail the test explicitly if absent.
        # (Previously the title was also read inside the try above, so this
        # dedicated KeyError handler was unreachable)
        try:
            journal = feed['feed']['title']
        except KeyError:
            l.error("Failed to get title for: {}".format(site))
            pytest.fail("Failed to get title for: {}".format(site))

        # Get the company name for this journal
        for publisher, data in dict_journals.items():
            if journal in data:
                company = publisher

        l.info("{}: {}".format(site, len(feed.entries)))

        if len(feed.entries) < LENGTH_SAMPLE:
            samples = feed.entries
        else:
            samples = random.sample(feed.entries, LENGTH_SAMPLE)

        # Tests LENGTH_SAMPLE entries for a journal, not all of them
        for entry in samples:
            doi = hosts.getDoi(company, journal, entry)
            l.info(doi)

            # Bug fix: the original condition used 'or', which let any
            # string pass even when it was not a DOI. A valid DOI here is
            # a str AND starts with '10.1'
            logAssert(isinstance(doi, str) and doi.startswith('10.1'),
                      "DOI is not a string or is not a DOI {}".format(doi))

    l.debug("Time spent in test_getDoi: {}".format(
        datetime.datetime.now() - start_time))
def test_getDoi(journalsUrls):
    """Tests if the function getDoi gets the DOI correctly.

    Simpler variant: parses each feed of the ``journalsUrls`` fixture and
    asserts ``hosts.getDoi`` returns a string for sampled entries.
    """

    print("\n")
    print("Starting test getDoi")

    list_sites = journalsUrls

    # Build a dic with key: company
    # value: journal names
    # Company names are derived from the files in the ./journals directory
    dict_journals = {}
    for company in os.listdir("journals"):
        company = company.split(".")[0]
        dict_journals[company] = hosts.getJournals(company)[0]

    for site in list_sites:
        feed = feedparser.parse(site)
        journal = feed["feed"]["title"]

        # Get the company name publishing this journal
        for publisher, data in dict_journals.items():
            if journal in data:
                company = publisher

        print("{}: {}".format(site, len(feed.entries)))

        if len(feed.entries) < LENGTH_SAMPLE:
            samples = feed.entries
        else:
            samples = random.sample(feed.entries, LENGTH_SAMPLE)

        # Tests LENGTH_SAMPLE entries for a journal, not all of them
        for entry in samples:
            doi = hosts.getDoi(company, journal, entry)
            print(doi)

            # Idiom fix: isinstance() instead of type(...) == str
            assert isinstance(doi, str)
def _checkIsFeed(self, url: str, company: str,
                 feed: feedparser.util.FeedParserDict) -> bool:
    """Return True if 'feed' looks like a valid, exploitable RSS feed.

    A feed is considered valid when it has a title and at least 3 of its
    entries yield both a DOI and a well-formed URL.

    Args:
        url: address of the RSS page (used for logging only)
        company: publisher identifier, forwarded to the hosts module
        feed: result of feedparser.parse() for 'url'

    Returns:
        bool: True if the feed is considered valid, False otherwise
    """
    self.l.debug("Entering _checkIsFeed")

    # Check if the feed has a title
    try:
        journal = feed['feed']['title']
    except Exception:
        self.l.critical("verifyInput, can't access title {}".format(url),
                        exc_info=True)
        return False

    nbr_ok = 0
    for entry in feed.entries:
        try:
            doi = hosts.getDoi(company, journal, entry)
            # Use a dedicated name: the original clobbered the 'url'
            # parameter here, corrupting subsequent log messages
            refined_url = hosts.refineUrl(company, journal, entry)
            self.l.debug("{}, {}".format(doi, refined_url))
        except Exception:
            # Bug fix: the message previously lacked a '{}' placeholder,
            # so .format(url) was a no-op and the feed url never appeared
            self.l.error(
                "verifyInput, entry has no doi or no url: {}".format(url),
                exc_info=True)
            continue

        # Check if DOI and URL could be obtained. Some publishers put a
        # full URL in the DOI field, hence the second alternative
        if (doi.startswith('10.') and validators.url(refined_url) or
                validators.url(doi) and validators.url(refined_url)):
            nbr_ok += 1

            # If 3 entries are OK, the feed is considered valid
            if nbr_ok == 3:
                self.l.debug("3 entries ok, valid feed")
                return True

    # Fewer than 3 usable entries: the feed is NOT considered valid
    return False
def run(self):
    """Main function. Starts the real business.

    Worker entry point (presumably runs in its own QThread — confirm).
    Downloads the RSS page of self.url_feed, identifies the publisher,
    then for each entry either skips it (already complete in db), updates
    it, or inserts it, scheduling asynchronous downloads of article pages
    and graphical abstracts where needed.
    """

    self.l.debug("Entering worker")
    self.l.debug(self.url_feed)

    # Get the RSS page of the url provided
    try:
        self.feed = feedparser.parse(self.url_feed)
        self.l.debug("RSS page successfully dled")
    except OSError:
        # Raised when the process runs out of file descriptors
        self.l.error("Too many files open, could not start the thread !")
        return

    # Get the journal name
    try:
        journal = self.feed['feed']['title']
    except KeyError:
        self.l.critical("No title for the journal ! Aborting")
        self.l.critical(self.url_feed)
        return

    self.l.info("{0}: {1}".format(journal, len(self.feed.entries)))

    # Lists to check if the post is in the db, and if
    # it has all the infos
    self.session_images = FuturesSession(max_workers=20)

    # Get the company and the journal_abb by scrolling the dictionnary
    # containing all the data regarding the journals implemented in the
    # program. This dictionnary is built in gui.py, to avoid multiple calls
    # to hosts.getJournals
    # care_image determines if the Worker will try to dl the graphical
    # abstracts
    for key, tuple_data in self.dict_journals.items():
        if journal in tuple_data[0]:
            company = key
            index = tuple_data[0].index(journal)
            journal_abb = tuple_data[1][index]
            care_image = tuple_data[3][index]
            break

    # If the journal was not matched above, journal_abb is unbound and the
    # UnboundLocalError below aborts the worker cleanly
    try:
        self.dico_doi = self.listDoi(journal_abb)
    except UnboundLocalError:
        self.l.error("Journal not recognized ! Aborting")
        return

    # Create a list for the journals which a dl of the article
    # page is not required. All the data are in the rss page
    company_no_dl = ['science', 'elsevier', 'beilstein', 'plos']

    query = QtSql.QSqlQuery(self.bdd)

    # Single transaction for all inserts; committed at the end
    self.bdd.transaction()

    # The feeds of these journals are complete
    if company in company_no_dl:

        # No article page to download: credit all url futures up front
        self.count_futures_urls += len(self.feed.entries)

        for entry in self.feed.entries:

            # Get the DOI, a unique number for a publication
            doi = hosts.getDoi(company, journal, entry)
            url = getattr(entry, 'feedburner_origlink', entry.link)

            # Reject crappy entries: corrigendum, erratum, etc
            if hosts.reject(entry.title):
                title = entry.title
                self.count_futures_images += 1
                self.parent.counter_rejected += 1
                self.l.debug("Rejecting {0}".format(doi))

                # Insert the crappy articles in a rescue database
                if self.parent.debug_mod and doi not in self.dico_doi:
                    query.prepare("INSERT INTO debug (doi, title, \
                                  journal, url) VALUES(?, ?, ?, ?)")
                    params = (doi, title, journal_abb, url)
                    self.l.debug("Inserting {0} in table debug".format(doi))
                    for value in params:
                        query.addBindValue(value)
                    query.exec_()
                else:
                    continue

            # Artice complete, skip it
            elif doi in self.dico_doi and self.dico_doi[doi]:
                self.count_futures_images += 1
                self.l.debug("Skipping {}".format(doi))
                continue

            # Artice not complete, try to complete it
            elif doi in self.dico_doi and not self.dico_doi[doi]:

                # How to update the entry
                dl_page, dl_image, data = hosts.updateData(company, journal,
                                                           entry, care_image)

                # For these journals, all the infos are in the RSS.
                # Only care about the image
                if dl_image:
                    self.parent.counter_updates += 1

                    graphical_abstract = data['graphical_abstract']

                    # Image already on disk: nothing to download
                    if os.path.exists(self.DATA_PATH + functions.simpleChar(
                            graphical_abstract)):
                        self.count_futures_images += 1
                    else:
                        headers = {'User-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                                   'Connection': 'close',
                                   'Referer': url}

                        # Async download; pictureDownloaded bumps the counter
                        future_image = self.session_images.get(
                            graphical_abstract, headers=headers,
                            timeout=self.TIMEOUT)
                        future_image.add_done_callback(functools.partial(
                            self.pictureDownloaded, doi, url))
                        self.list_futures.append(future_image)
                else:
                    self.count_futures_images += 1
                    continue

            # New article: extract everything from the RSS entry itself
            else:
                try:
                    title, date, authors, abstract, graphical_abstract, url, topic_simple, author_simple = hosts.getData(company, journal, entry)
                except TypeError:
                    self.l.error("getData returned None for {}".
                                 format(journal))
                    self.count_futures_images += 1
                    return

                # Rejecting article if no author
                if authors == "Empty":
                    self.count_futures_images += 1
                    self.parent.counter_rejected += 1
                    self.l.debug("Rejecting article {}, no author".
                                 format(title))
                    continue

                query.prepare("INSERT INTO papers (doi, title, date, \
                              journal, authors, abstract, \
                              graphical_abstract, url, new, topic_simple, \
                              author_simple) \
                              VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")

                # Set new to 1 and not to true
                params = (doi, title, date, journal_abb, authors, abstract,
                          graphical_abstract, url, 1, topic_simple,
                          author_simple)

                self.l.debug("Adding {0} to the database".format(doi))
                self.parent.counter += 1
                self.new_entries_worker += 1

                for value in params:
                    query.addBindValue(value)
                query.exec_()

                # No image to fetch, or image already downloaded
                if graphical_abstract == "Empty" or os.path.exists(
                        self.DATA_PATH +
                        functions.simpleChar(graphical_abstract)):
                    self.count_futures_images += 1

                    # This block is executed when you delete the db, but
                    # not the images. Allows to update the
                    # graphical_abstract in db accordingly
                    if os.path.exists(self.DATA_PATH + functions.simpleChar(
                            graphical_abstract)):
                        query.prepare("UPDATE papers SET \
                                      graphical_abstract=? WHERE doi=?")
                        params = (functions.simpleChar(graphical_abstract),
                                  doi)
                        for value in params:
                            query.addBindValue(value)
                        query.exec_()
                else:
                    headers = {'User-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                               'Connection': 'close',
                               'Referer': url}

                    future_image = self.session_images.get(
                        graphical_abstract, headers=headers,
                        timeout=self.TIMEOUT)
                    future_image.add_done_callback(
                        functools.partial(self.pictureDownloaded, doi, url))
                    self.list_futures.append(future_image)

    # For every other company, the article's web page must be downloaded
    # to complete the data
    else:
        headers = {'User-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                   'Connection': 'close'}

        self.session_pages = FuturesSession(max_workers=20)

        for entry in self.feed.entries:

            doi = hosts.getDoi(company, journal, entry)

            # Build the article url; acs and npg need special handling
            if company == 'acs':
                url = getattr(entry, 'feedburner_origlink',
                              entry.link).split('/')[-1]
                url = "http://pubs.acs.org/doi/abs/10.1021/" + url
            elif company == 'npg':
                url = getattr(entry, 'feedburner_origlink',
                              entry.link).split('/')[-1]
                url = "http://www.nature.com/nature/journal/vaop/ncurrent/abs/" + url + ".html"
            else:
                url = getattr(entry, 'feedburner_origlink', entry.link)

            # Reject crappy entries: corrigendum, erratum, etc
            if hosts.reject(entry.title):
                title = entry.title
                self.count_futures_images += 1
                self.count_futures_urls += 1
                self.parent.counter_rejected += 1
                self.l.debug("Rejecting {0}".format(doi))

                # Insert the crappy articles in a rescue database
                if self.parent.debug_mod and doi not in self.dico_doi:
                    query.prepare("INSERT INTO debug (doi, title, \
                                  journal, url) VALUES(?, ?, ?, ?)")
                    params = (doi, title, journal_abb, url)
                    for value in params:
                        query.addBindValue(value)
                    query.exec_()
                    self.l.debug("Inserting {0} in table debug".format(doi))
                continue

            # Article complete, skip it
            elif doi in self.dico_doi and self.dico_doi[doi]:
                self.count_futures_images += 1
                self.count_futures_urls += 1
                self.l.debug("Skipping {}".format(doi))
                continue

            # Article not complete, try to complete it
            elif doi in self.dico_doi and not self.dico_doi[doi]:
                dl_page, dl_image, data = hosts.updateData(company, journal,
                                                           entry, care_image)

                if dl_page:
                    self.parent.counter_updates += 1

                    # completeData is called when the page is downloaded
                    future = self.session_pages.get(url,
                                                    timeout=self.TIMEOUT,
                                                    headers=headers)
                    future.add_done_callback(functools.partial(
                        self.completeData, doi, company, journal,
                        journal_abb, entry))
                    self.list_futures.append(future)

                    # Continue just to be sure. If dl_page is True,
                    # dl_image is likely True too
                    continue

                elif dl_image:
                    self.parent.counter_updates += 1
                    self.count_futures_urls += 1

                    graphical_abstract = data['graphical_abstract']

                    if os.path.exists(self.DATA_PATH + functions.simpleChar(
                            graphical_abstract)):
                        self.count_futures_images += 1
                    else:
                        headers = {'User-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                                   'Connection': 'close',
                                   'Referer': url}

                        future_image = self.session_images.get(
                            graphical_abstract, headers=headers,
                            timeout=self.TIMEOUT)
                        future_image.add_done_callback(functools.partial(
                            self.pictureDownloaded, doi, url))
                        self.list_futures.append(future_image)
                else:
                    self.count_futures_urls += 1
                    self.count_futures_images += 1
                    continue

            # New article: download its page to get the full data
            else:
                self.l.debug("Starting adding new entry")

                future = self.session_pages.get(url, timeout=self.TIMEOUT,
                                                headers=headers)
                future.add_done_callback(functools.partial(
                    self.completeData, doi, company, journal, journal_abb,
                    entry))
                self.list_futures.append(future)

    # Check if the counters are full. Each entry must be accounted for
    # twice (url + image) before the worker may finish; self.sleep is
    # presumably QThread.sleep (seconds) — confirm
    while ((self.count_futures_images + self.count_futures_urls) !=
            len(self.feed.entries) * 2 and self.parent.parsing):
        self.sleep(1)

    if self.parent.parsing:
        if not self.bdd.commit():
            self.l.error(self.bdd.lastError().text())
            # NOTE(review): this debug line logs inside the failed-commit
            # branch — it may have been intended for the success path;
            # confirm against version history
            self.l.debug("db insertions/modifications: {}".
                         format(self.new_entries_worker))
            self.l.error("Problem when comitting data for {}".
                         format(journal))

    # Free the memory, and clean the remaining futures
    try:
        self.session_pages.executor.shutdown()
    except AttributeError:
        # session_pages only exists for companies requiring page downloads
        self.l.error("No session_pages to shut down")
    self.session_images.executor.shutdown()
    self.l.debug("Exiting thread for {}".format(journal))
def run(self):
    """Main function. Starts the real business.

    Worker entry point (presumably runs in its own QThread — confirm).
    Fetches the RSS feed via self._getFeed, identifies the publisher,
    then for each entry either skips it (already complete in db), updates
    it, or inserts it, scheduling asynchronous downloads of article pages
    and graphical abstracts where needed.
    """

    self.l.debug("Entering worker")

    # _getFeed handles the download and parsing; returns None on failure
    feed = self._getFeed(timeout=self.TIMEOUT)

    if feed is None:
        self.l.error("Exiting worker, problem w/ the feed")
        self.parent.list_failed_rss.append(self.url_feed)
        return

    # Get the journal name
    journal = feed['feed']['title']

    self.l.info("{}: {}".format(journal, len(feed.entries)))

    # Lists to check if the post is in the db, and if
    # it has all the info
    self.session_images = FuturesSession(
        max_workers=self.MAX_WORKERS, session=self.parent.browsing_session)

    # Get the company and the journal_abb by scrolling the dictionary
    # containing all the data regarding the journals implemented in the
    # program. This dictionary is built in gui.py, to avoid multiple calls
    # to hosts.getJournals
    # care_image determines if the Worker will try to dl the graphical
    # abstracts
    for key, tuple_data in self.dict_journals.items():
        if journal in tuple_data[0]:
            company = key
            index = tuple_data[0].index(journal)
            journal_abb = tuple_data[1][index]
            care_image = tuple_data[3][index]
            break

    # If the journal was not matched above, journal_abb is unbound and the
    # UnboundLocalError below aborts the worker cleanly
    try:
        self.dico_doi = self.listDoi(journal_abb)
    except UnboundLocalError:
        self.l.error("Journal not recognized ! Aborting")
        self.parent.list_failed_rss.append(self.url_feed)
        return

    # Create a list for the journals which a dl of the article
    # page is not required. All the data are in the rss page
    company_no_dl = [
        'Science', 'Elsevier', 'Beilstein', 'PLOS', 'ChemArxiv', 'Wiley'
    ]

    query = QtSql.QSqlQuery(self.bdd)

    # Single transaction for all inserts; committed at the end
    self.bdd.transaction()

    # The feeds of these journals are complete
    if company in company_no_dl:

        # No article page to download: credit all url futures up front
        self.counter_futures_urls += len(feed.entries)

        for entry in feed.entries:

            # Get the DOI, a unique number for a publication
            try:
                doi = hosts.getDoi(company, journal, entry)
            except Exception:
                self.l.error("getDoi failed for: {}".format(journal),
                             exc_info=True)
                self.counter_futures_urls += 1
                continue

            try:
                url = hosts.refineUrl(company, journal, entry)
            except Exception:
                self.l.error("refineUrl failed for: {}".format(journal),
                             exc_info=True)
                self.counter_futures_urls += 1
                continue

            # Reject crappy entries: corrigendum, erratum, etc
            if hosts.reject(entry.title):
                title = entry.title
                self.counter_futures_images += 1
                self.parent.counter_rejected += 1
                self.l.debug("Rejecting {0}".format(doi))

                # Insert the crappy articles in a rescue database
                if self.parent.debug_mod and doi not in self.dico_doi:
                    query.prepare("INSERT INTO debug (doi, title, \
                                  journal, url) VALUES(?, ?, ?, ?)")
                    params = (doi, title, journal_abb, url)
                    self.l.debug(
                        "Inserting {0} in table debug".format(doi))
                    for value in params:
                        query.addBindValue(value)
                    query.exec_()
                else:
                    continue

            # Artice complete, skip it
            elif doi in self.dico_doi and self.dico_doi[doi]:
                self.counter_futures_images += 1
                self.l.debug("Article complete, skipping {}".format(doi))
                continue

            # Artice not complete, try to complete it
            elif doi in self.dico_doi and not self.dico_doi[doi]:
                self.l.debug("Trying to update {}".format(doi))

                # How to update the entry
                dl_page, dl_image, data = hosts.updateData(
                    company, journal, entry, care_image)

                # For these journals, all the infos are in the RSS.
                # Only care about the image
                if dl_image:
                    self.parent.counter_updates += 1

                    graphical_abstract = data['graphical_abstract']

                    # Image already on disk: nothing to download
                    if os.path.exists(
                            self.PATH +
                            functions.simpleChar(graphical_abstract)):
                        self.counter_futures_images += 1
                    else:
                        headers = {
                            'User-agent':
                            'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                            'Connection': 'close',
                            'Referer': url
                        }

                        # Async download; pictureDownloaded bumps the counter
                        future_image = self.session_images.get(
                            graphical_abstract,
                            headers=headers,
                            timeout=self.TIMEOUT)
                        future_image.add_done_callback(
                            functools.partial(self.pictureDownloaded, doi,
                                              url))
                        self.list_futures.append(future_image)
                else:
                    self.counter_futures_images += 1
                    continue

            # New article, treat it
            else:
                try:
                    title, date, authors, abstract, graphical_abstract, url, topic_simple, author_simple = hosts.getData(
                        company, journal, entry)
                except Exception:
                    self.l.error(
                        "Problem with getData: {}".format(journal),
                        exc_info=True)
                    self.counter_futures_images += 1
                    self.parent.counter_articles_failed += 1
                    return

                # Rejecting article if no author
                if authors == "Empty":
                    self.counter_futures_images += 1
                    self.parent.counter_rejected += 1
                    self.l.debug(
                        "Rejecting article {}, no author".format(title))
                    continue

                query.prepare("INSERT INTO papers (doi, title, date, \
                              journal, authors, abstract, \
                              graphical_abstract, url, new, topic_simple, \
                              author_simple) \
                              VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")

                # Set new to 1 and not to true
                params = (doi, title, date, journal_abb, authors, abstract,
                          graphical_abstract, url, 1, topic_simple,
                          author_simple)

                for value in params:
                    query.addBindValue(value)

                # Test that query worked
                if not query.exec_():
                    self.l.error(
                        "SQL ERROR in run(): {}, company_no_dl".format(
                            query.lastError().text()))
                    self.parent.counter_articles_failed += 1
                    continue
                else:
                    self.l.debug("{} added to the database".format(doi))
                    self.new_entries_worker += 1
                    self.parent.counter_added += 1

                # If article has no graphical abstract or if it has been
                # dled already
                if graphical_abstract == "Empty" or os.path.exists(
                        self.PATH +
                        functions.simpleChar(graphical_abstract)):
                    self.counter_futures_images += 1

                    # This block is executed when you delete the db, but
                    # not the images. Allows to update the
                    # graphical_abstract in db accordingly
                    if os.path.exists(
                            self.PATH +
                            functions.simpleChar(graphical_abstract)):
                        query.prepare("UPDATE papers SET \
                                      graphical_abstract=? WHERE doi=?")
                        params = (functions.simpleChar(graphical_abstract),
                                  doi)
                        for value in params:
                            query.addBindValue(value)
                        query.exec_()
                else:
                    headers = {
                        'User-agent':
                        'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                        'Connection': 'close',
                        'Referer': url
                    }

                    future_image = self.session_images.get(
                        graphical_abstract, headers=headers,
                        timeout=self.TIMEOUT)
                    future_image.add_done_callback(
                        functools.partial(self.pictureDownloaded, doi,
                                          url))
                    self.list_futures.append(future_image)

    # The company requires to download the article's web page
    else:
        headers = {
            'User-agent':
            'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
            'Connection': 'close'
        }

        self.session_pages = FuturesSession(
            max_workers=self.MAX_WORKERS,
            session=self.parent.browsing_session)

        for entry in feed.entries:

            # Get the DOI, a unique number for a publication
            try:
                doi = hosts.getDoi(company, journal, entry)
            except Exception:
                self.l.error("getDoi failed for: {}".format(journal),
                             exc_info=True)
                self.counter_futures_urls += 1
                self.counter_futures_images += 1
                continue

            # Try to refine the url
            try:
                url = hosts.refineUrl(company, journal, entry)
            except Exception:
                self.l.error("refineUrl failed for: {}".format(journal),
                             exc_info=True)
                self.counter_futures_urls += 1
                self.counter_futures_images += 1
                continue

            # Make sure the entry has a title
            try:
                title = entry.title
            except AttributeError:
                self.l.error("No title for {}".format(doi),
                             exc_info=True)
                self.counter_futures_urls += 1
                self.counter_futures_images += 1
                continue

            # Reject crappy entries: corrigendum, erratum, etc
            if hosts.reject(title):
                self.counter_futures_images += 1
                self.counter_futures_urls += 1
                self.parent.counter_rejected += 1
                self.l.debug("Rejecting {0}".format(doi))

                # Insert the crappy articles in a rescue database
                if self.parent.debug_mod and doi not in self.dico_doi:
                    query.prepare("INSERT INTO debug (doi, title, \
                                  journal, url) VALUES(?, ?, ?, ?)")
                    params = (doi, title, journal_abb, url)
                    for value in params:
                        query.addBindValue(value)
                    query.exec_()
                    self.l.debug(
                        "Inserting {0} in table debug".format(doi))
                continue

            # Article complete, skip it
            elif doi in self.dico_doi and self.dico_doi[doi]:
                self.counter_futures_images += 1
                self.counter_futures_urls += 1
                self.l.debug("Article complete, skipping {}".format(doi))
                continue

            # Article not complete, try to complete it
            elif doi in self.dico_doi and not self.dico_doi[doi]:
                # NOTE(review): url was already refined above — this second
                # refineUrl call looks redundant; confirm before removing
                url = hosts.refineUrl(company, journal, entry)

                dl_page, dl_image, data = hosts.updateData(
                    company, journal, entry, care_image)

                if dl_page:
                    self.parent.counter_updates += 1

                    # completeData is called when the page is downloaded
                    future = self.session_pages.get(url,
                                                    timeout=self.TIMEOUT,
                                                    headers=headers)
                    future.add_done_callback(
                        functools.partial(self.completeData, doi, company,
                                          journal, journal_abb, entry))
                    self.list_futures.append(future)

                    # Continue just to be sure. If dl_page is True,
                    # dl_image is likely True too
                    continue

                elif dl_image:
                    self.parent.counter_updates += 1
                    self.counter_futures_urls += 1

                    graphical_abstract = data['graphical_abstract']

                    if os.path.exists(
                            self.PATH +
                            functions.simpleChar(graphical_abstract)):
                        self.counter_futures_images += 1
                    else:
                        headers = {
                            'User-agent':
                            'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                            'Connection': 'close',
                            'Referer': url
                        }

                        future_image = self.session_images.get(
                            graphical_abstract,
                            headers=headers,
                            timeout=self.TIMEOUT)
                        future_image.add_done_callback(
                            functools.partial(self.pictureDownloaded, doi,
                                              url))
                        self.list_futures.append(future_image)
                else:
                    self.counter_futures_urls += 1
                    self.counter_futures_images += 1
                    continue

            # New article, treat it
            else:
                # NOTE(review): redundant refineUrl call, see above
                url = hosts.refineUrl(company, journal, entry)
                self.l.debug("Starting adding new entry")

                future = self.session_pages.get(url, timeout=self.TIMEOUT,
                                                headers=headers)
                future.add_done_callback(
                    functools.partial(self.completeData, doi, company,
                                      journal, journal_abb, entry))
                self.list_futures.append(future)

    # Check if the counters are full. Each entry must be accounted for
    # twice (url + image) before the worker may finish; self.sleep is
    # presumably QThread.sleep (seconds) — confirm it accepts 0.5
    while ((self.counter_futures_images + self.counter_futures_urls) !=
            len(feed.entries) * 2 and self.parent.parsing):
        self.sleep(0.5)

    if self.parent.parsing:
        if not self.bdd.commit():
            self.l.error(self.bdd.lastError().text())
            # NOTE(review): this debug line logs inside the failed-commit
            # branch — it may have been intended for the success path;
            # confirm against version history
            self.l.debug("db insertions/modifications: {}".format(
                self.new_entries_worker))
            self.l.error(
                "Problem when comitting data for {}".format(journal))

    # Free the memory, and clean the remaining futures
    try:
        self.session_pages.executor.shutdown()
    except AttributeError:
        # session_pages only exists for companies requiring page downloads
        self.l.error("No session_pages to shut down")
    self.session_images.executor.shutdown()

    self.l.debug("Exiting thread for {}".format(journal))