def test_getJournals():
    """Tests hosts.getJournals, which returns the information about all the
    journals of a company: the names, the abbreviations, the URLs, and a
    boolean to enable the download of the graphical abstracts"""

    l.info("Function getJournals")

    start_time = datetime.datetime.now()

    # Create a dictionary w/ all the data concerning the journals
    # implemented in the program: names, abbreviations, urls
    dict_journals = {}
    for company in hosts.getCompanies():
        dict_journals[company] = hosts.getJournals(company)

    l.debug(dict_journals)

    for company, data in dict_journals.items():

        # data is a tuple: (list_journals_publisher,
        #                   list_abb_journals_publisher,
        #                   list_urls_publisher,
        #                   list_bool
        #                   )
        logAssert(type(data) == tuple,
                  "data is not a tuple {}".format(data))

        # Check that all the fields in the tuple are
        # non empty lists
        for list_info_journals in data[:-1]:
            logAssert(
                type(list_info_journals) == list and list_info_journals,
                "list_info_journals is missing or is not a list {}".format(
                    list_info_journals))

            # Check that all the elements in the list are
            # non empty strings
            for element in list_info_journals:
                logAssert(
                    type(element) == str and element,
                    "element is not a string or is an empty string {}".format(
                        element))

        # Check the urls of the RSS pages
        for element in data[2]:
            logAssert(
                validators.url(element),
                "One of the publisher's URLs is not a URL {}".format(element))

        # Check the list of booleans
        for element in data[3]:
            logAssert(
                type(element) == bool,
                "One boolean is not really a boolean {}".format(element))

    l.debug("Time spent in test_getJournals: {}".format(
        datetime.datetime.now() - start_time))

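# These tests rely on the imports and on a few module-level names that never
# appear in this section: the logger 'l', the 'logAssert' helper, the
# per-journal sample size LENGTH_SAMPLE, and the request headers HEADERS.
# A minimal sketch of what they could look like, inferred from how they are
# used here; the exact values, the logger configuration and the body of
# logAssert are assumptions, not the author's actual code:

import datetime
import logging
import os
import random
from pprint import pprint

import arrow
import feedparser
import pytest
import requests
import validators

import hosts

logging.basicConfig(level=logging.DEBUG)
l = logging.getLogger(__name__)

# Number of feed entries tested per journal (the value is an assumption)
LENGTH_SAMPLE = 5

# Headers sent with every HTTP request (the user agent is an assumption)
HEADERS = {'User-Agent': 'Mozilla/5.0'}


def logAssert(test, msg):
    """Log the message and fail the test if the condition does not hold,
    so the failure reason ends up in the log as well as in pytest's
    output (sketch of the helper called by the tests above; the real
    implementation may differ)"""
    if not test:
        l.error(msg)
        pytest.fail(msg)
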
def test_getDoi(journalsUrls):
    """Tests if the function getDoi gets the DOI correctly"""

    l.info("Function getDoi")

    start_time = datetime.datetime.now()

    list_sites = journalsUrls

    # Build a dict with key: company
    #                   value: journal name
    dict_journals = {}
    for company in hosts.getCompanies():
        dict_journals[company] = hosts.getJournals(company)[0]

    for site in list_sites:
        try:
            feed = feedparser.parse(site, timeout=20)
            l.debug("RSS page successfully dled")
        except Exception as e:
            l.error("RSS page could not be downloaded: {}".format(e),
                    exc_info=True)
            continue

        try:
            journal = feed['feed']['title']
        except KeyError:
            l.error("Failed to get title for: {}".format(site))
            pytest.fail("Failed to get title for: {}".format(site))

        # Get the company name
        for publisher, data in dict_journals.items():
            if journal in data:
                company = publisher

        l.info("{}: {}".format(site, len(feed.entries)))

        if len(feed.entries) < LENGTH_SAMPLE:
            samples = feed.entries
        else:
            samples = random.sample(feed.entries, LENGTH_SAMPLE)

        # Tests LENGTH_SAMPLE entries for a journal, not all of them
        for entry in samples:
            doi = hosts.getDoi(company, journal, entry)
            l.info(doi)

            logAssert(
                type(doi) == str and doi.startswith('10.1'),
                "DOI is not a string or is not a DOI {}".format(doi))

    l.debug("Time spent in test_getDoi: {}".format(
        datetime.datetime.now() - start_time))

@pytest.fixture
def journalsUrls():
    """Returns a combined list of urls. All the journals of all the
    companies. Specific to the tests, fixture"""

    urls = []
    for company in hosts.getCompanies():
        urls += hosts.getJournals(company)[2]

    return urls

@pytest.fixture
def journalsUrls():
    """Returns a combined list of urls. All the journals of all the
    companies. Specific to the tests, fixture. Older variant: derives the
    companies from the .ini files in the journals/ directory"""

    urls = []
    for company in os.listdir("journals"):
        company = company.split(".")[0]
        urls += hosts.getJournals(company)[2]

    return urls

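# For reference, hosts.getJournals returns the 4-tuple asserted in
# test_getJournals: (names, abbreviations, RSS urls, booleans enabling
# the graphical abstracts). A hypothetical example of what a call could
# return; the journal name, abbreviation and url below are illustrative
# only, not taken from the real catalog:
#
# >>> hosts.getJournals("acs")
# (['Journal of the American Chemical Society'],
#  ['J. Am. Chem. Soc.'],
#  ['http://feeds.feedburner.com/acs/jacsat'],
#  [True])
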
def initUI(self):
    """Handle the display"""

    self.setWindowTitle('Deleting journals')
    self.vbox_global = QtWidgets.QVBoxLayout()

    # Display a label to reassure the user: confirmation will be
    # asked before anything is actually deleted
    mes = "Confirmation will be asked before\nanything permanent is done: no worries"
    self.label_help = QtWidgets.QLabel(mes)
    self.vbox_global.addWidget(self.label_help)

    # Scroll area for the journals to check
    self.scroll_check_journals = QtWidgets.QScrollArea()
    self.scrolling_check_journals = QtWidgets.QWidget()
    self.vbox_check_journals = QtWidgets.QVBoxLayout()
    self.scrolling_check_journals.setLayout(self.vbox_check_journals)

    # List collecting the journal checkboxes, filled below
    self.check_journals = []

    labels_checkboxes = []

    # Get labels of the future check boxes of the journals to be parsed
    # Only journals on user's side
    for company in hosts.getCompanies(user=True):
        labels_checkboxes += hosts.getJournals(company, user=True)[1]

    labels_checkboxes.sort()

    self.box_select_all = QtWidgets.QCheckBox("Select all")
    self.box_select_all.setCheckState(0)
    self.vbox_check_journals.addWidget(self.box_select_all)

    # Build the checkboxes, and put them in a layout
    for label in labels_checkboxes:
        check_box = QtWidgets.QCheckBox(label)
        check_box.setCheckState(0)
        self.check_journals.append(check_box)
        self.vbox_check_journals.addWidget(check_box)

    self.scroll_check_journals.setWidget(self.scrolling_check_journals)
    self.vbox_global.addWidget(self.scroll_check_journals)

    # Validate. Triggers verification process
    self.button_del = QtWidgets.QPushButton("Delete journal(s)")
    self.vbox_global.addWidget(self.button_del)

    self.setLayout(self.vbox_global)
    self.show()

def test_getJournals():
    """Tests hosts.getJournals, which returns the information about all the
    journals of a company: the names, the abbreviations, the URLs, and a
    boolean to enable the download of the graphical abstracts"""

    print("\n")
    print("Starting test getJournals")

    # Create a dictionary w/ all the data concerning the journals
    # implemented in the program: names, abbreviations, urls
    dict_journals = {}
    for company in os.listdir("journals"):
        company = company.split(".")[0]
        dict_journals[company] = hosts.getJournals(company)

    print(dict_journals)

    for company, data in dict_journals.items():

        # data is a tuple: (list_journals_publisher,
        #                   list_abb_journals_publisher,
        #                   list_urls_publisher,
        #                   list_bool
        #                   )
        assert type(data) == tuple

        # Check that all the fields in the tuple are
        # non empty lists
        for list_info_journals in data[:-1]:
            assert type(list_info_journals) == list and list_info_journals

            # Check that all the elements in the list are
            # non empty strings
            for element in list_info_journals:
                assert type(element) == str and element

        # Check the urls of the RSS pages
        for element in data[2]:
            assert validators.url(element)

        # Check the list of booleans
        for element in data[3]:
            assert type(element) == bool

def displayJournals(self):
    """Display the checkboxes of the journals"""

    self.clearLayout(self.vbox_check_journals)

    self.check_journals = []

    self.button_manage_journals = QtWidgets.QPushButton("Manage journals")
    self.vbox_check_journals.addWidget(self.button_manage_journals)
    self.button_manage_journals.clicked.connect(self.dialogManageJournals)

    labels_checkboxes = []

    # Get labels of the future check boxes of the journals to be parsed
    for company in hosts.getCompanies():
        labels_checkboxes += hosts.getJournals(company)[1]

    labels_checkboxes.sort()

    self.box_select_all = QtWidgets.QCheckBox("(Un)Select all")
    self.box_select_all.setCheckState(0)
    self.vbox_check_journals.addWidget(self.box_select_all)

    # Checkbox to select/unselect all the journals
    self.box_select_all.stateChanged.connect(self.selectUnselectAll)

    # Build the checkboxes, and put them in a layout
    for label in labels_checkboxes:
        check_box = QtWidgets.QCheckBox(label)
        check_box.setCheckState(2)
        self.check_journals.append(check_box)
        self.vbox_check_journals.addWidget(check_box)

    # Restore check boxes states
    journals_to_parse = self.options.value("journals_to_parse", [])
    if not journals_to_parse:
        return

    for box in self.check_journals:
        if box.text() in journals_to_parse:
            box.setCheckState(2)
        else:
            box.setCheckState(0)

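# displayJournals calls self.clearLayout to empty the checkbox layout before
# rebuilding it. A minimal sketch of such a helper method, assuming the
# usual Qt pattern of popping and deleting each widget; this is not
# necessarily the author's actual implementation:

def clearLayout(self, layout):
    """Remove and delete all the widgets of a layout (hypothetical helper)"""
    while layout.count():
        item = layout.takeAt(0)
        widget = item.widget()
        if widget is not None:
            widget.deleteLater()
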
def test_getDoi(journalsUrls):
    """Tests if the function getDoi gets the DOI correctly"""

    print("\n")
    print("Starting test getDoi")

    list_sites = journalsUrls

    # Build a dict with key: company
    #                   value: journal name
    dict_journals = {}
    for company in os.listdir("journals"):
        company = company.split(".")[0]
        dict_journals[company] = hosts.getJournals(company)[0]

    for site in list_sites:
        feed = feedparser.parse(site)
        journal = feed["feed"]["title"]

        # Get the company name
        for publisher, data in dict_journals.items():
            if journal in data:
                company = publisher

        print("{}: {}".format(site, len(feed.entries)))

        if len(feed.entries) < LENGTH_SAMPLE:
            samples = feed.entries
        else:
            samples = random.sample(feed.entries, LENGTH_SAMPLE)

        # Tests LENGTH_SAMPLE entries for a journal, not all of them
        for entry in samples:
            doi = hosts.getDoi(company, journal, entry)
            print(doi)
            assert type(doi) == str

def saveJournal(self, title, abb, url, company):
    """Will save the new journal, in file company.ini located in
    the user directory"""

    mes = "Journal already in the catalog"

    # Check if the RSS page's URL is not present in any company file.
    # NB: don't reuse the 'company' parameter as the loop variable,
    # it is needed below to build the file name
    for publisher in hosts.getCompanies():
        data_company = hosts.getJournals(publisher)

        # If URL already present, display error dialog box
        if url in data_company[2]:
            QtWidgets.QMessageBox.critical(self, "Error", mes,
                                           QtWidgets.QMessageBox.Ok)
            self.l.debug("URL {} already in catalog".format(url))
            return

    try:
        # If still here, write the new journal: one "title : abb : url"
        # line per journal, hence the trailing newline
        with open(os.path.join(self.DATA_PATH,
                               "journals/{}.ini".format(company)),
                  'a', encoding='utf-8') as out:
            out.write("{} : {} : {}\n".format(title, abb, url))
        self.l.debug("New journal written user side")
        self.l.debug("{} : {} : {}".format(title, abb, url))
        self.l.info("{} added to the catalog".format(title))
    except Exception as e:
        self.l.error("saveJournal, error writing journal: {}".format(e),
                     exc_info=True)
        return

    # Refresh parent check boxes and close
    if self.parent is not None:
        self.parent.displayJournals()
        self.parent.saveSettings()
    self.close()

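# saveJournal appends one "title : abb : url" line per journal to the
# user-side catalog file. A minimal sketch of how such a file can be read
# back into the name/abbreviation/url lists that hosts.getJournals returns;
# this is an illustration of the file format only, not the author's actual
# parsing code, and the helper name is hypothetical:

def readJournalsFile(path):
    """Parse a journals/*.ini catalog file: one 'title : abb : url'
    line per journal (hypothetical helper, for illustration)"""
    names, abbs, urls = [], [], []
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            title, abb, url = [part.strip() for part in line.split(" : ")]
            names.append(title)
            abbs.append(abb)
            urls.append(url)
    return names, abbs, urls
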
def test_getData(journalsUrls):
    """Tests the function getData. For each journal of each company,
    tests LENGTH_SAMPLE entries"""

    print("\n")
    print("Starting test getData")

    # Returns a list of the urls of the feed pages
    list_urls_feed = journalsUrls

    # Bypass all companies but one (overrides the fixture;
    # comment out to test everything)
    list_urls_feed = hosts.getJournals("acs")[2]

    # Build a dict with key: company
    #                   value: journal name
    dict_journals = {}
    for company in os.listdir("journals"):
        company = company.split(".")[0]
        dict_journals[company] = hosts.getJournals(company)[0]

    # All the journals are tested
    for site in list_urls_feed:
        print("Site {} of {}".format(
            list_urls_feed.index(site) + 1, len(list_urls_feed)))

        feed = feedparser.parse(site)
        journal = feed["feed"]["title"]

        # Get the company name
        for publisher, data in dict_journals.items():
            if journal in data:
                company = publisher

        print("\n")
        print(journal)

        if len(feed.entries) < LENGTH_SAMPLE:
            samples = feed.entries
        else:
            samples = random.sample(feed.entries, LENGTH_SAMPLE)

        # Tests LENGTH_SAMPLE entries for a journal, not all of them
        for entry in samples:
            if company in ["science", "elsevier", "beilstein"]:
                title, date, authors, abstract, graphical_abstract, url, topic_simple, author_simple = hosts.getData(
                    company, journal, entry)
            else:
                if company == "acs":
                    url = getattr(entry, "feedburner_origlink",
                                  entry.link).split("/")[-1]
                    url = "http://pubs.acs.org/doi/abs/10.1021/" + url
                else:
                    url = getattr(entry, "feedburner_origlink", entry.link)

                try:
                    response = requests.get(url, timeout=10)
                    title, date, authors, abstract, graphical_abstract, url, topic_simple, author_simple = hosts.getData(
                        company, journal, entry, response)
                except requests.exceptions.ReadTimeout:
                    # Skip the entry, otherwise the variables used
                    # below would be undefined
                    print("A ReadTimeout occured, continue to next entry")
                    continue

            print(title)
            print(url)
            print(graphical_abstract)
            print(date)
            print("\n")

            assert type(abstract) == str and abstract

            assert type(url) == str and url
            if url != "Empty":
                # Test if url is valid
                assert validators.url(url) is True

            assert type(graphical_abstract) == str and graphical_abstract
            if graphical_abstract != "Empty":
                assert validators.url(graphical_abstract) is True

            assert type(arrow.get(date)) == arrow.arrow.Arrow

            assert topic_simple.startswith(" ") is True
            assert topic_simple.endswith(" ") is True

            if author_simple is not None:
                assert author_simple.startswith(" ") is True
                assert author_simple.endswith(" ") is True

def test_getData(journalsUrls):
    """Tests the function getData. For each journal of each company,
    tests LENGTH_SAMPLE entries"""

    l.info("Starting test getData")

    start_time = datetime.datetime.now()

    # Returns a list of the urls of the feed pages
    list_urls_feed = journalsUrls

    # TODO: comment or uncomment
    # Bypass all companies but one
    # list_urls_feed = hosts.getJournals("ChemRxiv")[2]

    # Build a dict with key: company
    #                   value: journal name
    dict_journals = {}

    # Build a dictionary to store the results of the tests, by company
    dict_res_by_company = {}

    for company in hosts.getCompanies():
        dict_journals[company] = hosts.getJournals(company)[0]

        res = {
            'count_abs_empty': 0,
            'count_image_empty': 0,
            'count_articles_tested': 0,
            'count_articles_untested': 0,
            'count_journals_untested': 0,
            'count_redirections': 0,
        }

        dict_res_by_company[company] = res

    s = requests.session()

    # All the journals are tested
    for site in list_urls_feed:
        l.info("Site {} of {} \n".format(
            list_urls_feed.index(site) + 1, len(list_urls_feed)))

        # Get the RSS page of the url provided
        try:
            feed = feedparser.parse(site, timeout=20)
            journal = feed['feed']['title']
            l.debug("RSS page successfully dled")
        except Exception as e:
            # NB: the journal's company is only known once the feed's title
            # could be read, so here 'company' still holds the value from
            # the previous iteration and the untested journal is attributed
            # to that company
            dict_res_by_company[company]['count_journals_untested'] += 1
            l.error("RSS page could not be downloaded: {}".format(e),
                    exc_info=True)
            continue

        # Get the company name
        for publisher, data in dict_journals.items():
            if journal in data:
                company = publisher

        l.info(journal)

        if len(feed.entries) < LENGTH_SAMPLE:
            samples = feed.entries
        else:
            samples = random.sample(feed.entries, LENGTH_SAMPLE)

        # Tests LENGTH_SAMPLE entries for a journal, not all of them
        for entry in samples:
            if company in ['Science', 'Elsevier', 'Beilstein', 'PLOS']:
                title, date, authors, abstract, graphical_abstract, url, topic_simple, author_simple = hosts.getData(
                    company, journal, entry)
            else:
                url = hosts.refineUrl(company, journal, entry)
                try:
                    response = s.get(url, timeout=10, headers=HEADERS)
                    title, date, authors, abstract, graphical_abstract, url, topic_simple, author_simple = hosts.getData(
                        company, journal, entry, response)
                except Exception as e:
                    dict_res_by_company[company][
                        'count_articles_untested'] += 1
                    l.error(
                        "A problem occurred: {}, continue to next entry".
                        format(e), exc_info=True)
                    continue

            dict_res_by_company[company]['count_articles_tested'] += 1

            l.info("Title: {}".format(title))
            l.info("URL: {}".format(url))
            l.info("Image: {}".format(graphical_abstract))
            l.info("Date: {}".format(date))

            # Count and try to detect suspiciously high numbers of
            # empty results
            if abstract == "Empty":
                dict_res_by_company[company]['count_abs_empty'] += 1
            if graphical_abstract == "Empty":
                dict_res_by_company[company]['count_image_empty'] += 1

            try:
                if response.history:
                    dict_res_by_company[company]['count_redirections'] += 1
                    l.debug("Request was redirected")
                    for resp in response.history:
                        l.debug("Status code, URL: {}, {}".format(
                            resp.status_code, resp.url))
                    l.debug("Final destination:")
                    l.debug("Status code, URL: {}, {} \n".format(
                        response.status_code, response.url))
                else:
                    l.debug("Request was not redirected \n")
            except UnboundLocalError:
                pass

            # ---------------------- ASSERT SECTION -----------------------

            logAssert(
                type(abstract) == str and abstract,
                "Abstract missing or not a string {}".format(abstract))

            logAssert(
                type(url) == str and url,
                "URL is missing or is not a string {}".format(url))

            # Test if url is valid
            if url != 'Empty':
                logAssert(
                    validators.url(url) is True,
                    "URL is a string but is not a URL {}".format(url))

                # For ACS and Nature, check if the URL
                # is the abstract page's URL
                if company in ['ACS', 'Nature']:
                    logAssert(
                        'abs' in url,
                        "company is {}, but URL doesn't contain 'abs' {}".
                        format(company, url))

            logAssert(
                type(graphical_abstract) == str and graphical_abstract,
                "graphical_abstract is missing or not a string {}".format(
                    graphical_abstract))

            if graphical_abstract != 'Empty':
                logAssert(
                    validators.url(graphical_abstract) is True,
                    "graphical_abstract is a string but is not a URL {}".
                    format(graphical_abstract))

            logAssert(
                type(arrow.get(date)) == arrow.arrow.Arrow,
                "The date is not really a date {}".format(date))

            logAssert(
                topic_simple.startswith(' ') is True,
                "Topic doesn't start with space {}".format(topic_simple))
            logAssert(
                topic_simple.endswith(' ') is True,
                "Topic doesn't end with space {}".format(topic_simple))

            if author_simple is not None:
                logAssert(
                    author_simple.startswith(' ') is True,
                    "author_simple doesn't start with space {}".format(
                        author_simple))
                logAssert(
                    author_simple.endswith(' ') is True,
                    "author_simple doesn't end with space {}".format(
                        author_simple))

    pprint(dict_res_by_company)

    # Count results
    count_abs_empty = 0
    count_image_empty = 0
    count_articles_tested = 0
    count_articles_untested = 0
    count_journals_untested = 0
    count_redirections = 0

    for company in dict_res_by_company:
        count_abs_empty += dict_res_by_company[company]['count_abs_empty']
        count_image_empty += dict_res_by_company[company]['count_image_empty']
        count_articles_tested += dict_res_by_company[company][
            'count_articles_tested']
        count_articles_untested += dict_res_by_company[company][
            'count_articles_untested']
        count_journals_untested += dict_res_by_company[company][
            'count_journals_untested']
        count_redirections += dict_res_by_company[company][
            'count_redirections']

    l.debug("Number of untested journals: {} / {}".format(
        count_journals_untested, len(list_urls_feed)))
    l.debug("Number of tested/untested articles: {} / {}".format(
        count_articles_tested, count_articles_untested))
    l.debug("Number of Empty abstracts: {}".format(count_abs_empty))
    l.debug(
        "Number of Empty graphical_abstracts: {}".format(count_image_empty))
    l.debug("Number of redirections: {}".format(count_redirections))

    l.debug("Time spent in test_getData: {}".format(
        datetime.datetime.now() - start_time))