Ejemplo n.º 1
0
def test_getJournals():
    """Check the data returned by hosts.getJournals for every company.

    For each company listed by hosts.getCompanies(), the data must be a
    tuple of four lists: journal names, abbreviations, RSS URLs and
    booleans. All the fields but the last must be non-empty lists of
    non-empty strings, every RSS URL must be a valid URL, and every
    element of the last field must be a boolean.
    """

    l.info("Function getJournals")
    start_time = datetime.datetime.now()

    # Create a dictionary w/ all the data concerning the journals
    # implemented in the program: names, abbreviations, urls
    dict_journals = {}

    for company in hosts.getCompanies():
        dict_journals[company] = hosts.getJournals(company)

    l.debug(dict_journals)

    for company, data in dict_journals.items():

        # data is a tuple: (list_journals_publisher,
        # list_abb_journals_publisher,
        # list_urls_publisher,
        # list_bool
        # )

        logAssert(type(data) == tuple, "data is not a tuple {}".format(data))

        # Check that all the fields in the tuple (except the booleans,
        # checked separately below) are non empty lists
        for list_info_journals in data[:-1]:
            logAssert(
                type(list_info_journals) == list and list_info_journals,
                "list_info_journals is missing or is not a list {}".format(
                    list_info_journals))

            # Check that all the elements in the list are
            # non empty strings
            for element in list_info_journals:
                logAssert(
                    type(element) == str and element,
                    "element is not a string or is an empty string {}".format(
                        element))

        # Check the urls of the RSS pages. The message carries the
        # offending URL so a failure can be traced back to the journal
        for element in data[2]:
            logAssert(
                validators.url(element),
                "One of the publisher's URL is not a URL {}".format(element))

        # Check the list of booleans
        for element in data[3]:
            logAssert(
                type(element) == bool,
                "One boolean is not really a boolean {}".format(element))

    l.debug(
        "Time spent in test_getJournals: {}".format(datetime.datetime.now() -
                                                    start_time))
Ejemplo n.º 2
0
def test_getDoi(journalsUrls):
    """Check that hosts.getDoi returns a DOI for sampled RSS entries.

    Args:
        journalsUrls: list of the URLs of the RSS pages to test.

    For each RSS page that can be downloaded and has a title, at most
    LENGTH_SAMPLE entries are picked and the value returned by
    hosts.getDoi for each of them must be a string starting with
    '10.1'. Pages that cannot be downloaded, or that have no title, are
    logged and skipped.
    """

    l.info("Function getDoi")

    start_time = datetime.datetime.now()

    list_sites = journalsUrls

    # Build a dic with key: company
    # value: journal name
    dict_journals = {}
    for company in hosts.getCompanies():
        dict_journals[company] = hosts.getJournals(company)[0]

    for site in list_sites:

        # A feed without a title raises a KeyError here, which is caught
        # like any download problem: the site is logged and skipped
        try:
            feed = feedparser.parse(site, timeout=20)
            journal = feed['feed']['title']
            l.debug("RSS page successfully dled")
        except Exception as e:
            l.error("RSS page could not be downloaded: {}".format(e),
                    exc_info=True)
            continue

        # Get the company name
        # NOTE(review): if the title matches no journal of the catalog,
        # company keeps the value it had before this loop - confirm
        # this is acceptable
        for publisher, data in dict_journals.items():
            if journal in data:
                company = publisher

        l.info("{}: {}".format(site, len(feed.entries)))

        if len(feed.entries) < LENGTH_SAMPLE:
            samples = feed.entries
        else:
            samples = random.sample(feed.entries, LENGTH_SAMPLE)

        # Tests LENGTH_SAMPLE entries for a journal, not all of them
        for entry in samples:

            doi = hosts.getDoi(company, journal, entry)
            l.info(doi)

            # Both conditions must hold: the type check comes first so
            # that startswith is only called on a string
            logAssert(
                type(doi) == str and doi.startswith('10.1'),
                "DOI is not a string or is not a DOI {}".format(doi))

    l.debug("Time spent in test_getDoi: {}".format(datetime.datetime.now() -
                                                   start_time))
Ejemplo n.º 3
0
def journalsUrls():
    """Build one flat list with the URLs of all the journals.

    Test helper (fixture): for every company returned by
    hosts.getCompanies(), the third field of hosts.getJournals(company)
    is appended, element by element, to the result.
    """

    return [
        url
        for publisher in hosts.getCompanies()
        for url in hosts.getJournals(publisher)[2]
    ]
Ejemplo n.º 4
0
def journalsUrls():

    """Build one flat list with the URLs of all the journals.

    Test helper (fixture): each entry of the "journals" directory
    designates a company; the third field of hosts.getJournals for that
    company is appended, element by element, to the result.
    """

    combined = []
    for filename in os.listdir("journals"):
        # The company name is what precedes the first dot of the file name
        publisher = filename.partition(".")[0]
        combined.extend(hosts.getJournals(publisher)[2])

    return combined
Ejemplo n.º 5
0
    def initUI(self):
        """Build the widgets of the 'Deleting journals' window and show it.

        The window stacks, from top to bottom: a help label, a scroll
        area with a "Select all" check box followed by one unchecked
        check box per user-side journal label, and the delete button.
        The journal check boxes are also appended to self.check_journals
        (expected to exist already - it is not created here).
        """

        self.setWindowTitle('Deleting journals')

        self.vbox_global = QtWidgets.QVBoxLayout()

        # Reassuring message displayed at the top of the window
        help_text = "Confirmation will be asked before\nanything permanent is done: no worries"
        self.label_help = QtWidgets.QLabel(help_text)
        self.vbox_global.addWidget(self.label_help)

        # Scrollable container holding the column of check boxes
        self.scroll_check_journals = QtWidgets.QScrollArea()
        self.scrolling_check_journals = QtWidgets.QWidget()
        self.vbox_check_journals = QtWidgets.QVBoxLayout()
        self.scrolling_check_journals.setLayout(self.vbox_check_journals)

        # Labels of the check boxes: second field of getJournals, for
        # the journals on the user's side only
        journal_labels = []
        for publisher in hosts.getCompanies(user=True):
            journal_labels.extend(hosts.getJournals(publisher, user=True)[1])
        journal_labels = sorted(journal_labels)

        # First box of the column, unchecked
        self.box_select_all = QtWidgets.QCheckBox("Select all")
        self.box_select_all.setCheckState(0)
        self.vbox_check_journals.addWidget(self.box_select_all)

        # One unchecked box per journal, kept in self.check_journals
        for text in journal_labels:
            box = QtWidgets.QCheckBox(text)
            box.setCheckState(0)
            self.check_journals.append(box)
            self.vbox_check_journals.addWidget(box)

        self.scroll_check_journals.setWidget(self.scrolling_check_journals)
        self.vbox_global.addWidget(self.scroll_check_journals)

        # Delete button, at the bottom (no slot is connected here)
        self.button_del = QtWidgets.QPushButton("Delete journal(s)")
        self.vbox_global.addWidget(self.button_del)

        self.setLayout(self.vbox_global)
        self.show()
Ejemplo n.º 6
0
def test_getJournals():

    """Check the data returned by hosts.getJournals for every company.

    The companies are taken from the file names of the "journals"
    directory. For each of them the data must be a tuple; all its
    fields but the last must be non-empty lists of non-empty strings,
    the elements of the third field must be valid URLs and the elements
    of the fourth field must be booleans.
    """

    print("\n")
    print("Starting test getJournals")

    # Gather everything known about the journals, company by company
    catalog = {}
    for filename in os.listdir("journals"):
        publisher = filename.partition(".")[0]
        catalog[publisher] = hosts.getJournals(publisher)

    print(catalog)

    for fields in catalog.values():

        # fields is expected to be:
        # (names, abbreviations, urls, booleans)
        assert type(fields) is tuple

        # Every field except the last: a non empty list...
        for field in fields[:-1]:
            assert type(field) is list and field

            # ... containing only non empty strings
            for item in field:
                assert type(item) is str and item

        # The RSS pages must have valid URLs
        for feed_url in fields[2]:
            assert validators.url(feed_url)

        # The last field holds real booleans
        for flag in fields[3]:
            assert type(flag) is bool
Ejemplo n.º 7
0
    def displayJournals(self):
        """Rebuild the column of journal check boxes.

        The layout is emptied, then filled with the "Manage journals"
        button, the "(Un)Select all" box and one checked box per journal
        label. If the option "journals_to_parse" holds a non-empty
        value, only the boxes whose text is in it stay checked.
        """

        self.clearLayout(self.vbox_check_journals)

        self.check_journals = []

        # Button opening the journals management dialog
        self.button_manage_journals = QtWidgets.QPushButton("Manage journals")
        self.vbox_check_journals.addWidget(self.button_manage_journals)
        self.button_manage_journals.clicked.connect(self.dialogManageJournals)

        # Labels of the check boxes: second field of getJournals,
        # for all the companies, in sorted order
        journal_labels = []
        for publisher in hosts.getCompanies():
            journal_labels.extend(hosts.getJournals(publisher)[1])
        journal_labels.sort()

        # Box to select/unselect all the journals at once; its state is
        # set before the signal is connected
        self.box_select_all = QtWidgets.QCheckBox("(Un)Select all")
        self.box_select_all.setCheckState(0)
        self.vbox_check_journals.addWidget(self.box_select_all)
        self.box_select_all.stateChanged.connect(self.selectUnselectAll)

        # One box per journal, checked by default
        for text in journal_labels:
            journal_box = QtWidgets.QCheckBox(text)
            journal_box.setCheckState(2)
            self.check_journals.append(journal_box)
            self.vbox_check_journals.addWidget(journal_box)

        # Restore the saved states, if any were saved
        saved_journals = self.options.value("journals_to_parse", [])
        if not saved_journals:
            return

        for journal_box in self.check_journals:
            journal_box.setCheckState(
                2 if journal_box.text() in saved_journals else 0)
Ejemplo n.º 8
0
def test_getDoi(journalsUrls):

    """Check that hosts.getDoi returns a string for sampled RSS entries.

    journalsUrls is the list of the URLs of the RSS pages to test. For
    each page, at most LENGTH_SAMPLE entries are picked and the value
    returned by hosts.getDoi for each of them must be a string.
    """

    print("\n")
    print("Starting test getDoi")

    # Map each company (file names of the "journals" directory) to the
    # names of its journals
    dict_journals = {}
    for company in os.listdir("journals"):
        company = company.partition(".")[0]
        dict_journals[company] = hosts.getJournals(company)[0]

    for site in journalsUrls:
        feed = feedparser.parse(site)
        journal = feed["feed"]["title"]

        # Find the company publishing this journal; company keeps its
        # previous value if no company lists the journal
        for name, titles in dict_journals.items():
            if journal in titles:
                company = name

        entries = feed.entries
        print("{}: {}".format(site, len(entries)))

        # Only a sample of the entries is tested, not all of them
        if len(entries) >= LENGTH_SAMPLE:
            samples = random.sample(entries, LENGTH_SAMPLE)
        else:
            samples = entries

        for entry in samples:

            doi = hosts.getDoi(company, journal, entry)
            print(doi)

            assert type(doi) is str
Ejemplo n.º 9
0
    def saveJournal(self, title, abb, url, company):
        """Save a new journal in the file company.ini located in
        the user directory.

        Args:
            title: name of the journal.
            abb: abbreviation of the journal.
            url: URL of the RSS page of the journal.
            company: name of the company; selects the .ini file the
                journal is appended to.

        If the URL is already present in the data of any company, an
        error dialog is displayed and nothing is written. If writing
        fails, the error is logged and the dialog stays open. Otherwise
        the parent's check boxes and settings are refreshed (when there
        is a parent) and the dialog is closed.
        """

        mes = "Journal already in the catalog"

        # Check if the RSS page's URL is not present in any company file.
        # The loop variable must not be named 'company': that would
        # overwrite the parameter and the journal would be written in
        # the file of the last company checked
        for known_company in hosts.getCompanies():
            data_company = hosts.getJournals(known_company)

            # If URL already present, display error dialog box
            if url in data_company[2]:
                QtWidgets.QMessageBox.critical(self, "Error", mes,
                                               QtWidgets.QMessageBox.Ok)
                self.l.debug("URL {} already in catalog".format(url))
                return

        try:
            # If still here, write the new journal
            # NOTE(review): the entry is appended without a newline -
            # confirm the file format does not require one
            with open(os.path.join(self.DATA_PATH,
                                   "journals/{}.ini".format(company)),
                      'a',
                      encoding='utf-8') as out:
                out.write("{} : {} : {}".format(title, abb, url))
            self.l.debug("New journal written user side")
            self.l.debug("{} : {} : {}".format(title, abb, url))
            self.l.info("{} added to the catalog".format(title))
        except Exception as e:
            self.l.error("saveJournal, error writing journal: {}".format(e),
                         exc_info=True)
            return

        # Refresh parent check boxes and close
        if self.parent is not None:
            self.parent.displayJournals()
            self.parent.saveSettings()

        self.close()
Ejemplo n.º 10
0
def test_getData(journalsUrls):

    """Tests the function getData. For each journal tested,
    checks LENGTH_SAMPLE entries at most.

    Args:
        journalsUrls: list of the URLs of the RSS pages of all the
            journals. It is currently overridden in the body: only the
            journals of the company "acs" are tested.

    For the companies "science", "elsevier" and "beilstein", getData is
    called with the RSS entry alone; for the others, the article page
    is downloaded first and passed to getData. An entry whose download
    times out is skipped.
    """

    print("\n")
    print("Starting test getData")

    # Returns a list of the urls of the feed pages
    list_urls_feed = journalsUrls

    # Bypass all companies but one
    list_urls_feed = hosts.getJournals("acs")[2]

    # Build a dic with key: company
    # value: journal name
    dict_journals = {}
    for company in os.listdir("journals"):
        company = company.split(".")[0]
        dict_journals[company] = hosts.getJournals(company)[0]

    # All the journals are tested
    for index, site in enumerate(list_urls_feed, start=1):

        print("Site {} of {}".format(index, len(list_urls_feed)))

        feed = feedparser.parse(site)
        journal = feed["feed"]["title"]

        # Get the company name
        for publisher, data in dict_journals.items():
            if journal in data:
                company = publisher

        print("\n")
        print(journal)

        if len(feed.entries) < LENGTH_SAMPLE:
            samples = feed.entries
        else:
            samples = random.sample(feed.entries, LENGTH_SAMPLE)

        # Tests LENGTH_SAMPLE entries for a journal, not all of them
        for entry in samples:

            if company in ["science", "elsevier", "beilstein"]:
                title, date, authors, abstract, graphical_abstract, url, topic_simple, author_simple = hosts.getData(
                    company, journal, entry
                )
            else:
                if company == "acs":
                    # Rebuild the URL of the abstract page from the last
                    # part of the entry's link
                    url = getattr(entry, "feedburner_origlink", entry.link).split("/")[-1]
                    url = "http://pubs.acs.org/doi/abs/10.1021/" + url
                else:
                    url = getattr(entry, "feedburner_origlink", entry.link)

                try:
                    response = requests.get(url, timeout=10)
                    title, date, authors, abstract, graphical_abstract, url, topic_simple, author_simple = hosts.getData(
                        company, journal, entry, response
                    )
                except requests.exceptions.ReadTimeout:
                    print("A ReadTimeout occured, continue to next entry")
                    # No data for this entry: skip the checks, which
                    # would otherwise run on the previous entry's values
                    continue

            print(title)
            print(url)
            print(graphical_abstract)
            print(date)
            print("\n")

            assert type(abstract) == str and abstract

            assert type(url) == str and url
            if url != "Empty":
                # Test if url is valid
                assert validators.url(url) is True

            assert type(graphical_abstract) == str and graphical_abstract
            if graphical_abstract != "Empty":
                assert validators.url(graphical_abstract) is True

            assert type(arrow.get(date)) == arrow.arrow.Arrow

            assert topic_simple.startswith(" ") is True
            assert topic_simple.endswith(" ") is True

            if author_simple is not None:
                assert author_simple.startswith(" ") is True
                assert author_simple.endswith(" ") is True
Ejemplo n.º 11
0
def test_getData(journalsUrls):
    """Tests the function getData. For each journal of each company,
    tests LENGTH_SAMPLE entries at most.

    Args:
        journalsUrls: list of the URLs of the RSS pages to test.

    For the companies 'Science', 'Elsevier', 'Beilstein' and 'PLOS',
    getData is called with the RSS entry alone; for the others, the
    article page is downloaded first and passed to getData. RSS pages
    and articles that cannot be downloaded are logged and skipped.
    Statistics (empty abstracts and images, tested/untested items,
    redirections) are collected by company, printed, and their totals
    are logged at the end.
    """

    l.info("Starting test getData")

    start_time = datetime.datetime.now()

    # Returns a list of the urls of the feed pages
    list_urls_feed = journalsUrls

    # TODO: comment or uncomment
    # Bypass all companies but one
    # list_urls_feed = hosts.getJournals("ChemRxiv")[2]

    # Build a dic with key: company
    # value: journal name
    dict_journals = {}

    # Build a dic with key: url of a RSS page
    # value: company
    # Used to know the company when the RSS page can't be downloaded
    dict_company_by_url = {}

    # Build a dictionary to store the results of the tests, by company
    dict_res_by_company = {}

    for company in hosts.getCompanies():
        data_company = hosts.getJournals(company)
        dict_journals[company] = data_company[0]

        for url_feed in data_company[2]:
            dict_company_by_url[url_feed] = company

        res = {
            'count_abs_empty': 0,
            'count_image_empty': 0,
            'count_articles_tested': 0,
            'count_articles_untested': 0,
            'count_journals_untested': 0,
            'count_redirections': 0,
        }

        dict_res_by_company[company] = res

    s = requests.session()

    # All the journals are tested
    for index, site in enumerate(list_urls_feed, start=1):

        l.info("Site {} of {} \n".format(index, len(list_urls_feed)))

        # Get the RSS page of the url provided
        try:
            feed = feedparser.parse(site, timeout=20)
            journal = feed['feed']['title']
            l.debug("RSS page successfully dled")
        except Exception as e:
            # The journal's title is unknown here, so the company is
            # found through the URL. Falls back on the current value of
            # company if the URL is not in the catalog
            failed_company = dict_company_by_url.get(site, company)
            dict_res_by_company[failed_company][
                'count_journals_untested'] += 1
            l.error("RSS page could not be downloaded: {}".format(e),
                    exc_info=True)
            continue

        # Get the company name
        for publisher, data in dict_journals.items():
            if journal in data:
                company = publisher

        l.info(journal)

        if len(feed.entries) < LENGTH_SAMPLE:
            samples = feed.entries
        else:
            samples = random.sample(feed.entries, LENGTH_SAMPLE)

        # Tests LENGTH_SAMPLE entries for a journal, not all of them
        for entry in samples:

            # Reset for each entry: no request is made for some
            # companies, and the response of a previous entry must not
            # be used to count the redirections
            response = None

            if company in ['Science', 'Elsevier', 'Beilstein', 'PLOS']:
                title, date, authors, abstract, graphical_abstract, url, topic_simple, author_simple = hosts.getData(
                    company, journal, entry)
            else:
                url = hosts.refineUrl(company, journal, entry)

                try:
                    response = s.get(url, timeout=10, headers=HEADERS)
                    title, date, authors, abstract, graphical_abstract, url, topic_simple, author_simple = hosts.getData(
                        company, journal, entry, response)
                except Exception as e:
                    dict_res_by_company[company][
                        'count_articles_untested'] += 1
                    l.error(
                        "A problem occured: {}, continue to next entry".format(
                            e),
                        exc_info=True)
                    continue

            dict_res_by_company[company]['count_articles_tested'] += 1

            l.info("Title: {}".format(title))
            l.info("URL: {}".format(url))
            l.info("Image: {}".format(graphical_abstract))
            l.info("Date: {}".format(date))

            # Count and try do detect suspiciously high numbers of
            # empty results
            if abstract == "Empty":
                dict_res_by_company[company]['count_abs_empty'] += 1
            if graphical_abstract == "Empty":
                dict_res_by_company[company]['count_image_empty'] += 1

            # Redirections can only be checked if a request was made
            # for this entry
            if response is not None:
                if response.history:
                    dict_res_by_company[company]['count_redirections'] += 1
                    l.debug("Request was redirected")
                    for resp in response.history:
                        l.debug("Status code, URL: {}, {}".format(
                            resp.status_code, resp.url))
                    # The final destination is the response itself, not
                    # the last element of the history
                    l.debug("Final destination:")
                    l.debug("Status code, URL: {}, {} \n".format(
                        response.status_code, response.url))
                else:
                    l.debug("Request was not redirected \n")

            # ------------------------ ASSERT SECTION -------------------------

            logAssert(
                type(abstract) == str and abstract,
                "Abstract missing or not a string {}".format(abstract))

            logAssert(
                type(url) == str and url,
                "URL is missing or is not a string {}".format(url))

            # Test if url is valid
            if url != 'Empty':
                logAssert(
                    validators.url(url) is True,
                    "URL is a string but is not a URL {}".format(url))

            # For ACS and Nature, check if the URL is the abstract page's URL
            if company in ['ACS', 'Nature']:
                logAssert(
                    'abs' in url,
                    "company is {}, but URL doesn't contain 'abs' {}".format(
                        company, url))

            logAssert(
                type(graphical_abstract) == str and graphical_abstract,
                "graphical_abstract is missing or not a string {}".format(
                    graphical_abstract))

            if graphical_abstract != 'Empty':
                logAssert(
                    validators.url(graphical_abstract) is True,
                    "graphical_abstract is a string but is not a URL {}".
                    format(graphical_abstract))

            logAssert(
                type(arrow.get(date)) == arrow.arrow.Arrow,
                "The date is not really a date {}".format(date))

            logAssert(
                topic_simple.startswith(' ') is True,
                "Topic doesn't start with space {}".format(topic_simple))

            logAssert(
                topic_simple.endswith(' ') is True,
                "Topic doesn't end with space {}".format(topic_simple))

            if author_simple is not None:
                logAssert(
                    author_simple.startswith(' ') is True,
                    "author_simple doesn't start with space {}".format(
                        author_simple))
                logAssert(
                    author_simple.endswith(' ') is True,
                    "author_simple doesn't end with space {}".format(
                        author_simple))

    pprint(dict_res_by_company)

    # Count results: sum the counters of all the companies
    count_abs_empty = 0
    count_image_empty = 0
    count_articles_tested = 0
    count_articles_untested = 0
    count_journals_untested = 0
    count_redirections = 0

    for company in dict_res_by_company:
        count_abs_empty += dict_res_by_company[company]['count_abs_empty']
        count_image_empty += dict_res_by_company[company]['count_image_empty']
        count_articles_tested += dict_res_by_company[company][
            'count_articles_tested']
        count_articles_untested += dict_res_by_company[company][
            'count_articles_untested']
        count_journals_untested += dict_res_by_company[company][
            'count_journals_untested']
        count_redirections += dict_res_by_company[company][
            'count_redirections']

    l.debug("Number of untested jounals: {} / {}".format(
        count_journals_untested, len(list_urls_feed)))

    l.debug("Number of test/untested articles: {} / {}".format(
        count_articles_tested, count_articles_untested))

    l.debug("Number of Empty abstracts: {}".format(count_abs_empty))

    l.debug(
        "Number of Empty graphical_abstracts: {}".format(count_image_empty))

    l.debug("Number of redirections: {}".format(count_redirections))

    l.debug("Time spent in test_getData: {}".format(datetime.datetime.now() -
                                                    start_time))