def test_detect(self, name, publisher, start_url):
    """Test that publishers are correctly detected.

    :param name: Test name; used by nose_parameterized
    :param publisher: Correct publisher name
    :param start_url: Starting URL
    """
    self.browser.open(start_url)
    # Only the raw HTML is needed for detection; the parsed document
    # returned alongside it was previously bound to an unused local.
    init_html, _ = self.browser.get_docs()
    assert_equal(publisher, pubdet.pubdet(init_html))
def scrape(self, doi=None, pmid=None, fetch_pmid=True, fetch_types=None):
    """Download documents for a target article.

    :param doi: Article DOI
    :param pmid: Article PubMed ID
    :param fetch_pmid: Look up PMID if not provided
    :param fetch_types: Optional collection of document types to fetch;
        when provided, types in getter_map not listed here are skipped
    :return: ScrapeInfo instance
    :raises ScrapeError: If no publisher link can be found
    """
    logger.info('Fetching article with DOI={0}, PMID={1}'.format(
        doi, pmid,
    ))

    # Initialize ScrapeInfo object to store results
    self.info = ScrapeInfo(doi, pmid)

    # Get publisher link
    pub_link = None
    if doi:
        try:
            pub_link = self._resolve_doi(doi)
        except BadDOIError:
            logger.info('Could not resolve DOI {}'.format(doi))
        if not pmid and fetch_pmid:
            logger.info('Looking up PMID by DOI')
            # NOTE(review): the fetched PMID is stored on self.info but the
            # local `pmid` is not updated, so it is never used for link
            # resolution below — confirm this is intentional.
            self.info.pmid = pmid_doi.pmid_doi({'doi': doi})['pmid']
    if pmid and not pub_link:
        pub_link = self._resolve_pmid(pmid)

    # Quit if no publisher link found
    if not pub_link:
        raise ScrapeError('No publisher link found')

    # Log publisher link to ScrapeInfo
    self.info.pub_link = pub_link

    # Detect publisher
    self.info.publisher = pubdet.pubdet(self.info.init_html)

    # Get documents
    for doc_type in getter_map:

        # Skip documents not to be included
        if fetch_types and doc_type not in fetch_types:
            continue

        # Identify getter
        getter_class = getter_map[doc_type][self.info.publisher]

        # Skip if getter is set to false-y
        if not getter_class:
            continue

        # Construct getter
        getter = getter_class()

        # Browse to publisher link
        if self.browser.geturl() != pub_link:
            try:
                self.browser.reopen(pub_link)
            except URLError:
                # Bug fix: previously fell through after a timeout and
                # attempted the fetch against whatever page the browser
                # was on, overwriting the 'Timeout' status. Skip this
                # document type instead.
                self.info.status[doc_type] = 'Timeout'
                continue

        # Get document
        try:
            getter.reget(self.info, self.browser)
            self.info.docs[doc_type] = self.info.html
            self.info.status[doc_type] = 'Success'
        except Exception as error:
            # Record the failure but keep processing remaining types
            self.info.status[doc_type] = repr(error)

    # Return ScrapeInfo object
    return self.info