def testISSN(issn):
    # works = Works()
    # info = works.doi('10.2514/8.7231')
    journals = Journals()
    info = journals.journal(issn)
    '''
    info.pop('last-status-check-time')
    info.pop('counts')
    info.pop('breakdowns')
    info.pop('flags')
    info.pop('coverage')
    info.pop('coverage-type')
    print(info)
    for x in info:
        print(x)
    '''
    print(info)
def getInfoFromISSN(issn):
    # Retrieve all information about the ISSN
    journals = Journals()
    info = journals.journal(issn)

    # Retrieve only the properties we care about
    itemTitle = info.get('title')
    itemPub = info.get('publisher')
    itemSubj = info.get('subjects')
    itemISSN = info.get('ISSN')
    itemISSNinfo = info.get('issn-type')

    # Go through the non-string entries and format them
    string = ""
    # Loop through the dicts in the subjects list
    for x in itemSubj:
        # Get all the values from this dict and store them in a list
        temp = [*x.values()]
        # Join the items from the list, separated by a comma
        temp = ', '.join(map(str, temp))
        # Add the current dict entry (now a string of values) to the result
        string = string + temp + ', '
    # Remove the trailing ', '
    itemSubj = string[:-2]

    # Join all ISSNs for this item using a comma
    itemISSN = ", ".join(itemISSN)

    string = ""
    # Loop through the dicts in the issn-type list
    for x in itemISSNinfo:
        # Get all the values from this dict and store them in a list
        temp = [*x.values()]
        # Join the items from the list, separated by a comma
        temp = ', '.join(map(str, temp))
        # Add the current dict entry (now a string of values) to the result
        string = string + temp + ', '
    # Remove the trailing ', '
    itemISSNinfo = string[:-2]

    infoList = [itemTitle, itemPub, itemSubj, itemISSN, itemISSNinfo]
    return infoList
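# A minimal usage sketch of getInfoFromISSN, assuming `from crossref.restful
# import Journals` has already been done at module level; the ISSN below is a
# hypothetical placeholder chosen only for illustration.
if __name__ == '__main__':
    info_list = getInfoFromISSN('1548-7091')  # hypothetical example ISSN
    labels = ['title', 'publisher', 'subjects', 'ISSN', 'issn-type']
    for label, value in zip(labels, info_list):
        print('{}: {}'.format(label, value))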
def query_to_crossref(document):
    title = document.original_title()
    author = ' '.join([document.first_author.get('surname', ''),
                       document.first_author.get('given_names', '')]).strip()
    pub_date = document.publication_date

    if title is None:
        return None

    result = [i for i in Journals().works(document.journal.scielo_issn)
              .query(title=title, author=author)
              .filter(from_pub_date=pub_date, until_pub_date=pub_date)]

    if len(result) != 1:
        return None

    # result holds exactly one matching record at this point
    return result[0].get('DOI', None)
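# For reference, a standalone sketch of the Crossref lookup that
# query_to_crossref wraps, built from the same query/filter calls of
# crossref.restful. The ISSN, title, author and date in the example call are
# hypothetical placeholders, not values taken from a real document.
from crossref.restful import Journals

def lookup_doi(issn, title, author, pub_date):
    candidates = list(
        Journals().works(issn)
        .query(title=title, author=author)
        .filter(from_pub_date=pub_date, until_pub_date=pub_date))
    # Only trust the lookup when it matches exactly one record
    if len(candidates) == 1:
        return candidates[0].get('DOI')
    return None

# lookup_doi('0100-4042', 'Some article title', 'Silva Maria', '2015-06-01')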
# -*- coding: utf-8 -*-
"""
Created on Fri Nov 1 18:04:47 2019

@author: asdqw
"""
import os

import requests
from bs4 import BeautifulSoup
from crossref.restful import Works
from crossref.restful import Journals
from selenium import webdriver

works = Works()
journals = Journals()


def acquire_text(url, index):
    # from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
    # from selenium.webdriver.support.ui import WebDriverWait
    # desired_capabilities = DesiredCapabilities.CHROME
    # Change the page-load strategy
    # desired_capabilities["pageLoadStrategy"] = "eager"
    # Commenting out these two lines delays the final output, i.e. the driver
    # waits for the page to finish loading before returning
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    # element.get_attribute('text')
    driver = webdriver.Chrome()  # create the browser
    driver.get(url)  # visit the URL; waits for the page to fully load, except for AJAX requests
def retreive_journal_between_dates(self, journal_issn, from_date, end_date):
    journals = Journals()
    return journals.works(journal_issn).filter(from_created_date=from_date,
                                               until_created_date=end_date)
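# An equivalent standalone sketch of the method above, runnable outside the
# class it belongs to; the ISSN and date range in the example are hypothetical
# placeholders.
from crossref.restful import Journals

def list_dois_between_dates(journal_issn, from_date, end_date):
    # Iterate over the journal's works created in the given window and
    # collect their DOIs.
    works = Journals().works(journal_issn).filter(
        from_created_date=from_date, until_created_date=end_date)
    return [item.get('DOI') for item in works]

# e.g. list_dois_between_dates('0028-0836', '2021-01-01', '2021-01-31')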
class NarrativeDataset:
    LICENSE_WHITELIST = [
        'http://creativecommons.org/licenses/by/4.0/',
        'http://creativecommons.org/licenses/by/3.0/'
    ]
    download_links = dict()

    def __init__(self, reset_cache=False):
        self.journals = Journals()
        self.works = Works()
        self.filter_kwargs = dict(has_license='true', has_full_text='true')
        self.keywords = ('business financial merger entrepreneur banking '
                         'insurance commerce trade economics')
        UnpywallCredentials('*****@*****.**')
        cache_path = path.join(DATA_DIR, 'unpaywall_cache')
        if reset_cache and path.exists(cache_path):
            remove(cache_path)
        self.unpywall_cache = UnpywallCache(cache_path)
        Unpywall.init_cache(self.unpywall_cache)

    def get_dois_from_journal(self, journal_issn):
        doi_list = []
        try:
            if self.journals.journal_exists(journal_issn):
                works = self.journals.works(journal_issn).filter(
                    **self.filter_kwargs).select('DOI', 'license')
                for response_dict in tqdm(works):
                    license_dict = response_dict['license']
                    if self.is_license_whitelist(license_dict[0]['URL']):
                        doi_list.append(response_dict['DOI'])
        except Exception as e:
            logger.error("Error while getting DOIs from REST service: %s", e,
                         exc_info=True)
        return doi_list

    def get_dois_from_keywords(self):
        doi_list = []
        try:
            results = self.works.query(self.keywords).filter(
                **self.filter_kwargs).select('DOI', 'license')
            for response_dict in tqdm(results):
                license_dict = response_dict['license']
                if self.is_license_whitelist(license_dict[0]['URL']):
                    doi_list.append(response_dict['DOI'])
        except Exception as e:
            logger.error("Error while getting DOIs from REST service: %s", e,
                         exc_info=True)
        return doi_list

    def get_oa_urls(self, doi_list):
        logger.info('Retrieving doc urls for DOIs now (cached/uncached)')
        oa_urls = []
        for i, doi in tqdm(enumerate(doi_list), total=len(doi_list)):
            try:
                oa_urls.append(Unpywall.get_doc_link(doi))
            except HTTPError:
                logger.warning(
                    '\nError received for DOI: {}, will retry 3 times in 20 secs'
                    .format(doi))
                sleep(20)
                for attempt in range(3):
                    try:
                        logger.info('Retry: {}'.format(attempt + 1))
                        oa_urls.append(Unpywall.get_doc_link(doi))
                        break
                    except HTTPError as e:
                        logger.error('Retry failed: %s', e, exc_info=True)
        return oa_urls

    def is_license_whitelist(self, license):
        license = str(license).replace('https', 'http')
        return license in self.LICENSE_WHITELIST

    def retry_from_another_src(self, faulty_files_list, doi_list):
        src_dict = {'scirp': []}
        for file in faulty_files_list:
            base_name = ntpath.basename(file)
            doi_list_ind = int(base_name.replace("Sample_", "")[:-8]) - 1
            doi = doi_list[doi_list_ind]
            doc_url = Unpywall.get_pdf_link(doi)
            if doc_url is not None and 'scirp' in doc_url.lower():
                try:
                    scirp_id = doc_url[doc_url.index('paperID=') + 8:]
                except (IndexError, ValueError):
                    continue
                if scirp_id != "":
                    src_dict['scirp'].append((file, scirp_id))
        return download_frm_another_src(src_dict)

    @staticmethod
    def download_doi_pdf(works, doi_list, download_dir):
        logger.info(
            "Trying to download the required data now for {} DOIs".format(
                len(doi_list)))
        for i, doi in enumerate(doi_list):
            name_pattern = 'Sample_{}.pdf'.format(str(i + 1))
            download_link = Unpywall.get_pdf_link(doi)
            try:
                if not download_link:
                    # Fall back to the Crossref 'link' metadata for a
                    # text-mining PDF link
                    result = works.doi(doi)['link']
                    for item in result:
                        application = item['intended-application']
                        content_type = item['content-type']
                        if (application is not None
                                and application == 'text-mining'
                                and content_type == 'application/pdf'):
                            download_link = item['URL']
                            break
                NarrativeDataset.download_links[
                    name_pattern[:-4]] = download_link
                if not path.exists(path.join(download_dir, name_pattern)):
                    if download_link and filter_url(download_link):
                        logger.debug('Downloading ' + name_pattern + " : " +
                                     doi + ' from url: ' + download_link)
                        download_pdf_file(download_link, name_pattern,
                                          download_dir, progress=True)
                        sleep(5)
            except Exception as e:
                logger.error(
                    "Error while downloading the article (%s, %s): %s",
                    str(i + 1), doi, e, exc_info=True)
                NarrativeDataset.download_links[
                    name_pattern[:-4]] = download_link
        return True
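# A hedged end-to-end sketch of how NarrativeDataset might be driven, using
# only the methods defined above. DATA_DIR comes from the surrounding module,
# and the 'pdfs' sub-directory name is an assumption made for illustration.
if __name__ == '__main__':
    dataset = NarrativeDataset(reset_cache=False)

    # Collect candidate DOIs from the keyword query (licence-whitelisted only)
    dois = dataset.get_dois_from_keywords()

    # Resolve open-access document URLs via Unpaywall (cached where possible)
    oa_urls = dataset.get_oa_urls(dois)

    # Download the PDFs into a local directory
    NarrativeDataset.download_doi_pdf(dataset.works, dois,
                                      path.join(DATA_DIR, 'pdfs'))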