def download_pdf(self, record, force_overwrite=False, pdf_choice=None, **kwargs):
    """Locate and download the PDF for ``record`` into ``self.pdfdir``.

    The record page is fetched through ``self.br`` and scanned for a DOI.
    If a DOI is found, the page behind ``http://dx.doi.org/<doi>`` is
    searched for links that mention both ``pdf`` and the DOI. If there is
    no DOI, or that page yields no usable link, the record title is
    submitted to Google and the first non-Google hit is searched instead.

    Parameters:
        record: passed to ``self.get_recid``; its ``'Title'`` entry is used
            as the Google query in the fallback.
        force_overwrite: when false (default) and the target file already
            exists, nothing is downloaded.
        pdf_choice: index into the list of candidate PDF links. ``None``
            (default) means "no choice made yet".
        **kwargs: accepted for call compatibility; not used here.

    Returns:
        ``None`` after a download, or when the file already exists.
        A list of candidate PDF URLs when more than one candidate was found
        and ``pdf_choice`` is ``None``; call again with ``pdf_choice`` set
        to the index of the wanted entry.

    Raises:
        IndexError: the Google fallback produced no usable search hit or no
            PDF link, or ``pdf_choice`` is out of range for the fallback
            candidates.
    """
    import os.path
    import re

    recid = self.get_recid(record)

    # First, check whether we already have the pdf. Note that this method
    # does not create self.pdfdir.
    pdf_path = os.path.join(self.pdfdir, recid + '.pdf')
    if not force_overwrite and os.path.exists(pdf_path):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('already exists: ' + pdf_path)
        return

    # Only needed once we know a download is required.
    from citco import setup_browser

    # Start a non-proxied browser for later use.
    non_proxy_br = setup_browser()

    record_page = (
        'http://apps.isiknowledge.com/CitedFullRecord.do?product=WOS&SID='
        + self.isi_ID
        + '&search_mode=CitedFullRecord&isickref='
        + recid
        + '&db_id=WOS&colname=WOS'
    )
    html = self.br.open(record_page).read()

    pdf_link = None

    # Try to get the doi; an empty capture counts as "no doi".
    doi_matches = re.findall('DOI.*>(.*)<', html)
    doi = doi_matches[0].strip() if doi_matches else ''
    if doi:
        # The doi pages answered "Forbidden" when accessed through the
        # proxied browser, so use the non-proxied one here.
        non_proxy_br.open('http://dx.doi.org/' + doi)
        possible_pdfs = [
            link.absolute_url
            for link in non_proxy_br.links()
            if 'pdf' in link.url and doi in link.url
        ]
        if len(possible_pdfs) > 1 and pdf_choice is None:
            return possible_pdfs
        # pdf_choice itself is left untouched so that the fallback below can
        # still tell "caller made no choice" apart from "caller chose 0".
        try:
            pdf_link = possible_pdfs[pdf_choice or 0]
        except IndexError:
            # No (or not enough) links on the doi page: use the fallback.
            pdf_link = None

    if pdf_link is None:
        # Also reached when a doi was found but gave no usable PDF link.
        print('Unable to get a doi for this record! Googling for the title. Wish me luck!')
        self.br.open('https://www.google.com')
        self.br.select_form(nr=0)
        self.br.form['q'] = record['Title']
        self.br.submit()
        links = [
            link for link in self.br.links()
            if 'http' in link.url and 'google' not in link.url
        ]
        if not links:
            raise IndexError('Google returned no usable result for this record')

        # Follow the first hit and look at where we actually ended up.
        non_proxy_br.open(links[0].url)
        new_url = non_proxy_br.geturl()
        if new_url.endswith('.pdf'):
            pdf_link = new_url
        else:
            self.br.open(new_url)
            possible_pdfs = [
                link.absolute_url for link in self.br.links() if 'pdf' in link.url
            ]
            if len(possible_pdfs) > 1 and pdf_choice is None:
                return possible_pdfs
            if not possible_pdfs:
                raise IndexError('no pdf links found at ' + new_url)
            pdf_link = possible_pdfs[pdf_choice or 0]

    print('attempting to download ' + pdf_link)
    print('saving to ' + pdf_path)
    try:
        self.br.retrieve(pdf_link, filename=pdf_path)
    except Exception:
        # Retry without the proxy. KeyboardInterrupt and SystemExit are
        # deliberately not caught.
        non_proxy_br.retrieve(pdf_link, filename=pdf_path)
def search(self, search_term):
    """Submit ``search_term`` to the WOS advanced-search form.

    A fresh browser is created with ``setup_browser(**rcParams)``, the
    advanced-search form is filled in and submitted, and the response is
    scanned for ``summary.do`` links that mention ``AdvancedSearch``.

    Returns ``[]`` when no such link is present. Otherwise the first link
    is re-requested with a page size of 200 and the result of
    ``self.isi2dict(html, links)`` for that page is returned.
    """
    import re
    from citco import rcParams, setup_browser

    form_url = (
        "http://apps.isiknowledge.com/WOS_AdvancedSearch_input.do?product=WOS&SID="
        + self.isi_ID
        + "&search_mode=AdvancedSearch"
    )
    browser = setup_browser(**rcParams)
    browser.open(form_url)
    browser.select_form(name="WOS_AdvancedSearch_input_form")
    browser['value(input1)'] = search_term
    response_html = browser.submit().read()

    # Keep only the summary links that belong to the advanced search.
    summary_links = []
    for candidate in re.findall("summary\.do.*?\"", response_html):
        if 'AdvancedSearch' in candidate:
            summary_links.append(candidate)
    if len(summary_links) == 0:
        return []

    # Each match ends with the closing double quote of the attribute; drop it.
    first_link = summary_links[0][:-1]
    results_url = (
        'http://apps.isiknowledge.com/'
        + first_link
        + '&page=1&action=changePageSize&pageSize=200'
    )
    html = browser.open(results_url).read()
    return self.isi2dict(html, browser.links())