def findPDFbyISSN(self,issn,maxresult=None, step=100, offset=0, usedoi=True,doifilter=None,onlinecheck=True,savestate=None,proxy=None,usebdcheck=True): '''Find PDF by ISSN based on search result from crossref''' # may be improve to not only issn.. if (not issn):return if (len(issn)==9 and issn[4]=='-'): needurl="http://api.crossref.org/journals/"+issn+"/works" elif('10.' in issn): needurl="http://api.crossref.org/prefixes/"+issn+"/works" else: print "Error ISSN/prefix" sys.exit(1) cr=CRrecord() total=cr.gettotalresultfromlink(needurl) if (not maxresult or maxresult <=0 or maxresult>total): maxresult=total params={"rows":str(step)} maxround=(maxresult-offset)/step+1 offsetcount=offset bdcheck=BDCheck() for i in range(maxround): params["offset"]=str(step*i+offset) r=requests.get(needurl,params,timeout=timeout_setting_download) if (r.status_code is 200): # Get all check/in oapdf if usebdcheck: bdcheckall=bdcheck.filterdois(r.json(),oapdf=1,crjson=True) for j in r.json().get('message',{}).get('items',[]): keyword=j.get('title',['']) doi=DOI(j.get("DOI","")) if not doi: offsetcount+=1 time.sleep(2) continue # Check whether in bdcheck if (usebdcheck and doi in bdcheckall): print doi, 'has search/oapdf/free by bdcheck' offsetcount+=1 time.sleep(1) continue # If not in bdcheck, check oapdf/free and set it # TODO: remove it after combine oapdf information to library oapdffree=bdcheck.setbycheck(doi) if (oapdffree[0] or oapdffree[1]): print doi,'exist in oapdf/free library..' offsetcount+=1 time.sleep(1) continue if (keyword): keyword=keyword[0] else: time.sleep(2) offsetcount+=1 continue if usedoi:keyword+=" "+doi print "#####################################",offsetcount,"####################################" print "## Now finding for doi with title:"+doi+" "+ keyword.encode('utf-8')+"............" sys.stdout.flush() self.search(keyword.encode('utf-8'),proxy=proxy) bdresult=self.getallpdf(doifilter,onlinecheck=onlinecheck,savestate=savestate,usebdcheck=usebdcheck) bdcheck.set(doi) offsetcount+=1 gc.collect() print "End of process for",issn
def getallpdf(self,doifilter=None,onlinecheck=True,savestate=None,usebdcheck=True): '''Get All pdf from link doifilter should be a function, return True when DOI ok''' usedoifilter=callable(doifilter) getallfilelist=[] if isinstance(savestate,(list,tuple,set)): savestate=set(savestate) elif (isinstance(savestate,int)): savestate=set([savestate]) else: savestate=set([0,1,2,3]) bdcheck=BDCheck() for i in range(len(self.items)): try: getfilelist=[] # Get PDF links links=self.getpdflink(i) if (links): doi=DOI(self.getdoi(i)) if not doi: print "blank doi..",doi continue if ( usedoifilter and not doifilter(doi)): print doi,'Not fit filter..' continue # Check by bdcheck api if (usebdcheck): bdout=bdcheck.get(doi) if sum(bdout)>0: print doi, 'has search/oapdf/free',bdout continue oapdffree=bdcheck.setbycheck(doi) if (oapdffree[0] and oapdffree[1]): print doi,'exist in oapdf/free library..' continue elif oapdffree[0]: print doi,'exist in oapdf library..' continue elif oapdffree[1]: print doi,'exist in free library..' continue doifname=doi.quote()+".pdf" if (pdfexistpath(doifname)): print doi,'Files exist in current folder..' continue # Start to find pdf at each link print "### Find for result with DOI: "+doi foundDonePDF=False for link in links: print 'Link:',str(link), if (onlinecheck): print "Try Getting..", # Get a StringIO obj getpdfobj=getwebpdf(link,fname=doifname,params=getwebpdfparams(link),stringio=True) if (not getpdfobj): continue try: dpfresult=self.pdfcheck.checkonlinepdf(fobj=getpdfobj,doi=doi) sys.stdout.flush() if (dpfresult!=0): if ( savestate and (dpfresult in savestate)): #Important to set fname to None rmresult=self.pdfcheck.removegarbage(fname=None,notdelete=True) if (rmresult <= 1): getfilelist.append( (getpdfobj,self.pdfcheck.realdoi,dpfresult)) else: print "Not OK PDF for doi",doi else: foundDonePDF=True if (self.pdfcheck.savefobj2file(doi=self.pdfcheck.realdoi,state=0,fobj=getpdfobj)): print "!!!!!!! Get PDF file to Done!: "+self.pdfcheck.realdoi del getfilelist[:] nowdoi=DOI(self.pdfcheck.realdoi) getallfilelist.append('Done/'+nowdoi.quote()+'.pdf') break else: print "What? should never happen for pdfdoicheck.savefobj2file Done.." except Exception as e: print e,'Error at baidu getallpdf(web) when doing pdfcheck',doi,link # Now should not use this method elif (getwebpdf(link,fname=doifname,params=getwebpdfparams(link))): print "Please don't use download pdf to disk, use check online!" print "Try Getting..", try: dpfresult=self.pdfcheck.renamecheck(doifname) sys.stdout.flush() if (dpfresult!=0): if ( savestate and (dpfresult in savestate)): #Important to set fname to None rmresult=self.pdfcheck.removegarbage(fname=None) if (rmresult <= 1): if (os.path.exists(self.pdfcheck._fname)): getfilelist.append((self.pdfcheck._fname, dpfresult)) else: print "What? should never happen for pdfdoicheck.moveresult Not Done.." else: print "Has been removed.." else: if (os.path.exists(self.pdfcheck._fname)) : os.remove(self.pdfcheck._fname) else: foundDonePDF=True if (os.path.exists(self.pdfcheck._fname)): print "!!!!!!! Get PDF file to Done!: "+doifname getfilelist.append(self.pdfcheck._fname) #time.sleep(random.randint(1,5)) break else: print "What? should never happen for pdfdoicheck.moveresult Done.." except Exception as e: if os.path.exists(doifname): if (not os.path.exists('tmpfail/'+doifname)): os.renames(doifname,'tmpfail/'+doifname) else: os.remove(doifname) print e,'Error at baidu getallpdf when doing pdfcheck' else: print "can't get at this link" bdcheck.set(doi) # Online Check but not Done if onlinecheck and not foundDonePDF and len(getfilelist)>0: minnum=-1 minresult=999999 for i in range(len(getfilelist)): if getfilelist[i][2]<minresult: minnum=i nowdoi=DOI(getfilelist[minnum][1]) if (self.pdfcheck.savefobj2file(doi=nowdoi,state=getfilelist[minnum][2],fobj=getfilelist[minnum][0])): print "!!!!!!! Get PDF file to: "+self.pdfcheck.judgedirs.get(getfilelist[minnum][2],'.'),self.pdfcheck.realdoi getallfilelist.append(self.pdfcheck.judgedirs.get(getfilelist[minnum][2],'.')+os.sep+nowdoi.quote()+".pdf") del getfilelist[:] except Exception as e: print e, "##### Error when get pdf.." return getallfilelist