def grepBingAcadPDFbyID(self,bid,maxpage=1,printyn=True): '''Grep at most maxpage pages pdf for given bing id Save to doi style based on refering to crossref.''' if (printyn): print "### ### ### ### ### ### ### ### ### " print "## Finding for "+bid+"...." cr=CRrecord() ref=self.bidref(bid) if (printyn): print ref if (os.path.exists(bid+".pdf")): print "Exist file:"+bid+".pdf" return if ref['title']: if (cr.getfromtitle(title=ref['title'],year=ref['year'],volume=ref['volume'], pages=ref['pages'],issue=ref['issue'],fullparse=False) and cr.doi): # try to find by title, if found (true): if (printyn): print cr outname=quotefileDOI(cr.doi)+".pdf" if (not os.path.exists(outname)): if (self.getbidpdf(bid,filename=outname,printyn=printyn)): print "Have Found PDF file: "+outname else: print "Exist file:"+outname else: if (self.getbidpdf(bid,filename=bid+".pdf",printyn=printyn)): print "Have Found PDF file: "+bid+".pdf" else: if (self.getbidpdf(bid,filename=bid+".pdf",printyn=printyn)): print "Have Found PDF file: "+bid+".pdf"
def finddoi(self, num, prefix='', issn=''): title = self.gettitle(num) doi = DOI(self.getdoi(num)) if (not prefix): prefix = doi.split('/', 1)[0] if doi else "" volume = self.getvolume(num) journal = self.getjournalfull(num) year = self.getyear(num) pages = self.getpages(num) self.cr = CRrecord() try: # The origin doi maybe true. Find in crossref if (doi and self.cr.getfromdoi(doi, fullparse=False) and self.cr.doi): # Further check title if (strdiff(doi,self.cr.doi)>=0.85 and \ strsimilarity(normalizeString(title),normalizeString(self.cr.title))>0.75): return doi if (volume and pages): ops = pages.split('-') crps = self.cr.pages.split('-') if (len(ops) > 0 and len(crps) > 0 and ops[0] == crps[0] and volume == self.cr.volume): return doi if (year and pages): ops = pages.split('-') crps = self.cr.pages.split('-') if (len(ops) > 0 and len(crps) > 0 and ops[0] == crps[0] and year == self.cr.year): return doi print "Origin DOI:", doi, "may be true but record strange..Try title" keyword = title + " " + journal + " " + year + " " + pages + " " + volume if (self.cr.getfromtitledoi(keyword, doi, year=year, limit=10, fullparse=False, prefix=prefix)): if (doi): if (prefix == self.cr.doi.split('/')[0] and strdiff(doi, self.cr.doi) >= 0.85): return self.cr.doi else: print "Error for origin doi: " + doi + "; found: " + self.cr.doi return "" return self.cr.doi if (doi): if (strdiff(doi, self.cr.doi) >= 0.85): return self.cr.doi else: print "Error2 for origin doi: " + doi + "; found: " + self.cr.doi return "" else: return "" except Exception as e: print "Error when find doi..", e, "\nRetry..." return self.finddoi(num, prefix=prefix, issn=issn)
def findcrossreftitledoi(self,doi,printyn=True): '''Find doi by crossref first''' cr=CRrecord() if( cr.getfromdoi(doi,fullparse=False) and cr.doi): keyword=(cr.title+" "+cr.doi).encode('utf-8') print "#########################################################################" print "## Now finding for doi with title: "+ keyword.encode('utf-8')+"............" sys.stdout.flush() self.grepBingAcadPDF(keyword=keyword,maxpage=1,printyn=printyn) else: print "Error DOI!: "+doi cr.reset()
def getdoi(self,num=0):
    '''Get DOI from Baidu Cite

    The 'txt' citation of item num is parsed.  A <doi> element is used when
    present; otherwise the text of <primarytitle><info> is looked up in
    crossref by title.  Anything before the first "10." is stripped.

    Returns a DOI object; DOI("") when no DOI text containing "10." is
    available.'''
    soup=BeautifulSoup(self.getcite(num,citetype='txt'),"html.parser")
    doi=""
    if (soup.doi):
        doi=soup.doi.text
    elif(soup.primarytitle):
        info=soup.primarytitle.info
        # A <primarytitle> without an <info> child yields None here; the
        # original dereferenced it unconditionally.
        if info is not None:
            cr=CRrecord()
            cr.getfromtitle(info.text,ignorecheminfo=True)
            doi=cr.doi
    # str.find returns -1 when "10." is absent; slicing from -1 would return
    # the last character of the text as if it were a DOI.
    start=doi.find('10.') if doi else -1
    if start < 0:
        return DOI("")
    return DOI(doi[start:])
def findcrossreftitledoi(self,doi,printyn=True): '''Find doi by crossref first''' cr=CRrecord() if( cr.getfromdoi(doi,fullparse=False) and cr.doi): keyword=cr.title+" "+cr.doi print "#########################################################################" print "## Now finding for doi with title: "+ keyword.encode('utf-8')+"............" sys.stdout.flush() self.search(keyword=keyword) self.getallpdf() else: print "Error DOI!: "+doi cr.reset()
def findPDFbyISSN(self,issn,maxresult=None, step=100, offset=0): '''Find PDF by ISSN based on search result from crossref''' # may be improve to not only issn.. if (not issn):return needurl="http://api.crossref.org/journals/"+issn+"/works" cr=CRrecord() total=cr.gettotalresultfromlink(needurl) if (not maxresult or maxresult <=0 or maxresult>total): maxresult=total params={"rows":str(step)} maxround=(maxresult-offset)/step+1 offsetcount=offset for i in range(maxround): params["offset"]=str(step*i+offset) r=requests.get(needurl,params,timeout=timeout_setting_download) if (r.status_code is 200): for j in r.json()['message']['items']: keyword=j.get('title',[''])[0]+" "+j.get("DOI","") print "#####################################",offsetcount,"####################################" print "## Now finding for doi with title: "+ keyword.encode('utf-8')+"............" sys.stdout.flush() bingacad.grepBingAcadPDF(keyword.encode('utf-8')) offsetcount+=1 gc.collect()
def findPDFbyISSN(self,issn,maxresult=None, step=100, offset=0, usedoi=True,doifilter=None,onlinecheck=True,savestate=None,proxy=None,usebdcheck=True): '''Find PDF by ISSN based on search result from crossref''' # may be improve to not only issn.. if (not issn):return if (len(issn)==9 and issn[4]=='-'): needurl="http://api.crossref.org/journals/"+issn+"/works" elif('10.' in issn): needurl="http://api.crossref.org/prefixes/"+issn+"/works" else: print "Error ISSN/prefix" sys.exit(1) cr=CRrecord() total=cr.gettotalresultfromlink(needurl) if (not maxresult or maxresult <=0 or maxresult>total): maxresult=total params={"rows":str(step)} maxround=(maxresult-offset)/step+1 offsetcount=offset bdcheck=BDCheck() for i in range(maxround): params["offset"]=str(step*i+offset) r=requests.get(needurl,params,timeout=timeout_setting_download) if (r.status_code is 200): # Get all check/in oapdf if usebdcheck: bdcheckall=bdcheck.filterdois(r.json(),oapdf=1,crjson=True) for j in r.json().get('message',{}).get('items',[]): keyword=j.get('title',['']) doi=DOI(j.get("DOI","")) if not doi: offsetcount+=1 time.sleep(2) continue # Check whether in bdcheck if (usebdcheck and doi in bdcheckall): print doi, 'has search/oapdf/free by bdcheck' offsetcount+=1 time.sleep(1) continue # If not in bdcheck, check oapdf/free and set it # TODO: remove it after combine oapdf information to library oapdffree=bdcheck.setbycheck(doi) if (oapdffree[0] or oapdffree[1]): print doi,'exist in oapdf/free library..' offsetcount+=1 time.sleep(1) continue if (keyword): keyword=keyword[0] else: time.sleep(2) offsetcount+=1 continue if usedoi:keyword+=" "+doi print "#####################################",offsetcount,"####################################" print "## Now finding for doi with title:"+doi+" "+ keyword.encode('utf-8')+"............" 
sys.stdout.flush() self.search(keyword.encode('utf-8'),proxy=proxy) bdresult=self.getallpdf(doifilter,onlinecheck=onlinecheck,savestate=savestate,usebdcheck=usebdcheck) bdcheck.set(doi) offsetcount+=1 gc.collect() print "End of process for",issn
def renamecheck(self,fname,wtitle=0.65,cutoff=0.85,justcheck=False,resetfile=True,fdoi=None,excludedoi=None, fobj=None):
    '''Check a PDF against the DOI given by its file name and file it by confidence.

    The DOI to test (fdoi) is taken from the fdoi argument or, when that is
    not given, from the base name of the opened file.  It is validated with
    CRrecord.valid_doi, and the resulting crossref record is compared with
    the PDF: total page count, DOIs found in the text, title similarity and
    the self.scorefitting result.  Unless justcheck is set, the outcome is
    handed to self.moveresult.

    Args:
        fname: file name passed to self.reset when resetfile is true.
        wtitle: weight of the title similarity when it is added to
            crscore['total'].
        cutoff: threshold for that weighted sum (0.1 lower when the
            crossref year is not after 1990).
        justcheck: when true, self.moveresult is skipped at the guarded
            call sites.  It is forced to True when excludedoi is non-empty.
        resetfile: when true, (re)open the file via self.reset.
        fdoi: DOI to test instead of the one derived from the file name.
        excludedoi: set of DOIs; a non-empty set marks the call as a
            recursive one (see the "recursive" flag below).
        fobj: file / StringIO object used instead of fname when resetfile
            is true.

    Returns:
        int result code: 0 Done, 1 High, 2 Unsure, 3 Untitle, 4 Fail,
        5 Page0, 6 ErrorDOI.  9 is returned for inconsistent arguments.
    '''
    ### Result back:
    # 0: Done
    # 1: High
    # 2: Unsure
    # 3: Untitle
    # 4: Fail
    # 5: Page0
    # 6: ErrorDOI
    # 10: Unknow
    # NOTE(review): the visible code never returns 10; the two argument-error
    # exits below return 9.

    # A file object takes precedence over fname; "None" (the string) is used
    # as a marker that the reset has already been done.
    if (resetfile and isinstance(fobj,(file,StringIO))):
        self.reset(fname="",fobj=fobj)
        fname="None"
    # len(self.doi) is 1 and len(self.doi - excludedoi) is 1 :
    #     :: First Run and perform check
    # len(self.doi) is 1 or len(self.doi - excludedoi) is 1 :

    if (not fname and not fdoi):
        print "No given file name or doi! (Return 6)"
        return 6
    if (fname and not fdoi and excludedoi):
        print "What do you want?! No excludedoi set by user! (Return 9)"
        return 9
    if (resetfile and fname !="None"):
        self.reset(fname)
    elif(resetfile and not isinstance(fobj,(file,StringIO))):
        print "Use reset file but no file name/object is given!"
        return 9
    if (self.maxpage == 0):
        if not justcheck:
            self.moveresult(5, printstr="Error Page 0 (Page0, R5): "+self._fname)
        return 5
    if (not excludedoi):
        excludedoi=set()
    if (not fdoi):
        #File obj is ""
        # Derive the DOI from the file's base name without extension.
        fdoi=DOI(os.path.splitext(os.path.basename(self._fname))[0])
    else:
        fdoi=DOI(fdoi)
    # A non-empty exclusion set is what distinguishes a recursive call.
    recursive= (len(excludedoi) > 0)
    # If in recursive, don't move file!
    if recursive: justcheck=True
    if resetfile and not recursive:
        self.realdoi=fdoi

    # Only find DOI in first time!
    if (not recursive and fdoi):
        self.finddoi(1)
    elif (not recursive and not fdoi):
        self.finddoi(set([1,2,self.maxpage]))

    # The DOI from the file name is unusable. Recursively use doi in file or fail
    if (not fdoi and not recursive):
        if (len(self.doi) is 1 or len(self.doi) is 2):
            # Trailing comma: Python 2 print without a newline.
            print "Origin fdoi wrong but has 1~2 dois in file:",self._fname,
            return self.recursivedoicheck(excludedoi,olddoi=fdoi,wtitle=wtitle,cutoff=cutoff,justcheck=justcheck)
        # No doi or >2 dois in file
        else:
            if not justcheck:
                self.moveresult(4,printstr="Error fdoi and 0/too much doi. (Fail): "+self._fname)
            return 4
    elif (not fdoi and recursive):
        print "doi (in recursion) may wrong with error doi. Should never happen.."
        return 4 # Fail

    # fdoi is ok
    # Any failure of the crossref lookup is treated as "no record" (cr=None).
    cr=CRrecord()
    try:
        cr=cr.valid_doi(fdoi,fullparse=True)
    except requests.exceptions.RequestException as e:
        print e
        cr=None
    except Exception as e:
        print e
        cr=None
    # Error when year=None, improve in crrecord.
    #if (cr and not cr.year):
    #    cr.year='8888'

    #crossref is ok
    if (fdoi and cr):
        # ---- Step 1: compare the PDF page count with the crossref page range.
        totalpagenumber=1
        try:
            totalpagenumber=self.totalpages(cr.pages)
        except ValueError as e:
            # should never happen now
            print e, cr.pages
        totalpagewrong=False
        #print "pages:",self.maxpage,' in crossref:',cr.pages,totalpagenumber
        # Up to two extra pages in the PDF are tolerated.
        if totalpagenumber>0 and not (self.maxpage >= totalpagenumber and self.maxpage <= totalpagenumber+2):
            totalpagewrong=True
            # When paper with supporting information
            if (self.maxpage > totalpagenumber+2):
                self.finddoi(page=2)
                if (self.withSI or (self.findtext('Supporting Information', page=[totalpagenumber+1,totalpagenumber+2])
                    and self.findtext(cr.title, similarity=0.75, page=[totalpagenumber+1,totalpagenumber+2]))):
                    if not recursive : self.finddoi(totalpagenumber);
                    self.withSI=True
                    totalpagewrong=False
                # For NIH Public Access
                elif (self.hascontent("NIH Public Access")[0]):
                    totalpagewrong=False
                #Such as some Nature with SI in paper without notify.
                elif (self.withSI or (totalpagenumber>1 and self.findtext("acknowledgment", page=[totalpagenumber-1, totalpagenumber])
                    and self.findtext("reference", page=[totalpagenumber-1, totalpagenumber]))):
                    self.withSI=True
                    totalpagewrong=False

        # Recursive but total page wrong. Fast end recursivedoicheck
        if (totalpagewrong and recursive):
            return 4

        # ---- Step 2: fast path - DOI on page 1, title found, single DOI in file.
        # Just check first page, not find(find before..), faster:
        doivalid=self.checkdoi(fdoi,page=1,iterfind=False,justcheck=True)
        titleeval=self.checktitle(cr.title)
        if (totalpagenumber > 0 and not totalpagewrong):
            if (doivalid and titleeval[0] and len(self.doi) is 1):
                # Yes! Very Good PDF!
                self.realdoi=fdoi
                if not justcheck:
                    # good=True only for an exact page match without an SI marker on page 1.
                    if (self.maxpage>=2 and self.maxpage == totalpagenumber and not self.findtext('Supporting Information', page=[1])):
                        self.moveresult(0,good=True)
                    else:
                        self.moveresult(0)
                return 0

        # Further check doi in page2/last, Finally, will check 1,2 and last pages.
        if (recursive):
            doivalid= ( self.checkdoi(fdoi,page=2,iterfind=True,justcheck=True) or doivalid )
        else:
            doivalid= ( self.checkdoi(fdoi,page=2,iterfind=True) or doivalid )

        if len(self.doi)>3:
            # Too much doi may be some abstract
            # NOTE(review): unlike the other exits, this moveresult call is
            # not guarded by "if not justcheck" - confirm this is intended.
            self.moveresult(2,printstr='Has more than 3 dois! (Unsure):'+self._fname)
            return 2

        # ---- Step 3: page count still wrong - try the DOIs found in the file.
        # Page wrong and try recursive use doi
        if (totalpagewrong):
            if (len(self.doi) is 1 or len(self.doi) is 2):
                doi=DOI(list(self.doi)[0])
                # DOI in file is same so error. Don't need recursive
                if (len(self.doi) is 1 and doi == fdoi):
                    if not justcheck:
                        self.moveresult(4,printstr="PDF Page "+str(self.maxpage)+"!="+str(totalpagenumber)+"(Fail): "+self._fname)
                    return 4
                print 'Wrong total page with dois in file,',self._fname,fdoi,',try recursive'
                return self.recursivedoicheck(excludedoi,olddoi=fdoi,wtitle=wtitle,cutoff=cutoff,justcheck=justcheck)
            else:
                if not justcheck:
                    self.moveresult(4,printstr="PDF Page "+str(self.maxpage)+"!="+str(totalpagenumber)+"(Fail): "+self._fname)
                return 4

        # ---- Step 4: score title and crossref metadata against the PDF.
        # Both branches of the totalpagewrong block above return, so
        # totalpagewrong is always False from here on.
        if (not totalpagewrong):
            crscore=self.scorefitting(cr)
            if (self.maxpage <= totalpagenumber+2):
                # Maybe check when maxpage >total+2
                titleeval=self.checktitle(cr.title)
        # A blank crossref title counts as "not found" with a fixed score of 0.9.
        if cr.title.strip()=="":
            titleeval=(False,0.9)
        titlevalid=titleeval[0]
        try:
            paperyear=int(cr.year)
        except:
            # Unparsable year: use 9999 so the stricter (post-1990) cutoff applies.
            paperyear=9999
        try:
            # Too old maybe lost information
            if (paperyear>1990):
                titlevalid=titlevalid or (titleeval[1]*wtitle+crscore['total'])>=cutoff
            else:
                titlevalid=titlevalid or (titleeval[1]*wtitle+crscore['total'])>=cutoff-0.1
            #(self.checktitle(cr.title,similarity=0.85) and self.checkcrossref(cr))
        except Exception as e:
            print e

        # ---- Step 5: fdoi was found in the PDF text.
        if (doivalid):
            if (titlevalid):
                # Yes! Good PDF!
                self.realdoi=fdoi
                if not justcheck:
                    if (self.maxpage>=2 and self.maxpage == totalpagenumber and len(self.doi) is 1 and not self.findtext('Supporting Information', page=[1])):
                        self.moveresult(0,good=True)
                    else:
                        self.moveresult(0)
                return 0

            print "Title/Paper score:",titleeval[1],crscore,self._fname
            if (len(self.doi - set([fdoi])) == 1 and not recursive):
                # Try one more
                # Dry run (justcheck=True) with the single other DOI in the file.
                newresult = self.recursivedoicheck(excludedoi,olddoi=fdoi,wtitle=wtitle,cutoff=cutoff,justcheck=True)
                if (newresult is 0):
                    newdoi=DOI(list(self.doi - set([fdoi]))[0])
                    self.realdoi=newdoi
                    print
                    if not justcheck:
                        self.moveresult(0, printstr="(Rename)fdoi ok, but not title. In file doi "+newdoi+" is better for "+self._fname, newfname=newdoi.quote()+".pdf")
                    return 0

            # Else DOI ok but not title
            if not justcheck:
                self.moveresult(3,printstr="OK fdoi but not title(Untitle): "+self._fname)
            return 3

        # ---- Step 6: fdoi not found directly; try the looser text checks.
        # Indeed, doi maybe in pdf, but strange format..
        if (self.checkdoinormaltxt(fdoi)):
            if (titlevalid):
                # Further check only when title OK
                if (self.checkdoifurther(fdoi)):
                    # Fine! move to Done dir
                    if not justcheck:
                        if (self.maxpage>=2 and self.maxpage == totalpagenumber and len(self.doi) is 1 and not self.findtext('Supporting Information', page=[1])):
                            self.moveresult(0,good=True)
                        else:
                            self.moveresult(0)
                    return 0
                else:
                    # Can't find, but high similar! move to High dir
                    if not justcheck:
                        self.moveresult(1,printstr="OK title and nospacebreak doi,but not pass(High): "+self._fname)
                    return 1
            else:
                # DOI ok but not title
                print "Title/Paper score:",titleeval[1],crscore,self._fname
                if not justcheck:
                    self.moveresult(3,printstr="Maybe OK fdoi but not title(Untitle): "+self._fname)
                return 3

        # ---- Step 7: fdoi is not in the PDF at all, but the title fits.
        # DOI maybe not exist ....
        if (titlevalid):
            # Iterate over a copy because entries are removed from self.doi;
            # DOIs that fail valid_doiorg are dropped.
            tmpdois=set(self.doi)
            for d in tmpdois:
                dd=DOI(d)
                if ( not dd.valid_doiorg(geturl=False) ):
                    self.doi.remove(d)
            # Old paper don't have doi...
            if len(self.doi) is 0 and totalpagenumber>0:
                if (crscore['total'] >= 0.4):
                    if not justcheck:
                        if (self.maxpage>=2 and self.maxpage == totalpagenumber and not self.findtext('Supporting Information', page=[1])):
                            self.moveresult(0,good=True)
                        else:
                            self.moveresult(0)
                    return 0
                elif (titleeval[1]>=0.85 and crscore['total'] >= 0.35):
                    if not justcheck:
                        if (self.maxpage>=2 and self.maxpage == totalpagenumber and not self.findtext('Supporting Information', page=[1])):
                            self.moveresult(0,good=True)
                        else:
                            self.moveresult(0)
                    return 0
                elif (titleeval[1]>=0.95 and crscore['total'] >=0.3):
                    if not justcheck:
                        if (self.maxpage>=2 and self.maxpage == totalpagenumber and not self.findtext('Supporting Information', page=[1])):
                            self.moveresult(0,good=True)
                        else:
                            self.moveresult(0)
                    return 0
                elif (titleeval[1]>=0.90 and crscore['pages']>=0.9 and crscore['year']>=0.9 and (crscore['journal']>=0.9 or crscore['issn']>=0.9)):
                    if not justcheck:
                        self.moveresult(0)
                    return 0
                elif (titleeval[1]>=0.90 and crscore['pages']>=0.5 and crscore['year']>=0.9 and (crscore['journal']>=0.9 or crscore['issn']>=0.9) and crscore['authors']>=0.7):
                    if not justcheck:
                        self.moveresult(0)
                    return 0
                elif (titleeval[1]>=0.75 or crscore['total'] >=0.25):
                    print "Title/Paper score:",titleeval[1],crscore,self._fname
                    if not justcheck:
                        self.moveresult(1,printstr="OK title and high info fit. But no doi(Highly): "+self._fname)
                    return 1
                else:
                    print "Title/Paper score:",titleeval[1],crscore,self._fname
                    if not justcheck:
                        self.moveresult(2,printstr="OK title and ok info fit. But no doi(Unsure): "+self._fname)
                    return 2
            elif len(self.doi) is 0 and totalpagenumber== -1:
                if (titleeval[1]>=0.90 and crscore['pages']>=0.5 and crscore['year']>=0.9 and (crscore['journal']>=0.9 or crscore['issn']>=0.9) and crscore['authors']>=0.7):
                    if not justcheck:
                        self.moveresult(0)
                    return 0
                else:
                    print "Title/Paper score:",titleeval[1],crscore,self._fname
                    if not justcheck:
                        self.moveresult(2,printstr="OK title and high info fit. But no doi and no total pages(Unsure): "+self._fname)
                    return 2
            elif len(self.doi) is 0 and totalpagenumber<=0:
                print "Title/Paper score:",titleeval[1],crscore,self._fname
                if not justcheck:
                    self.moveresult(2,printstr="OK title and high info fit. But no doi and no total pages(Unsure): "+self._fname)
                return 2
            elif ( len(self.doi) > 0 and not recursive):
                print "Good title but file doesn't contain fdoi, however it has >0 doi in file. "
                # Dry run first; only a clean 0 triggers the real recursive pass.
                outnow=self.recursivedoicheck(excludedoi,olddoi=fdoi,wtitle=wtitle,cutoff=cutoff,justcheck=True)
                if outnow > 0:
                    if not justcheck:
                        self.moveresult(2,printstr="OK title but not fdoi. In file doi is not good(Unsure): "+self._fname)
                    return 2
                elif(outnow==0):
                    print 'Good Title but Fail fdoi. Paper has good in file doi,',self._fname,',try recursive'
                    return self.recursivedoicheck(excludedoi,olddoi=fdoi,wtitle=wtitle,cutoff=cutoff,justcheck=justcheck)

            ### Old method check old items:
            #if (self.checkcrossref(cr)):
            #    if (int(cr.year)<=1999 and len(self.doi) is 0):
            #        # Highly possible right
            #        if not justcheck: self.movetodir("High")
            #        return True
            #    # Bentham, often blank doi
            #    elif (fdoi[:8] == '10.2174/' and len(self.doi) is 0):
            #        if not justcheck: self.movetodir("Done")
            #        return True
            #    elif (len(self.doi) is 0):
            #        print "Title/Paper score:",titleeval[1],crscore,self._fname
            #        if not justcheck:
            #            self.moveresult(1,printstr="OK title and high info fit. But no doi(Highly): "+self._fname)
            #        return 1
            #    else:
            #        if not justcheck:
            #            self.moveresult(2,printstr="OK title and high info fit. But doi exist not fit(Unsure): "+self._fname)
            #        return 2
            #elif(len(self.doi) is 0):
            #    # Maybe wrong file and no doi
            #    if not justcheck:
            #        self.moveresult(2,printstr="Not found doi in file but ok title (Unsure): "+self._fname)
            #    return 2

        # ---- Step 8: neither fdoi nor title could be confirmed.
        #fdoi,title wrong, no doi in file
        # Or in recursive mode
        if (len(self.doi) is 0 or recursive):
            if not justcheck:
                self.moveresult(4,printstr="Both fdoi and title wrong, no doi in file(Fail): "+self._fname)
            return 4

        # Indeed, file has only one more doi, not the same to fname
        if (len(self.doi - set([fdoi])) is 1 ):
            print 'Fail fdoi/title. Paper with one more doi in file,',self._fname,',try recursive'
            return self.recursivedoicheck(excludedoi,olddoi=fdoi,wtitle=wtitle,cutoff=cutoff,justcheck=justcheck)
        elif(len(self.doi) > 1):
            if not justcheck:
                self.moveresult(4,printstr="fdoi/title fail. Too much infile doi(Fail): "+self._fname)
            return 4
        else:
            if not justcheck:
                self.moveresult(4,printstr="What????? What?????(Fail):"+self._fname)
            return 4
    # not cr
    # No usable crossref record for fdoi: fall back to the DOIs found in the file.
    else:
        if (not recursive):
            self.finddoi(set([1,2,self.maxpage]))
            if (len(self.doi) is 1 or len(self.doi) is 2):
                print 'Error DOI filename,',self._fname,',try recursive'
                return self.recursivedoicheck(excludedoi,olddoi=fdoi,wtitle=wtitle,cutoff=cutoff,justcheck=justcheck)
        if not justcheck:
            self.moveresult(6,"Error DOI fname(Fail):"+self._fname)
        return 6