Esempio n. 1
0
	def grepBingAcadPDFbyID(self,bid,maxpage=1,printyn=True):
		'''Grep at most maxpage pages pdf for given bing id
		Save to doi style based on refering to crossref.'''
		if (printyn):
			print "###  ###  ###  ###  ###  ###  ###  ###  ### "
			print "## Finding for "+bid+"...."
		cr=CRrecord()
		ref=self.bidref(bid)
		if (printyn):
			print ref
		if (os.path.exists(bid+".pdf")):
			print "Exist file:"+bid+".pdf"
			return
		if ref['title']:
			if (cr.getfromtitle(title=ref['title'],year=ref['year'],volume=ref['volume'],
					pages=ref['pages'],issue=ref['issue'],fullparse=False) and cr.doi):
				# try to find by title, if found (true):
				if (printyn): print cr
				outname=quotefileDOI(cr.doi)+".pdf"
				if (not os.path.exists(outname)):
					if (self.getbidpdf(bid,filename=outname,printyn=printyn)):
						print "Have Found PDF file: "+outname
				else:
					print "Exist file:"+outname
			else:
				if (self.getbidpdf(bid,filename=bid+".pdf",printyn=printyn)):
					print "Have Found PDF file: "+bid+".pdf"
		else:
			if (self.getbidpdf(bid,filename=bid+".pdf",printyn=printyn)):
				print "Have Found PDF file: "+bid+".pdf"
Esempio n. 2
0
    def finddoi(self, num, prefix='', issn=''):
        title = self.gettitle(num)
        doi = DOI(self.getdoi(num))
        if (not prefix):
            prefix = doi.split('/', 1)[0] if doi else ""
        volume = self.getvolume(num)
        journal = self.getjournalfull(num)
        year = self.getyear(num)
        pages = self.getpages(num)
        self.cr = CRrecord()
        try:
            # The origin doi maybe true. Find in crossref
            if (doi and self.cr.getfromdoi(doi, fullparse=False)
                    and self.cr.doi):
                # Further check title
                if (strdiff(doi,self.cr.doi)>=0.85 and \
                strsimilarity(normalizeString(title),normalizeString(self.cr.title))>0.75):
                    return doi
                if (volume and pages):
                    ops = pages.split('-')
                    crps = self.cr.pages.split('-')
                    if (len(ops) > 0 and len(crps) > 0 and ops[0] == crps[0]
                            and volume == self.cr.volume):
                        return doi
                if (year and pages):
                    ops = pages.split('-')
                    crps = self.cr.pages.split('-')
                    if (len(ops) > 0 and len(crps) > 0 and ops[0] == crps[0]
                            and year == self.cr.year):
                        return doi
                print "Origin DOI:", doi, "may be true but record strange..Try title"

            keyword = title + " " + journal + " " + year + " " + pages + " " + volume
            if (self.cr.getfromtitledoi(keyword,
                                        doi,
                                        year=year,
                                        limit=10,
                                        fullparse=False,
                                        prefix=prefix)):
                if (doi):
                    if (prefix == self.cr.doi.split('/')[0]
                            and strdiff(doi, self.cr.doi) >= 0.85):
                        return self.cr.doi
                    else:
                        print "Error for origin doi: " + doi + "; found: " + self.cr.doi
                        return ""
                return self.cr.doi
            if (doi):
                if (strdiff(doi, self.cr.doi) >= 0.85):
                    return self.cr.doi
                else:
                    print "Error2 for origin doi: " + doi + "; found: " + self.cr.doi
                    return ""
            else:
                return ""
        except Exception as e:
            print "Error when find doi..", e, "\nRetry..."
            return self.finddoi(num, prefix=prefix, issn=issn)
Esempio n. 3
0
	def findcrossreftitledoi(self,doi,printyn=True):
		'''Find doi by crossref first'''
		cr=CRrecord()
		if( cr.getfromdoi(doi,fullparse=False) and cr.doi):
			keyword=(cr.title+" "+cr.doi).encode('utf-8')
			print "#########################################################################"
			print "## Now finding for doi with title: "+ keyword.encode('utf-8')+"............"
			sys.stdout.flush()
			self.grepBingAcadPDF(keyword=keyword,maxpage=1,printyn=printyn)
		else:
			print "Error DOI!: "+doi
		cr.reset()
Esempio n. 4
0
	def getdoi(self,num=0):
		'''Get DOI from Baidu Cite.

		Parses the 'txt' citation of item `num` (self.getcite). The DOI is
		taken from the <doi> tag when present; otherwise, when a
		<primarytitle> tag exists, the title is looked up in crossref.
		Anything before the first "10." is stripped.

		Returns a DOI object; DOI("") when no text containing "10." is found.
		'''
		soup=BeautifulSoup(self.getcite(num,citetype='txt'),"html.parser")
		if (soup.doi):
			doi=soup.doi.text
		elif(soup.primarytitle):
			cr=CRrecord()
			cr.getfromtitle(soup.primarytitle.info.text,ignorecheminfo=True)
			doi=cr.doi
		else:
			doi=""
		# str.find returns -1 when "10." is missing; slicing with -1 would
		# keep the last character instead of yielding an empty DOI.
		start=doi.find('10.') if doi else -1
		if start < 0:
			return DOI("")
		return DOI(doi[start:])
Esempio n. 5
0
	def findcrossreftitledoi(self,doi,printyn=True):
		'''Find doi by crossref first'''
		cr=CRrecord()
		if( cr.getfromdoi(doi,fullparse=False) and cr.doi):
			keyword=cr.title+" "+cr.doi
			print "#########################################################################"
			print "## Now finding for doi with title: "+ keyword.encode('utf-8')+"............"
			sys.stdout.flush()
			self.search(keyword=keyword)
			self.getallpdf()
		else:
			print "Error DOI!: "+doi
		cr.reset()
Esempio n. 6
0
	def findPDFbyISSN(self,issn,maxresult=None, step=100, offset=0):
		'''Find PDF by ISSN based on search result from crossref'''
		# may be improve to not only issn..
		if (not issn):return
		needurl="http://api.crossref.org/journals/"+issn+"/works"
		cr=CRrecord()
		total=cr.gettotalresultfromlink(needurl)
		if (not maxresult or maxresult <=0 or maxresult>total): 
			maxresult=total
		params={"rows":str(step)}
		maxround=(maxresult-offset)/step+1
		offsetcount=offset
		for i in range(maxround):
			params["offset"]=str(step*i+offset)
			r=requests.get(needurl,params,timeout=timeout_setting_download)
			if (r.status_code is 200):
				for j in r.json()['message']['items']:
					keyword=j.get('title',[''])[0]+" "+j.get("DOI","")
					print "#####################################",offsetcount,"####################################"
					print "## Now finding for doi with title: "+ keyword.encode('utf-8')+"............"
					sys.stdout.flush()
					bingacad.grepBingAcadPDF(keyword.encode('utf-8'))
					offsetcount+=1
			gc.collect()
Esempio n. 7
0
	def findPDFbyISSN(self,issn,maxresult=None, step=100, offset=0, 
		usedoi=True,doifilter=None,onlinecheck=True,savestate=None,proxy=None,usebdcheck=True):
		'''Find PDF by ISSN (or DOI prefix) based on search result from crossref.

		Pages through the crossref works list of a journal (`issn` shaped
		like "1234-5678") or of a publisher prefix (`issn` containing "10."),
		and for every item not already known to BDCheck runs self.search()
		followed by self.getallpdf().

		issn        -- ISSN or DOI prefix; nothing is done when empty. Any
		               other shape prints an error and calls sys.exit(1).
		maxresult   -- upper bound used to compute the number of request
		               rounds; clamped to the crossref total
		step        -- rows requested per round
		offset      -- index of the first result to request
		usedoi      -- append the DOI to the title in the search keyword
		doifilter, onlinecheck, savestate -- forwarded to self.getallpdf()
		proxy       -- forwarded to self.search()
		usebdcheck  -- skip DOIs reported by BDCheck.filterdois(); also
		               forwarded to self.getallpdf()
		Returns None.
		'''
		# may be improve to not only issn..
		if (not issn):return
		if (len(issn)==9 and issn[4]=='-'):
			needurl="http://api.crossref.org/journals/"+issn+"/works"
		elif('10.' in issn):
			needurl="http://api.crossref.org/prefixes/"+issn+"/works"
		else:
			print "Error ISSN/prefix"
			sys.exit(1)
		cr=CRrecord()
		total=cr.gettotalresultfromlink(needurl)
		if (not maxresult or maxresult <=0 or maxresult>total):
			maxresult=total
		params={"rows":str(step)}
		# Explicit floor division: the round count must be an integer.
		maxround=(maxresult-offset)//step+1
		offsetcount=offset
		bdcheck=BDCheck()

		for i in range(maxround):
			params["offset"]=str(step*i+offset)
			r=requests.get(needurl,params,timeout=timeout_setting_download)
			# Compare by value; "is" only worked through int caching.
			if (r.status_code == 200):
				# Get all check/in oapdf
				if usebdcheck:
					bdcheckall=bdcheck.filterdois(r.json(),oapdf=1,crjson=True)

				# NOTE(review): the response is parsed a second time so the
				# loop does not depend on what filterdois does with its
				# argument - confirm before sharing one parsed object.
				for j in r.json().get('message',{}).get('items',[]):
					keyword=j.get('title',[''])
					doi=DOI(j.get("DOI",""))
					if not doi:
						offsetcount+=1
						time.sleep(2)
						continue

					# Check whether in bdcheck
					# (bdcheckall is only read when usebdcheck is true)
					if (usebdcheck and doi in bdcheckall):
						print doi, 'has search/oapdf/free by bdcheck'
						offsetcount+=1
						time.sleep(1)
						continue

					# If not in bdcheck, check oapdf/free and set it
					# TODO: remove it after combine oapdf information to library
					oapdffree=bdcheck.setbycheck(doi)
					if (oapdffree[0] or oapdffree[1]):
						print doi,'exist in oapdf/free library..'
						offsetcount+=1
						time.sleep(1)
						continue

					# Items without any title cannot be searched for.
					if (keyword):
						keyword=keyword[0]
					else:
						time.sleep(2)
						offsetcount+=1
						continue
					if usedoi:keyword+=" "+doi
					print "#####################################",offsetcount,"####################################"
					print "## Now finding for doi with title:"+doi+" "+ keyword.encode('utf-8')+"............"
					sys.stdout.flush()
					self.search(keyword.encode('utf-8'),proxy=proxy)
					self.getallpdf(doifilter,onlinecheck=onlinecheck,savestate=savestate,usebdcheck=usebdcheck)
					bdcheck.set(doi)
					offsetcount+=1
			gc.collect()
		print "End of process for",issn
Esempio n. 8
0
	def renamecheck(self,fname,wtitle=0.65,cutoff=0.85,justcheck=False,resetfile=True,fdoi=None,excludedoi=None, fobj=None):
		'''A complex function to get doi from file name, 
		check in crossref, check in pdf file, rename it!
		just check can cancel move file'''
		### Result back:
		# 0: Done 
		# 1: High
		# 2: Unsure
		# 3: Untitle
		# 4: Fail
		# 5: Page0
		# 6: ErrorDOI
		# 10: Unknow


		if (resetfile and isinstance(fobj,(file,StringIO))):
			self.reset(fname="",fobj=fobj)
			fname="None"

		# len(self.doi) is 1 and len(self.doi - excludedoi) is 1 : 
		# :: First Run and perform check
		# len(self.doi) is 1 or len(self.doi - excludedoi) is 1 :
		if (not fname and not fdoi):
			print "No given file name or doi! (Return 6)"
			return 6

		if (fname and not fdoi and excludedoi):
			print "What do you want?! No excludedoi set by user! (Return 9)"
			return 9

		
		if (resetfile and fname !="None"): 
			self.reset(fname)
		elif(resetfile and not isinstance(fobj,(file,StringIO))):
			print "Use reset file but no file name/object is given!"
			return 9

		if (self.maxpage == 0):
			if not justcheck: 
				self.moveresult(5, printstr="Error Page 0 (Page0, R5): "+self._fname)
			return 5

		if (not excludedoi):
			excludedoi=set()

		if (not fdoi):
			#File obj is ""
			fdoi=DOI(os.path.splitext(os.path.basename(self._fname))[0])
		else:
			fdoi=DOI(fdoi)

		recursive= (len(excludedoi) > 0)
		# If in recursive, don't move file!
		if recursive: justcheck=True

		if resetfile and not recursive:
			self.realdoi=fdoi

		# Only find DOI in first time!
		if (not recursive and fdoi): 
			self.finddoi(1)
		elif (not recursive and not fdoi):
			self.finddoi(set([1,2,self.maxpage]))

		# file doi is shit..Recursively use doi in file or fail
		if (not fdoi and not recursive):
			if (len(self.doi) is 1 or len(self.doi) is 2):
				print "Origin fdoi wrong but has 1~2 dois in file:",self._fname,
				return self.recursivedoicheck(excludedoi,olddoi=fdoi,wtitle=wtitle,cutoff=cutoff,justcheck=justcheck)
			# No doi or >2 dois in file
			else:
				if not justcheck: 
					self.moveresult(4,printstr="Error fdoi and 0/too much doi. (Fail): "+self._fname)
				return 4
		elif (not fdoi and recursive):
			print "doi (in recursion) may wrong with error doi. Should never happen.."
			return 4 # Fail

		# fdoi is ok
		cr=CRrecord()
		try:
			cr=cr.valid_doi(fdoi,fullparse=True)
		except requests.exceptions.RequestException as e:
			print e
			cr=None
		except Exception as e:
			print e
			cr=None		

		# Error when year=None, improve in crrecord.
		#if (cr and not cr.year):
		#	cr.year='8888'	

		#crossref is ok
		if (fdoi and cr):
			totalpagenumber=1
			try:
				totalpagenumber=self.totalpages(cr.pages)
			except ValueError as e:
				# should never happen now
				print e, cr.pages

			totalpagewrong=False
			#print "pages:",self.maxpage,' in crossref:',cr.pages,totalpagenumber
			if totalpagenumber>0 and not (self.maxpage >= totalpagenumber and self.maxpage <= totalpagenumber+2):
				totalpagewrong=True
				# When paper with supporting information
				if (self.maxpage > totalpagenumber+2):
					self.finddoi(page=2)
					if (self.withSI or (self.findtext('Supporting Information', page=[totalpagenumber+1,totalpagenumber+2])
						and self.findtext(cr.title, similarity=0.75, page=[totalpagenumber+1,totalpagenumber+2]))):
						if not recursive : self.finddoi(totalpagenumber);
						self.withSI=True
						totalpagewrong=False
					# For NIH Public Access
					elif (self.hascontent("NIH Public Access")[0]):
						totalpagewrong=False
					#Such as some Nature with SI in paper without notify.
					elif (self.withSI or (totalpagenumber>1 and self.findtext("acknowledgment", page=[totalpagenumber-1, totalpagenumber]) 
						and self.findtext("reference", page=[totalpagenumber-1, totalpagenumber]))):
						self.withSI=True
						totalpagewrong=False

			# Recursive but total page wrong. Fast end recursivedoicheck
			if (totalpagewrong and recursive):	
				return 4

			# Just check first page, not find(find before..), faster:
			doivalid=self.checkdoi(fdoi,page=1,iterfind=False,justcheck=True)
			titleeval=self.checktitle(cr.title)
			if (totalpagenumber > 0 and not totalpagewrong):
				if (doivalid and titleeval[0] and len(self.doi) is 1):
					# Yes! Very Good PDF!
					self.realdoi=fdoi
					if not justcheck: 
						if (self.maxpage>=2 and self.maxpage == totalpagenumber and
							not self.findtext('Supporting Information', page=[1])):
							self.moveresult(0,good=True)
						else:
							self.moveresult(0)
					return 0

			# Further check doi in page2/last, Finally, will check 1,2 and last pages.
			if (recursive):
				doivalid= ( self.checkdoi(fdoi,page=2,iterfind=True,justcheck=True) or doivalid )
			else:
				doivalid= ( self.checkdoi(fdoi,page=2,iterfind=True) or doivalid )

			if len(self.doi)>3:
				# Too much doi may be some abstract
				self.moveresult(2,printstr='Has more than 3 dois! (Unsure):'+self._fname)
				return 2

			# Page wrong and try recursive use doi
			if (totalpagewrong):
				if (len(self.doi) is 1 or len(self.doi) is 2):
					doi=DOI(list(self.doi)[0])
					# DOI in file is same so error. Don't need recursive
					if (len(self.doi) is 1 and doi == fdoi):
						if not justcheck: 
							self.moveresult(4,printstr="PDF Page "+str(self.maxpage)+"!="+str(totalpagenumber)+"(Fail): "+self._fname)
						return 4

					print 'Wrong total page with dois in file,',self._fname,fdoi,',try recursive'
					return self.recursivedoicheck(excludedoi,olddoi=fdoi,wtitle=wtitle,cutoff=cutoff,justcheck=justcheck)
				else:
					if not justcheck: 
						self.moveresult(4,printstr="PDF Page "+str(self.maxpage)+"!="+str(totalpagenumber)+"(Fail): "+self._fname)
					return 4

			if (not totalpagewrong):
				crscore=self.scorefitting(cr)
				if (self.maxpage <= totalpagenumber+2): 
					# Maybe check when maxpage >total+2
					titleeval=self.checktitle(cr.title)
				if cr.title.strip()=="":
					titleeval=(False,0.9)
				titlevalid=titleeval[0]
				try:
					paperyear=int(cr.year)
				except:
					paperyear=9999
				try:
					# Too old maybe lost information
					if (paperyear>1990):
						titlevalid=titlevalid or (titleeval[1]*wtitle+crscore['total'])>=cutoff
					else:
						titlevalid=titlevalid or (titleeval[1]*wtitle+crscore['total'])>=cutoff-0.1
				#(self.checktitle(cr.title,similarity=0.85) and self.checkcrossref(cr))
				except Exception as e:
					print e

				if (doivalid):
					if (titlevalid):					
						# Yes! Good PDF!
						self.realdoi=fdoi
						if not justcheck: 
							if (self.maxpage>=2 and self.maxpage == totalpagenumber 
								and len(self.doi) is 1 and
								not self.findtext('Supporting Information', page=[1])):
								self.moveresult(0,good=True)
							else:
								self.moveresult(0)
						return 0

					print "Title/Paper score:",titleeval[1],crscore,self._fname
					if (len(self.doi - set([fdoi])) == 1 and not recursive):
						
						# Try one more
						newresult = self.recursivedoicheck(excludedoi,olddoi=fdoi,wtitle=wtitle,cutoff=cutoff,justcheck=True)
						if (newresult is 0):
							newdoi=DOI(list(self.doi - set([fdoi]))[0])
							self.realdoi=newdoi
							print 
							if not justcheck: self.moveresult(0, 
								printstr="(Rename)fdoi ok, but not title. In file doi "+newdoi+" is better for "+self._fname,
								newfname=newdoi.quote()+".pdf")
							return 0

					# Else DOI ok but not title
					if not justcheck: 
						self.moveresult(3,printstr="OK fdoi but not title(Untitle): "+self._fname)
					return 3
				
				# Indeed, doi maybe in pdf, but strange format..
				if (self.checkdoinormaltxt(fdoi)):
					if (titlevalid):
						# Further check only when title OK
						if (self.checkdoifurther(fdoi)):
							# Fine! move to Done dir
							if not justcheck: 
								if (self.maxpage>=2 and self.maxpage == totalpagenumber 
								and len(self.doi) is 1 and
								not self.findtext('Supporting Information', page=[1])):
									self.moveresult(0,good=True)
								else:
									self.moveresult(0)
							return 0
						else:
							# Can't find, but high similar! move to High dir
							if not justcheck: 
								self.moveresult(1,printstr="OK title and nospacebreak doi,but not pass(High): "+self._fname)
							return 1
					else:
						# DOI ok but not title
						print "Title/Paper score:",titleeval[1],crscore,self._fname
						if not justcheck: 
							self.moveresult(3,printstr="Maybe OK fdoi but not title(Untitle): "+self._fname)
						return 3						

				# DOI maybe not exist ....
				if (titlevalid):
					tmpdois=set(self.doi)
					for d in tmpdois:
						dd=DOI(d)
						if ( not dd.valid_doiorg(geturl=False) ):
							self.doi.remove(d)
							
					# Old paper don't have doi...
					if len(self.doi) is 0 and totalpagenumber>0:
						if (crscore['total'] >= 0.4):
							if not justcheck: 
								if (self.maxpage>=2 and self.maxpage == totalpagenumber and
								not self.findtext('Supporting Information', page=[1])):
									self.moveresult(0,good=True)
								else:
									self.moveresult(0)
							return 0
						elif (titleeval[1]>=0.85 and crscore['total'] >= 0.35):
							if not justcheck: 
								if (self.maxpage>=2 and self.maxpage == totalpagenumber and
								not self.findtext('Supporting Information', page=[1])):
									self.moveresult(0,good=True)
								else:
									self.moveresult(0)								
							return 0
						elif (titleeval[1]>=0.95 and crscore['total'] >=0.3):
							if not justcheck: 
								if (self.maxpage>=2 and self.maxpage == totalpagenumber and
								not self.findtext('Supporting Information', page=[1])):
									self.moveresult(0,good=True)
								else:
									self.moveresult(0)
							return 0
						elif (titleeval[1]>=0.90 and crscore['pages']>=0.9 and crscore['year']>=0.9 and (crscore['journal']>=0.9 or crscore['issn']>=0.9)):
							if not justcheck: self.moveresult(0)
							return 0
						elif (titleeval[1]>=0.90 and crscore['pages']>=0.5 and crscore['year']>=0.9 and (crscore['journal']>=0.9 or crscore['issn']>=0.9) and crscore['authors']>=0.7):
							if not justcheck: self.moveresult(0)
							return 0
						elif (titleeval[1]>=0.75 or crscore['total'] >=0.25):
							print "Title/Paper score:",titleeval[1],crscore,self._fname
							if not justcheck: 
								self.moveresult(1,printstr="OK title and high info fit. But no doi(Highly): "+self._fname)
								return 1
						else:
							print "Title/Paper score:",titleeval[1],crscore,self._fname
							if not justcheck: 
								self.moveresult(2,printstr="OK title and ok info fit. But no doi(Unsure): "+self._fname)
							return 2
					elif len(self.doi) is 0 and totalpagenumber== -1:
						if (titleeval[1]>=0.90 and crscore['pages']>=0.5 and crscore['year']>=0.9 and (crscore['journal']>=0.9 or crscore['issn']>=0.9) and crscore['authors']>=0.7):
							if not justcheck: self.moveresult(0)
							return 0
						else:
							print "Title/Paper score:",titleeval[1],crscore,self._fname
							if not justcheck: 
								self.moveresult(2,printstr="OK title and high info fit. But no doi and no total pages(Unsure): "+self._fname)
							return 2												
					elif len(self.doi) is 0 and totalpagenumber<=0:
						print "Title/Paper score:",titleeval[1],crscore,self._fname
						if not justcheck: 
							self.moveresult(2,printstr="OK title and high info fit. But no doi and no total pages(Unsure): "+self._fname)
						return 2
					elif ( len(self.doi) > 0 and not recursive):
						print "Good title but file doesn't contain fdoi, however it has >0 doi in file. "
						outnow=self.recursivedoicheck(excludedoi,olddoi=fdoi,wtitle=wtitle,cutoff=cutoff,justcheck=True)
						if outnow > 0:
							if not justcheck:
								self.moveresult(2,printstr="OK title but not fdoi. In file doi is not good(Unsure): "+self._fname)
							return 2
						elif(outnow==0):
							print 'Good Title but Fail fdoi. Paper has good in file doi,',self._fname,',try recursive'
							return self.recursivedoicheck(excludedoi,olddoi=fdoi,wtitle=wtitle,cutoff=cutoff,justcheck=justcheck)

					### Old method check old items:
					#if (self.checkcrossref(cr)):
					#	if (int(cr.year)<=1999 and len(self.doi) is 0):
					#		# Highly possible right
					#		if not justcheck: self.movetodir("High")
					#		return True
					#	  Bentham, often blank doi
					#	elif (fdoi[:8] == '10.2174/' and len(self.doi) is 0):
					#		if not justcheck: self.movetodir("Done")
					#		return True
					#	elif (len(self.doi) is 0):
					#		print "Title/Paper score:",titleeval[1],crscore,self._fname
					#		if not justcheck: 
					#			self.moveresult(1,printstr="OK title and high info fit. But no doi(Highly): "+self._fname)
					#		return 1							
					#	else:
					#		if not justcheck: 
					#			self.moveresult(2,printstr="OK title and high info fit. But doi exist not fit(Unsure): "+self._fname)
					#		return 2								
					#elif(len(self.doi) is 0):
					#	# Maybe wrong file and no doi
					#	if not justcheck: 
					#		self.moveresult(2,printstr="Not found doi in file but ok title (Unsure): "+self._fname)
					#	return 2

			#fdoi,title wrong, no doi in file
			# Or in recursive mode
			if (len(self.doi) is 0 or recursive):
				if not justcheck: 
					self.moveresult(4,printstr="Both fdoi and title wrong, no doi in file(Fail): "+self._fname)
				return 4

			# Indeed, file has only one more doi, not the same to fname
			if (len(self.doi - set([fdoi])) is 1 ):
				print 'Fail fdoi/title. Paper with one more doi in file,',self._fname,',try recursive'
				return self.recursivedoicheck(excludedoi,olddoi=fdoi,wtitle=wtitle,cutoff=cutoff,justcheck=justcheck)
			elif(len(self.doi) > 1):
				if not justcheck: 
					self.moveresult(4,printstr="fdoi/title fail. Too much infile doi(Fail): "+self._fname)
				return 4
			else:
				if not justcheck: 
					self.moveresult(4,printstr="What????? What?????(Fail):"+self._fname)
				return 4
		# not cr
		else:
			if (not recursive):
				self.finddoi(set([1,2,self.maxpage]))
				if (len(self.doi) is 1 or len(self.doi) is 2):
					print 'Error DOI filename,',self._fname,',try recursive'
					return self.recursivedoicheck(excludedoi,olddoi=fdoi,wtitle=wtitle,cutoff=cutoff,justcheck=justcheck)				
			if not justcheck: 
				self.moveresult(6,"Error DOI fname(Fail):"+self._fname)
			return 6