Python PDFdoiCheck Examples

Programming Language: Python

Namespace/Package Name: pdfdoicheck

Class/Type: PDFdoiCheck

Examples at hotexamples.com: 2

Python PDFdoiCheck - 2 examples found. These are the top rated real world Python examples of pdfdoicheck.PDFdoiCheck extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

PDFdoiCheck(1)

checkonlinepdf(1)

removegarbage(1)

renamecheck(1)

reset(1)

savefobj2file(1)

Example #1

Show file

	def __init__(self):
		self.request=None
		self.soup=None
		self.items=[]
		#new add to check and remove not good result
		self.pdfcheck=PDFdoiCheck()

Example #2

Show file

class BaiduXueshu(object):
	host="http://xueshu.baidu.com"
	path="/s"
	url="http://xueshu.baidu.com/s"
	word="wd"
	citeurl="http://xueshu.baidu.com/u/citation"
	def __init__(self):
		self.request=None
		self.soup=None
		self.items=[]
		#new add to check and remove not good result
		self.pdfcheck=PDFdoiCheck()
	def reset(self):
		self.request=None
		del self.items[:]
		del self.soup; self.soup=None
		del self.request; self.request=None
		self.pdfcheck.reset('')
		
	def search(self,keyword,params={},headers={},proxy=None):
		self.reset()
		if (not keyword):return

		params[self.word]=keyword
		params['sc_hit']='1'#for find all, not exactly
		if proxy:
			if not isinstance(proxy,dict):
				proxy=None
		try:
			if (proxy):
				r=requests.get(self.url,params=params,headers=headers,proxies=proxy,timeout=timeout_setting)
			else:
				r=requests.get(self.url,params=params,headers=headers,timeout=timeout_setting)
			if r.status_code is 200:
				if ('<img src="http://verify.baidu.com/cgi-bin/genimg' in r.text):
					time.sleep(600)
					self.search(keyword,params=params,headers=headers)
				try:
					self.soup=BeautifulSoup(r.text, "html.parser")
					self.items=self.soup.findChildren('div',attrs={'class':'result sc_default_result xpath-log'})
				except Exception as e:
					print e,'when parsing searching result'
					return 
				#print "Find",len(self.items)," Results."
				#for item in items:
		except Exception as e:
			print "Error when searching word.."
			time.sleep(20)
			self.search(keyword=keyword,params=params,headers=headers,proxy=proxy)

	def _parsepdflink(self,link):
		'''Some pdf link in baidu format'''
		if (link):
			link=requests.utils.unquote(link)
		if (len(link)>4):
			if link[:2]=="/s":
				rer=re.search(r'(?<=url=)http.*?(?=\&ie;=utf-8)',link)
				if rer:
					link=rer.group()
					return link
			elif(link[:4] == 'http'):
				return link
			return ''
		return ""

	def getpdflink(self,num=0):
		pdfs=[ i.text for i in self.items[num].findChildren('p',attrs={'class':"saveurl"})] \
			+[ i['href'] for i in self.items[num].findChildren('a',attrs={'class':"sc_download c-icon-download-hover"})]
		pdfs=list(set([ adjustpdflink(self._parsepdflink(pdf)) for pdf in pdfs]))
		if '' in pdfs: pdfs.remove('')
		if (pdfs): print "Get",len(pdfs)," links for record ",num,":",#,str(pdfs)
		return pdfs

	def getcite(self,num=0,citetype="txt"):
		cite=self.items[num].findChild('a',attrs={'class':'sc_q c-icon-shape-hover'})
		try:
			params={'t':citetype,'url':cite['data-link'],'sign':cite['data-sign']}
			r=requests.get(self.citeurl,params=params,timeout=timeout_setting)
			if r.status_code is 200:
				return r.text
		except:
			print "Can't get citation"
		return ""

	def getdoi(self,num=0):
		'''Get DOI from Baidu Cite'''
		soup=BeautifulSoup(self.getcite(num,citetype='txt'),"html.parser")
		if (soup.doi): 
			doi=soup.doi.text
		elif(soup.primarytitle):
			cr=CRrecord()
			cr.getfromtitle(soup.primarytitle.info.text,ignorecheminfo=True)
			doi=cr.doi
		else:
			doi=DOI("")
		return DOI(doi[doi.find('10.'):])

	def getallpdf(self,doifilter=None,onlinecheck=True,savestate=None,usebdcheck=True):
		'''Get All pdf from link
		doifilter should be a function, return True when DOI ok'''
		usedoifilter=callable(doifilter)
		getallfilelist=[]
		if isinstance(savestate,(list,tuple,set)):
			savestate=set(savestate)
		elif (isinstance(savestate,int)):
			savestate=set([savestate])
		else:
			savestate=set([0,1,2,3])
		bdcheck=BDCheck()
		for i in range(len(self.items)):
			try:
				getfilelist=[]
				# Get PDF links
				links=self.getpdflink(i)
				if (links):
					doi=DOI(self.getdoi(i))
					if not doi:
						print "blank doi..",doi
						continue
					if ( usedoifilter and not doifilter(doi)):
						print doi,'Not fit filter..'
						continue
						
					# Check by bdcheck api
					if (usebdcheck):
						bdout=bdcheck.get(doi)
						if sum(bdout)>0:
							print doi, 'has search/oapdf/free',bdout
							continue
					oapdffree=bdcheck.setbycheck(doi)
					if (oapdffree[0] and oapdffree[1]):
						print doi,'exist in oapdf/free library..'
						continue						
					elif oapdffree[0]:
						print doi,'exist in oapdf library..'
						continue				
					elif oapdffree[1]:
						print doi,'exist in free library..'
						continue
					doifname=doi.quote()+".pdf"
					if (pdfexistpath(doifname)):
						print doi,'Files exist in current folder..'
						continue

					# Start to find pdf at each link
					print "### Find for result with DOI: "+doi
					foundDonePDF=False
					for link in links:
						print 'Link:',str(link),
						if (onlinecheck):
							print "Try Getting..",
							# Get a StringIO obj
							getpdfobj=getwebpdf(link,fname=doifname,params=getwebpdfparams(link),stringio=True)
							if (not getpdfobj):
								continue
							try:
								dpfresult=self.pdfcheck.checkonlinepdf(fobj=getpdfobj,doi=doi)
								sys.stdout.flush()
								if (dpfresult!=0):
									if ( savestate and (dpfresult in savestate)):
										#Important to set fname to None
										rmresult=self.pdfcheck.removegarbage(fname=None,notdelete=True)
										if (rmresult <= 1):
											getfilelist.append( (getpdfobj,self.pdfcheck.realdoi,dpfresult))
									else:
										print "Not OK PDF for doi",doi												
								else:
									foundDonePDF=True
									if (self.pdfcheck.savefobj2file(doi=self.pdfcheck.realdoi,state=0,fobj=getpdfobj)):
										print "!!!!!!! Get PDF file to Done!: "+self.pdfcheck.realdoi
										del getfilelist[:]	
										nowdoi=DOI(self.pdfcheck.realdoi)
										getallfilelist.append('Done/'+nowdoi.quote()+'.pdf')

										break
									else:
										print "What? should never happen for pdfdoicheck.savefobj2file Done.."
							except Exception as e:
								print e,'Error at baidu getallpdf(web) when doing pdfcheck',doi,link

						# Now should not use this method
						elif (getwebpdf(link,fname=doifname,params=getwebpdfparams(link))):
							print "Please don't use download pdf to disk, use check online!"
							print "Try Getting..",
							try:
								dpfresult=self.pdfcheck.renamecheck(doifname)
								sys.stdout.flush()
								if (dpfresult!=0): 
									if ( savestate and (dpfresult in savestate)):
										#Important to set fname to None		
										rmresult=self.pdfcheck.removegarbage(fname=None)
										if (rmresult <= 1):
											if (os.path.exists(self.pdfcheck._fname)):
												getfilelist.append((self.pdfcheck._fname, dpfresult))
											else:
												print "What? should never happen for pdfdoicheck.moveresult Not Done.."
										else:
											print "Has been removed.."
									else:
										if (os.path.exists(self.pdfcheck._fname)) : 
											os.remove(self.pdfcheck._fname)
								else:
									foundDonePDF=True
									if (os.path.exists(self.pdfcheck._fname)):
										print "!!!!!!! Get PDF file to Done!: "+doifname
										getfilelist.append(self.pdfcheck._fname)
										#time.sleep(random.randint(1,5))								
										break
									else:
										print "What? should never happen for pdfdoicheck.moveresult Done.."
							except Exception as e:
								if os.path.exists(doifname):
									if (not os.path.exists('tmpfail/'+doifname)):
										os.renames(doifname,'tmpfail/'+doifname)
									else:
										os.remove(doifname)
								print e,'Error at baidu getallpdf when doing pdfcheck'
						else:
							print "can't get at this link"

					bdcheck.set(doi)
					# Online Check but not Done
					if onlinecheck and not foundDonePDF and len(getfilelist)>0:
						minnum=-1
						minresult=999999
						for i in range(len(getfilelist)):
							if getfilelist[i][2]<minresult:
								minnum=i
						nowdoi=DOI(getfilelist[minnum][1])
						if (self.pdfcheck.savefobj2file(doi=nowdoi,state=getfilelist[minnum][2],fobj=getfilelist[minnum][0])):
							print "!!!!!!! Get PDF file to: "+self.pdfcheck.judgedirs.get(getfilelist[minnum][2],'.'),self.pdfcheck.realdoi
							getallfilelist.append(self.pdfcheck.judgedirs.get(getfilelist[minnum][2],'.')+os.sep+nowdoi.quote()+".pdf")
							del getfilelist[:]
			except Exception as e:
				print e, "##### Error when get pdf.."
		return getallfilelist

	def findwordPDF(self,keyword,doifilter=None):
		print "#########################################################################"
		print "## Now finding for: "+ keyword+"............"
		sys.stdout.flush()
		self.search(keyword=keyword)
		self.getallpdf(doifilter)		

	def findcrossreftitledoi(self,doi,printyn=True):
		'''Find doi by crossref first'''
		cr=CRrecord()
		if( cr.getfromdoi(doi,fullparse=False) and cr.doi):
			keyword=cr.title+" "+cr.doi
			print "#########################################################################"
			print "## Now finding for doi with title: "+ keyword.encode('utf-8')+"............"
			sys.stdout.flush()
			self.search(keyword=keyword)
			self.getallpdf()
		else:
			print "Error DOI!: "+doi
		cr.reset()

	def finddoiPDFfromFile(self,fname):
		'''Put doi in file and use it to find pdf'''
		fin=open(fname)
		countN=0
		for line in fin:
			ldoi=line.lower().strip()
			doi=DOI(ldoi)
			if (os.path.exists(doi.quote()+".pdf")):
				continue
			self.findcrossreftitledoi(ldoi)
			#time.sleep(random.randint(1,10))
			countN+=1
			if countN>=10:
				gc.collect()
				countN=0
		fin.close()			

	def findPDFbyISSN(self,issn,maxresult=None, step=100, offset=0, 
		usedoi=True,doifilter=None,onlinecheck=True,savestate=None,proxy=None,usebdcheck=True):
		'''Find PDF by ISSN based on search result from crossref'''
		# may be improve to not only issn..
		if (not issn):return
		if (len(issn)==9 and issn[4]=='-'):
			needurl="http://api.crossref.org/journals/"+issn+"/works"
		elif('10.' in issn):
			needurl="http://api.crossref.org/prefixes/"+issn+"/works"
		else:
			print "Error ISSN/prefix"
			sys.exit(1)
		cr=CRrecord()
		total=cr.gettotalresultfromlink(needurl)
		if (not maxresult or maxresult <=0 or maxresult>total): 
			maxresult=total
		params={"rows":str(step)}
		maxround=(maxresult-offset)/step+1
		offsetcount=offset
		bdcheck=BDCheck()

		for i in range(maxround):
			params["offset"]=str(step*i+offset)
			r=requests.get(needurl,params,timeout=timeout_setting_download)
			if (r.status_code is 200):
				# Get all check/in oapdf 
				if usebdcheck: 
					bdcheckall=bdcheck.filterdois(r.json(),oapdf=1,crjson=True)

				for j in r.json().get('message',{}).get('items',[]):
					keyword=j.get('title',[''])
					doi=DOI(j.get("DOI",""))
					if not doi:
						offsetcount+=1
						time.sleep(2)
						continue

					# Check whether in bdcheck
					if (usebdcheck and doi in bdcheckall):
						print doi, 'has search/oapdf/free by bdcheck'
						offsetcount+=1
						time.sleep(1)
						continue
						
					# If not in bdcheck, check oapdf/free and set it
					# TODO: remove it after combine oapdf information to library
					oapdffree=bdcheck.setbycheck(doi)
					if (oapdffree[0] or oapdffree[1]):
						print doi,'exist in oapdf/free library..'
						offsetcount+=1
						time.sleep(1)
						continue						

					if (keyword): 
						keyword=keyword[0]
					else:
						time.sleep(2)
						offsetcount+=1
						continue
					if usedoi:keyword+=" "+doi
					print "#####################################",offsetcount,"####################################"
					print "## Now finding for doi with title:"+doi+" "+ keyword.encode('utf-8')+"............"
					sys.stdout.flush()
					self.search(keyword.encode('utf-8'),proxy=proxy)
					bdresult=self.getallpdf(doifilter,onlinecheck=onlinecheck,savestate=savestate,usebdcheck=usebdcheck)
					bdcheck.set(doi)
					offsetcount+=1
			gc.collect()
		print "End of process for",issn