Beispiel #1
0
	def findPDFbyISSN(self,issn,maxresult=None, step=100, offset=0, 
		usedoi=True,doifilter=None,onlinecheck=True,savestate=None,proxy=None,usebdcheck=True):
		'''Find PDF by ISSN based on search result from crossref.

		Pages through the crossref "works" listing of a journal (or DOI
		prefix) and, for every item that has both a DOI and a title and is
		not already recorded, runs self.search() on the title and then
		self.getallpdf() on the search result.

		issn        -- either an ISSN of the form "XXXX-XXXX" (9 chars, '-' at
		               index 4) or a string containing "10." which is used as
		               a crossref prefix.
		maxresult   -- upper bound used to compute how many pages to request;
		               None, <=0 or a value above the crossref total falls
		               back to the total.
		step        -- number of rows requested per page.
		offset      -- offset of the first requested row.
		usedoi      -- when true, the DOI is appended to the search keyword.
		doifilter, onlinecheck, savestate -- forwarded to self.getallpdf().
		proxy       -- forwarded to self.search().
		usebdcheck  -- when true, each page is pre-filtered with
		               BDCheck.filterdois(); also forwarded to getallpdf().

		Returns None. Calls sys.exit(1) when issn matches neither format.
		'''
		# may be improve to not only issn..
		if (not issn):return
		if (len(issn)==9 and issn[4]=='-'):
			needurl="http://api.crossref.org/journals/"+issn+"/works"
		elif('10.' in issn):
			needurl="http://api.crossref.org/prefixes/"+issn+"/works"
		else:
			print "Error ISSN/prefix"
			sys.exit(1)
		cr=CRrecord()
		total=cr.gettotalresultfromlink(needurl)
		if (not maxresult or maxresult <=0 or maxresult>total): 
			maxresult=total
		params={"rows":str(step)}
		# Explicit floor division: the page count must stay an integer for range().
		maxround=(maxresult-offset)//step+1
		# Running item counter, only used for the progress banner below.
		offsetcount=offset
		bdcheck=BDCheck()

		for i in range(maxround):
			params["offset"]=str(step*i+offset)
			r=requests.get(needurl,params,timeout=timeout_setting_download)
			# Compare by value; "is" only worked through CPython small-int caching.
			if (r.status_code == 200):
				# Get all check/in oapdf 
				# NOTE(review): r.json() is deliberately called separately here and in
				# the loop below, so filterdois() gets its own parsed copy in case it
				# modifies its argument -- confirm against BDCheck before merging them.
				if usebdcheck: 
					bdcheckall=bdcheck.filterdois(r.json(),oapdf=1,crjson=True)

				for j in r.json().get('message',{}).get('items',[]):
					keyword=j.get('title',[''])
					doi=DOI(j.get("DOI",""))
					if not doi:
						offsetcount+=1
						time.sleep(2)
						continue

					# Check whether in bdcheck
					# (bdcheckall only exists when usebdcheck is true; the "and" guards it)
					if (usebdcheck and doi in bdcheckall):
						print doi, 'has search/oapdf/free by bdcheck'
						offsetcount+=1
						time.sleep(1)
						continue
						
					# If not in bdcheck, check oapdf/free and set it
					# TODO: remove it after combine oapdf information to library
					oapdffree=bdcheck.setbycheck(doi)
					if (oapdffree[0] or oapdffree[1]):
						print doi,'exist in oapdf/free library..'
						offsetcount+=1
						time.sleep(1)
						continue						

					# 'title' is a list in the crossref item; use its first entry,
					# and skip items whose title list is empty.
					if (keyword): 
						keyword=keyword[0]
					else:
						time.sleep(2)
						offsetcount+=1
						continue
					if usedoi:keyword+=" "+doi
					print "#####################################",offsetcount,"####################################"
					print "## Now finding for doi with title:"+doi+" "+ keyword.encode('utf-8')+"............"
					sys.stdout.flush()
					self.search(keyword.encode('utf-8'),proxy=proxy)
					# The returned file list is not needed here; getallpdf saves the files itself.
					self.getallpdf(doifilter,onlinecheck=onlinecheck,savestate=savestate,usebdcheck=usebdcheck)
					# Record the DOI as searched.
					bdcheck.set(doi)
					offsetcount+=1
			gc.collect()
		print "End of process for",issn
Beispiel #2
0
	def getallpdf(self,doifilter=None,onlinecheck=True,savestate=None,usebdcheck=True):
		'''Get All pdf from link
		doifilter should be a function, return True when DOI ok'''
		usedoifilter=callable(doifilter)
		getallfilelist=[]
		if isinstance(savestate,(list,tuple,set)):
			savestate=set(savestate)
		elif (isinstance(savestate,int)):
			savestate=set([savestate])
		else:
			savestate=set([0,1,2,3])
		bdcheck=BDCheck()
		for i in range(len(self.items)):
			try:
				getfilelist=[]
				# Get PDF links
				links=self.getpdflink(i)
				if (links):
					doi=DOI(self.getdoi(i))
					if not doi:
						print "blank doi..",doi
						continue
					if ( usedoifilter and not doifilter(doi)):
						print doi,'Not fit filter..'
						continue
						
					# Check by bdcheck api
					if (usebdcheck):
						bdout=bdcheck.get(doi)
						if sum(bdout)>0:
							print doi, 'has search/oapdf/free',bdout
							continue
					oapdffree=bdcheck.setbycheck(doi)
					if (oapdffree[0] and oapdffree[1]):
						print doi,'exist in oapdf/free library..'
						continue						
					elif oapdffree[0]:
						print doi,'exist in oapdf library..'
						continue				
					elif oapdffree[1]:
						print doi,'exist in free library..'
						continue
					doifname=doi.quote()+".pdf"
					if (pdfexistpath(doifname)):
						print doi,'Files exist in current folder..'
						continue

					# Start to find pdf at each link
					print "### Find for result with DOI: "+doi
					foundDonePDF=False
					for link in links:
						print 'Link:',str(link),
						if (onlinecheck):
							print "Try Getting..",
							# Get a StringIO obj
							getpdfobj=getwebpdf(link,fname=doifname,params=getwebpdfparams(link),stringio=True)
							if (not getpdfobj):
								continue
							try:
								dpfresult=self.pdfcheck.checkonlinepdf(fobj=getpdfobj,doi=doi)
								sys.stdout.flush()
								if (dpfresult!=0):
									if ( savestate and (dpfresult in savestate)):
										#Important to set fname to None
										rmresult=self.pdfcheck.removegarbage(fname=None,notdelete=True)
										if (rmresult <= 1):
											getfilelist.append( (getpdfobj,self.pdfcheck.realdoi,dpfresult))
									else:
										print "Not OK PDF for doi",doi												
								else:
									foundDonePDF=True
									if (self.pdfcheck.savefobj2file(doi=self.pdfcheck.realdoi,state=0,fobj=getpdfobj)):
										print "!!!!!!! Get PDF file to Done!: "+self.pdfcheck.realdoi
										del getfilelist[:]	
										nowdoi=DOI(self.pdfcheck.realdoi)
										getallfilelist.append('Done/'+nowdoi.quote()+'.pdf')

										break
									else:
										print "What? should never happen for pdfdoicheck.savefobj2file Done.."
							except Exception as e:
								print e,'Error at baidu getallpdf(web) when doing pdfcheck',doi,link

						# Now should not use this method
						elif (getwebpdf(link,fname=doifname,params=getwebpdfparams(link))):
							print "Please don't use download pdf to disk, use check online!"
							print "Try Getting..",
							try:
								dpfresult=self.pdfcheck.renamecheck(doifname)
								sys.stdout.flush()
								if (dpfresult!=0): 
									if ( savestate and (dpfresult in savestate)):
										#Important to set fname to None		
										rmresult=self.pdfcheck.removegarbage(fname=None)
										if (rmresult <= 1):
											if (os.path.exists(self.pdfcheck._fname)):
												getfilelist.append((self.pdfcheck._fname, dpfresult))
											else:
												print "What? should never happen for pdfdoicheck.moveresult Not Done.."
										else:
											print "Has been removed.."
									else:
										if (os.path.exists(self.pdfcheck._fname)) : 
											os.remove(self.pdfcheck._fname)
								else:
									foundDonePDF=True
									if (os.path.exists(self.pdfcheck._fname)):
										print "!!!!!!! Get PDF file to Done!: "+doifname
										getfilelist.append(self.pdfcheck._fname)
										#time.sleep(random.randint(1,5))								
										break
									else:
										print "What? should never happen for pdfdoicheck.moveresult Done.."
							except Exception as e:
								if os.path.exists(doifname):
									if (not os.path.exists('tmpfail/'+doifname)):
										os.renames(doifname,'tmpfail/'+doifname)
									else:
										os.remove(doifname)
								print e,'Error at baidu getallpdf when doing pdfcheck'
						else:
							print "can't get at this link"

					bdcheck.set(doi)
					# Online Check but not Done
					if onlinecheck and not foundDonePDF and len(getfilelist)>0:
						minnum=-1
						minresult=999999
						for i in range(len(getfilelist)):
							if getfilelist[i][2]<minresult:
								minnum=i
						nowdoi=DOI(getfilelist[minnum][1])
						if (self.pdfcheck.savefobj2file(doi=nowdoi,state=getfilelist[minnum][2],fobj=getfilelist[minnum][0])):
							print "!!!!!!! Get PDF file to: "+self.pdfcheck.judgedirs.get(getfilelist[minnum][2],'.'),self.pdfcheck.realdoi
							getallfilelist.append(self.pdfcheck.judgedirs.get(getfilelist[minnum][2],'.')+os.sep+nowdoi.quote()+".pdf")
							del getfilelist[:]
			except Exception as e:
				print e, "##### Error when get pdf.."
		return getallfilelist