Example #1
0
	def recursivedoicheck(self,excludedoi,olddoi,wtitle=0.65,cutoff=0.85,justcheck=False):
		tryjudge=4
		trydoi=""
		rightdoi=[]
		excludedoi.add(olddoi)
		for doi in self.doi-excludedoi:
			print "Recursive check doi..",self._fname,doi,
			judgenum = self.renamecheck(self._fname,wtitle=wtitle,cutoff=cutoff,\
			justcheck=True,resetfile=False,excludedoi=excludedoi,fdoi=doi)
			excludedoi.add(doi)
			if (judgenum is 0):
				rightdoi.append(doi)
				tryjudge=0
			elif (judgenum<tryjudge):
				trydoi=doi
				tryjudge=judgenum
			# else, retain 4 and blank doi
		if (len(rightdoi) is 1):
			doi=DOI(rightdoi[0])
			self.realdoi=doi
			if not justcheck:
				self.moveresult(0,printstr=None,newfname=doi.quote()+".pdf")
			return 0
		elif (len(rightdoi) >= 2 ):
			if not justcheck:
				self.moveresult(3,printstr="Many DOIs are OK, can't distinguish...(Unsure)")
			return 3 # Unsure
		else:
			print "Doesn't have reliable doi", self._fname
			if not justcheck:
				self.moveresult(tryjudge,printstr=None)
			return tryjudge
Example #2
0
	def savefobj2file(self,fname="",doi="",state=None,fobj=None):
		'''Save the current file obj(file/StringIO) to a file
		And also set the self.fname'''
		if (not fname and not doi):
			print "File name or doi is not given!"
			return
		if (doi and not fname):
			doi=DOI(doi)
			fname=doi.quote()+'.pdf'
			
		if (state is not None):
			outdir=self.judgedirs.get(state,'.')
			if not os.path.exists(outdir):os.makedirs(outdir)
			fname=outdir+os.sep+fname	

		if not fobj: fobj=self.fobj		

		if (fname and fobj and not fobj.closed):
			fobj.seek(0)
			if (not os.path.exists(fname)):				
				f=open(fname,'wb')
				f.write(fobj.read())
				f.close()
				fobj.seek(0)
				self._fname=fname
				return True
			else:
				print "File has exist...."
				return False
Example #3
0
    def setbycheck(self, doi):
        '''Update the bdcheck/oapdf/free in library based on check oapdf/free
		Can't set the record to "bdcheck" state
		return the [oapdf,free]'''
        try:
            if (isinstance(doi, str)):
                doi = DOI(doi)
                if (doi):
                    oapdffree = doi.freedownload(outtuple=True)
                    if (oapdffree[0] and oapdffree[1]):
                        r = requests.get(self.url + "&doi=" + doi +
                                         "&update=True&oapdf=True&free=True",
                                         timeout=TIMEOUT_SETTING)
                    elif oapdffree[0]:
                        r = requests.get(self.url + "&doi=" + doi +
                                         "&update=True&oapdf=True",
                                         timeout=TIMEOUT_SETTING)
                    elif oapdffree[1]:
                        r = requests.get(self.url + "&doi=" + doi +
                                         "&update=True&free=True",
                                         timeout=TIMEOUT_SETTING)
                    return oapdffree
            return [False, False]
        except Exception as e:
            print e, "SF BDCheck SetByCheck Fail.."
            return [False, False]
Example #4
0
    def finddoi(self, num, prefix='', issn=''):
        title = self.gettitle(num)
        doi = DOI(self.getdoi(num))
        if (not prefix):
            prefix = doi.split('/', 1)[0] if doi else ""
        volume = self.getvolume(num)
        journal = self.getjournalfull(num)
        year = self.getyear(num)
        pages = self.getpages(num)
        self.cr = CRrecord()
        try:
            # The origin doi maybe true. Find in crossref
            if (doi and self.cr.getfromdoi(doi, fullparse=False)
                    and self.cr.doi):
                # Further check title
                if (strdiff(doi,self.cr.doi)>=0.85 and \
                strsimilarity(normalizeString(title),normalizeString(self.cr.title))>0.75):
                    return doi
                if (volume and pages):
                    ops = pages.split('-')
                    crps = self.cr.pages.split('-')
                    if (len(ops) > 0 and len(crps) > 0 and ops[0] == crps[0]
                            and volume == self.cr.volume):
                        return doi
                if (year and pages):
                    ops = pages.split('-')
                    crps = self.cr.pages.split('-')
                    if (len(ops) > 0 and len(crps) > 0 and ops[0] == crps[0]
                            and year == self.cr.year):
                        return doi
                print "Origin DOI:", doi, "may be true but record strange..Try title"

            keyword = title + " " + journal + " " + year + " " + pages + " " + volume
            if (self.cr.getfromtitledoi(keyword,
                                        doi,
                                        year=year,
                                        limit=10,
                                        fullparse=False,
                                        prefix=prefix)):
                if (doi):
                    if (prefix == self.cr.doi.split('/')[0]
                            and strdiff(doi, self.cr.doi) >= 0.85):
                        return self.cr.doi
                    else:
                        print "Error for origin doi: " + doi + "; found: " + self.cr.doi
                        return ""
                return self.cr.doi
            if (doi):
                if (strdiff(doi, self.cr.doi) >= 0.85):
                    return self.cr.doi
                else:
                    print "Error2 for origin doi: " + doi + "; found: " + self.cr.doi
                    return ""
            else:
                return ""
        except Exception as e:
            print "Error when find doi..", e, "\nRetry..."
            return self.finddoi(num, prefix=prefix, issn=issn)
Example #5
0
	def getdoi(self,num=0):
		'''Get DOI from Baidu Cite

		The DOI is read from the <doi> element of the 'txt' cite of item num.
		Without that element, the DOI a CRrecord finds for the cite's primary
		title is used. Anything in front of the first "10." is dropped.
		Returns a DOI object; it is built from "" when no DOI is available.'''
		soup=BeautifulSoup(self.getcite(num,citetype='txt'),"html.parser")
		if (soup.doi):
			doi=soup.doi.text
		elif(soup.primarytitle):
			cr=CRrecord()
			cr.getfromtitle(soup.primarytitle.info.text,ignorecheminfo=True)
			doi=cr.doi
		else:
			return DOI("")
		start=doi.find('10.')
		# find() gives -1 when there is no "10."; slicing with it would keep
		# the last character of the text and pass that off as a DOI
		if (start<0):
			return DOI("")
		return DOI(doi[start:])
Example #6
0
	def finddoi(self,num,prefix='',issn=''):
		title=self.gettitle(num)
		doi=DOI(self.getdoi(num))
		if (not prefix):
			prefix = doi.split('/',1)[0] if doi else ""
		volume= self.getvolume(num)
		journal=self.getjournalfull(num)
		year=self.getyear(num) 
		pages=self.getpages(num)
		self.cr=CRrecord()
		try:
			# The origin doi maybe true. Find in crossref
			if ( doi and self.cr.getfromdoi(doi,fullparse=False) and self.cr.doi):
				# Further check title
				if (strdiff(doi,self.cr.doi)>=0.85 and \
				strsimilarity(normalizeString(title),normalizeString(self.cr.title))>0.75):
					return doi
				if( volume and pages ):
					ops=pages.split('-')
					crps=self.cr.pages.split('-')
					if (len(ops)>0 and len(crps)>0 and ops[0]==crps[0] and volume==self.cr.volume):
						return doi
				if( year and pages ):
					ops=pages.split('-')
					crps=self.cr.pages.split('-')
					if (len(ops)>0 and len(crps)>0 and ops[0]==crps[0] and year==self.cr.year):
						return doi
				print "Origin DOI:",doi,"may be true but record strange..Try title"

			keyword=title+" "+journal+" "+year+" "+pages+" "+volume
			if (self.cr.getfromtitledoi(keyword,doi,year=year,limit=10,fullparse=False,prefix=prefix)):
				if (doi):
					if( prefix == self.cr.doi.split('/')[0] and strdiff(doi,self.cr.doi)>=0.85):
						return self.cr.doi
					else:
						print "Error for origin doi: "+doi+"; found: "+self.cr.doi
						return ""
				return self.cr.doi
			if (doi):
				if( strdiff(doi,self.cr.doi)>=0.85):
					return self.cr.doi
				else:
					print "Error2 for origin doi: "+doi+"; found: "+self.cr.doi
					return ""
			else:
				return ""
		except Exception as e:
			print "Error when find doi..",e,"\nRetry..."
			return self.finddoi(num,prefix=prefix,issn=issn)
Example #7
0
	def finddoiPDFfromFile(self,fname):
		'''Put doi in file and use it to find pdf

		fname is read line by line, one DOI per line (lower-cased and
		stripped). A DOI whose <quoted doi>.pdf already exists in the current
		directory is skipped; every other one is passed to
		self.findcrossreftitledoi. The garbage collector is run after every
		10 searched DOIs.'''
		countN=0
		# "with" closes the list file even when a search raises
		with open(fname) as fin:
			for line in fin:
				ldoi=line.lower().strip()
				# A blank line carries no DOI; don't search for an empty string
				if (not ldoi):
					continue
				doi=DOI(ldoi)
				if (os.path.exists(doi.quote()+".pdf")):
					continue
				self.findcrossreftitledoi(ldoi)
				#time.sleep(random.randint(1,10))
				countN+=1
				if countN>=10:
					gc.collect()
					countN=0
Example #8
0
    def set(self, doi, oapdf=None, free=None):
        '''Update the bdcheck even oapdf/free in library
			If give a list of doi, just post them. 
			No return.'''
        try:
            if (isinstance(doi, str)):
                doi = DOI(doi)
                if (doi):
                    if (oapdf and free):
                        r = requests.get(self.url + "&doi=" + doi +
                                         "&update=True&oapdf=True&free=True",
                                         timeout=TIMEOUT_SETTING)
                    elif oapdf:
                        r = requests.get(self.url + "&doi=" + doi +
                                         "&update=True&oapdf=True",
                                         timeout=TIMEOUT_SETTING)
                    elif free:
                        r = requests.get(self.url + "&doi=" + doi +
                                         "&update=True&free=True",
                                         timeout=TIMEOUT_SETTING)
                    else:
                        r = requests.get(self.url + "&doi=" + doi +
                                         "&update=True",
                                         timeout=TIMEOUT_SETTING)

            elif (isinstance(doi, (list, tuple, set))):
                dois = list(doi)
                length = len(dois)
                maxround = length / 100 + 1 if length % 100 != 0 else length / 100
                for i in range(0, maxround):
                    if ((i + 1) * 100 >= len(dois)):
                        doisjs = json.dumps(dois[i * 100:])
                    else:
                        doisjs = json.dumps(dois[i * 100:(i + 1) * 100])
                    param = {'dois': doisjs}
                    if (oapdf and free):
                        r = requests.post(self.url +
                                          "&update=True&oapdf=True&free=True",
                                          params=param,
                                          timeout=TIMEOUT_SETTING)
                    elif oapdf:
                        r = requests.post(self.url + "&update=True&oapdf=True",
                                          params=param,
                                          timeout=TIMEOUT_SETTING)
                    elif free:
                        r = requests.post(self.url + "&update=True&free=True",
                                          params=param,
                                          timeout=TIMEOUT_SETTING)
                    else:
                        r = requests.post(self.url + "&update=True",
                                          params=param,
                                          timeout=TIMEOUT_SETTING)
                    time.sleep(1)

        except Exception as e:
            print e, "SF BDCheck Set Fail.."
Example #9
0
	def setbycheck(self,doi):
		'''Update the bdcheck/oapdf/free in library based on check oapdf/free
		Can't set the record to "bdcheck" state
		return the [oapdf,free]'''
		try:
			if (isinstance(doi,str)):
				doi=DOI(doi)
				if (doi):
					oapdffree=doi.freedownload(outtuple=True)
					if (oapdffree[0] and oapdffree[1]):
						r=requests.get(self.url+"&doi="+doi+"&update=True&oapdf=True&free=True",timeout=TIMEOUT_SETTING)
					elif oapdffree[0]:
						r=requests.get(self.url+"&doi="+doi+"&update=True&oapdf=True",timeout=TIMEOUT_SETTING)				
					elif oapdffree[1]:
						r=requests.get(self.url+"&doi="+doi+"&update=True&free=True",timeout=TIMEOUT_SETTING)
					return oapdffree
			return [False,False]
		except Exception as e:
			print e,"SF BDCheck SetByCheck Fail.."
			return [False,False]
Example #10
0
    def get(self, doi):
        '''Get list whether [bdcheck,oapdf,free] for single doi
		Return a dict for multi dois'''
        try:
            if (isinstance(doi, str)):
                doi = DOI(doi)
                if (doi):
                    r = requests.get(self.url + "&doi=" + doi + "&select=True",
                                     timeout=TIMEOUT_SETTING)
                    if r.status_code == 200:
                        return r.json().get(doi, [])
                return [0, 0, 0]
            # if dois in list/tuple/set,return {doi:[0,0,0],...}
            elif (isinstance(doi, (list, tuple, set))):
                dois = list(doi)
                result = {}
                length = len(dois)
                maxround = length / 100 + 1 if length % 100 != 0 else length / 100
                for i in range(0, maxround):
                    if ((i + 1) * 100 >= len(dois)):
                        doisjs = json.dumps(dois[i * 100:])
                    else:
                        doisjs = json.dumps(dois[i * 100:(i + 1) * 100])
                    param = {'dois': doisjs}
                    r = requests.post(self.url + "&select=True",
                                      params=param,
                                      timeout=TIMEOUT_SETTING)
                    if r.status_code == 200:
                        result.update(r.json())
                    time.sleep(1)
                return result
            return [0, 0, 0]
        except Exception as e:
            print e, "SF BDCheck Get Fail.."
            if (isinstance(doi, (list, tuple, set))): return {}
            return [0, 0, 0]
Example #11
0
    def process(self, fname="", cleannote=False, prefix='', issn='', start=0):
        '''Check the DOI of every record from start on and rename its PDFs after the DOI.

        Records that already carry an OAPDF pdf link or an oapdf.sourceforge.net
        url are treated as processed before and skipped. For the others the
        DOI is taken from the record when its "electronic-resource-num" tag
        starts with 'chk:', otherwise searched with self.finddoi; attached
        PDFs are renamed to <quoted doi>.pdf in the "PDF" directory under
        self.getpath(), and an OAPDF pdf entry and url are added when
        doi.is_oapdf() is true.

        fname:     when given, the library is written with self.write(fname)
        cleannote: call self.cleannote on each record that is processed
        prefix, issn: passed on to self.finddoi
        start:     index of the first record to handle
        An error in one record is printed and the loop goes on. Returns 0.'''
        epath = self.getpath()
        print "Output", self.length, "to", epath + os.sep + fname
        for i in range(start, self.length):
            try:
                #if (i%100 is 0):
                #	print
                #	print "Doing:",i+1,
                #else:
                #	print i+1,

                pdfs = self.getpdf(i)
                urls = self.geturl(i)
                # Fast consider as record process before
                hasfound = False
                for pdf in pdfs:
                    if "internal-pdf://OAPDF/" in pdf:
                        hasfound = True
                        # Rewrite an already checked DOI tag as "chk: <doi>"
                        doistr = self.gettag(i, "electronic-resource-num")
                        if (doistr and len(doistr) > 4
                                and doistr[:4] == 'chk:'):
                            doi = DOI(self.getdoi(i))
                            if doi:
                                self.setdoi(i, "chk: " + doi)
                        break

                # Same test on the urls when no OAPDF pdf link was found
                if not hasfound:
                    for url in urls:
                        if "http://oapdf.sourceforge.net/cgi-bin/" in url:
                            hasfound = True
                            doistr = self.gettag(i, "electronic-resource-num")
                            if (doistr and len(doistr) > 4
                                    and doistr[:4] == 'chk:'):
                                doi = DOI(self.getdoi(i))
                                if doi:
                                    self.setdoi(i, "chk: " + doi)
                            break
                if hasfound:
                    continue

                if (cleannote):
                    self.cleannote(i)

                # A 'chk:' tag means the DOI was checked before: reuse it,
                # otherwise search for it and mark the result as checked
                doistr = self.gettag(i, "electronic-resource-num")
                if (doistr and len(doistr) > 4 and doistr[:4] == 'chk:'):
                    doi = DOI(self.getdoi(i))
                else:
                    doi = DOI(self.finddoi(i, prefix=prefix, issn=issn))
                    if doi:
                        self.setdoi(i, "chk: " + doi)
                oapdflink = ""
                if (doi and doi.is_oapdf()):
                    oapdflink = "http://oapdf.sourceforge.net/cgi-bin/doipage.cgi?doi=" + doi

                # Build the new pdf list; an entry that can't be renamed is kept as it is
                newpdfs = []
                for pdf in pdfs:
                    pdfpath = pdf.replace("internal-pdf://",
                                          epath + os.sep + "PDF" + os.sep)
                    relpath = pdf.replace("internal-pdf://", "")
                    # should never happen
                    if (relpath == doi.quote() + ".pdf"):
                        newpdfs.append(pdf)
                        continue
                    if (doi):
                        if (os.path.exists(pdfpath)):
                            try:
                                os.renames(
                                    pdfpath, epath + os.sep + "PDF" + os.sep +
                                    doi.quote() + ".pdf")
                                newpdfs.append("internal-pdf://" +
                                               doi.quote() + ".pdf")
                            except:
                                print "Can't rename:", pdf, 'to', doi.quote(
                                ) + ".pdf"
                                newpdfs.append(pdf)
                                continue
                        else:
                            # File is not at the recorded path: accept the only
                            # *.pdf in the same directory, if there is exactly one
                            print "Maybe error for the record", doi, "with pdf path:", pdf, '; Try finding..',
                            pdfdir = os.path.split(pdfpath)[0]
                            if (os.path.exists(pdfdir)):
                                fs = glob.glob(pdfdir + os.sep + '*.pdf')
                                if (len(fs) == 1):
                                    try:
                                        os.renames(
                                            fs[0], epath + os.sep + "PDF" +
                                            os.sep + doi.quote() + ".pdf")
                                        newpdfs.append("internal-pdf://" +
                                                       doi.quote() + ".pdf")
                                        print "Find", fs[0], 'and rename!'
                                    except:
                                        print "Can't rename:", fs[
                                            0], 'to', doi.quote() + ".pdf"
                                        newpdfs.append(pdf)
                                        continue
                                else:
                                    print "Can't find.."
                                    newpdfs.append(pdf)
                                    continue
                            else:
                                newpdfs.append(pdf)
                                continue
                    else:
                        print "Blank doi for file:", pdf
                        newpdfs.append(pdf)
                        continue
                if (oapdflink):
                    newpdfs.append("internal-pdf://OAPDF/" + doi.quote() +
                                   ".pdf")
                self.setpdfs(i, newpdfs)
                # Set the urls
                if (oapdflink and oapdflink not in urls):
                    self.addurl(i, oapdflink, first=True)
            except Exception as e:
                print "Error at ", i, 'since: ', e
                #return 1
        if fname:
            self.write(fname)
        return 0
Example #12
0
	def process(self,fname="",cleannote=False,prefix='',issn='',start=0):
		'''Check the DOI of every record from start on and rename its PDFs after the DOI.

		Records that already carry an OAPDF pdf link or an oapdf.sourceforge.net
		url are treated as processed before and skipped. For the others the
		DOI is taken from the record when its "electronic-resource-num" tag
		starts with 'chk:', otherwise searched with self.finddoi; attached
		PDFs are renamed to <quoted doi>.pdf in the "PDF" directory under
		self.getpath(), and an OAPDF pdf entry and url are added when
		doi.is_oapdf() is true.

		fname:     when given, the library is written with self.write(fname)
		cleannote: call self.cleannote on each record that is processed
		prefix, issn: passed on to self.finddoi
		start:     index of the first record to handle
		An error in one record is printed and the loop goes on. Returns 0.'''
		epath=self.getpath()
		print "Output",self.length,"to",epath+os.sep+fname
		for i in range(start,self.length):
			try:
				#if (i%100 is 0):
				#	print
				#	print "Doing:",i+1,
				#else:
				#	print i+1,

				pdfs=self.getpdf(i)
				urls=self.geturl(i)
				# Fast consider as record process before
				hasfound=False
				for pdf in pdfs:
					if "internal-pdf://OAPDF/" in pdf:
						hasfound=True
						# Rewrite an already checked DOI tag as "chk: <doi>"
						doistr=self.gettag(i,"electronic-resource-num")
						if (doistr and len(doistr)>4 and doistr[:4]=='chk:'):
							doi=DOI(self.getdoi(i))
							if doi:
								self.setdoi(i,"chk: "+doi)
						break

				# Same test on the urls when no OAPDF pdf link was found
				if not hasfound:
					for url in urls:
						if "http://oapdf.sourceforge.net/cgi-bin/" in url:
							hasfound=True
							doistr=self.gettag(i,"electronic-resource-num")
							if (doistr and len(doistr)>4 and doistr[:4]=='chk:'):
								doi=DOI(self.getdoi(i))
								if doi:
									self.setdoi(i,"chk: "+doi)
							break
				if hasfound:
					continue

				if (cleannote):
					self.cleannote(i)

				# A 'chk:' tag means the DOI was checked before: reuse it,
				# otherwise search for it and mark the result as checked
				doistr=self.gettag(i,"electronic-resource-num")
				if (doistr and len(doistr)>4 and doistr[:4]=='chk:'):
					doi=DOI(self.getdoi(i))
				else:
					doi=DOI(self.finddoi(i,prefix=prefix,issn=issn))
					if doi:
						self.setdoi(i,"chk: "+doi)
				oapdflink=""
				if (doi and doi.is_oapdf()):
					oapdflink="http://oapdf.sourceforge.net/cgi-bin/doipage.cgi?doi="+doi

				# Build the new pdf list; an entry that can't be renamed is kept as it is
				newpdfs=[]
				for pdf in pdfs:
					pdfpath=pdf.replace("internal-pdf://",epath+os.sep+"PDF"+os.sep)
					relpath=pdf.replace("internal-pdf://","")
					# should never happen
					if (relpath == doi.quote()+".pdf"):
						newpdfs.append(pdf)
						continue
					if (doi):
						if (os.path.exists(pdfpath)):
							try:
								os.renames(pdfpath,epath+os.sep+"PDF"+os.sep+doi.quote()+".pdf")
								newpdfs.append("internal-pdf://"+doi.quote()+".pdf")
							except:
								print "Can't rename:",pdf,'to',doi.quote()+".pdf"
								newpdfs.append(pdf)
								continue
						else:
							# File is not at the recorded path: accept the only
							# *.pdf in the same directory, if there is exactly one
							print "Maybe error for the record",doi,"with pdf path:",pdf,'; Try finding..',
							pdfdir=os.path.split(pdfpath)[0]
							if (os.path.exists(pdfdir)):
								fs=glob.glob(pdfdir+os.sep+'*.pdf')
								if (len(fs)==1):
									try:
										os.renames(fs[0],epath+os.sep+"PDF"+os.sep+doi.quote()+".pdf")
										newpdfs.append("internal-pdf://"+doi.quote()+".pdf")
										print "Find",fs[0],'and rename!'
									except:
										print "Can't rename:",fs[0],'to',doi.quote()+".pdf"
										newpdfs.append(pdf)
										continue
								else:
									print "Can't find.."
									newpdfs.append(pdf)
									continue
							else:
								newpdfs.append(pdf)
								continue
					else:
						print "Blank doi for file:",pdf
						newpdfs.append(pdf)
						continue
				if (oapdflink):
					newpdfs.append("internal-pdf://OAPDF/"+doi.quote()+".pdf")
				self.setpdfs(i,newpdfs)
				# Set the urls
				if (oapdflink and oapdflink not in urls):
					self.addurl(i,oapdflink,first=True)
			except Exception as e:
				print "Error at ", i, 'since: ',e
				#return 1
		if fname:
			self.write(fname)
		return 0
Example #13
0
	def findPDFbyISSN(self,issn,maxresult=None, step=100, offset=0, 
		usedoi=True,doifilter=None,onlinecheck=True,savestate=None,proxy=None,usebdcheck=True):
		'''Find PDF by ISSN based on search result from crossref

		issn:      an ISSN like "1234-5678" (journal works are listed) or a
		           string containing "10." (treated as a DOI prefix)
		maxresult: upper limit used for the number of request rounds; the
		           crossref total is used when it is unset, <=0 or too large
		step:      rows asked from crossref per request
		offset:    offset of the first crossref row
		usedoi:    append the DOI to the title used as search keyword
		doifilter, onlinecheck, savestate: passed on to self.getallpdf
		proxy:     passed on to self.search
		usebdcheck: skip DOIs that bdcheck.filterdois reports for the page

		DOIs that bdcheck.setbycheck finds in the oapdf/free library are
		skipped as well. Every DOI that was searched is marked with
		bdcheck.set. An empty issn returns at once; a malformed one prints
		an error and ends the program with sys.exit(1).'''
		# may be improve to not only issn..
		if (not issn):return
		if (len(issn)==9 and issn[4]=='-'):
			needurl="http://api.crossref.org/journals/"+issn+"/works"
		elif('10.' in issn):
			needurl="http://api.crossref.org/prefixes/"+issn+"/works"
		else:
			print "Error ISSN/prefix"
			sys.exit(1)
		cr=CRrecord()
		total=cr.gettotalresultfromlink(needurl)
		if (not maxresult or maxresult <=0 or maxresult>total): 
			maxresult=total
		params={"rows":str(step)}
		# Explicit floor division: the round count must stay an int for range()
		maxround=(maxresult-offset)//step+1
		offsetcount=offset
		bdcheck=BDCheck()

		for i in range(maxround):
			params["offset"]=str(step*i+offset)
			r=requests.get(needurl,params,timeout=timeout_setting_download)
			# Compare the status by value, not by object identity
			if (r.status_code == 200):
				# Get all check/in oapdf 
				if usebdcheck: 
					bdcheckall=bdcheck.filterdois(r.json(),oapdf=1,crjson=True)

				for j in r.json().get('message',{}).get('items',[]):
					keyword=j.get('title',[''])
					doi=DOI(j.get("DOI",""))
					if not doi:
						offsetcount+=1
						time.sleep(2)
						continue

					# Check whether in bdcheck
					if (usebdcheck and doi in bdcheckall):
						print doi, 'has search/oapdf/free by bdcheck'
						offsetcount+=1
						time.sleep(1)
						continue

					# If not in bdcheck, check oapdf/free and set it
					# TODO: remove it after combine oapdf information to library
					oapdffree=bdcheck.setbycheck(doi)
					if (oapdffree[0] or oapdffree[1]):
						print doi,'exist in oapdf/free library..'
						offsetcount+=1
						time.sleep(1)
						continue

					# crossref gives the title as a list; use its first entry
					if (keyword): 
						keyword=keyword[0]
					else:
						time.sleep(2)
						offsetcount+=1
						continue
					if usedoi:keyword+=" "+doi
					print "#####################################",offsetcount,"####################################"
					print "## Now finding for doi with title:"+doi+" "+ keyword.encode('utf-8')+"............"
					sys.stdout.flush()
					self.search(keyword.encode('utf-8'),proxy=proxy)
					self.getallpdf(doifilter,onlinecheck=onlinecheck,savestate=savestate,usebdcheck=usebdcheck)
					bdcheck.set(doi)
					offsetcount+=1
			gc.collect()
		print "End of process for",issn
Example #14
0
	def getallpdf(self,doifilter=None,onlinecheck=True,savestate=None,usebdcheck=True):
		'''Get All pdf from link
		doifilter should be a function, return True when DOI ok'''
		usedoifilter=callable(doifilter)
		getallfilelist=[]
		if isinstance(savestate,(list,tuple,set)):
			savestate=set(savestate)
		elif (isinstance(savestate,int)):
			savestate=set([savestate])
		else:
			savestate=set([0,1,2,3])
		bdcheck=BDCheck()
		for i in range(len(self.items)):
			try:
				getfilelist=[]
				# Get PDF links
				links=self.getpdflink(i)
				if (links):
					doi=DOI(self.getdoi(i))
					if not doi:
						print "blank doi..",doi
						continue
					if ( usedoifilter and not doifilter(doi)):
						print doi,'Not fit filter..'
						continue
						
					# Check by bdcheck api
					if (usebdcheck):
						bdout=bdcheck.get(doi)
						if sum(bdout)>0:
							print doi, 'has search/oapdf/free',bdout
							continue
					oapdffree=bdcheck.setbycheck(doi)
					if (oapdffree[0] and oapdffree[1]):
						print doi,'exist in oapdf/free library..'
						continue						
					elif oapdffree[0]:
						print doi,'exist in oapdf library..'
						continue				
					elif oapdffree[1]:
						print doi,'exist in free library..'
						continue
					doifname=doi.quote()+".pdf"
					if (pdfexistpath(doifname)):
						print doi,'Files exist in current folder..'
						continue

					# Start to find pdf at each link
					print "### Find for result with DOI: "+doi
					foundDonePDF=False
					for link in links:
						print 'Link:',str(link),
						if (onlinecheck):
							print "Try Getting..",
							# Get a StringIO obj
							getpdfobj=getwebpdf(link,fname=doifname,params=getwebpdfparams(link),stringio=True)
							if (not getpdfobj):
								continue
							try:
								dpfresult=self.pdfcheck.checkonlinepdf(fobj=getpdfobj,doi=doi)
								sys.stdout.flush()
								if (dpfresult!=0):
									if ( savestate and (dpfresult in savestate)):
										#Important to set fname to None
										rmresult=self.pdfcheck.removegarbage(fname=None,notdelete=True)
										if (rmresult <= 1):
											getfilelist.append( (getpdfobj,self.pdfcheck.realdoi,dpfresult))
									else:
										print "Not OK PDF for doi",doi												
								else:
									foundDonePDF=True
									if (self.pdfcheck.savefobj2file(doi=self.pdfcheck.realdoi,state=0,fobj=getpdfobj)):
										print "!!!!!!! Get PDF file to Done!: "+self.pdfcheck.realdoi
										del getfilelist[:]	
										nowdoi=DOI(self.pdfcheck.realdoi)
										getallfilelist.append('Done/'+nowdoi.quote()+'.pdf')

										break
									else:
										print "What? should never happen for pdfdoicheck.savefobj2file Done.."
							except Exception as e:
								print e,'Error at baidu getallpdf(web) when doing pdfcheck',doi,link

						# Now should not use this method
						elif (getwebpdf(link,fname=doifname,params=getwebpdfparams(link))):
							print "Please don't use download pdf to disk, use check online!"
							print "Try Getting..",
							try:
								dpfresult=self.pdfcheck.renamecheck(doifname)
								sys.stdout.flush()
								if (dpfresult!=0): 
									if ( savestate and (dpfresult in savestate)):
										#Important to set fname to None		
										rmresult=self.pdfcheck.removegarbage(fname=None)
										if (rmresult <= 1):
											if (os.path.exists(self.pdfcheck._fname)):
												getfilelist.append((self.pdfcheck._fname, dpfresult))
											else:
												print "What? should never happen for pdfdoicheck.moveresult Not Done.."
										else:
											print "Has been removed.."
									else:
										if (os.path.exists(self.pdfcheck._fname)) : 
											os.remove(self.pdfcheck._fname)
								else:
									foundDonePDF=True
									if (os.path.exists(self.pdfcheck._fname)):
										print "!!!!!!! Get PDF file to Done!: "+doifname
										getfilelist.append(self.pdfcheck._fname)
										#time.sleep(random.randint(1,5))								
										break
									else:
										print "What? should never happen for pdfdoicheck.moveresult Done.."
							except Exception as e:
								if os.path.exists(doifname):
									if (not os.path.exists('tmpfail/'+doifname)):
										os.renames(doifname,'tmpfail/'+doifname)
									else:
										os.remove(doifname)
								print e,'Error at baidu getallpdf when doing pdfcheck'
						else:
							print "can't get at this link"

					bdcheck.set(doi)
					# Online Check but not Done
					if onlinecheck and not foundDonePDF and len(getfilelist)>0:
						minnum=-1
						minresult=999999
						for i in range(len(getfilelist)):
							if getfilelist[i][2]<minresult:
								minnum=i
						nowdoi=DOI(getfilelist[minnum][1])
						if (self.pdfcheck.savefobj2file(doi=nowdoi,state=getfilelist[minnum][2],fobj=getfilelist[minnum][0])):
							print "!!!!!!! Get PDF file to: "+self.pdfcheck.judgedirs.get(getfilelist[minnum][2],'.'),self.pdfcheck.realdoi
							getallfilelist.append(self.pdfcheck.judgedirs.get(getfilelist[minnum][2],'.')+os.sep+nowdoi.quote()+".pdf")
							del getfilelist[:]
			except Exception as e:
				print e, "##### Error when get pdf.."
		return getallfilelist
Example #15
0
	def renamecheck(self,fname,wtitle=0.65,cutoff=0.85,justcheck=False,resetfile=True,fdoi=None,excludedoi=None, fobj=None):
		'''A complex function to get doi from file name, 
		check in crossref, check in pdf file, rename it!
		just check can cancel move file'''
		### Result back:
		# 0: Done 
		# 1: High
		# 2: Unsure
		# 3: Untitle
		# 4: Fail
		# 5: Page0
		# 6: ErrorDOI
		# 10: Unknow


		if (resetfile and isinstance(fobj,(file,StringIO))):
			self.reset(fname="",fobj=fobj)
			fname="None"

		# len(self.doi) is 1 and len(self.doi - excludedoi) is 1 : 
		# :: First Run and perform check
		# len(self.doi) is 1 or len(self.doi - excludedoi) is 1 :
		if (not fname and not fdoi):
			print "No given file name or doi! (Return 6)"
			return 6

		if (fname and not fdoi and excludedoi):
			print "What do you want?! No excludedoi set by user! (Return 9)"
			return 9

		
		if (resetfile and fname !="None"): 
			self.reset(fname)
		elif(resetfile and not isinstance(fobj,(file,StringIO))):
			print "Use reset file but no file name/object is given!"
			return 9

		if (self.maxpage == 0):
			if not justcheck: 
				self.moveresult(5, printstr="Error Page 0 (Page0, R5): "+self._fname)
			return 5

		if (not excludedoi):
			excludedoi=set()

		if (not fdoi):
			#File obj is ""
			fdoi=DOI(os.path.splitext(os.path.basename(self._fname))[0])
		else:
			fdoi=DOI(fdoi)

		recursive= (len(excludedoi) > 0)
		# If in recursive, don't move file!
		if recursive: justcheck=True

		if resetfile and not recursive:
			self.realdoi=fdoi

		# Only find DOI in first time!
		if (not recursive and fdoi): 
			self.finddoi(1)
		elif (not recursive and not fdoi):
			self.finddoi(set([1,2,self.maxpage]))

		# file doi is shit..Recursively use doi in file or fail
		if (not fdoi and not recursive):
			if (len(self.doi) is 1 or len(self.doi) is 2):
				print "Origin fdoi wrong but has 1~2 dois in file:",self._fname,
				return self.recursivedoicheck(excludedoi,olddoi=fdoi,wtitle=wtitle,cutoff=cutoff,justcheck=justcheck)
			# No doi or >2 dois in file
			else:
				if not justcheck: 
					self.moveresult(4,printstr="Error fdoi and 0/too much doi. (Fail): "+self._fname)
				return 4
		elif (not fdoi and recursive):
			print "doi (in recursion) may wrong with error doi. Should never happen.."
			return 4 # Fail

		# fdoi is ok
		cr=CRrecord()
		try:
			cr=cr.valid_doi(fdoi,fullparse=True)
		except requests.exceptions.RequestException as e:
			print e
			cr=None
		except Exception as e:
			print e
			cr=None		

		# Error when year=None, improve in crrecord.
		#if (cr and not cr.year):
		#	cr.year='8888'	

		#crossref is ok
		if (fdoi and cr):
			totalpagenumber=1
			try:
				totalpagenumber=self.totalpages(cr.pages)
			except ValueError as e:
				# should never happen now
				print e, cr.pages

			totalpagewrong=False
			#print "pages:",self.maxpage,' in crossref:',cr.pages,totalpagenumber
			if totalpagenumber>0 and not (self.maxpage >= totalpagenumber and self.maxpage <= totalpagenumber+2):
				totalpagewrong=True
				# When paper with supporting information
				if (self.maxpage > totalpagenumber+2):
					self.finddoi(page=2)
					if (self.withSI or (self.findtext('Supporting Information', page=[totalpagenumber+1,totalpagenumber+2])
						and self.findtext(cr.title, similarity=0.75, page=[totalpagenumber+1,totalpagenumber+2]))):
						if not recursive : self.finddoi(totalpagenumber);
						self.withSI=True
						totalpagewrong=False
					# For NIH Public Access
					elif (self.hascontent("NIH Public Access")[0]):
						totalpagewrong=False
					#Such as some Nature with SI in paper without notify.
					elif (self.withSI or (totalpagenumber>1 and self.findtext("acknowledgment", page=[totalpagenumber-1, totalpagenumber]) 
						and self.findtext("reference", page=[totalpagenumber-1, totalpagenumber]))):
						self.withSI=True
						totalpagewrong=False

			# Recursive but total page wrong. Fast end recursivedoicheck
			if (totalpagewrong and recursive):	
				return 4

			# Just check first page, not find(find before..), faster:
			doivalid=self.checkdoi(fdoi,page=1,iterfind=False,justcheck=True)
			titleeval=self.checktitle(cr.title)
			if (totalpagenumber > 0 and not totalpagewrong):
				if (doivalid and titleeval[0] and len(self.doi) is 1):
					# Yes! Very Good PDF!
					self.realdoi=fdoi
					if not justcheck: 
						if (self.maxpage>=2 and self.maxpage == totalpagenumber and
							not self.findtext('Supporting Information', page=[1])):
							self.moveresult(0,good=True)
						else:
							self.moveresult(0)
					return 0

			# Further check doi in page2/last, Finally, will check 1,2 and last pages.
			if (recursive):
				doivalid= ( self.checkdoi(fdoi,page=2,iterfind=True,justcheck=True) or doivalid )
			else:
				doivalid= ( self.checkdoi(fdoi,page=2,iterfind=True) or doivalid )

			if len(self.doi)>3:
				# Too much doi may be some abstract
				self.moveresult(2,printstr='Has more than 3 dois! (Unsure):'+self._fname)
				return 2

			# Page wrong and try recursive use doi
			if (totalpagewrong):
				if (len(self.doi) is 1 or len(self.doi) is 2):
					doi=DOI(list(self.doi)[0])
					# DOI in file is same so error. Don't need recursive
					if (len(self.doi) is 1 and doi == fdoi):
						if not justcheck: 
							self.moveresult(4,printstr="PDF Page "+str(self.maxpage)+"!="+str(totalpagenumber)+"(Fail): "+self._fname)
						return 4

					print 'Wrong total page with dois in file,',self._fname,fdoi,',try recursive'
					return self.recursivedoicheck(excludedoi,olddoi=fdoi,wtitle=wtitle,cutoff=cutoff,justcheck=justcheck)
				else:
					if not justcheck: 
						self.moveresult(4,printstr="PDF Page "+str(self.maxpage)+"!="+str(totalpagenumber)+"(Fail): "+self._fname)
					return 4

			if (not totalpagewrong):
				crscore=self.scorefitting(cr)
				if (self.maxpage <= totalpagenumber+2): 
					# Maybe check when maxpage >total+2
					titleeval=self.checktitle(cr.title)
				if cr.title.strip()=="":
					titleeval=(False,0.9)
				titlevalid=titleeval[0]
				try:
					paperyear=int(cr.year)
				except:
					paperyear=9999
				try:
					# Too old maybe lost information
					if (paperyear>1990):
						titlevalid=titlevalid or (titleeval[1]*wtitle+crscore['total'])>=cutoff
					else:
						titlevalid=titlevalid or (titleeval[1]*wtitle+crscore['total'])>=cutoff-0.1
				#(self.checktitle(cr.title,similarity=0.85) and self.checkcrossref(cr))
				except Exception as e:
					print e

				if (doivalid):
					if (titlevalid):					
						# Yes! Good PDF!
						self.realdoi=fdoi
						if not justcheck: 
							if (self.maxpage>=2 and self.maxpage == totalpagenumber 
								and len(self.doi) is 1 and
								not self.findtext('Supporting Information', page=[1])):
								self.moveresult(0,good=True)
							else:
								self.moveresult(0)
						return 0

					print "Title/Paper score:",titleeval[1],crscore,self._fname
					if (len(self.doi - set([fdoi])) == 1 and not recursive):
						
						# Try one more
						newresult = self.recursivedoicheck(excludedoi,olddoi=fdoi,wtitle=wtitle,cutoff=cutoff,justcheck=True)
						if (newresult is 0):
							newdoi=DOI(list(self.doi - set([fdoi]))[0])
							self.realdoi=newdoi
							print 
							if not justcheck: self.moveresult(0, 
								printstr="(Rename)fdoi ok, but not title. In file doi "+newdoi+" is better for "+self._fname,
								newfname=newdoi.quote()+".pdf")
							return 0

					# Else DOI ok but not title
					if not justcheck: 
						self.moveresult(3,printstr="OK fdoi but not title(Untitle): "+self._fname)
					return 3
				
				# Indeed, doi maybe in pdf, but strange format..
				if (self.checkdoinormaltxt(fdoi)):
					if (titlevalid):
						# Further check only when title OK
						if (self.checkdoifurther(fdoi)):
							# Fine! move to Done dir
							if not justcheck: 
								if (self.maxpage>=2 and self.maxpage == totalpagenumber 
								and len(self.doi) is 1 and
								not self.findtext('Supporting Information', page=[1])):
									self.moveresult(0,good=True)
								else:
									self.moveresult(0)
							return 0
						else:
							# Can't find, but high similar! move to High dir
							if not justcheck: 
								self.moveresult(1,printstr="OK title and nospacebreak doi,but not pass(High): "+self._fname)
							return 1
					else:
						# DOI ok but not title
						print "Title/Paper score:",titleeval[1],crscore,self._fname
						if not justcheck: 
							self.moveresult(3,printstr="Maybe OK fdoi but not title(Untitle): "+self._fname)
						return 3						

				# DOI maybe not exist ....
				if (titlevalid):
					tmpdois=set(self.doi)
					for d in tmpdois:
						dd=DOI(d)
						if ( not dd.valid_doiorg(geturl=False) ):
							self.doi.remove(d)
							
					# Old paper don't have doi...
					if len(self.doi) is 0 and totalpagenumber>0:
						if (crscore['total'] >= 0.4):
							if not justcheck: 
								if (self.maxpage>=2 and self.maxpage == totalpagenumber and
								not self.findtext('Supporting Information', page=[1])):
									self.moveresult(0,good=True)
								else:
									self.moveresult(0)
							return 0
						elif (titleeval[1]>=0.85 and crscore['total'] >= 0.35):
							if not justcheck: 
								if (self.maxpage>=2 and self.maxpage == totalpagenumber and
								not self.findtext('Supporting Information', page=[1])):
									self.moveresult(0,good=True)
								else:
									self.moveresult(0)								
							return 0
						elif (titleeval[1]>=0.95 and crscore['total'] >=0.3):
							if not justcheck: 
								if (self.maxpage>=2 and self.maxpage == totalpagenumber and
								not self.findtext('Supporting Information', page=[1])):
									self.moveresult(0,good=True)
								else:
									self.moveresult(0)
							return 0
						elif (titleeval[1]>=0.90 and crscore['pages']>=0.9 and crscore['year']>=0.9 and (crscore['journal']>=0.9 or crscore['issn']>=0.9)):
							if not justcheck: self.moveresult(0)
							return 0
						elif (titleeval[1]>=0.90 and crscore['pages']>=0.5 and crscore['year']>=0.9 and (crscore['journal']>=0.9 or crscore['issn']>=0.9) and crscore['authors']>=0.7):
							if not justcheck: self.moveresult(0)
							return 0
						elif (titleeval[1]>=0.75 or crscore['total'] >=0.25):
							print "Title/Paper score:",titleeval[1],crscore,self._fname
							if not justcheck: 
								self.moveresult(1,printstr="OK title and high info fit. But no doi(Highly): "+self._fname)
								return 1
						else:
							print "Title/Paper score:",titleeval[1],crscore,self._fname
							if not justcheck: 
								self.moveresult(2,printstr="OK title and ok info fit. But no doi(Unsure): "+self._fname)
							return 2
					elif len(self.doi) is 0 and totalpagenumber== -1:
						if (titleeval[1]>=0.90 and crscore['pages']>=0.5 and crscore['year']>=0.9 and (crscore['journal']>=0.9 or crscore['issn']>=0.9) and crscore['authors']>=0.7):
							if not justcheck: self.moveresult(0)
							return 0
						else:
							print "Title/Paper score:",titleeval[1],crscore,self._fname
							if not justcheck: 
								self.moveresult(2,printstr="OK title and high info fit. But no doi and no total pages(Unsure): "+self._fname)
							return 2												
					elif len(self.doi) is 0 and totalpagenumber<=0:
						print "Title/Paper score:",titleeval[1],crscore,self._fname
						if not justcheck: 
							self.moveresult(2,printstr="OK title and high info fit. But no doi and no total pages(Unsure): "+self._fname)
						return 2
					elif ( len(self.doi) > 0 and not recursive):
						print "Good title but file doesn't contain fdoi, however it has >0 doi in file. "
						outnow=self.recursivedoicheck(excludedoi,olddoi=fdoi,wtitle=wtitle,cutoff=cutoff,justcheck=True)
						if outnow > 0:
							if not justcheck:
								self.moveresult(2,printstr="OK title but not fdoi. In file doi is not good(Unsure): "+self._fname)
							return 2
						elif(outnow==0):
							print 'Good Title but Fail fdoi. Paper has good in file doi,',self._fname,',try recursive'
							return self.recursivedoicheck(excludedoi,olddoi=fdoi,wtitle=wtitle,cutoff=cutoff,justcheck=justcheck)

					### Old method check old items:
					#if (self.checkcrossref(cr)):
					#	if (int(cr.year)<=1999 and len(self.doi) is 0):
					#		# Highly possible right
					#		if not justcheck: self.movetodir("High")
					#		return True
					#	  Bentham, often blank doi
					#	elif (fdoi[:8] == '10.2174/' and len(self.doi) is 0):
					#		if not justcheck: self.movetodir("Done")
					#		return True
					#	elif (len(self.doi) is 0):
					#		print "Title/Paper score:",titleeval[1],crscore,self._fname
					#		if not justcheck: 
					#			self.moveresult(1,printstr="OK title and high info fit. But no doi(Highly): "+self._fname)
					#		return 1							
					#	else:
					#		if not justcheck: 
					#			self.moveresult(2,printstr="OK title and high info fit. But doi exist not fit(Unsure): "+self._fname)
					#		return 2								
					#elif(len(self.doi) is 0):
					#	# Maybe wrong file and no doi
					#	if not justcheck: 
					#		self.moveresult(2,printstr="Not found doi in file but ok title (Unsure): "+self._fname)
					#	return 2

			#fdoi,title wrong, no doi in file
			# Or in recursive mode
			if (len(self.doi) is 0 or recursive):
				if not justcheck: 
					self.moveresult(4,printstr="Both fdoi and title wrong, no doi in file(Fail): "+self._fname)
				return 4

			# Indeed, file has only one more doi, not the same to fname
			if (len(self.doi - set([fdoi])) is 1 ):
				print 'Fail fdoi/title. Paper with one more doi in file,',self._fname,',try recursive'
				return self.recursivedoicheck(excludedoi,olddoi=fdoi,wtitle=wtitle,cutoff=cutoff,justcheck=justcheck)
			elif(len(self.doi) > 1):
				if not justcheck: 
					self.moveresult(4,printstr="fdoi/title fail. Too much infile doi(Fail): "+self._fname)
				return 4
			else:
				if not justcheck: 
					self.moveresult(4,printstr="What????? What?????(Fail):"+self._fname)
				return 4
		# not cr
		else:
			if (not recursive):
				self.finddoi(set([1,2,self.maxpage]))
				if (len(self.doi) is 1 or len(self.doi) is 2):
					print 'Error DOI filename,',self._fname,',try recursive'
					return self.recursivedoicheck(excludedoi,olddoi=fdoi,wtitle=wtitle,cutoff=cutoff,justcheck=justcheck)				
			if not justcheck: 
				self.moveresult(6,"Error DOI fname(Fail):"+self._fname)
			return 6