Python CRrecord Examples

Programming Language: Python

Namespace/Package Name: crrecord

Class/Type: CRrecord

Examples at hotexamples.com: 11

Python CRrecord - 11 examples found. These are the top-rated real-world Python examples of crrecord.CRrecord, extracted from open-source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

CRrecord(8)

getfromdoi(3)

getfromtitle(2)

gettotalresultfromlink(2)

reset(2)

getfromtitledoi(1)

valid_doi(1)

Example #1

Show file

File: bingacademic.py Project: OAPDF/oapdftools

	def grepBingAcadPDFbyID(self,bid,maxpage=1,printyn=True):
		'''Grep at most maxpage pages pdf for given bing id
		Save to doi style based on refering to crossref.'''
		if (printyn):
			print "###  ###  ###  ###  ###  ###  ###  ###  ### "
			print "## Finding for "+bid+"...."
		cr=CRrecord()
		ref=self.bidref(bid)
		if (printyn):
			print ref
		if (os.path.exists(bid+".pdf")):
			print "Exist file:"+bid+".pdf"
			return
		if ref['title']:
			if (cr.getfromtitle(title=ref['title'],year=ref['year'],volume=ref['volume'],
					pages=ref['pages'],issue=ref['issue'],fullparse=False) and cr.doi):
				# try to find by title, if found (true):
				if (printyn): print cr
				outname=quotefileDOI(cr.doi)+".pdf"
				if (not os.path.exists(outname)):
					if (self.getbidpdf(bid,filename=outname,printyn=printyn)):
						print "Have Found PDF file: "+outname
				else:
					print "Exist file:"+outname
			else:
				if (self.getbidpdf(bid,filename=bid+".pdf",printyn=printyn)):
					print "Have Found PDF file: "+bid+".pdf"
		else:
			if (self.getbidpdf(bid,filename=bid+".pdf",printyn=printyn)):
				print "Have Found PDF file: "+bid+".pdf"

Example #2

Show file

File: endnotexml.py Project: OAPDF/oapdftools

    def finddoi(self, num, prefix='', issn=''):
        title = self.gettitle(num)
        doi = DOI(self.getdoi(num))
        if (not prefix):
            prefix = doi.split('/', 1)[0] if doi else ""
        volume = self.getvolume(num)
        journal = self.getjournalfull(num)
        year = self.getyear(num)
        pages = self.getpages(num)
        self.cr = CRrecord()
        try:
            # The origin doi maybe true. Find in crossref
            if (doi and self.cr.getfromdoi(doi, fullparse=False)
                    and self.cr.doi):
                # Further check title
                if (strdiff(doi,self.cr.doi)>=0.85 and \
                strsimilarity(normalizeString(title),normalizeString(self.cr.title))>0.75):
                    return doi
                if (volume and pages):
                    ops = pages.split('-')
                    crps = self.cr.pages.split('-')
                    if (len(ops) > 0 and len(crps) > 0 and ops[0] == crps[0]
                            and volume == self.cr.volume):
                        return doi
                if (year and pages):
                    ops = pages.split('-')
                    crps = self.cr.pages.split('-')
                    if (len(ops) > 0 and len(crps) > 0 and ops[0] == crps[0]
                            and year == self.cr.year):
                        return doi
                print "Origin DOI:", doi, "may be true but record strange..Try title"

            keyword = title + " " + journal + " " + year + " " + pages + " " + volume
            if (self.cr.getfromtitledoi(keyword,
                                        doi,
                                        year=year,
                                        limit=10,
                                        fullparse=False,
                                        prefix=prefix)):
                if (doi):
                    if (prefix == self.cr.doi.split('/')[0]
                            and strdiff(doi, self.cr.doi) >= 0.85):
                        return self.cr.doi
                    else:
                        print "Error for origin doi: " + doi + "; found: " + self.cr.doi
                        return ""
                return self.cr.doi
            if (doi):
                if (strdiff(doi, self.cr.doi) >= 0.85):
                    return self.cr.doi
                else:
                    print "Error2 for origin doi: " + doi + "; found: " + self.cr.doi
                    return ""
            else:
                return ""
        except Exception as e:
            print "Error when find doi..", e, "\nRetry..."
            return self.finddoi(num, prefix=prefix, issn=issn)

Example #3

Show file

File: bingacademic.py Project: OAPDF/oapdftools

	def findcrossreftitledoi(self,doi,printyn=True):
		'''Find doi by crossref first'''
		cr=CRrecord()
		if( cr.getfromdoi(doi,fullparse=False) and cr.doi):
			keyword=(cr.title+" "+cr.doi).encode('utf-8')
			print "#########################################################################"
			print "## Now finding for doi with title: "+ keyword.encode('utf-8')+"............"
			sys.stdout.flush()
			self.grepBingAcadPDF(keyword=keyword,maxpage=1,printyn=printyn)
		else:
			print "Error DOI!: "+doi
		cr.reset()

Example #4

Show file

	def getdoi(self,num=0):
		'''Get DOI from Baidu Cite

		The citation text of entry num is parsed; a <doi> element is used when
		present, otherwise the primary title is looked up in crossref.
		Returns a DOI object starting at the first "10." of the value found,
		or DOI("") when there is no value or it contains no "10.".'''
		soup=BeautifulSoup(self.getcite(num,citetype='txt'),"html.parser")
		if (soup.doi):
			doi=soup.doi.text
		elif(soup.primarytitle):
			cr=CRrecord()
			cr.getfromtitle(soup.primarytitle.info.text,ignorecheminfo=True)
			doi=cr.doi
		else:
			doi=""
		# find() returns -1 when "10." is missing; slicing with -1 would keep
		# the last character and wrap it into a bogus DOI.
		start=doi.find('10.') if doi else -1
		if (start<0):
			return DOI("")
		return DOI(doi[start:])

Example #5

Show file

	def findcrossreftitledoi(self,doi,printyn=True):
		'''Find doi by crossref first'''
		cr=CRrecord()
		if( cr.getfromdoi(doi,fullparse=False) and cr.doi):
			keyword=cr.title+" "+cr.doi
			print "#########################################################################"
			print "## Now finding for doi with title: "+ keyword.encode('utf-8')+"............"
			sys.stdout.flush()
			self.search(keyword=keyword)
			self.getallpdf()
		else:
			print "Error DOI!: "+doi
		cr.reset()

Example #6

Show file

File: endnotexml.py Project: OAPDF/oapdftools

	def finddoi(self,num,prefix='',issn=''):
		title=self.gettitle(num)
		doi=DOI(self.getdoi(num))
		if (not prefix):
			prefix = doi.split('/',1)[0] if doi else ""
		volume= self.getvolume(num)
		journal=self.getjournalfull(num)
		year=self.getyear(num) 
		pages=self.getpages(num)
		self.cr=CRrecord()
		try:
			# The origin doi maybe true. Find in crossref
			if ( doi and self.cr.getfromdoi(doi,fullparse=False) and self.cr.doi):
				# Further check title
				if (strdiff(doi,self.cr.doi)>=0.85 and \
				strsimilarity(normalizeString(title),normalizeString(self.cr.title))>0.75):
					return doi
				if( volume and pages ):
					ops=pages.split('-')
					crps=self.cr.pages.split('-')
					if (len(ops)>0 and len(crps)>0 and ops[0]==crps[0] and volume==self.cr.volume):
						return doi
				if( year and pages ):
					ops=pages.split('-')
					crps=self.cr.pages.split('-')
					if (len(ops)>0 and len(crps)>0 and ops[0]==crps[0] and year==self.cr.year):
						return doi
				print "Origin DOI:",doi,"may be true but record strange..Try title"

			keyword=title+" "+journal+" "+year+" "+pages+" "+volume
			if (self.cr.getfromtitledoi(keyword,doi,year=year,limit=10,fullparse=False,prefix=prefix)):
				if (doi):
					if( prefix == self.cr.doi.split('/')[0] and strdiff(doi,self.cr.doi)>=0.85):
						return self.cr.doi
					else:
						print "Error for origin doi: "+doi+"; found: "+self.cr.doi
						return ""
				return self.cr.doi
			if (doi):
				if( strdiff(doi,self.cr.doi)>=0.85):
					return self.cr.doi
				else:
					print "Error2 for origin doi: "+doi+"; found: "+self.cr.doi
					return ""
			else:
				return ""
		except Exception as e:
			print "Error when find doi..",e,"\nRetry..."
			return self.finddoi(num,prefix=prefix,issn=issn)

Example #7

Show file

File: bingacademic.py Project: OAPDF/oapdftools

	def findPDFbyISSN(self,issn,maxresult=None, step=100, offset=0):
		'''Find PDF by ISSN based on search result from crossref'''
		# may be improve to not only issn..
		if (not issn):return
		needurl="http://api.crossref.org/journals/"+issn+"/works"
		cr=CRrecord()
		total=cr.gettotalresultfromlink(needurl)
		if (not maxresult or maxresult <=0 or maxresult>total): 
			maxresult=total
		params={"rows":str(step)}
		maxround=(maxresult-offset)/step+1
		offsetcount=offset
		for i in range(maxround):
			params["offset"]=str(step*i+offset)
			r=requests.get(needurl,params,timeout=timeout_setting_download)
			if (r.status_code is 200):
				for j in r.json()['message']['items']:
					keyword=j.get('title',[''])[0]+" "+j.get("DOI","")
					print "#####################################",offsetcount,"####################################"
					print "## Now finding for doi with title: "+ keyword.encode('utf-8')+"............"
					sys.stdout.flush()
					bingacad.grepBingAcadPDF(keyword.encode('utf-8'))
					offsetcount+=1
			gc.collect()

Example #8

Show file

File: endnotexml.py Project: OAPDF/oapdftools

class EndnoteXML(object):
    '''Editor for an EndNote XML export parsed with BeautifulSoup.

    Records are addressed by their index in ``self.records``, the children
    of the first ``<records>`` element.  Getters return the ``.string`` of
    the first matching tag ('' when the tag is missing; ``.string`` itself
    may be None), setters create missing tags on demand.

    Messages are printed with the single-argument parenthesised form of
    print, which produces the same output as the print statement.
    '''

    # Markers of links that were written by an earlier OAPDF run.
    _OAPDF_PDF = "internal-pdf://OAPDF/"
    _OAPDF_URL = "http://oapdf.sourceforge.net/cgi-bin/"

    def __init__(self, fname):
        '''Load ``fname`` (``<style>`` tags stripped); empty name = no records.'''
        content = ""
        if (fname):
            # "with" closes the file even when reading fails.
            with open(fname) as f:
                content = re.sub(r'</?style.*?>', '', f.read())
        self.reads(content)

    #def __repr__(self):
    #	return self.soup.encode()

    def __str__(self):
        return self.soup.encode()

    def reset(self, fname):
        '''Re-initialise from file ``fname``.'''
        self.__init__(fname)

    def read(self, fname):
        '''Re-initialise from file ``fname`` (same as reset).'''
        self.__init__(fname)

    def reads(self, s):
        '''Parse XML text ``s`` and make every record carry the standard tags.'''
        self.content = s
        self.soup = BeautifulSoup(self.content, 'html.parser')
        # soup.records is None when the text has no <records> element
        # (e.g. empty input); treat that as "no records" instead of crashing.
        root = self.soup.records
        self.records = root.contents if root is not None else []
        self.length = len(self.records)
        for i in range(self.length):
            self.checktag(i, 'titles')
            self.checktag(i, 'authors')
            self.checktag(i, 'urls')
            for listtag in ('related-urls', 'pdf-urls'):
                if (self.records[i].find(listtag) is None):
                    self.addtag(i, listtag, '', parent='urls')
            self.checktag(i, 'dates')
            # Rewrites the DOI field as the bare lower-cased DOI.
            self.setdoi(i, self.getdoi(i))

    def writes(self, encoding='utf-8'):
        '''Return the document encoded with ``encoding``.'''
        return self.soup.encode(encoding=encoding)

    def write(self, fname, encoding='utf-8'):
        '''Write the encoded document to file ``fname``.'''
        with open(fname, 'w') as f:
            f.write(self.writes(encoding=encoding))

    def getrecord(self, num):
        '''Return record ``num``, or None when ``num`` is past the end.'''
        if (num >= self.length):
            return None
        return self.records[num]

    def checktag(self, num, tag):
        '''Append an empty ``tag`` to record ``num`` unless it has one.'''
        if self.records[num].find(tag) is None:
            self.addtag(num, tag, value='')

    def _setcontent(self, node, value):
        '''Make ``value`` (text or a tag) the only content of ``node``.'''
        if isinstance(value, type(node)):
            # A tag cannot be assigned to .string; nest it instead.
            node.clear()
            node.append(value)
        else:
            node.string = value

    def addtag(self, num, tag, value=None, parent=None):
        '''Append a new ``tag`` to record ``num`` (or to its ``parent`` tag).

        value can be string, tag: text becomes the tag's string, a tag is
        nested inside the new tag.  Empty values leave the new tag empty.'''
        a = self.soup.new_tag(tag)
        if isinstance(value, type(a)):
            a.append(value)
        elif value:
            a.string = value
        if parent:
            self.records[num].find(parent).append(a)
        else:
            self.records[num].append(a)

    def gettag(self, num, tag, parent=None, obj=False):
        '''Return the first ``tag`` of record ``num``, searched below
        ``parent`` when given.

        obj=True returns the tag object, otherwise its ``.string``.
        Returns '' when ``parent`` or ``tag`` is not found.'''
        node = self.records[num]
        if parent:
            node = node.find(parent)
            if not node:
                return ''
        found = node.find(tag)
        if not found:
            return ''
        return found if obj else found.string

    def settag(self, num, tag, value, parent=None):
        '''Set the content of ``tag`` in record ``num``, creating ``tag``
        (and ``parent``) when missing.  ``value`` is text or a tag.'''
        record = self.records[num]
        if not parent:
            found = record.find(tag)
            if found:
                self._setcontent(found, value)
            else:
                self.addtag(num, tag, parent=None, value=value)
            return
        holder = record.find(parent)
        if not holder:
            # Build parent > tag in one go.
            a = self.soup.new_tag(tag)
            self._setcontent(a, value)
            self.addtag(num, parent, parent=None, value=a)
            return
        found = holder.find(tag)
        if found:
            self._setcontent(found, value)
        else:
            self.addtag(num, tag, parent=parent, value=value)

    def getpath(self):
        '''Return the ``path`` of the <database> tag with its extension
        replaced by ".Data", or "" when there is no <database> tag.'''
        db = self.soup.findChild("database")
        if (db):
            return os.path.splitext(db['path'])[0] + '.Data'
        return ""

    def getdoi(self, num):
        '''Return the lower-cased DOI of record ``num`` (text from the first
        "10." on), or "" when the field is empty or holds no "10.".'''
        doistr = self.gettag(num, "electronic-resource-num")
        start = doistr.find('10.') if doistr else -1
        if (start < 0):
            return ""
        return doistr[start:].lower().strip()

    def setdoi(self, num, value):
        self.settag(num, "electronic-resource-num", value)

    def gettitle(self, num):
        return self.gettag(num, "title")

    def settitle(self, num, value):
        self.settag(num, "title", value)

    def getjournalfull(self, num):
        return self.gettag(num, 'secondary-title')

    def getyear(self, num):
        return self.gettag(num, 'year', 'dates')

    def setyear(self, num, value):
        self.settag(num, 'year', value, 'dates')

    def getvolume(self, num):
        return self.gettag(num, 'volume')

    def setvolume(self, num, value):
        self.settag(num, 'volume', value)

    def getissue(self, num):
        return self.gettag(num, 'number')

    def setissue(self, num, value):
        self.settag(num, 'number', value)

    def getpages(self, num):
        return self.gettag(num, 'pages')

    def setpages(self, num, value):
        self.settag(num, 'pages', value)

    def getnotes(self, num):
        return self.gettag(num, 'notes')

    def setnotes(self, num, value):
        self.settag(num, 'notes', value)

    def _getlinks(self, num, listtag):
        '''Return the strings of all <url> tags inside ``listtag``.'''
        holder = self.gettag(num, listtag, obj=True)
        if (not holder):
            return []
        return [i.string for i in holder.find_all('url')]

    def _resetlist(self, num, listtag):
        '''Empty ``listtag`` of record ``num``, creating it when missing.

        The lookup is done on the record itself; looking at the whole
        document finds other records' lists and then fails on this one.'''
        holder = self.gettag(num, listtag, obj=True)
        if (holder):
            holder.clear()
        else:
            self.checktag(num, 'urls')
            self.addtag(num, listtag, parent='urls')

    def _addlink(self, num, listtag, value, first):
        '''Add one <url> with text ``value`` to ``listtag`` of record ``num``.'''
        holder = self.gettag(num, listtag, obj=True)
        a = self.soup.new_tag('url')
        a.string = value
        if (holder):
            if (first):
                holder.insert(0, a)
            else:
                holder.append(a)
        else:
            self.checktag(num, 'urls')
            self.addtag(num, listtag, a, 'urls')

    def geturl(self, num):
        '''Return the list of related urls of record ``num``.'''
        return self._getlinks(num, 'related-urls')

    def seturl(self, num, value):
        '''Note that it will clean all the url!'''
        self._resetlist(num, 'related-urls')
        self.addtag(num, 'url', value, 'related-urls')

    def addurl(self, num, value, first=False):
        '''Add a related url, at the front when ``first`` is true.'''
        self._addlink(num, 'related-urls', value, first)

    def getpdf(self, num):
        '''Return the list of pdf links of record ``num``.'''
        return self._getlinks(num, 'pdf-urls')

    def setpdf(self, num, value):
        '''Note that it will clean all the url!'''
        self._resetlist(num, 'pdf-urls')
        self.addtag(num, 'url', value, 'pdf-urls')

    def setpdfs(self, num, value):
        '''Note that it will clean all the url!

        ``value`` is a list of links; an empty list leaves no link.'''
        self._resetlist(num, 'pdf-urls')
        for url in value:
            self.addtag(num, 'url', url, 'pdf-urls')

    def addpdf(self, num, value, first=False):
        '''Add a pdf link, at the front when ``first`` is true.'''
        self._addlink(num, 'pdf-urls', value, first)

    def finddoi(self, num, prefix='', issn='', retry=3):
        '''Find or verify the DOI of record ``num`` with the help of crossref.

        The stored DOI is accepted when crossref knows it and the title, or
        the first page together with the volume or the year, agree with the
        record.  Otherwise crossref is searched by a keyword built from
        title, journal, year, pages and volume.

        prefix -- DOI prefix to restrict the search to; defaults to the
                  prefix of the record's own DOI
        issn   -- only forwarded to the retry call; not used otherwise here
        retry  -- attempts left after an exception; unlimited retries made a
                  repeating error recurse up to the recursion limit
        Returns the DOI, or "" when none could be confirmed.'''
        # Empty tags yield None; normalise so string operations cannot fail.
        title = self.gettitle(num) or ""
        doi = DOI(self.getdoi(num))
        if (not prefix):
            prefix = doi.split('/', 1)[0] if doi else ""
        volume = self.getvolume(num) or ""
        journal = self.getjournalfull(num) or ""
        year = self.getyear(num) or ""
        pages = self.getpages(num) or ""
        self.cr = CRrecord()
        try:
            # The origin doi maybe true. Find in crossref
            if (doi and self.cr.getfromdoi(doi, fullparse=False)
                    and self.cr.doi):
                # Further check title
                if (strdiff(doi, self.cr.doi) >= 0.85 and
                        strsimilarity(normalizeString(title),
                                      normalizeString(self.cr.title)) > 0.75):
                    return doi
                # Title differs: accept on first page + volume, or + year.
                if (pages and (volume or year)):
                    samestart = pages.split('-')[0] == self.cr.pages.split('-')[0]
                    if (samestart and volume and volume == self.cr.volume):
                        return doi
                    if (samestart and year and year == self.cr.year):
                        return doi
                print("Origin DOI: %s may be true but record strange..Try title" % (doi,))

            keyword = title + " " + journal + " " + year + " " + pages + " " + volume
            if (self.cr.getfromtitledoi(keyword,
                                        doi,
                                        year=year,
                                        limit=10,
                                        fullparse=False,
                                        prefix=prefix)):
                if (not doi):
                    return self.cr.doi
                if (prefix == self.cr.doi.split('/')[0]
                        and strdiff(doi, self.cr.doi) >= 0.85):
                    return self.cr.doi
                print("Error for origin doi: " + doi + "; found: " + self.cr.doi)
                return ""
            if (doi):
                if (strdiff(doi, self.cr.doi) >= 0.85):
                    return self.cr.doi
                print("Error2 for origin doi: " + doi + "; found: " + self.cr.doi)
            return ""
        except Exception as e:
            if (retry <= 0):
                print("Error when find doi.. %s \nGive up." % (e,))
                return ""
            print("Error when find doi.. %s \nRetry..." % (e,))
            return self.finddoi(num, prefix=prefix, issn=issn, retry=retry - 1)

    def preprocess(self):
        pass

    def cleannote(self, num):
        '''Cut the note of record ``num`` down to its lower-cased text from
        the first "time" on; notes without "time" (or empty) are kept.'''
        note = self.getnotes(num)
        if (not note):
            return
        notel = note.lower()
        if ("time" in notel):
            self.setnotes(num, notel[notel.find('time'):])

    def cleanallpdf(self, exceptOAPDF=True):
        '''Clean PDF record or except OAPDF record

        exceptOAPDF=False removes every pdf link.  With exceptOAPDF=True a
        record that has an OAPDF link keeps only the first such link;
        records without one are left untouched.'''
        for i in range(self.length):
            if (not exceptOAPDF):
                # An empty list leaves no <url> at all, not an empty one.
                self.setpdfs(i, [])
                continue
            for pdf in self.getpdf(i):
                if pdf and self._OAPDF_PDF in pdf:
                    self.setpdf(i, pdf)
                    break

    def _alreadyprocessed(self, num, links, marker):
        '''Return True when one of ``links`` contains ``marker``.

        On a hit, a DOI field starting with "chk:" is rewritten in the
        normalised "chk: <doi>" form.'''
        for link in links:
            if marker in link:
                doistr = self.gettag(num, "electronic-resource-num")
                if (doistr and len(doistr) > 4 and doistr[:4] == 'chk:'):
                    doi = DOI(self.getdoi(num))
                    if doi:
                        self.setdoi(num, "chk: " + doi)
                return True
        return False

    def _relocatepdf(self, pdf, doi, epath):
        '''Rename the file behind link ``pdf`` to "<quoted doi>.pdf" in the
        PDF folder below ``epath``; return the link to store afterwards
        (the unchanged ``pdf`` whenever nothing was renamed).'''
        pdfroot = epath + os.sep + "PDF" + os.sep
        pdfpath = pdf.replace("internal-pdf://", pdfroot)
        relpath = pdf.replace("internal-pdf://", "")
        target = doi.quote() + ".pdf"
        # should never happen
        if (relpath == target):
            return pdf
        if (not doi):
            print("Blank doi for file: %s" % (pdf,))
            return pdf
        if (os.path.exists(pdfpath)):
            try:
                os.renames(pdfpath, pdfroot + target)
            except Exception:
                print("Can't rename: %s to %s" % (pdf, target))
                return pdf
            return "internal-pdf://" + target
        # The linked file is gone: if its folder holds exactly one pdf,
        # take that one.  Each outcome is reported as one complete line.
        notice = "Maybe error for the record %s with pdf path: %s ; Try finding.." % (doi, pdf)
        pdfdir = os.path.split(pdfpath)[0]
        if (not os.path.exists(pdfdir)):
            print(notice)
            return pdf
        fs = glob.glob(pdfdir + os.sep + '*.pdf')
        if (len(fs) != 1):
            print(notice + " Can't find..")
            return pdf
        try:
            os.renames(fs[0], pdfroot + target)
        except Exception:
            print(notice + " Can't rename: %s to %s" % (fs[0], target))
            return pdf
        print(notice + " Find %s and rename!" % (fs[0],))
        return "internal-pdf://" + target

    def process(self, fname="", cleannote=False, prefix='', issn='', start=0):
        '''Confirm the DOI of every record from ``start`` on, rename its
        pdf files after the DOI and add the OAPDF links.

        Records that already carry an OAPDF pdf link or url are skipped.
        A confirmed DOI is stored as "chk: <doi>".  A failure on one record
        is printed and does not stop the run.

        fname     -- file to write the result to ("" = do not write)
        cleannote -- also run cleannote() on each processed record
        prefix, issn -- passed on to finddoi()
        Returns 0.'''
        epath = self.getpath()
        print("Output %s to %s" % (self.length, epath + os.sep + fname))
        for i in range(start, self.length):
            try:
                pdfs = self.getpdf(i)
                urls = self.geturl(i)
                # Fast consider as record process before
                if (self._alreadyprocessed(i, pdfs, self._OAPDF_PDF)
                        or self._alreadyprocessed(i, urls, self._OAPDF_URL)):
                    continue

                if (cleannote):
                    self.cleannote(i)

                doistr = self.gettag(i, "electronic-resource-num")
                if (doistr and len(doistr) > 4 and doistr[:4] == 'chk:'):
                    doi = DOI(self.getdoi(i))
                else:
                    doi = DOI(self.finddoi(i, prefix=prefix, issn=issn))
                    if doi:
                        self.setdoi(i, "chk: " + doi)
                oapdflink = ""
                if (doi and doi.is_oapdf()):
                    oapdflink = "http://oapdf.sourceforge.net/cgi-bin/doipage.cgi?doi=" + doi

                newpdfs = [self._relocatepdf(pdf, doi, epath) for pdf in pdfs]
                if (oapdflink):
                    newpdfs.append(self._OAPDF_PDF + doi.quote() + ".pdf")
                self.setpdfs(i, newpdfs)
                # Set the urls
                if (oapdflink and oapdflink not in urls):
                    self.addurl(i, oapdflink, first=True)
            except Exception as e:
                print("Error at  %s since:  %s" % (i, e))
                #return 1
        if fname:
            self.write(fname)
        return 0

Example #9

Show file

File: endnotexml.py Project: OAPDF/oapdftools

class EndnoteXML(object):
	def __init__(self,fname):
		if (fname):
			f=open(fname)
			self.content=re.sub(r'</?style.*?>','',f.read())
			f.close()
		else:
			self.content=""
		self.soup=BeautifulSoup(self.content,'html.parser')
		self.records=self.soup.records.contents
		self.length=len(self.records)
		
		for i in range(self.length):
			self.checktag(i,'titles')
			self.checktag(i,'authors')
			self.checktag(i,'urls')
			if (self.records[i].find('related-urls') is None):
				self.addtag(i,'related-urls','',parent='urls')
			if (self.records[i].find('pdf-urls') is None):
				self.addtag(i,'pdf-urls','',parent='urls')			
			self.checktag(i,'dates')
			self.setdoi(i,self.getdoi(i))

	#def __repr__(self):
	#	return self.soup.encode()

	def __str__(self):
		return self.soup.encode()

	def reset(self,fname):
		self.__init__(fname)

	def read(self,fname):
		self.__init__(fname)

	def reads(self,s):
		self.content=s
		self.soup=BeautifulSoup(self.content,'html.parser')
		self.records=self.soup.records.contents
		self.length=len(self.records)
		for i in range(self.length):
			self.checktag(i,'titles')
			self.checktag(i,'authors')
			self.checktag(i,'urls')
			if (self.records[i].find('related-urls') is None):
				self.addtag(i,'related-urls','',parent='urls')
			if (self.records[i].find('pdf-urls') is None):
				self.addtag(i,'pdf-urls','',parent='urls')
			self.checktag(i,'dates')
			self.setdoi(i,self.getdoi(i))

	def writes(self,encoding='utf-8'):
		return self.soup.encode(encoding=encoding)

	def write(self,fname,encoding='utf-8'):
		f=open(fname,'w')
		f.write(self.writes(encoding=encoding))
		f.close()

	def getrecord(self,num):
		if (num>=self.length):
			return None
		return self.records[num]

	def checktag(self,num,tag):
		if self.records[num].find(tag) is None:
			self.addtag(num,tag,value='')

	def addtag(self,num,tag,value=None,parent=None):
		'''value can be string, tag'''
		a=self.soup.new_tag(tag)
		if value: a.string=value
		if parent:
			self.records[num].find(parent).append(a)
		else:
			self.records[num].append(a)

	def gettag(self,num,tag,parent=None,obj=False):
		if parent:
			if self.records[num].find(parent):
				if self.records[num].find(parent).find(tag):
					if (obj):
						return self.records[num].find(parent).find(tag)
					else:
						return self.records[num].find(parent).find(tag).string
				else:
					return ''
			else:
				return ''
		else:
			if self.records[num].find(tag):
				if (obj):
					return self.records[num].find(tag)
				else:
					return self.records[num].find(tag).string
			else:
				return ''

	def settag(self,num,tag,value,parent=None):
		if parent:
			if self.records[num].find(parent):
				if self.records[num].find(parent).find(tag):
					self.records[num].find(parent).find(tag).string=value
				else:
					self.addtag(num,tag,parent=parent,value=value)
			else:
				a=self.soup.new_tag(tag)
				a.string=value
				self.addtag(num,parent,parent=None,value=a)
		else:
			if self.records[num].find(tag):
				self.records[num].find(tag).string=value
			else:
				self.addtag(num,tag,parent=None,value=value)	

	def getpath(self):
		db=self.soup.findChild("database")
		if (db):
			return os.path.splitext(db['path'])[0]+'.Data'
		else:
			return ""

	def getdoi(self,num):
		doistr=self.gettag(num,"electronic-resource-num")
		if (doistr):
			doiindex=doistr.find('10.')
		else:
			doiindex=-1
		if (doiindex >=0):
			return doistr[doiindex:].lower().strip()
		else:
			return ""

	def setdoi(self,num,value):
		self.settag(num,"electronic-resource-num",value)

	def gettitle(self,num):
		return self.gettag(num,"title")

	def settitle(self,num,value):
		self.settag(num,"title",value)

	def getjournalfull(self,num):
		return self.gettag(num,'secondary-title')

	def getyear(self,num):
		return self.gettag(num,'year','dates')

	def setyear(self,num,value):
		self.settag(num,'year',value,'dates')

	def getvolume(self,num):
		return self.gettag(num,'volume')

	def setvolume(self,num,value):
		self.settag(num,'volume',value)

	def getissue(self,num):
		return self.gettag(num,'number')

	def setissue(self,num,value):
		self.settag(num,'number',value)

	def getpages(self,num):
		return self.gettag(num,'pages')

	def setpages(self,num,value):
		self.settag(num,'pages',value)

	def getnotes(self,num):
		return self.gettag(num,'notes')

	def setnotes(self,num,value):
		self.settag(num,'notes',value)

	def geturl(self,num):
		urls=self.gettag(num,'related-urls',obj=True)
		if (urls):
			return [ i.string for i in urls.find_all('url') ]
		else:
			return []

	def seturl(self,num,value):
		'''Note that it will clean all the url!'''
		if (self.soup.find('related-urls') is not None):
			urls=self.gettag(num,'related-urls',obj=True)
			if (urls):
				urls.clear()
		else:
			self.addtag(num,'related-urls',parent='urls')
		self.addtag(num,'url',value,'related-urls')

	def addurl(self,num,value,first=False):
		urls=self.gettag(num,'related-urls',obj=True)
		a=self.soup.new_tag('url')
		a.string=value
		if (urls):
			if (not first):
				urls.append(a)
			else:
				urls.insert(0,a)
		else:
			self.settag(num,'related-urls',a,'urls')

	def getpdf(self,num):
		urls=self.gettag(num,'pdf-urls',obj=True)
		if (urls):
			return [ i.string for i in urls.find_all('url') ]
		else:
			return []

	def setpdf(self,num,value):
		'''Note that it will clean all the url!'''
		if (self.soup.find('pdf-urls') is not None):
			urls=self.gettag(num,'pdf-urls',obj=True)
			if (urls):
				urls.clear()
		else:
			self.addtag(num,'pdf-urls',parent='urls')
		self.addtag(num,'url',value,'pdf-urls')

	def setpdfs(self,num,value):
		'''Note that it will clean all the url!'''
		if (self.soup.find('pdf-urls') is not None):
			urls=self.gettag(num,'pdf-urls',obj=True)
			if (urls):
				urls.clear()
		else:
			self.addtag(num,'pdf-urls',parent='urls')
		for url in value:
			self.addtag(num,'url',url,'pdf-urls')

	def addpdf(self,num,value,first=False):
		urls=self.gettag(num,'pdf-urls',obj=True)
		a=self.soup.new_tag('url')
		a.string=value
		if (urls):
			if (not first):
				urls.append(a)
			else:
				urls.insert(0,a)
		else:
			self.addtag(num,'pdf-urls',a,'urls')

	def finddoi(self,num,prefix='',issn=''):
		title=self.gettitle(num)
		doi=DOI(self.getdoi(num))
		if (not prefix):
			prefix = doi.split('/',1)[0] if doi else ""
		volume= self.getvolume(num)
		journal=self.getjournalfull(num)
		year=self.getyear(num) 
		pages=self.getpages(num)
		self.cr=CRrecord()
		try:
			# The origin doi maybe true. Find in crossref
			if ( doi and self.cr.getfromdoi(doi,fullparse=False) and self.cr.doi):
				# Further check title
				if (strdiff(doi,self.cr.doi)>=0.85 and \
				strsimilarity(normalizeString(title),normalizeString(self.cr.title))>0.75):
					return doi
				if( volume and pages ):
					ops=pages.split('-')
					crps=self.cr.pages.split('-')
					if (len(ops)>0 and len(crps)>0 and ops[0]==crps[0] and volume==self.cr.volume):
						return doi
				if( year and pages ):
					ops=pages.split('-')
					crps=self.cr.pages.split('-')
					if (len(ops)>0 and len(crps)>0 and ops[0]==crps[0] and year==self.cr.year):
						return doi
				print "Origin DOI:",doi,"may be true but record strange..Try title"

			keyword=title+" "+journal+" "+year+" "+pages+" "+volume
			if (self.cr.getfromtitledoi(keyword,doi,year=year,limit=10,fullparse=False,prefix=prefix)):
				if (doi):
					if( prefix == self.cr.doi.split('/')[0] and strdiff(doi,self.cr.doi)>=0.85):
						return self.cr.doi
					else:
						print "Error for origin doi: "+doi+"; found: "+self.cr.doi
						return ""
				return self.cr.doi
			if (doi):
				if( strdiff(doi,self.cr.doi)>=0.85):
					return self.cr.doi
				else:
					print "Error2 for origin doi: "+doi+"; found: "+self.cr.doi
					return ""
			else:
				return ""
		except Exception as e:
			print "Error when find doi..",e,"\nRetry..."
			return self.finddoi(num,prefix=prefix,issn=issn)

	def preprocess(self):
		'''No-op placeholder: does nothing and returns None.'''
		return None

	def cleannote(self,num):
		'''Cut the notes of entry num down to the part starting at "time".

		The note is lower-cased; when it contains "time", everything from the
		first occurrence on (in lower case) is stored back with setnotes().
		Entries whose note is empty or None are left untouched instead of
		raising on None.lower().'''
		note=self.getnotes(num)
		if (not note):
			return
		notel=note.lower()
		pos=notel.find('time')
		if (pos>=0):
			self.setnotes(num,notel[pos:])

	def cleanallpdf(self,exceptOAPDF=True):
		'''Clean PDF record or except OAPDF record

		exceptOAPDF false: every entry gets setpdf(i,'').
		exceptOAPDF true: an entry whose pdf list holds an
		"internal-pdf://OAPDF/" link is reset to the first such link; entries
		without one are not touched.'''
		marker="internal-pdf://OAPDF/"
		for idx in range(self.length):
			if (not exceptOAPDF):
				self.setpdf(idx,'')
				continue
			keep=next((link for link in self.getpdf(idx) if marker in link),None)
			if (keep is not None):
				self.setpdf(idx,keep)

	def process(self,fname="",cleannote=False,prefix='',issn='',start=0):
		epath=self.getpath()
		print "Output",self.length,"to",epath+os.sep+fname
		for i in range(start,self.length):
			try:
				#if (i%100 is 0):
				#	print
				#	print "Doing:",i+1,
				#else:
				#	print i+1,

				pdfs=self.getpdf(i)
				urls=self.geturl(i)
				# Fast consider as record process before
				hasfound=False
				for pdf in pdfs:
					if "internal-pdf://OAPDF/" in pdf:
						hasfound=True
						doistr=self.gettag(i,"electronic-resource-num")
						if (doistr and len(doistr)>4 and doistr[:4]=='chk:'):
							doi=DOI(self.getdoi(i))
							if doi:
								self.setdoi(i,"chk: "+doi)
						break
						
				if not hasfound:
					for url in urls:
						if "http://oapdf.sourceforge.net/cgi-bin/" in url:
							hasfound=True
							doistr=self.gettag(i,"electronic-resource-num")
							if (doistr and len(doistr)>4 and doistr[:4]=='chk:'):
								doi=DOI(self.getdoi(i))
								if doi:
									self.setdoi(i,"chk: "+doi)
							break
				if hasfound:
					continue

				if (cleannote):
					self.cleannote(i)

				doistr=self.gettag(i,"electronic-resource-num")
				if (doistr and len(doistr)>4 and doistr[:4]=='chk:'):
					doi=DOI(self.getdoi(i))
				else:
					doi=DOI(self.finddoi(i,prefix=prefix,issn=issn))
					if doi:
						self.setdoi(i,"chk: "+doi)
				oapdflink=""
				if (doi and doi.is_oapdf()):
					oapdflink="http://oapdf.sourceforge.net/cgi-bin/doipage.cgi?doi="+doi

				newpdfs=[]
				for pdf in pdfs:
					pdfpath=pdf.replace("internal-pdf://",epath+os.sep+"PDF"+os.sep)
					relpath=pdf.replace("internal-pdf://","")
					# should never happen
					if (relpath == doi.quote()+".pdf"):
						newpdfs.append(pdf)
						continue
					if (doi):
						if (os.path.exists(pdfpath)):
							try:
								os.renames(pdfpath,epath+os.sep+"PDF"+os.sep+doi.quote()+".pdf")
								newpdfs.append("internal-pdf://"+doi.quote()+".pdf")
							except:
								print "Can't rename:",pdf,'to',doi.quote()+".pdf"
								newpdfs.append(pdf)
								continue
						else:
							print "Maybe error for the record",doi,"with pdf path:",pdf,'; Try finding..',
							pdfdir=os.path.split(pdfpath)[0]
							if (os.path.exists(pdfdir)):
								fs=glob.glob(pdfdir+os.sep+'*.pdf')
								if (len(fs)==1):
									try:
										os.renames(fs[0],epath+os.sep+"PDF"+os.sep+doi.quote()+".pdf")
										newpdfs.append("internal-pdf://"+doi.quote()+".pdf")
										print "Find",fs[0],'and rename!'
									except:
										print "Can't rename:",fs[0],'to',doi.quote()+".pdf"
										newpdfs.append(pdf)
										continue
								else:
									print "Can't find.."
									newpdfs.append(pdf)
									continue
							else:
								newpdfs.append(pdf)
								continue
					else:
						print "Blank doi for file:",pdf
						newpdfs.append(pdf)
						continue
				if (oapdflink):
					newpdfs.append("internal-pdf://OAPDF/"+doi.quote()+".pdf")
				self.setpdfs(i,newpdfs)
				# Set the urls
				if (oapdflink and oapdflink not in urls):
					self.addurl(i,oapdflink,first=True)
			except Exception as e:
				print "Error at ", i, 'since: ',e
				#return 1
		if fname:
			self.write(fname)
		return 0

Example #10

Show file

	def findPDFbyISSN(self,issn,maxresult=None, step=100, offset=0, 
		usedoi=True,doifilter=None,onlinecheck=True,savestate=None,proxy=None,usebdcheck=True):
		'''Find PDF by ISSN based on search result from crossref

		issn: an ISSN of the form "NNNN-NNNN" (queried under /journals/) or a
			string containing "10." (queried under /prefixes/). Returns None
			at once when empty; any other form prints an error and calls
			sys.exit(1).
		maxresult: upper bound used to compute the number of rounds; falsy,
			non-positive or too large values fall back to the crossref total.
		step: rows requested per round.
		offset: offset of the first requested row.
		usedoi: append the DOI to the search keyword.
		doifilter, onlinecheck, savestate: passed through to self.getallpdf().
		proxy: passed through to self.search().
		usebdcheck: skip DOIs reported by BDCheck.filterdois() and pass the
			flag on to self.getallpdf().

		Rounds whose HTTP status is not 200 are skipped silently.'''
		# may be improve to not only issn..
		if (not issn):return
		if (len(issn)==9 and issn[4]=='-'):
			needurl="http://api.crossref.org/journals/"+issn+"/works"
		elif('10.' in issn):
			needurl="http://api.crossref.org/prefixes/"+issn+"/works"
		else:
			print "Error ISSN/prefix"
			sys.exit(1)
		cr=CRrecord()
		total=cr.gettotalresultfromlink(needurl)
		if (not maxresult or maxresult <=0 or maxresult>total): 
			maxresult=total
		params={"rows":str(step)}
		# Explicit floor division: range() below needs an integer
		maxround=(maxresult-offset)//step+1
		offsetcount=offset
		bdcheck=BDCheck()

		for i in range(maxround):
			params["offset"]=str(step*i+offset)
			r=requests.get(needurl,params,timeout=timeout_setting_download)
			# Compare the status by value; "is" tests object identity only
			if (r.status_code == 200):
				# Get all check/in oapdf 
				if usebdcheck: 
					bdcheckall=bdcheck.filterdois(r.json(),oapdf=1,crjson=True)

				for j in r.json().get('message',{}).get('items',[]):
					keyword=j.get('title',[''])
					doi=DOI(j.get("DOI",""))
					if not doi:
						offsetcount+=1
						time.sleep(2)
						continue

					# Check whether in bdcheck
					if (usebdcheck and doi in bdcheckall):
						print doi, 'has search/oapdf/free by bdcheck'
						offsetcount+=1
						time.sleep(1)
						continue

					# If not in bdcheck, check oapdf/free and set it
					# TODO: remove it after combine oapdf information to library
					oapdffree=bdcheck.setbycheck(doi)
					if (oapdffree[0] or oapdffree[1]):
						print doi,'exist in oapdf/free library..'
						offsetcount+=1
						time.sleep(1)
						continue

					# 'title' is a list in the crossref item; use its first element
					if (keyword): 
						keyword=keyword[0]
					else:
						time.sleep(2)
						offsetcount+=1
						continue
					if usedoi:keyword+=" "+doi
					print "#####################################",offsetcount,"####################################"
					print "## Now finding for doi with title:"+doi+" "+ keyword.encode('utf-8')+"............"
					sys.stdout.flush()
					self.search(keyword.encode('utf-8'),proxy=proxy)
					self.getallpdf(doifilter,onlinecheck=onlinecheck,savestate=savestate,usebdcheck=usebdcheck)
					bdcheck.set(doi)
					offsetcount+=1
			gc.collect()
		print "End of process for",issn

Example #11

Show file

File: pdfdoicheck.py Project: OAPDF/oapdftools

	def renamecheck(self,fname,wtitle=0.65,cutoff=0.85,justcheck=False,resetfile=True,fdoi=None,excludedoi=None, fobj=None):
		'''A complex function to get doi from file name, 
		check in crossref, check in pdf file, rename it!
		just check can cancel move file'''
		### Result back:
		# 0: Done 
		# 1: High
		# 2: Unsure
		# 3: Untitle
		# 4: Fail
		# 5: Page0
		# 6: ErrorDOI
		# 10: Unknow


		if (resetfile and isinstance(fobj,(file,StringIO))):
			self.reset(fname="",fobj=fobj)
			fname="None"

		# len(self.doi) is 1 and len(self.doi - excludedoi) is 1 : 
		# :: First Run and perform check
		# len(self.doi) is 1 or len(self.doi - excludedoi) is 1 :
		if (not fname and not fdoi):
			print "No given file name or doi! (Return 6)"
			return 6

		if (fname and not fdoi and excludedoi):
			print "What do you want?! No excludedoi set by user! (Return 9)"
			return 9

		
		if (resetfile and fname !="None"): 
			self.reset(fname)
		elif(resetfile and not isinstance(fobj,(file,StringIO))):
			print "Use reset file but no file name/object is given!"
			return 9

		if (self.maxpage == 0):
			if not justcheck: 
				self.moveresult(5, printstr="Error Page 0 (Page0, R5): "+self._fname)
			return 5

		if (not excludedoi):
			excludedoi=set()

		if (not fdoi):
			#File obj is ""
			fdoi=DOI(os.path.splitext(os.path.basename(self._fname))[0])
		else:
			fdoi=DOI(fdoi)

		recursive= (len(excludedoi) > 0)
		# If in recursive, don't move file!
		if recursive: justcheck=True

		if resetfile and not recursive:
			self.realdoi=fdoi

		# Only find DOI in first time!
		if (not recursive and fdoi): 
			self.finddoi(1)
		elif (not recursive and not fdoi):
			self.finddoi(set([1,2,self.maxpage]))

		# file doi is shit..Recursively use doi in file or fail
		if (not fdoi and not recursive):
			if (len(self.doi) is 1 or len(self.doi) is 2):
				print "Origin fdoi wrong but has 1~2 dois in file:",self._fname,
				return self.recursivedoicheck(excludedoi,olddoi=fdoi,wtitle=wtitle,cutoff=cutoff,justcheck=justcheck)
			# No doi or >2 dois in file
			else:
				if not justcheck: 
					self.moveresult(4,printstr="Error fdoi and 0/too much doi. (Fail): "+self._fname)
				return 4
		elif (not fdoi and recursive):
			print "doi (in recursion) may wrong with error doi. Should never happen.."
			return 4 # Fail

		# fdoi is ok
		cr=CRrecord()
		try:
			cr=cr.valid_doi(fdoi,fullparse=True)
		except requests.exceptions.RequestException as e:
			print e
			cr=None
		except Exception as e:
			print e
			cr=None		

		# Error when year=None, improve in crrecord.
		#if (cr and not cr.year):
		#	cr.year='8888'	

		#crossref is ok
		if (fdoi and cr):
			totalpagenumber=1
			try:
				totalpagenumber=self.totalpages(cr.pages)
			except ValueError as e:
				# should never happen now
				print e, cr.pages

			totalpagewrong=False
			#print "pages:",self.maxpage,' in crossref:',cr.pages,totalpagenumber
			if totalpagenumber>0 and not (self.maxpage >= totalpagenumber and self.maxpage <= totalpagenumber+2):
				totalpagewrong=True
				# When paper with supporting information
				if (self.maxpage > totalpagenumber+2):
					self.finddoi(page=2)
					if (self.withSI or (self.findtext('Supporting Information', page=[totalpagenumber+1,totalpagenumber+2])
						and self.findtext(cr.title, similarity=0.75, page=[totalpagenumber+1,totalpagenumber+2]))):
						if not recursive : self.finddoi(totalpagenumber);
						self.withSI=True
						totalpagewrong=False
					# For NIH Public Access
					elif (self.hascontent("NIH Public Access")[0]):
						totalpagewrong=False
					#Such as some Nature with SI in paper without notify.
					elif (self.withSI or (totalpagenumber>1 and self.findtext("acknowledgment", page=[totalpagenumber-1, totalpagenumber]) 
						and self.findtext("reference", page=[totalpagenumber-1, totalpagenumber]))):
						self.withSI=True
						totalpagewrong=False

			# Recursive but total page wrong. Fast end recursivedoicheck
			if (totalpagewrong and recursive):	
				return 4

			# Just check first page, not find(find before..), faster:
			doivalid=self.checkdoi(fdoi,page=1,iterfind=False,justcheck=True)
			titleeval=self.checktitle(cr.title)
			if (totalpagenumber > 0 and not totalpagewrong):
				if (doivalid and titleeval[0] and len(self.doi) is 1):
					# Yes! Very Good PDF!
					self.realdoi=fdoi
					if not justcheck: 
						if (self.maxpage>=2 and self.maxpage == totalpagenumber and
							not self.findtext('Supporting Information', page=[1])):
							self.moveresult(0,good=True)
						else:
							self.moveresult(0)
					return 0

			# Further check doi in page2/last, Finally, will check 1,2 and last pages.
			if (recursive):
				doivalid= ( self.checkdoi(fdoi,page=2,iterfind=True,justcheck=True) or doivalid )
			else:
				doivalid= ( self.checkdoi(fdoi,page=2,iterfind=True) or doivalid )

			if len(self.doi)>3:
				# Too much doi may be some abstract
				self.moveresult(2,printstr='Has more than 3 dois! (Unsure):'+self._fname)
				return 2

			# Page wrong and try recursive use doi
			if (totalpagewrong):
				if (len(self.doi) is 1 or len(self.doi) is 2):
					doi=DOI(list(self.doi)[0])
					# DOI in file is same so error. Don't need recursive
					if (len(self.doi) is 1 and doi == fdoi):
						if not justcheck: 
							self.moveresult(4,printstr="PDF Page "+str(self.maxpage)+"!="+str(totalpagenumber)+"(Fail): "+self._fname)
						return 4

					print 'Wrong total page with dois in file,',self._fname,fdoi,',try recursive'
					return self.recursivedoicheck(excludedoi,olddoi=fdoi,wtitle=wtitle,cutoff=cutoff,justcheck=justcheck)
				else:
					if not justcheck: 
						self.moveresult(4,printstr="PDF Page "+str(self.maxpage)+"!="+str(totalpagenumber)+"(Fail): "+self._fname)
					return 4

			if (not totalpagewrong):
				crscore=self.scorefitting(cr)
				if (self.maxpage <= totalpagenumber+2): 
					# Maybe check when maxpage >total+2
					titleeval=self.checktitle(cr.title)
				if cr.title.strip()=="":
					titleeval=(False,0.9)
				titlevalid=titleeval[0]
				try:
					paperyear=int(cr.year)
				except:
					paperyear=9999
				try:
					# Too old maybe lost information
					if (paperyear>1990):
						titlevalid=titlevalid or (titleeval[1]*wtitle+crscore['total'])>=cutoff
					else:
						titlevalid=titlevalid or (titleeval[1]*wtitle+crscore['total'])>=cutoff-0.1
				#(self.checktitle(cr.title,similarity=0.85) and self.checkcrossref(cr))
				except Exception as e:
					print e

				if (doivalid):
					if (titlevalid):					
						# Yes! Good PDF!
						self.realdoi=fdoi
						if not justcheck: 
							if (self.maxpage>=2 and self.maxpage == totalpagenumber 
								and len(self.doi) is 1 and
								not self.findtext('Supporting Information', page=[1])):
								self.moveresult(0,good=True)
							else:
								self.moveresult(0)
						return 0

					print "Title/Paper score:",titleeval[1],crscore,self._fname
					if (len(self.doi - set([fdoi])) == 1 and not recursive):
						
						# Try one more
						newresult = self.recursivedoicheck(excludedoi,olddoi=fdoi,wtitle=wtitle,cutoff=cutoff,justcheck=True)
						if (newresult is 0):
							newdoi=DOI(list(self.doi - set([fdoi]))[0])
							self.realdoi=newdoi
							print 
							if not justcheck: self.moveresult(0, 
								printstr="(Rename)fdoi ok, but not title. In file doi "+newdoi+" is better for "+self._fname,
								newfname=newdoi.quote()+".pdf")
							return 0

					# Else DOI ok but not title
					if not justcheck: 
						self.moveresult(3,printstr="OK fdoi but not title(Untitle): "+self._fname)
					return 3
				
				# Indeed, doi maybe in pdf, but strange format..
				if (self.checkdoinormaltxt(fdoi)):
					if (titlevalid):
						# Further check only when title OK
						if (self.checkdoifurther(fdoi)):
							# Fine! move to Done dir
							if not justcheck: 
								if (self.maxpage>=2 and self.maxpage == totalpagenumber 
								and len(self.doi) is 1 and
								not self.findtext('Supporting Information', page=[1])):
									self.moveresult(0,good=True)
								else:
									self.moveresult(0)
							return 0
						else:
							# Can't find, but high similar! move to High dir
							if not justcheck: 
								self.moveresult(1,printstr="OK title and nospacebreak doi,but not pass(High): "+self._fname)
							return 1
					else:
						# DOI ok but not title
						print "Title/Paper score:",titleeval[1],crscore,self._fname
						if not justcheck: 
							self.moveresult(3,printstr="Maybe OK fdoi but not title(Untitle): "+self._fname)
						return 3						

				# DOI maybe not exist ....
				if (titlevalid):
					tmpdois=set(self.doi)
					for d in tmpdois:
						dd=DOI(d)
						if ( not dd.valid_doiorg(geturl=False) ):
							self.doi.remove(d)
							
					# Old paper don't have doi...
					if len(self.doi) is 0 and totalpagenumber>0:
						if (crscore['total'] >= 0.4):
							if not justcheck: 
								if (self.maxpage>=2 and self.maxpage == totalpagenumber and
								not self.findtext('Supporting Information', page=[1])):
									self.moveresult(0,good=True)
								else:
									self.moveresult(0)
							return 0
						elif (titleeval[1]>=0.85 and crscore['total'] >= 0.35):
							if not justcheck: 
								if (self.maxpage>=2 and self.maxpage == totalpagenumber and
								not self.findtext('Supporting Information', page=[1])):
									self.moveresult(0,good=True)
								else:
									self.moveresult(0)								
							return 0
						elif (titleeval[1]>=0.95 and crscore['total'] >=0.3):
							if not justcheck: 
								if (self.maxpage>=2 and self.maxpage == totalpagenumber and
								not self.findtext('Supporting Information', page=[1])):
									self.moveresult(0,good=True)
								else:
									self.moveresult(0)
							return 0
						elif (titleeval[1]>=0.90 and crscore['pages']>=0.9 and crscore['year']>=0.9 and (crscore['journal']>=0.9 or crscore['issn']>=0.9)):
							if not justcheck: self.moveresult(0)
							return 0
						elif (titleeval[1]>=0.90 and crscore['pages']>=0.5 and crscore['year']>=0.9 and (crscore['journal']>=0.9 or crscore['issn']>=0.9) and crscore['authors']>=0.7):
							if not justcheck: self.moveresult(0)
							return 0
						elif (titleeval[1]>=0.75 or crscore['total'] >=0.25):
							print "Title/Paper score:",titleeval[1],crscore,self._fname
							if not justcheck: 
								self.moveresult(1,printstr="OK title and high info fit. But no doi(Highly): "+self._fname)
								return 1
						else:
							print "Title/Paper score:",titleeval[1],crscore,self._fname
							if not justcheck: 
								self.moveresult(2,printstr="OK title and ok info fit. But no doi(Unsure): "+self._fname)
							return 2
					elif len(self.doi) is 0 and totalpagenumber== -1:
						if (titleeval[1]>=0.90 and crscore['pages']>=0.5 and crscore['year']>=0.9 and (crscore['journal']>=0.9 or crscore['issn']>=0.9) and crscore['authors']>=0.7):
							if not justcheck: self.moveresult(0)
							return 0
						else:
							print "Title/Paper score:",titleeval[1],crscore,self._fname
							if not justcheck: 
								self.moveresult(2,printstr="OK title and high info fit. But no doi and no total pages(Unsure): "+self._fname)
							return 2												
					elif len(self.doi) is 0 and totalpagenumber<=0:
						print "Title/Paper score:",titleeval[1],crscore,self._fname
						if not justcheck: 
							self.moveresult(2,printstr="OK title and high info fit. But no doi and no total pages(Unsure): "+self._fname)
						return 2
					elif ( len(self.doi) > 0 and not recursive):
						print "Good title but file doesn't contain fdoi, however it has >0 doi in file. "
						outnow=self.recursivedoicheck(excludedoi,olddoi=fdoi,wtitle=wtitle,cutoff=cutoff,justcheck=True)
						if outnow > 0:
							if not justcheck:
								self.moveresult(2,printstr="OK title but not fdoi. In file doi is not good(Unsure): "+self._fname)
							return 2
						elif(outnow==0):
							print 'Good Title but Fail fdoi. Paper has good in file doi,',self._fname,',try recursive'
							return self.recursivedoicheck(excludedoi,olddoi=fdoi,wtitle=wtitle,cutoff=cutoff,justcheck=justcheck)

					### Old method check old items:
					#if (self.checkcrossref(cr)):
					#	if (int(cr.year)<=1999 and len(self.doi) is 0):
					#		# Highly possible right
					#		if not justcheck: self.movetodir("High")
					#		return True
					#	  Bentham, often blank doi
					#	elif (fdoi[:8] == '10.2174/' and len(self.doi) is 0):
					#		if not justcheck: self.movetodir("Done")
					#		return True
					#	elif (len(self.doi) is 0):
					#		print "Title/Paper score:",titleeval[1],crscore,self._fname
					#		if not justcheck: 
					#			self.moveresult(1,printstr="OK title and high info fit. But no doi(Highly): "+self._fname)
					#		return 1							
					#	else:
					#		if not justcheck: 
					#			self.moveresult(2,printstr="OK title and high info fit. But doi exist not fit(Unsure): "+self._fname)
					#		return 2								
					#elif(len(self.doi) is 0):
					#	# Maybe wrong file and no doi
					#	if not justcheck: 
					#		self.moveresult(2,printstr="Not found doi in file but ok title (Unsure): "+self._fname)
					#	return 2

			#fdoi,title wrong, no doi in file
			# Or in recursive mode
			if (len(self.doi) is 0 or recursive):
				if not justcheck: 
					self.moveresult(4,printstr="Both fdoi and title wrong, no doi in file(Fail): "+self._fname)
				return 4

			# Indeed, file has only one more doi, not the same to fname
			if (len(self.doi - set([fdoi])) is 1 ):
				print 'Fail fdoi/title. Paper with one more doi in file,',self._fname,',try recursive'
				return self.recursivedoicheck(excludedoi,olddoi=fdoi,wtitle=wtitle,cutoff=cutoff,justcheck=justcheck)
			elif(len(self.doi) > 1):
				if not justcheck: 
					self.moveresult(4,printstr="fdoi/title fail. Too much infile doi(Fail): "+self._fname)
				return 4
			else:
				if not justcheck: 
					self.moveresult(4,printstr="What????? What?????(Fail):"+self._fname)
				return 4
		# not cr
		else:
			if (not recursive):
				self.finddoi(set([1,2,self.maxpage]))
				if (len(self.doi) is 1 or len(self.doi) is 2):
					print 'Error DOI filename,',self._fname,',try recursive'
					return self.recursivedoicheck(excludedoi,olddoi=fdoi,wtitle=wtitle,cutoff=cutoff,justcheck=justcheck)				
			if not justcheck: 
				self.moveresult(6,"Error DOI fname(Fail):"+self._fname)
			return 6