def removegarbage(self, fname=None, cutoff=0.85, fontsize=0, autotry=False, notdelete=False):
    '''Discard (or tag) PDFs that are patents or supporting-information files.

    The file's "big title" is fetched through self.getbigtitle(), lower-cased
    and stripped of spaces, then compared with strsimilarity() against a list
    of known headings. A heading counts as a hit when the similarity is at
    least 0.95.

    fname:     file to inspect; falls back to self._fname when empty.
    cutoff, fontsize, autotry: forwarded unchanged to self.getbigtitle().
    notdelete: when False the matching file is removed; when True it is
               renamed with an '@.Patent' / '@.SI' marker inserted before the
               extension and self._fname is updated (skipped when the name is
               the literal string "None").

    Returns 0 when no usable file name is available, 2 for a patent,
    3 for supporting information, and None when nothing matched.
    '''
    target = fname if fname else self._fname
    if not target or (target == "None" and not notdelete):
        print("No file name is set!")
        return 0

    bigtitle = self.getbigtitle(fname=target, cutoff=cutoff, fontsize=fontsize, autotry=autotry)
    squashed = bigtitle.lower().strip().replace(' ', '')

    # NOTE: a disabled (commented-out) Open Access step used to sit here; it
    # moved files titled 'NIH Public Access' into an 'OAPub/' folder.

    # (return code, rename marker, headings) -- checked in this order.
    categories = (
        (2, '@.Patent', ('EUROPEAN PATENT APPLICATION',
                         'EUROPEAN PATENT SPECIFICATION',
                         'United States Patent',
                         'AUSTRALIAN PATENT')),
        (3, '@.SI', ('Supporting Information',)),
    )
    for code, marker, headings in categories:
        for heading in headings:
            needle = heading.lower().strip().replace(" ", '')
            if strsimilarity(squashed, needle) >= 0.95:
                if not notdelete:
                    os.remove(target)
                elif target != "None":
                    stem, ext = os.path.splitext(target)
                    renamed = stem + marker + ext
                    os.renames(target, renamed)
                    self._fname = renamed
                return code
    return None
def finddoi(self, num, prefix='', issn='', maxretry=3):
    '''Look up a DOI for record `num` through CrossRef (self.cr).

    Strategy:
      1. If the record already carries a DOI and CrossRef resolves it, accept
         it when the title is similar enough, or when the first page agrees
         together with the volume or the year.
      2. Otherwise search CrossRef by title/journal/year/pages/volume and
         accept the hit only if it is consistent with the original DOI
         (when there is one).

    num:      record index handed to the self.get*() accessors.
    prefix:   DOI prefix to restrict the search; defaults to the prefix of
              the record's own DOI when empty.
    issn:     accepted for interface compatibility; not used by this method.
    maxretry: how many times to retry after an exception (default 3).
              The original implementation retried by unbounded recursion,
              so a deterministic failure ended in a recursion-limit error.

    Returns the DOI string, or "" when none could be confirmed or all
    attempts raised.
    '''
    # NOTE(review): another finddoi definition with the same logic exists in
    # this file; consider sharing a single implementation.
    title = self.gettitle(num)
    doi = DOI(self.getdoi(num))
    if not prefix:
        prefix = doi.split('/', 1)[0] if doi else ""
    volume = self.getvolume(num)
    journal = self.getjournalfull(num)
    year = self.getyear(num)
    pages = self.getpages(num)

    maxretry = max(0, maxretry)
    for attempt in range(maxretry + 1):
        # Rebuilt on every attempt, as the recursive retry used to do.
        self.cr = CRrecord()
        try:
            # The original DOI may be right: ask CrossRef about it first.
            if doi and self.cr.getfromdoi(doi, fullparse=False) and self.cr.doi:
                # Further check title
                if (strdiff(doi, self.cr.doi) >= 0.85 and
                        strsimilarity(normalizeString(title),
                                      normalizeString(self.cr.title)) > 0.75):
                    return doi
                if pages and (volume or year):
                    # Only the first page of a "first-last" range is compared;
                    # split() always yields at least one element.
                    samestart = pages.split('-')[0] == self.cr.pages.split('-')[0]
                    if samestart and ((volume and volume == self.cr.volume) or
                                      (year and year == self.cr.year)):
                        return doi
                print("Origin DOI: %s may be true but record strange..Try title" % (doi,))

            keyword = title + " " + journal + " " + year + " " + pages + " " + volume
            if self.cr.getfromtitledoi(keyword, doi, year=year, limit=10,
                                       fullparse=False, prefix=prefix):
                if not doi:
                    return self.cr.doi
                if prefix == self.cr.doi.split('/')[0] and strdiff(doi, self.cr.doi) >= 0.85:
                    return self.cr.doi
                print("Error for origin doi: " + doi + "; found: " + self.cr.doi)
                return ""

            if not doi:
                return ""
            if strdiff(doi, self.cr.doi) >= 0.85:
                return self.cr.doi
            print("Error2 for origin doi: " + doi + "; found: " + self.cr.doi)
            return ""
        except Exception as e:
            if attempt < maxretry:
                print("Error when find doi.. %s \nRetry..." % (e,))
            else:
                print("Error when find doi.. %s \nGive up after %d attempt(s)." % (e, attempt + 1))
    return ""
def hascontent(self, text, similarity=0.95, page=None, algorithm=2):
    '''Normalize text and find it in normalized pdf content found before.
    Normal use algorithm 2, for title use algorithm 3

    text:       text to look for; it is passed through normalizeString(),
                lower-cased and stripped of spaces before matching.
    similarity: threshold for strsimilarity(); a value >= 1.0 switches to an
                exact substring test.
    page:       1-based page number to search in self.normaltxt. A falsy or
                out-of-range value (<= 0 or > self.maxpage) searches all
                pages joined together.
    algorithm:  forwarded to strsimilarity().

    Returns a (found, score) tuple. Texts shorter than 3 characters are never
    matched; a 3-character text is matched exactly and scores at most 0.5.
    (False, 0.0) is returned when no file name is set, when `page` is a
    non-integer, or when matching raises.
    '''
    if not self._fname:
        print("File Name Not Set!!!")
        return (False, 0.0)

    text = normalizeString(text).lower().strip().replace(' ', '')
    if len(text) < 3:
        return (False, 0.0)

    try:
        if not page:
            haystack = ''.join(self.normaltxt)
        elif not isinstance(page, int):
            # Previously this case fell through and returned None, which
            # breaks callers that unpack the result.
            return (False, 0.0)
        elif page > self.maxpage or page <= 0:
            haystack = ''.join(self.normaltxt)
        else:
            haystack = self.normaltxt[page - 1]

        if len(text) == 3:
            # Exact match only, with the score halved.
            perfect = text in haystack
            return (perfect, float(perfect) / 2)
        if similarity < 1.0:
            sim = strsimilarity(haystack, text, algorithm=algorithm)
            return (sim >= similarity, sim)
        perfect = text in haystack
        return (perfect, float(perfect))
    except Exception:
        # Best-effort lookup: report and treat as "not found", but let
        # KeyboardInterrupt / SystemExit propagate.
        print("Something error for hascontent function: " + text)
        return (False, 0.0)
def finddoi(self, num, prefix='', issn='', maxretry=3):
    '''Look up a DOI for record `num` through CrossRef (self.cr).

    Strategy:
      1. If the record already carries a DOI and CrossRef resolves it, accept
         it when the title is similar enough, or when the first page agrees
         together with the volume or the year.
      2. Otherwise search CrossRef by title/journal/year/pages/volume and
         accept the hit only if it is consistent with the original DOI
         (when there is one).

    num:      record index handed to the self.get*() accessors.
    prefix:   DOI prefix to restrict the search; defaults to the prefix of
              the record's own DOI when empty.
    issn:     accepted for interface compatibility; not used by this method.
    maxretry: how many times to retry after an exception (default 3).
              The original implementation retried by unbounded recursion,
              so a deterministic failure ended in a recursion-limit error.

    Returns the DOI string, or "" when none could be confirmed or all
    attempts raised.
    '''
    # NOTE(review): another finddoi definition with the same logic exists in
    # this file; consider sharing a single implementation.
    title = self.gettitle(num)
    doi = DOI(self.getdoi(num))
    if not prefix:
        prefix = doi.split('/', 1)[0] if doi else ""
    volume = self.getvolume(num)
    journal = self.getjournalfull(num)
    year = self.getyear(num)
    pages = self.getpages(num)

    maxretry = max(0, maxretry)
    for attempt in range(maxretry + 1):
        # Rebuilt on every attempt, as the recursive retry used to do.
        self.cr = CRrecord()
        try:
            # The original DOI may be right: ask CrossRef about it first.
            if doi and self.cr.getfromdoi(doi, fullparse=False) and self.cr.doi:
                # Further check title
                if (strdiff(doi, self.cr.doi) >= 0.85 and
                        strsimilarity(normalizeString(title),
                                      normalizeString(self.cr.title)) > 0.75):
                    return doi
                if pages and (volume or year):
                    # Only the first page of a "first-last" range is compared;
                    # split() always yields at least one element.
                    samestart = pages.split('-')[0] == self.cr.pages.split('-')[0]
                    if samestart and ((volume and volume == self.cr.volume) or
                                      (year and year == self.cr.year)):
                        return doi
                print("Origin DOI: %s may be true but record strange..Try title" % (doi,))

            keyword = title + " " + journal + " " + year + " " + pages + " " + volume
            if self.cr.getfromtitledoi(keyword, doi, year=year, limit=10,
                                       fullparse=False, prefix=prefix):
                if not doi:
                    return self.cr.doi
                if prefix == self.cr.doi.split('/')[0] and strdiff(doi, self.cr.doi) >= 0.85:
                    return self.cr.doi
                print("Error for origin doi: " + doi + "; found: " + self.cr.doi)
                return ""

            if not doi:
                return ""
            if strdiff(doi, self.cr.doi) >= 0.85:
                return self.cr.doi
            print("Error2 for origin doi: " + doi + "; found: " + self.cr.doi)
            return ""
        except Exception as e:
            if attempt < maxretry:
                print("Error when find doi.. %s \nRetry..." % (e,))
            else:
                print("Error when find doi.. %s \nGive up after %d attempt(s)." % (e, attempt + 1))
    return ""