def moveissn(origin='.', prefixdir=True):
    '''Relocate DOI-named PDFs into their ISSN/Volume/Issue folder.

    origin    -- folder that is searched and that also roots the targets.
    prefixdir -- when true the PDFs are looked up one level down
                 (``10.*/10.*.pdf``); otherwise directly in ``origin``
                 (``10.*.pdf``).

    Every match is renamed to
    ``origin/<doi.prefix>/<getpdfdir(doi)><doi.quote()>.pdf``.
    NOTE(review): presumably ``getpdfdir`` returns a relative folder that
    already ends with a path separator -- confirm against its definition.
    '''
    if prefixdir:
        pattern = '10.*/10.*.pdf'
    else:
        pattern = '10.*.pdf'
    for pdfpath in glob.iglob(os.sep.join((origin, pattern))):
        doi = DOI(filebasename(pdfpath))
        newname = getpdfdir(doi) + doi.quote() + '.pdf'
        # os.renames creates any missing intermediate folders itself.
        os.renames(pdfpath, os.sep.join((origin, doi.prefix, newname)))
def moveissn(origin='.', prefixdir=True):
    '''Relocate DOI-named PDFs into their ISSN/Volume/Issue folder.

    origin    -- folder that is searched and that also roots the targets.
    prefixdir -- when true the PDFs are looked up one level down
                 (``10.*/10.*.pdf``); otherwise directly in ``origin``
                 (``10.*.pdf``).

    Every match is renamed to
    ``origin/<doi.prefix>/<getpdfdir(doi)><doi.quote()>.pdf``.
    NOTE(review): presumably ``getpdfdir`` returns a relative folder that
    already ends with a path separator -- confirm against its definition.
    '''
    if prefixdir:
        pattern = '10.*/10.*.pdf'
    else:
        pattern = '10.*.pdf'
    for pdfpath in glob.iglob(os.sep.join((origin, pattern))):
        doi = DOI(filebasename(pdfpath))
        newname = getpdfdir(doi) + doi.quote() + '.pdf'
        # os.renames creates any missing intermediate folders itself.
        os.renames(pdfpath, os.sep.join((origin, doi.prefix, newname)))
def moveprefix(origin='.'):
    '''Sort the ``10.*.pdf`` files of ``origin`` into per-prefix sub-folders.

    Each PDF goes to ``origin/<doi.prefix>/<same file name>``.  The prefix
    folder is created when missing, and a PDF is left where it is when a
    file of the same name already sits in the prefix folder.
    '''
    for pdfpath in glob.glob(os.sep.join((origin, "10.*.pdf"))):
        basename = os.path.basename(pdfpath)
        doi = DOI(filebasename(pdfpath))
        prefixfolder = os.sep.join((origin, doi.prefix))
        if not os.path.exists(prefixfolder):
            os.makedirs(prefixfolder)
        destination = os.sep.join((prefixfolder, basename))
        if os.path.exists(destination):
            # Never overwrite a file that is already filed.
            continue
        os.renames(pdfpath, destination)
def comparepdfsizeMove(totalfiles, sfresult, target='Done', check=False):
    '''Move new PDFs to ``target`` and delete those the library already has.

    totalfiles -- iterable of paths to DOI-named PDF files.
    sfresult   -- mapping of DOI -> iterable of sizes recorded in the
                  library for that DOI.
    target     -- folder receiving PDFs whose DOI is absent from
                  ``sfresult``.
    check      -- when true, a PDF must pass ``ph.FastCheck`` before it is
                  moved.

    For a DOI found in ``sfresult`` the local file is removed as soon as
    ``similarsize`` reports a match (a result ``>= 0``) against one of the
    recorded sizes; if none matches, the local size and the last compared
    library size are printed.
    NOTE(review): ``similarsize`` and ``ph`` are defined elsewhere; the
    meaning of a non-negative ``similarsize`` result is assumed to be
    "sizes match" -- confirm.
    '''
    for fg in totalfiles:
        doi = DOI(filebasename(fg))
        if not doi:
            print("%s %s" % ("File is not in doi style!", fg))
            continue
        fsize = os.path.getsize(fg)

        if doi in sfresult:
            # Known DOI: keep the local copy only if no recorded size matches.
            printout = ""
            for line in sfresult[doi]:
                libsize = str(line).strip()
                if similarsize(fsize, libsize) >= 0:
                    os.remove(fg)
                    printout = ""
                    break
                printout = (fg + " now size: " + str(fsize) +
                            "; in lib: " + libsize)
            if printout:
                print(printout)
            continue

        # Unknown DOI: the file is new for the library.
        if check and not ph.FastCheck(fg):
            print("%s %s" % ('File maybe wrong..', fg))
            continue
        targetfname = target + os.sep + os.path.split(fg)[1]
        try:
            os.renames(fg, targetfname)
        except OSError:
            # Typically the target already exists.  An equivalent copy in
            # the target makes the source redundant; every other failure is
            # reported so that no file is left behind silently.
            if (os.path.exists(targetfname) and
                    similarsize(fsize, os.path.getsize(targetfname)) >= 0):
                os.remove(fg)
            else:
                print("%s %s" % ("Move fail...", fg))
def filter(self, doi):
    '''Check ``doi`` against this object's publisher.

    The argument is wrapped in ``DOI`` and the result of its
    ``publisherstyle(self.publisher)`` call is returned unchanged.
    '''
    return DOI(doi).publisherstyle(self.publisher)
def acsstyledoi(doi):
    '''Return True when the prefix of ``doi`` is ``10.1021``, else False.'''
    return DOI(doi).prefix == '10.1021'
def genpdfsize(jsfnames):
    '''Upload PDF sizes listed in JSON files to the SF library.

    jsfnames -- one file name or a list of file names.  Each file holds a
                JSON object whose ``items`` member maps a PDF file name to
                its size.

    Sizes are collected per DOI.  Whenever the running item count reaches a
    multiple of 50, and once more at the end, the library is queried for
    the pending DOIs and every DOI whose collected sizes are not all known
    remotely is uploaded with the union of local and remote sizes.  Pending
    data is dropped after a successful upload (or when nothing had to be
    uploaded); after a failed upload it is kept and sent again with the
    next batch.
    '''
    sfurl = "http://oapdf.sourceforge.net/cgi-bin/pdfsize.cgi?owner=oapdf"
    count = 0
    tmpdoisize = {}
    tmpdois = set()

    def flush():
        '''Query the pending DOIs and upload the sizes the library lacks.'''
        r = requests.post(sfurl,
                          params={'dois': json.dumps(list(tmpdois))},
                          timeout=120)
        result = r.json()
        toappend = {}
        for d, s in tmpdoisize.items():
            known = result.get(d, [])
            if not set(s).issubset(set(known)):
                toappend[d] = list(set(s + known))
        if toappend:
            rr = requests.post(sfurl,
                               params={
                                   'doisize': json.dumps(toappend),
                                   'update': "True"
                               },
                               timeout=120)
            if rr.status_code != 200:
                # Upload failed: keep everything pending for the next batch.
                return
        tmpdoisize.clear()
        tmpdois.clear()

    if isinstance(jsfnames, str):
        jsfnames = [jsfnames]
    for jf in jsfnames:
        # "with" closes the file even when reading or parsing fails.
        with open(jf) as f:
            j = json.load(f)
        for pdf, fs in j['items'].items():
            count += 1
            if count % 50 == 0:
                flush()
                time.sleep(2)  # pause between batches
            doi = DOI(filebasename(pdf))
            tmpdois.add(doi)
            tmpdoisize.setdefault(doi, []).append(fs)
    if tmpdois:
        flush()
    print("%s %s" % ("Total done for:", count))
def comparepdfsize(origin='.', target='Done', check=False, totalsize=None):
    '''Compare PDF sizes in ``origin`` with the online library.

    origin    -- folder scanned for ``10.*.pdf`` files; the process exits
                 with status 1 when it does not exist.
    target    -- folder receiving new PDFs (created when missing).
    check     -- forwarded to ``comparepdfsizeMove``.
    totalsize -- optional limit; the value is multiplied by 1e9 and
                 compared with the summed byte size of the files taken so
                 far.  Scanning stops once the sum exceeds the limit, so
                 the limit can be overshot by the last file.  ``None``,
                 ``0`` or a negative value means "no limit".

    Files are handled in batches of 50 DOIs: the library is queried for the
    recorded sizes of the batch and ``comparepdfsizeMove`` then decides per
    file.  When a query does not return HTTP 200 the batch is kept and
    retried together with the next one, so files are never moved without
    library information.
    '''
    if not os.path.exists(origin):
        print("The origin folder not exist! Exit!")
        sys.exit(1)
    if not os.path.exists(target):
        os.makedirs(target)

    # Keep "no limit" distinct from a numeric limit: comparing a number
    # with None is always true on Python 2 and a TypeError on Python 3.
    sizelimit = None
    if totalsize and float(totalsize) > 0:
        sizelimit = float(totalsize) * 1000000000

    sfurl = "http://oapdf.sourceforge.net/cgi-bin/pdfsize.cgi?owner=oapdf"
    count = 0
    tmpdois = set()
    totalfiles = []
    nowsizes = 0

    def processbatch():
        '''Query sizes for the pending DOIs, then compare the pending files.'''
        payload = {'dois': json.dumps(list(tmpdois))}
        try:
            r = requests.post(sfurl, params=payload, timeout=120)
        except requests.RequestException:
            # One retry; a second failure propagates to the caller.
            r = requests.post(sfurl, params=payload, timeout=120)
        if r.status_code != 200:
            return  # keep the batch pending
        sfresult = {}
        sfresult.update(r.json())
        comparepdfsizeMove(totalfiles, sfresult, target=target, check=check)
        tmpdois.clear()
        del totalfiles[:]

    for fg in glob.iglob(origin + os.sep + '10.*.pdf'):
        if sizelimit is not None and nowsizes > sizelimit:
            break
        doi = DOI(filebasename(fg))
        if not doi:
            continue
        count += 1
        totalfiles.append(fg)
        # Register the DOI together with its file so the file that closes a
        # batch is part of the query as well.
        tmpdois.add(doi)
        nowsizes += os.path.getsize(fg)
        if count % 50 == 0:
            processbatch()
    if totalfiles:
        processbatch()
def touchpage(origin='.', doilink='../doilink', pdf=True, force=False):
    '''Create empty local page records for DOI-named files and report the
    touched DOIs to the SF-OAPDF service.

    origin  -- folder walked recursively for ``10.*.pdf`` files (or
               ``10.*.html`` files when ``pdf`` is false).
    doilink -- root of the local record tree; records are written to
               ``<doilink>/pages/<doi.decompose(url=False, outdir=True)>
               <name>.html``.
    pdf     -- selects the file extension that is searched for.
    force   -- touch (truncate) the record even when it already exists.

    A file whose base name contains a blank is renamed with leading and
    trailing blanks stripped.  Touched DOIs are submitted in batches of 50
    and once more at the end; the number in the ``<span id="total">`` of
    each response is summed as the remote touch count.  The process exits
    with status 1 when a submission does not return HTTP 200.
    '''
    if not os.path.exists(doilink):
        os.makedirs(doilink + os.sep + 'pages')
    doilink = doilink.rstrip('/').rstrip('\\')
    sfurl = "http://oapdf.sourceforge.net/cgi-bin/touchdoi.cgi?owner=oapdf"
    workdir = os.path.abspath(origin).rstrip('\\').rstrip('/')
    forcesf = force  # force to overwrite the existing doilink page
    pattern = '10.*.pdf' if pdf else '10.*.html'
    result = chain.from_iterable(
        glob.iglob(os.path.join(x[0], pattern)) for x in os.walk(workdir))

    count = 0
    localtouch = 0
    newtouch = 0
    toappend = []

    def submit():
        '''Send the pending DOIs; return the count reported by the server.'''
        r = requests.post(sfurl,
                          params={'dois': json.dumps(toappend)},
                          timeout=120)
        if r.status_code != 200:
            print("Maybe Error when submit to SF-OAPDF..")
            sys.exit(1)
        bs = BeautifulSoup(r.text, "html.parser")
        totaldid = bs.findChild('span', attrs={'id': 'total'})
        del toappend[:]
        if totaldid and totaldid.text:
            return int(totaldid.text)
        return 0

    for f in result:
        count += 1
        fname = filebasename(f)
        if ' ' in fname:
            print("%s %s" % ("File name has blank!", f))
            os.renames(
                f,
                os.path.split(f)[0] + os.sep + fname.strip() +
                os.path.splitext(f)[1])
            fname = fname.strip()
        doi = DOI(fname)
        if not doi:
            print("%s %s" % ("File name may be error (Not DOI name):", fname))
            continue
        dirname = doilink + "/pages/" + doi.decompose(url=False, outdir=True)
        if not (forcesf or not os.path.exists(dirname + fname + '.html')):
            continue
        localtouch += 1
        toappend.append(doi)
        try:
            if not os.path.exists(dirname):
                os.makedirs(dirname)
            # Create (or truncate) the empty record file.
            with open(dirname + fname + '.html', "w"):
                pass
        except EnvironmentError as e:
            # Covers OSError/IOError (and WindowsError on Windows) while
            # staying a defined name on every platform.
            print(e)
        except Exception:
            print("%s %s" % ("Something error for file:", f))
        # Batch on the number of pending DOIs so files that need no touch
        # never trigger an (empty) submission.
        if len(toappend) >= 50:
            newtouch += submit()

    if toappend:
        newtouch += submit()
    print(" ".join(str(part) for part in (
        "Process total file:", count, "; local touch new:", localtouch,
        "; remote touch:", newtouch)))
def touchpage(origin='.', doilink='../doilink', pdf=True, force=False):
    '''Create empty local page records for DOI-named files and report the
    touched DOIs to the SF-OAPDF service.

    origin  -- folder walked recursively for ``10.*.pdf`` files (or
               ``10.*.html`` files when ``pdf`` is false).
    doilink -- root of the local record tree; records are written to
               ``<doilink>/pages/<doi.decompose(url=False, outdir=True)>
               <name>.html``.
    pdf     -- selects the file extension that is searched for.
    force   -- touch (truncate) the record even when it already exists.

    A file whose base name contains a blank is renamed with leading and
    trailing blanks stripped.  Touched DOIs are submitted in batches of 50
    and once more at the end; the number in the ``<span id="total">`` of
    each response is summed as the remote touch count.  The process exits
    with status 1 when a submission does not return HTTP 200.
    '''
    if not os.path.exists(doilink):
        os.makedirs(doilink + os.sep + 'pages')
    doilink = doilink.rstrip('/').rstrip('\\')
    sfurl = "http://oapdf.sourceforge.net/cgi-bin/touchdoi.cgi?owner=oapdf"
    workdir = os.path.abspath(origin).rstrip('\\').rstrip('/')
    forcesf = force  # force to overwrite the existing doilink page
    pattern = '10.*.pdf' if pdf else '10.*.html'
    result = chain.from_iterable(
        glob.iglob(os.path.join(x[0], pattern)) for x in os.walk(workdir))

    count = 0
    localtouch = 0
    newtouch = 0
    toappend = []

    def submit():
        '''Send the pending DOIs; return the count reported by the server.'''
        r = requests.post(sfurl,
                          params={'dois': json.dumps(toappend)},
                          timeout=120)
        if r.status_code != 200:
            print("Maybe Error when submit to SF-OAPDF..")
            sys.exit(1)
        bs = BeautifulSoup(r.text, "html.parser")
        totaldid = bs.findChild('span', attrs={'id': 'total'})
        del toappend[:]
        if totaldid and totaldid.text:
            return int(totaldid.text)
        return 0

    for f in result:
        count += 1
        fname = filebasename(f)
        if ' ' in fname:
            print("%s %s" % ("File name has blank!", f))
            os.renames(
                f,
                os.path.split(f)[0] + os.sep + fname.strip() +
                os.path.splitext(f)[1])
            fname = fname.strip()
        doi = DOI(fname)
        if not doi:
            print("%s %s" % ("File name may be error (Not DOI name):", fname))
            continue
        dirname = doilink + "/pages/" + doi.decompose(url=False, outdir=True)
        if not (forcesf or not os.path.exists(dirname + fname + '.html')):
            continue
        localtouch += 1
        toappend.append(doi)
        try:
            if not os.path.exists(dirname):
                os.makedirs(dirname)
            # Create (or truncate) the empty record file.
            with open(dirname + fname + '.html', "w"):
                pass
        except EnvironmentError as e:
            # Covers OSError/IOError (and WindowsError on Windows) while
            # staying a defined name on every platform.
            print(e)
        except Exception:
            print("%s %s" % ("Something error for file:", f))
        # Batch on the number of pending DOIs so files that need no touch
        # never trigger an (empty) submission.
        if len(toappend) >= 50:
            newtouch += submit()

    if toappend:
        newtouch += submit()
    print(" ".join(str(part) for part in (
        "Process total file:", count, "; local touch new:", localtouch,
        "; remote touch:", newtouch)))
if (not os.path.exists('Done')): os.makedirs('Done') ph = PDFHandler() nocheckpdf = False targetdir = '.' workingdir = os.path.abspath('.') if (len(sys.argv) >= 2): nocheckpdf = True targetdir = sys.argv[1] for fg in glob.iglob(targetdir + os.sep + '10.*.pdf'): fnamesplit = os.path.split(fg) doi = DOI(os.path.splitext(fnamesplit[1])[0]) if (doi): fpath = fg.strip().split('@', 1) fsize = os.path.getsize(fg) fname = 'Decoy/' + doi.prefix + os.sep + fnamesplit[1] printout = "" if (not os.path.exists(fname)): if (nocheckpdf or ph.FastCheck(fg)): targetfname = 'Done' + os.sep + fnamesplit[1] try: os.renames(fg, targetfname) except: if (os.path.exists(targetfname)): if (similarsize(fsize, os.path.getsize(targetfname)) >= 0):