Example #1
0
def moveissn(origin='.',prefixdir=True):
	'''Move DOI-named PDFs to their ISSN/Volume/Issue folder.

	origin: folder that is scanned.
	prefixdir: when true, "10.*.pdf" files are looked up inside the "10.*"
	sub-folders of origin; otherwise directly inside origin.

	Every match is renamed to
	origin/<doi.prefix>/<getpdfdir(doi)><doi.quote()>.pdf; os.renames
	creates the missing intermediate folders.'''

	if prefixdir:
		pattern='10.*/10.*.pdf'
	else:
		pattern='10.*.pdf'
	for pdfpath in glob.iglob(os.sep.join([origin,pattern])):
		doi=DOI(filebasename(pdfpath))
		# getpdfdir(doi) is concatenated directly with the file name, so it is
		# presumably expected to end with a path separator -- TODO confirm
		newname=getpdfdir(doi)+doi.quote()+'.pdf'
		os.renames(pdfpath,os.sep.join([origin,doi.prefix,newname]))
Example #2
0
def moveissn(origin='.', prefixdir=True):
    '''Relocate DOI-named PDFs into their ISSN/Volume/Issue folder.

    origin: folder that is scanned.
    prefixdir: when true the PDFs are expected one level down, inside
    "10.*" sub-folders of origin; when false, directly inside origin.

    The new location of each file is
    origin/<doi.prefix>/<getpdfdir(doi)><doi.quote()>.pdf.'''

    pattern = '10.*.pdf'
    if prefixdir:
        pattern = '10.*/' + pattern

    for pdfpath in glob.iglob(os.sep.join([origin, pattern])):
        doi = DOI(filebasename(pdfpath))
        newname = getpdfdir(doi) + doi.quote() + '.pdf'
        # os.renames also creates any intermediate folders that are missing
        os.renames(pdfpath, os.sep.join([origin, doi.prefix, newname]))
Example #3
0
def moveprefix(origin='.'):
    '''Move every "10.*.pdf" found in origin into origin/<DOI prefix>/.

    The prefix folder is created when missing.  A PDF whose destination
    already exists is left where it is.'''
    for pdfpath in glob.glob(os.sep.join([origin, "10.*.pdf"])):
        doi = DOI(filebasename(pdfpath))
        prefixfolder = origin + os.sep + doi.prefix
        if not os.path.exists(prefixfolder):
            os.makedirs(prefixfolder)

        destination = prefixfolder + os.sep + os.path.basename(pdfpath)
        if os.path.exists(destination):
            # never overwrite a file that is already filed
            continue
        os.renames(pdfpath, destination)
Example #4
0
def comparepdfsizeMove(totalfiles, sfresult, target='Done', check=False):
    # Start to compare
    for fg in totalfiles:  #glob.iglob(origin+os.sep+'10.*.pdf'):
        fnamesplit = os.path.split(fg)
        doi = DOI(filebasename(fg))
        if (doi):
            fpath = fg.strip().split('@', 1)
            fsize = os.path.getsize(fg)
            printout = ""
            if (not sfresult.has_key(doi)):
                if (not check or ph.FastCheck(fg)):
                    targetfname = target + os.sep + fnamesplit[1]
                    try:
                        os.renames(fg, targetfname)
                    except:
                        if (os.path.exists(targetfname)):
                            if (similarsize(
                                    fsize, os.path.getsize(targetfname)) >= 0):
                                os.remove(fg)
                        else:
                            print "Move fail...", fg
                else:
                    print 'File maybe wrong..', fg
            else:
                for line in sfresult[doi]:
                    s = similarsize(fsize, str(line).strip())
                    if (s >= 0):
                        os.remove(fg)
                        printout = ""
                        break
                    #elif ( s is 1 and int(fsize)<int(line.strip())):
                    #	os.remove(fg)
                    #	printout=""
                    #	break
                    else:
                        printout = fg + " now size: " + str(
                            fsize) + "; in lib: " + str(line).strip()
                if (printout): print printout
        else:
            print "File is not in doi style!", fg
Example #5
0
	def filter(self,doi):
		'''Use to filter by publisher.

		doi is wrapped in DOI and the result of its
		publisherstyle(self.publisher) call is returned unchanged.'''
		return DOI(doi).publisherstyle(self.publisher)
Example #6
0
def acsstyledoi(doi):
	'''Return True when the prefix of doi is '10.1021', otherwise False.'''
	return DOI(doi).prefix == '10.1021'
Example #7
0
def genpdfsize(jsfnames):
    '''Generate PDF information on SF library'''
    sfurl = "http://oapdf.sourceforge.net/cgi-bin/pdfsize.cgi?owner=oapdf"
    count = 0
    tmpdoisize = {}
    tmpdois = set()
    toappend = {}
    if isinstance(jsfnames, str):
        jsfnames = [jsfnames]

    for jf in jsfnames:
        f = open(jf)
        j = json.loads(f.read())
        f.close()
        for pdf, fs in j['items'].items():
            count += 1
            if (count % 50 == 0):
                # Firstly, query the records
                r = requests.post(sfurl,
                                  params={'dois': json.dumps(list(tmpdois))},
                                  timeout=120)
                result = r.json()
                for d, s in tmpdoisize.items():
                    if (not set(s).issubset(set(result.get(d, [])))):
                        toappend[d] = list(set(s + result.get(d, [])))
                if (len(toappend) > 0):
                    rr = requests.post(sfurl,
                                       params={
                                           'doisize': json.dumps(toappend),
                                           'update': "True"
                                       },
                                       timeout=120)
                    if (rr.status_code == 200):
                        #print rr.text
                        #if fail, submit at next time
                        tmpdoisize.clear()
                        tmpdois.clear()
                        toappend.clear()
                else:
                    tmpdoisize.clear()
                    tmpdois.clear()
                    toappend.clear()
                time.sleep(2)
            doi = DOI(filebasename(pdf))
            tmpdois.add(doi)
            tmpdoisize.setdefault(doi, []).append(fs)

    if (len(tmpdois) > 0):
        # Firstly, query the records
        r = requests.post(sfurl,
                          params={'dois': json.dumps(list(tmpdois))},
                          timeout=120)
        result = r.json()
        for d, s in tmpdoisize.items():
            if (not set(s).issubset(set(result.get(d, [])))):
                toappend[d] = list(set(s + result.get(d, [])))
        if (len(toappend) > 0):
            rr = requests.post(sfurl,
                               params={
                                   'doisize': json.dumps(toappend),
                                   'update': "True"
                               },
                               timeout=120)
            if (rr.status_code == 200):
                #print rr.text
                #if fail, submit at next time
                tmpdoisize.clear()
                tmpdois.clear()
                toappend.clear()
        else:
            tmpdoisize.clear()
            tmpdois.clear()
            toappend.clear()
    print "Total done for:", count
Example #8
0
def comparepdfsize(origin='.', target='Done', check=False, totalsize=None):
    '''Compare PDF size in origin to online library.
	If new, move to target folder'''
    if not os.path.exists(origin):
        print "The origin folder not exist! Exit!"
        sys.exit(1)

    if (not os.path.exists(target)):
        os.makedirs(target)

    if (totalsize and float(totalsize) > 0):
        totalsize = float(totalsize) * 1000000000

    sfurl = "http://oapdf.sourceforge.net/cgi-bin/pdfsize.cgi?owner=oapdf"
    workingdir = os.path.abspath('.')

    # Get online informations
    count = 0
    sfresult = {}
    tmpdois = set()
    totalfiles = []
    nowsizes = 0
    for fg in glob.iglob(origin + os.sep + '10.*.pdf'):
        if nowsizes > totalsize:
            break

        doi = DOI(filebasename(fg))
        if (doi):
            count += 1
            totalfiles.append(fg)
            nowsizes += os.path.getsize(fg)

            if (count % 50 == 0):
                # Firstly, query the records
                try:
                    r = requests.post(
                        sfurl,
                        params={'dois': json.dumps(list(tmpdois))},
                        timeout=120)
                except requests.RequestException:
                    r = requests.post(
                        sfurl,
                        params={'dois': json.dumps(list(tmpdois))},
                        timeout=120)

                if (r.status_code == 200):
                    sfresult.update(r.json())
                    tmpdois.clear()

                    comparepdfsizeMove(totalfiles,
                                       sfresult,
                                       target=target,
                                       check=check)
                    del totalfiles[:]
                    sfresult.clear()
                    #time.sleep(2)
            tmpdois.add(doi)

    if (len(tmpdois) > 0):
        r = requests.post(sfurl,
                          params={'dois': json.dumps(list(tmpdois))},
                          timeout=120)
        if (r.status_code == 200):
            sfresult.update(r.json())
            tmpdois.clear()

            comparepdfsizeMove(totalfiles,
                               sfresult,
                               target=target,
                               check=check)
            del totalfiles[:]
            sfresult.clear()
Example #9
0
def touchpage(origin='.', doilink='../doilink', pdf=True, force=False):
    # Use to save local page record
    if not os.path.exists(doilink):
        os.makedirs(doilink + os.sep + 'pages')
    doilink = doilink.rstrip('/').rstrip('\\')
    sfurl = "http://oapdf.sourceforge.net/cgi-bin/touchdoi.cgi?owner=oapdf"

    workdir = os.path.abspath(origin).rstrip('\\').rstrip('/')
    count = 0
    touchcount = 1  # avoid submit when start
    forcesf = force  # force to overwrite the exist doilink page

    if (pdf):
        result = (chain.from_iterable(
            glob.iglob(os.path.join(x[0], '10.*.pdf'))
            for x in os.walk(workdir)))
    else:
        result = (chain.from_iterable(
            glob.iglob(os.path.join(x[0], '10.*.html'))
            for x in os.walk(workdir)))

    toappend = []
    newtouch = 0
    for f in result:
        if (touchcount % 50 == 0):
            r = requests.post(sfurl,
                              params={'dois': json.dumps(toappend)},
                              timeout=120)
            if (r.status_code == 200):
                bs = BeautifulSoup(r.text, "html.parser")
                totaldid = bs.findChild('span', attrs={'id': 'total'})
                if totaldid and totaldid.text:
                    newtouch += int(totaldid.text)
                del toappend[:]
            else:
                print "Maybe Error when submit to SF-OAPDF.."
                sys.exit(1)
        count += 1
        fname = filebasename(f)
        if (' ' in fname):
            print "File name has blank!", f
            os.renames(
                f,
                os.path.split(f)[0] + os.sep + fname.strip() +
                os.path.splitext(f)[1])
            fname = fname.strip()
        doi = DOI(fname)
        if (doi):
            dirname = doilink + "/pages/" + doi.decompose(url=False,
                                                          outdir=True)
            if (forcesf or not os.path.exists(dirname + fname + '.html')):
                touchcount += 1
                toappend.append(doi)
                try:
                    if (not os.path.exists(dirname)): os.makedirs(dirname)
                    f = open(dirname + fname + '.html', "w")
                    f.close()
                except WindowsError as e:
                    print e
                except:
                    print "Something error for file:", f
        else:
            print "File name may be error (Not DOI name):", fname

    r = requests.post(sfurl,
                      params={'dois': json.dumps(toappend)},
                      timeout=120)
    if (r.status_code == 200):
        bs = BeautifulSoup(r.text, "html.parser")
        totaldid = bs.findChild('span', attrs={'id': 'total'})
        if totaldid and totaldid.text:
            newtouch += int(totaldid.text)
        del toappend[:]
    else:
        print "Maybe Error when submit to SF-OAPDF.."
        sys.exit(1)
    print "Process total file:", count, "; local touch new:", touchcount - 1, "; remote touch:", newtouch
Example #10
0
def touchpage(origin='.', doilink='../doilink',pdf=True,force=False):
	# Use to save local page record
	if not os.path.exists(doilink):
		os.makedirs(doilink+os.sep+'pages')
	doilink=doilink.rstrip('/').rstrip('\\')
	sfurl="http://oapdf.sourceforge.net/cgi-bin/touchdoi.cgi?owner=oapdf"

	workdir=os.path.abspath(origin).rstrip('\\').rstrip('/')
	count=0
	touchcount=1 # avoid submit when start
	forcesf=force # force to overwrite the exist doilink page

	if (pdf):
		result = (chain.from_iterable(glob.iglob(os.path.join(x[0], '10.*.pdf')) for x in os.walk(workdir)))
	else:
		result = (chain.from_iterable(glob.iglob(os.path.join(x[0], '10.*.html')) for x in os.walk(workdir)))

	toappend=[]
	newtouch=0
	for f in result:
		if (touchcount%50==0):
			r=requests.post(sfurl,params={'dois':json.dumps(toappend)},timeout=120)
			if (r.status_code == 200):
				bs=BeautifulSoup(r.text,"html.parser")
				totaldid=bs.findChild('span',attrs={'id':'total'})
				if totaldid and totaldid.text :
					newtouch+=int(totaldid.text)
				del toappend[:]
			else:
				print "Maybe Error when submit to SF-OAPDF.."
				sys.exit(1)
		count+=1
		fname=filebasename(f)
		if (' ' in fname):
			print "File name has blank!",f
			os.renames(f,os.path.split(f)[0]+os.sep+fname.strip()+os.path.splitext(f)[1])
			fname=fname.strip()
		doi=DOI(fname)
		if (doi):
			dirname=doilink+"/pages/"+doi.decompose(url=False, outdir=True)
			if (forcesf or not os.path.exists(dirname+fname+'.html')):
				touchcount+=1
				toappend.append(doi)
				try:
					if (not os.path.exists(dirname)): os.makedirs(dirname)
					f=open(dirname+fname+'.html',"w")
					f.close()
				except WindowsError as e:
					print e
				except:
					print "Something error for file:",f
		else:
			print "File name may be error (Not DOI name):",fname

	r=requests.post(sfurl,params={'dois':json.dumps(toappend)},timeout=120)
	if (r.status_code == 200):
		bs=BeautifulSoup(r.text,"html.parser")
		totaldid=bs.findChild('span',attrs={'id':'total'})
		if totaldid and totaldid.text :
			newtouch+=int(totaldid.text)
		del toappend[:]
	else:
		print "Maybe Error when submit to SF-OAPDF.."
		sys.exit(1)
	print "Process total file:",count,"; local touch new:",touchcount-1, "; remote touch:",newtouch
Example #11
0
# Make sure the 'Done' output folder exists before anything is moved into it.
if not os.path.exists('Done'):
    os.makedirs('Done')

ph = PDFHandler()
workingdir = os.path.abspath('.')

# With a command-line argument, that argument is the folder to scan and
# nocheckpdf is switched on; without one the current folder is scanned.
if len(sys.argv) < 2:
    nocheckpdf, targetdir = False, '.'
else:
    nocheckpdf, targetdir = True, sys.argv[1]

for fg in glob.iglob(targetdir + os.sep + '10.*.pdf'):
    fnamesplit = os.path.split(fg)
    doi = DOI(os.path.splitext(fnamesplit[1])[0])

    if (doi):
        fpath = fg.strip().split('@', 1)
        fsize = os.path.getsize(fg)
        fname = 'Decoy/' + doi.prefix + os.sep + fnamesplit[1]
        printout = ""
        if (not os.path.exists(fname)):
            if (nocheckpdf or ph.FastCheck(fg)):
                targetfname = 'Done' + os.sep + fnamesplit[1]
                try:
                    os.renames(fg, targetfname)
                except:
                    if (os.path.exists(targetfname)):
                        if (similarsize(fsize, os.path.getsize(targetfname)) >=
                                0):