def download(self):
        """Fetch every entry queued in ``self.contents_file`` with s3cmd.

        Exits the process with status 1 when the contents file is missing.
        Otherwise changes into ``self.filedir`` and, for each entry returned
        by ``fq.get``, runs ``s3cmd get`` with the requester-pays header and
        ``--skip-existing``.  An entry is popped from the queue only after
        its download returned 0, so a failed download is never popped.

        The loop ends when ``fq.get`` returns None, when a download fails,
        or when the user enters 'x' between downloads.  The working
        directory is switched back to ``self.current_dir`` on every way out
        of the loop, including exceptions.
        """
        if not os.path.exists(self.contents_file):
            print "Error: arXiv contents file %s does not exist" % (self.contents_file)
            sys.exit(1)

        # Change directory to source folder (presumably so that s3cmd drops
        # the downloaded files there -- no destination is passed to it).
        os.chdir(self.filedir)
        try:
            print "Press 'x' to break after the current download."
            while True:
                arxiv_file_line = fq.get(self.contents_file)
                if arxiv_file_line is None:
                    # Nothing left in the queue.
                    break

                print "Processing ", arxiv_file_line

                return_code = call([self.s3_cmd_ex, 'get',
                                    '--add-header=x-amz-request-payer: requester',
                                    '--skip-existing', arxiv_file_line])

                if return_code != 0:
                    # Leave the entry queued so a later run can retry it.
                    print "Error downloading", arxiv_file_line
                    break

                fq.pop(self.contents_file)
                # break if x was pressed
                if 'x' in nbRawInput('', timeout=1):
                    print "Download suspended. Restart script to resume."
                    break
        finally:
            # Change directory to project current folder, even when the
            # loop was left through an exception.
            os.chdir(self.current_dir)
    def download(self):
        """Download the files listed in ``self.contents_file`` using s3cmd.

        Terminates the process with exit status 1 if the contents file does
        not exist.  Otherwise the method changes into ``self.filedir`` and
        loops over the entries handed out by ``fq.get``: each one is passed
        to ``s3cmd get`` (requester-pays header, ``--skip-existing``) and is
        popped from the queue only when s3cmd returned 0.

        Looping stops when ``fq.get`` returns None, when s3cmd reports a
        non-zero status, or when the user enters 'x' after a download.  The
        working directory is reset to ``self.current_dir`` in a ``finally``
        clause so it is restored even if an exception escapes the loop.
        """
        if not os.path.exists(self.contents_file):
            print "Error: arXiv contents file %s does not exist" % (
                self.contents_file)
            sys.exit(1)

        # Change directory to source folder; s3cmd is given no destination
        # argument below (presumably it then saves into the current folder).
        os.chdir(self.filedir)
        try:
            print "Press 'x' to break after the current download."
            while True:
                arxiv_file_line = fq.get(self.contents_file)
                if arxiv_file_line is None:
                    # Queue exhausted.
                    break

                print "Processing ", arxiv_file_line

                return_code = call([
                    self.s3_cmd_ex, 'get',
                    '--add-header=x-amz-request-payer: requester',
                    '--skip-existing', arxiv_file_line
                ])

                if return_code != 0:
                    # The entry is not popped, so it stays queued for a retry.
                    print "Error downloading", arxiv_file_line
                    break

                fq.pop(self.contents_file)
                # break if x was pressed
                if 'x' in nbRawInput('', timeout=1):
                    print "Download suspended. Restart script to resume."
                    break
        finally:
            # Change directory to project current folder on every exit path.
            os.chdir(self.current_dir)
    def retrieve_citations(self):
        if not os.path.exists(self.tmp_dir):
            os.mkdir(self.tmp_dir)

        #Creates arXiv_citationqueue.txt if it doesn't exist by finding all the gz files in the extract folder
        if not os.path.exists(self.citation_queue):
            call('find {source_dir}*.gz -type f > {target_file}'.format(
                source_dir=self.extract_dir, target_file=self.citation_queue),
                 shell=True)

        # Initialise some variables
        batcher = Batch.Batch()

        while True:
            file_name = fq.get(self.citation_queue)
            if file_name is None: break

            arxiv_id = os.path.splitext(os.path.split(file_name)[1])[0]
            print "Retrieving citations", arxiv_id

            uncompressed_tmp = self.tmp_dir + arxiv_id
            if not os.path.exists(uncompressed_tmp):
                os.mkdir(uncompressed_tmp)
            returncode = call(
                ["tar", "xzf", file_name, "-C", uncompressed_tmp])
            if (
                    returncode == 1
            ):  #there was an error, so perhaps its not a Tar file. Instead try to decompress with plain old gunzip
                print "trying to gunzip instead for " + file_name
                os.system("gunzip -c %s > %s" %
                          (file_name, uncompressed_tmp + "/file.tex"))

            #Now process .tex files
            for tex_file_name in os.listdir(uncompressed_tmp):
                if not (tex_file_name.endswith('.tex')
                        or tex_file_name.endswith('.bbl')):
                    continue
                citations = self.settings["metadata_reader"].process(
                    arxiv_id, uncompressed_tmp + '/' + tex_file_name)

                #Store the citations in BibServer
                self.store_citations(batcher, arxiv_id, citations)

                #print "CITATIONS for " + arxiv_id
                #print citations

            # Delete temporary files
            if call('rm -R ' + uncompressed_tmp + '*', shell=True):
                break

            fq.pop(self.citation_queue)

        batcher.clear()
    def retrieve_citations(self):
        if not os.path.exists(self.tmp_dir):
            os.mkdir(self.tmp_dir)

        #Creates arXiv_citationqueue.txt if it doesn't exist by finding all the gz files in the extract folder
        if not os.path.exists(self.citation_queue):
            call('find {source_dir}*.gz -type f > {target_file}'.format(
                    source_dir = self.extract_dir,
                    target_file = self.citation_queue 
                    ) , shell = True)

        # Initialise some variables
        batcher = Batch.Batch()

        while True:
            file_name = fq.get(self.citation_queue)
            if file_name is None: break
            
            arxiv_id = os.path.splitext(os.path.split(file_name)[1])[0]
            print "Retrieving citations", arxiv_id

            uncompressed_tmp = self.tmp_dir + arxiv_id
            if not os.path.exists(uncompressed_tmp):
                os.mkdir(uncompressed_tmp)
            returncode = call(["tar", "xzf", file_name, "-C", uncompressed_tmp])
            if (returncode == 1): #there was an error, so perhaps its not a Tar file. Instead try to decompress with plain old gunzip
                print "trying to gunzip instead for " + file_name
                os.system("gunzip -c %s > %s" % (file_name, uncompressed_tmp + "/file.tex"))

            #Now process .tex files
            for tex_file_name in os.listdir(uncompressed_tmp):
                if not (tex_file_name.endswith('.tex') or tex_file_name.endswith('.bbl')): continue
                citations = self.settings["metadata_reader"].process(arxiv_id, uncompressed_tmp + '/' + tex_file_name)

                #Store the citations in BibServer
                self.store_citations(batcher, arxiv_id, citations)

                #print "CITATIONS for " + arxiv_id
                #print citations

            # Delete temporary files
            if call('rm -R ' + uncompressed_tmp + '*', shell=True):
                break

            fq.pop(self.citation_queue)

        batcher.clear()
def main():
    print 'Press "x" to break'


    if not os.path.exists(tmp_dir):
        os.mkdir(tmp_dir)

    if not os.path.exists(extract_dir):
        os.mkdir(extract_dir)

    if not os.path.exists(extraction_queue) or not RESUME:
        call('find {source_dir} -type f > {target_file}'.format(
                source_dir = bucket_dir,
                target_file = extraction_queue 
                ) , shell = True)

    while True:
        file_name = fq.get(extraction_queue)
        if file_name is None: break

        print "Extracting bucket" , file_name
        if call(['tar','xf',file_name,'-C',tmp_dir]):
            # call returns 1 on error.
            break

        if call('find %s -name *.gz -type f -exec mv {} %s \;' % (tmp_dir, extract_dir), shell = True):
            break

        if call('rm -R ' + tmp_dir + '*', shell=True):
            break

        fq.pop(extraction_queue)

        # break if x was pressed
        if nbRawInput('',timeout=1) == 'x':
            print "Extraction suspended. Restart script to resume."
            break
def main():
    print 'Press "x" to break'

    if not os.path.exists(tmp_dir):
        os.mkdir(tmp_dir)

    if not os.path.exists(extract_dir):
        os.mkdir(extract_dir)

    if not os.path.exists(extraction_queue) or not RESUME:
        call('find {source_dir} -type f > {target_file}'.format(
            source_dir=bucket_dir, target_file=extraction_queue),
             shell=True)

    while True:
        file_name = fq.get(extraction_queue)
        if file_name is None: break

        print "Extracting bucket", file_name
        if call(['tar', 'xf', file_name, '-C', tmp_dir]):
            # call returns 1 on error.
            break

        if call('find %s -name *.gz -type f -exec mv {} %s \;' %
                (tmp_dir, extract_dir),
                shell=True):
            break

        if call('rm -R ' + tmp_dir + '*', shell=True):
            break

        fq.pop(extraction_queue)

        # break if x was pressed
        if nbRawInput('', timeout=1) == 'x':
            print "Extraction suspended. Restart script to resume."
            break
    def extract(self):
        print "Press 'x' to interupt the extraction process"
        if not os.path.exists(self.tmp_dir):
            os.mkdir(self.tmp_dir)

        if not os.path.exists(self.extract_dir):
            os.mkdir(self.extract_dir)

        #Creates arXiv_extraction_queue.txt if it doesn't exist by finding all the tar files in the download folder
        if not os.path.exists(self.extraction_queue):
            call('find {source_dir}*.tar -type f > {target_file}'.format(
                source_dir=self.filedir, target_file=self.extraction_queue),
                 shell=True)

        while True:
            file_name = fq.get(self.extraction_queue)
            if file_name is None: break

            print "Extracting bucket", file_name
            if call(['tar', 'xf', file_name, '-C', self.tmp_dir]):
                # call returns 1 on error.
                break

            if call('find %s -name *.gz -type f -exec mv {} %s \;' %
                    (self.tmp_dir, self.extract_dir),
                    shell=True):
                break

            if call('rm -R ' + self.tmp_dir + '*', shell=True):
                break

            fq.pop(self.extraction_queue)

            # break if x was pressed
            if nbRawInput('', timeout=1) == 'x':
                print "Extraction suspended. Restart script to resume."
                break
    def extract(self):
        print "Press 'x' to interupt the extraction process"
        if not os.path.exists(self.tmp_dir):
            os.mkdir(self.tmp_dir)

        if not os.path.exists(self.extract_dir):
            os.mkdir(self.extract_dir)

        #Creates arXiv_extraction_queue.txt if it doesn't exist by finding all the tar files in the download folder
        if not os.path.exists(self.extraction_queue):
            call('find {source_dir}*.tar -type f > {target_file}'.format(
                    source_dir = self.filedir,
                    target_file = self.extraction_queue 
                    ) , shell = True)

        while True:
            file_name = fq.get(self.extraction_queue)
            if file_name is None: break

            print "Extracting bucket" , file_name
            if call(['tar','xf',file_name,'-C',self.tmp_dir]):
                # call returns 1 on error.
                break

            if call('find %s -name *.gz -type f -exec mv {} %s \;' % (self.tmp_dir, self.extract_dir), shell = True):
                break

            if call('rm -R ' + self.tmp_dir + '*', shell=True):
                break

            fq.pop(self.extraction_queue)

            # break if x was pressed
            if nbRawInput('',timeout=1) == 'x':
                print "Extraction suspended. Restart script to resume."
                break
cur_dir = os.getcwd()

contents_file = cur_dir + '/s3_contents.txt'
s3_cmd_ex     = cur_dir + "/../tools/s3cmd/s3cmd"
dl_dir        = cur_dir + '/../DATA/BUCKETS/'

if not os.path.exists(dl_dir):
    os.makedirs(dl_dir)

os.chdir(dl_dir)

print "Press 'x' to suspend after the current download."
while True:
    line = fq.get(contents_file)
    if line == None: 
        break

    print "Processing ", line
    
    return_code = call([s3_cmd_ex,'get','--add-header=x-amz-request-payer: requester','--skip-existing',line])

    if return_code != 0:
        print "ERROR downloading", line 
        break

    fq.pop(contents_file)
    # break if x was pressed
    if 'x' in nbRawInput('',timeout=1):
        print "Download suspended. Restart script to resume."
        break
Example #10
0
s3_cmd_ex = cur_dir + "/../tools/s3cmd/s3cmd"
dl_dir = cur_dir + '/../DATA/BUCKETS/'

if not os.path.exists(dl_dir):
    os.makedirs(dl_dir)

os.chdir(dl_dir)

print "Press 'x' to suspend after the current download."
while True:
    line = fq.get(contents_file)
    if line == None:
        break

    print "Processing ", line

    return_code = call([
        s3_cmd_ex, 'get', '--add-header=x-amz-request-payer: requester',
        '--skip-existing', line
    ])

    if return_code != 0:
        print "ERROR downloading", line
        break

    fq.pop(contents_file)
    # break if x was pressed
    if 'x' in nbRawInput('', timeout=1):
        print "Download suspended. Restart script to resume."
        break