def __call__(self, t_path, name_info, i_str):
    '''
    Load chunk from t_path and put it into the right place in s3
    using the output_name template from the config.

    :param t_path: local path of the temporary chunk file
    :param name_info: dict of substitution values for the output_name
        template; updated in place with get_name_info() results and the
        computed 'md5' of the tarball bytes
    :param i_str: input string, used for naming and logging
    :returns: the s3 output path, or None when the chunk is empty
    '''
    name_info.update(get_name_info(t_path, i_str=i_str))

    # an empty chunk produces no output at all
    if name_info['num'] == 0:
        return None

    o_fname = self.config['output_name'] % name_info
    o_path = os.path.join(self.config['s3_path_prefix'], o_fname + '.tar.gz')
    logger.info('to_s3_tarballs: \n\t%r\n\tfrom: %r\n\tby way of %r ' % (o_path, i_str, t_path))

    ## forcibly collect dereferenced objects
    #gc.collect()

    t_path2 = tarball_export(t_path, name_info)
    # fix: read in binary mode and close the handle deterministically --
    # the original bare open() leaked the file object and relied on
    # platform text mode, which would corrupt the md5 on Windows
    with open(t_path2, 'rb') as t_file:
        data = t_file.read()
    name_info['md5'] = hashlib.md5(data).hexdigest() # pylint: disable=E1101

    self.upload(o_path, data, name_info)
    self.cleanup(t_path)
    self.cleanup(t_path2)

    logger.info('to_s3_tarballs finished:\n\t input: %s\n\toutput: %s' % (i_str, o_path))
    ## return the final output path
    return o_path
def __call__(self, t_path, name_info, i_str):
    '''
    Load chunk from t_path and put it into the right place in s3
    using the output_name template from the config.

    :param t_path: local path of the temporary chunk file
    :param name_info: dict of substitution values for the output_name
        template; updated in place with get_name_info() results and the
        computed 'md5' of the tarball bytes
    :param i_str: input string, used for naming and logging
    :returns: the s3 output path, or None when the chunk is empty
    '''
    name_info.update(get_name_info(t_path, i_str=i_str))

    # empty chunks generate no output
    if name_info['num'] == 0:
        return None

    o_fname = self.config['output_name'] % name_info
    o_path = os.path.join(self.config['s3_path_prefix'], o_fname + '.tar.gz')
    logger.info('to_s3_tarballs: \n\t%r\n\tfrom: %r\n\tby way of %r ' % (o_path, i_str, t_path))

    ## forcibly collect dereferenced objects
    #gc.collect()

    t_path2 = tarball_export(t_path, name_info)
    # fix: binary mode plus explicit close -- the original bare open()
    # leaked the handle and used text mode on binary tarball data
    with open(t_path2, 'rb') as t_file:
        data = t_file.read()
    name_info['md5'] = hashlib.md5(data).hexdigest() # pylint: disable=E1101

    self.upload(o_path, data, name_info)
    self.cleanup(t_path)
    self.cleanup(t_path2)

    logger.info('to_s3_tarballs finished:\n\t input: %s\n\toutput: %s' % (i_str, o_path))
    ## return the final output path
    return o_path
def __call__(self, t_path, name_info, i_str):
    '''
    Export a tarball from the chunk at t_path and move it atomically
    into the configured local output location.

    :param t_path: local path of the temporary chunk file
    :param name_info: dict of substitution values for the output_name
        template; updated in place with get_name_info() results
    :param i_str: input string, used for naming
    :returns: the local output path, or None when the chunk is empty
    :raises OSError: when the rename fails for any reason other than
        a cross-device link
    '''
    name_info.update(get_name_info(t_path, i_str=i_str))

    # an empty chunk produces no output file
    if name_info['num'] == 0:
        return None

    o_fname = self.config['output_name'] % name_info
    o_dir = self.config['output_path']
    o_path = os.path.join(o_dir, o_fname + '.tar.gz')

    ## if dir is missing make it
    dirname = os.path.dirname(o_path)
    if dirname and not os.path.exists(dirname):
        os.makedirs(dirname)

    t_path2 = tarball_export(self.config, t_path, name_info)

    ## do an atomic renaming
    try:
        logger.debug('attemping os.rename(%r, %r)' % (t_path2, o_path))
        os.rename(t_path2, o_path)
    except OSError as exc:
        if exc.errno == 18:
            # errno 18 (EXDEV): source and destination are on different
            # filesystems, so an atomic rename is impossible -- fall
            # back to the copy-then-remove helper
            patient_move(t_path2, o_path)
        else:
            # fix: the message is now actually logged (it was built and
            # then discarded while the bare traceback was logged
            # instead) and no longer falsely claims shutil.copy2 was
            # attempted; also fixed traceback.format_exc(exc), which
            # misused the exception object as the 'limit' argument
            msg = 'failed os.rename(%r, %r)\n%s' \
                % (t_path2, o_path, traceback.format_exc())
            logger.critical(msg)
            raise
    # fix: return the final output path, consistent with the other
    # writer callables in this module (previously fell off the end
    # and returned None on success)
    return o_path
def __call__(self, t_path, name_info, i_str):
    '''
    Load chunk from t_path and put it into the right place in s3
    using the output_name template from the config.

    :param t_path: local path of the temporary chunk file
    :param name_info: dict of substitution values for the output_name
        template; updated in place with get_name_info() results and
        's3_output_path'
    :param i_str: input string, used for naming and logging
    :returns: None when the chunk is empty; otherwise loops until the
        put (and optional verify) succeeds
    '''
    name_info.update(get_name_info(t_path, i_str=i_str))

    # an empty chunk produces no output at all
    if name_info['num'] == 0:
        return None

    o_fname = self.config['output_name'] % name_info
    o_path = os.path.join(self.config['s3_path_prefix'], o_fname + '.sc.xz')
    if self.config.get('gpg_encryption_key_path'):
        # encrypted output gets an extra suffix
        o_path += '.gpg'
    name_info['s3_output_path'] = o_path
    logger.info('to_s3_chunks: \n\t%r\n\tfrom: %r\n\tby way of %r ' % (o_path, i_str, t_path))

    ## forcibly collect dereferenced objects
    #gc.collect()

    ## compress and encrypt
    logger.info('key path: %r', self.config.get('gpg_encryption_key_path'))
    _errors, t_path2 = compress_and_encrypt_path(
        t_path,
        self.config.get('gpg_encryption_key_path'),
        gpg_recipient=self.config.get('gpg_recipient'),
        tmp_dir=self.config['tmp_dir_path'],
    )
    logger.info('\n'.join(_errors))

    # fix: binary mode plus explicit close -- the original bare open()
    # leaked the handle and used text mode on compressed binary data
    with open(t_path2, 'rb') as t_file:
        data = t_file.read()
    logger.debug('compressed size: %d' % len(data))

    # retry the put until an optional HTTP verification succeeds;
    # deliberately unbounded, matching the original behavior
    while 1:
        start_time = time.time()
        self.put(o_path, data)
        elapsed = time.time() - start_time
        if elapsed > 0:
            logger.debug('put %.1f bytes/second' % (len(data) / elapsed))
        if self.config['verify_via_http']:
            try:
                start_time = time.time()
                # NOTE(review): name_info['md5'] is read here but never
                # assigned in this block -- presumably set by
                # get_name_info or compress_and_encrypt_path; confirm
                self.verify(o_path, name_info['md5'])
                elapsed = time.time() - start_time
                if elapsed > 0:
                    logger.debug('verify %.1f bytes/second' % (len(data) / elapsed))
                break
            except Exception as exc:
                logger.critical('verify_via_http failed so retrying: %r' % exc)
                ## keep looping if verify raises anything
                continue
        else:
            ## not verifying, so don't attempt multiple puts
            break
def test_get_name_info(tmpdir):
    '''get_name_info() must return internally consistent date/time fields.'''
    chunk_path = str(tmpdir.join('test_path'))
    chunk = Chunk(chunk_path, mode='wb')
    chunk.add(make_stream_item(28491, 'abs_url'))

    info = get_name_info(chunk_path, i_str='foo')
    combined = info['date_time_now']
    # the date portion is the first ten characters of the combined stamp
    assert info['date_now'] == combined[:10]
    # date and time joined by '-' reproduce the combined stamp exactly
    assert combined == '-'.join([info['date_now'], info['time_now']])
def __call__(self, t_path, name_info, i_str):
    '''
    Load chunk from t_path and put it into the right place in s3
    using the output_name template from the config.

    :param t_path: local path of the temporary chunk file
    :param name_info: dict of substitution values for the output_name
        template; updated in place with get_name_info() results and
        's3_output_path'
    :param i_str: input string, used for naming and logging
    :returns: None when the chunk is empty; otherwise loops until the
        put (and optional verify) succeeds
    '''
    name_info.update(get_name_info(t_path, i_str=i_str))

    # empty chunks generate no output
    if name_info['num'] == 0:
        return None

    o_fname = self.config['output_name'] % name_info
    o_path = os.path.join(self.config['s3_path_prefix'], o_fname + '.sc.xz')
    if self.config.get('gpg_encryption_key_path'):
        # encrypted output gets an extra suffix
        o_path += '.gpg'
    name_info['s3_output_path'] = o_path
    logger.info('to_s3_chunks: \n\t%r\n\tfrom: %r\n\tby way of %r ' % (o_path, i_str, t_path))

    ## forcibly collect dereferenced objects
    #gc.collect()

    ## compress and encrypt
    logger.info('key path: %r', self.config.get('gpg_encryption_key_path'))
    _errors, t_path2 = compress_and_encrypt_path(
        t_path,
        self.config.get('gpg_encryption_key_path'),
        gpg_recipient=self.config.get('gpg_recipient'),
        tmp_dir=self.config['tmp_dir_path'],
    )
    logger.info('\n'.join(_errors))

    # fix: read in binary mode with a deterministic close -- the
    # original bare open() leaked the file object
    with open(t_path2, 'rb') as t_file:
        data = t_file.read()
    logger.debug('compressed size: %d' % len(data))

    # retry the put until an optional HTTP verification succeeds;
    # deliberately unbounded, matching the original behavior
    while 1:
        start_time = time.time()
        self.put(o_path, data)
        elapsed = time.time() - start_time
        if elapsed > 0:
            logger.debug('put %.1f bytes/second' % (len(data) / elapsed))
        if self.config['verify_via_http']:
            try:
                start_time = time.time()
                # NOTE(review): name_info['md5'] is read here but never
                # assigned in this block -- presumably set by
                # get_name_info or compress_and_encrypt_path; confirm
                self.verify(o_path, name_info['md5'])
                elapsed = time.time() - start_time
                if elapsed > 0:
                    logger.debug('verify %.1f bytes/second' % (len(data) / elapsed))
                break
            except Exception as exc:
                logger.critical('verify_via_http failed so retrying: %r' % exc)
                ## keep looping if verify raises anything
                continue
        else:
            ## not verifying, so don't attempt multiple puts
            break
def __call__(self, t_path, name_info, i_str):
    # Move the temporary chunk file t_path to its final local location,
    # named according to the 'output_type'/'output_name' config settings.
    # Returns the output path, or None when the chunk is empty.
    o_type = self.config['output_type']

    name_info.update( get_name_info( t_path, i_str=i_str ) )

    # an empty chunk produces no output file
    if name_info['num'] == 0:
        return None

    if 'input' in self.config['output_name']:
        # derive 'input_fname' from the basename of i_str, stripping
        # any .gpg, .xz, .sc suffixes in that order
        i_fname = i_str.split('/')[-1]
        if i_fname.endswith('.gpg'):
            i_fname = i_fname[:-4]
        if i_fname.endswith('.xz'):
            i_fname = i_fname[:-3]
        if i_fname.endswith('.sc'):
            i_fname = i_fname[:-3]
        name_info['input_fname'] = i_fname

    ## prepare to compress the output
    compress = self.config.get('compress', None)

    if o_type == 'samedir':
        ## assume that i_str was a local path
        assert i_str[-3:] == '.sc', repr(i_str[-3:])
        o_path = i_str[:-3] + '-%s.sc' % self.config['output_name']
        if compress:
            o_path += '.xz'
        #print 'creating %s' % o_path
    elif o_type == 'inplace':
        ## replace the input chunks with the newly created
        o_path = i_str
        if o_path.endswith('.xz'):
            # an .xz input implies the replacement must be compressed too
            compress = True
    elif o_type == 'otherdir':
        ## put the output chunk in the configured output_path,
        ## resolved relative to the current working directory
        if not self.config['output_path'].startswith('/'):
            o_dir = os.path.join(os.getcwd(), self.config['output_path'])
        else:
            o_dir = self.config['output_path']
        if not os.path.exists(o_dir):
            os.makedirs(o_dir)
        o_fname = self.config['output_name'] % name_info
        o_path = os.path.join(o_dir, o_fname + '.sc')
        if compress:
            o_path += '.xz'
    # NOTE(review): any o_type other than the three handled above leaves
    # o_path unbound and the logging below raises NameError -- confirm
    # o_type is validated upstream
    logger.info('writing chunk file to {}'.format(o_path))
    logger.debug('temporary chunk in {}'.format(t_path))

    ## if dir is missing make it
    dirname = os.path.dirname(o_path)
    if dirname and not os.path.exists(dirname):
        os.makedirs(dirname)

    if compress:
        assert o_path.endswith('.xz'), o_path
        logger.info('compress_and_encrypt_path(%r, tmp_dir=%r)',
                    t_path, self.config['tmp_dir_path'])

        ## forcibly collect dereferenced objects
        #gc.collect()

        errors, t_path2 = streamcorpus.compress_and_encrypt_path(
            t_path, tmp_dir=self.config['tmp_dir_path'])
        assert not errors, errors

    # NOTE(review): t_path2 is only bound in the compress branch above;
    # if compress is falsy, both paths below raise NameError -- confirm
    # compress is always truthy by the time execution reaches here
    if self.config['cleanup_tmp_files']:
        # default action, move tmp file to output position
        try:
            logger.debug('attempting renamed(%r, %r)', t_path2, o_path)
            os.rename(t_path2, o_path)
            logger.debug('renamed(%r, %r)', t_path2, o_path)
        except OSError, exc:
            if exc.errno==18:
                # errno 18 (EXDEV): cross-filesystem rename is not
                # possible, so fall back to the copy-then-remove helper
                logger.debug('resorting to patient_move(%r, %r)',
                             t_path2, o_path, exc_info=True)
                patient_move(t_path2, o_path)
                logger.debug('patient_move succeeded')
            else:
                logger.critical('rename failed (%r -> %r)', t_path2, o_path, exc_info=True)
                raise
        return o_path
    else:
        # for debugging, leave temp file, copy to output
        shutil.copy(t_path2, o_path)
        logger.info('copied %r -> %r', t_path2, o_path)
        return o_path