def run(self):
    """Copy the hyperspectral image, ground truth, and ground-truth names
    from S3 into the local working directory.

    Each object is read in full from ``{S3_ROOT}{name}`` and written
    byte-for-byte to ``{LOCAL_PATH}{name}`` (Nop format, no decoding).
    """
    LOCAL_PATH = os.getcwd() + self.LOCAL_DIR  # I need to do this trick to facilitate pytest run
    print("====================================", LOCAL_PATH)

    # The three transfers were copy/paste-identical except for the object
    # name, so do them in one loop: image, ground truth, ground-truth names.
    for name in (self.image, self.gt, self.gt_names):
        with S3Target(path=f'{self.S3_ROOT}{name}',
                      format=luigi.format.Nop).open('r') as src:
            payload = src.read()
        with LocalTarget(path=f'{LOCAL_PATH}{name}',
                         format=luigi.format.Nop).open('w') as dst:
            dst.write(payload)
def output(self):
    """Return both report locations on S3: the canonical name and a
    timestamped copy."""
    stamped = 's3://data-observatory/observatory-{timestamp}.pdf'.format(
        timestamp=self.timestamp)
    return [
        S3Target('s3://data-observatory/observatory.pdf'),
        S3Target(stamped),
    ]
def output(self):
    """Per-experiment CSV targets on S3; 'annotations' is included only
    when annotation was requested."""
    prefix = '{}/{}/'.format(cfg['S3_BUCKET'], self.expt_id)
    names = ['est_counts', 'tpm']
    if self.annot:
        names.append('annotations')
    return {name: S3Target(prefix + name + '.csv') for name in names}
def output(self):
    """Date/time-stamped pickle target for the random forest model."""
    filename = "{date:%Y/%m/%d/random_forest_T%H%M%S.pkl}".format(date=self.date)
    return S3Target(s3.path(S3.MODELS + filename), client=s3.create_client())
def get_target(path):
    """
    Factory method to create a Luigi Target from a path string.

    Supports the following Target types:

    * S3Target: s3://my-bucket/my-path
    * LocalTarget: /path/to/file or file:///path/to/file

    :type path: str
    :param path: s3 or file URL, or local path
    :rtype: Target:
    :returns: Target for path string
    """
    if path.startswith('s3:'):
        return S3Target(path)
    if path.startswith('file://'):
        # Strip the URL scheme prefix and use the remainder as a plain path.
        return LocalTarget(path[len('file://'):])
    if path.startswith('/'):
        return LocalTarget(path)
    raise RuntimeError("Unknown scheme for path: %s" % path)
def test_read_iterator_long(self):
    # Shrink the boto buffer, upload a three-line file whose first line is
    # 5x that buffer, and check line iteration still yields exact lines.
    saved_buffer = key.Key.BufferSize
    key.Key.BufferSize = 2
    try:
        tmp = tempfile.NamedTemporaryFile(mode='wb', delete=False)
        local_path = tmp.name
        line1 = ''.zfill(key.Key.BufferSize * 5) + os.linesep
        line2 = 'line two' + os.linesep
        line3 = 'line three' + os.linesep
        tmp.write((line1 + line2 + line3).encode('utf-8'))
        tmp.close()

        client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)
        create_bucket()
        remote_path = 's3://psetbucket/largetempfile'
        client.put(local_path, remote_path)

        target = S3Target(remote_path, client=client)
        with target.open() as handle:
            lines = list(handle)
    finally:
        key.Key.BufferSize = saved_buffer

    self.assertEqual(3, len(lines))
    self.assertEqual(line1, lines[0])
    self.assertEqual(line2, lines[1])
    self.assertEqual(line3, lines[2])
def output(self):
    """Date/time-stamped pickle target for the gradient boosting model."""
    filename = "{date:%Y/%m/%d/gradient_boosting_T%H%M%S.pkl}".format(date=self.date)
    return S3Target(s3.path(S3.MODELS + filename), client=s3.create_client())
def output(self):
    """Date/time-stamped pickle target for the logistic regression model."""
    filename = "{date:%Y/%m/%d/logistic_regression_T%H%M%S.pkl}".format(date=self.date)
    return S3Target(s3.path(S3.MODELS + filename), client=s3.create_client())
def requires(self):
    """Dependency: the raw 'test' object on S3, read as bytes (Nop format)."""
    # Removed a commented-out NamedTemporaryFile experiment that was left
    # behind here; dead code belongs in version control history, not comments.
    return S3Target(path='test', format=Nop)
def test_read_iterator_long(self):
    # Line buffering check: first line is 5x the (shrunken) boto buffer,
    # followed by two short lines, the last without a trailing newline.
    saved_buffer = key.Key.BufferSize
    key.Key.BufferSize = 2
    try:
        tmp = tempfile.NamedTemporaryFile(mode='wb', delete=False)
        local_path = tmp.name
        line1 = ''.zfill(key.Key.BufferSize * 5) + os.linesep
        payload = line1 + 'line two' + os.linesep + 'line three'
        tmp.write(payload.encode('utf-8'))
        tmp.close()

        client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)
        client.s3.create_bucket('mybucket')
        client.put(local_path, 's3://mybucket/largetempfile')

        target = S3Target('s3://mybucket/largetempfile', client=client)
        with target.open() as handle:
            lines = list(handle)
    finally:
        key.Key.BufferSize = saved_buffer

    self.assertEqual(3, len(lines))
    self.assertEqual(line1, lines[0])
    self.assertEqual("line two" + os.linesep, lines[1])
    self.assertEqual("line three", lines[2])
def get_target(cls, scheme, path, fragment, username, password, hostname, port,
               query, **kwargs):
    """Build an S3Target from parsed-URL components.

    ``kwargs`` override entries from ``query``; the merged mapping is passed
    through as S3Target keyword arguments.
    """
    # Merge into a fresh dict instead of mutating the caller's `query`
    # in place (the original `query.update(kwargs)` was a surprising
    # side effect on the argument).
    options = {**query, **kwargs}
    return S3Target(
        '{scheme}://{hostname}{path}'.format(scheme=scheme,
                                             hostname=hostname,
                                             path=path),
        **options)
def create_target(self, format=None, **kwargs):
    """Create an S3Target on the test bucket, forwarding format/kwargs."""
    s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)
    create_bucket()
    return S3Target(
        's3://mybucket/test_file', client=s3_client, format=format, **kwargs)
def test_read_with_session(self):
    """Reading through a session-token client returns the file contents."""
    client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_SESSION_TOKEN)
    create_bucket()
    client.put(self.tempFilePath, 's3://mybucket/tempfile-with-session')
    t = S3Target('s3://mybucket/tempfile-with-session', client=client)
    # Use a context manager so the handle is closed deterministically
    # (the original left it open).
    with t.open() as read_file:
        file_str = read_file.read()
    self.assertEqual(self.tempFileContents, file_str.encode('utf-8'))
def output(self):
    """S3 target for the image root; logs the keys currently under it.

    NOTE(review): listing and printing inside output() is a side effect that
    runs on every scheduler poll — confirm it is wanted.
    """
    s3_client = boto3.client('s3')
    objects = s3_client.list_objects(Bucket=self.S3_BUCKET, Prefix=self.IMAGE_ROOT)
    # 'Contents' is absent from the response when the prefix matches no
    # keys; default to an empty list instead of raising KeyError.
    for obj in objects.get('Contents', []):
        print('Checking for files in S3: %s' % obj['Key'])
    return S3Target("s3://{}/{}".format(self.S3_BUCKET, self.IMAGE_ROOT),
                    format=luigi.format.Nop)
def test_read(self):
    """A file uploaded with put() reads back with identical contents."""
    client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)
    client.s3.create_bucket('mybucket')
    client.put(self.tempFilePath, 's3://mybucket/tempfile')
    t = S3Target('s3://mybucket/tempfile', client=client)
    # Use a context manager so the handle is closed deterministically
    # (the original left it open).
    with t.open() as read_file:
        file_str = read_file.read()
    self.assertEqual(self.tempFileContents, file_str.encode('utf-8'))
def output(self):
    """Dated UTF-8 CSV target on the wikidata bucket, using the client
    configured in the 's3' config section."""
    s3_settings = config(section='s3')
    target_path = 's3://s3-bucket-wikidata/{}/wikipedia_info_output.csv'.format(
        strftime("%Y-%m-%d"))
    return S3Target(target_path, format=UTF8, client=S3Client(**s3_settings))
def __init__(self, path, *args, **kwargs):
    """Proxy target: a LocalTarget under LOCAL_S3_PATH when that override
    is configured (kwarg or environment), otherwise a real S3Target."""
    self.local_s3_path = kwargs.pop('local_s3_path',
                                    os.getenv('LOCAL_S3_PATH', None))
    if self.local_s3_path:
        # Mirror the s3:// key layout under the local root.
        local_path = os.path.join(self.local_s3_path, path.replace('s3://', ''))
        self._proxy = LocalTarget(local_path, *args, **kwargs)
    else:
        self._proxy = S3Target(path, *args, **kwargs)
def output(self):
    """Paired FASTQ targets produced by the disambiguate step."""
    base = '{}/{}/disambiguate/{}'.format(cfg['S3_BUCKET'],
                                          self.sample_folder,
                                          self.sample_id)
    return {
        'fq1': S3Target(base + '_1.fq.gz'),
        'fq2': S3Target(base + '_2.fq.gz'),
    }
def output(self):
    """Paired FASTQ targets produced by the filtering step."""
    base = '{}/{}/filtered/{}'.format(cfg['S3_BUCKET'],
                                      self.sample_folder,
                                      self.sample_id)
    return [S3Target(base + suffix) for suffix in ('_1.fq.gz', '_2.fq.gz')]
def output(self):
    """Per-ticker parquet target under FinanceData/ on S3."""
    # Removed the commented-out LocalTarget variant that was kept here for
    # development; dead code belongs in version control history.
    return S3Target(
        os.path.join(s3_root, "FinanceData", "{}.parquet".format(self.ticker)))
def output(self):
    """S3 target for the released obs.dump, derived from the input dump
    path; when `force` is set, remove any existing object first so the
    upload runs again."""
    release_path = (self.input().path
                    .replace('tmp/carto/Dump_', 'do-release-')
                    .replace('.dump', '/obs.dump'))
    path = 's3://cartodb-observatory-data/{path}'.format(path=release_path)
    LOGGER.info(path)
    target = S3Target(path)
    if self.force:
        shell('aws s3 rm {output}'.format(output=path))
        self.force = False
    return target
def output(self):
    """FASTQ target(s) for the SRR download: a dict of two paired-end
    targets when layout is 'paired', a single target otherwise."""
    if self.layout == 'paired':
        template = '{folder}/{srr_id}_{pe}.fastq.gz'
        return {
            'fq%d' % pe: S3Target(template.format(folder=self.outpath,
                                                  pe=pe,
                                                  srr_id=self.srr_id))
            for pe in (1, 2)
        }
    return S3Target(
        '{folder}/{srr_id}.fastq.gz'.format(folder=self.outpath,
                                            srr_id=self.srr_id))
def output(self):
    """Train/test parquet targets: the fixed MODELLING copies plus
    date/time-stamped snapshots under MODELS."""
    # Create the client once instead of four times (the original called
    # s3.create_client() per target).
    client = s3.create_client()
    targets = [
        S3Target(s3.path(S3.MODELLING + "train.parquet"), client=client),
        S3Target(s3.path(S3.MODELLING + "test.parquet"), client=client),
    ]
    for name in ("train", "test"):
        stamped = ("{date:%Y/%m/%d/" + name + "_T%H%M%S.parquet}").format(
            date=self.date)
        targets.append(S3Target(s3.path(S3.MODELS + stamped), client=client))
    return targets
def output(self):
    """Kallisto output targets keyed by logical name."""
    base = '{}/{}/kallisto/'.format(cfg['S3_BUCKET'], self.sample_folder)
    return {
        'abundance': S3Target(base + 'abundance.tsv'),
        'h5': S3Target(base + 'abundance.h5'),
        'run_info': S3Target(base + 'run_info.json'),
    }
def _interpret_scheme(full_path): scheme = urllib.parse.urlparse(full_path).scheme if scheme == '' or scheme == 'file': ''' LOCAL FILE ''' return luigi.LocalTarget(full_path) elif scheme == 's3': ''' S3 FILE ''' return S3Target(full_path) assert False
def output(self):
    """
    The output that this Task produces.

    See :ref:`Task.output`

    :rtype: luigi.LocalTarget
    """
    if not self.target.startswith('s3://'):
        return luigi.LocalTarget(self.target + self.dry_run_suffix)
    return S3Target(self.target)
def output(self):
    """FastQC report targets (html + zip) for both paired-end reads."""
    base = '{}/{}/fastqc/'.format(cfg['S3_BUCKET'], self.sample_folder)
    targets = {}
    for pe in (1, 2):
        for ext in ('html', 'zip'):
            fname = '{}_{}_fastqc.{}'.format(self.sample_id, pe, ext)
            targets['{}_{}'.format(ext, pe)] = S3Target(base + fname)
    return targets
def output(self):
    """Disambiguation output targets, keyed by species/category."""
    suffixes = {
        'human': '.disambiguatedSpeciesA.bam',
        'mouse': '.disambiguatedSpeciesB.bam',
        'human_ambiguous': '.ambiguousSpeciesA.bam',
        'mouse_ambiguous': '.ambiguousSpeciesB.bam',
        'summary': '_summary.txt',
    }
    base = '{}{}'.format(self.parameters['outdir'], self.parameters['sample'])
    return {label: S3Target(base + suffix)
            for label, suffix in suffixes.items()}
def run(self):
    """Write a manifest (Redshift-style 'entries' JSON) listing every file
    under each configured S3 folder to this task's output target."""
    entries = []
    for folder_path in self.folder_paths:
        s3_target = S3Target(folder_path)
        client = s3_target.fs
        for file_name in client.list(s3_target.path):
            entries.append({
                'url': '%s/%s' % (folder_path, file_name),
                'mandatory': True
            })
    dump = json.dumps({'entries': entries})
    if not self.text_target:
        dump = dump.encode('utf8')
    # Context manager guarantees the target is closed (flushed/committed)
    # even if the write raises; the original leaked the handle on error.
    with self.output().open('w') as target:
        target.write(dump)
def save_result(data, path):
    """Simulate a slow save by creating an empty object at *path* on S3.

    NOTE(review): *data* is never written — only an empty target is
    created; confirm this placeholder behavior is intended.
    """
    print('Saving result')
    sleep(3)
    with S3Target(path).open('w'):
        pass