class SingleEvaluation(luigi.WrapperTask):
    """Wrapper task requesting the evaluation tasks of a single iteration.

    For every entry of ``data_eval`` it requires one ``SplitModi`` task for
    the samples containing a '+' and three ``PartnerReport`` tasks
    ('groundtruth', 'validation', 'training') for the samples without one.
    """

    iteration = luigi.IntParameter(default=186000)
    path = luigi.Parameter(
        default='/nrs/saalfeld/heinrichl/synapses/pre_and_post/pre_and_post-v9.0/run01/')
    data_eval = luigi.TupleParameter(default=('data2016-aligned', 'data2016-unaligned'))
    samples = luigi.TupleParameter(default=('A', 'B', 'C', 'A+', 'B+', 'C+'))

    def requires(self):
        """Yield the ``SplitModi`` and ``PartnerReport`` tasks to schedule."""
        for de in self.data_eval:
            # Each group is only requested when one of its canonical sample
            # names is present; the members are then picked by the '+' sign.
            if any(name in self.samples for name in ('A+', 'B+', 'C+')):
                test_samples = tuple(s for s in self.samples if '+' in s)
                yield SplitModi(self.iteration, self.path, de, 'groundtruth',
                                test_samples, self.data_eval)
            if any(name in self.samples for name in ('A', 'B', 'C')):
                training_samples = tuple(s for s in self.samples if '+' not in s)
                for mode in ('groundtruth', 'validation', 'training'):
                    yield PartnerReport(self.iteration, self.path, de, mode,
                                        training_samples, self.data_eval)
class SgovCompaniesParse(luigi.Task):
    """Parse every input file into ``CompanieRow`` records and save them as CSV.

    ``self.directory`` and ``self.name`` are read but not declared in this
    class; they are expected to be provided elsewhere.
    """

    sheets = luigi.TupleParameter(default=None)
    skiptop = luigi.TupleParameter(default=None)
    usecolumns = luigi.TupleParameter(default=None)

    def output(self):
        """Return the local CSV target built from directory, name and a date suffix."""
        csv_path = build_file_path(self.directory, self.name, '.csv',
                                   suff=today(FILENAME_DATE_FORMAT))
        return luigi.LocalTarget(csv_path)

    def run(self):
        """Parse each input in turn, save its rows and report progress."""
        for position, target in enumerate(self.input(), start=1):
            self.set_status_message('Parsing {}'.format(target.path))
            parsed = parse(target.path, CompanieRow,
                           skiprows=self.skiptop, sheets=self.sheets)
            records = [attr.astuple(row) for row in parsed]
            save_csvrows(self.output().path, records)
            total = len(self.input())
            self.set_progress_percentage(round(position * 100 / total))
class LeaveOneOutEvaluation(luigi.Task):
    """Evaluate every leave-one-out prediction against the home locations.

    The output is one TSV file per ``name`` to which the standard output of
    ``snlocest.scripts.evaluate_prf`` is appended for each combination of
    edge type and method.
    """

    name = luigi.Parameter()
    edgetypes = luigi.TupleParameter(default=('linked', 'mutual', 'followee', 'follower'))
    methods = luigi.TupleParameter(default=('mv', 'gm', 'pm', 'rn'))

    def requires(self):
        """Yield the ground truth first, then one prediction per combination."""
        yield HomeLocation(name=self.name)
        for edgetype in self.edgetypes:
            for method in self.methods:
                yield LeaveOneOutPrediction(name=self.name, edgetype=edgetype, method=method)

    def output(self):
        """Return the evaluation TSV target for this ``name``."""
        output_path = os.path.join('data/experiments/loocv/evaluation',
                                   '{}.tsv'.format(self.name))
        return luigi.LocalTarget(output_path)

    def run(self):
        """Run the evaluation script once per prediction, appending its output."""
        inputs = self.input()
        truth = inputs[0]
        with self.output().temporary_path() as temp_output_path:
            for result in inputs[1:]:
                # An argument list plus an explicit file handle replaces the
                # former shell string, so paths containing spaces or shell
                # metacharacters are passed through unchanged.  The file is
                # opened per result so nothing is created without results.
                with open(temp_output_path, 'a') as sink:
                    run(['python', '-m', 'snlocest.scripts.evaluate_prf',
                         truth.path, result.path],
                        stdout=sink, check=True)
class Crop(luigi.Task):
    """Copy a fixed region of each prediction dataset into a cropped dataset.

    For every sample the datasets 'clefts', 'pre_dist' and 'post_dist' of
    ``<sample>.n5`` are cropped with the ``offsets``/``shapes`` tables (names
    defined outside this class) and written to '*_cropped' datasets.  An
    empty 'crop.msg' file marks completion.
    """

    it = luigi.IntParameter()
    path = luigi.Parameter()
    de = luigi.Parameter()
    samples = luigi.TupleParameter()
    data_eval = luigi.TupleParameter()
    resources = {'ram': 50}

    @property
    def priority(self):
        """Return ``1 / it`` for non-zero multiples of 10000, otherwise ``0``."""
        it = int(self.it)
        # `it` is tested for truthiness first so that iteration 0 cannot
        # raise ZeroDivisionError.
        if it and it % 10000 == 0:
            return 1. / it
        return 0.

    def requires(self):
        """Require the prediction this task crops."""
        return Predict(self.it, self.path, self.samples, self.data_eval)

    def output(self):
        """Return the 'crop.msg' marker target next to the cropped containers."""
        return luigi.LocalTarget(
            os.path.join(os.path.dirname(self.input()[0].fn), self.de, 'crop.msg'))

    def run(self):
        """Crop all datasets of all samples, then write the marker file."""
        progress = 0.
        self.set_progress_percentage(progress)
        # The tables are indexed by sample name and then by this flag.
        aligned = 'unaligned' not in self.de
        datasets_src = ['clefts', 'pre_dist', 'post_dist']
        datasets_tgt = ['clefts_cropped', 'pre_dist_cropped', 'post_dist_cropped']
        for s in self.samples:
            filename = os.path.join(os.path.dirname(self.input()[0].fn), self.de, s + '.n5')
            off = offsets[s][aligned]
            sh = shapes[s][aligned]
            f = z5py.File(filename, use_zarr_format=False)
            for dss, dst in zip(datasets_src, datasets_tgt):
                # Chunks may not exceed the cropped shape.
                chunk_size = tuple(min(c, shi) for c, shi in zip(f[dss].chunks, sh))
                f.create_dataset(dst,
                                 shape=sh,
                                 compression='gzip',
                                 dtype=f[dss].dtype,
                                 chunks=chunk_size)
                bb = tuple(slice(o, o + shi) for o, shi in zip(off, sh))
                f[dst][:] = f[dss][bb]
                f[dst].attrs['offset'] = off[::-1]
                progress += 100. / (len(self.samples) * len(datasets_src))
                try:
                    self.set_progress_percentage(progress)
                except Exception:
                    # Progress reporting is best effort and must not abort
                    # the crop; interrupts are no longer swallowed.
                    pass
        # Create the empty marker file.
        with self.output().open('w'):
            pass
class AllEvaluations(luigi.WrapperTask):
    """Wrapper task requiring a ``PartnerReport`` for every configuration.

    Iterations run from ``iteration_step`` up to and including
    ``up_to_iteration``; only iterations above 20000 are requested.
    """

    up_to_iteration = luigi.IntParameter(default=200000)
    iteration_step = luigi.IntParameter(default=10000, significant=False)
    data_train = luigi.TupleParameter(default=("data2016-aligned", "data2016-unaligned"))
    data_eval = luigi.TupleParameter(default=("data2017-aligned", "data2017-unaligned"))
    augmentation = luigi.TupleParameter(default=("deluxe", "classic", "lite"))
    mode = luigi.TupleParameter(default=("validation", "training"))
    samples = luigi.TupleParameter(default=("A", "B", "C"))

    def requires(self):
        """Yield one ``PartnerReport`` per combination of the parameters."""
        first = self.iteration_step
        stop = self.up_to_iteration + self.iteration_step
        for it in range(first, stop, self.iteration_step):
            if it <= 20000:
                # Early iterations are not evaluated.
                continue
            for dt in self.data_train:
                for aug in self.augmentation:
                    for de in self.data_eval:
                        for m in self.mode:
                            yield PartnerReport(
                                it, dt, aug, de, m, self.samples, self.data_eval
                            )
class Crop(luigi.Task):
    """Copy a fixed region of each prediction dataset into a cropped dataset.

    For every sample the datasets "clefts", "pre_dist" and "post_dist" of
    ``<sample>.n5`` are cropped with the ``offsets``/``shapes`` tables (names
    defined outside this class) and written to "*_cropped" datasets.  An
    empty "crop.msg" file marks completion.
    """

    it = luigi.IntParameter()
    path = luigi.Parameter()
    de = luigi.Parameter()
    samples = luigi.TupleParameter()
    data_eval = luigi.TupleParameter()
    resources = {"ram": 50}

    @property
    def priority(self):
        """Return ``1 / it`` for non-zero multiples of 10000, otherwise ``0``."""
        it = int(self.it)
        # `it` is tested for truthiness first so that iteration 0 cannot
        # raise ZeroDivisionError.
        if it and it % 10000 == 0:
            return 1.0 / it
        return 0.0

    def requires(self):
        """Require the prediction this task crops."""
        return Predict(self.it, self.path, self.samples, self.data_eval)

    def output(self):
        """Return the "crop.msg" marker target next to the cropped containers."""
        return luigi.LocalTarget(
            os.path.join(os.path.dirname(self.input()[0].fn), self.de, "crop.msg")
        )

    def run(self):
        """Crop all datasets of all samples, then write the marker file."""
        progress = 0.0
        self.set_progress_percentage(progress)
        # The tables are indexed by sample name and then by this flag.
        aligned = "unaligned" not in self.de
        datasets_src = ["clefts", "pre_dist", "post_dist"]
        datasets_tgt = ["clefts_cropped", "pre_dist_cropped", "post_dist_cropped"]
        for s in self.samples:
            filename = os.path.join(
                os.path.dirname(self.input()[0].fn), self.de, s + ".n5"
            )
            off = offsets[s][aligned]
            sh = shapes[s][aligned]
            f = zarr.open(filename, mode="a")
            for dss, dst in zip(datasets_src, datasets_tgt):
                # Chunks may not exceed the cropped shape.
                chunk_size = tuple(min(c, shi) for c, shi in zip(f[dss].chunks, sh))
                f.create_dataset(
                    name=dst,
                    shape=sh,
                    compressor=numcodecs.GZip(6),
                    dtype=f[dss].dtype,
                    chunks=chunk_size,
                )
                bb = tuple(slice(o, o + shi) for o, shi in zip(off, sh))
                f[dst][:] = f[dss][bb]
                f[dst].attrs["offset"] = off[::-1]
                progress += 100.0 / (len(self.samples) * len(datasets_src))
                try:
                    self.set_progress_percentage(progress)
                except Exception:
                    # Progress reporting is best effort and must not abort
                    # the crop; interrupts are no longer swallowed.
                    pass
        # Create the empty marker file.
        with self.output().open("w"):
            pass
class A(luigi.Task):
    """Task whose completeness check compares ``t`` with ``expected``."""

    task_namespace = 'mynamespace'
    t = luigi.TupleParameter(default=((1, 2), (3, 4)))
    expected = luigi.TupleParameter()

    def complete(self):
        """Return True when ``t`` equals ``expected``; raise ValueError otherwise."""
        if self.t == self.expected:
            return True
        raise ValueError
class SingleEvaluation(luigi.WrapperTask):
    """Wrapper task requesting the evaluation tasks of a single iteration.

    For every entry of ``data_eval`` it requires one ``SplitModi`` task for
    the samples containing a "+" and three ``PartnerReport`` tasks
    ("groundtruth", "validation", "training") for the samples without one.
    """

    iteration = luigi.IntParameter(default=186000)
    path = luigi.Parameter(
        default="/nrs/saalfeld/heinrichl/synapses/pre_and_post/pre_and_post-v9.0/run01/"
    )
    data_eval = luigi.TupleParameter(default=("data2016-aligned", "data2016-unaligned"))
    samples = luigi.TupleParameter(default=("A", "B", "C", "A+", "B+", "C+"))

    def requires(self):
        """Yield the ``SplitModi`` and ``PartnerReport`` tasks to schedule."""
        report_modes = ("groundtruth", "validation", "training")
        for de in self.data_eval:
            # Each group is only requested when one of its canonical sample
            # names is present; the members are then picked by the "+" sign.
            if any(name in self.samples for name in ("A+", "B+", "C+")):
                test_samples = tuple(s for s in self.samples if "+" in s)
                yield SplitModi(
                    self.iteration,
                    self.path,
                    de,
                    "groundtruth",
                    test_samples,
                    self.data_eval,
                )
            if any(name in self.samples for name in ("A", "B", "C")):
                training_samples = tuple(s for s in self.samples if "+" not in s)
                for mode in report_modes:
                    yield PartnerReport(
                        self.iteration,
                        self.path,
                        de,
                        mode,
                        training_samples,
                        self.data_eval,
                    )
class Threshold(luigi.Task):
    """Binarise the cropped cleft prediction of every sample.

    For each sample and each threshold a uint8 dataset
    "clefts_cropped_thr<t>" is written that is 1 where "clefts_cropped"
    exceeds the threshold.  An empty "thr.msg" file marks completion.
    """

    it = luigi.IntParameter()
    dt = luigi.Parameter()
    aug = luigi.Parameter()
    de = luigi.Parameter()
    samples = luigi.TupleParameter()
    data_eval = luigi.TupleParameter()
    resources = {"ram": 50}

    @property
    def priority(self):
        """Return ``1 / it`` for non-zero multiples of 10000, otherwise ``0``."""
        it = int(self.it)
        # `it` is tested for truthiness first so that iteration 0 cannot
        # raise ZeroDivisionError.
        if it and it % 10000 == 0:
            return 1.0 / it
        return 0.0

    def requires(self):
        """Require the cropped predictions."""
        return Crop(self.it, self.dt, self.aug, self.de, self.samples, self.data_eval)

    def output(self):
        """Return the "thr.msg" marker target in the directory of the input."""
        return luigi.LocalTarget(
            os.path.join(os.path.dirname(self.input().fn), "thr.msg"))

    def run(self):
        """Write the thresholded datasets, then the marker file."""
        thrs = [127, 42]
        progress = 0.0
        self.set_progress_percentage(progress)
        dataset_src = "clefts_cropped"
        dataset_tgt = "clefts_cropped_thr{0:}"
        for s in self.samples:
            filename = os.path.join(os.path.dirname(self.input().fn), s + ".n5")
            f = zarr.open(filename, mode="a")
            # Read the source volume once instead of once per threshold.
            source = f[dataset_src][:]
            for t in thrs:
                target_name = dataset_tgt.format(t)
                f.empty(
                    name=target_name,
                    shape=f[dataset_src].shape,
                    compressor=numcodecs.GZip(6),
                    dtype="uint8",
                    chunks=f[dataset_src].chunks,
                )
                f[target_name][:] = (source > t).astype(np.uint8)
                f[target_name].attrs["offset"] = f[dataset_src].attrs["offset"]
            progress += 100.0 / len(self.samples)
            try:
                self.set_progress_percentage(progress)
            except Exception:
                # Progress reporting is best effort and must not abort the
                # task; interrupts are no longer swallowed.
                pass
        # Create the empty marker file.
        with self.output().open("w"):
            pass
class Threshold(luigi.Task):
    """Binarise the cropped cleft prediction of every sample.

    For each sample and each threshold a uint8 dataset
    'clefts_cropped_thr<t>' is written that is 1 where 'clefts_cropped'
    exceeds the threshold.  An empty 'thr.msg' file marks completion.
    """

    it = luigi.IntParameter()
    dt = luigi.Parameter()
    aug = luigi.Parameter()
    de = luigi.Parameter()
    samples = luigi.TupleParameter()
    data_eval = luigi.TupleParameter()
    resources = {'ram': 50}

    @property
    def priority(self):
        """Return ``1 / it`` for non-zero multiples of 10000, otherwise ``0``."""
        it = int(self.it)
        # `it` is tested for truthiness first so that iteration 0 cannot
        # raise ZeroDivisionError.
        if it and it % 10000 == 0:
            return 1. / it
        return 0.

    def requires(self):
        """Require the cropped predictions."""
        return Crop(self.it, self.dt, self.aug, self.de, self.samples, self.data_eval)

    def output(self):
        """Return the 'thr.msg' marker target in the directory of the input."""
        return luigi.LocalTarget(
            os.path.join(os.path.dirname(self.input().fn), 'thr.msg'))

    def run(self):
        """Write the thresholded datasets, then the marker file."""
        thrs = [127, 42]
        progress = 0.
        self.set_progress_percentage(progress)
        dataset_src = 'clefts_cropped'
        dataset_tgt = 'clefts_cropped_thr{0:}'
        for s in self.samples:
            filename = os.path.join(os.path.dirname(self.input().fn), s + '.n5')
            f = z5py.File(filename, use_zarr_format=False)
            # Read the source volume once instead of once per threshold.
            source = f[dataset_src][:]
            for t in thrs:
                target_name = dataset_tgt.format(t)
                f.create_dataset(target_name,
                                 shape=f[dataset_src].shape,
                                 compression='gzip',
                                 dtype='uint8',
                                 chunks=f[dataset_src].chunks)
                f[target_name][:] = (source > t).astype(np.uint8)
                f[target_name].attrs['offset'] = f[dataset_src].attrs['offset']
            progress += 100. / len(self.samples)
            try:
                self.set_progress_percentage(progress)
            except Exception:
                # Progress reporting is best effort and must not abort the
                # task; interrupts are no longer swallowed.
                pass
        # Create the empty marker file.
        with self.output().open('w'):
            pass
class PartnerReport(luigi.Task):
    """Score the predicted synaptic partners of every sample.

    For each sample the test annotations are compared with the ground truth
    through ``SynapticPartners.fscore`` and the returned statistics are
    collected in a JSON file named 'partners.<m>.json'.
    """

    it = luigi.IntParameter()
    dt = luigi.Parameter()
    aug = luigi.Parameter()
    de = luigi.Parameter()
    m = luigi.Parameter()
    samples = luigi.TupleParameter()
    data_eval = luigi.TupleParameter()
    resources = {'ram': 50}

    @property
    def priority(self):
        """Return ``1 / it`` for non-zero multiples of 10000, otherwise ``0``."""
        it = int(self.it)
        # `it` is tested for truthiness first so that iteration 0 cannot
        # raise ZeroDivisionError.
        if it and it % 10000 == 0:
            return 1. / it
        return 0.

    def requires(self):
        """Require the split prediction files."""
        return SplitModi(self.it, self.dt, self.aug, self.de, self.m,
                         self.samples, self.data_eval)

    def output(self):
        """Return the JSON report target in the directory of the input."""
        return luigi.LocalTarget(
            os.path.join(os.path.dirname(self.input().fn),
                         'partners.' + self.m + '.json'))

    def run(self):
        """Evaluate every sample and dump the per-sample results as JSON."""
        progress = 0.
        results = dict()
        self.set_progress_percentage(progress)
        for s in self.samples:
            truth_path = os.path.join(
                '/groups/saalfeld/saalfeldlab/larissa/data/cremieval/',
                self.de, s + '.' + self.m + '.h5')
            test_path = os.path.join(os.path.dirname(self.input().fn),
                                     s + '.' + self.m + '.h5')
            truth = CremiFile(truth_path, 'a')
            test = CremiFile(test_path, 'a')
            synaptic_partners_eval = SynapticPartners()
            # Read the test annotations once and reuse them for the debug
            # print and the evaluation.
            test_annotations = test.read_annotations()
            print(test_annotations)
            fscore, precision, recall, fp, fn, filtered_matches = \
                synaptic_partners_eval.fscore(test_annotations,
                                              truth.read_annotations(),
                                              truth.read_neuron_ids(),
                                              all_stats=True)
            results[s] = {
                'fscore': fscore,
                'precision': precision,
                'recall': recall,
                'fp': fp,
                'fn': fn,
                'filtered_matches': filtered_matches,
            }
            progress += 100. / len(self.samples)
            try:
                self.set_progress_percentage(progress)
            except Exception:
                # Progress reporting is best effort and must not abort the
                # evaluation; interrupts are no longer swallowed.
                pass
        with self.output().open('w') as done:
            json.dump(results, done)
class FollowFilteredEdgelist(luigi.Task):
    '''Edge list without the edges whose left-hand side is an unknown user,
    leaving only data from users that have a home location assigned.

    Args:
        --name  name used in the save path so that the LocationUserList and
                UnknownList can be identified
        --month
    '''

    month = luigi.MonthParameter()
    name = luigi.Parameter()
    type = luigi.ChoiceParameter(choices=['followers', 'following'])
    sources = luigi.TupleParameter(default=('followers', 'following'))

    def requires(self):
        '''Require the raw edge list and the remaining home locations.'''
        raw_edges = TwitterFollowRawEdgelist(month=self.month, type=self.type)
        home_locations = RemainedHomeLocation(name=self.name, month=self.month)
        return {'edgelist': raw_edges, 'hl': home_locations}

    def output(self):
        '''Return the gzipped TSV target for this name, month and type.'''
        filename = self.month.strftime('%Y%m_{}.tsv.gz'.format(self.type))
        return luigi.LocalTarget(
            os.path.join(NETWORK_DIR, 'filtered', self.name, filename))

    def run(self):
        '''Pipe the raw edge list through the edge filter into the output.'''
        template = 'zcat {edgelist.path} | python -m snlocest.scripts.edgefilter -i {hl.path} | gzip > {}'
        with self.output().temporary_path() as temp_output_path:
            cmd = template.format(temp_output_path, **self.input())
            run(cmd, shell=True, check=True)
class LibRadarHeatmap(luigi.Task):
    """Render the LibRadar matrix of an application as a PDF heatmap."""

    apks = luigi.TupleParameter()
    pkg = luigi.Parameter()
    app_info_folder = cfg.info_app_folder

    def get_app_info(self):
        """Load and return the application info JSON from the input target."""
        with self.input()['app_info'].open() as data_file:
            app_info = json.load(data_file)
        return app_info

    def requires(self):
        """Require the LibRadar matrix and the application info JSON file."""
        appinfo_file = os.path.join(self.app_info_folder, self.pkg,
                                    self.pkg + '.json')
        matrix_task = LibRadarMatrix(pkg=self.pkg, apks=self.apks)
        info_task = ExternalFile(file_name=appinfo_file)
        return {'matrix': matrix_task, 'app_info': info_task}

    def output(self):
        """Return the target of the heatmap PDF."""
        pdf_name = self.pkg + ".pdf"
        return ExternalFileTarget(os.path.join(cfg.libradar_heatmap_folder, pdf_name))

    def create_heatmap(self, data, row_labels, col_labels):
        """Plot ``data`` as a heatmap and save it to the output path."""
        pdata = pd.DataFrame(data, index=row_labels, columns=col_labels)
        pdata.index.name = "libraries"
        pdata.columns.name = "Versions"

        # TODO put this in all heatmap creation. refactor code
        # Rows are only clustered when there is more than one of them.
        row_cluster = data.shape[0] > 1

        # Column colours are derived from the external application info.
        col_colors = heatmaps.get_col_colors(col_labels, self.get_app_info())

        splot = heatmaps.plot_heatmap(pdata,
                                      vmax=pdata.values.max(),
                                      col_colors=col_colors,
                                      row_cluster=row_cluster,
                                      annot=False)

        target_dir = os.path.dirname(self.output().path)
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)
        splot.savefig(os.path.abspath(self.output().path), format='pdf')

    def run(self):
        """Read the matrix JSON and create the heatmap from it."""
        with self.input()['matrix'].open() as data_file:
            payload = json.load(data_file)

        self.create_heatmap(numpy.array(payload['m']), payload['yl'], payload['xl'])
class MakeItFolder(luigi.ExternalTask):
    """Create the evaluation folders of one iteration.

    One directory ``<base>/evaluation/<it>/<de>`` is created per entry of
    ``data_eval``; the directory of the last entry serves as the output.
    """

    it = luigi.IntParameter()
    dt = luigi.IntParameter()
    aug = luigi.Parameter()
    data_eval = luigi.TupleParameter()

    @property
    def priority(self):
        """Use the iteration number as the priority."""
        return self.it

    def requires(self):
        """Require the checkpoint of this iteration."""
        return CheckCheckpoint(self.it, self.dt, self.aug)

    def output(self):
        """Return the folder of the last ``data_eval`` entry as target."""
        base = os.path.dirname(self.input()[0].fn)
        return luigi.LocalTarget(
            os.path.join(base, "evaluation", str(self.it), self.data_eval[-1])
        )

    def run(self):
        """Create one evaluation folder per ``data_eval`` entry."""
        base = os.path.dirname(self.input()[0].fn)
        for de in self.data_eval:
            # exist_ok avoids the check-then-create race when several
            # workers create the same folder concurrently.
            os.makedirs(
                os.path.join(base, "evaluation", str(self.it), de), exist_ok=True
            )
class DummyS3CopyJSONToTableBase(luigi.contrib.redshift.S3CopyJSONToTable):
    """Dummy ``S3CopyJSONToTable`` subclass configured with placeholder values."""

    # Class attributes taken from `DummyPostgresImporter` in
    # `../postgres_test.py`.
    aws_access_key_id = AWS_ACCESS_KEY
    aws_secret_access_key = AWS_SECRET_KEY

    host = 'dummy_host'
    database = 'dummy_database'
    user = '******'
    password = '******'
    table = luigi.Parameter(default='dummy_table')
    columns = luigi.TupleParameter(
        default=(
            ('some_text', 'varchar(255)'),
            ('some_int', 'int'),
        )
    )

    copy_options = ''
    prune_table = ''
    prune_column = ''
    prune_date = ''

    jsonpath = ''
    copy_json_options = ''

    def s3_load_path(self):
        """Return the S3 URL built from the ``BUCKET`` and ``KEY`` constants."""
        return 's3://{}/{}'.format(BUCKET, KEY)
class WideRecommender(ClassifierWithTransferLearningKerasModelTraining):
    """Keras model with one dense layer as wide as the input and a sigmoid output."""

    input_shape: Tuple[int, int] = luigi.TupleParameter(default=(100, ))
    batch_size: int = luigi.IntParameter(default=10)
    learning_rate = luigi.FloatParameter(default=1e-5)
    dense_layers: List[int] = luigi.ListParameter(default=[512, 512])
    dropout: float = luigi.FloatParameter(default=None)
    activation_function: str = luigi.ChoiceParameter(
        choices=KERAS_ACTIVATION_FUNCTIONS.keys(), default="relu")
    kernel_initializer: str = luigi.ChoiceParameter(
        choices=KERAS_WEIGHT_INIT.keys(), default="glorot_uniform")

    def create_base_model(self) -> Model:
        """Build and return the model named 'Wide'."""
        x_input = Input(shape=self.input_shape, name='wide_inp')

        # The hidden layer has as many units as the first input dimension.
        wide_layer = Dense(self.input_shape[0],
                           activation=self.activation_function,
                           kernel_initializer=self.kernel_initializer,
                           name='wide_mlp')
        sigmoid_layer = Dense(1,
                              activation='sigmoid',
                              kernel_initializer=self.kernel_initializer)

        output = sigmoid_layer(wide_layer(x_input))
        return Model(x_input, output, name='Wide')

    def create_model_with(self, base_model: Model) -> Model:
        """Return the base model unchanged."""
        return base_model
class ParseElasticApi(luigi.Task):
    """Load the data of every report version and save it as CSV rows."""

    name = luigi.Parameter(default='')
    version = luigi.Parameter(default='')
    versions = luigi.TupleParameter(default='')
    rep_name = luigi.Parameter(default='')
    chunk_size = luigi.IntParameter(default=CHUNK_SIZE)
    api_key = luigi.Parameter(default=DGOV_API_KEY)
    struct = luigi.Parameter(default=None)
    columns_filter = luigi.DictParameter(default=None)

    def output(self):
        """Return the CSV target in ``TMP_DIR`` named after ``name``."""
        csv_path = build_fpath(TMP_DIR, self.name, 'csv')
        return luigi.LocalTarget(csv_path)

    def run(self):
        """Fetch each version's data page and pass its rows to ``save_csvrows``."""
        query_body = QUERY_TMPL.format(0, self.chunk_size)
        query = '{' + query_body + '}'
        rep_url = build_url_for_report_page(self.rep_name)

        # Fall back to the versions listed on the report page when none
        # were passed in.
        versions = self.versions or load_versions(rep_url)

        for vs in versions:
            url = build_url_for_data_page(self.rep_name, self.api_key,
                                          version=vs, query=query)
            rows = load_data(url, self.struct, self.columns_filter)
            save_csvrows(self.output().path, rows)
class RemainedHomeLocation(luigi.Task):
    '''Subtract the users that became unknown from the created home-location
    data (LocationUserList), intersect the result with the user list for
    which the social network was collected, and save it.

    Args:
        --homelocation-path  path to the home-location data file
    '''

    name = luigi.Parameter()
    month = luigi.MonthParameter()
    sources = luigi.TupleParameter(default=('followers', 'following'))
    homelocation_path = luigi.Parameter()

    def requires(self):
        '''Require the unknown list, the location user list and the seed users.'''
        unknown = UnknownList(month=self.month, sources=self.sources)
        userlist = LocationUserList(path=self.homelocation_path)
        seed = SeedUserList(month=self.month)
        return {'unknown': unknown, 'userlist': userlist, 'seed': seed}

    def output(self):
        '''Return the ground-truth target named like the user list file.'''
        userlist_name = os.path.basename(self.input()['userlist'].path)
        return luigi.LocalTarget(
            os.path.join('data/datasets', self.name, 'groundtruth', userlist_name))

    def run(self):
        '''Run the two edge-filter passes and write the result atomically.'''
        cmd = 'cat {userlist.path} | python -m snlocest.scripts.edgefilter -e {unknown.path} | python -m snlocest.scripts.edgefilter -i {seed.path} > {}'
        with self.output().temporary_path() as temp_output_path:
            command_line = cmd.format(temp_output_path, **self.input())
            run(command_line, shell=True, check=True)
class MLPClassifier(ClassifierWithTransferLearningKerasModelTraining):
    """Keras multi-layer perceptron with a single sigmoid output unit."""

    input_shape: Tuple[int, int] = luigi.TupleParameter(default=(100, ))
    batch_size: int = luigi.IntParameter(default=10)
    learning_rate = luigi.FloatParameter(default=1e-5)
    dense_layers: List[int] = luigi.ListParameter(default=[512, 512])
    dropout: float = luigi.FloatParameter(default=None)
    activation_function: str = luigi.ChoiceParameter(
        choices=KERAS_ACTIVATION_FUNCTIONS.keys(), default="relu")
    kernel_initializer: str = luigi.ChoiceParameter(
        choices=KERAS_WEIGHT_INIT.keys(), default="glorot_uniform")

    def create_base_model(self) -> Model:
        """Build and return the model named 'BaseMLP'."""
        x_input = Input(shape=self.input_shape)

        first_width = self.dense_layers[0]
        remaining_widths = self.dense_layers[1:]

        hidden = Dense(first_width,
                       activation=self.activation_function,
                       kernel_initializer=self.kernel_initializer)(x_input)

        for width in remaining_widths:
            hidden = Dense(width,
                           activation=self.activation_function,
                           kernel_initializer=self.kernel_initializer)(hidden)
            # NOTE(review): dropout is applied after every hidden layer past
            # the first one - confirm against the original layout.
            if self.dropout:
                hidden = Dropout(self.dropout)(hidden)

        output = Dense(1, activation='sigmoid')(hidden)
        return Model(x_input, output, name='BaseMLP')

    def create_model_with(self, base_model: Model) -> Model:
        """Return the base model unchanged."""
        return base_model
class CleftReport(luigi.Task):
    """Score the thresholded cleft prediction of every sample.

    For each sample the 'clefts_cropped_thr127' prediction is compared with
    the ground truth through ``Clefts`` and the false positive/negative
    counts and distances are collected in 'cleft.<m>.json'.
    """

    it = luigi.IntParameter()
    dt = luigi.Parameter()
    aug = luigi.Parameter()
    de = luigi.Parameter()
    m = luigi.Parameter()
    samples = luigi.TupleParameter()
    data_eval = luigi.TupleParameter()
    resources = {'ram': 10}

    @property
    def priority(self):
        """Return ``1 / it`` for non-zero multiples of 10000, otherwise ``0``."""
        it = int(self.it)
        # `it` is tested for truthiness first so that iteration 0 cannot
        # raise ZeroDivisionError.
        if it and it % 10000 == 0:
            return 1. / it
        return 0.

    def requires(self):
        """Require the thresholded predictions."""
        return Threshold(self.it, self.dt, self.aug, self.de, self.samples,
                         self.data_eval)

    def output(self):
        """Return the JSON report target in the directory of the input."""
        cleftrep = os.path.join(os.path.dirname(self.input().fn),
                                'cleft.' + self.m + '.json')
        return luigi.LocalTarget(cleftrep)

    def run(self):
        """Evaluate every sample and dump the per-sample results as JSON."""
        progress = 0.
        self.set_progress_percentage(progress)
        results = dict()
        thr = 127
        for s in self.samples:
            testfile = os.path.join(os.path.dirname(self.input().fn), s + '.n5')
            truthfile = os.path.join(
                '/groups/saalfeld/saalfeldlab/larissa/data/cremieval/',
                self.de, s + '.n5')
            test_container = z5py.File(testfile, use_zarr_format=False)
            # The ground truth container holds both labels and mask, so it
            # is opened only once.
            truth_container = z5py.File(truthfile, use_zarr_format=False)
            test = np.array(test_container['clefts_cropped_thr' + str(thr)][:])
            truth = np.array(truth_container['volumes/labels/clefts_cropped'][:])
            mask = np.array(
                truth_container['volumes/masks/' + self.m + '_cropped'][:])
            # `Clefts` receives the inverted mask.
            clefts_evaluation = Clefts(test, truth, np.logical_not(mask))
            results[s] = {
                'false negatives count': clefts_evaluation.count_false_negatives(),
                'false positives count': clefts_evaluation.count_false_positives(),
                'false negative distance': clefts_evaluation.acc_false_negatives(),
                'false positive distance': clefts_evaluation.acc_false_positives(),
            }
            progress += 100. / len(self.samples)
            try:
                self.set_progress_percentage(progress)
            except Exception:
                # Progress reporting is best effort and must not abort the
                # evaluation; interrupts are no longer swallowed.
                pass
        with self.output().open('w') as done:
            json.dump(results, done)
class LinkwaglOutputs(luigi.Task):
    """
    Link all the multifile outputs from wagl into a single file.
    """

    level1 = luigi.Parameter()
    work_root = luigi.Parameter()
    granule = luigi.OptionalParameter(default="")
    acq_parser_hint = luigi.OptionalParameter(default="")
    workflow = luigi.EnumParameter(enum=Workflow)
    vertices = luigi.TupleParameter(default=(5, 5))
    pixel_quality = luigi.BoolParameter()
    method = luigi.EnumParameter(enum=Method, default=Method.SHEAR)
    dsm_fname = luigi.Parameter(significant=False)
    buffer_distance = luigi.FloatParameter(default=8000, significant=False)

    def requires(self):
        """Yield one ``DataStandardisation`` task per supported group."""
        container = acquisitions(self.level1, self.acq_parser_hint)
        for group in container.supported_groups:
            kwargs = {
                "level1": self.level1,
                "work_root": self.work_root,
                "granule": self.granule,
                "group": group,
                "workflow": self.workflow,
                "vertices": self.vertices,
                "pixel_quality": self.pixel_quality,
                "method": self.method,
                "dsm_fname": self.dsm_fname,
                "buffer_distance": self.buffer_distance,
            }
            yield DataStandardisation(**kwargs)

    def output(self):
        """Return the ``<granule>.h5`` target beside the work root."""
        out_fname = pjoin(dirname(self.work_root), "{}.h5".format(self.granule))
        return luigi.LocalTarget(out_fname)

    def run(self):
        """Link every top-level group of each ``.h5`` file under the work root."""
        with self.output().temporary_path() as out_fname:
            for root, _, files in os.walk(self.work_root):
                # skip any private files; startswith() also copes with an
                # empty basename, which occurs when the walked path ends
                # with a path separator
                if basename(root).startswith("_"):
                    continue

                for file_ in files:
                    if splitext(file_)[1] != ".h5":
                        continue

                    fname = pjoin(root, file_)
                    grp_name = basename(dirname(fname.replace(self.work_root, "")))

                    # Collect the group names, then close the file before
                    # the links are created.
                    with h5py.File(fname, "r") as fid:
                        groups = list(fid)

                    for pth in groups:
                        new_path = ppjoin(self.granule, grp_name, pth)
                        create_external_link(fname, pth, out_fname, new_path)

            with h5py.File(out_fname, "a") as fid:
                fid.attrs["level1_uri"] = self.level1
class AncillaryData(luigi.Task):
    """Get all ancillary data."""

    level1 = luigi.Parameter()
    work_root = luigi.Parameter(significant=False)
    granule = luigi.OptionalParameter(default="")
    vertices = luigi.TupleParameter()
    workflow = luigi.EnumParameter(enum=Workflow)
    acq_parser_hint = luigi.OptionalParameter(default="")
    aerosol = luigi.DictParameter({"user": 0.05}, significant=False)
    brdf = luigi.DictParameter()
    ozone_path = luigi.Parameter(significant=False)
    water_vapour = luigi.DictParameter({"user": 1.5}, significant=False)
    dem_path = luigi.Parameter(significant=False)
    ecmwf_path = luigi.Parameter(significant=False)
    invariant_height_fname = luigi.Parameter(significant=False)
    compression = luigi.EnumParameter(
        enum=H5CompressionFilter, default=H5CompressionFilter.LZF, significant=False
    )
    filter_opts = luigi.DictParameter(default=None, significant=False)

    def requires(self):
        """Require the satellite and solar grids of the first supported group."""
        container = acquisitions(self.level1, self.acq_parser_hint)
        first_group = container.supported_groups[0]
        return CalculateSatelliteAndSolarGrids(
            self.level1, self.work_root, self.granule, first_group
        )

    def output(self):
        """Return the ``ancillary.h5`` target inside the work root."""
        return luigi.LocalTarget(pjoin(self.work_root, "ancillary.h5"))

    def run(self):
        """Collect the ancillary data into the output file."""
        container = acquisitions(self.level1, self.acq_parser_hint)
        grn = container.get_granule(granule=self.granule, container=True)

        nbar_paths = dict(
            aerosol_dict=self.aerosol,
            water_vapour_dict=self.water_vapour,
            ozone_path=self.ozone_path,
            dem_path=self.dem_path,
            brdf_dict=self.brdf,
        )

        # The ECMWF path is only handed over for the STANDARD and SBT
        # workflows.
        needs_sbt = self.workflow in (Workflow.STANDARD, Workflow.SBT)
        sbt_path = self.ecmwf_path if needs_sbt else None

        with self.output().temporary_path() as out_fname:
            _collect_ancillary(
                grn,
                self.input().path,
                nbar_paths,
                sbt_path,
                self.invariant_height_fname,
                self.vertices,
                out_fname,
                self.compression,
                self.filter_opts,
            )
class AllEvaluations(luigi.WrapperTask):
    """Wrapper task requiring a ``PartnerReport`` for every configuration.

    Iterations run from ``iteration_step`` up to and including
    ``up_to_iteration``; only iterations above 20000 are requested.
    """

    up_to_iteration = luigi.IntParameter(default=200000)
    iteration_step = luigi.IntParameter(default=10000, significant=False)
    data_train = luigi.TupleParameter(default=('data2016-aligned', 'data2016-unaligned'))
    data_eval = luigi.TupleParameter(default=('data2017-aligned', 'data2017-unaligned'))
    augmentation = luigi.TupleParameter(default=('deluxe', 'classic', 'lite'))
    mode = luigi.TupleParameter(default=('validation', 'training'))
    samples = luigi.TupleParameter(default=('A', 'B', 'C'))

    def requires(self):
        """Yield one ``PartnerReport`` per combination of the parameters."""
        first = self.iteration_step
        stop = self.up_to_iteration + self.iteration_step
        for it in range(first, stop, self.iteration_step):
            if it <= 20000:
                # Early iterations are not evaluated.
                continue
            for dt in self.data_train:
                for aug in self.augmentation:
                    for de in self.data_eval:
                        for m in self.mode:
                            yield PartnerReport(it, dt, aug, de, m,
                                                self.samples, self.data_eval)
class DataStandardisation(luigi.Task):
    """
    Runs the standardised product workflow.
    """

    level1 = luigi.Parameter()
    outdir = luigi.Parameter()
    granule = luigi.OptionalParameter(default='')
    workflow = luigi.EnumParameter(enum=Workflow, default=Workflow.STANDARD)
    vertices = luigi.TupleParameter(default=(5, 5))
    method = luigi.EnumParameter(enum=Method, default=Method.SHEAR)
    pixel_quality = luigi.BoolParameter()
    land_sea_path = luigi.Parameter()
    aerosol = luigi.DictParameter(default={'user': 0.05})
    brdf = luigi.DictParameter()
    ozone_path = luigi.Parameter(significant=False)
    water_vapour = luigi.DictParameter(default={'user': 1.5}, significant=False)
    dem_path = luigi.Parameter(significant=False)
    ecmwf_path = luigi.Parameter(significant=False)
    invariant_height_fname = luigi.Parameter(significant=False)
    dsm_fname = luigi.Parameter(significant=False)
    modtran_exe = luigi.Parameter(significant=False)
    tle_path = luigi.Parameter(significant=False)
    rori = luigi.FloatParameter(default=0.52, significant=False)
    compression = luigi.EnumParameter(enum=H5CompressionFilter,
                                      default=H5CompressionFilter.LZF,
                                      significant=False)
    filter_opts = luigi.DictParameter(default=None, significant=False)
    acq_parser_hint = luigi.OptionalParameter(default='')
    buffer_distance = luigi.FloatParameter(default=8000, significant=False)
    h5_driver = luigi.OptionalParameter(default='', significant=False)
    normalized_solar_zenith = luigi.FloatParameter(default=45.0)

    def output(self):
        """Return the ``<label>.wagl.h5`` target inside ``outdir``.

        The label is the granule when one is set, otherwise the basename of
        ``level1``.
        """
        label = self.granule or basename(self.level1)
        out_fname = '{label}.wagl.h5'.format(label=label)
        return luigi.LocalTarget(pjoin(self.outdir, out_fname))

    def run(self):
        """Call ``card4l`` with the task parameters, writing to a temporary path."""
        # The ECMWF path is only handed over for the STANDARD and SBT
        # workflows.
        needs_ecmwf = self.workflow in (Workflow.STANDARD, Workflow.SBT)
        ecmwf_path = self.ecmwf_path if needs_ecmwf else None

        with self.output().temporary_path() as out_fname:
            card4l(
                self.level1,
                self.granule,
                self.workflow,
                self.vertices,
                self.method,
                self.pixel_quality,
                self.land_sea_path,
                self.tle_path,
                self.aerosol,
                self.brdf,
                self.ozone_path,
                self.water_vapour,
                self.dem_path,
                self.dsm_fname,
                self.invariant_height_fname,
                self.modtran_exe,
                out_fname,
                ecmwf_path,
                self.rori,
                self.buffer_distance,
                self.compression,
                self.filter_opts,
                self.h5_driver,
                self.acq_parser_hint,
                self.normalized_solar_zenith,
            )
class ParseCompanies(luigi.Task):
    """Parse every input file into ``Row`` records and save them as CSV.

    ``self.name`` is read but not declared in this class; it is expected to
    be provided elsewhere.
    """

    sheets = luigi.TupleParameter(default=None)
    skiptop = luigi.TupleParameter(default=None)

    def output(self):
        """Return the CSV target in ``TMP_DIR`` named after ``name``."""
        csv_path = build_fpath(TMP_DIR, self.name, 'csv')
        return luigi.LocalTarget(csv_path)

    def run(self):
        """Parse each input in turn, save its rows and report progress."""
        for position, target in enumerate(self.input(), start=1):
            self.set_status_message('Parsing {}'.format(target.path))
            parsed = parse(target.path, Row,
                           skiprows=self.skiptop, sheets=self.sheets)
            records = [attr.astuple(row) for row in parsed]
            save_csvrows(self.output().path, records)
            total = len(self.input())
            self.set_progress_percentage(round(position * 100 / total))
class ParseBigElasticApi(BigDataToCsv):
    """Parameter declarations for a ``BigDataToCsv`` task.

    The class only declares parameters; all behaviour comes from the base
    class.
    """

    # Declaration order is kept because Luigi uses it for positional arguments.
    name = luigi.Parameter(default='')
    version = luigi.Parameter(default='')
    versions = luigi.TupleParameter(default='')
    rep_name = luigi.Parameter(default='')

    chunk_size = luigi.IntParameter(default=CHUNK_SIZE)
    api_key = luigi.Parameter(default=DGOV_API_KEY)

    struct = luigi.Parameter(default=None)
    columns_filter = luigi.DictParameter(default=None)
class InterpolateCoefficients(luigi.Task):
    """
    Issues InterpolateCoefficient tasks.
    This acts as a helper task, and links the results from each
    InterpolateCoefficient task into a single HDF5 file.
    """

    vertices = luigi.TupleParameter()
    workflow = luigi.EnumParameter(enum=Workflow)
    method = luigi.EnumParameter(enum=Method, default=Method.SHEAR)

    def requires(self):
        """Return the tasks keyed by ``(band_name, coefficient)``."""
        container = acquisitions(self.level1, self.acq_parser_hint)
        acqs = container.get_acquisitions(group=self.group, granule=self.granule)

        # NBAR & SBT acquisitions
        nbar_acqs = [a for a in acqs if a.band_type == BandType.REFLECTIVE]
        sbt_acqs = [a for a in acqs if a.band_type == BandType.THERMAL]

        tasks = {}
        for coefficient in self.workflow.atmos_coefficients:
            # NBAR coefficients use the reflective bands, all others the
            # thermal bands.
            is_nbar = coefficient in Workflow.NBAR.atmos_coefficients
            band_acqs = nbar_acqs if is_nbar else sbt_acqs

            for acq in band_acqs:
                tasks[(acq.band_name, coefficient)] = InterpolateCoefficient(
                    level1=self.level1,
                    work_root=self.work_root,
                    granule=self.granule,
                    group=self.group,
                    band_name=acq.band_name,
                    coefficient=coefficient,
                    workflow=self.workflow,
                    vertices=self.vertices,
                    method=self.method,
                )
        return tasks

    def output(self):
        """Return the ``interpolated-coefficients.h5`` target of the group."""
        return luigi.LocalTarget(
            pjoin(self.work_root, self.group, 'interpolated-coefficients.h5'))

    def run(self):
        """Link the interpolated data of all inputs into the output file."""
        fnames = {key: target.path for key, target in self.input().items()}

        with self.output().temporary_path() as out_fname:
            link_interpolated_data(fnames, out_fname)
class CommonDomains(luigi.Task):
    """Match the parsed URLs of each release against its recorded domains.

    Per version the result lists the domains without a matching URL, the
    URLs not consumed by a match, the domains found in both, and the size
    of each list.
    """

    apks = luigi.TupleParameter()
    pkg = luigi.Parameter()
    domains_folder = cfg.dynamic_bro_analysis_folder

    def get_domains_file_path(self, version, date):
        """Return the path of 'domains.txt' for the given version and date."""
        return os.path.join(self.domains_folder,
                            self.pkg,
                            self.pkg + '_' + version + '_' + date,
                            'domains.txt')

    # requires json of single releases
    def requires(self):
        """Require one ``StringoidParse`` task per apk."""
        return [StringoidParse(file_name=apk) for apk in self.apks]

    # output is the json file with aggregated info of the app
    def output(self):
        """Return the target of the aggregated JSON file."""
        output_file = os.path.join(cfg.stringoid_commondomains_folder,
                                   self.pkg + "_commondomains.json")
        return ExternalFileTarget(output_file)

    def run(self):
        """Build the per-version comparison and write it as sorted JSON."""
        commondomains = {}
        for i in self.input():
            with open(i.path) as url_file:
                urls = json.load(url_file)

            # Version and date are the last two '_'-separated parts of the
            # input path.
            version = i.path.split("_")[-2]
            date = i.path.split("_")[-1].replace(".json", "")
            domains_path = self.get_domains_file_path(version, date)

            # A real list is needed: it is mutated below and serialised as
            # JSON, neither of which a Python 3 keys view supports.
            urls_list = list(urls)
            domains_list = []
            both = []

            with open(domains_path) as domain_file:
                for line in domain_file:
                    domain = line.strip()
                    # First URL containing the domain, if any.
                    matched_url = next(
                        (url for url in urls if domain in url), None)
                    if matched_url is None:
                        domains_list.append(domain)
                        continue
                    both.append(domain)
                    # Several domains may hit the same URL; it can only be
                    # removed once.
                    if matched_url in urls_list:
                        urls_list.remove(matched_url)

            commondomains[version] = {
                "domains": domains_list,
                "urls": urls_list,
                "both": both,
                "domains_number": len(domains_list),
                "urls_number": len(urls_list),
                "both_number": len(both),
            }

        for i in self.input():
            i.cleanup()

        with self.output().open('w') as f:
            json.dump(commondomains, f, sort_keys=True)
class LoadJSON(index_util.LoadJSONBase):
    """``LoadJSONBase`` configuration for the 'devicerecall' index."""

    batch = luigi.TupleParameter()
    last_update_date = luigi.Parameter()

    index_name = 'devicerecall'
    mapping_file = './schemas/device_recall_mapping.json'
    docid_key = 'product_res_number'

    use_checksum = True
    optimize_index = False

    def _data(self):
        """Return the ``AnnotateWeeklyBatch`` task for this batch."""
        annotate_task = AnnotateWeeklyBatch(self.batch)
        return annotate_task
class ClosedDataset(luigi.WrapperTask):
    '''Create the closed_[name] dataset.'''

    name = luigi.Parameter()
    sources = luigi.TupleParameter(default=('linked', 'mutual', 'followee', 'follower'))

    def requires(self):
        '''Return the ground-truth copy followed by one closed network per source.'''
        dst_name = 'closed_' + self.name
        tasks = [CopyGroundtruth(src_name=self.name, dst_name=dst_name)]
        tasks.extend(
            ClosedNetwork(src_name=self.name, dst_name=dst_name, source=s)
            for s in self.sources)
        return tasks