class GetCompoundsFromChEMBL(luigi.Task):
    """
    This task will retrieve compounds from ChEMBL and save them in SDF format.

    Compounds will be prefiltered by:
    - logP (lower limit)
    - number of aromatic rings (upper limit)
    - chirality (exact value)
    - molecular weight (upper limit)
    """
    logP = luigi.FloatParameter(default=1.9)
    rings_number = luigi.IntParameter(default=3)
    chirality = luigi.IntParameter(default=(-1))
    mwt = luigi.FloatParameter(default=100.0)

    def requires(self):
        return []

    def run(self):
        molecule = new_client.molecule
        molecule.set_format('sdf')
        mols = molecule.filter(molecule_properties__acd_logp__gte=self.logP) \
            .filter(molecule_properties__aromatic_rings__lte=self.rings_number) \
            .filter(chirality=self.chirality) \
            .filter(molecule_properties__full_mwt__lte=self.mwt)
        with self.output().open('w') as output:
            for mol in mols:
                output.write(mol)
                output.write('$$$$\n')

    def output(self):
        return luigi.LocalTarget('mols_2D.sdf')
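# For reference, a task defined like this can be triggered in-process with luigi.build;
# a minimal sketch, with illustrative overrides of the parameter defaults above
# (not part of the original module):
import luigi

if __name__ == "__main__":
    luigi.build(
        [GetCompoundsFromChEMBL(logP=2.5, rings_number=2, mwt=250.0)],
        local_scheduler=True,
    )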
class Transmission(luigi.Task):
    id = luigi.IntParameter()
    name = luigi.Parameter()
    wavelength = luigi.FloatParameter()  # nm
    theta = luigi.FloatParameter()  # deg

    def requires(self):
        return self.clone(Run)

    def run(self):
        # Update cached theta.
        f = TFile(self.input()[1].fn)
        gps = f.Get("generalParticleSourceMessenger")
        gps.GetEntry(0)
        hits = f.Get("hits")
        # Calculate transmission
        trans = u.ufloat(hits.GetEntries(), np.sqrt(hits.GetEntries()))
        trans = trans / gps.nParticles
        f.Close()
        # Write to pandas DataFrame for later merging.
        df = pd.DataFrame({"wavelength": [self.wavelength], "theta": [self.theta], "t": [trans]})
        df.to_pickle(self.output().fn)

    def output(self):
        return luigi.LocalTarget("./results/%s/wico-%d.pkl" % (self.name, self.id))
class Run(luigi.contrib.external_program.ExternalProgramTask):
    id = luigi.IntParameter()
    name = luigi.Parameter()
    wavelength = luigi.FloatParameter()  # nm
    dz = luigi.FloatParameter()  # mm
    nice_level = luigi.IntParameter(5, significant=False)
    n_particles = luigi.IntParameter(3000, significant=False)
    ug11_filter_thickness = luigi.FloatParameter(1, significant=False)

    def program_args(self):
        # Create macro file.
        mac = self.output()[0]
        if not mac.exists():
            with mac.open("w") as o:
                e = scipy.constants.value("Planck constant in eV s") * scipy.constants.c / (self.wavelength * 1e-9)
                print >> o, "/gps/energy/eMin %.18f eV" % e
                print >> o, "/gps/energy/eMax %.18f eV" % e
                print >> o, "/gps/nParticles %d" % self.n_particles
                print >> o, "/gps/angle/thetaMin 0 deg"
                print >> o, "/gps/angle/thetaMax 0.75 deg"
                print >> o, "/gps/angle/phiMin 0 deg"
                print >> o, "/gps/angle/phiMax 360 deg"
                print >> o, "/g4sipm/digitize/hits 0"
                print >> o, "/run/beamOn 1"
        output = self.output()[1]
        return ["nice", "-n", self.nice_level, "./sim", "--mac", mac.fn, "--output", output.fn,
                "--dz", self.dz, "--ug11-filter-thickness", self.ug11_filter_thickness]

    def output(self):
        return [luigi.LocalTarget("./results/%s/famous-%d.mac" % (self.name, self.id)),
                luigi.LocalTarget("./results/%s/famous-%d.root" % (self.name, self.id))]
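# Quick sanity check of the wavelength-to-energy conversion used when writing the macro
# above; a standalone sketch that reuses the same scipy.constants lookup as the task.
import scipy.constants

wavelength_nm = 450.0
e = scipy.constants.value("Planck constant in eV s") * scipy.constants.c / (wavelength_nm * 1e-9)
# hc is roughly 1239.84 eV*nm, so a 450 nm photon carries about 2.755 eV.
print("%.3f eV" % e)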
class MLPClassifier(ClassifierWithTransferLearningKerasModelTraining):
    input_shape: Tuple[int, int] = luigi.TupleParameter(default=(100, ))
    batch_size: int = luigi.IntParameter(default=10)
    learning_rate = luigi.FloatParameter(default=1e-5)
    dense_layers: List[int] = luigi.ListParameter(default=[512, 512])
    dropout: float = luigi.FloatParameter(default=None)
    activation_function: str = luigi.ChoiceParameter(
        choices=KERAS_ACTIVATION_FUNCTIONS.keys(), default="relu")
    kernel_initializer: str = luigi.ChoiceParameter(
        choices=KERAS_WEIGHT_INIT.keys(), default="glorot_uniform")

    def create_base_model(self) -> Model:
        x_input = Input(shape=self.input_shape)
        mlp = Dense(self.dense_layers[0], activation=self.activation_function,
                    kernel_initializer=self.kernel_initializer)(x_input)
        for dense_neurons in self.dense_layers[1:]:
            mlp = Dense(dense_neurons, activation=self.activation_function,
                        kernel_initializer=self.kernel_initializer)(mlp)
            # model.add(BatchNormalization())
            if self.dropout:
                mlp = Dropout(self.dropout)(mlp)
        output = Dense(1, activation='sigmoid')(mlp)
        model = Model(x_input, output, name='BaseMLP')
        return model

    def create_model_with(self, base_model: Model) -> Model:
        return base_model
class SplitDataset(luigi.Task):
    test_size = luigi.FloatParameter(default=0.1)
    random_state = luigi.FloatParameter(default=12)

    def run(self):
        with self.input().open('r') as f:
            motions = f.readlines()
        train, test = train_test_split(motions, test_size=self.test_size,
                                       random_state=self.random_state)
        with self.output()['train'].open('w') as f:
            f.writelines(train)
        with self.output()['test'].open('w') as f:
            f.writelines(test)

    def output(self):
        data_folder = luigi.configuration.get_config().get('GlobalConfig', 'data_folder')
        return {
            'train': luigi.LocalTarget(os.path.join(data_folder, 'train.txt')),
            'test': luigi.LocalTarget(os.path.join(data_folder, 'test.txt'))
        }
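# SplitDataset.output() reads data_folder from the [GlobalConfig] section of the loaded
# luigi configuration; a minimal luigi.cfg along these lines would satisfy it
# (the path is illustrative, not taken from the original project):
[GlobalConfig]
data_folder = ./data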
class GenerateImageByBounds(luigi.WrapperTask):
    """
    Schedule Download Tasks
    """
    west = luigi.FloatParameter()
    north = luigi.FloatParameter()
    south = luigi.FloatParameter()
    east = luigi.FloatParameter()
    zoom = luigi.IntParameter()
    targetTask = luigi.TaskParameter(default=GenerateImageCSReliefMap)

    def requires(self):
        """
        scheduling tasks
        """
        candidateTasks = [
            GenerateImageCSReliefMap,
            GenerateImageCurvature,
            GenerateImageSlope,
        ]
        if self.targetTask not in candidateTasks:
            raise ValueError("targetTask must be one of the candidate tasks")
        edge_nw_x, edge_nw_y, _, _ = deg_to_num(self.north, self.west, self.zoom)
        edge_se_x, edge_se_y, _, _ = deg_to_num(self.south, self.east, self.zoom)
        # xRange = [edge_nw_x, edge_se_x]
        # yRange = [edge_nw_y, edge_se_y]
        print deg_to_num(self.north, self.west, self.zoom) + deg_to_num(self.south, self.east, self.zoom)
        for tile_x in range(edge_nw_x - 3, edge_se_x + 3):
            for tile_y in range(edge_nw_y - 3, edge_se_y + 3):
                yield self.targetTask(x=tile_x, y=tile_y, z=self.zoom)
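# deg_to_num is not defined in this listing. Under the usual slippy-map tiling convention
# it would look roughly like the sketch below, returning the integer tile indices plus the
# fractional positions inside the tile (matching the four-value unpacking used here and in
# the tasks further down). This is an assumption about its behaviour, not the project's
# actual implementation.
import math

def deg_to_num(lat_deg, lon_deg, zoom):
    lat_rad = math.radians(lat_deg)
    n = 2.0 ** zoom
    x = (lon_deg + 180.0) / 360.0 * n
    y = (1.0 - math.asinh(math.tan(lat_rad)) / math.pi) / 2.0 * n
    return int(x), int(y), x - int(x), y - int(y)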
class GenerateJijiDataSplits(gokart.TaskOnKart):
    task_namespace = "context_nmt"
    jiji_source_path = luigi.Parameter()
    target_path = luigi.Parameter()
    dev_proportion = luigi.FloatParameter()
    test_proportion = luigi.FloatParameter()
    quality_aware = luigi.BoolParameter()
    score_threhold = luigi.FloatParameter(default=0.3)

    def requires(self):
        return MergeJijiFiles(
            source_path=self.jiji_source_path,
            quality_aware=self.quality_aware,
            score_threhold=self.score_threhold,
        )

    def output(self):
        return self.input()

    def run(self):
        documents = self.load()
        test_size = self.test_proportion
        dev_size = self.dev_proportion / (1 - test_size)
        train, test = map(
            dict, train_test_split(list(documents.items()), test_size=test_size))
        train, dev = map(
            dict, train_test_split(list(train.items()), test_size=dev_size))
        if not os.path.isdir(self.target_path):
            os.mkdir(self.target_path)
        for name, split in (("train", train), ("dev", dev), ("test", test)):
            with open(f"{self.target_path}/{name}.json", "w") as target:
                json.dump(split, target, ensure_ascii=False)
class ExtractSegment(luigi.Task):
    task_namespace = 'voxceleb'
    priority = 3

    person = luigi.Parameter()
    video = luigi.Parameter()
    segment = luigi.IntParameter()
    start = luigi.FloatParameter()
    stop = luigi.FloatParameter()

    def requires(self):
        return DownloadAudio(video=self.video)

    def output(self):
        return luigi.LocalTarget(
            data_out_path('segments', 'original', self.person,
                          '{}_{:07d}.wav'.format(self.video, int(self.segment))))

    def run(self):
        ffmpeg = FFmpeg(ffmpeg_bin=Config().ffmpeg_bin)
        with AtomizedLocalTarget(self.output()) as target:
            ffmpeg.extract_segment(self.input().path, str(target.path),
                                   start=self.start, stop=self.stop, timeout=300)
        check_output(self.output().path)
class GeneratePsudoData(gokart.TaskOnKart):
    task_namespace = 'novelty_enhanced_bpr'
    test_size: float = luigi.FloatParameter(default=0.3)
    validation_size: float = luigi.FloatParameter(default=0.1)

    def requires(self):
        item_embed_vector_task = GenerateItemEmbedVectors()
        user_embed_vector_task = GenerateUserEmbedVectors()
        user_item_iteraction_task = GenerateUserItemInteractions(
            item_embed_vector_task=item_embed_vector_task,
            user_embed_vector_task=user_embed_vector_task)
        item_distance_task = GetItemDistance(item_embed_vector_task=item_embed_vector_task)
        return dict(item_distance=item_distance_task,
                    user_item_interaction=user_item_iteraction_task)

    def run(self):
        clicks = self.load('user_item_interaction')
        item_distance = self.load('item_distance')
        clicks_train, clicks_test = train_test_split(clicks, test_size=self.test_size)
        clicks_train, clicks_validation = train_test_split(
            clicks_train, test_size=self.validation_size / (1 - self.test_size))
        self.dump(
            dict(clicks_train=clicks_train,
                 clicks_validation=clicks_validation,
                 clicks_test=clicks_test,
                 item_distance=item_distance))
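# Why the second train_test_split above divides by (1 - test_size): with the defaults
# test_size=0.3 and validation_size=0.1, the first split leaves 70% of the rows in
# clicks_train, so the second split must take 0.1 / 0.7 of those rows for the validation
# set to end up as roughly 10% of the full dataset. A standalone check:
test_size = 0.3
validation_size = 0.1
second_split_fraction = validation_size / (1 - test_size)   # ~0.1429
print(second_split_fraction * (1 - test_size))               # ~0.1 of the whole dataset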
class BICSegmentation(sciluigi.Task, AutoOutput):
    in_segmentation = None
    in_features = None

    penalty_coef = luigi.FloatParameter(default=1.0)
    covariance_type = luigi.Parameter(default='full')
    min_duration = luigi.FloatParameter(default=1.0)
    precision = luigi.FloatParameter(default=0.1)

    def run(self):
        segmenter = pyannote.algorithms.segmentation.bic.BICSegmentation(
            penalty_coef=self.penalty_coef,
            covariance_type=self.covariance_type,
            min_duration=self.min_duration,
            precision=self.precision)
        with self.in_features().open('r') as fp:
            features = pickle.load(fp)
        with self.in_segmentation().open('r') as fp:
            segmentation = pyannote.core.json.load(fp)
        timeline = segmenter.apply(features, segmentation=segmentation)
        annotation = Annotation()
        for s, segment in enumerate(timeline):
            annotation[segment] = s
        with self.out_put().open('w') as fp:
            pyannote.core.json.dump(annotation, fp)
class ItemSimilaritySparkJob(luigi.Task):
    """
    Spark job for running item similarity model
    """
    inputPath = luigi.Parameter(default="/seldon-data/seldon-models/")
    outputPath = luigi.Parameter(default="/seldon-data/seldon-models/")
    client = luigi.Parameter(default="test")
    startDay = luigi.IntParameter(default=1)
    days = luigi.IntParameter(default=1)
    itemType = luigi.IntParameter(-1)
    limit = luigi.IntParameter(default=100)
    minItemsPerUser = luigi.IntParameter(default=0)
    minUsersPerItem = luigi.IntParameter(default=0)
    maxUsersPerItem = luigi.IntParameter(default=2000000)
    dimsumThreshold = luigi.FloatParameter(default=0.1)
    sample = luigi.FloatParameter(default=1.0)

    def output(self):
        return luigi.LocalTarget("{}/{}/item-similarity/{}".format(
            self.outputPath, self.client, self.startDay))

    def run(self):
        params = ["seldon-cli", "model", "--action", "add", "--client-name", self.client,
                  "--model-name", "similar-items", "--inputPath", self.inputPath,
                  "--outputPath", self.outputPath, "--startDay", str(self.startDay),
                  "--days", str(self.days), "--sample", str(self.sample),
                  "--itemType", str(self.itemType), "--limit", str(self.limit),
                  "--minItemsPerUser", str(self.minItemsPerUser),
                  "--minUsersPerItem", str(self.minUsersPerItem),
                  "--maxUsersPerItem", str(self.maxUsersPerItem),
                  "--dimsumThreshold", str(self.dimsumThreshold)]
        res = call(params)
        params = ["seldon-cli", "model", "--action", "train", "--client-name", self.client,
                  "--model-name", "similar-items"]
        res = call(params)
        return res
class OvervoltageSimulation(SimulationMetaTask, luigi.Task):
    name = luigi.Parameter("overvoltage-simulation")
    wavelength = luigi.FloatParameter(450)  # nm
    n_repititions = luigi.IntParameter(1000)
    breakdown_voltage = luigi.FloatParameter(53)  # Volt

    @property
    def run_kwargs(self):
        if hasattr(self, '_cached_run_kwargs'):
            return self._cached_run_kwargs
        # Dice overvoltages.
        overvoltages = np.random.uniform(0, 10, self.n_repititions)
        # Set run_kwargs.
        e = 4.135667516e-15 * 299792458.0 / (self.wavelength * 1e-9)
        run_kwargs = dict(e_min=e, e_max=e, n_particles=10000, digitize_hits=False,
                          persist_hits=True, persist_digis=False)
        self._cached_run_kwargs = [clone(run_kwargs, bias_voltage=(self.breakdown_voltage + ov))
                                   for ov in overvoltages]
        return self._cached_run_kwargs

    def run(self):
        inputs = self.sqlite_from_runs()
        with self.output().open("w") as o:
            for input in inputs:
                con = sqlite3.connect(input.fn)
                cur = con.cursor()
                try:
                    # TODO get angle.
                    temp, vb, vov, f_th = cur.execute(
                        "SELECT temperature, breakdownVoltage, overVoltage, thermalNoiseRate FROM sipmModel;").fetchone()
                    n = cur.execute("SELECT count() FROM `g4sipmHits-0`;").fetchone()[0]
                    print >> o, temp, vb, vov, f_th, n
                except Exception as e:
                    print "Failure in", input.fn
                    print e
class DynamicRangeSimulation(SimulationDynamicMetaTask, luigi.Task):
    name = luigi.Parameter("dynamic-range-simulation-schumacher")
    n_repititions = luigi.IntParameter(1000)
    step = luigi.IntParameter(1000)
    n_min = luigi.IntParameter(1)  # minimum number of photons
    n_max = luigi.IntParameter(1e7)  # maximum number of photons
    t_input = luigi.Parameter("../sample/resources/sawtooth-100ps.properties")
    e_min = luigi.FloatParameter(default=3.061338207066896, significant=False)  # eV (405 nm)
    e_max = luigi.FloatParameter(default=3.061338207066896, significant=False)  # eV (405 nm)

    def run_kwargs(self):
        kwargs = dict(exe="../fast/fast", persist_hits=False, noise_if_no_signal=True,
                      t_input=self.t_input, bias_voltage=56.7)
        # Dice number of particles
        n = np.array(np.ceil(np.exp(np.random.uniform(np.log(self.n_min), np.log(self.n_max), self.step))),
                     dtype="int")
        return [clone(kwargs, n_particles=ni) for ni in n]

    def run_after_yield(self):
        # Open results.
        inputs = self.sqlite_from_runs()
        with self.output().open("w") as o:
            for input in inputs:
                con = sqlite3.connect(input.fn)
                cur = con.cursor()
                try:
                    n_particles, t_min, t_max = cur.execute(
                        "SELECT nParticles, tMin, tMax FROM particleSourceMessenger;").fetchone()
                    n_eff_cells = np.sum(cur.execute(
                        "SELECT weight FROM `g4sipmDigis-0` WHERE time >= %s AND time < %s;" % (t_min, t_max)).fetchall())
                    print >> o, n_particles, n_eff_cells
                except Exception as e:
                    print "Failure in", input.fn
                    print e
class Welch_t_test(luigi.Task):
    multiple_comparison_method = luigi.Parameter(default="bonferroni")
    adjusted_pval = luigi.FloatParameter(default=0.001)
    threshold = luigi.FloatParameter(default=0.3)

    def requires(self):
        return Calculate_relative_frequency_profile()

    def output(self):
        misc_dir = os.path.join(out_dir, "miscellaneous")
        Welch_kmers = os.path.join(misc_dir, "welch_kmers.txt")
        return luigi.LocalTarget(Welch_kmers)

    def run(self):
        # Load input
        with self.input()['pos'].open('r') as fh:
            pos_rel_freq_df = pd.read_csv(fh).set_index('seq_ID')
        with self.input()['neg'].open('r') as fh:
            neg_rel_freq_df = pd.read_csv(fh).set_index('seq_ID')
        welch_kmers = feature_processing.Welch_t_test(
            pos_rel_freq_df, neg_rel_freq_df,
            self.multiple_comparison_method,
            self.adjusted_pval, self.threshold
        )
        with self.output().open('w') as fh:
            for kmer in welch_kmers:
                fh.write(kmer + "\n")
class WideRecommender(ClassifierWithTransferLearningKerasModelTraining):
    input_shape: Tuple[int, int] = luigi.TupleParameter(default=(100, ))
    batch_size: int = luigi.IntParameter(default=10)
    learning_rate = luigi.FloatParameter(default=1e-5)
    dense_layers: List[int] = luigi.ListParameter(default=[512, 512])
    dropout: float = luigi.FloatParameter(default=None)
    activation_function: str = luigi.ChoiceParameter(
        choices=KERAS_ACTIVATION_FUNCTIONS.keys(), default="relu")
    kernel_initializer: str = luigi.ChoiceParameter(
        choices=KERAS_WEIGHT_INIT.keys(), default="glorot_uniform")

    def create_base_model(self) -> Model:
        x_input = Input(shape=self.input_shape, name='wide_inp')
        wide = Dense(self.input_shape[0], activation=self.activation_function,
                     kernel_initializer=self.kernel_initializer, name='wide_mlp')(x_input)
        output = Dense(1, activation='sigmoid', kernel_initializer=self.kernel_initializer)(wide)
        model = Model(x_input, output, name='Wide')
        return model

    def create_model_with(self, base_model: Model) -> Model:
        return base_model
class DownloadBounds(luigi.WrapperTask):
    """
    Schedule Download Tasks
    """
    baseUrl = luigi.Parameter()
    baseName = luigi.Parameter(default="output")
    west = luigi.FloatParameter()
    north = luigi.FloatParameter()
    south = luigi.FloatParameter()
    east = luigi.FloatParameter()
    zoom = luigi.IntParameter()

    def requires(self):
        """
        scheduling tasks
        """
        edge_nw_x, edge_nw_y, _, _ = deg_to_num(self.north, self.west, self.zoom)
        edge_se_x, edge_se_y, _, _ = deg_to_num(self.south, self.east, self.zoom)
        print deg_to_num(self.north, self.west, self.zoom) + deg_to_num(self.south, self.east, self.zoom)
        for tile_x in range(edge_nw_x, edge_se_x + 1):
            for tile_y in range(edge_nw_y, edge_se_y + 1):
                print "scheduling z:{} x:{} y:{}".format(self.zoom, tile_x, tile_y)
                yield DownloadTile(self.baseUrl, self.baseName, tile_x, tile_y, self.zoom)
class LinearBICClustering(sciluigi.Task, AutoOutput):
    in_segmentation = None
    in_features = None

    max_gap = luigi.FloatParameter(default=3600.0)
    penalty_coef = luigi.FloatParameter(default=1.0)
    covariance_type = luigi.Parameter(default='diag')

    def run(self):
        clustering = pyannote.algorithms.clustering.bic.LinearBICClustering(
            max_gap=self.max_gap,
            penalty_coef=self.penalty_coef,
            covariance_type=self.covariance_type)
        with self.in_features().open('r') as fp:
            features = pickle.load(fp)
        with self.in_segmentation().open('r') as fp:
            starting_point = pyannote.core.json.load(fp)
        result = clustering(starting_point, features=features)
        with self.out_put().open('w') as fp:
            pyannote.core.json.dump(result, fp)
class DatasetPairs(luigi.Task):
    """
    Outputs a list of training/testing dataset pairs.
    """
    d = luigi.Parameter()
    tr_bias = luigi.FloatParameter()
    te_bias = luigi.FloatParameter()
    foldidx = luigi.IntParameter()
    outdir = luigi.Parameter()
    tr_frac = luigi.FloatParameter(default=.5)
    random_seed = luigi.IntParameter(default=1234)
    size = luigi.IntParameter(default=1000)

    def requires(self):
        pass

    def run(self):
        # reads original dataset
        data = read_pickle(self.d)
        # creates train and test unbiased datasets
        seed = self.random_seed + self.foldidx
        np.random.seed(seed)
        ogsize = data.X.shape[0]
        data_range = list(range(ogsize))
        np.random.shuffle(data_range)
        spliti = int(ogsize * self.tr_frac)
        tr_split = data_range[:spliti]
        te_split = data_range[spliti:]
        tr_data = Dataset(X=data.X[tr_split], y=data.y[tr_split], z=data.z[tr_split])
        te_data = Dataset(X=data.X[te_split], y=data.y[te_split], z=data.z[te_split])
        # creates biased datasets
        biased_pair = [
            tr_data.make_confounding_dataset(self.tr_bias, self.size),
            te_data.make_confounding_dataset(self.te_bias, self.size),
        ]
        for d in biased_pair:
            d.parent = None
        # writes out datasets
        for d, outpath in zip(biased_pair, self.output()):
            with outpath.open("w") as fd:
                pickle.dump(d, fd)

    def output(self):
        fname = "trbias={:.3f}_tebias={:.3f}_size={}_foldidx={}_trfrac={:.3f}".format(
            self.tr_bias, self.te_bias, self.size, self.foldidx, self.tr_frac)
        fpath = os.path.join(self.outdir, "datapairs", fname)
        fpaths = [_.format(fpath) for _ in ["{}_train.pkl", "{}_test.pkl"]]
        return [luigi.LocalTarget(_, format=luigi.format.Nop) for _ in fpaths]
class AnalysisTask(luigi.Task, ABC):
    """
    Abstract class that requires the completion of dataset selection and model training.
    """
    imgfolder = luigi.Parameter()
    hdffolder = luigi.Parameter()
    modelsfolder = luigi.Parameter()
    target_size = luigi.IntParameter()  # standardizing to square images
    keep_categories = luigi.ListParameter()
    fractions = luigi.ListParameter()  # train/valid/test fraction
    model_definition = luigi.Parameter()  # JSON file with model definition specs
    sigma = luigi.FloatParameter(default=0.5)
    threshold = luigi.BoolParameter(default=False)
    rest_as_other = luigi.BoolParameter(
        default=False
    )  # set the remaining as "other" - not recommended for small keep_category lengths
    whiten = luigi.BoolParameter(default=False)
    epsilon = luigi.FloatParameter(default=0.1)

    def requires(self):
        """
        Task depends on a trained model and a dataset archive.

        :return: Dictionary with TrainKerasModelFromDefinitionTask and SelectDatasetTask
        """
        return {
            "model": TrainKerasModelFromDefinitionTask(
                self.imgfolder,
                self.hdffolder,
                self.modelsfolder,
                self.target_size,
                self.keep_categories,
                self.fractions,
                self.model_definition,
                self.sigma,
                self.threshold,
                self.rest_as_other,
                self.whiten,
                self.epsilon,
            ),
            "dataset": SelectDatasetTask(
                self.imgfolder,
                self.hdffolder,
                self.target_size,
                self.keep_categories,
                self.fractions,
                self.sigma,
                self.threshold,
                self.rest_as_other,
                self.whiten,
                self.epsilon,
            ),
        }
class HTCondorWorkflow(law.HTCondorWorkflow):
    """
    Custom htcondor workflow with good default configs for the CERN batch system.
    """
    poll_interval = luigi.FloatParameter(
        default=0.5, significant=False,
        description="time between status polls in minutes, default: 0.5")
    max_runtime = luigi.FloatParameter(
        default=24.0, significant=False,
        description="maximum runtime in hours")
    only_missing = luigi.BoolParameter(
        default=True, significant=False,
        description="skip tasks that are considered complete")
    cmst3 = luigi.BoolParameter(
        default=False, significant=False,
        description="use the CMS T3 HTCondor quota for jobs, default: False")

    def htcondor_output_directory(self):
        return law.LocalDirectoryTarget(self.local_path(store="$HGC_STORE"))

    def htcondor_wrapper_file(self):
        return os.path.expandvars("$HGC_BASE/hgc/files/bash_wrapper.sh")

    def htcondor_bootstrap_file(self):
        return os.path.expandvars("$HGC_BASE/hgc/files/htcondor_bootstrap.sh")

    def htcondor_use_local_scheduler(self):
        return True

    def htcondor_job_config(self, config, job_num, branches):
        # render_data is rendered into all files sent with a job
        config.render_variables["hgc_base"] = os.getenv("HGC_BASE")
        # force to run on CC7, http://batchdocs.web.cern.ch/batchdocs/local/submit.html#os-choice
        config.custom_content.append(("requirements", "(OpSysAndVer =?= \"CentOS7\")"))
        # copy the entire environment
        config.custom_content.append(("getenv", "true"))
        # fix for CERN htcondor batch: pass the true PATH variable as a render variable which is
        # used in the custom wrapper file to set PATH
        config.render_variables["env_path"] = os.getenv("PATH")
        # the CERN htcondor setup requires a "log" config, but we can safely set it to /dev/null
        # if you are interested in the logs of the batch system itself, set a meaningful value here
        config.custom_content.append(("log", "/dev/null"))
        # set the maximum runtime
        config.custom_content.append(("+MaxRuntime", int(math.floor(self.max_runtime * 3600)) - 1))
        # CMS T3 group settings
        if self.cmst3:
            config.custom_content.append(("+AccountingGroup", "group_u_CMST3.all"))
        return config
class GlobalParams(luigi.Config):
    model_name = luigi.Parameter()
    project_folder = luigi.Parameter()
    timestep = luigi.FloatParameter()
    dataset_id = luigi.IntParameter()
    endtime = luigi.FloatParameter()
    nb_past_timesteps = luigi.IntParameter()
    random_seed = luigi.IntParameter()
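# luigi.Config subclasses like GlobalParams are read simply by instantiating them; values
# are resolved from the [GlobalParams] section of luigi.cfg or from the command line.
# A hypothetical consumer task, not part of the original pipeline:
import luigi

class SimulateTrajectories(luigi.Task):
    def run(self):
        params = GlobalParams()
        print(params.model_name, params.timestep, params.endtime)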
class DataStandardisation(luigi.Task):
    """
    Runs the standardised product workflow.
    """
    level1 = luigi.Parameter()
    outdir = luigi.Parameter()
    granule = luigi.OptionalParameter(default='')
    workflow = luigi.EnumParameter(enum=Workflow, default=Workflow.STANDARD)
    vertices = luigi.TupleParameter(default=(5, 5))
    method = luigi.EnumParameter(enum=Method, default=Method.SHEAR)
    pixel_quality = luigi.BoolParameter()
    land_sea_path = luigi.Parameter()
    aerosol = luigi.DictParameter(default={'user': 0.05})
    brdf = luigi.DictParameter()
    ozone_path = luigi.Parameter(significant=False)
    water_vapour = luigi.DictParameter(default={'user': 1.5}, significant=False)
    dem_path = luigi.Parameter(significant=False)
    ecmwf_path = luigi.Parameter(significant=False)
    invariant_height_fname = luigi.Parameter(significant=False)
    dsm_fname = luigi.Parameter(significant=False)
    modtran_exe = luigi.Parameter(significant=False)
    tle_path = luigi.Parameter(significant=False)
    rori = luigi.FloatParameter(default=0.52, significant=False)
    compression = luigi.EnumParameter(enum=H5CompressionFilter,
                                      default=H5CompressionFilter.LZF,
                                      significant=False)
    filter_opts = luigi.DictParameter(default=None, significant=False)
    acq_parser_hint = luigi.OptionalParameter(default='')
    buffer_distance = luigi.FloatParameter(default=8000, significant=False)
    h5_driver = luigi.OptionalParameter(default='', significant=False)
    normalized_solar_zenith = luigi.FloatParameter(default=45.0)

    def output(self):
        fmt = '{label}.wagl.h5'
        label = self.granule if self.granule else basename(self.level1)
        out_fname = fmt.format(label=label)
        return luigi.LocalTarget(pjoin(self.outdir, out_fname))

    def run(self):
        if self.workflow == Workflow.STANDARD or self.workflow == Workflow.SBT:
            ecmwf_path = self.ecmwf_path
        else:
            ecmwf_path = None

        with self.output().temporary_path() as out_fname:
            card4l(self.level1, self.granule, self.workflow, self.vertices,
                   self.method, self.pixel_quality, self.land_sea_path,
                   self.tle_path, self.aerosol, self.brdf, self.ozone_path,
                   self.water_vapour, self.dem_path, self.dsm_fname,
                   self.invariant_height_fname, self.modtran_exe, out_fname,
                   ecmwf_path, self.rori, self.buffer_distance,
                   self.compression, self.filter_opts, self.h5_driver,
                   self.acq_parser_hint, self.normalized_solar_zenith)
class GetAlignment(SlurmTask):
    min_cov = luigi.FloatParameter(default=0.8)
    min_indvs = luigi.FloatParameter(default=0.8)

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Set the SLURM request params for this task
        self.mem = 4000
        self.n_cpu = 1
        self.partition = "nbi-short"

    def output(self):
        return {
            'phy': LocalTarget(
                os.path.join(self.base_dir, VERSION, PIPELINE, self.output_prefix,
                             self.output_prefix + ".phy")),
        }
        # 'nex': LocalTarget(os.path.join(self.base_dir, VERSION, PIPELINE, self.output_prefix, self.output_prefix + ".nex"))}

    def work(self):
        import Bio
        import Bio.SeqIO
        import Bio.AlignIO
        import contextlib
        import numpy as np

        with contextlib.ExitStack() as stack, self.output()['phy'].open('w') as fphy:  # , self.output()['nex'].open('w') as fnex:
            fhs = [stack.enter_context(open(fname.path))
                   for fname in self.input()['iupac-codes']]
            parsers = zip(*[Bio.SeqIO.parse(f, 'fasta') for f in fhs])
            msa = [Bio.SeqRecord.SeqRecord(Bio.Seq.Seq(''), id=lib) for lib in self.lib_list]
            for seqs in parsers:
                id, l = seqs[0].id, len(seqs[0])
                assert all([x.id == id for x in seqs]), "Fasta sequences not sorted!"
                coverage = 1 - np.array([x.seq.count('N') for x in seqs]) / l
                indvs = np.mean(coverage > self.min_cov)
                if indvs > self.min_indvs:
                    for (i, x) in enumerate(seqs):
                        # 3rd codon
                        msa[i] += x.seq[::3]
            Bio.AlignIO.write(Bio.Align.MultipleSeqAlignment(msa), fphy, 'phylip-relaxed')
class Transmission(luigi.Task):
    id = luigi.IntParameter()
    name = luigi.Parameter()
    wavelength = luigi.FloatParameter()
    dz = luigi.FloatParameter()
    n_sipms = luigi.IntParameter()

    def requires(self):
        return self.clone(Run)

    def run(self):
        df = {}
        # Open output file.
        f = TFile(self.input()[1].fn)
        # Update cached wavelength.
        gps = f.Get("generalParticleSourceMessenger")
        gps.GetEntry(0)
        self.wavelength = scipy.constants.value("Planck constant in eV s") * scipy.constants.c / (gps.energyEMin * 1e-3)
        df["wavelength"] = [self.wavelength]
        # Get number of particles.
        n = gps.nParticles
        df["n"] = [gps.nParticles]
        # Update cached dz.
        lensModel = f.Get("fresnelLensModel")
        lensModel.GetEntry(0)
        self.dz = lensModel.dz
        z = lensModel.dz + lensModel.f
        df["dz"] = [self.dz]
        df["z"] = [z]
        # Get transmission after lens.
        lens = f.Get("fresnelLensBackHits")
        df["t_after_lens"] = u.ufloat(lens.GetEntries(), np.sqrt(lens.GetEntries())) / n
        # Get transmission at focal plane.
        focal = f.Get("focalPlaneHits")
        df["t_focal_plane"] = u.ufloat(focal.GetEntries(), np.sqrt(focal.GetEntries())) / n
        # Get transmission after Winston cones.
        wico = f.Get("wicoBackHits")
        df["t_after_wico"] = u.ufloat(wico.GetEntries(), np.sqrt(wico.GetEntries())) / n
        # Get detected photons for each SiPM.
        n_sipm = 0.0
        for sipm_id in xrange(self.n_sipms):
            sipm = f.Get("g4sipmHits-%d" % sipm_id)
            if sipm != None:
                n_sipm += sipm.GetEntries()
                if sipm_id > 0:
                    continue
                df["t_sipm_%d" % sipm_id] = u.ufloat(sipm.GetEntries(), np.sqrt(sipm.GetEntries())) / n
        df["t_sipm"] = u.ufloat(n_sipm, np.sqrt(n_sipm)) / n
        f.Close()
        # Save to pandas dataframe
        df = pd.DataFrame(df)
        df.to_pickle(self.output().fn)

    def output(self):
        return luigi.LocalTarget("./results/%s/famous-%d.pkl" % (self.name, self.id))
class SurfaceReflectance(luigi.Task):
    """Run the terrain correction over a given band."""

    band_name = luigi.Parameter()
    rori = luigi.FloatParameter(default=0.52, significant=False)
    base_dir = luigi.Parameter(default='_standardised', significant=False)
    dsm_fname = luigi.Parameter(significant=False)
    buffer_distance = luigi.FloatParameter(default=8000, significant=False)

    def requires(self):
        reqs = {
            'interpolation': self.clone(InterpolateCoefficients),
            'ancillary': self.clone(AncillaryData),
            'rel_slope': self.clone(RelativeAzimuthSlope),
            'shadow': self.clone(CalculateShadowMasks),
            'slp_asp': self.clone(SlopeAndAspect),
            'incident': self.clone(IncidentAngles),
            'exiting': self.clone(ExitingAngles),
            'sat_sol': self.clone(CalculateSatelliteAndSolarGrids)
        }
        return reqs

    def output(self):
        out_path = pjoin(self.work_root, self.group, self.base_dir)
        fname = 'reflectance-{}.h5'.format(self.band_name)
        return luigi.LocalTarget(pjoin(out_path, fname))

    def run(self):
        container = acquisitions(self.level1, self.acq_parser_hint)
        acqs = container.get_acquisitions(self.group, self.granule)

        # inputs
        inputs = self.input()
        interpolation_fname = inputs['interpolation'].path
        slp_asp_fname = inputs['slp_asp'].path
        incident_fname = inputs['incident'].path
        exiting_fname = inputs['exiting'].path
        relative_slope_fname = inputs['rel_slope'].path
        shadow_fname = inputs['shadow'].path
        sat_sol_fname = inputs['sat_sol'].path
        ancillary_fname = inputs['ancillary'].path

        # get the acquisition we wish to process
        acq = [acq for acq in acqs if acq.band_name == self.band_name][0]

        with self.output().temporary_path() as out_fname:
            _calculate_reflectance(acq, acqs, interpolation_fname, sat_sol_fname,
                                   slp_asp_fname, relative_slope_fname, incident_fname,
                                   exiting_fname, shadow_fname, ancillary_fname,
                                   self.rori, out_fname, self.compression,
                                   self.filter_opts)
class MergeImgByBounds(luigi.Task):
    """
    Schedule Download Tasks
    """
    baseUrl = luigi.Parameter()
    baseName = luigi.Parameter(default="output")
    west = luigi.FloatParameter()
    north = luigi.FloatParameter()
    south = luigi.FloatParameter()
    east = luigi.FloatParameter()
    zoom = luigi.IntParameter()

    def __init__(self, *args, **kwargs):
        super(MergeImgByBounds, self).__init__(*args, **kwargs)
        edge_nw_x, edge_nw_y, _, _ = deg_to_num(self.north, self.west, self.zoom)
        edge_se_x, edge_se_y, _, _ = deg_to_num(self.south, self.east, self.zoom)
        self.edge_nw_x = edge_nw_x
        self.edge_nw_y = edge_nw_y
        self.edge_se_x = edge_se_x
        self.edge_se_y = edge_se_y
        require_list = []
        x_range = range(edge_nw_x - 1, edge_se_x + 1)
        y_range = range(edge_nw_y - 1, edge_se_y + 1)
        for x, tile_x in enumerate(x_range):
            for y, tile_y in enumerate(y_range):
                require_list.append(
                    (x, y, DownloadTile(self.baseUrl, self.baseName, tile_x, tile_y, self.zoom)))
        self.require_list = require_list

    def requires(self):
        """
        scheduling tasks
        """
        return [x[2] for x in self.require_list]

    def output(self):
        return luigi.LocalTarget("./var/combined_z{}_x{}_{}_y{}_{}.png".format(
            self.zoom, self.edge_nw_x, self.edge_se_x, self.edge_nw_y, self.edge_se_y))

    def run(self):
        combined_tile = Image.new(
            'RGBA',
            (256 * (self.edge_se_x - self.edge_nw_x + 2),
             256 * (self.edge_se_y - self.edge_nw_y + 2)),
            (255, 255, 255, 255))
        for x, y, inputimg in self.require_list:
            input_img = Image.open(inputimg.output().fn)
            combined_tile.paste(input_img, (256 * x, 256 * y))
        with self.output().open("wb") as output_f:
            combined_tile.save(output_f, 'PNG')
class RunFmask(luigi.Task):
    """
    Execute the Fmask algorithm for a given granule.
    """
    level1 = luigi.Parameter()
    granule = luigi.Parameter()
    workdir = luigi.Parameter()
    cloud_buffer_distance = luigi.FloatParameter(default=150.0)
    cloud_shadow_buffer_distance = luigi.FloatParameter(default=300.0)
    parallax_test = luigi.BoolParameter()
    upstream_settings = luigi.DictParameter(default={})
    acq_parser_hint = luigi.OptionalParameter(default="")

    def requires(self):
        # for the time being have fmask require wagl,
        # no point in running fmask if wagl fails...
        # return WorkDir(self.level1, dirname(self.workdir))
        return DataStandardisation(
            self.level1,
            self.workdir,
            self.granule,
            **self.upstream_settings,  # pylint: disable=not-a-mapping
        )

    def output(self):
        out_fname1 = pjoin(self.workdir, "{}.fmask.img".format(self.granule))
        out_fname2 = pjoin(self.workdir, "{}.fmask.yaml".format(self.granule))

        out_fnames = {
            "image": luigi.LocalTarget(out_fname1),
            "metadata": luigi.LocalTarget(out_fname2),
        }

        return out_fnames

    def run(self):
        out_fnames = self.output()

        with out_fnames["image"].temporary_path() as out_fname1:
            with out_fnames["metadata"].temporary_path() as out_fname2:
                fmask(
                    self.level1,
                    self.granule,
                    out_fname1,
                    out_fname2,
                    self.workdir,
                    self.acq_parser_hint,
                    self.cloud_buffer_distance,
                    self.cloud_shadow_buffer_distance,
                    self.parallax_test,
                )
class TrainDevTestSplits(sciluigi.Task):
    train = luigi.FloatParameter(default=0.87)
    dev = luigi.FloatParameter(default=0.003)
    analysis = luigi.FloatParameter(default=0.127)

    in_processed = None

    def out_splits(self):
        return [
            TargetInfo(self, 'data/translate/splits/train.source'),
            TargetInfo(self, 'data/translate/splits/train.target'),
            TargetInfo(self, 'data/translate/splits/dev.source'),
            TargetInfo(self, 'data/translate/splits/dev.target'),
            TargetInfo(self, 'data/translate/splits/analysis.source'),
            TargetInfo(self, 'data/translate/splits/analysis.target')
        ]

    def run(self):
        self.ex('mkdir -p data/translate/splits')
        self.ex('rm data/translate/splits/* || true')
        assert self.train + self.dev + self.analysis == 1.
        lines = sum(1 for line in open(self.in_processed[0].path))
        split_counts = [
            0,
            int(lines * self.train),
            int(lines * self.dev),
            int(lines * self.analysis)
        ]
        split_idx = list(accumulate(split_counts))
        logging.info('Rough counts of train/dev/analysis sizes: \n\t%s\n\t%s' %
                     (str(split_counts), str(split_idx)))
        tup = tuple(open(out_file.path, 'wt') for out_file in self.out_splits())
        train_src, train_trg, dev_src, dev_trg, analysis_src, analysis_trg = tup
        for i, (src, trg) in enumerate(zip(open(self.in_processed[0].path),
                                           open(self.in_processed[1].path))):
            if split_idx[1] > i:
                train_src.write(src)
                train_trg.write(trg)
            elif split_idx[2] > i:
                dev_src.write(src)
                dev_trg.write(trg)
            else:
                analysis_src.write(src)
                analysis_trg.write(trg)
        [i.close() for i in tup]
        call('wc -l data/translate/splits/*', shell=True)
class ClusterL1C2(luigi.Task):
    """
    Level 1 clustering
    """
    tissue = luigi.Parameter()
    a = luigi.FloatParameter(default=1)
    b = luigi.FloatParameter(default=10)
    c = luigi.FloatParameter(default=1)
    d = luigi.FloatParameter(default=10)
    n_factors = luigi.IntParameter(default=100)
    k_smoothing = luigi.IntParameter(default=100)
    k = luigi.IntParameter(default=10)
    log = luigi.BoolParameter(default=True)
    normalize = luigi.BoolParameter(default=True)
    accel = luigi.BoolParameter(default=False)

    def requires(self) -> luigi.Task:
        return am.PrepareTissuePool(tissue=self.tissue)

    def output(self) -> luigi.Target:
        return luigi.LocalTarget(
            os.path.join(
                am.paths().build,
                f"L1_{self.tissue}_nfactors={self.n_factors}_k={self.k}_ksmoothing={self.k_smoothing}_a={self.a}_b={self.b}_c={self.c}_d={self.d}_log={self.log}_normalize={self.normalize}_accel={self.accel}.loom"
            ))

    def run(self) -> None:
        logging = cg.logging(self)
        with self.output().temporary_path() as out_file:
            with loompy.connect(self.input().fn, mode="r") as ds:
                logging.info("Collecting valid cells")
                for (ix, selection, view) in ds.scan(
                        items=np.where(ds.col_attrs["_Valid"] == 1)[0],
                        axis=1, key="Accession"):
                    loompy.create_append(out_file, view.layers, view.ra, view.ca)
            with loompy.connect(out_file) as ds:
                logging.info(f"Found {ds.shape[1]} valid cells")
                logging.info("Learning the manifold")
                cg.Cytograph2(accel=self.accel, log=self.log, normalize=self.normalize,
                              a=self.a, b=self.b, c=self.c, d=self.d, k=self.k,
                              k_smoothing=self.k_smoothing, n_factors=self.n_factors,
                              max_iter=200).fit(ds)
class GlobalParams(luigi.Config):
    model_name = luigi.Parameter()
    project_folder = luigi.Parameter()
    timestep = luigi.FloatParameter()
    dataset_id = luigi.IntParameter()
    endtime = luigi.FloatParameter()
    nb_past_timesteps = luigi.IntParameter()
    params_to_randomize = luigi.Parameter()
    nb_randomized_params = luigi.IntParameter()
    random_seed = luigi.IntParameter()

    # GenerateDataset
    nb_settings = luigi.IntParameter()
    nb_trajectories = luigi.IntParameter()

    # FormatDataset
    positivity = luigi.Parameter()
    test_fraction = luigi.FloatParameter()
    save_format = luigi.Parameter()

    # GenerateHistogramData
    nb_histogram_settings = luigi.IntParameter()
    nb_histogram_trajectories = luigi.IntParameter()
    histogram_endtime = luigi.FloatParameter()

    # Train
    model_id = luigi.IntParameter()
    nb_features = luigi.IntParameter()
    body_config_path = luigi.Parameter()
    mixture_config_path = luigi.Parameter()
    batch_size = luigi.IntParameter(default=256)
    add_noise = luigi.Parameter(default='false')
    stddev = luigi.FloatParameter(default=0.01)

    # TrainStatic
    n_epochs = luigi.IntParameter(default=100)

    # TrainSearch
    n_epochs_main = luigi.IntParameter(default=100)
    n_epochs_heat_up = luigi.IntParameter(default=20)
    n_epochs_arch = luigi.IntParameter(default=5)
    n_epochs_interval = luigi.IntParameter(default=5)
    n_epochs_finetune = luigi.IntParameter(default=30)

    # Evaluate
    distance_kind = luigi.Parameter(default='iou')
    target_species_names = luigi.Parameter(default='')
    time_lag_range = luigi.Parameter(default='10')
    settings_idxs_to_save_histograms = luigi.Parameter(default='0')