class SeqrMTToESTask(HailElasticSearchTask):
    """Export the seqr matrix table written at ``dest_path`` to Elasticsearch.

    Depends on ``SeqrVCFToMTTask`` (built with the same parameters), exports
    the rows produced by ``SeqrVariantsAndGenotypesSchema.elasticsearch_row``
    and finally writes an ``_EXPORTED_TO_ES`` marker file under ``dest_path``.
    The task counts as complete only once that marker exists.
    """
    source_paths = luigi.Parameter(default="[]", description='Path or list of paths of VCFs to be loaded.')
    dest_path = luigi.Parameter(description='Path to write the matrix table.')
    genome_version = luigi.Parameter(description='Reference Genome Version (37 or 38)')
    vep_runner = luigi.ChoiceParameter(choices=['VEP', 'DUMMY'], default='VEP',
                                       description='Choice of which vep runner to annotate vep.')
    reference_ht_path = luigi.Parameter(default=None,
                                        description='Path to the Hail table storing the reference variants.')
    clinvar_ht_path = luigi.Parameter(default=None,
                                      description='Path to the Hail table storing the clinvar variants.')
    hgmd_ht_path = luigi.Parameter(default=None,
                                   description='Path to the Hail table storing the hgmd variants.')
    sample_type = luigi.ChoiceParameter(default="WES", choices=['WGS', 'WES'],
                                        description='Sample type, WGS or WES')
    dont_validate = luigi.BoolParameter(description='Disable checking whether the dataset matches the specified '
                                                    'genome version and WGS vs. WES sample type.')
    dataset_type = luigi.ChoiceParameter(choices=['VARIANTS', 'SV'], default='VARIANTS',
                                         description='VARIANTS or SV.')
    remap_path = luigi.OptionalParameter(default=None,
                                         description="Path to a tsv file with two columns: s and seqr_id.")
    subset_path = luigi.OptionalParameter(default=None,
                                          description="Path to a tsv file with one column of sample IDs: s.")
    vep_config_json_path = luigi.OptionalParameter(default=None,
                                                   description="Path of hail vep config .json file")

    def __init__(self, *args, **kwargs):
        # TODO: instead of hardcoded index, generate from project_guid, etc.
        # NOTE(review): ``self.dest_path`` is read before ``super().__init__``
        # has run, so it is presumably still the class-level Parameter
        # declaration rather than the resolved value -- TODO confirm intent.
        kwargs['source_path'] = self.dest_path
        super().__init__(*args, **kwargs)

        self.completed_marker_path = os.path.join(self.dest_path, '_EXPORTED_TO_ES')

    def requires(self):
        """Return the single upstream ``SeqrVCFToMTTask`` sharing this task's parameters."""
        upstream_params = dict(
            source_paths=self.source_paths,
            dest_path=self.dest_path,
            genome_version=self.genome_version,
            vep_runner=self.vep_runner,
            reference_ht_path=self.reference_ht_path,
            clinvar_ht_path=self.clinvar_ht_path,
            hgmd_ht_path=self.hgmd_ht_path,
            sample_type=self.sample_type,
            dont_validate=self.dont_validate,
            dataset_type=self.dataset_type,
            remap_path=self.remap_path,
            subset_path=self.subset_path,
            vep_config_json_path=self.vep_config_json_path,
        )
        return [SeqrVCFToMTTask(**upstream_params)]

    def output(self):
        """Return the target for the ``_EXPORTED_TO_ES`` marker file."""
        # TODO: Use https://luigi.readthedocs.io/en/stable/api/luigi.contrib.esindex.html.
        return GCSorLocalTarget(filename=self.completed_marker_path)

    def complete(self):
        """Return True when the ``_EXPORTED_TO_ES`` marker file exists."""
        # Luigi calls complete() to decide whether the task can be skipped.
        # The marker file is checked explicitly so that an export which was
        # terminated halfway is not mistaken for a finished one.
        marker = GCSorLocalTarget(filename=self.completed_marker_path)
        return marker.exists()

    def run(self):
        """Export the matrix table rows to Elasticsearch, then write the marker."""
        mt = self.import_mt()
        es_rows = SeqrVariantsAndGenotypesSchema.elasticsearch_row(mt)
        num_shards = self._mt_num_shards(mt)
        self.export_table_to_elasticsearch(es_rows, num_shards)

        # The marker is written only after the export call has returned.
        with hl.hadoop_open(self.completed_marker_path, "w") as marker_file:
            marker_file.write(".")

        self.cleanup()
class CopyFiles(luigi.Task):
    """Copy files from ``src`` to ``dst`` by invoking saisoku's ``ThreadedCopy``.

    Every task parameter is forwarded to ``ThreadedCopy`` as the keyword
    argument of the same name. The task defines neither ``output()`` nor
    ``requires()``.
    """
    src = luigi.Parameter()
    dst = luigi.Parameter()
    threads = luigi.IntParameter(default=16)
    filelist = luigi.OptionalParameter(default=None)
    symlinks = luigi.BoolParameter(default=False)
    ignore = luigi.OptionalParameter(default=None)
    copymeta = luigi.BoolParameter(default=True)

    def run(self):
        """Import saisoku lazily and start the copy."""
        from saisoku import ThreadedCopy

        option_names = ('src', 'dst', 'threads', 'filelist',
                        'symlinks', 'ignore', 'copymeta')
        copy_options = {name: getattr(self, name) for name in option_names}
        ThreadedCopy(**copy_options)
class GetMarcapCodes(BaseTask):
    """Store the distinct ``code`` values of the ``marcap`` collection.

    Parameters:
        markets: JSON-encoded list of market names; empty means "no filter".
        year: year prefix for the ``date`` field; empty means "no filter".
        month: month, zero-padded to two digits and appended to ``year``.
            Only used when ``year`` is given.
    """
    markets = luigi.OptionalParameter("")
    year = luigi.OptionalParameter("")
    month = luigi.OptionalParameter("")

    @staticmethod
    def _asText(value) -> str:
        """Return *value* as a string, mapping ``None`` to the empty string."""
        return "" if value is None else str(value)

    def makeQuery(self) -> Dict:
        """Build the MongoDB filter from the task parameters.

        Returns:
            A dict with an ``$or`` clause over the markets (when any are
            given) and a case-insensitive ``$regex`` on ``date`` anchored at
            ``year`` + zero-padded ``month`` (when a year is given).
        """
        query = {}

        # An empty/None ``markets`` (the default) is not valid JSON, so it
        # must not be handed to json.loads.
        markets = json.loads(self.markets) if self.markets else []
        if markets:
            query["$or"] = [{"market": market} for market in markets]

        # Callers may pass ints for year/month; normalise before testing
        # emptiness so that ``len()`` is never applied to a non-string.
        year = self._asText(self.year)
        if year:
            month = self._asText(self.month)
            if month:
                month = month.zfill(2)
            query["date"] = {"$regex": f"^{year}{month}", "$options": "i"}

        print(query)
        return query

    def run(self) -> Generator:
        """Query the distinct codes and write them to ``makeDirs()``'s path as HDF."""
        path = self.makeDirs()
        target = yield MongoGetCollectionTask(index="stock", collection="marcap")
        collection = target.get_collection()
        cursor = collection.distinct("code", self.makeQuery())
        df = pd.Series(list(cursor))
        df.to_hdf(path, key='df', mode='w')
        print(df)

    def makePath(self) -> str:
        """Return the output path; ``markets`` is folded in as an MD5 hex digest."""
        result = hashlib.md5(f'{self.markets}'.encode())
        return f'data/marcap/codes/GetMarcapCodes-{self.year}-{self.month}-{result.hexdigest()}'
class GetStockMonthTask(BaseTask):
    """Dump one month of ``marcap`` documents for a single market to HDF.

    Documents are selected when their ``date`` matches (case-insensitively)
    the prefix ``year`` + two-digit ``month`` and their ``market`` equals the
    ``market`` parameter.
    """
    year = luigi.OptionalParameter("")
    month = luigi.OptionalParameter("")
    market = luigi.OptionalParameter("")

    def run(self) -> Generator:
        """Fetch the matching documents and write them as a DataFrame."""
        path = self.makeDirs()
        target = yield MongoGetCollectionTask(index="stock", collection="marcap")
        collection = target.get_collection()

        date_prefix = f"^{self.year}{str(self.month).zfill(2)}"
        date_clause = {"date": {"$regex": date_prefix, "$options": "i"}}
        market_clause = {"market": self.market}
        documents = list(collection.find({"$and": [date_clause, market_clause]}))

        df = pd.DataFrame(documents)
        df.to_hdf(path, key='df', mode='w')
        print(df)

    def makePath(self) -> str:
        """Return the output path derived from market, year and month."""
        return f'data/stock/month/stock-marcap-{self.market}-{self.year}-{self.month}'
class BatchTask(luigi.Task):
    """
    Base class for tasks that run as an Amazon Batch job.

    Amazon Batch works from registered "job definitions": JSON descriptions
    of how to issue the ``docker run`` command. This task expects the name of
    an already-registered jobDefinition as a Parameter.

    :param job_definition (str): name of pre-registered jobDefinition
    :param job_name: name of specific job, for tracking in the queue and logs.
    :param job_queue: name of job queue where job is going to be submitted.
    :param poll_time: value handed to ``BatchClient`` on construction.
    """
    job_definition = luigi.Parameter()
    job_name = luigi.OptionalParameter(default=None)
    job_queue = luigi.OptionalParameter(default=None)
    poll_time = luigi.IntParameter(default=POLL_TIME)

    def run(self):
        """Submit the job through a ``BatchClient`` and wait on it."""
        client = BatchClient(self.poll_time)
        submitted_job_id = client.submit_job(
            self.job_definition,
            self.parameters,
            job_name=self.job_name,
            queue=self.job_queue,
        )
        client.wait_on_job(submitted_job_id)

    @property
    def parameters(self):
        """Override to return a dict of parameters for the Batch Task"""
        return {}
class CalculateLonLatGrids(luigi.Task):
    """Compute the longitude and latitude grids for one group of a granule.

    The result is written to ``<work_root>/<group>/longitude-latitude.h5``.
    """

    level1 = luigi.Parameter()
    work_root = luigi.Parameter(significant=False)
    granule = luigi.OptionalParameter(default="")
    group = luigi.Parameter()
    acq_parser_hint = luigi.OptionalParameter(default="")
    compression = luigi.EnumParameter(
        enum=H5CompressionFilter, default=H5CompressionFilter.LZF, significant=False
    )
    filter_opts = luigi.DictParameter(default=None, significant=False)
    # Declared here but not read by this task's run().
    buffer_distance = luigi.FloatParameter(default=8000, significant=False)

    def requires(self):
        """Require the ``WorkRoot`` task for the parent of ``work_root``."""
        # WorkRoot must receive the level1 root, not the granule root.
        level1_root = dirname(self.work_root)
        return WorkRoot(self.level1, level1_root)

    def output(self):
        """Return the local target for ``longitude-latitude.h5``."""
        return luigi.LocalTarget(
            pjoin(self.work_root, self.group, "longitude-latitude.h5")
        )

    def run(self):
        """Create the grids from the first acquisition of the group."""
        container = acquisitions(self.level1, self.acq_parser_hint)
        group_acquisitions = container.get_acquisitions(self.group, self.granule)
        first_acq = group_acquisitions[0]

        with self.output().temporary_path() as tmp_fname:
            _create_lon_lat_grids(
                first_acq, tmp_fname, self.compression, self.filter_opts
            )
class CutadaptTask(RemoveTaskOutputOnFailureMixin, ScheduledExternalProgramTask):
    """
    Common base for tasks that shell out to cutadapt.

    ``program_args`` assembles the options shared by all cutadapt
    invocations; subclasses append their own arguments.
    """
    task_namespace = 'cutadapt'

    adapter_3prime = luigi.OptionalParameter(default='', positional=False)
    adapter_5prime = luigi.OptionalParameter(default='', positional=False)

    cut = luigi.IntParameter(default=0, positional=False)
    trim_n = luigi.BoolParameter(default=False, positional=False)
    minimum_length = luigi.IntParameter(default=0, positional=False)

    def program_args(self):
        """Return the cutadapt command line prefix as a list.

        Each option is emitted only when its parameter is truthy; ``-j`` is
        always emitted with ``self.cpus``.
        """
        args = [cfg.cutadapt_bin, '-j', self.cpus]

        # Options that take a value, in the order they appear on the command line.
        valued_options = (
            ('-a', self.adapter_3prime),
            ('-g', self.adapter_5prime),
            ('-u', self.cut),
        )
        for flag, value in valued_options:
            if value:
                args += [flag, value]

        if self.trim_n:
            args.append('--trim-n')

        if self.minimum_length:
            args += ['--minimum-length', self.minimum_length]

        return args
class ExternalDatabaseCredentialsParameter(ExternalDatabaseHostParameter, ExternalDatabaseXMLRPCParameter):
    """Credential parameters for an external Exasol database.

    Adds the database user, database password and BucketFS write password on
    top of the parameters inherited from the two base classes.
    """
    external_exasol_db_user = luigi.OptionalParameter()
    # Both passwords are non-significant and declared with HIDDEN visibility.
    external_exasol_db_password = luigi.OptionalParameter(
        significant=False, visibility=ParameterVisibility.HIDDEN)
    external_exasol_bucketfs_write_password = luigi.OptionalParameter(
        significant=False, visibility=ParameterVisibility.HIDDEN)
class ExternalDatabaseXMLRPCParameter(Config):
    """XML-RPC connection parameters for an external Exasol database."""
    external_exasol_xmlrpc_host = luigi.OptionalParameter()
    # Port defaults to 443.
    external_exasol_xmlrpc_port = luigi.IntParameter(443)
    external_exasol_xmlrpc_user = luigi.OptionalParameter()
    external_exasol_xmlrpc_cluster_name = luigi.OptionalParameter()
    # The password is non-significant and declared with HIDDEN visibility.
    external_exasol_xmlrpc_password = luigi.OptionalParameter(
        significant=False, visibility=ParameterVisibility.HIDDEN)
class Annotate(BcftoolsTask):
    """
    Run ``bcftools annotate`` on a VCF and write the result to ``output_file``.
    """
    output_file = luigi.Parameter()
    output_format = luigi.Parameter(positional=False, default='z')

    # options given an annotation file
    annotations_file = luigi.OptionalParameter(positional=False, default=None)
    columns = luigi.ListParameter(positional=False, default=[])

    rename_chrs = luigi.OptionalParameter(positional=False, default=None)

    def subcommand_args(self):
        """Return the ``annotate`` sub-command and its options as a list."""
        args = ['annotate']

        if self.rename_chrs is not None:
            args += ['--rename-chrs', self.rename_chrs]

        if self.annotations_file:
            # The column list accompanies the annotation file.
            column_spec = ','.join(self.columns)
            args += ['-a', self.annotations_file, '-c', column_spec]

        args += ['--output-type', self.output_format]
        args += ['--output', self.output_file]
        return args

    def output(self):
        """Return the local target for ``output_file``."""
        return luigi.LocalTarget(self.output_file)
class SeqrVCFToGenotypesMTTask(HailMatrixTableTask):
    """Derive the genotypes matrix table from the variant matrix table.

    Reads the output of ``SeqrVCFToVariantMTTask``, optionally remaps and
    subsets the samples, applies ``SeqrGenotypesSchema`` and writes the
    result to this task's output path.
    """
    remap_path = luigi.OptionalParameter(
        default=None,
        description="Path to a tsv file with two columns: s and seqr_id.")
    subset_path = luigi.OptionalParameter(
        default=None,
        description="Path to a tsv file with one column of sample IDs: s.")

    def requires(self):
        """Return the upstream variant matrix table task."""
        return [SeqrVCFToVariantMTTask()]

    def run(self):
        """Read, remap/subset, annotate and write the matrix table."""
        # Hack that fixes something in Hail. TODO: Remove when Hail fix comes.
        hl._set_flags(newaggs=None)

        variant_mt_path = self.input()[0].path
        mt = hl.read_matrix_table(variant_mt_path)

        # Both steps are optional and only run when a path was supplied.
        if self.remap_path:
            mt = self.remap_sample_ids(mt, self.remap_path)
        if self.subset_path:
            mt = self.subset_samples_and_variants(mt, self.subset_path)

        schema = SeqrGenotypesSchema(mt)
        mt = schema.annotate_all(overwrite=True).select_annotated_mt()

        mt.describe()
        mt.write(self.output().path, stage_locally=True, overwrite=True)
class TrimPairedReads(CutadaptTask):
    """Trim a pair of read files with cutadapt.

    Extends the base arguments with the reverse-read adapters (``-A``/``-G``),
    the two output files (``-o``/``-p``) and the two input files.
    """
    input_file = luigi.Parameter()
    input2_file = luigi.Parameter()
    output_file = luigi.Parameter()
    output2_file = luigi.Parameter()

    reverse_adapter_3prime = luigi.OptionalParameter(default='', positional=False)
    reverse_adapter_5prime = luigi.OptionalParameter(default='', positional=False)

    def program_args(self):
        """Return the full cutadapt command line for paired reads."""
        args = super(TrimPairedReads, self).program_args()

        # Reverse-read adapters are added only when set.
        reverse_adapters = (
            ('-A', self.reverse_adapter_3prime),
            ('-G', self.reverse_adapter_5prime),
        )
        for flag, adapter in reverse_adapters:
            if adapter:
                args += [flag, adapter]

        args += ['-o', self.output_file, '-p', self.output2_file]
        args += [self.input_file, self.input2_file]
        return args

    def output(self):
        """Return local targets for both trimmed output files."""
        return [luigi.LocalTarget(path)
                for path in (self.output_file, self.output2_file)]
class GetStockRangeTask(BaseTask):
    """Dump ``marcap`` documents of one market within a date range to HDF.

    Documents are selected when ``startDate <= date <= endDate`` (compared by
    MongoDB with ``$gte``/``$lte``) and ``market`` equals the ``market``
    parameter.
    """
    startDate = luigi.OptionalParameter("")
    endDate = luigi.OptionalParameter("")
    market = luigi.OptionalParameter("")

    def run(self) -> Generator:
        """Fetch the matching documents and write them as a DataFrame."""
        path = self.makeDirs()
        target = yield MongoGetCollectionTask(index="stock", collection="marcap")
        collection = target.get_collection()

        date_clause = {"date": {"$gte": self.startDate, "$lte": self.endDate}}
        market_clause = {"market": self.market}
        documents = list(collection.find({"$and": [date_clause, market_clause]}))

        df = pd.DataFrame(documents)
        df.to_hdf(path, key='df', mode='w')
        print(df)

    def makePath(self) -> str:
        """Return the output path derived from market and the two dates."""
        return f'data/stock/range/stock-marcap-{self.market}-{self.startDate}-{self.endDate}'
class LinkwaglOutputs(luigi.Task):
    """
    Link all the multifile outputs from wagl into a single file.

    Every ``.h5`` file found under ``work_root`` (skipping directories whose
    name starts with an underscore) has each of its top-level entries
    exposed in the output file through ``create_external_link``. The output
    is written to ``<dirname(work_root)>/<granule>.h5``.
    """

    level1 = luigi.Parameter()
    work_root = luigi.Parameter()
    granule = luigi.OptionalParameter(default="")
    acq_parser_hint = luigi.OptionalParameter(default="")
    workflow = luigi.EnumParameter(enum=Workflow)
    vertices = luigi.TupleParameter(default=(5, 5))
    pixel_quality = luigi.BoolParameter()
    method = luigi.EnumParameter(enum=Method, default=Method.SHEAR)
    dsm_fname = luigi.Parameter(significant=False)
    buffer_distance = luigi.FloatParameter(default=8000, significant=False)

    def requires(self):
        """Yield one ``DataStandardisation`` task per supported group."""
        container = acquisitions(self.level1, self.acq_parser_hint)
        for group in container.supported_groups:
            kwargs = {
                "level1": self.level1,
                "work_root": self.work_root,
                "granule": self.granule,
                "group": group,
                "workflow": self.workflow,
                "vertices": self.vertices,
                "pixel_quality": self.pixel_quality,
                "method": self.method,
                "dsm_fname": self.dsm_fname,
                "buffer_distance": self.buffer_distance,
            }
            yield DataStandardisation(**kwargs)

    def output(self):
        """Return the local target ``<dirname(work_root)>/<granule>.h5``."""
        out_fname = pjoin(dirname(self.work_root), "{}.h5".format(self.granule))
        return luigi.LocalTarget(out_fname)

    def run(self):
        """Walk ``work_root`` and link every ``.h5`` file into the output."""
        with self.output().temporary_path() as out_fname:
            for root, _, files in os.walk(self.work_root):
                # Skip private directories. ``startswith`` is used instead of
                # indexing ``[0]`` because ``basename`` returns an empty
                # string for a path that ends with a separator, which would
                # raise IndexError.
                if basename(root).startswith("_"):
                    continue

                for file_ in files:
                    if splitext(file_)[1] != ".h5":
                        continue

                    fname = pjoin(root, file_)
                    # Name of the directory holding the file, taken from the
                    # path with the ``work_root`` text removed.
                    grp_name = basename(dirname(fname.replace(self.work_root, "")))

                    # Collect the top-level names, then close the file before
                    # creating the links.
                    with h5py.File(fname, "r") as fid:
                        groups = [g for g in fid]

                    for pth in groups:
                        new_path = ppjoin(self.granule, grp_name, pth)
                        create_external_link(fname, pth, out_fname, new_path)

            # Record the level1 source on the linked file.
            with h5py.File(out_fname, "a") as fid:
                fid.attrs["level1_uri"] = self.level1
class AncillaryData(luigi.Task):
    """Collect all ancillary data into ``<work_root>/ancillary.h5``."""

    level1 = luigi.Parameter()
    work_root = luigi.Parameter(significant=False)
    granule = luigi.OptionalParameter(default="")
    vertices = luigi.TupleParameter()
    workflow = luigi.EnumParameter(enum=Workflow)
    acq_parser_hint = luigi.OptionalParameter(default="")
    aerosol = luigi.DictParameter({"user": 0.05}, significant=False)
    brdf = luigi.DictParameter()
    ozone_path = luigi.Parameter(significant=False)
    water_vapour = luigi.DictParameter({"user": 1.5}, significant=False)
    dem_path = luigi.Parameter(significant=False)
    ecmwf_path = luigi.Parameter(significant=False)
    invariant_height_fname = luigi.Parameter(significant=False)
    compression = luigi.EnumParameter(
        enum=H5CompressionFilter, default=H5CompressionFilter.LZF, significant=False
    )
    filter_opts = luigi.DictParameter(default=None, significant=False)

    def requires(self):
        """Require the satellite/solar grids of the first supported group."""
        container = acquisitions(self.level1, self.acq_parser_hint)
        first_group = container.supported_groups[0]
        return CalculateSatelliteAndSolarGrids(
            self.level1, self.work_root, self.granule, first_group
        )

    def output(self):
        """Return the local target for ``ancillary.h5``."""
        return luigi.LocalTarget(pjoin(self.work_root, "ancillary.h5"))

    def run(self):
        """Gather the ancillary inputs and delegate to ``_collect_ancillary``."""
        container = acquisitions(self.level1, self.acq_parser_hint)
        grn = container.get_granule(granule=self.granule, container=True)

        nbar_paths = {
            "aerosol_dict": self.aerosol,
            "water_vapour_dict": self.water_vapour,
            "ozone_path": self.ozone_path,
            "dem_path": self.dem_path,
            "brdf_dict": self.brdf,
        }

        # The ECMWF path is passed on only for the STANDARD and SBT workflows.
        if self.workflow in (Workflow.STANDARD, Workflow.SBT):
            sbt_path = self.ecmwf_path
        else:
            sbt_path = None

        with self.output().temporary_path() as tmp_fname:
            _collect_ancillary(
                grn,
                self.input().path,
                nbar_paths,
                sbt_path,
                self.invariant_height_fname,
                self.vertices,
                tmp_fname,
                self.compression,
                self.filter_opts,
            )
class BcftoolsTask(ScheduledExternalProgramTask):
    """Common base for tasks that shell out to bcftools.

    Subclasses provide the sub-command through ``subcommand_args``; this
    class appends the shared filtering options and the input file(s).
    """
    task_namespace = 'bcftools'

    input_file = luigi.Parameter()

    include = luigi.OptionalParameter(positional=False, default=None)
    exclude = luigi.OptionalParameter(positional=False, default=None)

    regions = luigi.ListParameter(default=[], positional=False)
    regions_file = luigi.OptionalParameter(positional=False, default=None)

    samples = luigi.ListParameter(default=[], positional=False)
    samples_file = luigi.OptionalParameter(default=None, positional=False)

    apply_filters = luigi.OptionalParameter(positional=False, default=None)

    # FIXME: the '--threads' flag does not seem to work

    def subcommand_args(self):
        """Return the sub-command specific arguments; subclasses must override."""
        raise NotImplementedError

    def subcommand_input_args(self):
        """
        Return the arguments placed where the input file goes.

        Exists so that commands accepting several input files can override it.
        """
        return [self.input_file]

    def program_args(self):
        """Return the complete bcftools command line as a list."""
        args = [cfg.bcftools_bin, *self.subcommand_args()]

        if self.include is not None:
            args += ['-i', self.include]

        if self.exclude is not None:
            args += ['-e', self.exclude]

        if self.regions:
            args += ['-r', ','.join(self.regions)]

        if self.regions_file is not None:
            args += ['-R', self.regions_file]

        if self.samples:
            args += ['-s', ','.join(self.samples)]

        # Unlike the other optional paths this one is tested for truthiness.
        if self.samples_file:
            args += ['-S', self.samples_file]

        if self.apply_filters is not None:
            args += ['-f', self.apply_filters]

        args += self.subcommand_input_args()
        return args
class hdfs(luigi.Config):
    """HDFS client settings; every option except ``client`` defaults to None."""
    client_version = luigi.IntParameter(default=None)
    namenode_host = luigi.OptionalParameter(default=None)
    namenode_port = luigi.IntParameter(default=None)
    # Name of the client implementation; defaults to 'hadoopcli'.
    client = luigi.Parameter(default='hadoopcli')
    # config_path points at section 'core', option 'hdfs-tmp-dir'.
    tmp_dir = luigi.OptionalParameter(
        default=None,
        config_path=dict(section='core', name='hdfs-tmp-dir'),
    )
class target_docker_repository_config(luigi.Config):
    """Settings of the target Docker repository."""
    # Repository name defaults to "exasol/script-language-container".
    repository_name = luigi.Parameter("exasol/script-language-container")
    # Tag prefix defaults to the empty string.
    tag_prefix = luigi.Parameter("")
    # Credentials default to None, are non-significant and have PRIVATE visibility.
    username = luigi.OptionalParameter(None, significant=False, visibility=ParameterVisibility.PRIVATE)
    password = luigi.OptionalParameter(None, significant=False, visibility=ParameterVisibility.PRIVATE)
class DataStandardisation(luigi.Task):
    """
    Run the standardised product workflow through ``card4l``.

    The product is written to ``<outdir>/<label>.wagl.h5`` where ``label`` is
    the granule name, or the basename of ``level1`` when no granule is given.
    """
    level1 = luigi.Parameter()
    outdir = luigi.Parameter()
    granule = luigi.OptionalParameter(default='')
    workflow = luigi.EnumParameter(enum=Workflow, default=Workflow.STANDARD)
    vertices = luigi.TupleParameter(default=(5, 5))
    method = luigi.EnumParameter(enum=Method, default=Method.SHEAR)
    pixel_quality = luigi.BoolParameter()
    land_sea_path = luigi.Parameter()
    aerosol = luigi.DictParameter(default={'user': 0.05})
    brdf = luigi.DictParameter()
    ozone_path = luigi.Parameter(significant=False)
    water_vapour = luigi.DictParameter(default={'user': 1.5},
                                       significant=False)
    dem_path = luigi.Parameter(significant=False)
    ecmwf_path = luigi.Parameter(significant=False)
    invariant_height_fname = luigi.Parameter(significant=False)
    dsm_fname = luigi.Parameter(significant=False)
    modtran_exe = luigi.Parameter(significant=False)
    tle_path = luigi.Parameter(significant=False)
    rori = luigi.FloatParameter(default=0.52, significant=False)
    compression = luigi.EnumParameter(enum=H5CompressionFilter,
                                      default=H5CompressionFilter.LZF,
                                      significant=False)
    filter_opts = luigi.DictParameter(default=None, significant=False)
    acq_parser_hint = luigi.OptionalParameter(default='')
    buffer_distance = luigi.FloatParameter(default=8000, significant=False)
    h5_driver = luigi.OptionalParameter(default='', significant=False)
    normalized_solar_zenith = luigi.FloatParameter(default=45.0)

    def output(self):
        """Return the local target ``<outdir>/<label>.wagl.h5``."""
        label = self.granule or basename(self.level1)
        out_fname = '{label}.wagl.h5'.format(label=label)
        return luigi.LocalTarget(pjoin(self.outdir, out_fname))

    def run(self):
        """Invoke ``card4l`` with the task parameters."""
        # The ECMWF path is passed on only for the STANDARD and SBT workflows.
        needs_ecmwf = self.workflow in (Workflow.STANDARD, Workflow.SBT)
        ecmwf_path = self.ecmwf_path if needs_ecmwf else None

        with self.output().temporary_path() as tmp_fname:
            card4l(
                self.level1,
                self.granule,
                self.workflow,
                self.vertices,
                self.method,
                self.pixel_quality,
                self.land_sea_path,
                self.tle_path,
                self.aerosol,
                self.brdf,
                self.ozone_path,
                self.water_vapour,
                self.dem_path,
                self.dsm_fname,
                self.invariant_height_fname,
                self.modtran_exe,
                tmp_fname,
                ecmwf_path,
                self.rori,
                self.buffer_distance,
                self.compression,
                self.filter_opts,
                self.h5_driver,
                self.acq_parser_hint,
                self.normalized_solar_zenith,
            )
class build_config(luigi.Config):
    """Build settings; all boolean switches default to False."""
    force_pull = luigi.BoolParameter(False)
    force_load = luigi.BoolParameter(False)
    force_rebuild = luigi.BoolParameter(False)
    # Defaults to an empty list.
    force_rebuild_from = luigi.ListParameter([])
    log_build_context_content = luigi.BoolParameter(False)
    #keep_build_context = luigi.BoolParameter(False)
    # Defaults to None.
    temporary_base_directory = luigi.OptionalParameter(None)
    # Defaults to ".build_output".
    output_directory = luigi.Parameter(".build_output")
    # Defaults to the empty string.
    cache_directory = luigi.OptionalParameter("")
class DatasetComponentsAnnotationMapImage(ComponentsAnnotationMapImage):
    """Component annotation map image for one scene of a dataset.

    The input is either the raw prediction map (no ``transform_type``) or the
    transformed embeddings (``transform_type`` given).
    """
    data_path = luigi.Parameter()
    step_size = luigi.Parameter()
    model_path = luigi.Parameter()
    scene_id = luigi.Parameter()
    transform_type = luigi.OptionalParameter()
    transform_extra_args = luigi.OptionalParameter()
    pretrained_transform_model = luigi.OptionalParameter(default=None)
    components = luigi.ListParameter(default=[0, 1, 2])
    crop_img = luigi.BoolParameter(default=False)

    def requires(self):
        """Return the upstream task, chosen by whether a transform is requested."""
        shared_params = dict(
            data_path=self.data_path,
            scene_id=self.scene_id,
            model_path=self.model_path,
            step_size=self.step_size,
        )
        if self.transform_type is None:
            return DatasetImagePredictionMapData(**shared_params)

        return DatasetEmbeddingTransform(
            transform_type=self.transform_type,
            transform_extra_args=self.transform_extra_args,
            pretrained_model=self.pretrained_transform_model,
            **shared_params
        )

    @property
    def input_path(self):
        """Path of the upstream target (its ``fn`` attribute)."""
        return self.input().fn

    @property
    def src_data_path(self):
        """Alias for ``data_path``."""
        return self.data_path

    def output(self):
        """Return the ``XArrayTarget`` for the components map PNG.

        The file lives under ``<data_path>/embeddings/rect/<model_name>``,
        with an extra ``pretrained_transform_model`` directory level when
        that parameter is set.
        """
        model_name = Path(self.model_path).name.replace(".pkl", "")
        components_tag = "_".join([str(v) for v in self.components])
        fn = "{}.{}_step.{}_transform.map.{}__comp.png".format(
            self.scene_id,
            self.step_size,
            self.transform_type,
            components_tag,
        )

        maps_dir = Path(self.data_path) / "embeddings" / "rect" / model_name
        if self.pretrained_transform_model is not None:
            maps_dir = maps_dir / self.pretrained_transform_model
        return XArrayTarget(str(maps_dir / "components_map" / fn))
class MongoGetCollectionTask(luigi.Task):
    """Expose a MongoDB collection as a luigi target.

    Connection settings are read from the module-level ``config`` mapping.
    """
    index = luigi.OptionalParameter("")
    collection = luigi.OptionalParameter("")

    def output(self) -> MongoCollectionTarget:
        """Open a ``MongoClient`` and wrap ``index``/``collection`` in a target."""
        credentials = f'{config["mongodbUserName"]}:{config["mongodbPassword"]}'
        address = f'{config["mongodbHost"]}:{config["mongodbPort"]}'
        # NOTE(review): user name and password go into the URI unescaped;
        # reserved characters such as '@' or ':' would presumably need
        # percent-escaping unless the config already stores them escaped --
        # confirm against the deployment config.
        uri = f'mongodb://{credentials}@{address}'
        return MongoCollectionTarget(MongoClient(uri), self.index, self.collection)
class GetStockCodeFilteringByFactorRank(BaseTask):
    """Rank stock codes by a factor value and keep the best ``limit`` of them.

    Parameters:
        date: string whose first four characters are the year and next two
            the month.
        factor: factor name forwarded to ``GetFactorYearTask``.
        markets: JSON-encoded market list forwarded to ``GetMarcapCodes``.
        targets: only contributes to the output path hash (see note in
            ``run``).
        ascending: sort/rank direction.
        limit: number of rows (``includeSame`` False) or maximum rank
            (``includeSame`` True) to keep.
        includeSame: when True, rows tied on rank are all kept.
    """
    date = luigi.OptionalParameter("")
    factor = luigi.OptionalParameter("")
    markets = luigi.OptionalParameter("")
    targets = luigi.OptionalParameter("")
    ascending = luigi.BoolParameter(True)
    limit = luigi.IntParameter(sys.maxsize)
    includeSame = luigi.BoolParameter(True)

    def run(self) -> Generator:
        """Load the candidate codes and factor values, rank, filter and store."""
        path = self.makeDirs()
        year = int(self.date[:4])
        month = int(self.date[4:6])
        limit = int(self.limit)

        # NOTE(review): the ``targets`` parameter is never applied -- the
        # candidate codes always come from GetMarcapCodes, exactly as in the
        # previous implementation, where the parsed value was overwritten.
        # Confirm whether explicit targets should take precedence.
        #
        # year/month are passed as strings because GetMarcapCodes declares
        # them as string parameters.
        marcapOutput = yield GetMarcapCodes(
            markets=self.markets, year=str(year), month=str(month))
        targets = pd.read_hdf(marcapOutput.path).to_list()

        # For months up to April the previous year's factor data is used.
        factorYear = year - 1 if month <= 4 else year
        factorTarget = yield GetFactorYearTask(year=str(factorYear), name=self.factor)

        factorDf: pd.DataFrame = pd.read_hdf(factorTarget.path)
        if factorDf.empty:
            return

        # sort_values returns a new frame, so the filtered slice of factorDf
        # is never modified in place.
        newDf = factorDf[factorDf["code"].isin(targets)].sort_values(
            by="dataValue", ascending=self.ascending)

        if not self.includeSame:
            newDf = newDf.iloc[:limit]
        else:
            # method="min" gives tied values the same (lowest) rank, so all
            # ties at the cut-off are retained.
            ranks = newDf["dataValue"].rank(method="min", ascending=self.ascending)
            newDf = newDf.assign(rank=ranks)
            newDf = newDf[newDf["rank"] <= limit]

        newDf.to_hdf(path, key='df', mode='w')
        print(newDf)
        print(path)

    def makePath(self) -> str:
        """Return the output path; markets and targets are folded into an MD5 digest."""
        result = hashlib.md5(f'{self.markets}{self.targets}'.encode())
        return f"data/simul/factor/GetStockCodeFilteringByFactorRank-{self.date}-{self.factor}-{self.ascending}-{self.limit}-{result.hexdigest()}"
class GetFactorYearTask(BaseTask):
    """Dump the ``factor`` documents matching year/month/name to HDF.

    Parameters:
        year: matched against ``dataYear`` after formatting as ``"<year>.0"``.
        month: matched against ``dataMonth``; defaults to ``"12"``.
        name: matched against ``dataName`` exactly (``exact`` True) or as a
            regular expression (``exact`` False).
        exact: see ``name``.
    """
    year = luigi.OptionalParameter("")
    month = luigi.OptionalParameter("12")
    name = luigi.OptionalParameter("")
    exact = luigi.BoolParameter(False)

    def makeAndQuery(self) -> List:
        """Return the list of clauses that ``run`` combines under ``$and``."""
        query = []
        if len(self.year) > 0:
            # dataYear is matched as the year with one decimal, e.g. "2020.0".
            query.append({"dataYear": "{:.1f}".format(int(self.year))})
        if len(self.month) > 0:
            query.append({"dataMonth": self.month})
        if len(self.name) > 0:
            if self.exact:
                query.append({"dataName": self.name})
            else:
                query.append({"dataName": {"$regex": self.name}})
        print(query)
        return query

    def run(self) -> Generator:
        """Fetch the documents, normalise column types and write the frame."""
        path = self.makeDirs()
        target = yield MongoGetCollectionTask(index="stock", collection="factor")
        collection = target.get_collection()
        cursor = collection.find({"$and": self.makeAndQuery()})
        df = pd.DataFrame(list(cursor))

        # An empty result has no columns at all; skip the conversions so the
        # (empty) frame is still written instead of raising KeyError.
        if not df.empty:
            for column in ("_id", "createdAt", "updatedAt"):
                df[column] = df[column].astype(str)
            # With errors="coerce", entries that cannot be converted to a
            # number are set to NaN.
            df["dataValue"] = pd.to_numeric(df["dataValue"], errors="coerce")
            # dropna returns a new frame; the result must be kept for the
            # NaN rows to actually be removed.
            df = df.dropna(subset=['dataValue'])

        df.to_hdf(path, key='df', mode='w')
        print(df)

    def makePath(self) -> str:
        """Return the output path, appending each non-empty of year, month, name."""
        path = 'data/factor/fnguide/year/factor-fnguide'
        for part in (str(self.year), str(self.month), str(self.name)):
            if part:
                path = path + f"-{part}"
        return path
class hdfs(luigi.Config):
    """HDFS client settings, including the snakebite-related options."""
    client_version = luigi.IntParameter(default=None)
    # The default is read from the HADOOP_USER_NAME environment variable
    # once, when this class body is executed.
    effective_user = luigi.OptionalParameter(
        default=os.getenv('HADOOP_USER_NAME'),
        description="Optionally specifies the effective user for snakebite. "
                    "If not set the environment variable HADOOP_USER_NAME is "
                    "used, else USER")
    snakebite_autoconfig = luigi.BoolParameter(default=False)
    namenode_host = luigi.OptionalParameter(default=None)
    namenode_port = luigi.IntParameter(default=None)
    # Name of the client implementation; defaults to 'hadoopcli'.
    client = luigi.Parameter(default='hadoopcli')
    # config_path points at section 'core', option 'hdfs-tmp-dir'.
    tmp_dir = luigi.OptionalParameter(
        default=None,
        config_path=dict(section='core', name='hdfs-tmp-dir'),
    )
class WorkRoot(luigi.Task):
    """
    Create the work root directory tree up front.

    The sub-directories are made here so that later tasks do not race each
    other when creating them.
    """

    level1 = luigi.Parameter()
    work_root = luigi.Parameter(significant=False)
    acq_parser_hint = luigi.OptionalParameter(default="")
    reflectance_dir = "_standardised"
    shadow_dir = "_shadow"
    interpolation_dir = "_interpolation"

    def output(self):
        """Yield one local target per (granule, group, sub-directory)."""
        sub_dirs = [self.reflectance_dir, self.shadow_dir, self.interpolation_dir]
        container = acquisitions(self.level1, self.acq_parser_hint)

        for granule in container.granules:
            for group in container.supported_groups:
                group_root = container.get_root(self.work_root, group, granule)
                yield from (
                    luigi.LocalTarget(pjoin(group_root, sub_dir))
                    for sub_dir in sub_dirs
                )

    def run(self):
        """Create every output directory on the local file system."""
        file_system = LocalFileSystem()
        for directory in self.output():
            file_system.mkdir(directory.path)
class RunFmask(luigi.Task):
    """
    Run the Fmask algorithm for one granule.

    The result is written to ``<workdir>/<granule>.fmask.img``.
    """
    level1 = luigi.Parameter()
    granule = luigi.Parameter()
    workdir = luigi.Parameter()
    acq_parser_hint = luigi.OptionalParameter(default='')

    def requires(self):
        """Require the wagl ``DataStandardisation`` task for this granule."""
        # For the time being fmask depends on wagl: there is no point in
        # running fmask if wagl fails.
        return DataStandardisation(self.level1, self.workdir, self.granule)

    def output(self):
        """Return the local target for the fmask image."""
        image_name = '{}.fmask.img'.format(self.granule)
        return luigi.LocalTarget(pjoin(self.workdir, image_name))

    def run(self):
        """Call ``fmask`` writing to the target's temporary path."""
        with self.output().temporary_path() as tmp_fname:
            fmask(
                self.level1,
                self.granule,
                tmp_fname,
                self.workdir,
                self.acq_parser_hint,
            )
class DownloadSraExperiment(DynamicTaskWithOutputMixin, DynamicWrapperTask):
    """
    Download one SRA run belonging to an SRA experiment.

    An experiment can be reprocessed in SRA and end up with several runs.
    Unless ``srr`` names a specific run, the run whose identifier sorts last
    lexicographically is used.
    """
    srr = luigi.OptionalParameter(
        default=None,
        description='Specific SRA run accession to use (defaults to latest)')

    @property
    def sample_id(self):
        """The experiment accession (``self.srx``) doubles as the sample id."""
        return self.srx

    def run(self):
        """Pick the run from the input table and yield its ``DumpSraRun`` task."""
        # this will raise an error if no FASTQs are related
        runs = pd.read_csv(self.input().path)

        if self.srr is None:
            chosen = runs.sort_values('Run', ascending=False).iloc[0]
        else:
            chosen = runs[runs.Run == self.srr].iloc[0]

        # The layout annotation is very often wrong, so pairing is decided
        # from the configured experiment list or the number of spots with
        # mates instead.
        listed_as_paired = self.sample_id in sra_cfg.paired_read_experiments
        is_paired = listed_as_paired or (chosen.spots_with_mates > 0)

        yield DumpSraRun(chosen.Run, self.srx, paired_reads=is_paired)
class Raw(luigi.Task):
    """Download a single Box file into ``save_to`` under ``file_name``."""
    __name__ = 'Raw'

    # Declared but not read by any method of this class.
    data_root = luigi.Parameter(default=os.path.expanduser('~/.emu/'))
    file_id = luigi.IntParameter(description='Box file_id')
    file_name = luigi.Parameter()
    save_to = luigi.OptionalParameter()
    overwrite = luigi.BoolParameter(default=False)

    def __repr__(self):
        """Return ``Raw(file=/.emu/<cache>/<file_name>)``.

        ``<cache>`` is the part of the output directory after ``.emu/``.
        """
        cache = self.out_dir().split('.emu/')[-1]
        return '{}(file=/.emu/{}/{})'.format(self.__name__, cache, self.file_name)

    def out_dir(self):
        """Return the directory the file is saved to (``save_to``)."""
        return self.save_to

    def _out_path(self):
        """Return the full path of the downloaded file."""
        return os.path.join(self.out_dir(), self.file_name)

    def download(self):
        """Stream the Box file identified by ``file_id`` to the output path."""
        client = jwt()
        box_file = client.file(self.file_id)
        # The ``with`` block closes the handle; no explicit close() is needed.
        with open(self._out_path(), 'wb') as open_file:
            box_file.download_to(open_file)

    def run(self):
        """Ensure the output directory exists, then download."""
        check_or_create(self.out_dir())
        self.download()

    def output(self):
        """Return the local target for the downloaded file."""
        return luigi.LocalTarget(self._out_path())
class hadoop(luigi.task.Config):
    """Hadoop settings; currently only the pool, which defaults to None."""
    pool = luigi.OptionalParameter(
        default=None,
        description=(
            'Hadoop pool so use for Hadoop tasks. To specify pools per tasks, '
            'see BaseHadoopJobTask.pool'),
    )