class SeqrMTToESTask(HailElasticSearchTask):
    """Export the seqr matrix table produced by ``SeqrVCFToMTTask`` to Elasticsearch.

    A marker file named ``_EXPORTED_TO_ES`` under ``dest_path`` is written once
    the export has finished; its presence is what marks the task as complete.
    """

    source_paths = luigi.Parameter(
        default="[]",
        description='Path or list of paths of VCFs to be loaded.',
    )
    dest_path = luigi.Parameter(description='Path to write the matrix table.')
    genome_version = luigi.Parameter(description='Reference Genome Version (37 or 38)')
    vep_runner = luigi.ChoiceParameter(
        choices=['VEP', 'DUMMY'],
        default='VEP',
        description='Choice of which vep runner to annotate vep.',
    )

    reference_ht_path = luigi.Parameter(
        default=None,
        description='Path to the Hail table storing the reference variants.',
    )
    clinvar_ht_path = luigi.Parameter(
        default=None,
        description='Path to the Hail table storing the clinvar variants.',
    )
    hgmd_ht_path = luigi.Parameter(
        default=None,
        description='Path to the Hail table storing the hgmd variants.',
    )
    sample_type = luigi.ChoiceParameter(
        default="WES",
        choices=['WGS', 'WES'],
        description='Sample type, WGS or WES',
    )
    dont_validate = luigi.BoolParameter(
        description='Disable checking whether the dataset matches the specified '
                    'genome version and WGS vs. WES sample type.',
    )
    dataset_type = luigi.ChoiceParameter(
        choices=['VARIANTS', 'SV'],
        default='VARIANTS',
        description='VARIANTS or SV.',
    )
    remap_path = luigi.OptionalParameter(
        default=None,
        description="Path to a tsv file with two columns: s and seqr_id.",
    )
    subset_path = luigi.OptionalParameter(
        default=None,
        description="Path to a tsv file with one column of sample IDs: s.",
    )
    vep_config_json_path = luigi.OptionalParameter(
        default=None,
        description="Path of hail vep config .json file",
    )

    def __init__(self, *args, **kwargs):
        # TODO: instead of hardcoded index, generate from project_guid, etc.
        # NOTE(review): ``self.dest_path`` is read before the parent initializer
        # has run -- confirm it holds the intended value at this point.
        kwargs['source_path'] = self.dest_path
        super().__init__(*args, **kwargs)

        marker_name = '_EXPORTED_TO_ES'
        self.completed_marker_path = os.path.join(self.dest_path, marker_name)

    def requires(self):
        """Return the upstream VCF-to-MT task, forwarding this task's parameters."""
        forwarded = (
            'source_paths',
            'dest_path',
            'genome_version',
            'vep_runner',
            'reference_ht_path',
            'clinvar_ht_path',
            'hgmd_ht_path',
            'sample_type',
            'dont_validate',
            'dataset_type',
            'remap_path',
            'subset_path',
            'vep_config_json_path',
        )
        upstream_kwargs = {name: getattr(self, name) for name in forwarded}
        return [SeqrVCFToMTTask(**upstream_kwargs)]

    def output(self):
        """Return the target for the export-completed marker file."""
        # TODO: Use https://luigi.readthedocs.io/en/stable/api/luigi.contrib.esindex.html.
        return GCSorLocalTarget(filename=self.completed_marker_path)

    def complete(self):
        """Report completion based on the ``_EXPORTED_TO_ES`` marker file.

        Luigi calls this to decide whether the task can be skipped. The marker
        is checked so that an export that was terminated halfway is not treated
        as done.
        """
        marker = GCSorLocalTarget(filename=self.completed_marker_path)
        return marker.exists()

    def run(self):
        """Export the matrix table rows to Elasticsearch, then write the marker."""
        matrix_table = self.import_mt()
        es_rows = SeqrVariantsAndGenotypesSchema.elasticsearch_row(matrix_table)
        shard_count = self._mt_num_shards(matrix_table)
        self.export_table_to_elasticsearch(es_rows, shard_count)

        # The marker is written only after the export call has returned.
        with hl.hadoop_open(self.completed_marker_path, "w") as marker_file:
            marker_file.write(".")

        self.cleanup()
Example #2
0
class CopyFiles(luigi.Task):
    """Luigi task that forwards its parameters to saisoku's ``ThreadedCopy``."""

    src = luigi.Parameter()
    dst = luigi.Parameter()
    threads = luigi.IntParameter(default=16)
    filelist = luigi.OptionalParameter(default=None)
    symlinks = luigi.BoolParameter(default=False)
    ignore = luigi.OptionalParameter(default=None)
    copymeta = luigi.BoolParameter(default=True)

    def run(self):
        """Invoke ``ThreadedCopy`` with the task's parameters as keyword arguments."""
        # Imported lazily, as in the original, so saisoku is only needed at run time.
        from saisoku import ThreadedCopy

        copy_options = {
            'src': self.src,
            'dst': self.dst,
            'threads': self.threads,
            'filelist': self.filelist,
            'symlinks': self.symlinks,
            'ignore': self.ignore,
            'copymeta': self.copymeta,
        }
        ThreadedCopy(**copy_options)
Example #3
0
class GetMarcapCodes(BaseTask):
    """Collect the distinct ``code`` values of the ``marcap`` collection.

    Parameters:
        markets: JSON-encoded list of market names; empty means no market filter.
        year: Year prefix used to filter the ``date`` field; empty means no
            date filter.
        month: Optional month, zero-padded to two digits and appended to the
            year prefix.
    """
    markets = luigi.OptionalParameter("")
    year = luigi.OptionalParameter("")
    month = luigi.OptionalParameter("")

    def makeQuery(self) -> Dict:
        """Build the MongoDB filter document from the task parameters.

        Returns:
            A dict with an ``$or`` clause per market (when markets are given)
            and a ``date`` regex clause (when a year is given).
        """
        query = {}
        # An empty/unset ``markets`` means "no market filter"; json.loads("")
        # would raise, so it is only parsed when there is something to parse.
        markets = json.loads(self.markets) if self.markets else []
        if markets:
            query["$or"] = [{"market": market} for market in markets]

        # Callers may hand over ints instead of strings, so normalise before
        # testing for emptiness or zero-padding.
        year = "" if self.year is None else str(self.year)
        month = "" if self.month is None else str(self.month)
        if year:
            # The date filter is only meaningful with a year; previously the
            # regex was built unconditionally and failed when year was empty.
            prefix = year + (month.zfill(2) if month else "")
            query["date"] = {"$regex": f"^{prefix}", "$options": "i"}
        print(query)
        return query

    def run(self) -> Generator:
        """Query the distinct codes and store them as an HDF series."""
        path = self.makeDirs()
        target = yield MongoGetCollectionTask(index="stock",
                                              collection="marcap")
        collection = target.get_collection()
        cursor = collection.distinct("code", self.makeQuery())
        df = pd.Series(list(cursor))
        df.to_hdf(path, key='df', mode='w')
        print(df)

    def makePath(self) -> str:
        """Return the output path; the markets string is folded in as an MD5 digest."""
        result = hashlib.md5(f'{self.markets}'.encode())
        return f'data/marcap/codes/GetMarcapCodes-{self.year}-{self.month}-{result.hexdigest()}'
Example #4
0
class GetStockMonthTask(BaseTask):
    """Fetch one month of ``marcap`` documents for a single market into an HDF file."""
    year = luigi.OptionalParameter("")
    month = luigi.OptionalParameter("")
    market = luigi.OptionalParameter("")

    def run(self) -> Generator:
        """Query the collection by date prefix and market, then persist the rows."""
        out_path = self.makeDirs()
        mongo_target = yield MongoGetCollectionTask(index="stock",
                                                    collection="marcap")
        marcap = mongo_target.get_collection()

        # Dates are matched by a "<year><two-digit month>" prefix.
        date_clause = {
            "date": {
                "$regex": f"^{self.year}{str(self.month).zfill(2)}",
                "$options": "i"
            }
        }
        market_clause = {"market": self.market}
        rows = marcap.find({"$and": [date_clause, market_clause]})

        frame = pd.DataFrame(list(rows))
        frame.to_hdf(out_path, key='df', mode='w')
        print(frame)

    def makePath(self) -> str:
        """Return the output path derived from market, year and month."""
        return f'data/stock/month/stock-marcap-{self.market}-{self.year}-{self.month}'
Example #5
0
class BatchTask(luigi.Task):
    """
    Base class for an Amazon Batch job

    Amazon Batch works with pre-registered "job definitions" (JSON descriptions
    of how to issue the ``docker run`` command). The name of such a definition
    is passed to this Luigi Task as a Parameter.

    :param job_definition (str): name of pre-registered jobDefinition
    :param job_name: name of specific job, for tracking in the queue and logs.
    :param job_queue: name of job queue where job is going to be submitted.
    :param poll_time: value handed to ``BatchClient`` on construction.

    """
    job_definition = luigi.Parameter()
    job_name = luigi.OptionalParameter(default=None)
    job_queue = luigi.OptionalParameter(default=None)
    poll_time = luigi.IntParameter(default=POLL_TIME)

    def run(self):
        """Submit the job through ``BatchClient`` and wait on the returned job id."""
        client = BatchClient(self.poll_time)
        submitted_id = client.submit_job(
            self.job_definition,
            self.parameters,
            job_name=self.job_name,
            queue=self.job_queue,
        )
        client.wait_on_job(submitted_id)

    @property
    def parameters(self):
        """Override to return a dict of parameters for the Batch Task"""
        return {}
Example #6
0
class CalculateLonLatGrids(luigi.Task):

    """Calculates the longitude and latitude grids."""

    level1 = luigi.Parameter()
    work_root = luigi.Parameter(significant=False)
    granule = luigi.OptionalParameter(default="")
    group = luigi.Parameter()
    acq_parser_hint = luigi.OptionalParameter(default="")
    compression = luigi.EnumParameter(
        enum=H5CompressionFilter,
        default=H5CompressionFilter.LZF,
        significant=False,
    )
    filter_opts = luigi.DictParameter(default=None, significant=False)
    buffer_distance = luigi.FloatParameter(default=8000, significant=False)

    def requires(self):
        """Depend on ``WorkRoot`` for the parent directory of ``work_root``."""
        # we want to pass the level1 root not the granule root
        level1_root = dirname(self.work_root)
        return WorkRoot(self.level1, level1_root)

    def output(self):
        """Return the local target ``<work_root>/<group>/longitude-latitude.h5``."""
        return luigi.LocalTarget(
            pjoin(self.work_root, self.group, "longitude-latitude.h5")
        )

    def run(self):
        """Create the grids from the first acquisition of the group/granule."""
        container = acquisitions(self.level1, self.acq_parser_hint)
        first_acq = container.get_acquisitions(self.group, self.granule)[0]

        # Written through a temporary path provided by the output target.
        with self.output().temporary_path() as out_fname:
            _create_lon_lat_grids(
                first_acq, out_fname, self.compression, self.filter_opts
            )
Example #7
0
class CutadaptTask(RemoveTaskOutputOnFailureMixin,
                   ScheduledExternalProgramTask):
    """
    Common base for tasks that invoke cutadapt.

    ``program_args`` assembles the shared part of the command line; optional
    flags are only emitted when the matching parameter is truthy.
    """
    task_namespace = 'cutadapt'

    adapter_3prime = luigi.OptionalParameter(default='', positional=False)
    adapter_5prime = luigi.OptionalParameter(default='', positional=False)

    cut = luigi.IntParameter(default=0, positional=False)
    trim_n = luigi.BoolParameter(default=False, positional=False)
    minimum_length = luigi.IntParameter(default=0, positional=False)

    def program_args(self):
        """Return the cutadapt argument list built from the task parameters."""
        args = [cfg.cutadapt_bin, '-j', self.cpus]

        if self.adapter_3prime:
            args += ['-a', self.adapter_3prime]

        if self.adapter_5prime:
            args += ['-g', self.adapter_5prime]

        if self.cut:
            args += ['-u', self.cut]

        if self.trim_n:
            args += ['--trim-n']

        if self.minimum_length:
            args += ['--minimum-length', self.minimum_length]

        return args
Example #8
0
class ExternalDatabaseCredentialsParameter(ExternalDatabaseHostParameter,
                                           ExternalDatabaseXMLRPCParameter):
    """Credential parameters for an external Exasol database.

    The two password parameters are declared non-significant and hidden.
    """

    external_exasol_db_user = luigi.OptionalParameter()
    external_exasol_db_password = luigi.OptionalParameter(
        significant=False,
        visibility=ParameterVisibility.HIDDEN,
    )
    external_exasol_bucketfs_write_password = luigi.OptionalParameter(
        significant=False,
        visibility=ParameterVisibility.HIDDEN,
    )
Example #9
0
class ExternalDatabaseXMLRPCParameter(Config):
    """XML-RPC connection parameters for an external Exasol database.

    The port defaults to 443; the password is non-significant and hidden.
    """

    external_exasol_xmlrpc_host = luigi.OptionalParameter()
    external_exasol_xmlrpc_port = luigi.IntParameter(default=443)
    external_exasol_xmlrpc_user = luigi.OptionalParameter()
    external_exasol_xmlrpc_cluster_name = luigi.OptionalParameter()
    external_exasol_xmlrpc_password = luigi.OptionalParameter(
        significant=False,
        visibility=ParameterVisibility.HIDDEN,
    )
Example #10
0
class Annotate(BcftoolsTask):
    """
    Build a ``bcftools annotate`` invocation that writes ``output_file``.
    """
    output_file = luigi.Parameter()
    output_format = luigi.Parameter(positional=False, default='z')

    # options given an annotation file
    annotations_file = luigi.OptionalParameter(positional=False, default=None)

    columns = luigi.ListParameter(positional=False, default=[])

    rename_chrs = luigi.OptionalParameter(positional=False, default=None)

    def subcommand_args(self):
        """Return the ``annotate`` sub-command and its options as a list."""
        args = ['annotate']

        if self.rename_chrs is not None:
            args += ['--rename-chrs', self.rename_chrs]

        # The column list is only passed along with an annotations file.
        if self.annotations_file:
            args += ['-a', self.annotations_file,
                     '-c', ','.join(self.columns)]

        args += ['--output-type', self.output_format,
                 '--output', self.output_file]

        return args

    def output(self):
        """Return the local target for ``output_file``."""
        return luigi.LocalTarget(self.output_file)
class SeqrVCFToGenotypesMTTask(HailMatrixTableTask):
    """Annotate the variant matrix table with the seqr genotypes schema.

    Sample ids are optionally remapped and the table optionally subset before
    the annotations are applied and the result is written to the task output.
    """
    remap_path = luigi.OptionalParameter(
        default=None,
        description="Path to a tsv file with two columns: s and seqr_id.",
    )
    subset_path = luigi.OptionalParameter(
        default=None,
        description="Path to a tsv file with one column of sample IDs: s.",
    )

    def requires(self):
        """Depend on the variant matrix table task."""
        return [SeqrVCFToVariantMTTask()]

    def run(self):
        """Read the upstream table, remap/subset, annotate, and write it out."""
        # Hack that fixes something in Hail. TODO: Remove when Hail fix comes.
        hl._set_flags(newaggs=None)
        upstream_path = self.input()[0].path
        mt = hl.read_matrix_table(upstream_path)

        if self.remap_path:
            mt = self.remap_sample_ids(mt, self.remap_path)
        if self.subset_path:
            mt = self.subset_samples_and_variants(mt, self.subset_path)

        schema = SeqrGenotypesSchema(mt)
        mt = schema.annotate_all(overwrite=True).select_annotated_mt()

        mt.describe()
        mt.write(self.output().path, stage_locally=True, overwrite=True)
Example #12
0
class TrimPairedReads(CutadaptTask):
    """Cutadapt task for a pair of input files producing a pair of outputs."""
    input_file = luigi.Parameter()
    input2_file = luigi.Parameter()
    output_file = luigi.Parameter()
    output2_file = luigi.Parameter()

    reverse_adapter_3prime = luigi.OptionalParameter(
        default='', positional=False)
    reverse_adapter_5prime = luigi.OptionalParameter(
        default='', positional=False)

    def program_args(self):
        """Extend the base arguments with reverse adapters, outputs and inputs."""
        args = super(TrimPairedReads, self).program_args()

        if self.reverse_adapter_3prime:
            args += ['-A', self.reverse_adapter_3prime]
        if self.reverse_adapter_5prime:
            args += ['-G', self.reverse_adapter_5prime]

        # Output options come first, the two input files go last.
        args += ['-o', self.output_file, '-p', self.output2_file]
        args += [self.input_file, self.input2_file]
        return args

    def output(self):
        """Return local targets for both output files, in order."""
        paths = (self.output_file, self.output2_file)
        return [luigi.LocalTarget(path) for path in paths]
Example #13
0
class GetStockRangeTask(BaseTask):
    """Fetch ``marcap`` documents of one market within a date range into an HDF file."""
    startDate = luigi.OptionalParameter("")
    endDate = luigi.OptionalParameter("")
    market = luigi.OptionalParameter("")

    def run(self) -> Generator:
        """Query by inclusive date bounds and market, then persist the rows."""
        out_path = self.makeDirs()
        mongo_target = yield MongoGetCollectionTask(index="stock",
                                                    collection="marcap")
        marcap = mongo_target.get_collection()

        date_clause = {
            "date": {
                "$gte": self.startDate,
                "$lte": self.endDate
            }
        }
        market_clause = {"market": self.market}
        rows = marcap.find({"$and": [date_clause, market_clause]})

        frame = pd.DataFrame(list(rows))
        frame.to_hdf(out_path, key='df', mode='w')
        print(frame)

    def makePath(self) -> str:
        """Return the output path derived from market and the two dates."""
        return f'data/stock/range/stock-marcap-{self.market}-{self.startDate}-{self.endDate}'
Example #14
0
class LinkwaglOutputs(luigi.Task):

    """
    Link all the multifile outputs from wagl into a single file.

    Every ``.h5`` file found under ``work_root`` (outside directories whose
    name starts with an underscore) has its top-level entries linked into the
    output file, and the ``level1_uri`` attribute is set on the result.
    """

    level1 = luigi.Parameter()
    work_root = luigi.Parameter()
    granule = luigi.OptionalParameter(default="")
    acq_parser_hint = luigi.OptionalParameter(default="")
    workflow = luigi.EnumParameter(enum=Workflow)
    vertices = luigi.TupleParameter(default=(5, 5))
    pixel_quality = luigi.BoolParameter()
    method = luigi.EnumParameter(enum=Method, default=Method.SHEAR)
    dsm_fname = luigi.Parameter(significant=False)
    buffer_distance = luigi.FloatParameter(default=8000, significant=False)

    def requires(self):
        """Yield one ``DataStandardisation`` task per supported group."""
        container = acquisitions(self.level1, self.acq_parser_hint)
        for group in container.supported_groups:
            kwargs = {
                "level1": self.level1,
                "work_root": self.work_root,
                "granule": self.granule,
                "group": group,
                "workflow": self.workflow,
                "vertices": self.vertices,
                "pixel_quality": self.pixel_quality,
                "method": self.method,
                "dsm_fname": self.dsm_fname,
                "buffer_distance": self.buffer_distance,
            }
            yield DataStandardisation(**kwargs)

    def output(self):
        """Return the target ``<granule>.h5`` next to ``work_root``."""
        out_fname = pjoin(dirname(self.work_root), "{}.h5".format(self.granule))
        return luigi.LocalTarget(out_fname)

    def run(self):
        """Walk ``work_root`` and link every HDF5 file's entries into the output."""
        with self.output().temporary_path() as out_fname:
            for root, _, files in os.walk(self.work_root):
                # skip any private files
                # startswith() is used instead of indexing [0] because the
                # basename is empty when the walked path ends with a separator,
                # which would raise IndexError.
                if basename(root).startswith("_"):
                    continue

                for file_ in files:
                    if splitext(file_)[1] != ".h5":
                        continue

                    fname = pjoin(root, file_)
                    grp_name = basename(dirname(fname.replace(self.work_root, "")))

                    # Collect the names first so the source file is closed
                    # before the links are created.
                    with h5py.File(fname, "r") as fid:
                        groups = list(fid)

                    for pth in groups:
                        new_path = ppjoin(self.granule, grp_name, pth)
                        create_external_link(fname, pth, out_fname, new_path)

            with h5py.File(out_fname, "a") as fid:
                fid.attrs["level1_uri"] = self.level1
Example #15
0
class AncillaryData(luigi.Task):

    """Collect the ancillary data into ``<work_root>/ancillary.h5``."""

    level1 = luigi.Parameter()
    work_root = luigi.Parameter(significant=False)
    granule = luigi.OptionalParameter(default="")
    vertices = luigi.TupleParameter()
    workflow = luigi.EnumParameter(enum=Workflow)
    acq_parser_hint = luigi.OptionalParameter(default="")
    aerosol = luigi.DictParameter(default={"user": 0.05}, significant=False)
    brdf = luigi.DictParameter()
    ozone_path = luigi.Parameter(significant=False)
    water_vapour = luigi.DictParameter(default={"user": 1.5}, significant=False)
    dem_path = luigi.Parameter(significant=False)
    ecmwf_path = luigi.Parameter(significant=False)
    invariant_height_fname = luigi.Parameter(significant=False)
    compression = luigi.EnumParameter(
        enum=H5CompressionFilter,
        default=H5CompressionFilter.LZF,
        significant=False,
    )
    filter_opts = luigi.DictParameter(default=None, significant=False)

    def requires(self):
        """Depend on the satellite/solar grids of the first supported group."""
        container = acquisitions(self.level1, self.acq_parser_hint)
        first_group = container.supported_groups[0]
        return CalculateSatelliteAndSolarGrids(
            self.level1, self.work_root, self.granule, first_group
        )

    def output(self):
        """Return the local target for the ancillary HDF5 file."""
        ancillary_fname = pjoin(self.work_root, "ancillary.h5")
        return luigi.LocalTarget(ancillary_fname)

    def run(self):
        """Gather the ancillary inputs and pass them to ``_collect_ancillary``."""
        container = acquisitions(self.level1, self.acq_parser_hint)
        grn = container.get_granule(granule=self.granule, container=True)

        nbar_paths = dict(
            aerosol_dict=self.aerosol,
            water_vapour_dict=self.water_vapour,
            ozone_path=self.ozone_path,
            dem_path=self.dem_path,
            brdf_dict=self.brdf,
        )

        # The ECMWF path is only forwarded for the STANDARD and SBT workflows.
        if self.workflow in (Workflow.STANDARD, Workflow.SBT):
            sbt_path = self.ecmwf_path
        else:
            sbt_path = None

        with self.output().temporary_path() as out_fname:
            _collect_ancillary(
                grn,
                self.input().path,
                nbar_paths,
                sbt_path,
                self.invariant_height_fname,
                self.vertices,
                out_fname,
                self.compression,
                self.filter_opts,
            )
Example #16
0
class BcftoolsTask(ScheduledExternalProgramTask):
    """Base task assembling a bcftools command line.

    Subclasses provide the sub-command through ``subcommand_args``; the shared
    filtering options declared here are appended after it, followed by the
    input arguments.
    """
    task_namespace = 'bcftools'

    input_file = luigi.Parameter()

    include = luigi.OptionalParameter(positional=False, default=None)
    exclude = luigi.OptionalParameter(positional=False, default=None)
    regions = luigi.ListParameter(default=[], positional=False)
    regions_file = luigi.OptionalParameter(positional=False, default=None)
    samples = luigi.ListParameter(default=[], positional=False)
    samples_file = luigi.OptionalParameter(default=None, positional=False)
    apply_filters = luigi.OptionalParameter(positional=False, default=None)

    # FIXME: the '--threads' flag does not seem to work

    def subcommand_args(self):
        """Returns specific sub-command arguments."""
        raise NotImplementedError

    def subcommand_input_args(self):
        """
        Returns arguments to be appended at the input file location.

        This hook exists to deal with commands that accept multiple input
        files.
        """
        return [self.input_file]

    def program_args(self):
        """Return the full argument list: binary, sub-command, options, inputs."""
        args = [cfg.bcftools_bin]
        args += self.subcommand_args()

        if self.include is not None:
            args += ['-i', self.include]

        if self.exclude is not None:
            args += ['-e', self.exclude]

        if self.regions:
            args += ['-r', ','.join(self.regions)]

        if self.regions_file is not None:
            args += ['-R', self.regions_file]

        if self.samples:
            args += ['-s', ','.join(self.samples)]

        # Unlike its siblings this option is tested for truthiness, not None.
        if self.samples_file:
            args += ['-S', self.samples_file]

        if self.apply_filters is not None:
            args += ['-f', self.apply_filters]

        args += self.subcommand_input_args()

        return args
Example #17
0
class hdfs(luigi.Config):
    """HDFS client settings.

    ``tmp_dir`` is additionally read from the ``hdfs-tmp-dir`` entry of the
    ``core`` configuration section.
    """

    client_version = luigi.IntParameter(default=None)
    namenode_host = luigi.OptionalParameter(default=None)
    namenode_port = luigi.IntParameter(default=None)
    client = luigi.Parameter(default='hadoopcli')
    tmp_dir = luigi.OptionalParameter(
        default=None,
        config_path={'section': 'core', 'name': 'hdfs-tmp-dir'},
    )
Example #18
0
class target_docker_repository_config(luigi.Config):
    """Settings of the target docker repository.

    Username and password are non-significant and have private visibility.
    """

    repository_name = luigi.Parameter(default="exasol/script-language-container")
    tag_prefix = luigi.Parameter(default="")
    username = luigi.OptionalParameter(
        default=None,
        significant=False,
        visibility=ParameterVisibility.PRIVATE,
    )
    password = luigi.OptionalParameter(
        default=None,
        significant=False,
        visibility=ParameterVisibility.PRIVATE,
    )
Example #19
0
class DataStandardisation(luigi.Task):

    """
    Run ``card4l`` for a level1 dataset and write ``<label>.wagl.h5`` to ``outdir``.
    """
    level1 = luigi.Parameter()
    outdir = luigi.Parameter()
    granule = luigi.OptionalParameter(default='')
    workflow = luigi.EnumParameter(enum=Workflow, default=Workflow.STANDARD)
    vertices = luigi.TupleParameter(default=(5, 5))
    method = luigi.EnumParameter(enum=Method, default=Method.SHEAR)
    pixel_quality = luigi.BoolParameter()
    land_sea_path = luigi.Parameter()
    aerosol = luigi.DictParameter(default={'user': 0.05})
    brdf = luigi.DictParameter()
    ozone_path = luigi.Parameter(significant=False)
    water_vapour = luigi.DictParameter(
        default={'user': 1.5}, significant=False)
    dem_path = luigi.Parameter(significant=False)
    ecmwf_path = luigi.Parameter(significant=False)
    invariant_height_fname = luigi.Parameter(significant=False)
    dsm_fname = luigi.Parameter(significant=False)
    modtran_exe = luigi.Parameter(significant=False)
    tle_path = luigi.Parameter(significant=False)
    rori = luigi.FloatParameter(default=0.52, significant=False)
    compression = luigi.EnumParameter(
        enum=H5CompressionFilter,
        default=H5CompressionFilter.LZF,
        significant=False)
    filter_opts = luigi.DictParameter(default=None, significant=False)
    acq_parser_hint = luigi.OptionalParameter(default='')
    buffer_distance = luigi.FloatParameter(default=8000, significant=False)
    h5_driver = luigi.OptionalParameter(default='', significant=False)
    normalized_solar_zenith = luigi.FloatParameter(default=45.0)

    def output(self):
        """Return the output target, labelled by granule or the level1 basename."""
        # Fall back to the level1 basename when no granule is given.
        label = self.granule or basename(self.level1)
        out_fname = '{label}.wagl.h5'.format(label=label)

        return luigi.LocalTarget(pjoin(self.outdir, out_fname))

    def run(self):
        """Invoke ``card4l`` with the task parameters, writing via a temporary path."""
        # The ECMWF path is only forwarded for the STANDARD and SBT workflows.
        uses_ecmwf = self.workflow in (Workflow.STANDARD, Workflow.SBT)
        ecmwf_path = self.ecmwf_path if uses_ecmwf else None

        with self.output().temporary_path() as out_fname:
            card4l(
                self.level1,
                self.granule,
                self.workflow,
                self.vertices,
                self.method,
                self.pixel_quality,
                self.land_sea_path,
                self.tle_path,
                self.aerosol,
                self.brdf,
                self.ozone_path,
                self.water_vapour,
                self.dem_path,
                self.dsm_fname,
                self.invariant_height_fname,
                self.modtran_exe,
                out_fname,
                ecmwf_path,
                self.rori,
                self.buffer_distance,
                self.compression,
                self.filter_opts,
                self.h5_driver,
                self.acq_parser_hint,
                self.normalized_solar_zenith,
            )
Example #20
0
class build_config(luigi.Config):
    """Build settings: force flags, build-context logging and directory locations."""

    force_pull = luigi.BoolParameter(default=False)
    force_load = luigi.BoolParameter(default=False)
    force_rebuild = luigi.BoolParameter(default=False)
    force_rebuild_from = luigi.ListParameter(default=[])
    log_build_context_content = luigi.BoolParameter(default=False)
    #keep_build_context = luigi.BoolParameter(False)
    temporary_base_directory = luigi.OptionalParameter(default=None)
    output_directory = luigi.Parameter(default=".build_output")
    cache_directory = luigi.OptionalParameter(default="")
Example #21
0
class DatasetComponentsAnnotationMapImage(ComponentsAnnotationMapImage):
    """Components annotation map image for one scene of a dataset.

    The input is either the raw prediction map data or, when ``transform_type``
    is given, the transformed embedding.
    """
    data_path = luigi.Parameter()
    step_size = luigi.Parameter()
    model_path = luigi.Parameter()
    scene_id = luigi.Parameter()
    transform_type = luigi.OptionalParameter()
    transform_extra_args = luigi.OptionalParameter()
    pretrained_transform_model = luigi.OptionalParameter(default=None)
    components = luigi.ListParameter(default=[0, 1, 2])
    crop_img = luigi.BoolParameter(default=False)

    def requires(self):
        """Return the upstream task, chosen by whether a transform is requested."""
        shared = dict(
            data_path=self.data_path,
            scene_id=self.scene_id,
            model_path=self.model_path,
            step_size=self.step_size,
        )
        if self.transform_type is None:
            return DatasetImagePredictionMapData(**shared)

        return DatasetEmbeddingTransform(
            transform_type=self.transform_type,
            transform_extra_args=self.transform_extra_args,
            pretrained_model=self.pretrained_transform_model,
            **shared
        )

    @property
    def input_path(self):
        """Path of the upstream output (its ``fn`` attribute)."""
        return self.input().fn

    @property
    def src_data_path(self):
        """The dataset path given by ``data_path``."""
        return self.data_path

    def output(self):
        """Return the target for the rendered components map image."""
        model_name = Path(self.model_path).name.replace(".pkl", "")
        components_label = "_".join([str(v) for v in self.components])

        fn = "{}.{}_step.{}_transform.map.{}__comp.png".format(
            self.scene_id,
            self.step_size,
            self.transform_type,
            components_label,
        )

        out_dir = Path(self.data_path) / "embeddings" / "rect" / model_name
        # A pretrained transform model adds one directory level to the path.
        if self.pretrained_transform_model is not None:
            out_dir = out_dir / self.pretrained_transform_model

        return XArrayTarget(str(out_dir / "components_map" / fn))
Example #22
0
class MongoGetCollectionTask(luigi.Task):
    """Task whose output is a ``MongoCollectionTarget`` for ``index``/``collection``."""
    index = luigi.OptionalParameter("")
    collection = luigi.OptionalParameter("")

    def output(self) -> MongoCollectionTarget:
        """Connect using the credentials from ``config`` and wrap the collection."""
        userName, password = config["mongodbUserName"], config["mongodbPassword"]
        host, port = config["mongodbHost"], config["mongodbPort"]
        # NOTE(review): the credentials are interpolated into the URI as-is;
        # confirm they never contain characters that need percent-escaping.
        uri = f'mongodb://{userName}:{password}@{host}:{port}'
        return MongoCollectionTarget(MongoClient(uri), self.index, self.collection)
Example #23
0
class GetStockCodeFilteringByFactorRank(BaseTask):
    """Filter stock codes by their rank on a yearly factor value.

    Parameters:
        date: Date string whose first four characters are the year and whose
            next two are the month.
        factor: Factor name forwarded to ``GetFactorYearTask``.
        markets: JSON-encoded list of markets forwarded to ``GetMarcapCodes``.
        targets: JSON-encoded value; only contributes to the output path hash.
        ascending: Sort and rank direction.
        limit: Maximum number of rows (or maximum rank when ``includeSame``).
        includeSame: When true, keep every row whose rank is within ``limit``
            so that ties are retained; otherwise keep the first ``limit`` rows.
    """
    date = luigi.OptionalParameter("")
    factor = luigi.OptionalParameter("")
    markets = luigi.OptionalParameter("")
    targets = luigi.OptionalParameter("")
    ascending = luigi.BoolParameter(True)
    limit = luigi.IntParameter(sys.maxsize)
    includeSame = luigi.BoolParameter(True)

    def run(self) -> Generator:
        """Select the ranked codes and store the resulting frame as HDF."""
        path = self.makeDirs()
        year = int(self.date[:4])
        month = int(self.date[4:6])
        limit = int(self.limit)

        # The parsed values are not used below (the candidate codes come from
        # GetMarcapCodes); the calls are kept so malformed JSON still raises here.
        json.loads(self.markets)
        if self.targets:
            json.loads(self.targets)

        # Year and month are passed as strings: GetMarcapCodes declares them as
        # string parameters and applies string operations to them.
        marcapOutput = yield GetMarcapCodes(markets=self.markets,
                                            year=str(year),
                                            month=str(month))
        targets = pd.read_hdf(marcapOutput.path).to_list()

        # For months up to April the previous year's factor data is used.
        factorYear = year - 1 if month <= 4 else year
        factorTarget = yield GetFactorYearTask(year=str(factorYear),
                                               name=self.factor)
        factorDf: pd.DataFrame = pd.read_hdf(factorTarget.path)
        if factorDf.empty:
            return

        # Build new frames instead of mutating the filtered slice in place,
        # which triggered pandas' chained-assignment warning.
        newDf = factorDf[factorDf["code"].isin(targets)].sort_values(
            by="dataValue", ascending=self.ascending)
        if not self.includeSame:
            newDf = newDf.iloc[0:limit]
        else:
            newDf = newDf.assign(
                rank=newDf["dataValue"].rank(method="min",
                                             ascending=self.ascending))
            newDf = newDf[newDf["rank"] <= limit]

        newDf.to_hdf(path, key='df', mode='w')
        print(newDf)
        print(path)

    def makePath(self) -> str:
        """Return the output path; markets and targets are folded in as an MD5 digest."""
        result = hashlib.md5(f'{self.markets}{self.targets}'.encode())
        return f"data/simul/factor/GetStockCodeFilteringByFactorRank-{self.date}-{self.factor}-{self.ascending}-{self.limit}-{result.hexdigest()}"
Example #24
0
class GetFactorYearTask(BaseTask):
    """Fetch factor documents for a year/month/name into an HDF file.

    Parameters:
        year: Year to match; formatted with one decimal place for the query.
        month: Month to match against ``dataMonth`` (default ``"12"``).
        name: Factor name; matched exactly when ``exact`` is set, otherwise
            used as a regular expression.
        exact: Whether ``name`` must match ``dataName`` exactly.
    """
    year = luigi.OptionalParameter("")
    month = luigi.OptionalParameter("12")
    name = luigi.OptionalParameter("")
    exact = luigi.BoolParameter(False)

    def makeAndQuery(self) -> List:
        """Return the list of clauses to be combined under ``$and``."""
        query = []
        if self.year:
            query.append({"dataYear": "{:.1f}".format(int(self.year))})
        if self.month:
            query.append({"dataMonth": self.month})
        if self.name:
            if self.exact:
                query.append({"dataName": self.name})
            else:
                query.append({"dataName": {"$regex": self.name}})
        print(query)
        return query

    def run(self) -> Generator:
        """Query the factor collection, clean the frame and store it as HDF."""
        path = self.makeDirs()
        target = yield MongoGetCollectionTask(index="stock",
                                              collection="factor")
        collection = target.get_collection()
        cursor = collection.find({"$and": self.makeAndQuery()})

        df = pd.DataFrame(list(cursor))
        df["_id"] = df["_id"].astype(str)
        df["createdAt"] = df["createdAt"].astype(str)
        df["updatedAt"] = df["updatedAt"].astype(str)
        # With errors="coerce", values that cannot be converted to a number
        # become NaN.
        df["dataValue"] = pd.to_numeric(df["dataValue"], errors="coerce")
        # dropna returns a new frame; the result must be kept, otherwise the
        # NaN rows are written out unchanged.
        df = df.dropna(subset=['dataValue'])
        df.to_hdf(path, key='df', mode='w')
        print(df)

    def makePath(self) -> str:
        """Return the output path, appending each non-empty parameter in turn."""
        path = 'data/factor/fnguide/year/factor-fnguide'
        for part in (str(self.year), str(self.month), str(self.name)):
            if part:
                path = path + f"-{part}"
        return path
Example #25
0
class hdfs(luigi.Config):
    """Configuration values for the HDFS client.

    Parameters are declared in their original order because luigi derives
    positional parameter order from declaration order.
    """

    client_version = luigi.IntParameter(default=None)
    effective_user = luigi.OptionalParameter(
        default=os.environ.get('HADOOP_USER_NAME'),
        description=(
            "Optionally specifies the effective user for snakebite. If not "
            "set the environment variable HADOOP_USER_NAME is used, else USER"
        ),
    )
    snakebite_autoconfig = luigi.BoolParameter(default=False)
    namenode_host = luigi.OptionalParameter(default=None)
    namenode_port = luigi.IntParameter(default=None)
    client = luigi.Parameter(default='hadoopcli')
    # Read from the [core] section under the name "hdfs-tmp-dir".
    tmp_dir = luigi.OptionalParameter(
        default=None,
        config_path={'section': 'core', 'name': 'hdfs-tmp-dir'},
    )
Example #26
0
class WorkRoot(luigi.Task):
    """
    Set up the work root directory tree ahead of time, including the
    sub directories whose creation could otherwise race later on.
    """

    level1 = luigi.Parameter()
    work_root = luigi.Parameter(significant=False)
    acq_parser_hint = luigi.OptionalParameter(default="")
    reflectance_dir = "_standardised"
    shadow_dir = "_shadow"
    interpolation_dir = "_interpolation"

    def output(self):
        """Yield one LocalTarget per (granule, group, sub directory)."""
        subdirs = [self.reflectance_dir, self.shadow_dir, self.interpolation_dir]
        acqs = acquisitions(self.level1, self.acq_parser_hint)
        for granule in acqs.granules:
            for group in acqs.supported_groups:
                root = acqs.get_root(self.work_root, group, granule)
                yield from (luigi.LocalTarget(pjoin(root, sub)) for sub in subdirs)

    def run(self):
        """Create the directory behind every output target."""
        fs = LocalFileSystem()
        for directory in (tgt.path for tgt in self.output()):
            fs.mkdir(directory)
Example #27
0
class RunFmask(luigi.Task):
    """
    Run the Fmask algorithm on a single granule.
    """

    level1 = luigi.Parameter()
    granule = luigi.Parameter()
    workdir = luigi.Parameter()
    acq_parser_hint = luigi.OptionalParameter(default='')

    def requires(self):
        """Depend on the data standardisation task for the same granule."""
        # For now fmask is made to require wagl: there is no point in
        # running fmask if wagl fails.
        upstream = DataStandardisation(self.level1, self.workdir, self.granule)
        return upstream

    def output(self):
        """Return the target for ``<workdir>/<granule>.fmask.img``."""
        img_name = '{}.fmask.img'.format(self.granule)
        return luigi.LocalTarget(pjoin(self.workdir, img_name))

    def run(self):
        """Invoke fmask, writing to a temporary path of the output target."""
        target = self.output()
        with target.temporary_path() as tmp_fname:
            fmask(
                self.level1,
                self.granule,
                tmp_fname,
                self.workdir,
                self.acq_parser_hint,
            )
Example #28
0
class DownloadSraExperiment(DynamicTaskWithOutputMixin, DynamicWrapperTask):
    """
    Download an SRA experiment by dumping one of its SRA runs.

    Experiments may be reprocessed in SRA, which leaves several runs
    attached to the same experiment. Unless a specific run is requested,
    the latest one is used, judged by the lexicographic order of the run
    identifier.
    """
    srr = luigi.OptionalParameter(
        default=None,
        description='Specific SRA run accession to use (defaults to latest)')

    @property
    def sample_id(self):
        """The experiment accession doubles as the sample identifier."""
        return self.srx

    def run(self):
        """Select a run from the input table and yield its dump task."""
        # Per the original note, this raises when no FASTQs are related.
        runs = pd.read_csv(self.input().path)

        if self.srr is None:
            # Descending sort puts the lexicographically largest run first.
            candidates = runs.sort_values('Run', ascending=False)
        else:
            candidates = runs[runs.Run == self.srr]
        selected = candidates.iloc[0]

        # The annotated layout is very often wrong, so rely on the number of
        # mates per spot unless the experiment is explicitly configured as
        # paired.
        configured_paired = self.sample_id in sra_cfg.paired_read_experiments
        is_paired = configured_paired or (selected.spots_with_mates > 0)

        yield DumpSraRun(selected.Run, self.srx, paired_reads=is_paired)
Example #29
0
class Raw(luigi.Task):
    """Download a single Box file into the ``save_to`` directory.

    The file is written to ``<save_to>/<file_name>``, which is also the
    task's output target.
    """
    __name__ = 'Raw'
    # NOTE(review): data_root and overwrite are not referenced anywhere in
    # this class; confirm whether other code reads them.
    data_root = luigi.Parameter(default=os.path.expanduser('~/.emu/'))
    file_id = luigi.IntParameter(description='Box file_id')
    file_name = luigi.Parameter()
    save_to = luigi.OptionalParameter()
    overwrite = luigi.BoolParameter(default=False)

    def __repr__(self):
        """Return a short label showing the path relative to ``.emu/``."""
        cache = self.out_dir().split('.emu/')[-1]
        return '{}(file=/.emu/{}/{})'.format(self.__name__, cache,
                                             self.file_name)

    def out_dir(self):
        """Return the directory the file is saved to."""
        return self.save_to

    def _file_path(self):
        """Return the full destination path of the downloaded file."""
        return os.path.join(self.out_dir(), self.file_name)

    def download(self):
        """Fetch the file identified by ``file_id`` and write it to disk."""
        # jwt() is defined elsewhere; presumably it returns an authenticated
        # Box client -- TODO confirm.
        client = jwt()
        box_file = client.file(self.file_id)

        # The context manager closes the handle on exit, so no explicit
        # close() call is needed.
        with open(self._file_path(), 'wb') as open_file:
            box_file.download_to(open_file)

    def run(self):
        """Prepare the output directory, then download the file."""
        check_or_create(self.out_dir())
        self.download()

    def output(self):
        """Return the local target for the downloaded file."""
        return luigi.LocalTarget(self._file_path())
Example #30
0
class hadoop(luigi.task.Config):
    pool = luigi.OptionalParameter(
        default=None,
        description=(
            'Hadoop pool so use for Hadoop tasks. To specify pools per tasks, '
            'see BaseHadoopJobTask.pool'),
    )