class MeasurementFinderTask(luigi.Task):
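    """Run measurement finding over one Solr batch.

    (Summary inferred from the code below: fetches one page of reports from
    Solr, applies ``run_measurement_finder_full`` to each report's text,
    writes every result to MongoDB, and records the inserted ids in the batch
    output file.)
    """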
    pipeline = luigi.IntParameter()
    job = luigi.IntParameter()
    start = luigi.IntParameter()
    batch = luigi.IntParameter()
    solr_query = luigi.Parameter()
    segment = segmentation.Segmentation()

    def run(self):
        client = MongoClient(util.mongo_host, util.mongo_port)

        try:
            jobs.update_job_status(
                str(self.job), util.conn_string, jobs.IN_PROGRESS,
                "Running MeasurementFinder Batch %s" % self.batch)

            pipeline_config = config.get_pipeline_config(
                self.pipeline, util.conn_string)

            jobs.update_job_status(str(self.job), util.conn_string,
                                   jobs.IN_PROGRESS, "Running Solr query")
            docs = solr_data.query(self.solr_query,
                                   rows=util.row_count,
                                   start=self.start,
                                   solr_url=util.solr_url,
                                   tags=pipeline_config.report_tags,
                                   mapper_inst=util.report_mapper_inst,
                                   mapper_url=util.report_mapper_url,
                                   mapper_key=util.report_mapper_key,
                                   cohort_ids=pipeline_config.cohort)

            filters = dict()
            if pipeline_config.sections and len(pipeline_config.sections) > 0:
                filters[SECTIONS_FILTER] = pipeline_config.sections

            with self.output().open('w') as outfile:
                jobs.update_job_status(str(self.job), util.conn_string,
                                       jobs.IN_PROGRESS,
                                       "Finding terms with MeasurementFinder")
                # TODO incorporate sections and filters
                for doc in docs:
                    meas_results = run_measurement_finder_full(
                        doc["report_text"], pipeline_config.terms)
                    for meas in meas_results:
                        inserted = mongo_writer(client, self.pipeline,
                                                self.job, self.batch,
                                                pipeline_config, meas, doc,
                                                "MeasurementFinder")
                        outfile.write(str(inserted))
                        outfile.write('\n')
                    del meas_results
            del docs
        except Exception as ex:
            traceback.print_exc(file=sys.stderr)
            jobs.update_job_status(str(self.job), util.conn_string,
                                   jobs.WARNING, traceback.format_exc())
            print(ex)
        finally:
            client.close()

    def output(self):
        return luigi.LocalTarget(
            "%s/pipeline_job%s_measurement_finder_batch%s.txt" %
            (util.tmp_dir, str(self.job), str(self.start)))
Example #2
import os
from typing import cast
import luigi
from luijo.config import FileSystem


class {{cookiecutter.task_name}}(luigi.Task):
    """
    This is a starter task to use as a template for the creation of your real
    task.  Let's get started! (...and don't forget to update your docstrings!)

    :cvar hello: the string that follows 'Hello' in the output
    :cvar repeat: the number of times the message should be repeated
    """
    hello: luigi.Parameter = luigi.Parameter(default='Pythonista')
    repeat: luigi.IntParameter = luigi.IntParameter(default=10)

    def requires(self):
        """
        This task has no requirements.

        :return: an empty iteration
        """
        return []

    def output(self) -> luigi.LocalTarget:
        """
        This task returns a local target object containing the number of 'hello'
        lines that were specified by the :py:attr:`HelloLuigi.repeat` parameter.

        :return: the local target output
Example #3
class IntraSessionInteractionsDataFrame(BasePrepareDataFrames):
    sample_days: int = luigi.IntParameter(default=16)
    max_itens_per_session: int = luigi.IntParameter(default=15)
    min_itens_interactions: int = luigi.IntParameter(default=3)
    max_relative_pos: int = luigi.IntParameter(default=3)
    days_test: int = luigi.IntParameter(default=1)
    pos_max_deep: int = luigi.IntParameter(default=1)
    filter_first_interaction: bool = luigi.BoolParameter(default=False)

    def requires(self):
        return CreateIntraSessionInteractionDataset(
            max_itens_per_session=self.max_itens_per_session,
            sample_days=self.sample_days,
            min_itens_interactions=self.min_itens_interactions,
            max_relative_pos=self.max_relative_pos,
            pos_max_deep=self.pos_max_deep)

    @property
    def timestamp_property(self) -> str:
        return "Timestamp"

    @property
    def dataset_dir(self) -> str:
        return DATASET_DIR

    def read_data_frame(self) -> pd.DataFrame:
        df = pd.read_parquet(self.read_data_frame_path)  #.sample(10000)

        # TODO
        if self.filter_first_interaction:
            df = df.groupby(['ItemID_A',
                             'ItemID_B']).head(1).reset_index(drop=True)

        #df["ItemID"]        = df.ItemID_A
        #df['sub_a_b']        = df['sub_a_b'].apply(list)
        df['available_arms'] = None
        df["visit"] = 1

        df_session = df[['SessionID']].drop_duplicates().reset_index().rename(
            columns={"index": 'SessionIDX'})

        df = df.merge(df_session).drop(['SessionID'], axis=1)
        df = df.rename(columns={"ItemID_A": 'ItemID'})

        return df

    @property
    def metadata_data_frame_path(self) -> Optional[str]:
        return self.input()[1].path

    @property
    def read_data_frame_path(self) -> str:
        return self.input()[0].path

    def transform_data_frame(self, df: pd.DataFrame,
                             data_key: str) -> pd.DataFrame:
        print(data_key)
        print(df.describe())

        return df

    def time_train_test_split(
            self, df: pd.DataFrame,
            test_size: float) -> Tuple[pd.DataFrame, pd.DataFrame]:
        df[self.timestamp_property] = pd.to_datetime(
            df[self.timestamp_property])

        if self.timestamp_property:
            df = df.sort_values(self.timestamp_property)

        cutoff_date = df[self.timestamp_property].iloc[-1] - pd.Timedelta(
            days=self.days_test)

        return df[df[self.timestamp_property] < cutoff_date], df[
            df[self.timestamp_property] >= cutoff_date]
Example #4
class CallVariantsWithHaplotypeCaller(VclineTask):
    cf = luigi.DictParameter()
    n_cpu = luigi.IntParameter(default=1)
    memory_mb = luigi.FloatParameter(default=4096)
    sh_config = luigi.DictParameter(default=dict())
    priority = 50

    def output(self):
        run_dir = Path(self.cf['germline_snv_indel_gatk_dir_path']).joinpath(
            Path(self.input()[0][0].path).stem)
        return [
            luigi.LocalTarget(
                run_dir.joinpath(f'{run_dir.name}.haplotypecaller.{s}'))
            for s in ['vcf.gz', 'vcf.gz.tbi', 'cram', 'cram.crai']
        ]

    def run(self):
        output_vcf = Path(self.output()[0].path)
        intervals = [Path(i.path) for i in self.input()[3]]
        skip_interval_split = (len(intervals) == 1)
        fa = Path(self.input()[1][0].path)
        input_cram = Path(self.input()[0][0].path)
        dbsnp_vcf = Path(self.input()[2][0].path)
        output_path_prefix = '.'.join(str(output_vcf).split('.')[:-2])
        if skip_interval_split:
            tmp_prefixes = [output_path_prefix]
        else:
            tmp_prefixes = [
                '{0}.{1}'.format(output_path_prefix, o.stem) for o in intervals
            ]
        input_targets = yield [
            HaplotypeCaller(input_cram_path=str(input_cram),
                            fa_path=str(fa),
                            dbsnp_vcf_path=str(dbsnp_vcf),
                            evaluation_interval_path=str(o),
                            output_path_prefix=s,
                            gatk=self.cf['gatk'],
                            save_memory=self.cf['save_memory'],
                            n_cpu=self.n_cpu,
                            memory_mb=self.memory_mb,
                            sh_config=self.sh_config)
            for o, s in zip(intervals, tmp_prefixes)
        ]
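        # The `yield` above declares dynamic dependencies: luigi suspends run(),
        # schedules the per-interval HaplotypeCaller tasks, and resumes here with
        # their output targets once they have completed.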
        run_id = '.'.join(output_vcf.name.split('.')[:-3])
        self.print_log(
            f'Call germline variants with HaplotypeCaller:\t{run_id}')
        output_cram = Path(self.output()[2].path)
        gatk = self.cf['gatk']
        samtools = self.cf['samtools']
        self.setup_shell(run_id=run_id,
                         commands=gatk,
                         cwd=output_vcf.parent,
                         **self.sh_config,
                         env={
                             'JAVA_TOOL_OPTIONS':
                             self.generate_gatk_java_options(
                                 n_cpu=self.n_cpu, memory_mb=self.memory_mb)
                         })
        if skip_interval_split:
            tmp_bam = Path(f'{tmp_prefixes[0]}.bam')
            self.samtools_view(input_sam_path=tmp_bam,
                               fa_path=fa,
                               output_sam_path=output_cram,
                               samtools=samtools,
                               n_cpu=self.n_cpu,
                               index_sam=True,
                               remove_input=True)
        else:
            tmp_vcfs = [Path(f'{s}.vcf.gz') for s in tmp_prefixes]
            self.run_shell(
                args=(f'set -e && {gatk} MergeVcfs' +
                      ''.join(f' --INPUT {v}' for v in tmp_vcfs) +
                      f' --REFERENCE_SEQUENCE {fa}' +
                      f' --OUTPUT {output_vcf}'),
                input_files_or_dirs=[*tmp_vcfs, fa],
                output_files_or_dirs=[output_vcf, f'{output_vcf}.tbi'])
            self.samtools_merge(
                input_sam_paths=[f'{s}.bam' for s in tmp_prefixes],
                fa_path=fa,
                output_sam_path=output_cram,
                samtools=samtools,
                n_cpu=self.n_cpu,
                memory_mb=self.memory_mb,
                index_sam=True,
                remove_input=False)
            self.remove_files_and_dirs(*chain.from_iterable(
                [o.path for o in t] for t in input_targets))
Example #5
class Crop(luigi.Task):
    it = luigi.IntParameter()
    dt = luigi.Parameter()
    aug = luigi.Parameter()
    de = luigi.Parameter()
    samples = luigi.TupleParameter()
    data_eval = luigi.TupleParameter()
    resources = {"ram": 50}

    @property
    def priority(self):
        if int(self.it) % 10000 == 0:
            return 1.0 / int(self.it)
        else:
            return 0.0

    def requires(self):
        return Predict(self.it, self.dt, self.aug, self.samples,
                       self.data_eval)
        # and so on

    def output(self):
        return luigi.LocalTarget(
            os.path.join(os.path.dirname(self.input().fn), self.de,
                         "crop.msg"))

    def run(self):
        progress = 0.0
        self.set_progress_percentage(progress)
        if "unaligned" in self.de:
            aligned = False
        else:
            aligned = True
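        # What follows (inferred from the code): for each sample, create
        # "<name>_cropped" copies of the predicted clefts/pre_dist/post_dist
        # volumes, restricted to the per-sample offset/shape taken from the
        # global `offsets`/`shapes` lookup tables.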
        for s in self.samples:
            filename = os.path.join(os.path.dirname(self.input().fn), self.de,
                                    s + ".n5")
            datasets_src = ["clefts", "pre_dist", "post_dist"]
            datasets_tgt = [
                "clefts_cropped", "pre_dist_cropped", "post_dist_cropped"
            ]
            off = offsets[s][aligned]
            sh = shapes[s][aligned]
            f = z5py.File(filename, use_zarr_format=False)
            for dss, dst in zip(datasets_src, datasets_tgt):
                chunk_size = tuple(
                    min(c, shi) for c, shi in zip(f[dss].chunks, sh))
                f.create_dataset(
                    dst,
                    shape=sh,
                    compression="gzip",
                    dtype=f[dss].dtype,
                    chunks=chunk_size,
                )
                bb = tuple(slice(o, o + shi, None) for o, shi in zip(off, sh))
                f[dst][:] = f[dss][bb]
                f[dst].attrs["offset"] = off[::-1]

                progress += 100.0 / (len(self.samples) * len(datasets_src))
                try:
                    self.set_progress_percentage(progress)
                except:
                    pass

        done = self.output().open("w")
        done.close()
class FindPartners(luigi.Task):
    it = luigi.IntParameter()
    dt = luigi.Parameter()
    aug = luigi.Parameter()
    de = luigi.Parameter()
    samples = luigi.TupleParameter()
    data_eval = luigi.TupleParameter()
    resources = {"ram": 650, "fp": 1}
    retry_count = 1

    @property
    def priority(self):
        if int(self.it) % 10000 == 0:
            return 1.0 / int(self.it)
        else:
            return 0.0

    def requires(self):
        return ConnectedComponents(
            self.it, self.dt, self.aug, self.de, self.samples, self.data_eval
        )

    def output(self):
        return luigi.LocalTarget(
            os.path.join(os.path.dirname(self.input().fn), "partners.msg")
        )

    def run(self):
        logging.debug("Starting to run partner finding")
        progress = 0.0
        self.set_progress_percentage(progress)
        thr = 127
        cc_thr = 42
        pre_thr = 42
        post_thr = 42
        dist_thr = 600
        size_thr = 5
        for s in self.samples:
            logging.debug("Starting with sample {0:}".format(s))
            filename = os.path.join(os.path.dirname(self.input().fn), s + ".h5")
            syn_file = os.path.join(os.path.dirname(self.input().fn), s + ".n5")
            cleft_cc_ds = "clefts_cropped_thr{0:}_cc{1:}".format(thr, cc_thr)
            pre_ds = "pre_dist_cropped"
            post_ds = "post_dist_cropped"
            seg_file = os.path.join(
                "/groups/saalfeld/saalfeldlab/larissa/data/cremieval/",
                self.de,
                s + ".n5",
            )
            seg_ds = "volumes/labels/neuron_ids_constis_slf1_sf750_cropped"
            if "unaligned" in self.de:
                aligned = False
            else:
                aligned = True
            off = tuple(np.array(offsets[s][aligned]) * np.array((40, 4, 4)))
            mm = Matchmaker(
                syn_file,
                cleft_cc_ds,
                pre_ds,
                post_ds,
                seg_file,
                seg_ds,
                filename,
                offset=off,
                safe_mem=True,
                dist_thr=dist_thr,
                size_thr=size_thr,
                pre_thr=pre_thr,
                post_thr=post_thr,
            )
            # mm.prepare_file()
            mm.write_partners()
            mm.cremi_file.close()
            del mm
            progress += 100.0 / len(self.samples)
            try:
                self.set_progress_percentage(progress)
            except:
                pass
        done = self.output().open("w")
        done.close()
Example #7
class CascadeMerge(LocalWorkflow):

    cascade_tree = luigi.IntParameter(
        default=0,
        description="the index of the cascade tree, only "
        "necessary when multiple trees (a forrest) are used, -1 denotes a wrapper that requires "
        "and outputs all trees, default: 0")
    cascade_depth = luigi.IntParameter(
        default=0,
        description="the depth of this workflow in the "
        "cascade tree with 0 being the root of the tree, default: 0")
    keep_nodes = luigi.BoolParameter(
        significant=False,
        description="keep merged results from "
        "intermediary nodes in the cascade cache directory")

    # internal parameter
    n_cascade_leaves = luigi.IntParameter(default=NO_INT, significant=False)

    # fixate some workflow parameters
    acceptance = 1.
    tolerance = 0.
    pilot = False

    node_format = "{name}.d{depth}.b{branch}{ext}"
    merge_factor = 2

    exclude_params_db = {"n_cascade_leaves"}

    exclude_db = True

    def __init__(self, *args, **kwargs):
        super(CascadeMerge, self).__init__(*args, **kwargs)

        # the merge factor should not be 1
        if self.merge_factor == 1:
            raise ValueError("the merge factor should not be 1")

        self._forest_built = False

    def is_branch(self, default=False):
        return super(CascadeMerge, self).is_branch() or (not default
                                                         and self.is_forest())

    def is_forest(self):
        return self.cascade_tree < 0

    def is_root(self):
        if self.is_forest():
            return False

        return self.cascade_depth == 0

    def is_leaf(self):
        if self.is_forest():
            return False

        tree = self._get_tree()
        max_depth = max(tree.keys())
        return self.cascade_depth == max_depth

    @cached_workflow_property
    def cascade_forest(self):
        self._build_cascade_forest()
        return self.cascade_forest

    @cached_workflow_property
    def leaves_per_tree(self):
        self._build_cascade_forest()
        return self.leaves_per_tree

    def _get_tree(self):
        try:
            return self.cascade_forest[self.cascade_tree]
        except IndexError:
            raise Exception(
                "cascade tree {} not found, forest only contains {} tree(s)".
                format(self.cascade_tree, len(self.cascade_forest)))

    def _build_cascade_forest(self):
        # a node in the tree can be described by a tuple of integers, where each value denotes the
        # branch path to go down the tree to reach the node (e.g. (2, 0) -> 2nd branch, 0th branch),
        # so the length of the tuple defines the depth of the node via ``depth = len(node) - 1``
        # the tree itself is a dict that maps depths to lists of nodes with that depth
        # when multiple trees are used (a forest), each one handles ``n_leaves / n_trees`` leaves
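        # worked example (a single tree with 4 leaves and merge_factor = 2):
        # nested_leaves = [[[0, 1], [2, 3]]] nodifies to (0,), (0, 0), (0, 1),
        # i.e. the tree {0: [(0,)], 1: [(0, 0), (0, 1)]}; the depth-0 root merges
        # the outputs of the two depth-1 leaf nodes, each of which merges 2 leaves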

        if self._forest_built:
            return

        # helper to convert nested lists of leaf number chunks into a list of nodes in the format
        # described above
        def nodify(obj, node=None, root_id=0):
            if not isinstance(obj, list):
                return []
            nodes = []
            if node is None:
                node = tuple()
            else:
                nodes.append(node)
            for i, _obj in enumerate(obj):
                nodes += nodify(_obj, node + (i if node else root_id, ))
            return nodes

        # first, determine the number of files to merge in total when not already set via params
        if self.n_cascade_leaves == NO_INT:
            if self.is_branch(default=True):
                raise Exception(
                    "number of files to merge cannot be computed for a branch")
            # get inputs, i.e. outputs of workflow requirements and trace actual inputs to merge
            # an integer number representing the number of inputs is also valid
            inputs = luigi.task.getpaths(self.cascade_workflow_requires())
            inputs = self.trace_cascade_workflow_inputs(inputs)
            self.n_cascade_leaves = inputs if isinstance(
                inputs, six.integer_types) else len(inputs)

        # infer the number of trees from the cascade output
        output = self.cascade_output()
        n_trees = 1 if not isinstance(output,
                                      TargetCollection) else len(output)

        if self.n_cascade_leaves < n_trees:
            raise Exception(
                "too many leaves ({}) for number of requested trees ({})".
                format(self.n_cascade_leaves, n_trees))

        # determine the number of leaves per tree
        n_min = self.n_cascade_leaves // n_trees
        n_trees_overlap = self.n_cascade_leaves % n_trees
        leaves_per_tree = n_trees_overlap * [n_min + 1] + (
            n_trees - n_trees_overlap) * [n_min]

        # build the trees
        forest = []
        for i, n_leaves in enumerate(leaves_per_tree):
            # build a nested list of leaf numbers using the merge factor
            # e.g. 9 leaves with factor 3 -> [[0, 1, 2], [3, 4, 5], [6, 7, 8]]
            # TODO: this point defines the actual tree structure, which is bottom-up at the moment,
            # but maybe it's good to have this configurable
            nested_leaves = list(iter_chunks(n_leaves, self.merge_factor))
            while len(nested_leaves) > 1:
                nested_leaves = list(
                    iter_chunks(nested_leaves, self.merge_factor))

            # convert the list of nodes to the tree format described above
            tree = {}
            for node in nodify(nested_leaves, root_id=i):
                depth = len(node) - 1
                tree.setdefault(depth, []).append(node)

            forest.append(tree)

        # store values
        self.leaves_per_tree = leaves_per_tree
        self.cascade_forest = forest
        self._forest_built = True

    def create_branch_map(self):
        if self.is_forest():
            raise Exception(
                "cannot define a branch map when in forest mode (cascade_tree < 0)"
            )

        tree = self._get_tree()
        nodes = tree[self.cascade_depth]
        return dict(enumerate(nodes))

    def trace_cascade_workflow_inputs(self, inputs):
        # should convert inputs to an object with a length (e.g. list, tuple, TargetCollection, ...)

        # for convenience, check if inputs results from the default workflow output, i.e. a dict
        # which stores a TargetCollection in the "collection" field
        if isinstance(inputs, dict) and "collection" in inputs:
            collection = inputs["collection"]
            if isinstance(collection, TargetCollection):
                return collection

        return inputs

    def trace_cascade_inputs(self, inputs):
        # should convert inputs into an iterable sequence (list, tuple, ...), no TargetCollection!
        return inputs

    @abstractmethod
    def cascade_workflow_requires(self):
        # should return the leaf requirements of a cascading task workflow
        return

    @abstractmethod
    def cascade_requires(self):
        # should return the leaf requirements of a cascading task branch
        return

    @abstractmethod
    def cascade_output(self):
        # this should return a single target when the output should be a single tree
        # or a target collection whose targets are accessible as items via cascade tree numbers
        return

    @abstractmethod
    def merge(self, inputs, output):
        return

    def complete(self):
        if self.is_forest():
            return all(task.complete() for task in flatten(self.requires()))
        else:
            return super(CascadeMerge, self).complete()

    def workflow_requires(self):
        self._build_cascade_forest()

        reqs = super(CascadeMerge, self).workflow_requires()

        if self.is_leaf():
            # this is simply the cascade requirement
            reqs["cascade"] = self.cascade_workflow_requires()

        else:
            # not a leaf, just require the next cascade depth
            reqs["cascade"] = self.req(self,
                                       cascade_depth=self.cascade_depth + 1)

        return reqs

    def requires(self):
        reqs = OrderedDict()

        if self.is_forest():
            # require the workflows for all cascade trees
            n_trees = len(self.cascade_forest)
            reqs["forest"] = {
                t: self.req(self, branch=-1, cascade_tree=t)
                for t in range(n_trees)
            }

        elif self.is_leaf():
            # this is simply the cascade requirement
            # also determine and pass the corresponding leaf number range
            sum_n_leaves = sum(self.leaves_per_tree)
            offset = sum(self.leaves_per_tree[:self.cascade_tree])
            merge_factor = self.merge_factor
            if merge_factor <= 0:
                merge_factor = self.leaves_per_tree[self.cascade_tree]
            start_leaf = offset + self.branch * merge_factor
            end_leaf = min(start_leaf + merge_factor, sum_n_leaves)
            reqs["cascade"] = self.cascade_requires(start_leaf, end_leaf)

        else:
            # get all child nodes in the next layer at depth = depth + 1, store their branches
            # note: child node tuples contain the exact same values plus an additional one
            tree = self._get_tree()
            node = self.branch_data
            branches = [
                i for i, n in enumerate(tree[self.cascade_depth + 1])
                if n[:-1] == node
            ]

            # add to requirements
            reqs["cascade"] = {
                b: self.req(self,
                            branch=b,
                            cascade_depth=self.cascade_depth + 1)
                for b in branches
            }

        return reqs

    def cascade_cache_directory(self):
        # by default, use the target's parent directory, also for SiblingFileCollections
        # otherwise, no default decision is implemented
        output = self.cascade_output()
        if isinstance(output, FileSystemTarget):
            return output.parent
        elif isinstance(output, SiblingFileCollection):
            return output.dir
        else:
            raise NotImplementedError(
                "{}.cascade_cache_directory is not implemented".format(
                    self.__class__.__name__))

    def output(self):
        output = self.cascade_output()

        if self.is_forest():
            return output

        if isinstance(output, TargetCollection):
            output = output[self.cascade_tree]

        if self.is_root():
            return output

        else:
            name, ext = os.path.splitext(output.basename)
            basename = self.node_format.format(name=name,
                                               ext=ext,
                                               branch=self.branch,
                                               depth=self.cascade_depth)
            return self.cascade_cache_directory().child(basename, "f")

    def run(self):
        if self.is_forest():
            return

        # trace actual inputs to merge
        inputs = self.input()["cascade"]
        if self.is_leaf():
            inputs = self.trace_cascade_inputs(inputs)
        else:
            inputs = inputs.values()

        # merge
        self.publish_message("start merging {} inputs of node {}".format(
            len(inputs), self.branch_data))
        self.merge(inputs, self.output())

        # remove intermediate nodes
        if not self.is_leaf() and not self.keep_nodes:
            with self.publish_step(
                    "removing intermediate results of node {}".format(
                        self.branch_data)):
                for inp in inputs:
                    inp.remove()
Example #8
class GraphWorkflow(WorkflowBase):
    input_path = luigi.Parameter()
    input_key = luigi.Parameter()
    graph_path = luigi.Parameter()
    output_key = luigi.Parameter()
    n_scales = luigi.IntParameter(default=1)

    # for now we only support n5 / zarr input labels
    def _check_input(self):
        ending = self.input_path.split('.')[-1]
        assert ending.lower() in ('zr', 'zarr', 'n5'),\
            "Only support n5 and zarr files, not %s" % ending

    def requires(self):
        self._check_input()

        initial_task = getattr(initial_tasks,
                               self._get_task_name('InitialSubGraphs'))
        dep = initial_task(tmp_folder=self.tmp_folder,
                           max_jobs=self.max_jobs,
                           config_dir=self.config_dir,
                           input_path=self.input_path,
                           input_key=self.input_key,
                           graph_path=self.graph_path,
                           dependency=self.dependency)
        merge_task = getattr(merge_tasks,
                             self._get_task_name('MergeSubGraphs'))
        for scale in range(1, self.n_scales):
            scale_out_key = 's%i/sub_graphs' % scale
            dep = merge_task(tmp_folder=self.tmp_folder,
                             max_jobs=self.max_jobs,
                             config_dir=self.config_dir,
                             graph_path=self.graph_path,
                             output_key=scale_out_key,
                             scale=scale,
                             merge_complete_graph=False,
                             dependency=dep)

        dep = merge_task(tmp_folder=self.tmp_folder,
                         max_jobs=self.max_jobs,
                         config_dir=self.config_dir,
                         graph_path=self.graph_path,
                         output_key=self.output_key,
                         scale=self.n_scales - 1,
                         merge_complete_graph=True,
                         dependency=dep)

        map_task = getattr(map_tasks, self._get_task_name('MapEdgeIds'))
        for scale in range(self.n_scales):
            dep = map_task(tmp_folder=self.tmp_folder,
                           max_jobs=self.max_jobs,
                           config_dir=self.config_dir,
                           graph_path=self.graph_path,
                           input_key=self.output_key,
                           scale=self.n_scales - 1,
                           dependency=dep)
        return dep

    @staticmethod
    def get_config():
        configs = super(GraphWorkflow, GraphWorkflow).get_config()
        configs.update({
            'initial_sub_graphs':
            initial_tasks.InitialSubGraphsLocal.default_task_config(),
            'merge_sub_graphs':
            merge_tasks.MergeSubGraphsLocal.default_task_config(),
            'map_edge_ids':
            map_tasks.MapEdgeIdsLocal.default_task_config()
        })
        return configs
Example #9
class WebDataExcelFileFromArchiveParsingToCsv(LoadingDataIntoCsvFile):

    skiptop = luigi.IntParameter(default=0)
    skipbottom = luigi.IntParameter(default=0)
    usecolumns = luigi.Parameter(default='')
Example #10
class RangeHourlyBase(RangeBase):
    """
    Produces a contiguous completed range of an hourly recurring task.
    """
    start = luigi.DateHourParameter(
        default=None,
        description=
        "beginning datehour, inclusive. Default: None - work backward forever (requires reverse=True)"
    )
    stop = luigi.DateHourParameter(
        default=None,
        description=
        "ending datehour, exclusive. Default: None - work forward forever")
    hours_back = luigi.IntParameter(
        default=100 * 24,  # slightly more than three months
        description=("extent to which contiguousness is to be assured into "
                     "past, in hours from current time. Prevents infinite "
                     "loop when start is none. If the dataset has limited "
                     "retention (i.e. old outputs get removed), this should "
                     "be set shorter to that, too, to prevent the oldest "
                     "outputs flapping. Increase freely if you intend to "
                     "process old dates - worker's memory is the limit"))
    # TODO always entire interval for reprocessings (fixed start and stop)?
    hours_forward = luigi.IntParameter(
        default=0,
        description=
        "extent to which contiguousness is to be assured into future, in hours from current time. Prevents infinite loop when stop is none"
    )

    def datetime_to_parameter(self, dt):
        return dt

    def parameter_to_datetime(self, p):
        return p

    def datetime_to_parameters(self, dt):
        """
        Given a date-time, will produce a dictionary of of-params combined with the ranged task parameter
        """
        return self._task_parameters(dt)

    def parameters_to_datetime(self, p):
        """
        Given a dictionary of parameters, will extract the ranged task parameter value
        """
        return p[self._param_name]

    def moving_start(self, now):
        return now - timedelta(hours=self.hours_back)

    def moving_stop(self, now):
        return now + timedelta(hours=self.hours_forward)

    def finite_datetimes(self, finite_start, finite_stop):
        """
        Simply returns the points in time that correspond to whole hours.
        """
        datehour_start = datetime(finite_start.year, finite_start.month,
                                  finite_start.day, finite_start.hour)
        datehours = []
        for i in itertools.count():
            t = datehour_start + timedelta(hours=i)
            if t >= finite_stop:
                return datehours
            if t >= finite_start:
                datehours.append(t)

    def _format_datetime(self, dt):
        return luigi.DateHourParameter().serialize(dt)
Example #11
class RangeBase(luigi.WrapperTask):
    """
    Produces a contiguous completed range of a recurring task.

    Made for the common use case where a task is parameterized by e.g.
    DateParameter, and assurance is needed that any gaps arising from downtime
    are eventually filled.

    Emits events that one can use to monitor gaps and delays.

    At least one of start and stop needs to be specified.

    (This is quite an abstract base class for subclasses with different
    datetime parameter class, e.g. DateParameter, DateHourParameter, ..., and
    different parameter naming, e.g. days_back/forward, hours_back/forward,
    ..., as well as different documentation wording, for good user experience.)

    Subclasses will need to use the ``of`` parameter when overriding methods.
    """
    # TODO lift the single parameter constraint by passing unknown parameters through WrapperTask?
    of = luigi.TaskParameter(
        description=
        "task name to be completed. The task must take a single datetime parameter"
    )
    of_params = luigi.DictParameter(
        default=dict(),
        description=
        "Arguments to be provided to the 'of' class when instantiating")
    # The common parameters 'start' and 'stop' have type (e.g. DateParameter,
    # DateHourParameter) dependent on the concrete subclass, cumbersome to
    # define here generically without dark magic. Refer to the overrides.
    start = luigi.Parameter()
    stop = luigi.Parameter()
    reverse = luigi.BoolParameter(
        default=False,
        description=
        "specifies the preferred order for catching up. False - work from the oldest missing outputs onward; True - from the newest backward"
    )
    task_limit = luigi.IntParameter(
        default=50,
        description=
        "how many of 'of' tasks to require. Guards against scheduling insane amounts of tasks in one go"
    )
    # TODO overridable exclude_datetimes or something...
    now = luigi.IntParameter(
        default=None,
        description="set to override current time. In seconds since epoch")
    param_name = luigi.Parameter(
        default=None,
        description=
        "parameter name used to pass in parameterized value. Defaults to None, meaning use first positional parameter",
        positional=False)

    @property
    def of_cls(self):
        """
        DONT USE. Will be deleted soon. Use ``self.of``!
        """
        if isinstance(self.of, six.string_types):
            warnings.warn(
                'When using Range programatically, dont pass "of" param as string!'
            )
            return Register.get_task_cls(self.of)
        return self.of

    # a bunch of datetime arithmetic building blocks that need to be provided in subclasses
    def datetime_to_parameter(self, dt):
        raise NotImplementedError

    def parameter_to_datetime(self, p):
        raise NotImplementedError

    def datetime_to_parameters(self, dt):
        """
        Given a date-time, will produce a dictionary of of-params combined with the ranged task parameter
        """
        raise NotImplementedError

    def parameters_to_datetime(self, p):
        """
        Given a dictionary of parameters, will extract the ranged task parameter value
        """
        raise NotImplementedError

    def moving_start(self, now):
        """
        Returns a datetime from which to ensure contiguousness in the case when
        start is None or unfeasibly far back.
        """
        raise NotImplementedError

    def moving_stop(self, now):
        """
        Returns a datetime till which to ensure contiguousness in the case when
        stop is None or unfeasibly far forward.
        """
        raise NotImplementedError

    def finite_datetimes(self, finite_start, finite_stop):
        """
        Returns the individual datetimes in interval [finite_start, finite_stop)
        for which task completeness should be required, as a sorted list.
        """
        raise NotImplementedError

    def _emit_metrics(self, missing_datetimes, finite_start, finite_stop):
        """
        For consistent metrics one should consider the entire range, but
        it is open (infinite) if stop or start is None.

        Hence make do with metrics respective to the finite simplification.
        """
        datetimes = self.finite_datetimes(
            finite_start if self.start is None else min(
                finite_start, self.parameter_to_datetime(self.start)),
            finite_stop if self.stop is None else max(
                finite_stop, self.parameter_to_datetime(self.stop)))

        delay_in_jobs = len(datetimes) - datetimes.index(
            missing_datetimes[0]) if datetimes and missing_datetimes else 0
        self.trigger_event(RangeEvent.DELAY, self.of_cls.task_family,
                           delay_in_jobs)

        expected_count = len(datetimes)
        complete_count = expected_count - len(missing_datetimes)
        self.trigger_event(RangeEvent.COMPLETE_COUNT, self.of_cls.task_family,
                           complete_count)
        self.trigger_event(
            RangeEvent.COMPLETE_FRACTION, self.of_cls.task_family,
            float(complete_count) / expected_count if expected_count else 1)

    def _format_datetime(self, dt):
        return self.datetime_to_parameter(dt)

    def _format_range(self, datetimes):
        param_first = self._format_datetime(datetimes[0])
        param_last = self._format_datetime(datetimes[-1])
        return '[%s, %s]' % (param_first, param_last)

    def _instantiate_task_cls(self, param):
        return self.of(**self._task_parameters(param))

    @property
    def _param_name(self):
        if self.param_name is None:
            return next(x[0] for x in self.of.get_params() if x[1].positional)
        else:
            return self.param_name

    def _task_parameters(self, param):
        kwargs = dict(**self.of_params)
        kwargs[self._param_name] = param
        return kwargs

    def requires(self):
        # cache because we anticipate a fair amount of computation
        if hasattr(self, '_cached_requires'):
            return self._cached_requires

        if not self.start and not self.stop:
            raise ParameterException(
                "At least one of start and stop needs to be specified")
        if not self.start and not self.reverse:
            raise ParameterException(
                "Either start needs to be specified or reverse needs to be True"
            )
        if self.start and self.stop and self.start > self.stop:
            raise ParameterException("Can't have start > stop")
        # TODO check overridden complete() and exists()

        now = datetime.utcfromtimestamp(
            time.time() if self.now is None else self.now)

        moving_start = self.moving_start(now)
        finite_start = moving_start if self.start is None else max(
            self.parameter_to_datetime(self.start), moving_start)
        moving_stop = self.moving_stop(now)
        finite_stop = moving_stop if self.stop is None else min(
            self.parameter_to_datetime(self.stop), moving_stop)

        datetimes = self.finite_datetimes(
            finite_start, finite_stop) if finite_start <= finite_stop else []

        if datetimes:
            logger.debug('Actually checking if range %s of %s is complete',
                         self._format_range(datetimes),
                         self.of_cls.task_family)
            missing_datetimes = sorted(self._missing_datetimes(datetimes))
            logger.debug('Range %s lacked %d of expected %d %s instances',
                         self._format_range(datetimes), len(missing_datetimes),
                         len(datetimes), self.of_cls.task_family)
        else:
            missing_datetimes = []
            logger.debug('Empty range. No %s instances expected',
                         self.of_cls.task_family)

        self._emit_metrics(missing_datetimes, finite_start, finite_stop)

        if self.reverse:
            required_datetimes = missing_datetimes[-self.task_limit:]
        else:
            required_datetimes = missing_datetimes[:self.task_limit]
        if required_datetimes:
            logger.debug('Requiring %d missing %s instances in range %s',
                         len(required_datetimes), self.of_cls.task_family,
                         self._format_range(required_datetimes))
        if self.reverse:
            # TODO priorities, so that within the batch tasks are ordered too
            required_datetimes.reverse()

        self._cached_requires = [
            self._instantiate_task_cls(self.datetime_to_parameter(d))
            for d in required_datetimes
        ]
        return self._cached_requires

    def missing_datetimes(self, finite_datetimes):
        """
        Override in subclasses to do bulk checks.

        Returns a sorted list.

        This is a conservative base implementation that brutally checks completeness, instance by instance.

        Inadvisable as it may be slow.
        """
        return [
            d for d in finite_datetimes if not self._instantiate_task_cls(
                self.datetime_to_parameter(d)).complete()
        ]

    def _missing_datetimes(self, finite_datetimes):
        """
        Backward compatible wrapper. Will be deleted eventually (stated on Dec 2015)
        """
        try:
            return self.missing_datetimes(finite_datetimes)
        except TypeError as ex:
            if 'missing_datetimes()' in repr(ex):
                warnings.warn(
                    'In your Range* subclass, missing_datetimes() should only take 1 argument (see latest docs)'
                )
                return self.missing_datetimes(self.of_cls, finite_datetimes)
            else:
                raise
Example #12
class RangeDailyBase(RangeBase):
    """
    Produces a contiguous completed range of a daily recurring task.
    """
    start = luigi.DateParameter(
        default=None,
        description=
        "beginning date, inclusive. Default: None - work backward forever (requires reverse=True)"
    )
    stop = luigi.DateParameter(
        default=None,
        description=
        "ending date, exclusive. Default: None - work forward forever")
    days_back = luigi.IntParameter(
        default=100,  # slightly more than three months
        description=("extent to which contiguousness is to be assured into "
                     "past, in days from current time. Prevents infinite loop "
                     "when start is none. If the dataset has limited retention"
                     " (i.e. old outputs get removed), this should be set "
                     "shorter to that, too, to prevent the oldest outputs "
                     "flapping. Increase freely if you intend to process old "
                     "dates - worker's memory is the limit"))
    days_forward = luigi.IntParameter(
        default=0,
        description=
        "extent to which contiguousness is to be assured into future, in days from current time. Prevents infinite loop when stop is none"
    )

    def datetime_to_parameter(self, dt):
        return dt.date()

    def parameter_to_datetime(self, p):
        return datetime(p.year, p.month, p.day)

    def datetime_to_parameters(self, dt):
        """
        Given a date-time, will produce a dictionary of of-params combined with the ranged task parameter
        """
        return self._task_parameters(dt.date())

    def parameters_to_datetime(self, p):
        """
        Given a dictionary of parameters, will extract the ranged task parameter value
        """
        dt = p[self._param_name]
        return datetime(dt.year, dt.month, dt.day)

    def moving_start(self, now):
        return now - timedelta(days=self.days_back)

    def moving_stop(self, now):
        return now + timedelta(days=self.days_forward)

    def finite_datetimes(self, finite_start, finite_stop):
        """
        Simply returns the points in time that correspond to turn of day.
        """
        date_start = datetime(finite_start.year, finite_start.month,
                              finite_start.day)
        dates = []
        for i in itertools.count():
            t = date_start + timedelta(days=i)
            if t >= finite_stop:
                return dates
            if t >= finite_start:
                dates.append(t)
Example #13
class HealthLabelTask(luigi.Task):
    """Apply health labels to the organisation data in MYSQL.

    Args:
        date (datetime): Datetime used to label the outputs
        _routine_id (str): String used to label the AWS task
        test (bool): True if in test mode
        insert_batch_size (int): Number of rows to insert into the db in a batch
        db_config_env (str): The output database envariable
        bucket (str): S3 bucket where the models are stored
        vectoriser_key (str): S3 key for the vectoriser model
        classifier_key (str): S3 key for the classifier model
    """
    date = luigi.DateParameter()
    _routine_id = luigi.Parameter()
    test = luigi.BoolParameter()
    insert_batch_size = luigi.IntParameter(default=500)
    db_config_env = luigi.Parameter()
    bucket = luigi.Parameter()
    vectoriser_key = luigi.Parameter()
    classifier_key = luigi.Parameter()

    def requires(self):
        yield OrgGeocodeTask(date=self.date,
                             _routine_id=self._routine_id,
                             test=self.test,
                             db_config_env="MYSQLDB",
                             city_col=Organization.city,
                             country_col=Organization.country,
                             location_key_col=Organization.location_id,
                             insert_batch_size=self.insert_batch_size,
                             env_files=[find_filepath_from_pathstub("nesta/nesta/"),
                                        find_filepath_from_pathstub("config/mysqldb.config")],
                             job_def="py36_amzn1_image",
                             job_name=f"CrunchBaseOrgGeocodeTask-{self._routine_id}",
                             job_queue="HighPriority",
                             region_name="eu-west-2",
                             poll_time=10,
                             memory=4096,
                             max_live_jobs=2)

    def output(self):
        """Points to the output database engine"""
        self.db_config_path = os.environ[self.db_config_env]
        db_config = get_config(self.db_config_path, "mysqldb")
        db_config["database"] = 'dev' if self.test else 'production'
        db_config["table"] = "Crunchbase health labels <dummy>"  # Note, not a real table
        update_id = "CrunchbaseHealthLabel_{}".format(self.date)
        return MySqlTarget(update_id=update_id, **db_config)

    def run(self):
        """Apply health labels using model."""
        # database setup
        database = 'dev' if self.test else 'production'
        logging.warning(f"Using {database} database")
        self.engine = get_mysql_engine(self.db_config_env, 'mysqldb', database)
        try_until_allowed(Base.metadata.create_all, self.engine)

        # collect and unpickle models from s3
        logging.info("Collecting models from S3")
        s3 = boto3.resource('s3')
        vectoriser_obj = s3.Object(self.bucket, self.vectoriser_key)
        vectoriser = pickle.loads(vectoriser_obj.get()['Body']._raw_stream.read())
        classifier_obj = s3.Object(self.bucket, self.classifier_key)
        classifier = pickle.loads(classifier_obj.get()['Body']._raw_stream.read())

        # retrieve organisations and categories
        nrows = 1000 if self.test else None
        logging.info("Collecting organisations from database")
        with db_session(self.engine) as session:
            orgs = (session
                    .query(Organization.id)
                    .filter(Organization.is_health.is_(None))
                    .limit(nrows)
                    .all())

        for batch_count, batch in enumerate(split_batches(orgs,
                                                          self.insert_batch_size), 1):
            batch_orgs_with_cats = []
            for (org_id, ) in batch:
                with db_session(self.engine) as session:
                    categories = (session
                                  .query(OrganizationCategory.category_name)
                                  .filter(OrganizationCategory.organization_id == org_id)
                                  .all())
                # categories should be a list of str, comma separated: ['cat,cat,cat', 'cat,cat']
                categories = ','.join(cat_name for (cat_name, ) in categories)
                batch_orgs_with_cats.append({'id': org_id, 'categories': categories})

            logging.debug(f"{len(batch_orgs_with_cats)} organisations retrieved from database")

            logging.debug("Predicting health flags")
            batch_orgs_with_flag = predict_health_flag(batch_orgs_with_cats, vectoriser, classifier)

            logging.debug(f"{len(batch_orgs_with_flag)} organisations to update")
            with db_session(self.engine) as session:
                session.bulk_update_mappings(Organization, batch_orgs_with_flag)
            logging.info(f"{batch_count} batches health labeled and written to db")

        # mark as done
        logging.warning("Task complete")
        self.output().touch()
Example #14
class SGEJobTask(luigi.Task):
    """Base class for executing a job on SunGrid Engine

    Override ``work()`` (rather than ``run()``) with your job code.

    Parameters:

    - n_cpu: Number of CPUs (or "slots") to allocate for the Task. This
          value is passed as ``qsub -pe {pe} {n_cpu}``
    - parallel_env: SGE parallel environment name. The default is "orte",
          the parallel environment installed with MIT StarCluster. If you
          are using a different cluster environment, check with your
          sysadmin for the right pe to use. This value is passed as {pe}
          to the qsub command above.
    - shared_tmp_dir: Shared drive accessible from all nodes in the cluster.
          Task classes and dependencies are pickled to a temporary folder on
          this drive. The default is ``/home``, the NFS share location setup
          by StarCluster

    """

    n_cpu = luigi.IntParameter(default=2, significant=False)
    shared_tmp_dir = luigi.Parameter(default='/home', significant=False)
    parallel_env = luigi.Parameter(default='orte', significant=False)

    def _fetch_task_failures(self):
        if not os.path.exists(self.errfile):
            logger.info('No error file')
            return []
        with open(self.errfile, "r") as f:
            errors = f.readlines()
        if errors == []:
            return errors
        # SGE complains when we submit through a pipe
        if errors[0].strip() == 'stdin: is not a tty':
            errors.pop(0)
        return errors

    def _init_local(self):

        # Set up temp folder in shared directory (trim to max filename length)
        base_tmp_dir = self.shared_tmp_dir
        random_id = '%016x' % random.getrandbits(64)
        folder_name = self.task_id + '-' + random_id
        self.tmp_dir = os.path.join(base_tmp_dir, folder_name)
        max_filename_length = os.fstatvfs(0).f_namemax
        self.tmp_dir = self.tmp_dir[:max_filename_length]
        logger.info("Tmp dir: %s", self.tmp_dir)
        os.makedirs(self.tmp_dir)

        # Dump the code to be run into a pickle file
        logging.debug("Dumping pickled class")
        self._dump(self.tmp_dir)

        # Make sure that all the class's dependencies are tarred and available
        logging.debug("Tarballing dependencies")
        # Grab luigi and the module containing the code to be run
        packages = [luigi] + [__import__(self.__module__, None, None, 'dummy')]
        luigi.hadoop.create_packages_archive(
            packages, os.path.join(self.tmp_dir, "packages.tar"))

    def run(self):
        self._init_local()
        self._run_job()
        # The procedure:
        # - Pickle the class
        # - Tarball the dependencies
        # - Construct a qsub argument that runs a generic runner function with the path to the pickled class
        # - Runner function loads the class from pickle
        # - Runner class untars the dependencies
        # - Runner function hits the button on the class's work() method

    def work(self):
        """Override this method, rather than ``run()``,  for your actual work."""
        pass

    def _dump(self, out_dir=''):
        """Dump instance to file."""
        self.job_file = os.path.join(out_dir, 'job-instance.pickle')
        if self.__module__ == '__main__':
            d = pickle.dumps(self)
            module_name = os.path.basename(sys.argv[0]).rsplit('.', 1)[0]
            d = d.replace('(c__main__', "(c" + module_name)
            open(self.job_file, "w").write(d)

        else:
            pickle.dump(self, open(self.job_file, "w"))

    def _run_job(self):

        # Build a qsub argument that will run sge_runner.py on the directory we've specified
        runner_path = sge_runner.__file__
        if runner_path.endswith("pyc"):
            runner_path = runner_path[:-3] + "py"
        job_str = 'python {0} "{1}"'.format(
            runner_path, self.tmp_dir
        )  # enclose tmp_dir in quotes to protect from special escape chars

        # Build qsub submit command
        self.outfile = os.path.join(self.tmp_dir, 'job.out')
        self.errfile = os.path.join(self.tmp_dir, 'job.err')
        submit_cmd = _build_qsub_command(job_str, self.task_family,
                                         self.outfile, self.errfile,
                                         self.parallel_env, self.n_cpu)
        logger.debug('qsub command: \n' + submit_cmd)
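        # The resulting command is roughly of the form (exact flags are
        # determined by _build_qsub_command and may differ):
        #   echo 'python sge_runner.py "<tmp_dir>"' | qsub -pe <parallel_env> <n_cpu> \
        #       -N <task_family> -o <outfile> -e <errfile>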

        # Submit the job and grab job ID
        output = subprocess.check_output(submit_cmd, shell=True)
        self.job_id = _parse_qsub_job_id(output)
        logger.debug("Submitted job to qsub with response:\n" + output)

        self._track_job()

        # Now delete the temporaries, if they're there.
        if self.tmp_dir and os.path.exists(self.tmp_dir):
            logger.info('Removing temporary directory %s' % self.tmp_dir)
            shutil.rmtree(self.tmp_dir)

    def _track_job(self):
        while True:
            # Sleep for a little bit
            time.sleep(POLL_TIME)

            # See what the job's up to
            # ASSUMPTION
            qstat_out = subprocess.check_output(['qstat'])
            sge_status = _parse_qstat_state(qstat_out, self.job_id)
            if sge_status == 'r':
                logger.info('Job is running...')
            elif sge_status == 'qw':
                logger.info('Job is pending...')
            elif 'E' in sge_status:
                logger.error('Job has FAILED:\n' +
                             '\n'.join(self._fetch_task_failures()))
                break
            elif sge_status == 't' or sge_status == 'u':
                # Then the job could either be failed or done.
                errors = self._fetch_task_failures()
                if not errors:
                    logger.info('Job is done')
                else:
                    logger.error('Job has FAILED:\n' + '\n'.join(errors))
                break
            else:
                logger.info('Job status is UNKNOWN!')
                logger.info('Status is : %s' % sge_status)
                raise Exception(
                    "job status isn't one of ['r', 'qw', 'E*', 't', 'u']: %s" %
                    sge_status)
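
# --- Hedged sketch of the qsub helpers assumed above ------------------------
# The fragment above calls _build_qsub_command, _parse_qsub_job_id and
# _parse_qstat_state without defining them.  The minimal versions below are a
# sketch of what such helpers might look like (the argument order matches the
# calls above; the exact qsub piping and qstat column layout are assumptions,
# not the original implementations).

def _build_qsub_command(cmd, job_name, outfile, errfile, pe, n_cpu):
    """Pipe a shell command into qsub, requesting n_cpu slots in parallel env pe."""
    return ('echo {cmd} | qsub -o ":{outfile}" -e ":{errfile}" -V -r y '
            '-pe {pe} {n_cpu} -N {job_name}').format(
                cmd=cmd, job_name=job_name, outfile=outfile,
                errfile=errfile, pe=pe, n_cpu=n_cpu)


def _parse_qsub_job_id(qsub_out):
    """Extract the numeric job id from output such as
    'Your job 12345 ("name") has been submitted'."""
    return int(qsub_out.split()[2])


def _parse_qstat_state(qstat_out, job_id):
    """Return the state column for job_id from `qstat` output, or 'u' when the
    job no longer appears (finished or unknown)."""
    for line in qstat_out.split('\n'):
        parts = line.split()
        if parts and parts[0] == str(job_id):
            return parts[4]
    return 'u'
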
class _SubDummyTask(gokart.TaskOnKart):
    task_namespace = __name__
    param = luigi.IntParameter()
Exemple #16
class TrainModel(luigi.Task):
    # TODO parameters description with 'help' argument
    test_size = luigi.FloatParameter(default = 0.25)
    random_state = luigi.Parameter(default = 'None')
    shuffle = luigi.BoolParameter()
    selected_model = luigi.ChoiceParameter(choices = ['naive_bayes', 'random_forest', 'svm'], default = 'naive_bayes')
    verbose = luigi.IntParameter(default=0)

    # Random forest
    n_estimators = luigi.IntParameter(default = 100)
    max_depth = luigi.Parameter(default = 'None')


    def __init__(self, *args, **kwargs):
        super(TrainModel, self).__init__(*args, **kwargs)

        self.random_state = ast.literal_eval(self.random_state)
        self.max_depth = ast.literal_eval(self.max_depth)

    def requires(self):
        return FeatureExtraction()

    def output(self):
        return None

    def run(self):
        print('---> Splitting data')
        features = pickle.load(open(self.input()['features'].path, 'rb'))
        labels = pickle.load(open(self.input()['labels'].path, 'rb'))

        train_features, test_features, train_labels, test_labels = \
            train_test_split(features, labels, test_size = self.test_size, 
                             random_state = self.random_state, shuffle = self.shuffle)

        # TODO
        # kfold = StratifiedKFold(n_splits=10, shuffle=True)

        if self.selected_model == 'random_forest':
            model = RandomForestModel( n_estimators = self.n_estimators, 
                                       max_depth = self.max_depth,
                                       verbose = self.verbose )
        elif self.selected_model == 'naive_bayes':
            model = NaiveBayesModel()
        elif self.selected_model == 'svm':
            raise NotImplementedError

        print('---> Training model', model.name)
        model.train(train_features, train_labels)
        results, figures = model.score(test_features, test_labels)

        print('---> Saving Model')
        output_name = model.name + '||Accuracy=' + str(round(results['accuracy'], 2))
        output_name += '||' + datetime.datetime.now().strftime('%d_%m_%Y_%H_%M_%S')
        save_folder = os.path.join('models', output_name)
        if not os.path.exists(save_folder):
            os.makedirs(save_folder)

        with open(os.path.join(save_folder, 'results.json'), 'w') as fp:
            json.dump(results, fp)

        for key in figures:
            # each entry in `figures` is a matplotlib figure; write it out as a PNG
            with open(os.path.join(save_folder, str(key) + '.png'), 'wb') as fp:
                figures[key].savefig(fp)

        model.save(os.path.join(save_folder, 'model.pickle'))
Exemple #17
import os
import re
from os.path import dirname

import luigi

from openfda import parallel, config, index_util, elasticsearch_requests
from openfda.annotation_table.pipeline import CombineHarmonization
from openfda.faers import annotate
from openfda.faers import xml_to_json
from openfda.index_util import AlwaysRunTask


# this should be a symlink to wherever the real data directory is
RUN_DIR = dirname(dirname(os.path.abspath(__file__)))
BASE_DIR = config.data_dir()
FAERS_HISTORIC = ('http://www.fda.gov/Drugs/GuidanceCompliance'
  'RegulatoryInformation/Surveillance/AdverseDrugEffects/ucm083765.htm')
FAERS_CURRENT = ('http://www.fda.gov/Drugs/GuidanceCompliance'
  'RegulatoryInformation/Surveillance/AdverseDrugEffects/ucm082193.htm')

MAX_RECORDS_PER_FILE = luigi.IntParameter(-1, is_global=True)

class DownloadDataset(AlwaysRunTask):
  '''
  This task downloads all datasets that have not yet been fetched.
  '''
  def _fetch(self):
    for page in [self._faers_current.find_all(href=re.compile('.*.zip')),
                 self._faers_historic.find_all(href=re.compile('.*.zip'))]:
      for a in page:
        filename = a.text.split(u'\xa0')[0]
        # FAERS XML/ASCII for 2014 Q3/Q4 have many strange issues. Sigh.
        filename = filename.replace('Q', 'q')
        filename = filename.replace(' q', 'q')
        filename = filename.replace(' ', '_')
        if '.zip' not in filename.lower():
          continue  # skip links that are not zip archives (example truncated below)
Exemple #18
class _DummyTask(TaskOnKart):
    task_namespace = __name__
    param = luigi.IntParameter()
    task = TaskInstanceParameter(default=_DummySubTask())
Exemple #19
class SGEJobTask(luigi.Task):
    """Base class for executing a job on SunGrid Engine

    Override ``work()`` (rather than ``run()``) with your job code.

    Parameters:

    - n_cpu: Number of CPUs (or "slots") to allocate for the Task. This
          value is passed as ``qsub -pe {pe} {n_cpu}``
    - parallel_env: SGE parallel environment name. The default is "orte",
          the parallel environment installed with MIT StarCluster. If you
          are using a different cluster environment, check with your
          sysadmin for the right pe to use. This value is passed as {pe}
          to the qsub command above.
    - shared_tmp_dir: Shared drive accessible from all nodes in the cluster.
          Task classes and dependencies are pickled to a temporary folder on
          this drive. The default is ``/home``, the NFS share location setup
          by StarCluster
    - job_name_format: String that can be passed in to customize the job name
        string passed to qsub; e.g. "Task123_{task_family}_{n_cpu}...".
    - job_name: Exact job name to pass to qsub.
    - run_locally: Run locally instead of on the cluster.
    - poll_time: the length of time to wait in order to poll qstat
    - dont_remove_tmp_dir: Instead of deleting the temporary directory, keep it.
    - no_tarball: Don't create a tarball of the luigi project directory.  Can be
        useful to reduce I/O requirements when the luigi directory is accessible
        from cluster nodes already.

    """

    n_cpu = luigi.IntParameter(default=2, significant=False)
    shared_tmp_dir = luigi.Parameter(default='/home', significant=False)
    parallel_env = luigi.Parameter(default='orte', significant=False)
    job_name_format = luigi.Parameter(
        significant=False,
        default=None,
        description="A string that can be "
        "formatted with class variables to name the job with qsub.")
    job_name = luigi.Parameter(significant=False,
                               default=None,
                               description="Explicit job name given via qsub.")
    run_locally = luigi.BoolParameter(
        significant=False, description="run locally instead of on the cluster")
    poll_time = luigi.IntParameter(
        significant=False,
        default=POLL_TIME,
        description="specify the wait time to poll qstat for the job status")
    dont_remove_tmp_dir = luigi.BoolParameter(
        significant=False,
        description="don't delete the temporary directory used (for debugging)"
    )
    no_tarball = luigi.BoolParameter(
        significant=False,
        description="don't tarball (and extract) the luigi project files")

    # Custom params
    h_vmem = luigi.IntParameter(default=100)
    m_mem_free = luigi.IntParameter(default=5)

    def __init__(self, *args, **kwargs):
        super(SGEJobTask, self).__init__(*args, **kwargs)
        if self.job_name:
            # use explicitly provided job name
            pass
        elif self.job_name_format:
            # define the job name with the provided format
            self.job_name = self.job_name_format.format(
                task_family=self.task_family, **self.__dict__)
        else:
            # default to the task family
            self.job_name = self.task_family

    def _fetch_task_failures(self):
        if not os.path.exists(self.errfile):
            logger.info('No error file')
            return []
        with open(self.errfile, "r") as f:
            errors = f.readlines()
        if errors == []:
            return errors
        if errors[0].strip(
        ) == 'stdin: is not a tty':  # SGE complains when we submit through a pipe
            errors.pop(0)
        return errors

    def _init_local(self):

        # Set up temp folder in shared directory (trim to max filename length)
        base_tmp_dir = self.shared_tmp_dir
        random_id = '%016x' % random.getrandbits(64)
        folder_name = self.task_id + '-' + random_id
        self.tmp_dir = os.path.join(base_tmp_dir, folder_name)
        max_filename_length = os.fstatvfs(0).f_namemax
        self.tmp_dir = self.tmp_dir[:max_filename_length]
        logger.info("Tmp dir: %s", self.tmp_dir)
        os.makedirs(self.tmp_dir)

        # Dump the code to be run into a pickle file
        logging.debug("Dumping pickled class")
        self._dump(self.tmp_dir)

        if not self.no_tarball:
            # Make sure that all the class's dependencies are tarred and available
            # This is not necessary if luigi is importable from the cluster node
            logging.debug("Tarballing dependencies")
            # Grab luigi and the module containing the code to be run
            packages = [luigi
                        ] + [__import__(self.__module__, None, None, 'dummy')]
            create_packages_archive(packages,
                                    os.path.join(self.tmp_dir, "packages.tar"))

    def run(self):
        if self.run_locally:
            self.work()
        else:
            self._init_local()
            self._run_job()
            # The procedure:
            # - Pickle the class
            # - Tarball the dependencies
            # - Construct a qsub argument that runs a generic runner function with the path to the pickled class
            # - Runner function loads the class from pickle
            # - Runner class untars the dependencies
            # - Runner function hits the button on the class's work() method

    def work(self):
        """Override this method, rather than ``run()``,  for your actual work."""
        pass

    def _dump(self, out_dir=''):
        """Dump instance to file."""
        self.job_file = os.path.join(out_dir, 'job-instance.pickle')
        if self.__module__ == '__main__':
            d = cloudpickle.dumps(self)
            module_name = os.path.basename(sys.argv[0]).rsplit('.', 1)[0]
            # Re-point the pickled reference from __main__ to the real module
            # name so the runner can import and unpickle it; cloudpickle.dumps
            # returns bytes, so the replacement must be done on bytes as well.
            d = d.replace(b'(c__main__', b'(c' + module_name.encode())
            with open(self.job_file, "wb") as f:
                f.write(d)
        else:
            with open(self.job_file, "wb") as f:
                cloudpickle.dump(self, f)

    def _run_job(self):

        # Build a qsub argument that will run sge_runner.py on the directory we've specified
        runner_path = sge_runner.__file__
        if runner_path.endswith("pyc"):
            runner_path = runner_path[:-3] + "py"
        job_str = '/dls/science/groups/i04-1/conor_dev/ccp4/build/bin/cctbx.python {0} "{1}" "{2}"'.format(
            runner_path, self.tmp_dir, os.getcwd()
        )  # enclose tmp_dir in quotes to protect from special escape chars
        if self.no_tarball:
            job_str += ' "--no-tarball"'

        # Build qsub submit command
        self.outfile = os.path.join(self.tmp_dir, 'job.out')
        self.errfile = os.path.join(self.tmp_dir, 'job.err')
        submit_cmd = _build_qsub_command(
            job_str,
            self.task_family,
            self.outfile,
            self.errfile,
            self.parallel_env,
            n_cpu=self.n_cpu,
            h_vmem=self.h_vmem,
            m_mem_free=self.m_mem_free,
        )

        logger.debug('qsub command: \n' + submit_cmd)

        # Submit the job and grab job ID
        # check_output returns bytes under Python 3; decode before parsing and logging
        output = subprocess.check_output(submit_cmd, shell=True).decode()
        self.job_id = _parse_qsub_job_id(output)
        logger.debug("Submitted job to qsub with response:\n" + output)

        self._track_job()

        # Now delete the temporaries, if they're there.
        if (self.tmp_dir and os.path.exists(self.tmp_dir)
                and not self.dont_remove_tmp_dir):
            logger.info('Removing temporary directory %s' % self.tmp_dir)
            subprocess.call(["rm", "-rf", self.tmp_dir])

    def _track_job(self):
        while True:
            # Sleep for a little bit
            time.sleep(self.poll_time)

            # See what the job's up to
            # ASSUMPTION
            qstat_out = subprocess.check_output(['qstat'])
            sge_status = _parse_qstat_state(qstat_out, self.job_id)
            if sge_status == 'r':
                logger.info('Job is running...')
            elif sge_status == 'qw':
                logger.info('Job is pending...')
            elif 'E' in sge_status:
                logger.error('Job has FAILED:\n' +
                             '\n'.join(self._fetch_task_failures()))
                break
            elif sge_status == 't' or sge_status == 'u':
                # Then the job could either be failed or done.
                errors = self._fetch_task_failures()
                if not errors:
                    logger.info('Job is done')
                else:
                    logger.error('Job has FAILED:\n' + '\n'.join(errors))
                break
            else:
                logger.info('Job status is UNKNOWN!')
                logger.info('Status is : %s' % sge_status)
                raise Exception(
                    "job status isn't one of ['r', 'qw', 'E*', 't', 'u']: %s" %
                    sge_status)
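
# --- Minimal usage sketch (hypothetical subclass and paths) ------------------
# SGEJobTask is used by overriding work() rather than run() and by declaring an
# output() target on a drive visible to the cluster; pickling, tarballing, qsub
# submission and polling are handled by the base class above.

class CountLinesOnSGE(SGEJobTask):  # hypothetical example task
    input_path = luigi.Parameter()

    def work(self):
        # Runs on the cluster node once the pickled instance has been reloaded.
        with open(self.input_path) as infile, self.output().open('w') as outfile:
            outfile.write(str(sum(1 for _ in infile)))

    def output(self):
        return luigi.LocalTarget(self.input_path + '.linecount')

# e.g. luigi --module my_sge_tasks CountLinesOnSGE \
#        --input-path /shared/data.txt --n-cpu 2 --shared-tmp-dir /shared/tmp
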
Exemple #20
class _DummyListTask(TaskOnKart):
    task_namespace = __name__
    param = luigi.IntParameter()
    task = ListTaskInstanceParameter(
        default=[_DummySubTask(), _DummySubTask()])
Exemple #21
class ScoreVariantsWithCnn(VclineTask):
    cf = luigi.DictParameter()
    n_cpu = luigi.IntParameter(default=1)
    memory_mb = luigi.FloatParameter(default=4096)
    sh_config = luigi.DictParameter(default=dict())
    priority = 50

    def output(self):
        output_path_prefix = re.sub(r'\.vcf\.gz$', '', self.input()[0][0].path)
        return [
            luigi.LocalTarget(f'{output_path_prefix}.cnn.vcf.gz{s}')
            for s in ['', '.tbi']
        ]

    def run(self):
        input_vcf = Path(self.input()[0][0].path)
        input_cram = Path(self.input()[0][2].path)
        fa = Path(self.input()[1][0].path)
        intervals = [Path(i.path) for i in self.input()[2]]
        skip_interval_split = (len(intervals) == 1)
        output_vcf = Path(self.output()[0].path)
        output_path_prefix = '.'.join(str(output_vcf).split('.')[:-2])
        if skip_interval_split:
            tmp_prefixes = [output_path_prefix]
        else:
            tmp_prefixes = [
                '{0}.{1}'.format(output_path_prefix, o.stem) for o in intervals
            ]
        input_targets = yield [
            CNNScoreVariants(input_vcf_path=str(input_vcf),
                             input_cram_path=str(input_cram),
                             fa_path=str(fa),
                             evaluation_interval_path=str(o),
                             output_path_prefix=s,
                             gatk=self.cf['gatk'],
                             python=self.cf['python'],
                             save_memory=self.cf['save_memory'],
                             n_cpu=self.n_cpu,
                             memory_mb=self.memory_mb,
                             sh_config=self.sh_config)
            for o, s in zip(intervals, tmp_prefixes)
        ]
        run_id = '.'.join(output_vcf.name.split('.')[:-2])
        self.print_log(f'Score variants with CNN:\t{run_id}')
        gatk = self.cf['gatk']
        self.setup_shell(run_id=run_id,
                         commands=gatk,
                         cwd=output_vcf.parent,
                         **self.sh_config,
                         env={
                             'JAVA_TOOL_OPTIONS':
                             self.generate_gatk_java_options(
                                 n_cpu=self.n_cpu, memory_mb=self.memory_mb)
                         })
        if not skip_interval_split:
            tmp_vcfs = [Path(f'{s}.vcf.gz') for s in tmp_prefixes]
            self.run_shell(
                args=(f'set -e && {gatk} MergeVcfs' +
                      ''.join(f' --INPUT {v}'
                              for v in tmp_vcfs) + f' --OUTPUT {output_vcf}'),
                input_files_or_dirs=tmp_vcfs,
                output_files_or_dirs=[output_vcf, f'{output_vcf}.tbi'])
            self.remove_files_and_dirs(*chain.from_iterable(
                [o.path for o in t] for t in input_targets))
Exemple #22
class Predict(luigi.Task):
    it = luigi.IntParameter()
    path = luigi.Parameter()
    samples = luigi.TupleParameter()
    data_eval = luigi.TupleParameter()
    resources={'gpu': 1, 'ram': 10}

    @property
    def priority(self):
        if int(self.it)%10000==0:
            return 1.+1./int(self.it)
        else:
            return 0.

    def requires(self):
        return MakeItFolder(self.it, self.path, self.data_eval)

    def output(self):
        ret = []
        for de in self.data_eval:
            for s in self.samples:
                ret.append(luigi.LocalTarget(os.path.join(
                    os.path.dirname(self.input().fn),
                    'pred_{0:}_{1:}.msg'.format(de, s))))
        return ret

    def run(self):

        src = '/groups/saalfeld/saalfeldlab/larissa/data/cremieval/{0:}/{1:}.n5'
        tgt = os.path.join(os.path.dirname(self.input().fn), '{0:}', '{1:}.n5')
        output_shape = (71, 650, 650)
        gpu_list = []
        for i in range(8):
            # decode the nvidia-smi output so the substring check works under Python 3
            nvsmi = subprocess.Popen("nvidia-smi -d PIDS -q -i {0:}".format(i), shell=True,
                                     stdout=subprocess.PIPE).stdout.read().decode()
            if 'None' in nvsmi:
                gpu_list.append(i)
        completed = []
        for de in self.data_eval:
            for s in self.samples:
                srcf = z5py.File(src.format(de, s), use_zarr_format=False)
                shape = srcf['volumes/raw'].shape
                tgtf = z5py.File(tgt.format(de, s), use_zarr_format=False)
                if not os.path.exists(os.path.join(tgt.format(de,s), 'clefts')):
                    tgtf.create_dataset('clefts',
                                    shape=shape,
                                    compression='gzip',
                                    dtype='uint8',
                                    chunks=output_shape)
                    completed.append(False)
                else:
                    if self.check_completeness()[0]:
                        completed.append(True)
                    else:
                        completed.append(False)
                if not os.path.exists(os.path.join(tgt.format(de,s), 'pre_dist')):
                    tgtf.create_dataset('pre_dist',
                                    shape=shape,
                                    compression='gzip',
                                    dtype='uint8',
                                    chunks=output_shape)
                    completed.append(False)
                else:
                    if self.check_completeness()[0]:
                        completed.append(True)
                    else:
                        completed.append(False)

                if not os.path.exists(os.path.join(tgt.format(de,s), 'post_dist')):

                    tgtf.create_dataset('post_dist',
                                    shape=shape,
                                    compression='gzip',
                                    dtype='uint8',
                                    chunks=output_shape)
                    completed.append(False)
                else:
                    if self.check_completeness()[0]:
                        completed.append(True)
                    else:
                        completed.append(False)
                get_offset_lists(shape, gpu_list, tgt.format(de, s), output_shape=output_shape)
        if all(completed):
            self.finish()
            return
        self.submit_inference(self.data_eval, gpu_list)

        reprocess_attempts = 0
        while reprocess_attempts < 4:
            complete, reprocess_list = self.check_completeness(gpu_list)
            if complete:
                self.finish()
                return
            else:
                self.set_status_message("Reprocessing {0:}, try {1:}".format(list(reprocess_list), reprocess_attempts))
                self.submit_inference(tuple(reprocess_list), gpu_list)
                reprocess_attempts += 1
        if reprocess_attempts >= 4:
            raise AssertionError


    def submit_inference(self, data_eval, gpu_list):
        with ProcessPoolExecutor(max_workers=len(gpu_list)) as pp:
            tasks = [pp.submit(single_inference,
                                self.path,
                                json.dumps(list(data_eval)).replace(' ', '').replace('"', '\\"'),
                                json.dumps(list(self.samples)).replace(' ', '').replace('"', '\\"'),
                                str(gpu),
                                str(self.it)) for gpu in gpu_list]
            result = [t.result() for t in tasks]

    def finish(self):
        for o in self.output():
            done = o.open('w')
            done.close()

    def check_completeness(self, gpu_list=None):
        complete=True
        reprocess=set()
        tgt = os.path.join(os.path.dirname(self.input().fn), '{0:}', '{1:}.n5')
        pattern = re.compile("list_gpu_[0-7].json")
        for de in self.data_eval:
            for s in self.samples:
                if gpu_list is None:
                    gpu_list = []
                    for fn in os.listdir(tgt.format(de, s)):
                        if pattern.match(fn) is not None:
                            # filter() returns an iterator in Python 3; join the digits before casting
                            gpu_list.append(int(''.join(filter(str.isdigit, fn))))
                if len(gpu_list)==0:
                    complete=False
                    reprocess.add(de)
                for gpu in gpu_list:
                    if os.path.exists(os.path.join(tgt.format(de, s), 'list_gpu_{0:}.json').format(gpu)) and \
                            os.path.exists(os.path.join(tgt.format(de, s), 'list_gpu_{0:}processed.txt'.format(gpu))):
                        block_list = os.path.join(tgt.format(de, s), 'list_gpu_{0:}.json').format(gpu)
                        block_list_processed = os.path.join(tgt.format(de, s), 'list_gpu_{0:}processed.txt'.format(gpu))
                        with open(block_list, 'r') as f:
                            block_list = json.load(f)
                            block_list = {tuple(coo) for coo in block_list}
                        with open(block_list_processed, 'r') as f:
                            list_as_str = f.read()
                        list_as_str_curated = '['+list_as_str[:list_as_str.rfind(']')+1]+']'
                        processed_list = json.loads(list_as_str_curated)
                        processed_list = {tuple(coo) for coo in processed_list}
                        if processed_list < block_list:
                            complete=False
                            reprocess.add(de)
                    else:
                        complete=False
                        reprocess.add(de)
        return complete, reprocess
class BaseTask(luigi.Task):

    pipeline = luigi.IntParameter()
    job = luigi.IntParameter()
    start = luigi.IntParameter()
    solr_query = luigi.Parameter()
    batch = luigi.IntParameter()
    task_name = "ClarityNLPLuigiTask"
    docs = list()
    pipeline_config = config.PipelineConfig('', '')
    segment = segmentation.Segmentation()

    def run(self):
        task_family_name = str(self.task_family)
        if self.task_name == "ClarityNLPLuigiTask":
            self.task_name = task_family_name
        client = MongoClient(util.mongo_host, util.mongo_port)

        try:
            with self.output().open('w') as temp_file:
                temp_file.write("start writing custom task")
                jobs.update_job_status(str(self.job), util.conn_string,
                                       jobs.IN_PROGRESS,
                                       "Running Batch %s" % self.batch)

                self.pipeline_config = config.get_pipeline_config(
                    self.pipeline, util.conn_string)
                jobs.update_job_status(str(self.job), util.conn_string,
                                       jobs.IN_PROGRESS, "Running Solr query")
                self.docs = solr_data.query(
                    self.solr_query,
                    rows=util.row_count,
                    start=self.start,
                    solr_url=util.solr_url,
                    tags=self.pipeline_config.report_tags,
                    mapper_inst=util.report_mapper_inst,
                    mapper_url=util.report_mapper_url,
                    mapper_key=util.report_mapper_key,
                    types=self.pipeline_config.report_types,
                    sources=self.pipeline_config.sources,
                    filter_query=self.pipeline_config.filter_query,
                    cohort_ids=self.pipeline_config.cohort,
                    job_results_filters=self.pipeline_config.job_results)

                for d in self.docs:
                    doc_id = d[util.solr_report_id_field]
                    if util.use_memory_caching == "true":
                        k = keys.hashkey(doc_id)
                        document_cache[k] = d
                    if util.use_redis_caching == "true":
                        util.write_to_redis_cache("doc:" + doc_id,
                                                  json.dumps(d))
                jobs.update_job_status(str(self.job), util.conn_string,
                                       jobs.IN_PROGRESS,
                                       "Running %s main task" % self.task_name)
                self.run_custom_task(temp_file, client)
                temp_file.write("Done writing custom task!")

            self.docs = list()
        except Exception as ex:
            traceback.print_exc(file=sys.stderr)
            jobs.update_job_status(str(self.job), util.conn_string,
                                   jobs.WARNING,
                                   ''.join(traceback.format_stack()))
            print(ex)
        finally:
            client.close()

    def output(self):
        return luigi.LocalTarget(
            "%s/pipeline_job%s_%s_batch%s.txt" %
            (util.tmp_dir, str(self.job), self.task_name, str(self.start)))

    def set_name(self, name):
        self.task_name = name

    def write_result_data(self,
                          temp_file,
                          mongo_client,
                          doc,
                          data: dict,
                          prefix: str = '',
                          phenotype_final: bool = False):
        inserted = pipeline_mongo_writer(mongo_client,
                                         self.pipeline,
                                         self.task_name,
                                         self.job,
                                         self.batch,
                                         self.pipeline_config,
                                         doc,
                                         data,
                                         prefix=prefix)
        if temp_file is not None:
            temp_file.write(str(inserted))
            temp_file.write('\n')
        return inserted

    def write_multiple_result_data(self,
                                   temp_file,
                                   mongo_client,
                                   doc,
                                   data: list,
                                   prefix: str = ''):
        ids = list()
        for d in data:
            inserted = pipeline_mongo_writer(mongo_client,
                                             self.pipeline,
                                             self.task_name,
                                             self.job,
                                             self.batch,
                                             self.pipeline_config,
                                             doc,
                                             d,
                                             prefix=prefix)
            ids.append(inserted)
            if temp_file is not None:
                temp_file.write(str(inserted))
                temp_file.write('\n')

        return ids

    def write_log_data(self, job_status, status_message):
        jobs.update_job_status(str(self.job), util.conn_string, job_status,
                               status_message)

    def run_custom_task(self, temp_file, mongo_client: MongoClient):
        print("Implement your custom functionality here ")

    def get_document_text(self, doc, clean=True):
        if doc and util.solr_text_field in doc:
            txt = doc[util.solr_text_field]
            if type(txt) == str:
                txt_val = txt
            elif type(txt) == list:
                txt_val = ' '.join(txt)
            else:
                txt_val = str(txt)

            if clean:
                return txt_val.encode("ascii", errors="ignore").decode()
            else:
                return txt_val
        else:
            return ''

    def get_boolean(self, key, default=False):
        return get_config_boolean(self.pipeline_config, key, default=default)

    def get_integer(self, key, default=-1):
        return get_config_integer(self.pipeline_config, key, default=default)

    def get_string(self, key, default=''):
        return get_config_string(self.pipeline_config, key, default=default)

    def get_document_sentences(self, doc):
        return document_sentences(doc)

    def get_document_sections(self, doc):
        names, section_texts = document_sections(doc)
        return names, section_texts
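
# --- Hedged sketch of a custom task built on BaseTask ------------------------
# BaseTask above handles the Solr query, caching and MongoDB bookkeeping; a
# concrete task only overrides run_custom_task() and writes one result per
# document.  The subclass name, the 'term' config key and the result fields
# below are hypothetical.

class TermCounterTask(BaseTask):
    task_name = "TermCounterTask"

    def run_custom_task(self, temp_file, mongo_client):
        term = self.get_string('term', default='pain')  # assumed pipeline config key
        for doc in self.docs:
            txt = self.get_document_text(doc)
            count = txt.lower().count(term.lower())
            self.write_result_data(temp_file, mongo_client, doc,
                                   {'term': term, 'count': count})
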
class Baseline(sciluigi.WorkflowTask):

    workdir = luigi.Parameter(default='/work')
    tvddir = luigi.Parameter(default='/tvd')
    series = luigi.Parameter(default='GameOfThrones')
    season = luigi.IntParameter(default=1)
    episode = luigi.IntParameter(default=1)
    language = luigi.Parameter(default='en')

    linearBICClusteringFeatures__e = luigi.BoolParameter(default=True)
    linearBICClusteringFeatures__De = luigi.BoolParameter(default=False)
    linearBICClusteringFeatures__DDe = luigi.BoolParameter(default=False)
    linearBICClusteringFeatures__coefs = luigi.IntParameter(default=12)
    linearBICClusteringFeatures__D = luigi.BoolParameter(default=False)
    linearBICClusteringFeatures__DD = luigi.BoolParameter(default=False)

    linearBICClustering__max_gap = luigi.FloatParameter(default=3600.0)
    linearBICClustering__penalty_coef = luigi.FloatParameter(default=1.0)
    linearBICClustering__covariance_type = luigi.Parameter(default='diag')

    bicClusteringFeatures__e = luigi.BoolParameter(default=True)
    bicClusteringFeatures__De = luigi.BoolParameter(default=False)
    bicClusteringFeatures__DDe = luigi.BoolParameter(default=False)
    bicClusteringFeatures__coefs = luigi.IntParameter(default=12)
    bicClusteringFeatures__D = luigi.BoolParameter(default=False)
    bicClusteringFeatures__DD = luigi.BoolParameter(default=False)

    bicClustering__penalty_coef = luigi.FloatParameter(default=3.5)
    bicClustering__covariance_type = luigi.Parameter(default='full')

    hyperopt = luigi.Parameter(default=None)

    def workflow(self):

        # =====================================================================
        # SPEECH / NON-SPEECH
        # =====================================================================

        audio = self.new_task('audio',
                              pyannote_workflows.tasks.tvd_dataset.Audio,
                              tvddir=self.tvddir,
                              series=self.series,
                              season=self.season,
                              episode=self.episode,
                              language=self.language)

        speakerReference = self.new_task(
            'speakerReference',
            pyannote_workflows.tasks.tvd_dataset.Speaker,
            workdir=self.workdir,
            tvddir=self.tvddir,
            series=self.series,
            season=self.season,
            episode=self.episode)

        speech = self.new_task('speechReference',
                               pyannote_workflows.tasks.tvd_dataset.Speech,
                               to_annotation=True)

        speech.in_wav = audio.out_put
        speech.in_speaker = speakerReference.out_put

        # =====================================================================
        # LINEAR BIC CLUSTERING
        # =====================================================================

        linearBICClusteringFeatures = self.new_task(
            'linearBICClusteringFeatures',
            pyannote_workflows.tasks.speech.MFCC,
            e=self.linearBICClusteringFeatures__e,
            De=self.linearBICClusteringFeatures__De,
            DDe=self.linearBICClusteringFeatures__DDe,
            coefs=self.linearBICClusteringFeatures__coefs,
            D=self.linearBICClusteringFeatures__D,
            DD=self.linearBICClusteringFeatures__DD)

        linearBICClusteringFeatures.in_audio = audio.out_put

        linearBICClustering = self.new_task(
            'linearBICClustering',
            pyannote_workflows.tasks.speech.LinearBICClustering,
            max_gap=self.linearBICClustering__max_gap,
            penalty_coef=self.linearBICClustering__penalty_coef,
            covariance_type=self.linearBICClustering__covariance_type)

        linearBICClustering.in_segmentation = speech.out_put
        linearBICClustering.in_features = linearBICClusteringFeatures.out_put

        # =====================================================================
        # BIC CLUSTERING
        # =====================================================================

        bicClusteringFeatures = self.new_task(
            'bicClusteringFeatures',
            pyannote_workflows.tasks.speech.MFCC,
            e=self.bicClusteringFeatures__e,
            De=self.bicClusteringFeatures__De,
            DDe=self.bicClusteringFeatures__DDe,
            coefs=self.bicClusteringFeatures__coefs,
            D=self.bicClusteringFeatures__D,
            DD=self.bicClusteringFeatures__DD)

        bicClusteringFeatures.in_audio = audio.out_put

        bicClustering = self.new_task(
            'bicClustering',
            pyannote_workflows.tasks.speech.BICClustering,
            penalty_coef=self.bicClustering__penalty_coef,
            covariance_type=self.bicClustering__covariance_type)

        bicClustering.in_segmentation = linearBICClustering.out_put
        bicClustering.in_features = bicClusteringFeatures.out_put

        # =====================================================================
        # EVALUATION
        # =====================================================================

        evaluateDiarization = self.new_task(
            'evaluateDiarization',
            pyannote_workflows.tasks.evaluation.EvaluateDiarizationFast)

        evaluateDiarization.in_hypothesis = bicClustering.out_put
        evaluateDiarization.in_reference = speakerReference.out_put

        if hasattr(self, 'auto_output'):
            pprint(self.auto_output)

        if self.hyperopt is not None:
            hyperopt = self.new_task('hyperopt',
                                     pyannote_workflows.utils.Hyperopt,
                                     temp=self.hyperopt)
            hyperopt.in_evaluation = evaluateDiarization.out_put
            return hyperopt

        else:
            return evaluateDiarization
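
# --- Launch sketch (hypothetical module name) --------------------------------
# sciluigi workflows such as Baseline are typically started through sciluigi's
# local runner; the command-line flags map onto the luigi parameters declared
# on the workflow class above.

if __name__ == '__main__':
    sciluigi.run_local(main_task_cls=Baseline)

# e.g. python baseline.py --workdir /work --tvddir /tvd \
#        --series GameOfThrones --season 1 --episode 1 --language en
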
Exemple #25
class MasscanScan(luigi.Task):
    """ Run ``masscan`` against a target specified via the TargetList Task.

    Note:
        When specified, ``--top_ports`` is processed and then ultimately passed to ``--ports``.

    Install:
        .. code-block:: console

            git clone https://github.com/robertdavidgraham/masscan /tmp/masscan
            make -s -j -C /tmp/masscan
            sudo mv /tmp/masscan/bin/masscan /usr/local/bin/masscan
            rm -rf /tmp/masscan

    Basic Example:
        .. code-block:: console

            masscan -v --open-only --banners --rate 1000 -e tun0 -oJ masscan.tesla.json --ports 80,443,22,21 -iL tesla.ips

    Luigi Example:
        .. code-block:: console

            PYTHONPATH=$(pwd) luigi --local-scheduler --module recon.masscan Masscan --target-file tesla --ports 80,443,22,21

    Args:
        rate: desired rate for transmitting packets (packets per second)
        interface: use the named raw network interface, such as "eth0"
        top_ports: Scan top N most popular ports
        ports: specifies the port(s) to be scanned
        db_location: specifies the path to the database used for storing results *Required by upstream Task*
        target_file: specifies the file on disk containing a list of ips or domains *Required by upstream Task*
        results_dir: specifies the directory on disk to which all Task results are written *Required by upstream Task*
        exempt_list: Path to a file providing blacklisted subdomains, one per line. *Optional by upstream Task*
    """

    rate = luigi.Parameter(default=defaults.get("masscan-rate"))
    interface = luigi.Parameter(default=defaults.get("masscan-iface"))
    top_ports = luigi.IntParameter(
        default=0)  # IntParameter -> top_ports expected as int
    ports = luigi.Parameter(default="")
    requirements = ["masscan"]
    exception = True

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.db_mgr = pipeline.models.db_manager.DBManager(
            db_location=self.db_location)
        self.results_subfolder = (Path(self.results_dir) /
                                  "masscan-results").expanduser().resolve()

    def output(self):
        """ Returns the target output for this task.

        The output file is masscan.json, written to the masscan-results subfolder of results_dir.

        Returns:
            luigi.local_target.LocalTarget
        """
        new_path = self.results_subfolder / "masscan.json"

        return luigi.LocalTarget(new_path.expanduser().resolve())

    def run(self):
        """ Builds the masscan command after processing the port options and executes the scan.

        Yields the upstream TargetList (and ParseAmassOutput, when hostnames exist) as
        dynamic dependencies, then runs masscan against the gathered IPv4 addresses.
        """
        meets_requirements(self.requirements, self.exception)
        if not self.ports and not self.top_ports:
            # need at least one, can't be put into argparse scanner because things like amass don't require ports option
            logging.error("Must specify either --top-ports or --ports.")
            exit(2)

        if self.top_ports:
            # if --top-ports used, format the top_*_ports lists as strings and then into a proper masscan --ports option
            top_tcp_ports_str = ",".join(
                str(x) for x in top_tcp_ports[:self.top_ports])
            top_udp_ports_str = ",".join(
                str(x) for x in top_udp_ports[:self.top_ports])

            self.ports = f"{top_tcp_ports_str},U:{top_udp_ports_str}"
            self.top_ports = 0

        self.results_subfolder.mkdir(parents=True, exist_ok=True)

        yield TargetList(target_file=self.target_file,
                         results_dir=self.results_dir,
                         db_location=self.db_location)

        if self.db_mgr.get_all_hostnames():
            # TargetList generated some domains for us to scan with amass

            yield ParseAmassOutput(
                target_file=self.target_file,
                exempt_list=self.exempt_list,
                results_dir=self.results_dir,
                db_location=self.db_location,
            )

        command = [
            tools.get("masscan").get("path"),
            "-v",
            "--open",
            "--banners",
            "--rate",
            self.rate,
            "-e",
            self.interface,
            "-oJ",
            self.output().path,
            "--ports",
            self.ports,
            "-iL",
        ]

        # masscan only understands how to scan ipv4
        ip_addresses = self.db_mgr.get_all_ipv4_addresses()
        masscan_input_file = None

        if ip_addresses:
            # TargetList generated ip addresses for us to scan with masscan
            masscan_input_file = self.results_subfolder / "input-from-amass"

            with open(masscan_input_file, "w") as f:
                for ip_address in ip_addresses:
                    f.write(f"{ip_address}\n")

            command.append(str(masscan_input_file))

        subprocess.run(command)  # will fail if no ipv4 addresses were found

        if masscan_input_file is not None:
            masscan_input_file.unlink()
class CombineDataAll(sciluigi.Task):
    """Combine al the relevant outputs into a table."""
    in_counts = None
    in_seq = None
    in_tpm = None
    in_bed = None
    in_effect = None
    outdir = luigi.Parameter()
    window_size = luigi.IntParameter()
    temperature = luigi.FloatParameter()

    def out_table(self, ):
        filenames = [target().path for target in self.in_counts.values()]
        return sciluigi.TargetInfo(
            self,
            os.path.join(
                self.outdir,
                processing.combine_filenames_split(filenames) +
                '.temp%d.combined_data.gz' % self.temperature))

    def run(self, ):
        # make directory if it doesn't exist
        outdir = self.outdir
        if not os.path.exists(outdir):
            os.makedirs(outdir)

        # load counts
        counts = {}
        for key, target in self.in_counts.items():
            data_table = pd.read_csv(target().path,
                                     compression='gzip',
                                     index_col=0)
            counts['%s' % (key)] = processing.get_counts_from_counts_table(
                data_table, )
        counts = pd.concat(counts).unstack(level=0)

        # load seqdata
        seqdata = pd.read_table(self.in_seq().path,
                                compression='gzip',
                                index_col=0)

        # load seqdata_effects
        seqeffect = pd.read_table(self.in_effect().path,
                                  compression='gzip',
                                  index_col=0)
        #seqeffect.loc[:, 'flag'] = pd.Series({idx:seqmodel.flag_ensemble(row.drop('ddG')-row.ddG) for idx, row in seqeffect.iterrows()})

        noflip_cols = [idx for idx in seqeffect if idx.find('noflip') == 0]
        flip_cols = [
            idx for idx in seqeffect
            if idx.find('flip') == 0 or idx.find('doubleflip') == 0
        ]
        seqeffect.loc[:, 'ddG_noflip_noens'] = seqeffect.loc[:, noflip_cols[0]]
        seqeffect.loc[:, 'ddG_noflip'] = seqmodel.compute_ensemble_ddG_set(
            seqeffect.loc[:, noflip_cols], self.temperature)
        seqeffect.loc[:, 'ddG_flip'] = seqmodel.compute_ensemble_ddG_set(
            seqeffect.loc[:, flip_cols], self.temperature)
        keep_cols = [idx for idx in seqeffect if idx.find('ddG') == 0]

        # load bed data
        beddata = processing.load_bed(
            self.in_bed().path,
            additional_cols=variables.motif_fields_additional).set_index(
                'name')

        # load tpm
        expression = pd.read_table(self.in_tpm().path,
                                   index_col=0,
                                   squeeze=True)

        # combine
        out_data = pd.concat([
            beddata, counts, expression, seqdata, seqeffect.loc[:, keep_cols]
        ],
                             axis=1)
        out_data.loc[:, 'clip_signal_per_tpm'] = (out_data.rep1 +
                                                  out_data.rep2) / out_data.tpm
        out_data.loc[:, 'clip_input_per_tpm'] = (out_data.input) / out_data.tpm

        out_data.to_csv(self.out_table().path, sep='\t', compression='gzip')
Exemple #27
class CreateIntraSessionInteractionDataset(BasePySparkTask):
    sample_days: int = luigi.IntParameter(default=16)
    history_window: int = luigi.IntParameter(default=10)
    size_available_list: int = luigi.IntParameter(default=100)
    minimum_interactions: int = luigi.IntParameter(default=5)
    max_itens_per_session: int = luigi.IntParameter(default=15)
    min_itens_interactions: int = luigi.IntParameter(default=3)
    max_relative_pos: int = luigi.IntParameter(default=3)
    pos_max_deep: int = luigi.IntParameter(default=1)

    # def requires(self):
    #     return SessionPrepareDataset(sample_days=self.sample_days, history_window=self.history_window, size_available_list=self.size_available_list)

    def output(self):
        suffix = "%d_w=%d_l=%d_m=%d_s=%d_i=%d_p=%d" % (
            self.sample_days, self.history_window, self.size_available_list,
            self.minimum_interactions, self.max_itens_per_session,
            self.min_itens_interactions, self.max_relative_pos)
        return (luigi.LocalTarget(os.path.join(DATASET_DIR, "indexed_intra_session_train_" + suffix)),
                luigi.LocalTarget(os.path.join(DATASET_DIR, "item_positive_interaction_%s.csv" % suffix)),
                luigi.LocalTarget(os.path.join(DATASET_DIR, "item_id_index_%s.csv" % suffix)),
                luigi.LocalTarget(os.path.join(DATASET_DIR, "session_index_%s.csv" % suffix)))

    def get_df_tuple_probs(self, df):

        df_tuple_count = df.groupby("ItemID_A", "ItemID_B").count()
        df_count        = df.groupby("ItemID_A").count()\
                            .withColumnRenamed("count", "total")\
                            .withColumnRenamed("ItemID_A", "_ItemID_A")

        df_join = df_tuple_count.join(
            df_count, df_tuple_count.ItemID_A == df_count._ItemID_A).cache()
        df_join = df_join.withColumn("prob", col("count") / col("total"))

        df_join  = df_join.select("ItemID_A", 'ItemID_B', 'count', 'total', 'prob')\
                    .withColumnRenamed("ItemID_A", "_ItemID_A")\
                    .withColumnRenamed("ItemID_B", "_ItemID_B")\
                    .withColumnRenamed("count", "total_ocr_dupla")\
                    .withColumnRenamed("total", "total_ocr").cache()

        return df_join
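
    # Worked example (hypothetical counts) for the probabilities computed above:
    # if the pair (ItemID_A=1, ItemID_B=2) co-occurs in 3 sessions and ItemID_A=1
    # appears in 12 pairs overall, then total_ocr_dupla=3, total_ocr=12 and
    # prob = 3 / 12 = 0.25.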

    def add_positive_interactions(self, df):

        # Keep pairs with at least one co-occurrence as positive interactions
        df = df.filter(col("total_ocr_dupla") >= 1)

        df = df\
            .groupby("ItemID_A")\
            .agg(F.collect_set("ItemID_B").alias("sub_a_b"))

        # df_b = df\
        #     .groupby("ItemID_B")\
        #     .agg(F.collect_set("ItemID_A").alias("sub_b"))

        # df = df.join(df_a, "ItemID_A").join(df_b, "ItemID_B").cache()

        # concat_int_arrays = concat(IntegerType())
        # df = df.withColumn("sub_a_b", concat_int_arrays("sub_a", "sub_b"))#.show(truncate=False)
        # return df
        df = df.withColumnRenamed("ItemID_A", "ItemID")
        #df = df.withColumn("ItemID_COPY",df.ItemID)

        df = df.toPandas().set_index('ItemID')
        print(df)

        sub_pos = []
        for i, row in tqdm(df.iterrows(), total=df.shape[0]):
            l = serach_positive(row.name,
                                df,
                                max_deep=self.pos_max_deep,
                                deep=0,
                                list_pos=[])
            sub_pos.append(list(np.unique(l)))

        df['sub_a_b_all'] = sub_pos

        return df

    def main(self, sc: SparkContext, *args):
        os.makedirs(DATASET_DIR, exist_ok=True)

        # params
        min_itens_per_session = 2
        max_itens_per_session = self.max_itens_per_session
        min_itens_interactions = self.min_itens_interactions  # Tupla interactions
        max_relative_pos = self.max_relative_pos

        spark = SparkSession(sc)
        df = spark.read.csv(BASE_DATASET_FILE, header=True, inferSchema=True)
        df = df.withColumnRenamed("session_id", "SessionID")\
            .withColumnRenamed("click_timestamp", "Timestamp_")\
            .withColumnRenamed("click_article_id", "ItemID")\
            .withColumn("Timestamp",F.from_unixtime(col("Timestamp_")/lit(1000)).cast("timestamp"))\
            .orderBy(col('Timestamp')).select("SessionID", "ItemID", "Timestamp", "Timestamp_")

        print(df.show(2))

        dt = datetime.strptime('2017-10-16 20:59:59', '%Y-%m-%d %H:%M:%S')
        df = df.filter(col('Timestamp') < dt)

        # Drop duplicate item in that same session
        df = df.dropDuplicates(['SessionID', 'ItemID'])

        # filter date
        max_timestamp = df.select(max(
            col('Timestamp'))).collect()[0]['max(Timestamp)']
        init_timestamp = max_timestamp - timedelta(days=self.sample_days)
        print("Timestamp:", max_timestamp, init_timestamp)

        df = df.filter(col('Timestamp') >= init_timestamp).cache()

        df = df.groupby("SessionID").agg(
            max("Timestamp").alias("Timestamp"),
            collect_list("ItemID").alias("ItemIDs"),
            count("ItemID").alias("total"))

        # Filter Interactions
        df = df.filter(df.total >= min_itens_per_session).cache()

        # Filter position in list
        df_pos = df.select(
            col('SessionID').alias('_SessionID'), posexplode(df.ItemIDs))

        # Explode A
        df = df.withColumn("ItemID_A", explode(df.ItemIDs))
        df = df.join(df_pos,
                    (df.SessionID == df_pos._SessionID) & (df.ItemID_A == df_pos.col))\
                .select('SessionID', 'Timestamp', 'ItemID_A', 'pos', 'ItemIDs')\
                .withColumnRenamed('pos', 'pos_A')

        # Explode B
        df = df.withColumn("ItemID_B", explode(df.ItemIDs))
        df = df.join(df_pos,
                    (df.SessionID == df_pos._SessionID) & (df.ItemID_B == df_pos.col))\
                .withColumnRenamed('pos', 'pos_B')

        df = df.withColumn("relative_pos", abs(df.pos_A - df.pos_B))

        # Filter  distincts
        df = df.select('SessionID', 'Timestamp', 'ItemID_A', 'pos_A',
                        'ItemID_B', 'pos_B', 'relative_pos')\
                .distinct()\
                .filter(df.ItemID_A != df.ItemID_B).cache()

        # # Filter duplicates
        # udf_join = F.udf(lambda s,x,y : "_".join(sorted([str(s), str(x),str(y)])) , StringType())
        # df = df.withColumn('key', udf_join('SessionID', 'ItemID_A','ItemID_B'))
        # df = df.dropDuplicates(["key"])

        # Calculate and filter probs ocorrence
        df_probs = self.get_df_tuple_probs(df)
        df = df.join(df_probs, (df.ItemID_A == df_probs._ItemID_A) &
                     (df.ItemID_B == df_probs._ItemID_B))

        print(df.show(2))
        print(df.count())

        # Add positive interactoes
        df_positive = self.add_positive_interactions(df)

        # Filter confidence
        df = df.filter(col("total_ocr_dupla") >= min_itens_interactions)\
               .filter(col("relative_pos") <= max_relative_pos)\
               .filter(col("pos_A") <= self.max_itens_per_session)

        # df = df.select("SessionID", 'Timestamp', 'ItemID_A', 'pos_A',
        #                 'ItemID_B', 'pos_B', 'relative_pos',
        #                 'total_ocr', 'total_ocr_dupla', 'prob', 'sub_a_b')\
        #         .dropDuplicates(['ItemID_A', 'ItemID_B', 'relative_pos']) # TODO is it right?
        df = df.select("SessionID", 'Timestamp', 'ItemID_A', 'ItemID_B', 'relative_pos',
                        'total_ocr', 'total_ocr_dupla')\
                .dropDuplicates(['ItemID_A', 'ItemID_B', 'relative_pos']) # TODO is it right?

        df.select("ItemID_A").dropDuplicates().toPandas().to_csv(
            self.output()[2].path, index_label="item_idx")
        df.select("SessionID").dropDuplicates().toPandas().to_csv(
            self.output()[3].path, index_label="session_idx")
        df.write.parquet(self.output()[0].path)
        df_positive.to_csv(self.output()[1].path)
class MyWorkflow(sciluigi.WorkflowTask):
    # only required parameter
    outdir = luigi.Parameter()
    cores = luigi.IntParameter(default=1)
    # genome data
    genome = luigi.Parameter(default='hg38')
    genome_fasta = luigi.Parameter(default='/shr/genomes/fasta/hg38/hg38.fa')
    genome_size = luigi.Parameter(default='/shr/gSizes/hg38.genomsize')

    # CLIP input data
    input_bam = luigi.Parameter(
        default='CLIP/hPUM2/bams/input.ENCFF786ZZB.bam')
    rep1_bam = luigi.Parameter(default='CLIP/hPUM2/bams/rep1.ENCFF231WHF.bam')
    rep2_bam = luigi.Parameter(default='CLIP/hPUM2/bams/rep2.ENCFF732EQX.bam')

    # CLIP processing inputs
    input_bed = luigi.Parameter()
    len_consensus_seq = luigi.IntParameter(default=11)
    check_for_seq = luigi.Parameter(default='TGTA')
    window_size = luigi.IntParameter(default=500)
    temperature = luigi.IntParameter(default=0)

    # RNAMap input data
    model_param_basename = luigi.Parameter(
        default='annotations/RNAmap/qMotif_20180302_')

    # transcript data
    tpm_cutoff = luigi.FloatParameter(default=0.01)
    #tpm_file = luigi.Parameter(default='RNAseq/transcript_quant/rna_seq_combined.tpm.above_0.01_both.dat')
    #rnaseq_file1 = luigi.Parameter(default='RNAseq/transcript_quant/ENCFF272HJP.rep1.tsv')
    #rnaseq_file2 = luigi.Parameter(default='RNAseq/transcript_quant/ENCFF471SEN.rep2.tsv')
    #regions = luigi.Parameter(default='RNAseq/transcript_quant/exons.st.merge_transcript.above_0.01_both.bed') # the regions in which to look for motifs
    transcript_bed = luigi.Parameter(
        default='annotations/refseq/hg38_refGene.transcripts.st.bed')
    biomart_file = luigi.Parameter(
        default='annotations/ensemble_gene_converter_biomart.txt')

    def workflow(self):
        ####### CLIP ########
        # download CLIP data
        # TODO

        # process CLIP data bams
        # get the bam file of the clip data
        processclipbams = {}
        findtotalreads = {}
        outdir_bams = os.path.join(self.outdir, 'bams')
        for key, bamfile in zip(
            ['rep1', 'rep2', 'input'],
            [self.rep1_bam, self.rep2_bam, self.input_bam]):
            processclipbams[key] = self.new_task('processclipbam_%s' % key,
                                                 ProcessRawClipBam,
                                                 bamfile=bamfile,
                                                 outdir=outdir_bams)
            findtotalreads[key] = self.new_task('findtotalreads_%s' % key,
                                                FindTotalReads)
            findtotalreads[key].in_bam = processclipbams[key].out_bam

        # make bed graph of each strand of clip data
        getbedgraphs = {}
        outdir_clips = os.path.join(self.outdir, 'clip', 'bedgraphs')
        for key, processclipbam in processclipbams.items():
            getbedgraphs[key] = self.new_task('getbedgraphs_%s' % key,
                                              GetBedGraphFromBam,
                                              outdir=outdir_clips,
                                              genome_size=self.genome_size)
            getbedgraphs[key].in_bam = processclipbam.out_bam

        # load RNA seq data
        downloadrna = self.new_task('downloadrna',
                                    DownloadRNAseq,
                                    outdir=os.path.join(
                                        self.outdir, 'expression'))

        ##### STARTING WITH A KNOWN BED FILE OF SITES #####
        filterbed = self.new_task('getbed',
                                  scltasks.FilenameToTaskOutput,
                                  filename=self.input_bed)
        i = 0  # split index; used to label per-split output subdirectories

        # split bed file into strands
        splitbedfile = self.new_task('splitbedfile_%d' % i, DividBedByStrand)
        splitbedfile.in_bed_file = filterbed.out_file

        ##### FIND CLIP SIGNAL #####

        # go through bedgraph files and run all clip commands
        outdir_clips = os.path.join(self.outdir, 'clip', 'split_%d' % i,
                                    'strands')
        combinestrandsall = {}
        for key, getbedgraph in getbedgraphs.items():
            # find signal in plus strand
            clipsignalplus = self.new_task('getclipsignalplus_%s_%d' %
                                           (key, i),
                                           GetClipSignal,
                                           window_size=self.window_size,
                                           genome_size=self.genome_size,
                                           outdir=outdir_clips)
            clipsignalplus.in_bed_file = splitbedfile.out_bed_plus
            clipsignalplus.in_bg_file = getbedgraph.out_bg_plus

            # find signal in minus strand
            clipsignalminus = self.new_task('getclipsignalminus_%s_%d' %
                                            (key, i),
                                            GetClipSignal,
                                            window_size=self.window_size,
                                            genome_size=self.genome_size,
                                            outdir=outdir_clips)
            clipsignalminus.in_bed_file = splitbedfile.out_bed_minus
            clipsignalminus.in_bg_file = getbedgraph.out_bg_minus

            # combine the two
            combinestrands = self.new_task('combinestrands_%s_%d' % (key, i),
                                           CombineStrandData,
                                           outdir=os.path.join(
                                               self.outdir, 'clip',
                                               'split_%d' % i))
            combinestrands.in_datafiles = [
                clipsignalplus.out_signal, clipsignalminus.out_signal
            ]
            combinestrandsall[key] = combinestrands

        ####### FIND EXPRESSION OF TRANSCRIPTS AT MOTIF SITES ########
        # find the transcript count per motif site based on the annotated
        # refseq genes and the RNA-seq data
        outdir_tpm = os.path.join(self.outdir, 'expression', 'split_%d' % i)
        findmotiftpm = self.new_task('findmotiftpm_%d' % i,
                                     ProcessRNASeq,
                                     biomart_file=self.biomart_file,
                                     outdir=outdir_tpm)
        findmotiftpm.in_bed = filterbed.out_file
        findmotiftpm.in_rna1 = downloadrna.out_rna1
        findmotiftpm.in_rna2 = downloadrna.out_rna2

        ####### FIND SEQUENCE AT MOTIF SITES ########
        # find sequence of intervals
        outdir_seq = os.path.join(self.outdir, 'sequences', 'split_%d' % i)
        findsequence = self.new_task('findsequence_%d' % i,
                                     scltasks.FindSequence,
                                     genome_fasta=self.genome_fasta,
                                     window_size=self.window_size,
                                     outdir=outdir_seq)
        findsequence.in_bed = filterbed.out_file

        find_seqdata = self.new_task('findseqdata_%d' % i,
                                     FindMotifSequenceData,
                                     seq_length=self.len_consensus_seq,
                                     check_for_seq=self.check_for_seq,
                                     window_size=self.window_size,
                                     outdir=outdir_seq)
        find_seqdata.in_fasta = findsequence.out_fasta

        ####### PREDICT EFFECTS AT MOTIF SITES #######
        outdir_model = os.path.join(self.outdir, 'effects',
                                    'temp_%d' % (self.temperature),
                                    'split_%d' % i)
        find_effect = self.new_task(
            'findeffect_%d' % i,
            FindPredictedSeqEffect,
            outdir=outdir_model,
            model_param_basename=self.model_param_basename,
            temperature=self.temperature)
        find_effect.in_seqdata = find_seqdata.out_seqdata

        ####### COMBINE INFO AT MOTIF SITES ########

        # combine data in meaningful way
        combinedata = self.new_task('combinedata_%d' % i,
                                    CombineDataAll,
                                    window_size=self.window_size,
                                    outdir=os.path.join(
                                        self.outdir, 'output', 'split_%d' % i),
                                    temperature=self.temperature)
        combinedata.in_counts = {
            key: target.out_signal
            for key, target in combinestrandsall.items()
        }
        combinedata.in_seq = find_seqdata.out_seqdata
        combinedata.in_tpm = findmotiftpm.out_motif_tpm
        combinedata.in_bed = filterbed.out_file
        combinedata.in_effect = find_effect.out_seqdata

        return combinedata
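
# A note on running this workflow: the new_task() / in_* / out_* wiring above
# follows the sciluigi WorkflowTask pattern. Assuming that is the case, and
# using a hypothetical class name ClipWorkflow (substitute the real class name;
# the paths are placeholders too), the workflow could be launched locally
# roughly like this:
#
#     import sciluigi as sl
#
#     sl.run_local(main_task_cls=ClipWorkflow,
#                  cmdline_args=['--outdir', 'results/hPUM2',
#                                '--input-bed', 'sites/consensus_sites.bed'])
#
# Only outdir and input_bed have no defaults, so they are the minimum
# command-line arguments to supply.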
Exemple #29
class SessionPrepareDataset(BasePySparkTask):
    sample_days: int = luigi.IntParameter(default=16)
    history_window: int = luigi.IntParameter(default=10)
    size_available_list: int = luigi.IntParameter(default=100)
    minimum_interactions: int = luigi.IntParameter(default=5)

    def output(self):
        return luigi.LocalTarget(
            os.path.join(
                DATASET_DIR,
                "dataset_prepared_sample={}_win={}_list={}_min_i={}.csv".format(
                    self.sample_days, self.history_window,
                    self.size_available_list, self.minimum_interactions)))

    def add_history(self, df):
        # cumulative window over each session, ordered by click time
        w = Window.partitionBy('SessionID').orderBy(
            'Timestamp')  # .rangeBetween(Window.currentRow, 5)

        # collect the running list of items seen so far in the session and
        # keep only rows with at least two items of history
        df = df.withColumn('ItemIDHistory',
                           F.collect_list('ItemID').over(w)).where(
                               size(col("ItemIDHistory")) >= 2)

        # normalize each history to the fixed history_window length via the
        # pad_history helper (a sketch of such a helper follows this class)
        df = df.withColumn(
            'ItemIDHistory',
            pad_history(df.ItemIDHistory, lit(self.history_window)))

        return df

    def filter(self, df):
        # filter date
        max_timestamp = df.select(F.max(
            col('Timestamp'))).collect()[0]['max(Timestamp)']
        init_timestamp = max_timestamp - timedelta(days=self.sample_days)
        df = df.filter(col('Timestamp') >= init_timestamp).cache()

        # Filter by minimum interactions per item
        df_item = df.groupBy("ItemID").count()
        df_item = df_item.filter(col('count') >= self.minimum_interactions)

        # Filter session size
        df_session = df.groupBy("SessionID").count()
        df_session = df_session.filter(col('count') >= 2)

        df = df \
            .join(df_item, "ItemID", how="inner") \
            .join(df_session, "SessionID", how="inner")

        return df

    def add_available_items(self, df):
        all_items = list(
            df.select("ItemID").dropDuplicates().toPandas()["ItemID"])

        df = df.withColumn(
            'AvailableItems',
            udf_sample_items(all_items,
                             self.size_available_list)(col("ItemID")))

        return df

    def main(self, sc: SparkContext, *args):
        os.makedirs(DATASET_DIR, exist_ok=True)

        spark = SparkSession(sc)
        df = spark.read.csv(BASE_DATASET_FILE, header=True, inferSchema=True)
        df = df.withColumnRenamed("session_id", "SessionID")\
            .withColumnRenamed("click_timestamp", "Timestamp_")\
            .withColumnRenamed("click_article_id", "ItemID")\
            .withColumn("Timestamp",F.from_unixtime(col("Timestamp_")/lit(1000)).cast("timestamp"))\
            .orderBy(col('Timestamp')).select("SessionID", "ItemID", "Timestamp", "Timestamp_").filter(col('Timestamp') < '2017-10-16 24:59:59')

        # Drop duplicate items within the same session
        df = df.dropDuplicates(['SessionID', 'ItemID'])

        df = self.filter(df)
        df = self.add_history(df)
        df = self.add_available_items(df)

        df = df.withColumn('visit', lit(1))

        df.toPandas().to_csv(self.output().path, index=False)
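
# add_history() above relies on a pad_history helper that is imported from
# elsewhere in this project and not shown here. Below is a minimal sketch of
# what such a UDF could look like, assuming it keeps the most recent items and
# right-pads with zeros up to the fixed window length; the name
# pad_history_sketch, the padding value, and the truncation rule are
# assumptions, not the original implementation.
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, IntegerType


@udf(returnType=ArrayType(IntegerType()))
def pad_history_sketch(item_history, window_size):
    # keep at most the last `window_size` items of the running history ...
    recent = item_history[-window_size:]
    # ... and right-pad with zeros so every row has the same fixed length
    return recent + [0] * (window_size - len(recent))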
Exemple #30
class KillOpenRedshiftSessions(luigi.Task):
    """
    An task for killing any open Redshift sessions
    in a given database. This is necessary to prevent open user sessions
    with transactions against the table from blocking drop or truncate
    table commands.

    Usage:

    Subclass and override the required `host`, `database`,
    `user`, and `password` attributes.
    """

    # time in seconds to wait before
    # reconnecting to Redshift if our session is killed too.
    # 30 seconds is usually fine; 60 is conservative
    connection_reset_wait_seconds = luigi.IntParameter(default=60)

    @abc.abstractproperty
    def host(self):
        return None

    @abc.abstractproperty
    def database(self):
        return None

    @abc.abstractproperty
    def user(self):
        return None

    @abc.abstractproperty
    def password(self):
        return None

    @property
    def update_id(self):
        """
        This update id will be a unique identifier
        for this run, recorded in the marker table.
        """
        return self.task_id

    def output(self):
        """
        Returns a RedshiftTarget used as a completion marker for this task.

        Normally you don't override this.
        """
        # uses class name as a meta-table
        return RedshiftTarget(host=self.host,
                              database=self.database,
                              user=self.user,
                              password=self.password,
                              table=self.__class__.__name__,
                              update_id=self.update_id)

    def run(self):
        """
        Kill any open Redshift sessions for the given database.
        """
        connection = self.output().connect()
        # kill any sessions other than ours and
        # internal Redshift sessions (rdsdb)
        query = ("select pg_terminate_backend(process) "
                 "from STV_SESSIONS "
                 "where db_name=%s "
                 "and user_name != 'rdsdb' "
                 "and process != pg_backend_pid()")
        cursor = connection.cursor()
        logger.info('Killing all open Redshift sessions for database: %s',
                    self.database)
        try:
            cursor.execute(query, (self.database, ))
            cursor.close()
            connection.commit()
        except psycopg2.DatabaseError as e:
            if 'EOF' in str(e):
                # sometimes this operation kills the current session.
                # rebuild the connection. Need to pause for 30-60 seconds
                # before Redshift will allow us back in.
                connection.close()
                logger.info(
                    'Pausing %s seconds for Redshift to reset connection',
                    self.connection_reset_wait_seconds)
                time.sleep(self.connection_reset_wait_seconds)
                logger.info('Reconnecting to Redshift')
                connection = self.output().connect()
            else:
                raise

        try:
            self.output().touch(connection)
            connection.commit()
        finally:
            connection.close()

        logger.info('Done killing all open Redshift sessions for database: %s',
                    self.database)
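
# As described in the class docstring, this task is used by subclassing it and
# supplying the connection attributes. A minimal sketch of such a subclass;
# the class name and every connection value below are placeholders, not a real
# cluster or real credentials.
class KillMyWarehouseSessions(KillOpenRedshiftSessions):

    @property
    def host(self):
        return 'examplecluster.abc123.us-east-1.redshift.amazonaws.com'

    @property
    def database(self):
        return 'analytics'

    @property
    def user(self):
        return 'etl_user'

    @property
    def password(self):
        return 'not-a-real-password'


# Scheduling it, e.g. luigi.build([KillMyWarehouseSessions()], local_scheduler=True),
# terminates the other user sessions on the 'analytics' database before any
# dependent drop or truncate tasks run.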