class MeasurementFinderTask(luigi.Task):
    """Run MeasurementFinder over one batch of Solr documents for a job.

    The task queries Solr for the batch starting at ``start``, runs
    ``run_measurement_finder_full`` on every document's ``report_text``,
    writes each measurement to MongoDB through ``mongo_writer`` and records
    the returned value, one per line, in the task's output file.

    Any exception is caught: the traceback is printed to stderr and the job
    status is set to ``jobs.WARNING`` with the exception traceback as the
    message, so a failing batch does not raise out of ``run``.
    """

    pipeline = luigi.IntParameter()
    job = luigi.IntParameter()
    start = luigi.IntParameter()
    batch = luigi.IntParameter()
    solr_query = luigi.Parameter()
    segment = segmentation.Segmentation()

    def run(self):
        """Process the batch and write the inserted ids to the output file."""
        client = MongoClient(util.mongo_host, util.mongo_port)
        job_id = str(self.job)

        try:
            jobs.update_job_status(
                job_id, util.conn_string, jobs.IN_PROGRESS,
                "Running MeasurementFinder Batch %s" % self.batch)

            pipeline_config = config.get_pipeline_config(
                self.pipeline, util.conn_string)

            jobs.update_job_status(job_id, util.conn_string,
                                   jobs.IN_PROGRESS, "Running Solr query")
            docs = solr_data.query(
                self.solr_query,
                rows=util.row_count,
                start=self.start,
                solr_url=util.solr_url,
                tags=pipeline_config.report_tags,
                mapper_inst=util.report_mapper_inst,
                mapper_url=util.report_mapper_url,
                mapper_key=util.report_mapper_key,
                cohort_ids=pipeline_config.cohort)

            # TODO incorporate sections and filters: ``filters`` is built but
            # not yet passed to the measurement finder.
            filters = dict()
            if pipeline_config.sections:
                filters[SECTIONS_FILTER] = pipeline_config.sections

            with self.output().open('w') as outfile:
                jobs.update_job_status(
                    job_id, util.conn_string, jobs.IN_PROGRESS,
                    "Finding terms with MeasurementFinder")

                for doc in docs:
                    meas_results = run_measurement_finder_full(
                        doc["report_text"], pipeline_config.terms)
                    for meas in meas_results:
                        inserted = mongo_writer(
                            client, self.pipeline, self.job, self.batch,
                            pipeline_config, meas, doc, "MeasurementFinder")
                        outfile.write(str(inserted))
                        outfile.write('\n')
                    # release the per-document results before the next query
                    del meas_results
            del docs
        except Exception as ex:
            traceback.print_exc(file=sys.stderr)
            # format_exc() reports the traceback of the exception being
            # handled; format_stack() would only describe this handler's own
            # call stack and hide the actual failure.
            jobs.update_job_status(job_id, util.conn_string, jobs.WARNING,
                                   traceback.format_exc())
            print(ex)
        finally:
            client.close()

    def output(self):
        """Return the per-batch local file, keyed by job id and ``start``."""
        return luigi.LocalTarget(
            "%s/pipeline_job%s_measurement_finder_batch%s.txt" %
            (util.tmp_dir, str(self.job), str(self.start)))
import os from typing import cast import luigi from luijo.config import FileSystem class {{cookiecutter.task_name}}(luigi.Task): """ This is a starter task to use as a template for the creation of your real task. Let's get started! (...and don't forget to update your docstrings!) :cvar hello: the string that follows 'Hello' in the output :cvar repeat: the number of times the message should be repeated """ hello: luigi.Parameter = luigi.Parameter(default='Pythonista') repeat: luigi.Parameter = luigi.IntParameter(default=10) def requires(self): """ This task has no requirements. :return: an empty iteration """ return [] def output(self) -> luigi.LocalTarget: """ This task returns a local target object containing the number of 'hello' lines that were specified by the :py:attr:`HelloLuigi.repeat` parameter. :return: the local target output
class IntraSessionInteractionsDataFrame(BasePrepareDataFrames):
    """Prepare the intra-session interaction dataset for training.

    Reads the parquet file produced by
    ``CreateIntraSessionInteractionDataset`` and splits it into train and
    test sets by time, keeping the last ``days_test`` days for testing.
    """

    sample_days: int = luigi.IntParameter(default=16)
    max_itens_per_session: int = luigi.IntParameter(default=15)
    min_itens_interactions: int = luigi.IntParameter(default=3)
    max_relative_pos: int = luigi.IntParameter(default=3)
    days_test: int = luigi.IntParameter(default=1)
    pos_max_deep: int = luigi.IntParameter(default=1)
    filter_first_interaction: bool = luigi.BoolParameter(default=False)

    def requires(self):
        """Require the dataset-creation task with the matching parameters."""
        return CreateIntraSessionInteractionDataset(
            max_itens_per_session=self.max_itens_per_session,
            sample_days=self.sample_days,
            min_itens_interactions=self.min_itens_interactions,
            max_relative_pos=self.max_relative_pos,
            pos_max_deep=self.pos_max_deep)

    @property
    def timestamp_property(self) -> str:
        """Name of the column holding the interaction timestamp."""
        return "Timestamp"

    @property
    def dataset_dir(self) -> str:
        """Directory of the dataset (module-level ``DATASET_DIR``)."""
        return DATASET_DIR

    def read_data_frame(self) -> pd.DataFrame:
        """Load the interactions and normalise the column layout.

        - optionally keeps only the first row of every
          ``(ItemID_A, ItemID_B)`` pair,
        - adds ``available_arms`` (``None``) and ``visit`` (``1``),
        - replaces ``SessionID`` by ``SessionIDX``, the row index of the
          session's first occurrence,
        - renames ``ItemID_A`` to ``ItemID``.
        """
        df = pd.read_parquet(self.read_data_frame_path)

        if self.filter_first_interaction:
            df = df.groupby(['ItemID_A',
                             'ItemID_B']).head(1).reset_index(drop=True)

        df['available_arms'] = None
        df["visit"] = 1

        df_session = df[['SessionID']].drop_duplicates().reset_index().rename(
            columns={"index": 'SessionIDX'})
        df = df.merge(df_session).drop(['SessionID'], axis=1)
        df = df.rename(columns={"ItemID_A": 'ItemID'})

        return df

    @property
    def metadata_data_frame_path(self) -> Optional[str]:
        """Path of the second input target (metadata)."""
        return self.input()[1].path

    @property
    def read_data_frame_path(self) -> str:
        """Path of the first input target (the interactions parquet file)."""
        return self.input()[0].path

    def transform_data_frame(self, df: pd.DataFrame,
                             data_key: str) -> pd.DataFrame:
        """Print a summary of ``df`` and return it unchanged."""
        print(data_key)
        print(df.describe())
        return df

    def time_train_test_split(
            self, df: pd.DataFrame,
            test_size: float) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Split ``df`` chronologically into ``(train, test)``.

        The test set holds every row whose timestamp is within
        ``days_test`` days of the latest timestamp; everything earlier is
        the training set. ``test_size`` is accepted for interface
        compatibility but is not used. The timestamp column of the passed
        frame is converted to datetime in place.
        """
        ts_col = self.timestamp_property
        # The column is indexed unconditionally, so a falsy column name could
        # never reach the sort; the former truthiness guard was dead code.
        df[ts_col] = pd.to_datetime(df[ts_col])
        df = df.sort_values(ts_col)

        cutoff_date = df[ts_col].iloc[-1] - pd.Timedelta(days=self.days_test)
        is_train = df[ts_col] < cutoff_date

        return df[is_train], df[df[ts_col] >= cutoff_date]
class CallVariantsWithHaplotypeCaller(VclineTask):
    """Call germline variants with GATK HaplotypeCaller, one task per interval.

    ``run`` yields one ``HaplotypeCaller`` task per evaluation interval and
    then assembles their per-interval outputs into a single VCF and CRAM.
    """

    # cf: configuration mapping; this class reads the keys
    #     'germline_snv_indel_gatk_dir_path', 'gatk', 'samtools' and
    #     'save_memory'.
    cf = luigi.DictParameter()
    n_cpu = luigi.IntParameter(default=1)
    memory_mb = luigi.FloatParameter(default=4096)
    # sh_config: extra keyword arguments forwarded to setup_shell() and to the
    #            HaplotypeCaller tasks.
    sh_config = luigi.DictParameter(default=dict())
    priority = 50

    def output(self):
        """Return four targets: vcf.gz, vcf.gz.tbi, cram and cram.crai.

        All live in a run directory named after the stem of the first input's
        path, below ``cf['germline_snv_indel_gatk_dir_path']``.
        """
        run_dir = Path(self.cf['germline_snv_indel_gatk_dir_path']).joinpath(
            Path(self.input()[0][0].path).stem)
        return [
            luigi.LocalTarget(
                run_dir.joinpath(f'{run_dir.name}.haplotypecaller.{s}'))
            for s in ['vcf.gz', 'vcf.gz.tbi', 'cram', 'cram.crai']
        ]

    def run(self):
        """Run HaplotypeCaller per interval and merge the results."""
        output_vcf = Path(self.output()[0].path)
        # input()[3] holds the interval files; with a single interval the
        # per-interval output is written straight to the final prefix.
        intervals = [Path(i.path) for i in self.input()[3]]
        skip_interval_split = (len(intervals) == 1)
        fa = Path(self.input()[1][0].path)
        input_cram = Path(self.input()[0][0].path)
        dbsnp_vcf = Path(self.input()[2][0].path)
        # drop the last two dot-separated components ("vcf.gz") of the path
        output_path_prefix = '.'.join(str(output_vcf).split('.')[:-2])
        if skip_interval_split:
            tmp_prefixes = [output_path_prefix]
        else:
            tmp_prefixes = [
                '{0}.{1}'.format(output_path_prefix, o.stem) for o in intervals
            ]
        # Yield the per-interval tasks; the value sent back into the generator
        # is kept as input_targets and iterated as a list of target lists below.
        # NOTE(review): presumably Luigi dynamic dependencies - confirm.
        input_targets = yield [
            HaplotypeCaller(input_cram_path=str(input_cram),
                            fa_path=str(fa),
                            dbsnp_vcf_path=str(dbsnp_vcf),
                            evaluation_interval_path=str(o),
                            output_path_prefix=s,
                            gatk=self.cf['gatk'],
                            save_memory=self.cf['save_memory'],
                            n_cpu=self.n_cpu,
                            memory_mb=self.memory_mb,
                            sh_config=self.sh_config)
            for o, s in zip(intervals, tmp_prefixes)
        ]
        # drop the last three components ("haplotypecaller.vcf.gz") of the name
        run_id = '.'.join(output_vcf.name.split('.')[:-3])
        self.print_log(
            f'Call germline variants with HaplotypeCaller:\t{run_id}')
        output_cram = Path(self.output()[2].path)
        gatk = self.cf['gatk']
        samtools = self.cf['samtools']
        self.setup_shell(run_id=run_id,
                         commands=gatk,
                         cwd=output_vcf.parent,
                         **self.sh_config,
                         env={
                             'JAVA_TOOL_OPTIONS':
                             self.generate_gatk_java_options(
                                 n_cpu=self.n_cpu, memory_mb=self.memory_mb)
                         })
        if skip_interval_split:
            # Single interval: only the BAM still has to be converted into the
            # output CRAM (the input BAM is removed afterwards).
            tmp_bam = Path(f'{tmp_prefixes[0]}.bam')
            self.samtools_view(input_sam_path=tmp_bam,
                               fa_path=fa,
                               output_sam_path=output_cram,
                               samtools=samtools,
                               n_cpu=self.n_cpu,
                               index_sam=True,
                               remove_input=True)
        else:
            # Several intervals: merge the per-interval VCFs and BAMs.
            tmp_vcfs = [Path(f'{s}.vcf.gz') for s in tmp_prefixes]
            self.run_shell(
                args=(f'set -e && {gatk} MergeVcfs' +
                      ''.join(f' --INPUT {v}' for v in tmp_vcfs) +
                      f' --REFERENCE_SEQUENCE {fa}' +
                      f' --OUTPUT {output_vcf}'),
                input_files_or_dirs=[*tmp_vcfs, fa],
                output_files_or_dirs=[output_vcf, f'{output_vcf}.tbi'])
            self.samtools_merge(
                input_sam_paths=[f'{s}.bam' for s in tmp_prefixes],
                fa_path=fa,
                output_sam_path=output_cram,
                samtools=samtools,
                n_cpu=self.n_cpu,
                memory_mb=self.memory_mb,
                index_sam=True,
                remove_input=False)
            # Remove the per-interval intermediates.
            # NOTE(review): nesting reconstructed from a flattened source; it
            # is placed inside the else branch because with a single interval
            # the yielded task writes to the final output prefix - confirm.
            self.remove_files_and_dirs(*chain.from_iterable(
                [o.path for o in t] for t in input_targets))
class Crop(luigi.Task):
    """Crop the predicted datasets of every sample to its evaluation region.

    For each sample the datasets ``clefts``, ``pre_dist`` and ``post_dist``
    of ``<sample>.n5`` are cropped with the module-level ``offsets`` and
    ``shapes`` tables and stored as ``*_cropped`` datasets in the same
    container. An empty ``crop.msg`` file marks completion.
    """

    it = luigi.IntParameter()
    dt = luigi.Parameter()
    aug = luigi.Parameter()
    de = luigi.Parameter()
    samples = luigi.TupleParameter()
    data_eval = luigi.TupleParameter()
    resources = {"ram": 50}

    @property
    def priority(self):
        """Scheduling priority: ``1 / it`` for multiples of 10000, else 0.

        Iteration ``0`` is also a multiple of 10000; it gets priority 0
        instead of raising ``ZeroDivisionError``.
        """
        iteration = int(self.it)
        if iteration != 0 and iteration % 10000 == 0:
            return 1.0 / iteration
        return 0.0

    def requires(self):
        return Predict(self.it, self.dt, self.aug, self.samples,
                       self.data_eval)

    def output(self):
        """Marker file ``crop.msg`` below the ``de`` sub-directory."""
        return luigi.LocalTarget(
            os.path.join(os.path.dirname(self.input().fn), self.de,
                         "crop.msg"))

    def run(self):
        """Write the cropped datasets and then the completion marker."""
        progress = 0.0
        self.set_progress_percentage(progress)

        # the tables are indexed by whether the evaluation data is aligned
        aligned = "unaligned" not in self.de
        datasets_src = ["clefts", "pre_dist", "post_dist"]
        datasets_tgt = [
            "clefts_cropped", "pre_dist_cropped", "post_dist_cropped"
        ]
        data_dir = os.path.join(os.path.dirname(self.input().fn), self.de)
        n_steps = len(self.samples) * len(datasets_src)

        for s in self.samples:
            filename = os.path.join(data_dir, s + ".n5")
            off = offsets[s][aligned]
            sh = shapes[s][aligned]
            # bounding box of the crop, identical for all datasets of a sample
            bb = tuple(slice(o, o + shi, None) for o, shi in zip(off, sh))

            f = z5py.File(filename, use_zarr_format=False)
            for dss, dst in zip(datasets_src, datasets_tgt):
                # chunks must not exceed the cropped shape
                chunk_size = tuple(
                    min(c, shi) for c, shi in zip(f[dss].chunks, sh))
                f.create_dataset(
                    dst,
                    shape=sh,
                    compression="gzip",
                    dtype=f[dss].dtype,
                    chunks=chunk_size,
                )
                f[dst][:] = f[dss][bb]
                f[dst].attrs["offset"] = off[::-1]

                progress += 100.0 / n_steps
                try:
                    self.set_progress_percentage(progress)
                except Exception:
                    # Progress reporting is best effort and must not fail the
                    # crop; KeyboardInterrupt/SystemExit still propagate.
                    pass

        # an empty file is enough to mark the task as done
        with self.output().open("w"):
            pass
class FindPartners(luigi.Task):
    """Run ``Matchmaker`` on every sample to write synaptic partners.

    For each sample a ``Matchmaker`` is built from the cropped cleft,
    pre- and post-distance datasets and a segmentation file, and its
    ``write_partners`` method is called. An empty ``partners.msg`` file marks
    completion.
    """

    it = luigi.IntParameter()
    dt = luigi.Parameter()
    aug = luigi.Parameter()
    de = luigi.Parameter()
    samples = luigi.TupleParameter()
    data_eval = luigi.TupleParameter()
    resources = {"ram": 650, "fp": 1}
    retry_count = 1

    @property
    def priority(self):
        """Scheduling priority: ``1 / it`` for multiples of 10000, else 0.

        Iteration ``0`` is also a multiple of 10000; it gets priority 0
        instead of raising ``ZeroDivisionError``.
        """
        iteration = int(self.it)
        if iteration != 0 and iteration % 10000 == 0:
            return 1.0 / iteration
        return 0.0

    def requires(self):
        return ConnectedComponents(
            self.it, self.dt, self.aug, self.de, self.samples, self.data_eval
        )

    def output(self):
        """Marker file ``partners.msg`` next to the input target."""
        return luigi.LocalTarget(
            os.path.join(os.path.dirname(self.input().fn), "partners.msg")
        )

    def run(self):
        """Write the partners of every sample and then the marker file."""
        logging.debug("Starting to run partner finding")
        progress = 0.0
        self.set_progress_percentage(progress)

        # thresholds used in the dataset name and passed to Matchmaker
        thr = 127
        cc_thr = 42
        pre_thr = 42
        post_thr = 42
        dist_thr = 600
        size_thr = 5

        cleft_cc_ds = "clefts_cropped_thr{0:}_cc{1:}".format(thr, cc_thr)
        pre_ds = "pre_dist_cropped"
        post_ds = "post_dist_cropped"
        seg_ds = "volumes/labels/neuron_ids_constis_slf1_sf750_cropped"
        aligned = "unaligned" not in self.de
        data_dir = os.path.dirname(self.input().fn)

        for s in self.samples:
            logging.debug("Starting with sample {0:}".format(s))
            filename = os.path.join(data_dir, s + ".h5")
            syn_file = os.path.join(data_dir, s + ".n5")
            seg_file = os.path.join(
                "/groups/saalfeld/saalfeldlab/larissa/data/cremieval/",
                self.de,
                s + ".n5",
            )
            # scale the offset element-wise by (40, 4, 4)
            off = tuple(np.array(offsets[s][aligned]) * np.array((40, 4, 4)))

            mm = Matchmaker(
                syn_file,
                cleft_cc_ds,
                pre_ds,
                post_ds,
                seg_file,
                seg_ds,
                filename,
                offset=off,
                safe_mem=True,
                dist_thr=dist_thr,
                size_thr=size_thr,
                pre_thr=pre_thr,
                post_thr=post_thr,
            )
            mm.write_partners()
            mm.cremi_file.close()
            del mm

            progress += 100.0 / len(self.samples)
            try:
                self.set_progress_percentage(progress)
            except Exception:
                # Progress reporting is best effort and must not fail the
                # task; KeyboardInterrupt/SystemExit still propagate.
                pass

        # an empty file is enough to mark the task as done
        with self.output().open("w"):
            pass
class CascadeMerge(LocalWorkflow):
    """Workflow that merges leaf inputs through one or more trees of merge nodes.

    A workflow instance represents one depth (``cascade_depth``) of one tree
    (``cascade_tree``); its branches are the nodes at that depth. Each
    non-leaf node requires its child nodes at ``cascade_depth + 1``, leaf
    nodes require what ``cascade_requires`` returns. A negative
    ``cascade_tree`` ("forest" mode) is a wrapper that requires all trees.

    Subclasses implement ``cascade_workflow_requires``, ``cascade_requires``,
    ``cascade_output`` and ``merge``.
    """

    cascade_tree = luigi.IntParameter(
        default=0,
        description="the index of the cascade tree, only "
        "necessary when multiple trees (a forrest) are used, -1 denotes a wrapper that requires "
        "and outputs all trees, default: 0")
    cascade_depth = luigi.IntParameter(
        default=0,
        description="the depth of this workflow in the "
        "cascade tree with 0 being the root of the tree, default: 0")
    keep_nodes = luigi.BoolParameter(
        significant=False,
        description="keep merged results from "
        "intermediary nodes in the cascade cache directory")

    # internal parameter
    n_cascade_leaves = luigi.IntParameter(default=NO_INT, significant=False)

    # fixate some workflow parameters
    acceptance = 1.
    tolerance = 0.
    pilot = False

    # template of the file name of non-root nodes, filled in output()
    node_format = "{name}.d{depth}.b{branch}{ext}"

    # number of items grouped per chunk when the tree is built; values <= 0
    # are replaced by the number of leaves of the tree in requires()
    merge_factor = 2

    exclude_params_db = {"n_cascade_leaves"}

    exclude_db = True

    def __init__(self, *args, **kwargs):
        super(CascadeMerge, self).__init__(*args, **kwargs)

        # the merge factor should not be 1
        if self.merge_factor == 1:
            raise ValueError("the merge factor should not be 1")

        # set once _build_cascade_forest() has stored its results
        self._forest_built = False

    def is_branch(self, default=False):
        """Return the parent's ``is_branch()``; forests also count unless *default* is set."""
        return super(CascadeMerge, self).is_branch() or (not default and self.is_forest())

    def is_forest(self):
        """Return True when ``cascade_tree`` is negative."""
        return self.cascade_tree < 0

    def is_root(self):
        """Return True for depth 0 of a tree; always False in forest mode."""
        if self.is_forest():
            return False

        return self.cascade_depth == 0

    def is_leaf(self):
        """Return True when this depth is the deepest of its tree; False in forest mode."""
        if self.is_forest():
            return False

        tree = self._get_tree()
        max_depth = max(tree.keys())
        return self.cascade_depth == max_depth

    # NOTE(review): both properties build the forest and then read the
    # attribute of the same name that _build_cascade_forest() assigns;
    # presumably cached_workflow_property stores the assigned value instead of
    # re-entering the getter - confirm against its implementation.
    @cached_workflow_property
    def cascade_forest(self):
        self._build_cascade_forest()
        return self.cascade_forest

    @cached_workflow_property
    def leaves_per_tree(self):
        self._build_cascade_forest()
        return self.leaves_per_tree

    def _get_tree(self):
        """Return the tree at index ``cascade_tree``; raise when the index is out of range."""
        try:
            return self.cascade_forest[self.cascade_tree]
        except IndexError:
            raise Exception(
                "cascade tree {} not found, forest only contains {} tree(s)".
                format(self.cascade_tree, len(self.cascade_forest)))

    def _build_cascade_forest(self):
        """Compute ``leaves_per_tree`` and ``cascade_forest`` once and store them."""
        # a node in the tree can be described by a tuple of integers, where each value denotes the
        # branch path to go down the tree to reach the node (e.g. (2, 0) -> 2nd branch, 0th branch),
        # so the length of the tuple defines the depth of the node via ``depth = len(node) - 1``
        # the tree itself is a dict that maps depths to lists of nodes with that depth
        # when multiple trees are used (a forest), each one handles ``n_leaves / n_trees`` leaves
        if self._forest_built:
            return

        # helper to convert nested lists of leaf number chunks into a list of nodes in the format
        # described above
        def nodify(obj, node=None, root_id=0):
            # non-list objects (the leaf numbers themselves) produce no node
            if not isinstance(obj, list):
                return []
            nodes = []
            if node is None:
                node = tuple()
            else:
                nodes.append(node)
            # the first tuple element is the root id, deeper ones are child indices
            for i, _obj in enumerate(obj):
                nodes += nodify(_obj, node + (i if node else root_id, ))
            return nodes

        # first, determine the number of files to merge in total when not already set via params
        if self.n_cascade_leaves == NO_INT:
            if self.is_branch(default=True):
                raise Exception(
                    "number of files to merge cannot be computed for a branch")

            # get inputs, i.e. outputs of workflow requirements and trace actual inputs to merge
            # an integer number representing the number of inputs is also valid
            inputs = luigi.task.getpaths(self.cascade_workflow_requires())
            inputs = self.trace_cascade_workflow_inputs(inputs)
            self.n_cascade_leaves = inputs if isinstance(
                inputs, six.integer_types) else len(inputs)

        # infer the number of trees from the cascade output
        output = self.cascade_output()
        n_trees = 1 if not isinstance(output, TargetCollection) else len(output)

        # NOTE(review): this branch is taken when there are fewer leaves than
        # trees, although the message reads "too many leaves".
        if self.n_cascade_leaves < n_trees:
            raise Exception(
                "too many leaves ({}) for number of requested trees ({})".
                format(self.n_cascade_leaves, n_trees))

        # determine the number of leaves per tree
        # (the first ``n_trees_overlap`` trees get one leaf more)
        n_min = self.n_cascade_leaves // n_trees
        n_trees_overlap = self.n_cascade_leaves % n_trees
        leaves_per_tree = n_trees_overlap * [n_min + 1] + (
            n_trees - n_trees_overlap) * [n_min]

        # build the trees
        forest = []
        for i, n_leaves in enumerate(leaves_per_tree):
            # build a nested list of leaf numbers using the merge factor
            # e.g. 9 leaves with factor 3 -> [[0, 1, 2], [3, 4, 5], [6, 7, 8]]
            # TODO: this point defines the actual tree structure, which is bottom-up at the moment,
            #       but maybe it's good to have this configurable
            nested_leaves = list(iter_chunks(n_leaves, self.merge_factor))
            while len(nested_leaves) > 1:
                nested_leaves = list(
                    iter_chunks(nested_leaves, self.merge_factor))

            # convert the list of nodes to the tree format described above
            tree = {}
            for node in nodify(nested_leaves, root_id=i):
                depth = len(node) - 1
                tree.setdefault(depth, []).append(node)

            forest.append(tree)

        # store values
        self.leaves_per_tree = leaves_per_tree
        self.cascade_forest = forest
        self._forest_built = True

    def create_branch_map(self):
        """Map branch numbers to the nodes at ``cascade_depth``; not available in forest mode."""
        if self.is_forest():
            raise Exception(
                "cannot define a branch map when in forest mode (cascade_tree < 0)"
            )

        tree = self._get_tree()
        nodes = tree[self.cascade_depth]
        return dict(enumerate(nodes))

    def trace_cascade_workflow_inputs(self, inputs):
        # should convert inputs to an object with a length (e.g. list, tuple, TargetCollection, ...)

        # for convenience, check if inputs results from the default workflow output, i.e. a dict
        # which stores a TargetCollection in the "collection" field
        if isinstance(inputs, dict) and "collection" in inputs:
            collection = inputs["collection"]
            if isinstance(collection, TargetCollection):
                return collection

        return inputs

    def trace_cascade_inputs(self, inputs):
        # should convert inputs into an iterable sequence (list, tuple, ...), no TargetCollection!
        return inputs

    @abstractmethod
    def cascade_workflow_requires(self):
        # should return the leaf requirements of a cascading task workflow
        return

    @abstractmethod
    def cascade_requires(self):
        # should return the leaf requirements of a cascading task branch
        # (requires() below calls it with a start and an end leaf number)
        return

    @abstractmethod
    def cascade_output(self):
        # this should return a single target when the output should be a single tree
        # or a target collection whose targets are accessible as items via cascade tree numbers
        return

    @abstractmethod
    def merge(self, inputs, output):
        # called by run() with the traced inputs and this task's output
        return

    def complete(self):
        """In forest mode, complete when all requirements are; otherwise defer to the parent."""
        if self.is_forest():
            return all(task.complete() for task in flatten(self.requires()))
        else:
            return super(CascadeMerge, self).complete()

    def workflow_requires(self):
        """Extend the parent's requirements with a "cascade" entry."""
        self._build_cascade_forest()

        reqs = super(CascadeMerge, self).workflow_requires()

        if self.is_leaf():
            # this is simply the cascade requirement
            reqs["cascade"] = self.cascade_workflow_requires()

        else:
            # not a leaf, just require the next cascade depth
            reqs["cascade"] = self.req(self, cascade_depth=self.cascade_depth + 1)

        return reqs

    def requires(self):
        """Return the requirements of a forest wrapper, a leaf node or an inner node."""
        reqs = OrderedDict()

        if self.is_forest():
            # require the workflows for all cascade trees
            n_trees = len(self.cascade_forest)
            reqs["forest"] = {
                t: self.req(self, branch=-1, cascade_tree=t)
                for t in range(n_trees)
            }

        elif self.is_leaf():
            # this is simply the cascade requirement
            # also determine and pass the corresponding leaf number range
            sum_n_leaves = sum(self.leaves_per_tree)
            # leaves of the preceding trees come first in the global numbering
            offset = sum(self.leaves_per_tree[:self.cascade_tree])
            merge_factor = self.merge_factor
            if merge_factor <= 0:
                merge_factor = self.leaves_per_tree[self.cascade_tree]
            start_leaf = offset + self.branch * merge_factor
            end_leaf = min(start_leaf + merge_factor, sum_n_leaves)
            reqs["cascade"] = self.cascade_requires(start_leaf, end_leaf)

        else:
            # get all child nodes in the next layer at depth = depth + 1, store their branches
            # note: child node tuples contain the exact same values plus an additional one
            tree = self._get_tree()
            node = self.branch_data
            branches = [
                i for i, n in enumerate(tree[self.cascade_depth + 1])
                if n[:-1] == node
            ]

            # add to requirements
            reqs["cascade"] = {
                b: self.req(self, branch=b, cascade_depth=self.cascade_depth + 1)
                for b in branches
            }

        return reqs

    def cascade_cache_directory(self):
        # by default, use the targets parent directory, also for SiblingFileCollections
        # otherwise, no default decision is implemented
        output = self.cascade_output()
        if isinstance(output, FileSystemTarget):
            return output.parent
        elif isinstance(output, SiblingFileCollection):
            return output.dir
        else:
            raise NotImplementedError(
                "{}.cascade_cache_directory is not implemented".format(
                    self.__class__.__name__))

    def output(self):
        """Return the cascade output for forests and roots, a cache-directory file otherwise."""
        output = self.cascade_output()
        if self.is_forest():
            return output

        # pick the target of this tree when the output is a collection
        if isinstance(output, TargetCollection):
            output = output[self.cascade_tree]

        if self.is_root():
            return output

        else:
            # intermediate nodes are named after the final output plus depth and branch
            name, ext = os.path.splitext(output.basename)
            basename = self.node_format.format(name=name,
                                               ext=ext,
                                               branch=self.branch,
                                               depth=self.cascade_depth)
            return self.cascade_cache_directory().child(basename, "f")

    def run(self):
        """Merge the inputs of this node; forests do nothing here."""
        if self.is_forest():
            return

        # trace actual inputs to merge
        inputs = self.input()["cascade"]
        if self.is_leaf():
            inputs = self.trace_cascade_inputs(inputs)
        else:
            # non-leaf inputs are the dict built in requires(), keyed by branch
            inputs = inputs.values()

        # merge
        self.publish_message("start merging {} inputs of node {}".format(
            len(inputs), self.branch_data))
        self.merge(inputs, self.output())

        # remove intermediate nodes
        if not self.is_leaf() and not self.keep_nodes:
            with self.publish_step(
                    "removing intermediate results of node {}".format(
                        self.branch_data)):
                for inp in inputs:
                    inp.remove()
class GraphWorkflow(WorkflowBase):
    """Chain the tasks that extract a graph from an n5 / zarr label volume.

    The chain is: ``InitialSubGraphs`` -> ``MergeSubGraphs`` for every scale
    ``1 .. n_scales - 1`` -> a final ``MergeSubGraphs`` that merges the
    complete graph into ``output_key`` -> ``MapEdgeIds`` for every scale
    ``0 .. n_scales - 1``.
    """

    input_path = luigi.Parameter()
    input_key = luigi.Parameter()
    graph_path = luigi.Parameter()
    output_key = luigi.Parameter()
    n_scales = luigi.IntParameter(default=1)

    # for now we only support n5 / zarr input labels
    def _check_input(self):
        """Assert that ``input_path`` ends in a supported extension."""
        ending = self.input_path.split('.')[-1]
        assert ending.lower() in ('zr', 'zarr', 'n5'),\
            "Only support n5 and zarr files, not %s" % ending

    def requires(self):
        """Build the task chain and return its last task."""
        self._check_input()

        # keyword arguments shared by every task of the chain
        common = dict(tmp_folder=self.tmp_folder,
                      max_jobs=self.max_jobs,
                      config_dir=self.config_dir,
                      graph_path=self.graph_path)

        initial_task = getattr(initial_tasks,
                               self._get_task_name('InitialSubGraphs'))
        dep = initial_task(input_path=self.input_path,
                           input_key=self.input_key,
                           dependency=self.dependency,
                           **common)

        merge_task = getattr(merge_tasks,
                             self._get_task_name('MergeSubGraphs'))
        for scale in range(1, self.n_scales):
            scale_out_key = 's%i/sub_graphs' % scale
            dep = merge_task(output_key=scale_out_key,
                             scale=scale,
                             merge_complete_graph=False,
                             dependency=dep,
                             **common)

        # the complete graph is merged at the last scale
        dep = merge_task(output_key=self.output_key,
                         scale=self.n_scales - 1,
                         merge_complete_graph=True,
                         dependency=dep,
                         **common)

        map_task = getattr(map_tasks, self._get_task_name('MapEdgeIds'))
        for scale in range(self.n_scales):
            # Map the edge ids of each scale in turn. Passing the last scale
            # on every iteration would chain the same task n_scales times and
            # leave the other scales unmapped.
            dep = map_task(input_key=self.output_key,
                           scale=scale,
                           dependency=dep,
                           **common)
        return dep

    @staticmethod
    def get_config():
        """Return the base configs plus the default config of each sub-task."""
        configs = super(GraphWorkflow, GraphWorkflow).get_config()
        configs.update({
            'initial_sub_graphs':
            initial_tasks.InitialSubGraphsLocal.default_task_config(),
            'merge_sub_graphs':
            merge_tasks.MergeSubGraphsLocal.default_task_config(),
            'map_edge_ids':
            map_tasks.MapEdgeIdsLocal.default_task_config()
        })
        return configs
class WebDataExcelFileFromArchiveParsingToCsv(LoadingDataIntoCsvFile):
    # Adds three Luigi parameters to LoadingDataIntoCsvFile; no methods are
    # defined here, so all behaviour comes from the base class.
    # NOTE(review): the names suggest the number of rows to skip at the top
    # and bottom of the sheet and a column selection - confirm against the
    # code that reads these parameters.
    skiptop = luigi.IntParameter(default=0)
    skipbottom = luigi.IntParameter(default=0)
    usecolumns = luigi.Parameter(default='')
class RangeHourlyBase(RangeBase):
    """
    Range wrapper for tasks that recur every hour: produces a contiguous
    completed range of such a task.
    """
    start = luigi.DateHourParameter(
        default=None,
        description="beginning datehour, inclusive. Default: None - work backward forever (requires reverse=True)")
    stop = luigi.DateHourParameter(
        default=None,
        description="ending datehour, exclusive. Default: None - work forward forever")
    hours_back = luigi.IntParameter(
        default=100 * 24,  # slightly more than three months
        description=("extent to which contiguousness is to be assured into "
                     "past, in hours from current time. Prevents infinite "
                     "loop when start is none. If the dataset has limited "
                     "retention (i.e. old outputs get removed), this should "
                     "be set shorter to that, too, to prevent the oldest "
                     "outputs flapping. Increase freely if you intend to "
                     "process old dates - worker's memory is the limit"))
    # TODO always entire interval for reprocessings (fixed start and stop)?
    hours_forward = luigi.IntParameter(
        default=0,
        description="extent to which contiguousness is to be assured into future, in hours from current time. Prevents infinite loop when stop is none")

    def datetime_to_parameter(self, dt):
        # the parameter value of an hourly task is the datetime itself
        return dt

    def parameter_to_datetime(self, p):
        # inverse of datetime_to_parameter: nothing to convert
        return p

    def datetime_to_parameters(self, dt):
        """
        Build the keyword arguments of the ranged task for the given
        date-time: the of-params plus the ranged parameter.
        """
        return self._task_parameters(dt)

    def parameters_to_datetime(self, p):
        """
        Pick the value of the ranged parameter out of a parameter dictionary.
        """
        ranged_name = self._param_name
        return p[ranged_name]

    def moving_start(self, now):
        lookback = timedelta(hours=self.hours_back)
        return now - lookback

    def moving_stop(self, now):
        lookahead = timedelta(hours=self.hours_forward)
        return now + lookahead

    def finite_datetimes(self, finite_start, finite_stop):
        """
        List the whole hours that lie in [finite_start, finite_stop).
        """
        # start at the whole hour containing finite_start; that first
        # candidate is skipped below when it lies before finite_start
        current = datetime(finite_start.year, finite_start.month,
                           finite_start.day, finite_start.hour)
        one_hour = timedelta(hours=1)

        whole_hours = []
        while current < finite_stop:
            if current >= finite_start:
                whole_hours.append(current)
            current = current + one_hour
        return whole_hours

    def _format_datetime(self, dt):
        serializer = luigi.DateHourParameter()
        return serializer.serialize(dt)
class RangeBase(luigi.WrapperTask):
    """
    Wrapper that requires the missing instances of a recurring task so that
    a contiguous range of it ends up complete.

    Typical use: a task parameterized by e.g. a DateParameter, where gaps
    caused by downtime must eventually be filled. Events are emitted that
    can be used to monitor gaps and delays.

    At least one of start and stop needs to be specified.

    (This is an abstract base: subclasses fix the datetime parameter class,
    e.g. DateParameter or DateHourParameter, the parameter naming, e.g.
    days_back/forward or hours_back/forward, and the documentation wording.)

    Subclasses will need to use the ``of`` parameter when overriding methods.
    """
    # TODO lift the single parameter constraint by passing unknown parameters through WrapperTask?
    of = luigi.TaskParameter(
        description="task name to be completed. The task must take a single datetime parameter")
    of_params = luigi.DictParameter(
        default=dict(),
        description="Arguments to be provided to the 'of' class when instantiating")
    # The type of 'start' and 'stop' (e.g. DateParameter, DateHourParameter)
    # depends on the concrete subclass, so they are only placeholders here.
    # Refer to the overrides.
    start = luigi.Parameter()
    stop = luigi.Parameter()
    reverse = luigi.BoolParameter(
        default=False,
        description="specifies the preferred order for catching up. False - work from the oldest missing outputs onward; True - from the newest backward")
    task_limit = luigi.IntParameter(
        default=50,
        description="how many of 'of' tasks to require. Guards against scheduling insane amounts of tasks in one go")
    # TODO overridable exclude_datetimes or something...
    now = luigi.IntParameter(
        default=None,
        description="set to override current time. In seconds since epoch")
    param_name = luigi.Parameter(
        default=None,
        description="parameter name used to pass in parameterized value. Defaults to None, meaning use first positional parameter",
        positional=False)

    @property
    def of_cls(self):
        """
        DONT USE. Will be deleted soon. Use ``self.of``!
        """
        if not isinstance(self.of, six.string_types):
            return self.of
        # a task name was passed: warn and resolve it through the register
        warnings.warn(
            'When using Range programatically, dont pass "of" param as string!'
        )
        return Register.get_task_cls(self.of)

    # datetime arithmetic building blocks, to be provided by subclasses

    def datetime_to_parameter(self, dt):
        raise NotImplementedError

    def parameter_to_datetime(self, p):
        raise NotImplementedError

    def datetime_to_parameters(self, dt):
        """
        Build the keyword arguments of the ranged task for the given
        date-time: the of-params plus the ranged parameter.
        """
        raise NotImplementedError

    def parameters_to_datetime(self, p):
        """
        Pick the value of the ranged parameter out of a parameter dictionary.
        """
        raise NotImplementedError

    def moving_start(self, now):
        """
        Datetime from which contiguousness is ensured when start is None or
        unfeasibly far back.
        """
        raise NotImplementedError

    def moving_stop(self, now):
        """
        Datetime till which contiguousness is ensured when stop is None or
        unfeasibly far forward.
        """
        raise NotImplementedError

    def finite_datetimes(self, finite_start, finite_stop):
        """
        Sorted list of the individual datetimes in
        [finite_start, finite_stop) for which task completeness is required.
        """
        raise NotImplementedError

    def _emit_metrics(self, missing_datetimes, finite_start, finite_stop):
        """
        Trigger the DELAY, COMPLETE_COUNT and COMPLETE_FRACTION events.

        Consistent metrics would need the entire range, which is open
        (infinite) when start or stop is None, so the metrics refer to the
        finite simplification, widened to start/stop where those are given.
        """
        if self.start is None:
            lower = finite_start
        else:
            lower = min(finite_start, self.parameter_to_datetime(self.start))
        if self.stop is None:
            upper = finite_stop
        else:
            upper = max(finite_stop, self.parameter_to_datetime(self.stop))
        datetimes = self.finite_datetimes(lower, upper)

        # number of expected instances from the first missing one onward
        if datetimes and missing_datetimes:
            first_gap = datetimes.index(missing_datetimes[0])
            delay_in_jobs = len(datetimes) - first_gap
        else:
            delay_in_jobs = 0
        self.trigger_event(RangeEvent.DELAY, self.of_cls.task_family,
                           delay_in_jobs)

        expected_count = len(datetimes)
        complete_count = expected_count - len(missing_datetimes)
        self.trigger_event(RangeEvent.COMPLETE_COUNT,
                           self.of_cls.task_family, complete_count)

        if expected_count:
            fraction = float(complete_count) / expected_count
        else:
            fraction = 1
        self.trigger_event(RangeEvent.COMPLETE_FRACTION,
                           self.of_cls.task_family, fraction)

    def _format_datetime(self, dt):
        return self.datetime_to_parameter(dt)

    def _format_range(self, datetimes):
        first, last = datetimes[0], datetimes[-1]
        param_first = self._format_datetime(first)
        param_last = self._format_datetime(last)
        return '[%s, %s]' % (param_first, param_last)

    def _instantiate_task_cls(self, param):
        task_kwargs = self._task_parameters(param)
        return self.of(**task_kwargs)

    @property
    def _param_name(self):
        if self.param_name is not None:
            return self.param_name
        # default: the first positional parameter of the ranged task
        return next(name for name, param in self.of.get_params()
                    if param.positional)

    def _task_parameters(self, param):
        kwargs = dict(**self.of_params)
        kwargs[self._param_name] = param
        return kwargs

    def requires(self):
        # the result is cached because a fair amount of computation is expected
        cached = getattr(self, '_cached_requires', None)
        if cached is not None:
            return cached

        if not (self.start or self.stop):
            raise ParameterException(
                "At least one of start and stop needs to be specified")
        if not (self.start or self.reverse):
            raise ParameterException(
                "Either start needs to be specified or reverse needs to be True"
            )
        if self.start and self.stop and self.start > self.stop:
            raise ParameterException("Can't have start > stop")
        # TODO check overridden complete() and exists()

        timestamp = time.time() if self.now is None else self.now
        now = datetime.utcfromtimestamp(timestamp)

        # clamp the requested range to the moving window around now
        moving_start = self.moving_start(now)
        if self.start is None:
            finite_start = moving_start
        else:
            finite_start = max(self.parameter_to_datetime(self.start),
                               moving_start)

        moving_stop = self.moving_stop(now)
        if self.stop is None:
            finite_stop = moving_stop
        else:
            finite_stop = min(self.parameter_to_datetime(self.stop),
                              moving_stop)

        if finite_start <= finite_stop:
            datetimes = self.finite_datetimes(finite_start, finite_stop)
        else:
            datetimes = []

        if not datetimes:
            missing_datetimes = []
            logger.debug('Empty range. No %s instances expected',
                         self.of_cls.task_family)
        else:
            logger.debug('Actually checking if range %s of %s is complete',
                         self._format_range(datetimes),
                         self.of_cls.task_family)
            missing_datetimes = sorted(self._missing_datetimes(datetimes))
            logger.debug('Range %s lacked %d of expected %d %s instances',
                         self._format_range(datetimes),
                         len(missing_datetimes), len(datetimes),
                         self.of_cls.task_family)

        self._emit_metrics(missing_datetimes, finite_start, finite_stop)

        # take at most task_limit instances from the preferred end
        if self.reverse:
            required_datetimes = missing_datetimes[-self.task_limit:]
        else:
            required_datetimes = missing_datetimes[:self.task_limit]
        if required_datetimes:
            logger.debug('Requiring %d missing %s instances in range %s',
                         len(required_datetimes), self.of_cls.task_family,
                         self._format_range(required_datetimes))
        if self.reverse:
            # TODO priorities, so that within the batch tasks are ordered too
            required_datetimes.reverse()

        required_tasks = []
        for d in required_datetimes:
            required_tasks.append(
                self._instantiate_task_cls(self.datetime_to_parameter(d)))
        self._cached_requires = required_tasks
        return self._cached_requires

    def missing_datetimes(self, finite_datetimes):
        """
        Override in subclasses to do bulk checks.

        Returns a sorted list.

        Conservative base implementation: checks completeness instance by
        instance, which may be slow.
        """
        missing = []
        for d in finite_datetimes:
            task = self._instantiate_task_cls(self.datetime_to_parameter(d))
            if not task.complete():
                missing.append(d)
        return missing

    def _missing_datetimes(self, finite_datetimes):
        """
        Backward compatible wrapper. Will be deleted eventually (stated on Dec 2015)
        """
        try:
            return self.missing_datetimes(finite_datetimes)
        except TypeError as ex:
            if 'missing_datetimes()' not in repr(ex):
                raise
            # the subclass still uses the old two-argument signature
            warnings.warn(
                'In your Range* subclass, missing_datetimes() should only take 1 argument (see latest docs)'
            )
            return self.missing_datetimes(self.of_cls, finite_datetimes)
class RangeDailyBase(RangeBase):
    """
    Range wrapper for a task that recurs once per day.

    Produces a contiguous completed range of the daily task. The window that
    is checked is bounded by ``start``/``stop`` when given and always clipped
    to ``days_back``/``days_forward`` days around the current time.
    """
    start = luigi.DateParameter(
        default=None,
        description="beginning date, inclusive. Default: None - work backward forever (requires reverse=True)")
    stop = luigi.DateParameter(
        default=None,
        description="ending date, exclusive. Default: None - work forward forever")
    days_back = luigi.IntParameter(
        default=100,  # slightly more than three months
        description=("extent to which contiguousness is to be assured into "
                     "past, in days from current time. Prevents infinite loop "
                     "when start is none. If the dataset has limited retention"
                     " (i.e. old outputs get removed), this should be set "
                     "shorter to that, too, to prevent the oldest outputs "
                     "flapping. Increase freely if you intend to process old "
                     "dates - worker's memory is the limit"))
    days_forward = luigi.IntParameter(
        default=0,
        description="extent to which contiguousness is to be assured into future, in days from current time. Prevents infinite loop when stop is none")

    def datetime_to_parameter(self, dt):
        """Convert a datetime into the date value used as the ranged parameter."""
        return dt.date()

    def parameter_to_datetime(self, p):
        """Convert a date-like parameter value into a datetime at midnight."""
        return datetime(year=p.year, month=p.month, day=p.day)

    def datetime_to_parameters(self, dt):
        """
        Given a date-time, will produce a dictionary of of-params combined
        with the ranged task parameter.
        """
        ranged_value = dt.date()
        return self._task_parameters(ranged_value)

    def parameters_to_datetime(self, p):
        """
        Given a dictionary of parameters, will extract the ranged task
        parameter value and return it as a datetime at midnight.
        """
        ranged_value = p[self._param_name]
        return datetime(year=ranged_value.year,
                        month=ranged_value.month,
                        day=ranged_value.day)

    def moving_start(self, now):
        """Earliest point considered: ``days_back`` days before ``now``."""
        lookback = timedelta(days=self.days_back)
        return now - lookback

    def moving_stop(self, now):
        """Latest point considered: ``days_forward`` days after ``now``."""
        lookahead = timedelta(days=self.days_forward)
        return now + lookahead

    def finite_datetimes(self, finite_start, finite_stop):
        """
        Simply returns the points in time that correspond to turn of day.

        The result holds every midnight ``t`` with
        ``finite_start <= t < finite_stop``, in ascending order.
        """
        one_day = timedelta(days=1)
        # Begin at midnight of the start day; that first midnight is skipped
        # below when finite_start lies later in the same day.
        current = datetime(finite_start.year, finite_start.month,
                           finite_start.day)
        midnights = []
        while current < finite_stop:
            if current >= finite_start:
                midnights.append(current)
            current = current + one_day
        return midnights
class HealthLabelTask(luigi.Task):
    """Apply health labels to the organisation data in MYSQL.

    Args:
        date (datetime): Datetime used to label the outputs
        _routine_id (str): String used to label the AWS task
        test (bool): True if in test mode
        insert_batch_size (int): Number of rows to insert into the db in a batch
        db_config_env (str): The output database envariable
        bucket (str): S3 bucket where the models are stored
        vectoriser_key (str): S3 key for the vectoriser model
        classifier_key (str): S3 key for the classifier model
    """
    date = luigi.DateParameter()
    _routine_id = luigi.Parameter()
    test = luigi.BoolParameter()
    insert_batch_size = luigi.IntParameter(default=500)
    db_config_env = luigi.Parameter()
    bucket = luigi.Parameter()
    vectoriser_key = luigi.Parameter()
    classifier_key = luigi.Parameter()

    def requires(self):
        """Yield the geocoding task that must finish before labelling."""
        yield OrgGeocodeTask(date=self.date,
                             _routine_id=self._routine_id,
                             test=self.test,
                             db_config_env="MYSQLDB",
                             city_col=Organization.city,
                             country_col=Organization.country,
                             location_key_col=Organization.location_id,
                             insert_batch_size=self.insert_batch_size,
                             env_files=[find_filepath_from_pathstub("nesta/nesta/"),
                                        find_filepath_from_pathstub("config/mysqldb.config")],
                             job_def="py36_amzn1_image",
                             job_name=f"CrunchBaseOrgGeocodeTask-{self._routine_id}",
                             job_queue="HighPriority",
                             region_name="eu-west-2",
                             poll_time=10,
                             memory=4096,
                             max_live_jobs=2)

    def output(self):
        """Points to the output database engine"""
        self.db_config_path = os.environ[self.db_config_env]
        db_config = get_config(self.db_config_path, "mysqldb")
        db_config["database"] = 'dev' if self.test else 'production'
        db_config["table"] = "Crunchbase health labels <dummy>"  # Note, not a real table
        update_id = "CrunchbaseHealthLabel_{}".format(self.date)
        return MySqlTarget(update_id=update_id, **db_config)

    @staticmethod
    def _load_model(s3, bucket, key):
        """Download and unpickle one model object from S3.

        The body is read through the public ``read()`` method of the
        streaming body rather than its private ``_raw_stream`` attribute.

        NOTE(review): ``pickle.loads`` executes arbitrary code embedded in the
        payload, so the bucket/key must only ever point at trusted objects.
        """
        body = s3.Object(bucket, key).get()['Body']
        return pickle.loads(body.read())

    def run(self):
        """Apply health labels using model."""
        # database setup
        database = 'dev' if self.test else 'production'
        logging.warning(f"Using {database} database")
        self.engine = get_mysql_engine(self.db_config_env, 'mysqldb', database)
        try_until_allowed(Base.metadata.create_all, self.engine)

        # collect and unpickle models from s3
        logging.info("Collecting models from S3")
        s3 = boto3.resource('s3')
        vectoriser = self._load_model(s3, self.bucket, self.vectoriser_key)
        classifier = self._load_model(s3, self.bucket, self.classifier_key)

        # retrieve organisations and categories
        nrows = 1000 if self.test else None
        logging.info("Collecting organisations from database")
        with db_session(self.engine) as session:
            orgs = (session
                    .query(Organization.id)
                    .filter(Organization.is_health.is_(None))
                    .limit(nrows)
                    .all())

        for batch_count, batch in enumerate(split_batches(orgs,
                                                          self.insert_batch_size), 1):
            batch_orgs_with_cats = []
            # One read session per batch instead of one per organisation: the
            # queries are unchanged, only the session set-up cost is shared.
            with db_session(self.engine) as session:
                for (org_id, ) in batch:
                    categories = (session
                                  .query(OrganizationCategory.category_name)
                                  .filter(OrganizationCategory.organization_id == org_id)
                                  .all())
                    # categories should be a list of str, comma separated: ['cat,cat,cat', 'cat,cat']
                    categories = ','.join(cat_name for (cat_name, ) in categories)
                    batch_orgs_with_cats.append({'id': org_id, 'categories': categories})

            logging.debug(f"{len(batch_orgs_with_cats)} organisations retrieved from database")

            logging.debug("Predicting health flags")
            batch_orgs_with_flag = predict_health_flag(batch_orgs_with_cats,
                                                       vectoriser, classifier)

            logging.debug(f"{len(batch_orgs_with_flag)} organisations to update")
            with db_session(self.engine) as session:
                session.bulk_update_mappings(Organization, batch_orgs_with_flag)
            logging.info(f"{batch_count} batches health labeled and written to db")

        # mark as done
        logging.warning("Task complete")
        self.output().touch()
class SGEJobTask(luigi.Task):
    """Base class for executing a job on SunGrid Engine

    Override ``work()`` (rather than ``run()``) with your job code.

    Parameters:

    - n_cpu: Number of CPUs (or "slots") to allocate for the Task. This
          value is passed as ``qsub -pe {pe} {n_cpu}``
    - parallel_env: SGE parallel environment name. The default is "orte",
          the parallel environment installed with MIT StarCluster. If you
          are using a different cluster environment, check with your
          sysadmin for the right pe to use. This value is passed as {pe}
          to the qsub command above.
    - shared_tmp_dir: Shared drive accessible from all nodes in the cluster.
          Task classes and dependencies are pickled to a temporary folder on
          this drive. The default is ``/home``, the NFS share location setup
          by StarCluster

    """

    n_cpu = luigi.IntParameter(default=2, significant=False)
    shared_tmp_dir = luigi.Parameter(default='/home', significant=False)
    parallel_env = luigi.Parameter(default='orte', significant=False)

    def _fetch_task_failures(self):
        """Return the lines of the job's stderr file, or [] if there is none.

        A leading 'stdin: is not a tty' line is dropped because it is noise
        rather than a failure.
        """
        if not os.path.exists(self.errfile):
            logger.info('No error file')
            return []
        with open(self.errfile, "r") as f:
            errors = f.readlines()
        if not errors:
            return errors
        if errors[0].strip() == 'stdin: is not a tty':  # SGE complains when we submit through a pipe
            errors.pop(0)
        return errors

    def _init_local(self):
        """Create the shared temp folder, pickle this task and tar its code."""
        # Set up temp folder in shared directory (trim to max filename length)
        base_tmp_dir = self.shared_tmp_dir
        random_id = '%016x' % random.getrandbits(64)
        folder_name = self.task_id + '-' + random_id
        self.tmp_dir = os.path.join(base_tmp_dir, folder_name)
        max_filename_length = os.fstatvfs(0).f_namemax
        self.tmp_dir = self.tmp_dir[:max_filename_length]
        logger.info("Tmp dir: %s", self.tmp_dir)
        os.makedirs(self.tmp_dir)

        # Dump the code to be run into a pickle file
        logging.debug("Dumping pickled class")
        self._dump(self.tmp_dir)

        # Make sure that all the class's dependencies are tarred and available
        logging.debug("Tarballing dependencies")
        # Grab luigi and the module containing the code to be run
        packages = [luigi] + [__import__(self.__module__, None, None, 'dummy')]
        luigi.hadoop.create_packages_archive(
            packages, os.path.join(self.tmp_dir, "packages.tar"))

    def run(self):
        """Prepare the job directory, then submit and track the job."""
        self._init_local()
        self._run_job()
        # The procedure:
        # - Pickle the class
        # - Tarball the dependencies
        # - Construct a qsub argument that runs a generic runner function with the path to the pickled class
        # - Runner function loads the class from pickle
        # - Runner class untars the dependencies
        # - Runner function hits the button on the class's work() method

    def work(self):
        """Override this method, rather than ``run()``, for your actual work."""
        pass

    def _dump(self, out_dir=''):
        """Dump instance to file.

        The pickle is written to ``<out_dir>/job-instance.pickle`` and the
        path is stored on ``self.job_file``. When the task class lives in
        ``__main__``, references to ``__main__`` in the pickle are rewritten
        to the script's module name.

        Files are opened in binary mode and closed deterministically; the
        rewrite is done on bytes so it behaves the same whether the
        interpreter's native string is bytes or text.
        """
        self.job_file = os.path.join(out_dir, 'job-instance.pickle')
        if self.__module__ == '__main__':
            d = pickle.dumps(self)
            module_name = os.path.basename(sys.argv[0]).rsplit('.', 1)[0]
            if not isinstance(module_name, bytes):
                module_name = module_name.encode('utf-8')
            d = d.replace(b'(c__main__', b'(c' + module_name)
            with open(self.job_file, "wb") as f:
                f.write(d)
        else:
            with open(self.job_file, "wb") as f:
                pickle.dump(self, f)

    def _run_job(self):
        """Submit the job through qsub, wait for it, then clean up."""
        # Build a qsub argument that will run sge_runner.py on the directory we've specified
        runner_path = sge_runner.__file__
        if runner_path.endswith("pyc"):
            runner_path = runner_path[:-3] + "py"
        job_str = 'python {0} "{1}"'.format(
            runner_path, self.tmp_dir
        )  # enclose tmp_dir in quotes to protect from special escape chars

        # Build qsub submit command
        self.outfile = os.path.join(self.tmp_dir, 'job.out')
        self.errfile = os.path.join(self.tmp_dir, 'job.err')
        submit_cmd = _build_qsub_command(job_str, self.task_family,
                                         self.outfile, self.errfile,
                                         self.parallel_env, self.n_cpu)
        logger.debug('qsub command: \n' + submit_cmd)

        # Submit the job and grab job ID
        output = subprocess.check_output(submit_cmd, shell=True)
        self.job_id = _parse_qsub_job_id(output)
        logger.debug("Submitted job to qsub with response:\n" + output)

        self._track_job()

        # Now delete the temporaries, if they're there.
        if self.tmp_dir and os.path.exists(self.tmp_dir):
            logger.info('Removing temporary directory %s' % self.tmp_dir)
            shutil.rmtree(self.tmp_dir)

    def _track_job(self):
        """Poll ``qstat`` until the job finishes, fails or reports an unknown state.

        Raises:
            Exception: if the status is none of 'r', 'qw', 'E*', 't', 'u'.
        """
        while True:
            # Sleep for a little bit
            time.sleep(POLL_TIME)

            # See what the job's up to
            # ASSUMPTION
            qstat_out = subprocess.check_output(['qstat'])
            sge_status = _parse_qstat_state(qstat_out, self.job_id)
            if sge_status == 'r':
                logger.info('Job is running...')
            elif sge_status == 'qw':
                logger.info('Job is pending...')
            elif 'E' in sge_status:
                logger.error('Job has FAILED:\n' +
                             '\n'.join(self._fetch_task_failures()))
                break
            elif sge_status == 't' or sge_status == 'u':
                # Then the job could either be failed or done.
                errors = self._fetch_task_failures()
                if not errors:
                    logger.info('Job is done')
                else:
                    logger.error('Job has FAILED:\n' + '\n'.join(errors))
                break
            else:
                logger.info('Job status is UNKNOWN!')
                logger.info('Status is : %s' % sge_status)
                raise Exception(
                    "job status isn't one of ['r', 'qw', 'E*', 't', 'u']: %s"
                    % sge_status)
class _SubDummyTask(gokart.TaskOnKart):
    """Minimal gokart task that declares a single integer parameter.

    NOTE(review): the name suggests this is a fixture used as a sub-task by
    other dummy tasks in tests; confirm against its users.
    """
    # Namespace the task under the defining module's name.
    task_namespace = __name__
    # Required integer parameter (no default is given).
    param = luigi.IntParameter()
class TrainModel(luigi.Task):
    """Train the selected model on extracted features and persist the result.

    The features and labels produced by ``FeatureExtraction`` are split into
    train and test sets, the chosen model is trained and scored, and the
    results, figures and model are written under ``models/<run name>/``.
    """
    # TODO parameters description with 'help' argument
    test_size = luigi.FloatParameter(default = 0.25)
    random_state = luigi.Parameter(default = 'None')
    shuffle = luigi.BoolParameter()
    selected_model = luigi.ChoiceParameter(choices = ['naive_bayes', 'random_forest', 'svm'], default = 'naive_bayes')
    verbose = luigi.IntParameter(default=0)

    # Random forest
    n_estimators = luigi.IntParameter(default = 100)
    max_depth = luigi.Parameter(default = 'None')

    def __init__(self, *args, **kwargs):
        super(TrainModel, self).__init__(*args, **kwargs)
        # These two parameters arrive as strings so that 'None' can be given;
        # turn them back into Python literals (e.g. None or an int).
        self.random_state = ast.literal_eval(self.random_state)
        self.max_depth = ast.literal_eval(self.max_depth)

    def requires(self):
        """Depend on the feature extraction task."""
        return FeatureExtraction()

    def output(self):
        """This task declares no output target."""
        return None

    def run(self):
        """Split the data, train and score the model, then save everything.

        Raises:
            NotImplementedError: if ``selected_model`` is ``'svm'``.
        """
        print('---> Spliting data')

        # Close the pickle files deterministically instead of leaking handles.
        with open(self.input()['features'].path, 'rb') as fp:
            features = pickle.load(fp)
        with open(self.input()['labels'].path, 'rb') as fp:
            labels = pickle.load(fp)

        train_features, test_features, train_labels, test_labels = \
            train_test_split(features, labels,
                             test_size = self.test_size,
                             random_state = self.random_state,
                             shuffle = self.shuffle)

        # TODO
        # kfold = StratifiedKFold(n_splits=10, shuffle=True)

        if self.selected_model == 'random_forest':
            model = RandomForestModel(
                n_estimators = self.n_estimators,
                max_depth = self.max_depth,
                verbose = self.verbose
            )
        elif self.selected_model == 'naive_bayes':
            model = NaiveBayesModel()
        elif self.selected_model == 'svm':
            raise NotImplementedError

        print('---> Training model', model.name)
        model.train(train_features, train_labels)
        results, figures = model.score(test_features, test_labels)

        print('---> Saving Model')
        output_name = model.name + '||Accuray=' + str(round(results['accuracy'], 2))
        output_name += '||' + datetime.datetime.now().strftime('%d_%m_%Y_%H_%M_%S')

        save_folder = os.path.join('models', output_name)
        if not os.path.exists(save_folder):
            os.makedirs(save_folder)

        with open(os.path.join(save_folder, 'results.json'), 'w') as fp:
            json.dump(results, fp)

        for key in figures:
            # PNG data is binary, so the file must be opened with 'wb', and
            # the figure to save is the entry for this key, not the container.
            # NOTE(review): assumes `figures` maps a name to an object with a
            # matplotlib-style savefig(); confirm against model.score().
            with open(os.path.join(save_folder, str(key+'.png')), 'wb') as fp:
                figures[key].savefig(fp)

        model.save(os.path.join(save_folder, 'model.pickle'))
from openfda import parallel, config, index_util, elasticsearch_requests from openfda.annotation_table.pipeline import CombineHarmonization from openfda.faers import annotate from openfda.faers import xml_to_json from openfda.index_util import AlwaysRunTask # this should be a symlink to wherever the real data directory is RUN_DIR = dirname(dirname(os.path.abspath(__file__))) BASE_DIR = config.data_dir() FAERS_HISTORIC = ('http://www.fda.gov/Drugs/GuidanceCompliance' 'RegulatoryInformation/Surveillance/AdverseDrugEffects/ucm083765.htm') FAERS_CURRENT = ('http://www.fda.gov/Drugs/GuidanceCompliance' 'RegulatoryInformation/Surveillance/AdverseDrugEffects/ucm082193.htm') MAX_RECORDS_PER_FILE = luigi.IntParameter(-1, is_global=True) class DownloadDataset(AlwaysRunTask): ''' This task downloads all datasets that have not yet been fetched. ''' def _fetch(self): for page in [self._faers_current.find_all(href=re.compile('.*.zip')), self._faers_historic.find_all(href=re.compile('.*.zip'))]: for a in page: filename = a.text.split(u'\xa0')[0] # FAERS XML/ASCII for 2014 Q3/Q4 have many strange issues. Sigh. filename = filename.replace('Q', 'q') filename = filename.replace(' q', 'q') filename = filename.replace(' ', '_') if '.zip' not in filename.lower():
class _DummyTask(TaskOnKart):
    """Minimal task with an integer parameter and a task-instance parameter.

    NOTE(review): looks like a test fixture for ``TaskInstanceParameter``;
    confirm against its users.
    """
    # Namespace the task under the defining module's name.
    task_namespace = __name__
    # Required integer parameter (no default is given).
    param = luigi.IntParameter()
    # Parameter whose value is itself a task; defaults to a _DummySubTask
    # instance created once, when this class body is evaluated.
    task = TaskInstanceParameter(default=_DummySubTask())
class SGEJobTask(luigi.Task):
    """Base class for executing a job on SunGrid Engine

    Override ``work()`` (rather than ``run()``) with your job code.

    Parameters:

    - n_cpu: Number of CPUs (or "slots") to allocate for the Task. This
          value is passed as ``qsub -pe {pe} {n_cpu}``
    - parallel_env: SGE parallel environment name. The default is "orte",
          the parallel environment installed with MIT StarCluster. If you
          are using a different cluster environment, check with your
          sysadmin for the right pe to use. This value is passed as {pe}
          to the qsub command above.
    - shared_tmp_dir: Shared drive accessible from all nodes in the cluster.
          Task classes and dependencies are pickled to a temporary folder on
          this drive. The default is ``/home``, the NFS share location setup
          by StarCluster
    - job_name_format: String that can be passed in to customize the job name
        string passed to qsub; e.g. "Task123_{task_family}_{n_cpu}...".
    - job_name: Exact job name to pass to qsub.
    - run_locally: Run locally instead of on the cluster.
    - poll_time: the length of time to wait in order to poll qstat
    - dont_remove_tmp_dir: Instead of deleting the temporary directory, keep it.
    - no_tarball: Don't create a tarball of the luigi project directory.  Can be
        useful to reduce I/O requirements when the luigi directory is accessible
        from cluster nodes already.

    """

    n_cpu = luigi.IntParameter(default=2, significant=False)
    shared_tmp_dir = luigi.Parameter(default='/home', significant=False)
    parallel_env = luigi.Parameter(default='orte', significant=False)
    job_name_format = luigi.Parameter(
        significant=False, default=None, description="A string that can be "
        "formatted with class variables to name the job with qsub.")
    job_name = luigi.Parameter(
        significant=False, default=None,
        description="Explicit job name given via qsub.")
    run_locally = luigi.BoolParameter(
        significant=False,
        description="run locally instead of on the cluster")
    poll_time = luigi.IntParameter(
        significant=False, default=POLL_TIME,
        description="specify the wait time to poll qstat for the job status")
    dont_remove_tmp_dir = luigi.BoolParameter(
        significant=False,
        description="don't delete the temporary directory used (for debugging)")
    no_tarball = luigi.BoolParameter(
        significant=False,
        description="don't tarball (and extract) the luigi project files")

    # Custom params
    h_vmem = luigi.IntParameter(default=100)
    m_mem_free = luigi.IntParameter(default=5)

    def __init__(self, *args, **kwargs):
        super(SGEJobTask, self).__init__(*args, **kwargs)
        if self.job_name:
            # use explicitly provided job name
            pass
        elif self.job_name_format:
            # define the job name with the provided format
            self.job_name = self.job_name_format.format(
                task_family=self.task_family, **self.__dict__)
        else:
            # default to the task family
            self.job_name = self.task_family

    def _fetch_task_failures(self):
        """Return the lines of the job's stderr file, or [] if there is none.

        A leading 'stdin: is not a tty' line is dropped because it is noise
        rather than a failure.
        """
        if not os.path.exists(self.errfile):
            logger.info('No error file')
            return []
        with open(self.errfile, "r") as f:
            errors = f.readlines()
        if not errors:
            return errors
        if errors[0].strip() == 'stdin: is not a tty':  # SGE complains when we submit through a pipe
            errors.pop(0)
        return errors

    def _init_local(self):
        """Create the shared temp folder, pickle this task and tar its code."""
        # Set up temp folder in shared directory (trim to max filename length)
        base_tmp_dir = self.shared_tmp_dir
        random_id = '%016x' % random.getrandbits(64)
        folder_name = self.task_id + '-' + random_id
        self.tmp_dir = os.path.join(base_tmp_dir, folder_name)
        max_filename_length = os.fstatvfs(0).f_namemax
        self.tmp_dir = self.tmp_dir[:max_filename_length]
        logger.info("Tmp dir: %s", self.tmp_dir)
        os.makedirs(self.tmp_dir)

        # Dump the code to be run into a pickle file
        logging.debug("Dumping pickled class")
        self._dump(self.tmp_dir)

        if not self.no_tarball:
            # Make sure that all the class's dependencies are tarred and available
            # This is not necessary if luigi is importable from the cluster node
            logging.debug("Tarballing dependencies")
            # Grab luigi and the module containing the code to be run
            packages = [luigi] + [__import__(self.__module__, None, None, 'dummy')]
            create_packages_archive(packages,
                                    os.path.join(self.tmp_dir, "packages.tar"))

    def run(self):
        """Run ``work()`` locally, or submit it to the cluster and wait."""
        if self.run_locally:
            self.work()
        else:
            self._init_local()
            self._run_job()
            # The procedure:
            # - Pickle the class
            # - Tarball the dependencies
            # - Construct a qsub argument that runs a generic runner function with the path to the pickled class
            # - Runner function loads the class from pickle
            # - Runner class untars the dependencies
            # - Runner function hits the button on the class's work() method

    def work(self):
        """Override this method, rather than ``run()``, for your actual work."""
        pass

    def _dump(self, out_dir=''):
        """Dump instance to file.

        The pickle is written to ``<out_dir>/job-instance.pickle`` and the
        path is stored on ``self.job_file``. When the task class lives in
        ``__main__``, references to ``__main__`` in the pickle are rewritten
        to the script's module name.

        The rewrite operates on bytes and the file is opened in binary mode,
        so this works whether ``cloudpickle.dumps`` yields a native ``str``
        or ``bytes``. Progress is reported through the module logger instead
        of ad-hoc ``print`` calls.
        """
        self.job_file = os.path.join(out_dir, 'job-instance.pickle')
        if self.__module__ == '__main__':
            logger.debug("Dumping pickle of a __main__ task")
            d = cloudpickle.dumps(self)
            module_name = os.path.basename(sys.argv[0]).rsplit('.', 1)[0]
            if not isinstance(module_name, bytes):
                module_name = module_name.encode('utf-8')
            d = d.replace(b'(c__main__', b'(c' + module_name)
            with open(self.job_file, "wb") as f:
                f.write(d)
        else:
            with open(self.job_file, "wb") as f:
                cloudpickle.dump(self, f)

    def _run_job(self):
        """Submit the job through qsub, wait for it, then clean up."""
        # Build a qsub argument that will run sge_runner.py on the directory we've specified
        runner_path = sge_runner.__file__
        if runner_path.endswith("pyc"):
            runner_path = runner_path[:-3] + "py"
        # NOTE(review): the interpreter path is hard-coded to one site's
        # installation; consider making it configurable.
        job_str = '/dls/science/groups/i04-1/conor_dev/ccp4/build/bin/cctbx.python {0} "{1}" "{2}"'.format(
            runner_path, self.tmp_dir, os.getcwd()
        )  # enclose tmp_dir in quotes to protect from special escape chars
        if self.no_tarball:
            job_str += ' "--no-tarball"'

        # Build qsub submit command
        self.outfile = os.path.join(self.tmp_dir, 'job.out')
        self.errfile = os.path.join(self.tmp_dir, 'job.err')
        # Pass the job name resolved in __init__ (explicit name, formatted
        # name, or the task family as the default) so that the job_name and
        # job_name_format parameters actually take effect.
        submit_cmd = _build_qsub_command(
            job_str,
            self.job_name,
            self.outfile,
            self.errfile,
            self.parallel_env,
            n_cpu=self.n_cpu,
            h_vmem=self.h_vmem,
            m_mem_free=self.m_mem_free,
        )
        logger.debug('qsub command: \n' + submit_cmd)

        # Submit the job and grab job ID
        output = subprocess.check_output(submit_cmd, shell=True)
        self.job_id = _parse_qsub_job_id(output)
        logger.debug("Submitted job to qsub with response:\n" + output)

        self._track_job()

        # Now delete the temporaries, if they're there.
        if (self.tmp_dir and os.path.exists(self.tmp_dir)
                and not self.dont_remove_tmp_dir):
            logger.info('Removing temporary directory %s' % self.tmp_dir)
            subprocess.call(["rm", "-rf", self.tmp_dir])

    def _track_job(self):
        """Poll ``qstat`` until the job finishes, fails or reports an unknown state.

        Raises:
            Exception: if the status is none of 'r', 'qw', 'E*', 't', 'u'.
        """
        while True:
            # Sleep for a little bit
            time.sleep(self.poll_time)

            # See what the job's up to
            # ASSUMPTION
            qstat_out = subprocess.check_output(['qstat'])
            sge_status = _parse_qstat_state(qstat_out, self.job_id)
            if sge_status == 'r':
                logger.info('Job is running...')
            elif sge_status == 'qw':
                logger.info('Job is pending...')
            elif 'E' in sge_status:
                logger.error('Job has FAILED:\n' +
                             '\n'.join(self._fetch_task_failures()))
                break
            elif sge_status == 't' or sge_status == 'u':
                # Then the job could either be failed or done.
                errors = self._fetch_task_failures()
                if not errors:
                    logger.info('Job is done')
                else:
                    logger.error('Job has FAILED:\n' + '\n'.join(errors))
                break
            else:
                logger.info('Job status is UNKNOWN!')
                logger.info('Status is : %s' % sge_status)
                raise Exception(
                    "job status isn't one of ['r', 'qw', 'E*', 't', 'u']: %s"
                    % sge_status)
class _DummyListTask(TaskOnKart):
    """Minimal task with an integer parameter and a list-of-tasks parameter.

    NOTE(review): looks like a test fixture for ``ListTaskInstanceParameter``;
    confirm against its users.
    """
    # Namespace the task under the defining module's name.
    task_namespace = __name__
    # Required integer parameter (no default is given).
    param = luigi.IntParameter()
    # Parameter whose value is a list of tasks; defaults to two _DummySubTask
    # instances created once, when this class body is evaluated.
    task = ListTaskInstanceParameter(
        default=[_DummySubTask(), _DummySubTask()])
class ScoreVariantsWithCnn(VclineTask):
    """Score variants with GATK's CNN, one sub-task per evaluation interval.

    The per-interval results are merged with ``MergeVcfs`` into
    ``<input prefix>.cnn.vcf.gz``. With a single interval the sub-task is
    given the final output prefix and no merge is run.
    """
    cf = luigi.DictParameter()
    n_cpu = luigi.IntParameter(default=1)
    memory_mb = luigi.FloatParameter(default=4096)
    sh_config = luigi.DictParameter(default=dict())
    priority = 50

    def output(self):
        """Return targets for the scored VCF and its ``.tbi`` index."""
        output_path_prefix = re.sub(
            r'\.vcf\.gz$', '', self.input()[0][0].path)
        targets = []
        for suffix in ('', '.tbi'):
            targets.append(
                luigi.LocalTarget(f'{output_path_prefix}.cnn.vcf.gz{suffix}'))
        return targets

    def run(self):
        """Yield the per-interval scoring tasks, then merge their VCFs."""
        inputs = self.input()
        input_vcf = Path(inputs[0][0].path)
        input_cram = Path(inputs[0][2].path)
        fa = Path(inputs[1][0].path)
        intervals = [Path(target.path) for target in inputs[2]]
        skip_interval_split = (len(intervals) == 1)
        output_vcf = Path(self.output()[0].path)
        # Output path with its last two dot-separated components removed.
        output_path_prefix = '.'.join(str(output_vcf).split('.')[:-2])

        tmp_prefixes = (
            [output_path_prefix] if skip_interval_split
            else [f'{output_path_prefix}.{interval.stem}'
                  for interval in intervals])

        # Dynamic dependencies: luigi hands back their outputs once complete.
        scoring_tasks = []
        for interval, prefix in zip(intervals, tmp_prefixes):
            scoring_tasks.append(
                CNNScoreVariants(input_vcf_path=str(input_vcf),
                                 input_cram_path=str(input_cram),
                                 fa_path=str(fa),
                                 evaluation_interval_path=str(interval),
                                 output_path_prefix=prefix,
                                 gatk=self.cf['gatk'],
                                 python=self.cf['python'],
                                 save_memory=self.cf['save_memory'],
                                 n_cpu=self.n_cpu,
                                 memory_mb=self.memory_mb,
                                 sh_config=self.sh_config))
        input_targets = yield scoring_tasks

        run_id = '.'.join(output_vcf.name.split('.')[:-2])
        self.print_log(f'Score variants with CNN:\t{run_id}')
        gatk = self.cf['gatk']
        java_options = self.generate_gatk_java_options(
            n_cpu=self.n_cpu, memory_mb=self.memory_mb)
        self.setup_shell(run_id=run_id, commands=gatk,
                         cwd=output_vcf.parent, **self.sh_config,
                         env={'JAVA_TOOL_OPTIONS': java_options})

        if skip_interval_split:
            # A single interval was scored under the final output prefix, so
            # there is nothing to merge and nothing temporary to remove.
            return

        tmp_vcfs = [Path(f'{prefix}.vcf.gz') for prefix in tmp_prefixes]
        input_args = ''.join(f' --INPUT {v}' for v in tmp_vcfs)
        merge_command = (f'set -e && {gatk} MergeVcfs'
                         + input_args
                         + f' --OUTPUT {output_vcf}')
        self.run_shell(
            args=merge_command,
            input_files_or_dirs=tmp_vcfs,
            output_files_or_dirs=[output_vcf, f'{output_vcf}.tbi'])
        # The per-interval files are intermediates once merged.
        self.remove_files_and_dirs(
            *[target.path for targets in input_targets for target in targets])
class Predict(luigi.Task):
    """Run inference for one training iteration over all datasets and samples.

    For every (data_eval, sample) pair the task makes sure the output
    datasets exist in the target N5 container, submits inference jobs on the
    GPUs it considers free, and re-submits unfinished work up to four times.
    Completion is signalled by creating empty marker files (see ``output``).
    """
    it = luigi.IntParameter()
    path = luigi.Parameter()
    samples = luigi.TupleParameter()
    data_eval = luigi.TupleParameter()
    # Resource requirements declared to the scheduler.
    resources={'gpu': 1, 'ram': 10}

    @property
    def priority(self):
        """Priority is above 1 for iterations divisible by 10000, else 0.

        NOTE(review): ``it == 0`` satisfies the modulo test and would divide
        by zero here; confirm 0 is never scheduled.
        """
        if int(self.it)%10000==0:
            return 1.+1./int(self.it)
        else:
            return 0.

    def requires(self):
        """Depend on the task that creates the folder for this iteration."""
        return MakeItFolder(self.it, self.path, self.data_eval)

    def output(self):
        """Return one ``pred_<data_eval>_<sample>.msg`` marker per pair.

        The markers live in the directory of the required task's output.
        """
        ret = []
        for de in self.data_eval:
            for s in self.samples:
                ret.append(luigi.LocalTarget(os.path.join(os.path.dirname(self.input().fn), 'pred_{0:}_{'
                                                                                         '1:}.msg'.format(de, s))))
        return ret

    def run(self):
        """Prepare output datasets, submit inference and retry until complete.

        Raises:
            AssertionError: if the work is still incomplete after four
                reprocessing attempts.
        """
        # Source and target container path templates: {0} is the data_eval
        # entry, {1} the sample.
        src = '/groups/saalfeld/saalfeldlab/larissa/data/cremieval/{0:}/{1:}.n5'
        tgt = os.path.join(os.path.dirname(self.input().fn), '{0:}', '{1:}.n5')
        output_shape = (71, 650, 650)
        gpu_list = []
        # Probe GPU indices 0-7 and keep those whose process query output
        # contains 'None'.
        # NOTE(review): presumably 'None' means no process is using the GPU;
        # the substring test on the raw command output also relies on that
        # output being a native str (Python 2) - confirm.
        for i in range(8):
            nvsmi = subprocess.Popen("nvidia-smi -d PIDS -q -i {0:}".format(i), shell=True,
                                     stdout=subprocess.PIPE).stdout.read()
            if 'None' in nvsmi:
                gpu_list.append(i)
        completed = []
        for de in self.data_eval:
            for s in self.samples:
                srcf = z5py.File(src.format(de, s), use_zarr_format=False)
                shape = srcf['volumes/raw'].shape
                tgtf = z5py.File(tgt.format(de, s), use_zarr_format=False)
                # For each of the three output datasets: create it if it is
                # missing (then it cannot be complete), otherwise record the
                # overall completeness check.
                # NOTE(review): check_completeness() is re-evaluated for each
                # existing dataset although it does not depend on the dataset.
                if not os.path.exists(os.path.join(tgt.format(de,s), 'clefts')):
                    tgtf.create_dataset('clefts',
                                        shape=shape,
                                        compression='gzip',
                                        dtype='uint8',
                                        chunks=output_shape)
                    completed.append(False)
                else:
                    if self.check_completeness()[0]:
                        completed.append(True)
                    else:
                        completed.append(False)
                if not os.path.exists(os.path.join(tgt.format(de,s), 'pre_dist')):
                    tgtf.create_dataset('pre_dist',
                                        shape=shape,
                                        compression='gzip',
                                        dtype='uint8',
                                        chunks=output_shape)
                    completed.append(False)
                else:
                    if self.check_completeness()[0]:
                        completed.append(True)
                    else:
                        completed.append(False)

                if not os.path.exists(os.path.join(tgt.format(de,s), 'post_dist')):
                    tgtf.create_dataset('post_dist',
                                        shape=shape,
                                        compression='gzip',
                                        dtype='uint8',
                                        chunks=output_shape)
                    completed.append(False)
                else:
                    if self.check_completeness()[0]:
                        completed.append(True)
                    else:
                        completed.append(False)
                # NOTE(review): presumably writes the per-GPU block lists that
                # check_completeness() reads back; confirm in get_offset_lists.
                get_offset_lists(shape, gpu_list, tgt.format(de, s), output_shape=output_shape)
        # Everything already existed and was complete: just write the markers.
        if all(completed):
            self.finish()
            return
        self.submit_inference(self.data_eval, gpu_list)

        # Re-submit only the datasets reported as incomplete, at most 4 times.
        reprocess_attempts = 0
        while reprocess_attempts < 4:
            complete, reprocess_list = self.check_completeness(gpu_list)
            if complete:
                self.finish()
                return
            else:
                self.set_status_message("Reprocessing {0:}, try {1:}".format(list(reprocess_list),
                                                                             reprocess_attempts))
                self.submit_inference(tuple(reprocess_list), gpu_list)
                reprocess_attempts += 1
        if reprocess_attempts >= 4:
            raise AssertionError

    def submit_inference(self, data_eval, gpu_list):
        """Run ``single_inference`` once per GPU in a process pool and wait.

        The dataset and sample lists are passed as JSON strings with spaces
        removed and double quotes backslash-escaped.

        NOTE(review): an empty ``gpu_list`` gives ``max_workers=0``, which
        ProcessPoolExecutor rejects; confirm a free GPU is guaranteed.
        """
        with ProcessPoolExecutor(max_workers=len(gpu_list)) as pp:
            tasks = [pp.submit(single_inference, self.path, json.dumps(list(data_eval)).replace(' ', '').replace('"', '\\"'),
                               json.dumps(list(self.samples)).replace(' ', '').replace('"', '\\"'), str(gpu),
                               str(self.it)) for gpu in gpu_list]
            # Block until every job is done; result() re-raises worker errors.
            result = [t.result() for t in tasks]

    def finish(self):
        """Create (or truncate) every output marker file."""
        for o in self.output():
            done = o.open('w')
            done.close()

    def check_completeness(self, gpu_list=None):
        """Compare processed blocks against the expected block lists.

        Args:
            gpu_list: GPU indices whose block lists should be checked. When
                None, the indices are discovered from the ``list_gpu_N.json``
                files found in the target container.

        Returns:
            A tuple ``(complete, reprocess)``: ``complete`` is False as soon
            as any check fails, and ``reprocess`` is the set of data_eval
            entries that need to be run again.
        """
        complete=True
        reprocess=set()
        tgt = os.path.join(os.path.dirname(self.input().fn), '{0:}', '{1:}.n5')
        pattern = re.compile("list_gpu_[0-7].json")
        for de in self.data_eval:
            for s in self.samples:
                # NOTE(review): once discovered, gpu_list is no longer None,
                # so later (de, s) pairs reuse the list found for the first
                # pair; confirm that is intended.
                if gpu_list is None:
                    gpu_list = []
                    for fn in os.listdir(tgt.format(de, s)):
                        if pattern.match(fn) is not None:
                            # NOTE(review): int(filter(...)) relies on filter()
                            # returning a str, which is Python 2 behaviour.
                            gpu_list.append(int(filter(str.isdigit, fn)))
                if len(gpu_list)==0:
                    complete=False
                    reprocess.add(de)
                for gpu in gpu_list:
                    # Both the expected-block list and the processed log must
                    # exist for this GPU, otherwise the pair is incomplete.
                    if os.path.exists(os.path.join(tgt.format(de, s), 'list_gpu_{0:}.json').format(gpu)) and \
                       os.path.exists(os.path.join(tgt.format(de, s), 'list_gpu_{0:}processed.txt'.format(gpu))):
                        block_list = os.path.join(tgt.format(de, s), 'list_gpu_{0:}.json').format(gpu)
                        block_list_processed = os.path.join(tgt.format(de, s), 'list_gpu_{0:}processed.txt'.format(gpu))
                        with open(block_list, 'r') as f:
                            block_list = json.load(f)
                            block_list = {tuple(coo) for coo in block_list}
                        with open(block_list_processed, 'r') as f:
                            list_as_str = f.read()
                        # Keep the text up to the last ']' and wrap it in
                        # brackets so that it parses as one JSON list.
                        list_as_str_curated = '['+list_as_str[:list_as_str.rfind(']')+1]+']'
                        processed_list = json.loads(list_as_str_curated)
                        processed_list = {tuple(coo) for coo in processed_list}
                        # A strict subset means some expected blocks are missing.
                        if processed_list < block_list:
                            complete=False
                            reprocess.add(de)
                    else:
                        complete=False
                        reprocess.add(de)
        return complete, reprocess
class BaseTask(luigi.Task):
    """Base class for custom pipeline tasks that process one Solr batch.

    ``run()`` loads the pipeline configuration, queries Solr for the batch of
    documents, optionally caches them, and then calls ``run_custom_task``,
    which subclasses override. Helper methods are provided to write results
    and to read document text and configuration values.
    """
    pipeline = luigi.IntParameter()
    job = luigi.IntParameter()
    start = luigi.IntParameter()
    solr_query = luigi.Parameter()
    batch = luigi.IntParameter()
    task_name = "ClarityNLPLuigiTask"
    # Class-level defaults; run() rebinds both on the instance.
    docs = list()
    pipeline_config = config.PipelineConfig('', '')
    segment = segmentation.Segmentation()

    def run(self):
        """Fetch the batch's documents and run the custom task on them.

        Any exception is reported to stderr and recorded on the job as a
        WARNING status; it is not re-raised.
        """
        task_family_name = str(self.task_family)
        # Use the task family as the name unless a custom one was set.
        if self.task_name == "ClarityNLPLuigiTask":
            self.task_name = task_family_name
        client = MongoClient(util.mongo_host, util.mongo_port)

        try:
            with self.output().open('w') as temp_file:
                temp_file.write("start writing custom task")
                jobs.update_job_status(str(self.job), util.conn_string,
                                       jobs.IN_PROGRESS,
                                       "Running Batch %s" % self.batch)

                self.pipeline_config = config.get_pipeline_config(
                    self.pipeline, util.conn_string)
                jobs.update_job_status(str(self.job), util.conn_string,
                                       jobs.IN_PROGRESS, "Running Solr query")
                self.docs = solr_data.query(
                    self.solr_query,
                    rows=util.row_count,
                    start=self.start,
                    solr_url=util.solr_url,
                    tags=self.pipeline_config.report_tags,
                    mapper_inst=util.report_mapper_inst,
                    mapper_url=util.report_mapper_url,
                    mapper_key=util.report_mapper_key,
                    types=self.pipeline_config.report_types,
                    sources=self.pipeline_config.sources,
                    filter_query=self.pipeline_config.filter_query,
                    cohort_ids=self.pipeline_config.cohort,
                    job_results_filters=self.pipeline_config.job_results)

                # Populate whichever document caches are switched on.
                for d in self.docs:
                    doc_id = d[util.solr_report_id_field]
                    if util.use_memory_caching == "true":
                        k = keys.hashkey(doc_id)
                        document_cache[k] = d
                    if util.use_redis_caching == "true":
                        util.write_to_redis_cache("doc:" + doc_id,
                                                  json.dumps(d))
                jobs.update_job_status(
                    str(self.job), util.conn_string, jobs.IN_PROGRESS,
                    "Running %s main task" % self.task_name)
                self.run_custom_task(temp_file, client)
                temp_file.write("Done writing custom task!")

            # Drop the reference to the fetched documents.
            self.docs = list()
        except Exception as ex:
            traceback.print_exc(file=sys.stderr)
            # Record the traceback of the exception itself; format_stack()
            # would only describe the call stack leading to this handler.
            jobs.update_job_status(str(self.job), util.conn_string,
                                   jobs.WARNING, traceback.format_exc())
            print(ex)
        finally:
            client.close()

    def output(self):
        """Return the local text file used as this batch's target."""
        return luigi.LocalTarget(
            "%s/pipeline_job%s_%s_batch%s.txt" %
            (util.tmp_dir, str(self.job), self.task_name, str(self.start)))

    def set_name(self, name):
        """Set the task name used in status messages and the output path."""
        self.task_name = name

    def write_result_data(self, temp_file, mongo_client, doc, data: dict,
                          prefix: str = '', phenotype_final: bool = False):
        """Write one result through ``pipeline_mongo_writer``.

        The writer's return value is also appended to ``temp_file`` (when one
        is given) and returned. ``phenotype_final`` is accepted but not used
        by this method.
        """
        inserted = pipeline_mongo_writer(mongo_client, self.pipeline,
                                         self.task_name, self.job, self.batch,
                                         self.pipeline_config, doc, data,
                                         prefix=prefix)
        if temp_file is not None:
            temp_file.write(str(inserted))
            temp_file.write('\n')
        return inserted

    def write_multiple_result_data(self, temp_file, mongo_client, doc,
                                   data: list, prefix: str = ''):
        """Write several results; return the list of writer return values."""
        ids = list()
        for d in data:
            inserted = pipeline_mongo_writer(mongo_client, self.pipeline,
                                             self.task_name, self.job,
                                             self.batch, self.pipeline_config,
                                             doc, d, prefix=prefix)
            ids.append(inserted)
            if temp_file is not None:
                temp_file.write(str(inserted))
                temp_file.write('\n')

        return ids

    def write_log_data(self, job_status, status_message):
        """Record a status and message for this job."""
        jobs.update_job_status(str(self.job), util.conn_string, job_status,
                               status_message)

    def run_custom_task(self, temp_file, mongo_client: MongoClient):
        """Hook for subclasses; the base implementation only prints a hint."""
        print("Implement your custom functionality here ")

    def get_document_text(self, doc, clean=True):
        """Return the text field of ``doc`` as a single string.

        A list value is joined with spaces and any other non-string value is
        converted with ``str``. With ``clean`` set, non-ASCII characters are
        dropped. Returns '' when ``doc`` is empty or has no text field.
        """
        if doc and util.solr_text_field in doc:
            txt = doc[util.solr_text_field]
            if isinstance(txt, str):
                txt_val = txt
            elif isinstance(txt, list):
                txt_val = ' '.join(txt)
            else:
                txt_val = str(txt)

            if clean:
                return txt_val.encode("ascii", errors="ignore").decode()
            return txt_val
        return ''

    def get_boolean(self, key, default=False):
        """Read a boolean value from the pipeline configuration."""
        return get_config_boolean(self.pipeline_config, key, default=default)

    def get_integer(self, key, default=-1):
        """Read an integer value from the pipeline configuration."""
        return get_config_integer(self.pipeline_config, key, default=default)

    def get_string(self, key, default=''):
        """Read a string value from the pipeline configuration."""
        return get_config_string(self.pipeline_config, key, default=default)

    def get_document_sentences(self, doc):
        """Return the result of ``document_sentences`` for ``doc``."""
        return document_sentences(doc)

    def get_document_sections(self, doc):
        """Return the ``(names, section_texts)`` pair for ``doc``."""
        names, section_texts = document_sections(doc)
        return names, section_texts
class Baseline(sciluigi.WorkflowTask):
    """Baseline speaker-diarization workflow for one TVD episode.

    The workflow chains: reference speech regions -> linear BIC clustering
    -> BIC clustering -> diarization evaluation, optionally followed by a
    hyperopt task when the ``hyperopt`` parameter is set.

    Parameters named ``<task>__<option>`` are forwarded as ``<option>`` to
    the task registered under the name ``<task>``.
    """

    # Locations and episode selection.
    workdir = luigi.Parameter(default='/work')
    tvddir = luigi.Parameter(default='/tvd')
    series = luigi.Parameter(default='GameOfThrones')
    season = luigi.IntParameter(default=1)
    episode = luigi.IntParameter(default=1)
    language = luigi.Parameter(default='en')

    # MFCC options feeding the linear BIC clustering step.
    linearBICClusteringFeatures__e = luigi.BoolParameter(default=True)
    linearBICClusteringFeatures__De = luigi.BoolParameter(default=False)
    linearBICClusteringFeatures__DDe = luigi.BoolParameter(default=False)
    linearBICClusteringFeatures__coefs = luigi.IntParameter(default=12)
    linearBICClusteringFeatures__D = luigi.BoolParameter(default=False)
    linearBICClusteringFeatures__DD = luigi.BoolParameter(default=False)

    # Linear BIC clustering options.
    linearBICClustering__max_gap = luigi.FloatParameter(default=3600.0)
    linearBICClustering__penalty_coef = luigi.FloatParameter(default=1.0)
    linearBICClustering__covariance_type = luigi.Parameter(default='diag')

    # MFCC options feeding the BIC clustering step.
    bicClusteringFeatures__e = luigi.BoolParameter(default=True)
    bicClusteringFeatures__De = luigi.BoolParameter(default=False)
    bicClusteringFeatures__DDe = luigi.BoolParameter(default=False)
    bicClusteringFeatures__coefs = luigi.IntParameter(default=12)
    bicClusteringFeatures__D = luigi.BoolParameter(default=False)
    bicClusteringFeatures__DD = luigi.BoolParameter(default=False)

    # BIC clustering options.
    bicClustering__penalty_coef = luigi.FloatParameter(default=3.5)
    bicClustering__covariance_type = luigi.Parameter(default='full')

    # When not None, forwarded as ``temp`` to the final Hyperopt task.
    hyperopt = luigi.Parameter(default=None)

    def workflow(self):
        """Wire up all tasks and return the terminal task of the workflow.

        :return: the hyperopt task when ``self.hyperopt`` is set, otherwise
            the diarization evaluation task
        """
        tvd = pyannote_workflows.tasks.tvd_dataset
        speech_tasks = pyannote_workflows.tasks.speech

        def mfcc_options(task_name):
            # Collect the six "<task_name>__<option>" parameters as kwargs.
            return {
                option: getattr(self, '%s__%s' % (task_name, option))
                for option in ('e', 'De', 'DDe', 'coefs', 'D', 'DD')
            }

        # --- speech / non-speech -------------------------------------------
        episode_id = dict(tvddir=self.tvddir,
                          series=self.series,
                          season=self.season,
                          episode=self.episode)

        audio_task = self.new_task('audio', tvd.Audio,
                                   **dict(episode_id, language=self.language))
        reference_task = self.new_task('speakerReference', tvd.Speaker,
                                       workdir=self.workdir, **episode_id)

        speech_task = self.new_task('speechReference', tvd.Speech,
                                    to_annotation=True)
        speech_task.in_wav = audio_task.out_put
        speech_task.in_speaker = reference_task.out_put

        # --- linear BIC clustering -----------------------------------------
        linear_features = self.new_task(
            'linearBICClusteringFeatures', speech_tasks.MFCC,
            **mfcc_options('linearBICClusteringFeatures'))
        linear_features.in_audio = audio_task.out_put

        linear_clustering = self.new_task(
            'linearBICClustering',
            speech_tasks.LinearBICClustering,
            max_gap=self.linearBICClustering__max_gap,
            penalty_coef=self.linearBICClustering__penalty_coef,
            covariance_type=self.linearBICClustering__covariance_type)
        linear_clustering.in_segmentation = speech_task.out_put
        linear_clustering.in_features = linear_features.out_put

        # --- BIC clustering ------------------------------------------------
        bic_features = self.new_task(
            'bicClusteringFeatures', speech_tasks.MFCC,
            **mfcc_options('bicClusteringFeatures'))
        bic_features.in_audio = audio_task.out_put

        bic_clustering = self.new_task(
            'bicClustering',
            speech_tasks.BICClustering,
            penalty_coef=self.bicClustering__penalty_coef,
            covariance_type=self.bicClustering__covariance_type)
        bic_clustering.in_segmentation = linear_clustering.out_put
        bic_clustering.in_features = bic_features.out_put

        # --- evaluation ----------------------------------------------------
        evaluation = self.new_task(
            'evaluateDiarization',
            pyannote_workflows.tasks.evaluation.EvaluateDiarizationFast)
        evaluation.in_hypothesis = bic_clustering.out_put
        evaluation.in_reference = reference_task.out_put

        if hasattr(self, 'auto_output'):
            pprint(self.auto_output)

        if self.hyperopt is None:
            return evaluation

        tuning = self.new_task('hyperopt',
                               pyannote_workflows.utils.Hyperopt,
                               temp=self.hyperopt)
        tuning.in_evaluation = evaluation.out_put
        return tuning
class MasscanScan(luigi.Task):
    """ Run ``masscan`` against a target specified via the TargetList Task.

    Note:
        When specified, ``--top_ports`` is processed and then ultimately passed to ``--ports``.

    Install:
        .. code-block:: console

            git clone https://github.com/robertdavidgraham/masscan /tmp/masscan
            make -s -j -C /tmp/masscan
            sudo mv /tmp/masscan/bin/masscan /usr/local/bin/masscan
            rm -rf /tmp/masscan

    Basic Example:
        .. code-block:: console

            masscan -v --open-only --banners --rate 1000 -e tun0 -oJ masscan.tesla.json --ports 80,443,22,21 -iL tesla.ips

    Luigi Example:
        .. code-block:: console

            PYTHONPATH=$(pwd) luigi --local-scheduler --module recon.masscan Masscan --target-file tesla --ports 80,443,22,21

    Args:
        rate: desired rate for transmitting packets (packets per second)
        interface: use the named raw network interface, such as "eth0"
        top_ports: Scan top N most popular ports
        ports: specifies the port(s) to be scanned
        db_location: specifies the path to the database used for storing results *Required by upstream Task*
        target_file: specifies the file on disk containing a list of ips or domains *Required by upstream Task*
        results_dir: specifies the directory on disk to which all Task results are written *Required by upstream Task*
        exempt_list: Path to a file providing blacklisted subdomains, one per line. *Optional by upstream Task*
    """

    rate = luigi.Parameter(default=defaults.get("masscan-rate"))
    interface = luigi.Parameter(default=defaults.get("masscan-iface"))
    top_ports = luigi.IntParameter(default=0)  # IntParameter -> top_ports expected as int
    ports = luigi.Parameter(default="")
    requirements = ["masscan"]
    exception = True

    def __init__(self, *args, **kwargs):
        """ Set up the database manager and the resolved results sub-directory. """
        super().__init__(*args, **kwargs)
        self.db_mgr = pipeline.models.db_manager.DBManager(db_location=self.db_location)
        self.results_subfolder = (Path(self.results_dir) / "masscan-results").expanduser().resolve()

    def output(self):
        """ Returns the target output for this task.

        The output file is ``masscan.json`` inside the ``masscan-results``
        sub-directory of ``results_dir``.

        Returns:
            luigi.local_target.LocalTarget
        """
        new_path = self.results_subfolder / "masscan.json"
        return luigi.LocalTarget(new_path.expanduser().resolve())

    def run(self):
        """ Resolves the port options, yields the upstream tasks and runs masscan.

        This method is a generator: it yields ``TargetList`` (and
        ``ParseAmassOutput`` when hostnames are present in the database)
        before building and executing the masscan command line.

        Raises:
            SystemExit: with code 2 when neither ``ports`` nor ``top_ports`` is given
        """
        meets_requirements(self.requirements, self.exception)

        if not self.ports and not self.top_ports:
            # need at least one, can't be put into argparse scanner because things like amass don't require ports option
            logging.error("Must specify either --top-ports or --ports.")
            # raise SystemExit directly: the ``exit`` builtin is injected by
            # the ``site`` module and is not guaranteed to exist
            raise SystemExit(2)

        if self.top_ports:
            # if --top-ports used, format the top_*_ports lists as strings and then into a proper masscan --ports option
            top_tcp_ports_str = ",".join(str(x) for x in top_tcp_ports[:self.top_ports])
            top_udp_ports_str = ",".join(str(x) for x in top_udp_ports[:self.top_ports])

            self.ports = f"{top_tcp_ports_str},U:{top_udp_ports_str}"
            self.top_ports = 0

        self.results_subfolder.mkdir(parents=True, exist_ok=True)

        yield TargetList(target_file=self.target_file, results_dir=self.results_dir, db_location=self.db_location)

        if self.db_mgr.get_all_hostnames():
            # TargetList generated some domains for us to scan with amass
            yield ParseAmassOutput(
                target_file=self.target_file,
                exempt_list=self.exempt_list,
                results_dir=self.results_dir,
                db_location=self.db_location,
            )

        command = [
            tools.get("masscan").get("path"),
            "-v",
            "--open",
            "--banners",
            "--rate",
            self.rate,
            "-e",
            self.interface,
            "-oJ",
            self.output().path,
            "--ports",
            self.ports,
            "-iL",
        ]

        # masscan only understands how to scan ipv4
        ip_addresses = self.db_mgr.get_all_ipv4_addresses()
        masscan_input_file = None

        if ip_addresses:
            # TargetList generated ip addresses for us to scan with masscan
            masscan_input_file = self.results_subfolder / "input-from-amass"

            with open(masscan_input_file, "w") as f:
                for ip_address in ip_addresses:
                    f.write(f"{ip_address}\n")

            command.append(str(masscan_input_file))

        try:
            subprocess.run(command)  # will fail if no ipv4 addresses were found
        finally:
            # always remove the temporary input file, even when launching
            # masscan raises (e.g. the executable is missing)
            if masscan_input_file is not None:
                masscan_input_file.unlink()
class CombineDataAll(sciluigi.Task):
    """Combine all the relevant outputs into a table."""

    # Upstream targets, assigned by the workflow that wires this task.
    in_counts = None  # mapping: key -> callable returning a counts target
    in_seq = None
    in_tpm = None
    in_bed = None
    in_effect = None

    outdir = luigi.Parameter()
    window_size = luigi.IntParameter()
    temperature = luigi.FloatParameter()

    def out_table(self):
        """Return the target of the combined table.

        The file name is derived from the paths of all ``in_counts`` targets
        and from ``temperature`` (formatted with ``%d``).
        """
        filenames = [target().path for target in self.in_counts.values()]
        basename = (processing.combine_filenames_split(filenames) +
                    '.temp%d.combined_data.gz' % self.temperature)
        return sciluigi.TargetInfo(self, os.path.join(self.outdir, basename))

    def run(self):
        """Load every input table, join them column-wise and write the result.

        Writes a gzip-compressed, tab-separated file to ``out_table()``.
        """
        # make the output directory if it doesn't exist; exist_ok avoids the
        # race between an existence check and the creation
        os.makedirs(self.outdir, exist_ok=True)

        # load counts: one entry per key of in_counts
        counts = {}
        for key, target in self.in_counts.items():
            data_table = pd.read_csv(target().path,
                                     compression='gzip',
                                     index_col=0)
            counts[str(key)] = processing.get_counts_from_counts_table(
                data_table)
        counts = pd.concat(counts).unstack(level=0)

        # load seqdata
        seqdata = pd.read_table(self.in_seq().path,
                                compression='gzip',
                                index_col=0)

        # load seqdata_effects
        seqeffect = pd.read_table(self.in_effect().path,
                                  compression='gzip',
                                  index_col=0)
        noflip_cols = [idx for idx in seqeffect if idx.startswith('noflip')]
        flip_cols = [
            idx for idx in seqeffect
            if idx.startswith(('flip', 'doubleflip'))
        ]
        # the first 'noflip' column is kept on its own, without ensembling
        seqeffect.loc[:, 'ddG_noflip_noens'] = seqeffect.loc[:, noflip_cols[0]]
        seqeffect.loc[:, 'ddG_noflip'] = seqmodel.compute_ensemble_ddG_set(
            seqeffect.loc[:, noflip_cols], self.temperature)
        seqeffect.loc[:, 'ddG_flip'] = seqmodel.compute_ensemble_ddG_set(
            seqeffect.loc[:, flip_cols], self.temperature)
        keep_cols = [idx for idx in seqeffect if idx.startswith('ddG')]

        # load bed data
        beddata = processing.load_bed(
            self.in_bed().path,
            additional_cols=variables.motif_fields_additional).set_index(
                'name')

        # load tpm; .squeeze('columns') replaces the read_table(squeeze=True)
        # keyword, which newer pandas versions no longer accept
        expression = pd.read_table(self.in_tpm().path,
                                   index_col=0).squeeze('columns')

        # combine
        out_data = pd.concat([
            beddata, counts, expression, seqdata, seqeffect.loc[:, keep_cols]
        ],
                             axis=1)
        out_data.loc[:, 'clip_signal_per_tpm'] = (
            out_data.rep1 + out_data.rep2) / out_data.tpm
        out_data.loc[:, 'clip_input_per_tpm'] = (out_data.input) / out_data.tpm
        out_data.to_csv(self.out_table().path, sep='\t', compression='gzip')
class CreateIntraSessionInteractionDataset(BasePySparkTask):
    """Build an intra-session item-pair interaction dataset with Spark.

    Reads the click log at ``BASE_DATASET_FILE``, pairs up items that were
    clicked in the same session and writes four outputs (see ``output``).
    """
    # Number of days, counted back from the latest timestamp, that are kept.
    sample_days: int = luigi.IntParameter(default=16)
    # The next three parameters only appear in the output file names here.
    history_window: int = luigi.IntParameter(default=10)
    size_available_list: int = luigi.IntParameter(default=100)
    minimum_interactions: int = luigi.IntParameter(default=5)
    # Upper bound applied to ``pos_A`` (position of item A in the session).
    max_itens_per_session: int = luigi.IntParameter(default=15)
    # Minimum number of occurrences of an (A, B) pair for it to be kept.
    min_itens_interactions: int = luigi.IntParameter(default=3)
    # Maximum absolute distance between the positions of A and B.
    max_relative_pos: int = luigi.IntParameter(default=3)
    # Passed as ``max_deep`` to ``serach_positive``.
    # NOTE(review): not part of the output file names - confirm intended.
    pos_max_deep: int = luigi.IntParameter(default=1)

    # def requires(self):
    #     return SessionPrepareDataset(sample_days=self.sample_days, history_window=self.history_window, size_available_list=self.size_available_list)

    def output(self):
        """Return a 4-tuple of local targets under ``DATASET_DIR``.

        Index 0: parquet pair dataset, 1: positive-interaction CSV,
        2: item index CSV, 3: session index CSV. All names share the same
        parameter-derived suffix.
        """
        return luigi.LocalTarget(os.path.join(DATASET_DIR, "indexed_intra_session_train_%d_w=%d_l=%d_m=%d_s=%d_i=%d_p=%d" % (self.sample_days, self.history_window, self.size_available_list, self.minimum_interactions, self.max_itens_per_session, self.min_itens_interactions, self.max_relative_pos))),\
                luigi.LocalTarget(os.path.join(DATASET_DIR, "item_positive_interaction_%d_w=%d_l=%d_m=%d_s=%d_i=%d_p=%d.csv" % (self.sample_days, self.history_window, self.size_available_list, self.minimum_interactions, self.max_itens_per_session, self.min_itens_interactions, self.max_relative_pos))),\
                luigi.LocalTarget(os.path.join(DATASET_DIR, "item_id_index_%d_w=%d_l=%d_m=%d_s=%d_i=%d_p=%d.csv" % (self.sample_days, self.history_window, self.size_available_list, self.minimum_interactions, self.max_itens_per_session, self.min_itens_interactions, self.max_relative_pos))),\
                luigi.LocalTarget(os.path.join(DATASET_DIR, "session_index_%d_w=%d_l=%d_m=%d_s=%d_i=%d_p=%d.csv" % (self.sample_days, self.history_window, self.size_available_list, self.minimum_interactions, self.max_itens_per_session, self.min_itens_interactions, self.max_relative_pos)))

    def get_df_tuple_probs(self, df):
        """Count (ItemID_A, ItemID_B) pairs and their share per ItemID_A.

        :param df: DataFrame with ``ItemID_A`` and ``ItemID_B`` columns
        :return: cached DataFrame with columns ``_ItemID_A``, ``_ItemID_B``,
            ``total_ocr_dupla`` (pair count), ``total_ocr`` (count of rows
            for item A) and ``prob`` (pair count / item A count)
        """
        df_tuple_count = df.groupby("ItemID_A", "ItemID_B").count()
        # Rename the key so it does not clash with ItemID_A in the join below.
        df_count = df.groupby("ItemID_A").count()\
            .withColumnRenamed("count", "total")\
            .withColumnRenamed("ItemID_A", "_ItemID_A")
        df_join = df_tuple_count.join(
            df_count, df_tuple_count.ItemID_A == df_count._ItemID_A).cache()
        df_join = df_join.withColumn("prob", col("count") / col("total"))
        # Underscore-prefixed keys let the caller join this back on the
        # original ItemID_A / ItemID_B columns.
        df_join = df_join.select("ItemID_A", 'ItemID_B', 'count', 'total', 'prob')\
            .withColumnRenamed("ItemID_A", "_ItemID_A")\
            .withColumnRenamed("ItemID_B", "_ItemID_B")\
            .withColumnRenamed("count", "total_ocr_dupla")\
            .withColumnRenamed("total", "total_ocr").cache()
        return df_join

    def add_positive_interactions(self, df):
        """Collect, per item, the set of items it was paired with.

        :param df: Spark DataFrame with ``ItemID_A``, ``ItemID_B`` and
            ``total_ocr_dupla`` columns
        :return: pandas DataFrame indexed by ``ItemID`` with ``sub_a_b`` (set
            of paired items) and ``sub_a_b_all`` (unique values returned by
            ``serach_positive`` for that item)
        """
        # Keep pairs for positive interactions.
        # NOTE(review): the original comment said "more than 1 occurrence",
        # but the comparison is ">= 1" - confirm which is intended.
        df = df.filter(col("total_ocr_dupla") >= 1)
        df = df\
            .groupby("ItemID_A")\
            .agg(F.collect_set("ItemID_B").alias("sub_a_b"))
        # df_b = df\
        #     .groupby("ItemID_B")\
        #     .agg(F.collect_set("ItemID_A").alias("sub_b"))
        # df = df.join(df_a, "ItemID_A").join(df_b, "ItemID_B").cache()
        # concat_int_arrays = concat(IntegerType())
        # df = df.withColumn("sub_a_b", concat_int_arrays("sub_a", "sub_b"))#.show(truncate=False)
        # return df
        df = df.withColumnRenamed("ItemID_A", "ItemID")
        #df = df.withColumn("ItemID_COPY",df.ItemID)
        # From here on ``df`` is a pandas DataFrame held on the driver.
        df = df.toPandas().set_index('ItemID')
        print(df)
        sub_pos = []
        for i, row in tqdm(df.iterrows(), total=df.shape[0]):
            # ``serach_positive`` is defined elsewhere in this file; a fresh
            # ``list_pos`` is passed for every row.
            l = serach_positive(row.name,
                                df,
                                max_deep=self.pos_max_deep,
                                deep=0,
                                list_pos=[])
            sub_pos.append(list(np.unique(l)))
        df['sub_a_b_all'] = sub_pos
        return df

    def main(self, sc: SparkContext, *args):
        """Run the Spark job and write all four outputs.

        :param sc: Spark context used to create the ``SparkSession``
        :param args: unused
        """
        os.makedirs(DATASET_DIR, exist_ok=True)
        # params
        min_itens_per_session = 2
        # NOTE(review): this local is never read; the filter further down
        # uses self.max_itens_per_session directly.
        max_itens_per_session = self.max_itens_per_session
        min_itens_interactions = self.min_itens_interactions
        # Pair (tuple) interactions
        max_relative_pos = self.max_relative_pos
        spark = SparkSession(sc)
        df = spark.read.csv(BASE_DATASET_FILE, header=True, inferSchema=True)
        # Timestamp_ is divided by 1000 before from_unixtime; presumably it
        # is in milliseconds - confirm against the dataset.
        df = df.withColumnRenamed("session_id", "SessionID")\
            .withColumnRenamed("click_timestamp", "Timestamp_")\
            .withColumnRenamed("click_article_id", "ItemID")\
            .withColumn("Timestamp",F.from_unixtime(col("Timestamp_")/lit(1000)).cast("timestamp"))\
            .orderBy(col('Timestamp')).select("SessionID", "ItemID", "Timestamp", "Timestamp_")
        # NOTE(review): show() does its own printing; the surrounding print()
        # echoes its return value (likely None) - confirm and simplify.
        print(df.show(2))
        # Hard-coded upper bound on the click timestamp.
        dt = datetime.strptime('2017-10-16 20:59:59', '%Y-%m-%d %H:%M:%S')
        df = df.filter(col('Timestamp') < dt)
        # Drop duplicate item in that same session
        df = df.dropDuplicates(['SessionID', 'ItemID'])
        # filter date
        # NOTE(review): ``max``, ``count`` and ``abs`` are applied to columns
        # here, so they are presumably the pyspark.sql.functions versions
        # imported at file level - confirm.
        max_timestamp = df.select(max(
            col('Timestamp'))).collect()[0]['max(Timestamp)']
        init_timestamp = max_timestamp - timedelta(days=self.sample_days)
        print("Timestamp:", max_timestamp, init_timestamp)
        df = df.filter(col('Timestamp') >= init_timestamp).cache()
        # One row per session: latest timestamp, item list and item count.
        df = df.groupby("SessionID").agg(
            max("Timestamp").alias("Timestamp"),
            collect_list("ItemID").alias("ItemIDs"),
            count("ItemID").alias("total"))
        # Filter Interactions
        df = df.filter(df.total >= min_itens_per_session).cache()
        # Filter position in list: (session, pos, col) for every item
        df_pos = df.select(
            col('SessionID').alias('_SessionID'), posexplode(df.ItemIDs))
        # Explode A
        df = df.withColumn("ItemID_A", explode(df.ItemIDs))
        df = df.join(df_pos, (df.SessionID == df_pos._SessionID) & (df.ItemID_A == df_pos.col))\
            .select('SessionID', 'Timestamp', 'ItemID_A', 'pos', 'ItemIDs')\
            .withColumnRenamed('pos', 'pos_A')
        # Explode B
        df = df.withColumn("ItemID_B", explode(df.ItemIDs))
        df = df.join(df_pos, (df.SessionID == df_pos._SessionID) & (df.ItemID_B == df_pos.col))\
            .withColumnRenamed('pos', 'pos_B')
        df = df.withColumn("relative_pos", abs(df.pos_A - df.pos_B))
        # Filter distincts and drop self-pairs
        df = df.select('SessionID', 'Timestamp', 'ItemID_A', 'pos_A', 'ItemID_B', 'pos_B', 'relative_pos')\
            .distinct()\
            .filter(df.ItemID_A != df.ItemID_B).cache()
        # # Filter duplicates
        # udf_join = F.udf(lambda s,x,y : "_".join(sorted([str(s), str(x),str(y)])) , StringType())
        # df = df.withColumn('key', udf_join('SessionID', 'ItemID_A','ItemID_B'))
        # df = df.dropDuplicates(["key"])
        # Calculate pair occurrence statistics and attach them to every pair
        df_probs = self.get_df_tuple_probs(df)
        df = df.join(df_probs, (df.ItemID_A == df_probs._ItemID_A) &
                     (df.ItemID_B == df_probs._ItemID_B))
        print(df.show(2))
        print(df.count())
        # Add positive interactions (computed before the confidence filter)
        df_positive = self.add_positive_interactions(df)
        # Filter confidence
        df = df.filter(col("total_ocr_dupla") >= min_itens_interactions)\
            .filter(col("relative_pos") <= max_relative_pos)\
            .filter(col("pos_A") <= self.max_itens_per_session)
        # df = df.select("SessionID", 'Timestamp', 'ItemID_A', 'pos_A',
        #                 'ItemID_B', 'pos_B', 'relative_pos',
        #                 'total_ocr', 'total_ocr_dupla', 'prob', 'sub_a_b')\
        #     .dropDuplicates(['ItemID_A', 'ItemID_B', 'relative_pos']) # TODO is it right?
        df = df.select("SessionID", 'Timestamp', 'ItemID_A', 'ItemID_B', 'relative_pos', 'total_ocr', 'total_ocr_dupla')\
            .dropDuplicates(['ItemID_A', 'ItemID_B', 'relative_pos']) # TODO is it right?
        # Index files: the pandas row index is written as the index column.
        df.select("ItemID_A").dropDuplicates().toPandas().to_csv(
            self.output()[2].path, index_label="item_idx")
        df.select("SessionID").dropDuplicates().toPandas().to_csv(
            self.output()[3].path, index_label="session_idx")
        df.write.parquet(self.output()[0].path)
        df_positive.to_csv(self.output()[1].path)
class MyWorkflow(sciluigi.WorkflowTask):
    """CLIP analysis workflow starting from a known BED file of sites.

    For every site the workflow gathers the CLIP signal (two replicates plus
    input), transcript expression, the underlying sequence and the predicted
    sequence effect, and merges everything in a final ``CombineDataAll``
    task, which is returned by :meth:`workflow`.
    """

    # Root directory below which every task writes its results.
    outdir = luigi.Parameter()
    cores = luigi.IntParameter(default=1)

    # Genome data.
    genome = luigi.Parameter(default='hg38')
    genome_fasta = luigi.Parameter(default='/shr/genomes/fasta/hg38/hg38.fa')
    genome_size = luigi.Parameter(default='/shr/gSizes/hg38.genomsize')

    # CLIP input data.
    input_bam = luigi.Parameter(
        default='CLIP/hPUM2/bams/input.ENCFF786ZZB.bam')
    rep1_bam = luigi.Parameter(default='CLIP/hPUM2/bams/rep1.ENCFF231WHF.bam')
    rep2_bam = luigi.Parameter(default='CLIP/hPUM2/bams/rep2.ENCFF732EQX.bam')

    # CLIP processing inputs.
    input_bed = luigi.Parameter()
    len_consensus_seq = luigi.IntParameter(default=11)
    check_for_seq = luigi.Parameter(default='TGTA')
    window_size = luigi.IntParameter(default=500)
    temperature = luigi.IntParameter(default=0)

    # RNAMap input data.
    model_param_basename = luigi.Parameter(
        default='annotations/RNAmap/qMotif_20180302_')

    # Transcript data.
    tpm_cutoff = luigi.FloatParameter(default=0.01)
    transcript_bed = luigi.Parameter(
        default='annotations/refseq/hg38_refGene.transcripts.st.bed')
    biomart_file = luigi.Parameter(
        default='annotations/ensemble_gene_converter_biomart.txt')

    def workflow(self):
        """Create and connect all tasks; return the final combine task."""
        # ---- CLIP bams ----------------------------------------------------
        # TODO: download CLIP data
        bam_sources = (
            ('rep1', self.rep1_bam),
            ('rep2', self.rep2_bam),
            ('input', self.input_bam),
        )
        bam_dir = os.path.join(self.outdir, 'bams')
        bam_tasks = {}
        read_count_tasks = {}
        for sample, bam_path in bam_sources:
            bam_task = self.new_task('processclipbam_%s' % sample,
                                     ProcessRawClipBam,
                                     bamfile=bam_path,
                                     outdir=bam_dir)
            bam_tasks[sample] = bam_task
            counter = self.new_task('findtotalreads_%s' % sample,
                                    FindTotalReads)
            read_count_tasks[sample] = counter
            counter.in_bam = bam_task.out_bam

        # ---- per-strand bedgraphs of the CLIP data ------------------------
        bedgraph_dir = os.path.join(self.outdir, 'clip', 'bedgraphs')
        bedgraph_tasks = {}
        for sample, bam_task in bam_tasks.items():
            bedgraph = self.new_task('getbedgraphs_%s' % sample,
                                     GetBedGraphFromBam,
                                     outdir=bedgraph_dir,
                                     genome_size=self.genome_size)
            bedgraph_tasks[sample] = bedgraph
            bedgraph.in_bam = bam_task.out_bam

        # ---- RNA-seq data -------------------------------------------------
        rna_task = self.new_task('downloadrna',
                                 DownloadRNAseq,
                                 outdir=os.path.join(self.outdir,
                                                     'expression'))

        # ---- known BED file of sites --------------------------------------
        bed_task = self.new_task('getbed',
                                 scltasks.FilenameToTaskOutput,
                                 filename=self.input_bed)
        split_idx = 0
        split_dir = 'split_%d' % split_idx

        # split the bed file by strand
        strand_split = self.new_task('splitbedfile_%d' % split_idx,
                                     DividBedByStrand)
        strand_split.in_bed_file = bed_task.out_file

        # ---- CLIP signal at each site -------------------------------------
        strand_dir = os.path.join(self.outdir, 'clip', split_dir, 'strands')

        def clip_signal(task_name, bed_target, bedgraph_target):
            # One GetClipSignal task reading a strand-specific bed/bedgraph.
            signal = self.new_task(task_name,
                                   GetClipSignal,
                                   window_size=self.window_size,
                                   genome_size=self.genome_size,
                                   outdir=strand_dir)
            signal.in_bed_file = bed_target
            signal.in_bg_file = bedgraph_target
            return signal

        combined_by_sample = {}
        for sample, bedgraph in bedgraph_tasks.items():
            plus_signal = clip_signal(
                'getclipsignalplus_%s_%d' % (sample, split_idx),
                strand_split.out_bed_plus, bedgraph.out_bg_plus)
            minus_signal = clip_signal(
                'getclipsignalminus_%s_%d' % (sample, split_idx),
                strand_split.out_bed_minus, bedgraph.out_bg_minus)

            # merge both strands
            merged = self.new_task('combinestrands_%s_%d' %
                                   (sample, split_idx),
                                   CombineStrandData,
                                   outdir=os.path.join(self.outdir, 'clip',
                                                       split_dir))
            merged.in_datafiles = [
                plus_signal.out_signal, minus_signal.out_signal
            ]
            combined_by_sample[sample] = merged

        # ---- transcript expression at the motif sites ---------------------
        tpm_task = self.new_task('findmotiftpm_%d' % split_idx,
                                 ProcessRNASeq,
                                 biomart_file=self.biomart_file,
                                 outdir=os.path.join(self.outdir,
                                                     'expression', split_dir))
        tpm_task.in_bed = bed_task.out_file
        tpm_task.in_rna1 = rna_task.out_rna1
        tpm_task.in_rna2 = rna_task.out_rna2

        # ---- sequence at the motif sites ----------------------------------
        seq_dir = os.path.join(self.outdir, 'sequences', split_dir)
        sequence_task = self.new_task('findsequence_%d' % split_idx,
                                      scltasks.FindSequence,
                                      genome_fasta=self.genome_fasta,
                                      window_size=self.window_size,
                                      outdir=seq_dir)
        sequence_task.in_bed = bed_task.out_file

        seqdata_task = self.new_task('findseqdata_%d' % split_idx,
                                     FindMotifSequenceData,
                                     seq_length=self.len_consensus_seq,
                                     check_for_seq=self.check_for_seq,
                                     window_size=self.window_size,
                                     outdir=seq_dir)
        seqdata_task.in_fasta = sequence_task.out_fasta

        # ---- predicted effects at the motif sites -------------------------
        effect_task = self.new_task(
            'findeffect_%d' % split_idx,
            FindPredictedSeqEffect,
            outdir=os.path.join(self.outdir, 'effects',
                                'temp_%d' % (self.temperature), split_dir),
            model_param_basename=self.model_param_basename,
            temperature=self.temperature)
        effect_task.in_seqdata = seqdata_task.out_seqdata

        # ---- combine everything -------------------------------------------
        final_task = self.new_task('combinedata_%d' % split_idx,
                                   CombineDataAll,
                                   window_size=self.window_size,
                                   outdir=os.path.join(
                                       self.outdir, 'output', split_dir),
                                   temperature=self.temperature)
        final_task.in_counts = {
            sample: merged.out_signal
            for sample, merged in combined_by_sample.items()
        }
        final_task.in_seq = seqdata_task.out_seqdata
        final_task.in_tpm = tpm_task.out_motif_tpm
        final_task.in_bed = bed_task.out_file
        final_task.in_effect = effect_task.out_seqdata
        return final_task
class SessionPrepareDataset(BasePySparkTask):
    """Prepare a per-click session dataset and write it as a single CSV.

    Each output row is one click, enriched with the padded item history of
    its session, a list of available items and a constant ``visit`` column.
    """
    sample_days: int = luigi.IntParameter(default=16)
    history_window: int = luigi.IntParameter(default=10)
    size_available_list: int = luigi.IntParameter(default=100)
    minimum_interactions: int = luigi.IntParameter(default=5)

    def output(self):
        """Return the local CSV target, named after the task parameters."""
        file_name = "dataset_prepared_sample={}_win={}_list={}_min_i={}.csv".format(
            self.sample_days, self.history_window, self.size_available_list,
            self.minimum_interactions)
        return luigi.LocalTarget(os.path.join(DATASET_DIR, file_name))

    def add_history(self, df):
        """Add ``ItemIDHistory``: items collected over the session window.

        Rows whose history holds fewer than two items are dropped, then the
        history is passed through ``pad_history`` with ``history_window``.
        """
        session_window = Window.partitionBy('SessionID').orderBy('Timestamp')
        collected = F.collect_list('ItemID').over(session_window)
        with_history = df.withColumn('ItemIDHistory', collected)
        with_history = with_history.where(size(col("ItemIDHistory")) >= 2)
        padded = pad_history(with_history.ItemIDHistory,
                             lit(self.history_window))
        return with_history.withColumn('ItemIDHistory', padded)

    def filter(self, df):
        """Restrict ``df`` to recent clicks, frequent items and real sessions.

        Keeps the last ``sample_days`` days (relative to the newest
        timestamp), items with at least ``minimum_interactions`` clicks and
        sessions with at least two clicks.
        """
        # date window
        newest_row = df.select(max(col('Timestamp'))).collect()[0]
        newest = newest_row['max(Timestamp)']
        oldest_allowed = newest - timedelta(days=self.sample_days)
        recent = df.filter(col('Timestamp') >= oldest_allowed).cache()

        # items with enough interactions
        item_counts = recent.groupBy("ItemID").count()
        frequent_items = item_counts.filter(
            col('count') >= self.minimum_interactions)

        # sessions with at least two clicks
        session_counts = recent.groupBy("SessionID").count()
        real_sessions = session_counts.filter(col('count') >= 2)

        by_item = recent.join(frequent_items, "ItemID", how="inner")
        return by_item.join(real_sessions, "SessionID", how="inner")

    def add_available_items(self, df):
        """Add ``AvailableItems`` using ``udf_sample_items`` over all items."""
        distinct_items = df.select("ItemID").dropDuplicates().toPandas()
        all_items = list(distinct_items["ItemID"])
        sampler = udf_sample_items(all_items, self.size_available_list)
        return df.withColumn('AvailableItems', sampler(col("ItemID")))

    def main(self, sc: SparkContext, *args):
        """Run the preparation job and write the CSV at ``output()``."""
        os.makedirs(DATASET_DIR, exist_ok=True)
        session = SparkSession(sc)
        raw = session.read.csv(BASE_DATASET_FILE,
                               header=True,
                               inferSchema=True)

        renamed = (raw.withColumnRenamed("session_id", "SessionID")
                   .withColumnRenamed("click_timestamp", "Timestamp_")
                   .withColumnRenamed("click_article_id", "ItemID"))
        timestamp = F.from_unixtime(col("Timestamp_") /
                                    lit(1000)).cast("timestamp")
        ordered = (renamed.withColumn("Timestamp", timestamp)
                   .orderBy(col('Timestamp'))
                   .select("SessionID", "ItemID", "Timestamp", "Timestamp_"))
        # NOTE(review): hour "24" is not a valid time of day; how Spark
        # treats this literal may depend on the Spark version - confirm
        # the intended cutoff.
        clicks = ordered.filter(col('Timestamp') < '2017-10-16 24:59:59')

        # an item counts once per session
        clicks = clicks.dropDuplicates(['SessionID', 'ItemID'])

        prepared = self.filter(clicks)
        prepared = self.add_history(prepared)
        prepared = self.add_available_items(prepared)
        prepared = prepared.withColumn('visit', lit(1))
        prepared.toPandas().to_csv(self.output().path, index=False)
class KillOpenRedshiftSessions(luigi.Task):
    """
    A task for killing any open Redshift sessions
    in a given database. This is necessary to prevent open user sessions
    with transactions against the table from blocking drop or truncate
    table commands.

    Usage:

    Subclass and override the required `host`, `database`,
    `user`, and `password` attributes.
    """

    # time in seconds to wait before
    # reconnecting to Redshift if our session is killed too.
    # 30 seconds is usually fine; 60 is conservative
    connection_reset_wait_seconds = luigi.IntParameter(default=60)

    @abc.abstractproperty
    def host(self):
        return None

    @abc.abstractproperty
    def database(self):
        return None

    @abc.abstractproperty
    def user(self):
        return None

    @abc.abstractproperty
    def password(self):
        return None

    @property
    def update_id(self):
        """
        Identifier recorded in the marker table for this run;
        it is the task id.
        """
        return self.task_id

    def output(self):
        """
        Returns a RedshiftTarget used as the completion marker for this task.

        Normally you don't override this.
        """
        # uses class name as a meta-table
        return RedshiftTarget(host=self.host,
                              database=self.database,
                              user=self.user,
                              password=self.password,
                              table=self.__class__.__name__,
                              update_id=self.update_id)

    def run(self):
        """
        Kill any open Redshift sessions for the given database.

        If the kill query also terminates our own session (reported as a
        DatabaseError mentioning 'EOF'), wait
        ``connection_reset_wait_seconds`` and reconnect before marking the
        task as done. Any other DatabaseError is re-raised.
        """
        connection = self.output().connect()
        # kill any sessions other than ours and
        # internal Redshift sessions (rdsdb)
        query = ("select pg_terminate_backend(process) "
                 "from STV_SESSIONS "
                 "where db_name=%s "
                 "and user_name != 'rdsdb' "
                 "and process != pg_backend_pid()")
        cursor = connection.cursor()
        logger.info('Killing all open Redshift sessions for database: %s',
                    self.database)
        try:
            cursor.execute(query, (self.database, ))
            cursor.close()
            connection.commit()
        except psycopg2.DatabaseError as e:
            # Use str(e): exceptions have no ``message`` attribute on
            # Python 3, so reading e.message would raise AttributeError here
            # and mask the original error.
            if 'EOF' in str(e):
                # sometimes this operation kills the current session.
                # rebuild the connection. Need to pause for 30-60 seconds
                # before Redshift will allow us back in.
                connection.close()
                logger.info(
                    'Pausing %s seconds for Redshift to reset connection',
                    self.connection_reset_wait_seconds)
                time.sleep(self.connection_reset_wait_seconds)
                logger.info('Reconnecting to Redshift')
                connection = self.output().connect()
            else:
                raise

        try:
            self.output().touch(connection)
            connection.commit()
        finally:
            connection.close()

        logger.info('Done killing all open Redshift sessions for database: %s',
                    self.database)