def run(self, input_files, metadata, output_files):
    """
    Main run function for processing MNase-Seq FastQ data.

    The pipeline aligns the FASTQ reads to the genome using BWA. iNPS is
    then used for peak calling to identify nucleosome position sites
    within the genome.

    Parameters
    ----------
    input_files : dict
        Input file locations keyed by role. The "genome", "loc" and
        "index" roles are forwarded to the BWA aligner.
    metadata : dict
        Metadata for the input files, keyed by the same roles.
    output_files : dict
        Requested output locations; the "bam" and "bed" roles are read.

    Returns
    -------
    output_files : dict
        The output locations exactly as passed in by the caller.
    output_metadata : dict
        Metadata for the "bam" and "bed" results that were produced.
    """
    output_metadata = {}
    output_files_generated = {}

    # Step 1: alignment
    bwa = bwaAlignerTool()
    bwa_files, bwa_meta = bwa.run(
        remap(input_files, "genome", "loc", "index"),
        remap(metadata, "genome", "loc", "index"),
        {"output": output_files["bam"]}
    )

    try:
        output_files_generated["bam"] = bwa_files["bam"]
        output_metadata["bam"] = bwa_meta["bam"]

        # Keep the name of the underlying tool as the description and tag
        # the result with the name of this workflow.
        tool_name = output_metadata['bam'].meta_data['tool']
        output_metadata['bam'].meta_data['tool_description'] = tool_name
        output_metadata['bam'].meta_data['tool'] = "process_mnaseseq"
    except KeyError:
        logger.fatal("BWA Alignment failed")

    # Step 2: nucleosome peak calling on the aligned reads
    inps_tool = inps()
    inps_files, inps_meta = inps_tool.run(
        remap(bwa_files, "bam"),
        remap(bwa_meta, "bam"),
        {"bed": output_files["bed"]}
    )

    try:
        output_files_generated["bed"] = inps_files["bed"]
        output_metadata["bed"] = inps_meta["bed"]

        tool_name = output_metadata['bed'].meta_data['tool']
        output_metadata['bed'].meta_data['tool_description'] = tool_name
        output_metadata['bed'].meta_data['tool'] = "process_mnaseseq"
    except KeyError:
        # This branch reports a failure of the iNPS step, not of BWA.
        logger.fatal("iNPS peak calling failed")

    print("MNASESEQ RESULTS:", output_metadata)

    # NOTE(review): the requested locations are returned rather than
    # output_files_generated, so a failed step is not reflected in the
    # returned files - confirm that callers rely on this before changing.
    return output_files, output_metadata
def run(self, input_files, metadata, output_files):
    """
    Chain two runs of SimpleTool1 and one run of SimpleTool2.

    SimpleTool1 is run once on the "number1" input and once on the
    "number2" input; both intermediate results are then handed to
    SimpleTool2, whose outputs are the outputs of the workflow.

    Parameters
    ----------
    input_files : dict
        Exactly two entries; the roles "number1" and "number2" are used.
    metadata : dict
        Exactly two entries, keyed like ``input_files``.
    output_files : dict
        Output locations, passed unchanged to SimpleTool2.

    Returns
    -------
    tuple of (dict, dict)
        Files and metadata returned by SimpleTool2, or two empty dicts when
        any of the three tool runs raises.
    """
    logger.info("\t0. perform checks")
    assert len(input_files.keys()) == 2
    assert len(metadata.keys()) == 2

    logger.info("\t1.a Instantiate Tool 1 and run")
    first_stage = SimpleTool1(self.configuration)
    try:
        # remap presents role "number1" to the tool under the role "input"
        stage_files = remap(input_files, input="number1")
        stage_meta = remap(metadata, input="number1")
        # Intermediate result is written to a temporary file name
        files_a, meta_a = first_stage.run(
            stage_files, stage_meta, {"output": 'file1.out'})
    except Exception as err:  # pylint: disable=broad-except
        logger.fatal("Tool 1, run 1 failed: {}", err)
        return {}, {}
    logger.progress(50)  # out of 100

    logger.info("\t1.b (Instantiate Tool) and run")
    try:
        # Same tool instance, this time fed from role "number2"
        stage_files = remap(input_files, input="number2")
        stage_meta = remap(metadata, input="number2")
        files_b, meta_b = first_stage.run(
            stage_files, stage_meta, {"output": 'file2.out'})
    except Exception as err:  # pylint: disable=broad-except
        logger.fatal("Tool 1, run 2 failed: {}", err)
        return {}, {}
    logger.progress(75)  # out of 100

    logger.info("\t2. Instantiate Tool and run")
    second_stage = SimpleTool2(self.configuration)
    try:
        # No remap here: the input dicts are rebuilt by hand to assign the
        # two intermediate results to the roles "input1" and "input2".
        combined_files = {
            "input1": files_a["output"],
            "input2": files_b["output"]
        }
        combined_meta = {
            "input1": meta_a["output"],
            "input2": meta_b["output"]
        }
        # The workflow outputs are the outputs of this tool
        final_files, final_meta = second_stage.run(
            combined_files, combined_meta, output_files)
    except Exception as err:  # pylint: disable=broad-except
        logger.fatal("Tool 2 failed: {}", err)
        return {}, {}
    logger.progress(100)  # out of 100

    logger.info("\t4. Optionally edit the output metadata")

    logger.info("\t5. Return")
    return final_files, final_meta
def run(self, input_files, metadata, output_files):
    """
    Main run function to sort and index a GFF3 file for the RESTful API.

    The file is indexed in 2 different ways to allow for optimal data
    retrieval. The first is as a Tabix file, which allows the data to be
    easily extracted as GFF3 documents and served to the user. The second
    is as an HDF5 file that is used to identify which files have
    information at a given location. This helps the REST clients make only
    the required calls to the relevant GFF3 files rather than needing to
    poll all potential GFF3 files.

    Parameters
    ----------
    input_files : dict
        Input locations; the roles "gff3", "chrom_size" and "hdf5_file"
        are read.
    metadata : dict
        Metadata for the inputs, keyed by the same roles.
    output_files : dict
        Output locations; the roles "sorted_gff3", "gz_file", "tbi_file"
        and "hdf5_file" are read.

    Returns
    -------
    output_files_generated : dict
        Results of the sort step merged with the results of the index step.
    output_metadata : dict
        Metadata of both steps, merged the same way.
    """
    # Ensure that the file exists: append mode creates it when missing and
    # leaves existing content untouched.
    with h5py.File(input_files["hdf5_file"], "a"):
        pass

    # GFF3 Sort
    gst = gff3SortTool()
    gst_files, gst_meta = gst.run(
        remap(input_files, "gff3"),
        remap(metadata, "gff3"),
        remap(output_files, "sorted_gff3")
    )

    # GFF3 Indexer
    tbi = gff3IndexerTool()
    tbi_files, tbi_meta = tbi.run(
        {
            "gff3": gst_files["sorted_gff3"],
            "chrom_size": input_files["chrom_size"],
            "hdf5_file": input_files["hdf5_file"]
        }, {
            "gff3": gst_meta["sorted_gff3"],
            "chrom_size": metadata["chrom_size"],
            "hdf5_file": metadata["hdf5_file"]
        }, {
            "gz_file": output_files["gz_file"],
            "tbi_file": output_files["tbi_file"],
            "hdf5_file": output_files["hdf5_file"]
        }
    )

    # The tool results are role-keyed mappings, which do not support "+";
    # merge them explicitly (indexer entries win on a role clash).
    output_files_generated = dict(gst_files)
    output_files_generated.update(tbi_files)
    output_metadata = dict(gst_meta)
    output_metadata.update(tbi_meta)

    return (output_files_generated, output_metadata)
def run(self, input_files, metadata, output_files):
    """
    Main run function to sort and index a BED file for the RESTful API.

    BED files are indexed in 2 different ways to allow for optimal data
    retrieval. The first is as a bigbed file, which allows the data to be
    easily extracted as BED documents and served to the user. The second
    is as an HDF5 file that is used to identify which bed files have
    information at a given location. This helps the REST clients make only
    the required calls to the relevant BED files rather than needing to
    poll all potential BED files.

    Parameters
    ----------
    input_files : dict
        Input locations; the roles "bed", "chrom_size" and "hdf5_file" are
        read.
    metadata : dict
        Metadata for the inputs, keyed by the same roles.
    output_files : dict
        Output locations; the roles "sorted_bed", "bb_file" and
        "hdf5_file" are read.

    Returns
    -------
    output_files_generated : dict
        Results of the sort step merged with the results of the index step.
    output_metadata : dict
        Metadata of both steps, merged the same way.
    """
    # Ensure that the file exists: append mode creates it when missing and
    # leaves existing content untouched.
    with h5py.File(input_files["hdf5_file"], "a"):
        pass

    # Bed Sort
    bst = bedSortTool()
    bst_files, bst_meta = bst.run(
        remap(input_files, "bed"),
        remap(metadata, "bed"),
        remap(output_files, "sorted_bed")
    )

    # Bed Indexer
    bit = bedIndexerTool()
    bit_files, bit_meta = bit.run(
        {
            "bed": bst_files["sorted_bed"],
            "chrom_size": input_files["chrom_size"],
            "hdf5_file": input_files["hdf5_file"]
        }, {
            "bed": bst_meta["sorted_bed"],
            "chrom_size": metadata["chrom_size"],
            "hdf5_file": metadata["hdf5_file"]
        }, {
            "bb_file": output_files["bb_file"],
            "hdf5_file": output_files["hdf5_file"]
        }
    )

    # The tool results are role-keyed mappings, which do not support "+";
    # merge them explicitly (indexer entries win on a role clash).
    output_files_generated = dict(bst_files)
    output_files_generated.update(bit_files)
    output_metadata = dict(bst_meta)
    output_metadata.update(bit_meta)

    return (output_files_generated, output_metadata)
def history_policy_model(state, history, t, discount=1.0, discount_factor=0.95, max_depth=10):
    """
    Sample an action for ``state`` at step ``t``, conditioned on a history.

    ``history`` is a string; it is used as the scope prefix for everything
    sampled in this call. Each action is weighted by the exponential of the
    value reported by ``history_value_model``; the weights are rescaled
    with ``remap`` from [min, max] to [0, 1] and an action is drawn from a
    categorical distribution over them.

    NOTE(review): beyond ``max_depth`` the raw sample (an index drawn with
    equal weights) is returned, whereas the normal path returns an element
    of ``actions`` - confirm that callers expect both.
    """
    with scope(prefix=history):
        site = "a%d" % t

        # Recursion cut-off: choose among the actions with equal weights
        if t > max_depth:
            return pyro.sample(site, dist.Categorical(torch.ones(len(actions))))

        weights = torch.zeros(len(actions))
        for slot, candidate in enumerate(actions):
            with scope(prefix="%s%d" % (candidate, t)):
                estimate = history_value_model(state, candidate, history, t,
                                               discount=discount,
                                               discount_factor=discount_factor,
                                               max_depth=max_depth)
                weights[slot] = torch.exp(estimate)

        # Rescale so that the smallest weight maps to 0 and the largest to 1
        lowest = torch.min(weights)
        highest = torch.max(weights)
        weights = tensor([remap(w, lowest, highest, 0., 1.) for w in weights])

        choice = pyro.sample(site, dist.Categorical(weights))
        return actions[choice]
def policy_model(state, t, discount=1.0, discount_factor=0.95, max_depth=10, alpha=0.1):
    """
    Sample an action for ``state`` at step ``t`` (a draw from Pr(a|s)).

    Each action is weighted by ``exp(alpha * value)`` with the value taken
    from ``value_model``; the weights are rescaled with ``remap`` from
    [min, max] to [0, 1] and an action is drawn from a categorical
    distribution over them.

    NOTE(review): beyond ``max_depth`` the raw sample (an index drawn from
    three equal weights) is returned, whereas the normal path returns an
    element of ``actions`` - confirm that callers expect both.
    """
    site = "a%d" % t

    # Recursion cut-off: three equally weighted choices
    if t > max_depth:
        return pyro.sample(site, dist.Categorical(tensor([1., 1., 1.])))

    weights = torch.zeros(len(actions))
    for slot, candidate in enumerate(actions):
        with scope(prefix="%s%d" % (candidate, t)):
            estimate = value_model(state, candidate, t,
                                   discount=discount,
                                   discount_factor=discount_factor,
                                   max_depth=max_depth)
            # alpha scales the value before exponentiation
            weights[slot] = torch.exp(alpha * estimate)

    # Rescale so that the smallest weight maps to 0 and the largest to 1
    lowest = torch.min(weights)
    highest = torch.max(weights)
    weights = tensor([remap(w, lowest, highest, 0., 1.) for w in weights])

    choice = pyro.sample(site, dist.Categorical(weights))
    return actions[choice]
def run(self, input_files, metadata, output_files):
    """
    Main run function for generating the index files required by
    BS Seeker2.

    Parameters
    ----------
    input_files : dict
        Input file locations; only the "genome" role is forwarded to the
        indexer.
    metadata : dict
        Input file metadata; only the "genome" role is forwarded.
    output_files : dict
        Output file locations; only the "index" role is forwarded.

    Returns
    -------
    output_files : dict
        The "index" entry reported by the indexer.
    output_metadata : dict
        The metadata of that "index" entry.
    """
    logger.info("WGBS - BS-Seeker2 Index")

    # Build the matching WGBS genome index
    indexer = bssIndexerTool(self.configuration)
    index_files, index_meta = indexer.run(
        remap(input_files, "genome"),
        remap(metadata, "genome"),
        remap(output_files, "index")
    )

    # Only the "index" role is passed on, whatever else the tool reports
    return {"index": index_files["index"]}, {"index": index_meta["index"]}
def drawBar(char, total, current):
    """
    Render ``current`` out of ``total`` as a bar of five cells.

    Each cell has eight fill levels, so the value is mapped onto 0..40.
    Completely filled cells use the last glyph of ``barFill``, a partly
    filled cell uses ``barFill[level]`` and the rest are blanks. The cells
    are handed to ``barStr.format`` in reverse order, after ``char``.
    """
    numsegs = 5
    steps_per_cell = 8  # as determined by box characters
    totalRange = numsegs * steps_per_cell

    # this assumes a start of zero
    scaled = int(utils.remap(current, 0, total, 0, totalRange))
    level = utils.clamp(scaled, 0, totalRange)

    full_cells, remainder = divmod(level, steps_per_cell)

    cells = [barFill[-1] for _ in range(full_cells)]
    if remainder:
        cells.append(barFill[remainder])
    # Pad with blanks up to the fixed number of cells
    cells.extend(' ' * (numsegs - len(cells)))

    return barStr.format(char, *reversed(cells))
def sentiment_to_rating(sent):
    """
    Convert a sentiment value to a movie rating between 0 and 100.

    The value is rescaled from the range spanned by ``sentiment.min()``
    and ``sentiment.max()`` onto 0..100.

    >>> sentiment_to_rating(1)
    100.0
    >>> sentiment_to_rating(-1)
    0.0
    >>> sentiment_to_rating(0)
    50.0
    """
    lowest, highest = sentiment.min(), sentiment.max()
    return remap(sent, lowest, highest, 0, 100)
def arrive(self, target):
    """
    Apply a steering force that moves towards ``target``.

    STEER = DESIRED - VELOCITY

    The desired velocity points at the target; its length is obtained by
    remapping the distance to the target from 0..50 onto 0..maxspeed. The
    resulting steering vector is passed through ``limit_vector`` with
    ``maxforce`` and then to ``apply_force``.
    """
    to_target = target - self.position
    speed = remap(to_target.length(), 0, 50, 0, self.maxspeed)

    try:
        to_target.scale_to_length(speed)
    except (ValueError, ZeroDivisionError):
        # The vector could not be rescaled; the current position is used
        # as the desired vector instead.
        to_target = self.position

    steer = to_target - self.velocity
    # Return value is not used - presumably limits the vector in place
    limit_vector(steer, self.maxforce)
    self.apply_force(steer)
def belief_policy_model(belief, t, discount=1.0, discount_factor=0.95, max_depth=10, bu_nsteps=10, bu_lr=0.1):
    """
    Sample an action for ``belief`` at step ``t``.

    Each action is weighted by the exponential of the value reported by
    ``belief_value_model`` (``bu_nsteps`` and ``bu_lr`` are passed straight
    through to it); the weights are rescaled with ``remap`` from
    [min, max] to [0, 1] and an action is drawn from a categorical
    distribution over them.

    NOTE(review): beyond ``max_depth`` the raw sample (an index drawn from
    three equal weights) is returned, whereas the normal path returns an
    element of ``actions`` - confirm that callers expect both.
    """
    site = "a%d" % t

    # Recursion cut-off: three equally weighted choices
    if t > max_depth:
        return pyro.sample(site, dist.Categorical(tensor([1., 1., 1.])))

    weights = torch.zeros(len(actions))
    for slot, candidate in enumerate(actions):
        with scope(prefix="%s%d" % (candidate, t)):
            estimate = belief_value_model(belief, candidate, t,
                                          discount=discount,
                                          discount_factor=discount_factor,
                                          max_depth=max_depth,
                                          bu_nsteps=bu_nsteps,
                                          bu_lr=bu_lr)
            weights[slot] = torch.exp(estimate)

    # Rescale so that the smallest weight maps to 0 and the largest to 1
    lowest = torch.min(weights)
    highest = torch.max(weights)
    weights = tensor([remap(w, lowest, highest, 0., 1.) for w in weights])

    choice = pyro.sample(site, dist.Categorical(weights))
    return actions[choice]
def run(self, input_files, metadata, output_files):
    """
    Main run function for calling peaks on aligned ChIP-seq reads.

    MACS 2 is used for peak calling to identify transcription factor
    binding sites within the genome. Currently this can only handle a
    single data file and a single background file.

    Parameters
    ----------
    input_files : dict
        Location of the initial input files required by the workflow

        bam : str
            Location of the aligned reads file
        bam_bg : str
            Location of the background aligned reads file [OPTIONAL]
    metadata : dict
        Input file meta data associated with their roles

        bam : Metadata
        bam_bg : Metadata [OPTIONAL]
    output_files : dict
        Output file locations

        narrow_peak : str
        summits : str
        broad_peak : str
        gapped_peak : str

    Returns
    -------
    output_files : dict
        Output file locations associated with their roles. Only the roles
        reported by MACS 2 are present.

        narrow_peak : str
        summits : str
        broad_peak : str
        gapped_peak : str
    output_metadata : dict
        Output metadata for the associated files in output_files
    """
    output_files_generated = {}
    output_metadata = {}

    # MACS2 to call peaks
    macs_caller = macs2(self.configuration)
    macs_inputs = {"bam": input_files["bam"]}
    macs_metadt = {"bam": metadata['bam']}

    # The optional background is added under its own role and takes its
    # metadata from the workflow inputs; it must not replace the sample.
    if "bam_bg" in input_files:
        macs_inputs["bam_bg"] = input_files["bam_bg"]
        macs_metadt["bam_bg"] = metadata['bam_bg']

    peak_roles = ('narrow_peak', 'summits', 'broad_peak', 'gapped_peak')

    m_results_files, m_results_meta = macs_caller.run(
        macs_inputs, macs_metadt,
        # Outputs of the final step may match workflow outputs;
        # Extra entries in output_files will be disregarded.
        remap(output_files, *peak_roles))

    for role in peak_roles:
        if role not in m_results_meta:
            continue

        output_files_generated[role] = m_results_files[role]
        output_metadata[role] = m_results_meta[role]

        # Keep the name of the underlying tool as the description and tag
        # the result with the name of the workflow.
        tool_name = output_metadata[role].meta_data['tool']
        output_metadata[role].meta_data['tool_description'] = tool_name
        output_metadata[role].meta_data['tool'] = "process_chipseq"

    return output_files_generated, output_metadata
def set_raw_state(self, raw):
    """
    Set the state from a raw value.

    ``raw`` is rescaled from the range ``self.min``..``self.max`` onto
    -1..1 and the result is handed to ``set_state``.
    """
    normalised = remap(raw, self.min, self.max, -1, 1)
    self.set_state(normalised)
def set_state(self, val):
    """
    Store ``val`` as the state and forward it to the channel, if any.

    ``val`` is rescaled from -1..1 onto ``self.min``..``self.max`` and
    rounded down; when ``self.channel`` is set, the result is written to
    its ``duty_cycle``.
    """
    self.state = val
    duty = math.floor(remap(val, -1, 1, self.min, self.max))

    if self.channel is None:
        # Nothing to drive; only the stored state changes
        return
    self.channel.duty_cycle = duty
def thumbnail(self, seg=None, overlay=None, colors=None, opacity=0.5,
              step=2, unroll=False):
    """
    Create a thumbnail.

    Parameters
    ----------
    seg : array-like, optional
        Label image drawn on top of the image; it is cast to int. Cannot
        be combined with ``overlay``.
    overlay : array-like, optional
        Image drawn on top of the image through a colormap.
    colors : str or colormap, optional
        A name is resolved with ``utils.get_colormap``. Defaults to 'jet'
        for an overlay and to a random colormap for a segmentation. When
        given without ``seg`` or ``overlay``, the image itself is shown in
        colors on a blank background with full opacity.
    opacity : float
        Weight of the colored layer.
    step : int
        Gap in pixels between the three views of a 3D thumbnail.
    unroll : bool
        For 3D images: if False, the middle slice along each of the three
        axes is shown side by side; if True, every slice along the first
        axis is shown side by side, separated by 2 pixels.

    Returns
    -------
    uint8 array, greyscale without a colored layer and RGB with one.
    None is returned when both ``seg`` and ``overlay`` are given.

    Raises
    ------
    ValueError
        If the image is neither 2D nor 3D.
    """
    img = self.copy('int')
    if overlay is not None and seg is not None:
        print("You cannot specify both seg and overlay")
        return
    if seg is not None:
        seg = seg.astype('int')
    if overlay is not None and colors is None:
        colors = 'jet'  # default colormap
    if colors is not None and seg is None and overlay is None:
        overlay = img.copy()  # we want to see img in colors
        img = zeros(img.get_header(), dtype='uint8')
        opacity = 1.0
    if isinstance(colors, str):
        colors = utils.get_colormap(colors)
    if seg is not None and colors is None:
        colors = utils.random_colormap(seg.max())

    # now, seg == overlay
    if overlay is not None:
        seg = overlay

    if len(img.shape) == 2:
        if seg is not None:
            # Replicate the grey values into three channels
            data = img.get_data('uint8')
            data = data.reshape(data.shape[0], data.shape[1], 1)
            data = np.concatenate([data, data, data], axis=2)
            rgb_overlay = utils.remap(seg, colors)
            # Per-pixel weight of the grey image: 1 where seg is zero,
            # (1 - opacity) everywhere else
            op = ((1 - opacity) * (seg != 0).astype('float')
                  + (seg == 0).astype('float'))
            op = op.reshape(op.shape[0], op.shape[1], 1)
            op = np.concatenate([op, op, op], axis=2)
            img = op * data + opacity * rgb_overlay
        return img.astype('uint8')

    elif len(img.shape) == 3:
        if not unroll:
            # Three square tiles of side `shape`, separated by `step`
            # columns, on a white background. Floor division keeps all
            # offsets and slice indices integral.
            shape = np.array(img.shape).max()
            if seg is None:
                output = np.ones((shape, shape * 3 + step * (3 - 1))) * 255
            else:
                output = np.ones((shape, shape * 3 + step * (3 - 1), 3)) * 255

            # Middle slice along the first axis
            mid0 = img.shape[0] // 2
            offset1 = (shape - img.shape[1]) // 2
            offset2 = (shape - img.shape[2]) // 2
            if seg is None:
                tmp_img = img[mid0, :, :]
            else:
                tmp_img = Image(img[mid0, :, :]).thumbnail(
                    seg[mid0, :, :], colors=colors, opacity=opacity)
            output[offset1:offset1 + img.shape[1],
                   offset2:offset2 + img.shape[2]] = tmp_img

            # Middle slice along the second axis
            mid1 = img.shape[1] // 2
            offset1 = (shape - img.shape[0]) // 2
            offset2 = shape + step + (shape - img.shape[2]) // 2
            if seg is None:
                tmp_img = img[:, mid1, :]
            else:
                tmp_img = Image(img[:, mid1, :]).thumbnail(
                    seg[:, mid1, :], colors=colors, opacity=opacity)
            output[offset1:offset1 + img.shape[0],
                   offset2:offset2 + img.shape[2]] = tmp_img

            # Middle slice along the third axis
            mid2 = img.shape[2] // 2
            offset1 = (shape - img.shape[0]) // 2
            offset2 = 2 * shape + 2 * step + (shape - img.shape[1]) // 2
            if seg is None:
                tmp_img = img[:, :, mid2]
            else:
                tmp_img = Image(img[:, :, mid2]).thumbnail(
                    seg[:, :, mid2], colors=colors, opacity=opacity)
            output[offset1:offset1 + img.shape[0],
                   offset2:offset2 + img.shape[1]] = tmp_img

            return output.astype('uint8')
        else:  # unroll is True
            width = self.shape[2] * self.shape[0] + 2 * (self.shape[0] - 1)
            if seg is None:
                output = np.ones((self.shape[1], width)) * 255
            else:
                output = np.ones((self.shape[1], width, 3)) * 255
            for k in range(self.shape[0]):
                if seg is None:
                    tmp_img = img[k, :, :]
                else:
                    tmp_img = Image(img[k, :, :]).thumbnail(
                        seg[k, :, :], colors=colors, opacity=opacity)
                # Slice k starts after k slices and k gaps of 2 columns
                output[:,
                       k * self.shape[2] + 2 * k:
                       (k + 1) * self.shape[2] + 2 * k] = tmp_img
            return output.astype('uint8')
    else:
        # A bare string cannot be raised; use a real exception type
        raise ValueError("Wrong number of dimensions for thumbnail: "
                         + str(len(self.shape)))
def run(self, input_files, metadata, output_files):
    """
    Main run function for aligning FastQ reads with Bowtie2.

    Currently this can only handle a single data file and a single
    background file.

    Parameters
    ----------
    input_files : dict
        Location of the initial input files required by the workflow

        genome : str
            Genome FASTA file
        index : str
            Location of the archived index files
        loc : str
            Location of the FASTQ reads files
        fastq2 : str
            [OPTIONAL] Location of the FASTQ reads file for paired end
            data. NOTE(review): this role is not forwarded to the aligner
            by this function - confirm whether paired-end input is meant
            to be supported here.
    metadata : dict
        Input file meta data associated with their roles

        genome : str
        index : str
        loc : str
        fastq2 : str
    output_files : dict
        Output file locations

        bam : str
            Output bam file location

    Returns
    -------
    output_files : dict
        Output file locations associated with their roles, for the output

        bam : str
            Aligned FASTQ short read file locations
    output_metadata : dict
        Output metadata for the associated files in output_files

        bam : Metadata
    """
    output_files_generated = {}
    output_metadata = {}

    # The location is formatted into the message; passed as a separate
    # argument without a placeholder it would never be rendered.
    logger.info(
        "PROCESS ALIGNMENT - DEFINED OUTPUT: {}".format(output_files["bam"]))

    bowtie2_handle = bowtie2AlignerTool(self.configuration)
    bowtie2_files, bowtie2_meta = bowtie2_handle.run(
        # ideally parameter "roles" don't change
        remap(input_files, "genome", "loc", "index"),
        remap(metadata, "genome", "loc", "index"),
        {"output": output_files["bam"]})

    try:
        output_files_generated["bam"] = bowtie2_files["bam"]
        output_metadata["bam"] = bowtie2_meta["bam"]

        # Keep the name of the underlying tool as the description.
        # NOTE(review): the workflow tag below says "process_bwa" although
        # this workflow runs Bowtie2; left unchanged because downstream
        # consumers may match on it - confirm.
        tool_name = output_metadata['bam'].meta_data['tool']
        output_metadata['bam'].meta_data['tool_description'] = tool_name
        output_metadata['bam'].meta_data['tool'] = "process_bwa"
    except KeyError:
        logger.fatal("Bowtie2 aligner failed")

    return output_files_generated, output_metadata
def thumbnail(self, seg=None, overlay=None, colors=None, opacity=0.5,
              step=2, unroll=False, index=None):
    """
    Create a thumbnail.

    Parameters
    ----------
    seg : array-like, optional
        Label image drawn on top of the image; it is cast to int. Cannot
        be combined with ``overlay``.
    overlay : array-like, optional
        Image drawn on top of the image through a colormap.
    colors : str or colormap, optional
        A name is resolved with ``utils.get_colormap``. Defaults to 'jet'
        for an overlay and to a random colormap for a segmentation. When
        given without ``seg`` or ``overlay``, the image itself is shown in
        colors on a blank background with full opacity.
    opacity : float
        Weight of the colored layer in the blend.
    step : int
        Gap in pixels between the three views of a 3D thumbnail.
    unroll : bool
        For 3D images: if False, one slice along each of the three axes is
        shown side by side; if True, every slice along the first axis is
        shown side by side, separated by 2 pixels.
    index : sequence of int, optional
        Slice index to show along each axis of a 3D image. Defaults to the
        middle of each axis.

    Returns
    -------
    uint8 array, greyscale without a colored layer and RGB with one.
    None is returned when both ``seg`` and ``overlay`` are given.

    Raises
    ------
    ValueError
        If the image is neither 2D nor 3D.
    """
    img = self.copy('int')
    if overlay is not None and seg is not None:
        print("You cannot specify both seg and overlay")
        return
    if seg is not None:
        seg = seg.astype('int')
    if overlay is not None and colors is None:
        colors = 'jet'  # default colormap
    if colors is not None and seg is None and overlay is None:
        overlay = img.copy()  # we want to see img in colors
        img = zeros(img.get_header(), dtype='uint8')
        opacity = 1.0
    if isinstance(colors, str):
        colors = utils.get_colormap(colors)
    if seg is not None and colors is None:
        colors = utils.random_colormap(seg.max())
    if index is None:
        # Floor division keeps the indices integral under true division
        index = np.array(img.shape).astype('int32') // 2

    # now, seg == overlay
    if overlay is not None:
        seg = overlay

    if len(img.shape) == 2:
        if seg is not None:
            # Replicate the grey values into three channels
            data = img.get_data('uint8')
            data = data.reshape(data.shape[0], data.shape[1], 1)
            data = np.concatenate([data, data, data], axis=2)
            rgb_overlay = utils.remap(seg, colors)
            # Uniform blend over the whole image, including pixels where
            # seg is zero
            img = (1 - opacity) * data + opacity * rgb_overlay
        return img.astype('uint8')

    elif len(img.shape) == 3:
        if not unroll:
            # Three square tiles of side `shape`, separated by `step`
            # columns, on a white background
            shape = np.array(img.shape).max()
            if seg is None:
                output = np.ones((shape, shape * 3 + step * (3 - 1))) * 255
            else:
                output = np.ones(
                    (shape, shape * 3 + step * (3 - 1), 3)) * 255

            # Slice along the first axis
            offset1 = int((shape - img.shape[1]) / 2)
            offset2 = int((shape - img.shape[2]) / 2)
            if seg is None:
                tmp_img = img[index[0], :, :]
            else:
                tmp_img = Image(img[index[0], :, :]).thumbnail(
                    seg[index[0], :, :], colors=colors, opacity=opacity)
            output[offset1:offset1 + img.shape[1],
                   offset2:offset2 + img.shape[2]] = tmp_img

            # Slice along the second axis
            offset1 = int((shape - img.shape[0]) / 2)
            offset2 = int(shape + step + (shape - img.shape[2]) / 2)
            if seg is None:
                tmp_img = img[:, index[1], :]
            else:
                tmp_img = Image(img[:, index[1], :]).thumbnail(
                    seg[:, index[1], :], colors=colors, opacity=opacity)
            output[offset1:offset1 + img.shape[0],
                   offset2:offset2 + img.shape[2]] = tmp_img

            # Slice along the third axis
            offset1 = int((shape - img.shape[0]) / 2)
            offset2 = int(2 * shape + 2 * step + (shape - img.shape[1]) / 2)
            if seg is None:
                tmp_img = img[:, :, index[2]]
            else:
                tmp_img = Image(img[:, :, index[2]]).thumbnail(
                    seg[:, :, index[2]], colors=colors, opacity=opacity)
            output[offset1:offset1 + img.shape[0],
                   offset2:offset2 + img.shape[1]] = tmp_img

            return output.astype('uint8')
        else:  # unroll is True
            width = self.shape[2] * self.shape[0] + 2 * (self.shape[0] - 1)
            if seg is None:
                output = np.ones((self.shape[1], width)) * 255
            else:
                output = np.ones((self.shape[1], width, 3)) * 255
            for k in range(self.shape[0]):
                if seg is None:
                    tmp_img = img[k, :, :]
                else:
                    tmp_img = Image(img[k, :, :]).thumbnail(
                        seg[k, :, :], colors=colors, opacity=opacity)
                # Slice k starts after k slices and k gaps of 2 columns
                output[:,
                       k * self.shape[2] + 2 * k:
                       (k + 1) * self.shape[2] + 2 * k] = tmp_img
            return output.astype('uint8')
    else:
        # A bare string cannot be raised; use a real exception type
        raise ValueError("Wrong number of dimensions for thumbnail: "
                         + str(len(self.shape)))
def run(self, input_files, metadata, output_files):  # pylint: disable=too-many-branches
    """
    Main run function for processing ChIP-seq FastQ data.

    The pipeline aligns the FASTQ files to the genome using BWA, filters
    the alignments with BioBamBam and then uses MACS 2 for peak calling to
    identify transcription factor binding sites within the genome.
    Currently this can only handle a single data file and a single
    background file.

    Parameters
    ----------
    input_files : dict
        Location of the initial input files required by the workflow

        genome : str
            Genome FASTA file
        index : str
            Location of the BWA archived index files
        loc : str
            Location of the FASTQ reads files
        fastq2 : str
            Location of the paired end FASTQ file [OPTIONAL]
        bg_loc : str
            Location of the background FASTQ reads files [OPTIONAL]
        fastq2_bg : str
            Location of the paired end background FASTQ reads files
            [OPTIONAL]
    metadata : dict
        Input file meta data associated with their roles (same keys as
        ``input_files``)
    output_files : dict
        Output file locations

        bam [, "bam_bg"] : str
        filtered [, "filtered_bg"] : str
        narrow_peak : str
        summits : str
        broad_peak : str
        gapped_peak : str

    Returns
    -------
    output_files : dict
        Output file locations associated with their roles, for the output

        bam [, "bam_bg"] : str
            Aligned FASTQ short read file [ and aligned background file]
            locations
        filtered [, "filtered_bg"] : str
            Filtered versions of the respective bam files
        narrow_peak, summits, broad_peak, gapped_peak : str
            Peak files, present only when reported by MACS 2
    output_metadata : dict
        Output metadata for the associated files in output_files
    """
    output_files_generated = {}
    output_metadata = {}

    # The location is formatted into the message; passed as a separate
    # argument without a placeholder it would never be rendered.
    logger.info(
        "PROCESS CHIPSEQ - DEFINED OUTPUT: {}".format(output_files["bam"]))

    has_background = "bg_loc" in input_files

    # --- Align the sample reads ---
    align_input_files = remap(input_files, "genome", "loc", "index")
    align_input_file_meta = remap(metadata, "genome", "loc", "index")

    if "fastq2" in input_files:
        align_input_files["fastq2"] = input_files["fastq2"]
        align_input_file_meta["fastq2"] = metadata["fastq2"]

    bwa = bwaAlignerTool(self.configuration)
    bwa_files, bwa_meta = bwa.run(
        align_input_files, align_input_file_meta,
        {"output": output_files["bam"]})

    try:
        output_files_generated["bam"] = bwa_files["bam"]
        output_metadata["bam"] = bwa_meta["bam"]

        # Keep the name of the underlying tool as the description and tag
        # the result with the name of the workflow.
        tool_name = output_metadata['bam'].meta_data['tool']
        output_metadata['bam'].meta_data['tool_description'] = tool_name
        output_metadata['bam'].meta_data['tool'] = "process_chipseq"
    except KeyError:
        logger.fatal("BWA aligner failed")

    if has_background:
        # --- Align background files ---
        # The background reads are presented to the aligner as "loc"
        align_input_files_bg = remap(
            input_files, "genome", "index", loc="bg_loc")
        align_input_file_meta_bg = remap(
            metadata, "genome", "index", loc="bg_loc")

        # Paired-end background depends on its own role being present,
        # not on the sample being paired-end.
        if "fastq2_bg" in input_files:
            align_input_files_bg["fastq2"] = input_files["fastq2_bg"]
            align_input_file_meta_bg["fastq2"] = metadata["fastq2_bg"]

        bwa_bg_files, bwa_bg_meta = bwa.run(
            align_input_files_bg, align_input_file_meta_bg,
            {"output": output_files["bam_bg"]})

        try:
            # The aligner reports its result under the role "bam" (the
            # same role the filtering step reads below); the workflow
            # stores it as "bam_bg".
            output_files_generated["bam_bg"] = bwa_bg_files["bam"]
            output_metadata["bam_bg"] = bwa_bg_meta["bam"]

            tool_name = output_metadata['bam_bg'].meta_data['tool']
            output_metadata['bam_bg'].meta_data[
                'tool_description'] = tool_name
            output_metadata['bam_bg'].meta_data['tool'] = "process_chipseq"
        except KeyError:
            logger.fatal("Background BWA aligner failed")

    # --- Filter the bams ---
    b3f = biobambam(self.configuration)
    b3f_files, b3f_meta = b3f.run(
        {"input": bwa_files['bam']},
        {"input": bwa_meta['bam']},
        {"output": output_files["filtered"]})

    try:
        output_files_generated["filtered"] = b3f_files["bam"]
        output_metadata["filtered"] = b3f_meta["bam"]

        tool_name = output_metadata['filtered'].meta_data['tool']
        output_metadata['filtered'].meta_data[
            'tool_description'] = tool_name
        output_metadata['filtered'].meta_data['tool'] = "process_chipseq"
    except KeyError:
        logger.fatal("BioBamBam filtering failed")

    if has_background:
        # Filter background aligned files
        b3f_bg_files, b3f_bg_meta = b3f.run(
            {"input": bwa_bg_files['bam']},
            {"input": bwa_bg_meta['bam']},
            {"output": output_files["filtered_bg"]})

        try:
            output_files_generated["filtered_bg"] = b3f_bg_files["bam"]
            output_metadata["filtered_bg"] = b3f_bg_meta["bam"]

            tool_name = output_metadata['filtered_bg'].meta_data['tool']
            output_metadata['filtered_bg'].meta_data[
                'tool_description'] = tool_name
            output_metadata['filtered_bg'].meta_data[
                'tool'] = "process_chipseq"
        except KeyError:
            logger.fatal("Background BioBamBam filtering failed")

    # --- MACS2 to call peaks ---
    macs_caller = macs2(self.configuration)
    macs_inputs = {"bam": output_files_generated["filtered"]}
    macs_metadt = {"bam": output_metadata['filtered']}

    if has_background:
        macs_inputs["bam_bg"] = output_files_generated["filtered_bg"]
        macs_metadt["bam_bg"] = output_metadata['filtered_bg']

    peak_roles = ('narrow_peak', 'summits', 'broad_peak', 'gapped_peak')

    m_results_files, m_results_meta = macs_caller.run(
        macs_inputs, macs_metadt,
        # Outputs of the final step may match workflow outputs;
        # Extra entries in output_files will be disregarded.
        remap(output_files, *peak_roles))

    if not m_results_meta:
        logger.fatal("MACS2 peak calling failed")

    # Record whichever peak files were reported
    for role in peak_roles:
        if role not in m_results_meta:
            continue

        output_files_generated[role] = m_results_files[role]
        output_metadata[role] = m_results_meta[role]

        tool_name = output_metadata[role].meta_data['tool']
        output_metadata[role].meta_data['tool_description'] = tool_name
        output_metadata[role].meta_data['tool'] = "process_chipseq"

    return output_files_generated, output_metadata
def run(self, input_files, metadata, output_files):
    """
    Main run function for processing RNA-Seq FastQ data.

    Kallisto is used first to index the cDNA and then to quantify the
    reads against that index.

    Parameters
    ----------
    input_files : dict
        Input file locations; the roles "cdna" and "fastq1" are read, and
        "fastq2" when present (paired end data).
    metadata : dict
        Metadata for the input files, keyed by the same roles.
    output_files : dict
        Output file locations; the roles "index", "abundance_h5_file",
        "abundance_tsv_file" and "run_info_file" are forwarded to the
        tools.

    Returns
    -------
    output_files : dict
        Files reported by the quantification step plus the "index" file.
        Empty when the index was not generated.
    output_metadata : dict
        Metadata about each of the files.
    """
    # Index the cDNA
    # This could get moved to the general tools section
    indexer = kallistoIndexerTool()
    index_files, index_meta = indexer.run(
        remap(input_files, "cdna"),
        remap(metadata, "cdna"),
        remap(output_files, "index"),
    )

    if "index" not in index_files:
        logger.fatal("Kallisto: Index has not been generated")
        return {}, {}

    # Quantification
    quantifier = kallistoQuantificationTool()

    # The second read file is only passed on for paired end data
    if "fastq2" in input_files:
        read_roles = ("cdna", "fastq1", "fastq2")
    else:
        read_roles = ("cdna", "fastq1")

    quant_input_files = {role: input_files[role] for role in read_roles}
    quant_input_files["index"] = index_files["index"]

    quant_input_meta = {role: metadata[role] for role in read_roles}
    quant_input_meta["index"] = index_meta["index"]

    kq_files, kq_meta = quantifier.run(
        quant_input_files,
        quant_input_meta,
        remap(output_files,
              "abundance_h5_file", "abundance_tsv_file", "run_info_file"))

    try:
        # The index is reported alongside the quantification results
        kq_files["index"] = index_files["index"]
        kq_meta["index"] = index_meta["index"]

        # Keep the name of the underlying tool as the description and tag
        # each result with the name of the workflow.
        for role in ("index", "abundance_h5_file",
                     "abundance_tsv_file", "run_info_file"):
            tool_name = kq_meta[role].meta_data['tool']
            kq_meta[role].meta_data['tool_description'] = tool_name
            kq_meta[role].meta_data['tool'] = "process_rnaseq"
    except KeyError:
        logger.fatal("Kallisto failed")

    return (kq_files, kq_meta)
def run(self, input_files, metadata, output_files):
    """
    Main run function for processing DamID-seq FastQ data.

    The workflow runs the BSgenome tool on the genome, aligns each of the
    four read sets with the BWA aligner tool, filters each alignment with
    the BioBamBam tool and finally passes the four filtered alignments to
    the iDEAR tool, which returns a bigwig file.

    Parameters
    ----------
    input_files : dict
        Input file locations keyed by role. Roles read here:
        ``genome``, ``index``, ``fastq_1``, ``fastq_2``, ``bg_fastq_1``,
        ``bg_fastq_2`` and ``bsgenome``.
    metadata : dict
        Metadata for the input files, keyed by the same roles.
    output_files : dict
        Output file locations keyed by role. Roles read here:
        ``bsgenome``, ``chrom_size``, ``genome_2bit``, ``seed_file``,
        ``bam_1``, ``bam_1_filtered``, ``bam_2``, ``bam_2_filtered``,
        ``bg_bam_1``, ``bg_bam_1_filtered``, ``bg_bam_2``,
        ``bg_bam_2_filtered`` and ``bigwig``.

    Returns
    -------
    output_files : dict
        Locations of the generated files, keyed by the output roles listed
        above.
    output_metadata : dict
        Metadata for each generated file. ``tool`` is set to
        ``process_damidseq`` and the original tool name is kept in
        ``tool_description``.

    Both dictionaries are empty if any stage does not return the expected
    file or metadata keys.
    """
    output_files_generated = {}
    output_metadata = {}

    def _register(role, location, file_meta):
        """Record one output and attribute its metadata to this pipeline."""
        output_files_generated[role] = location
        output_metadata[role] = file_meta
        tool_name = file_meta.meta_data["tool"]
        file_meta.meta_data["tool_description"] = tool_name
        file_meta.meta_data["tool"] = "process_damidseq"

    # The project logger fills "{}" placeholders from its extra arguments;
    # without a placeholder the output_files argument is never rendered.
    logger.info("PROCESS DAMIDSEQ - DEFINED OUTPUT: {}", output_files)

    # (input FASTQ role, aligned BAM role, filtered BAM role)
    alignment_set = [
        ["fastq_1", "bam_1", "bam_1_filtered"],
        ["fastq_2", "bam_2", "bam_2_filtered"],
        ["bg_fastq_1", "bg_bam_1", "bg_bam_1_filtered"],
        ["bg_fastq_2", "bg_bam_2", "bg_bam_2_filtered"],
    ]

    # BSgenome
    logger.info("Generating BSgenome")
    bsg = bsgenomeTool(self.configuration)
    bsgi, bsgm = bsg.run(
        {"genome": input_files["genome"]},
        {"genome": metadata["genome"]},
        {
            "bsgenome": output_files["bsgenome"],
            "chrom_size": output_files["chrom_size"],
            "genome_2bit": output_files["genome_2bit"],
            "seed_file": output_files["seed_file"]
        }
    )

    try:
        for file_key in ("bsgenome", "chrom_size", "genome_2bit", "seed_file"):
            _register(file_key, bsgi[file_key], bsgm[file_key])
    except KeyError:
        logger.fatal("BSgenome indexer failed")
        # Stop here, as the other stages do on failure.
        return {}, {}

    # Align and filter reads
    for fastq_role, bam_role, filtered_role in alignment_set:
        bwa = bwaAlignerTool(self.configuration)
        bwa_files, bwa_meta = bwa.run(
            remap(input_files, "genome", "index", loc=fastq_role),
            remap(metadata, "genome", "index", loc=fastq_role),
            {"output": output_files[bam_role]}
        )

        try:
            _register(bam_role, bwa_files["bam"], bwa_meta["bam"])
        except KeyError as msg:
            logger.fatal(
                "KeyError error - BWA aligner failed: {0}\n{1}\n{2}\n{3}".
                format(
                    msg, bam_role,
                    "Available file keys: " + ", ".join(bwa_files.keys()),
                    "Available mets keys: " + ", ".join(bwa_meta.keys())
                )
            )
            # The filtering step below needs the alignment; carrying on
            # would only fail on the lookup of the missing "bam" entry.
            return {}, {}

        # Filter the bams
        b3f = biobambam(self.configuration)
        b3f_files, b3f_meta = b3f.run(
            {"input": bwa_files["bam"]},
            {"input": bwa_meta["bam"]},
            {"output": output_files[filtered_role]}
        )

        try:
            _register(filtered_role, b3f_files["bam"], b3f_meta["bam"])
        except KeyError as msg:
            logger.fatal(
                "KeyError error - BioBamBam filtering failed: {0}\n{1}".
                format(msg, filtered_role)
            )
            return {}, {}

    # iDEAR to call peaks
    # NOTE(review): "bsgenome" is read from input_files/metadata here rather
    # than from the BSgenome tool output generated above - confirm which
    # one is intended.
    idear_caller = idearTool(self.configuration)
    idear_files, idear_meta = idear_caller.run(
        {
            "bam_1": output_files_generated["bam_1_filtered"],
            "bam_2": output_files_generated["bam_2_filtered"],
            "bg_bam_1": output_files_generated["bg_bam_1_filtered"],
            "bg_bam_2": output_files_generated["bg_bam_2_filtered"],
            "bsgenome": input_files["bsgenome"]
        }, {
            "bam_1": output_metadata["bam_1_filtered"],
            "bam_2": output_metadata["bam_2_filtered"],
            "bg_bam_1": output_metadata["bg_bam_1_filtered"],
            "bg_bam_2": output_metadata["bg_bam_2_filtered"],
            "bsgenome": metadata["bsgenome"]
        }, {
            "bigwig": output_files["bigwig"],
        }
    )

    try:
        _register("bigwig", idear_files["bigwig"], idear_meta["bigwig"])
    except KeyError as msg:
        logger.fatal(
            "KeyError error - iDEAR filtering failed: {0}\n{1}".format(
                msg, "bigwig"))
        return {}, {}

    print("DAMID-SEQ RESULTS:", output_metadata)
    return output_files_generated, output_metadata
def run(self, input_files, metadata, output_files):
    """
    Main run function for processing WGBS FASTQ data.

    The workflow builds a genome index with the BS-Seeker2 indexer tool,
    filters the FASTQ file(s), aligns the filtered reads with the
    BS-Seeker2 aligner tool and then runs the BS-Seeker2 methylation caller
    tool on the alignment.

    Parameters
    ----------
    input_files : dict
        Input file locations keyed by role:
            genome : str
                Genome assembly file
            fastq1 : str
                First FASTQ file (single or paired end reads)
            fastq2 : str
                [OPTIONAL] Second FASTQ file for paired end reads
    metadata : dict
        Metadata for the input files, keyed by the same roles.
    output_files : dict
        Output file locations keyed by role:
            index : str
            fastq1_filtered : str
            fastq2_filtered : str [required when fastq2 is supplied]
            bam : str
            bai : str
            wig_file : str
            cgmap_file : str
            atcgmap_file : str

    Returns
    -------
    output_files : dict
        index : str
            Genome index returned by the indexer
        fastq1_filtered|fastq2_filtered : str
            Filtered FASTQ files from which the alignments were made
        bam|bai : str
            Alignment file and its associated index
        wig_file|cgmap_file|atcgmap_file : str
            Files returned by the methylation caller
    output_metadata : dict
        Metadata for each returned file. Except for ``index``, whose
        metadata is passed through unchanged, ``tool`` is set to
        ``process_wgbs`` and the original tool name is kept in
        ``tool_description``.

    Both dictionaries are empty if any stage does not return the expected
    file or metadata keys.
    """
    output_results_files = {}
    output_metadata = {}

    def _register(role, location, file_meta):
        """Record one output and attribute its metadata to this pipeline."""
        output_results_files[role] = location
        output_metadata[role] = file_meta
        tool_name = file_meta.meta_data["tool"]
        file_meta.meta_data["tool_description"] = tool_name
        file_meta.meta_data["tool"] = "process_wgbs"

    logger.info("WGBS - BS-Seeker2 Index")
    # Build the matching WGBS genome index
    builder = bssIndexerTool(self.configuration)
    genome_idx, gidx_meta = builder.run(
        remap(input_files, "genome"),
        remap(metadata, "genome"),
        remap(output_files, "index")
    )

    try:
        # The index metadata is recorded as returned (no tool re-labelling).
        output_results_files["index"] = genome_idx["index"]
        output_metadata["index"] = gidx_meta["index"]
    except KeyError:
        # Handle a missing index the same way as every later stage.
        logger.fatal("WGBS - BS-Seeker2 indexer failed")
        return {}, {}

    # Filter the FASTQ reads to remove duplicates
    logger.info("WGBS - Filter")
    frt = filterReadsTool(self.configuration)
    fastq1f, filter1_meta = frt.run(
        {"fastq": input_files["fastq1"]},
        {"fastq": metadata["fastq1"]},
        {"fastq_filtered": output_files["fastq1_filtered"]}
    )

    try:
        _register(
            "fastq1_filtered",
            fastq1f["fastq_filtered"],
            filter1_meta["fastq_filtered"]
        )
    except KeyError:
        logger.fatal("WGBS - FILTER: Error while filtering")
        return {}, {}

    paired_end = "fastq2" in input_files

    if paired_end:
        logger.info("WGBS - Filter background")
        fastq2f, filter2_meta = frt.run(
            {"fastq": input_files["fastq2"]},
            {"fastq": metadata["fastq2"]},
            {"fastq_filtered": output_files["fastq2_filtered"]}
        )

        try:
            _register(
                "fastq2_filtered",
                fastq2f["fastq_filtered"],
                filter2_meta["fastq_filtered"]
            )
        except KeyError:
            logger.fatal(
                "WGBS - FILTER (background): Error while filtering")
            return {}, {}

    logger.info("WGBS - BS-Seeker2 Aligner")
    # Handles the alignment of all of the split packets then merges them
    # back together.
    bss_aligner = bssAlignerTool(self.configuration)
    aligner_input_files = {
        "genome": input_files["genome"],
        "fastq1": fastq1f["fastq_filtered"],
        "index": genome_idx["index"]
    }
    aligner_meta = {
        "genome": metadata["genome"],
        "fastq1": filter1_meta["fastq_filtered"],
        "index": output_metadata["index"]
    }
    if paired_end:
        aligner_input_files["fastq2"] = fastq2f["fastq_filtered"]
        aligner_meta["fastq2"] = filter2_meta["fastq_filtered"]

    aln_files, aln_meta = bss_aligner.run(
        aligner_input_files,
        aligner_meta,
        remap(output_files, "bam", "bai")
    )

    try:
        for role in ("bam", "bai"):
            _register(role, aln_files[role], aln_meta[role])
    except KeyError:
        logger.fatal("WGBS - Aligner failed")
        return {}, {}

    # Methylation peak caller
    peak_caller_handle = bssMethylationCallerTool(self.configuration)
    mct_input_files = {
        "genome": input_files["genome"],
        "index": genome_idx["index"],
        "fastq1": fastq1f["fastq_filtered"],
        "bam": aln_files["bam"],
        "bai": aln_files["bai"]
    }
    mct_meta = {
        "genome": metadata["genome"],
        "index": gidx_meta["index"],
        "fastq1": filter1_meta["fastq_filtered"],
        "bam": output_metadata["bam"],
        "bai": aln_meta["bai"]
    }
    if paired_end:
        mct_input_files["fastq2"] = fastq2f["fastq_filtered"]
        mct_meta["fastq2"] = filter2_meta["fastq_filtered"]

    peak_files, peak_meta = peak_caller_handle.run(
        mct_input_files,
        mct_meta,
        remap(output_files, "wig_file", "cgmap_file", "atcgmap_file")
    )

    try:
        for role in ("wig_file", "cgmap_file", "atcgmap_file"):
            _register(role, peak_files[role], peak_meta[role])
    except KeyError:
        logger.fatal("WGBS - Peak caller failed")
        return {}, {}

    return (output_results_files, output_metadata)