def get_change_point_gtf(self, cp):
    graph_id = ('G_%s_%d_%d_%s' %
                (self.chrom, self.start, self.end,
                 Strand.to_gtf(self.strand)))
    features = []
    # single-base feature marking the change point position
    f = GTF.Feature()
    f.seqid = self.chrom
    f.source = 'taco'
    f.feature = 'changept'
    f.start = cp.pos
    f.end = cp.pos + 1
    f.score = 0
    f.strand = Strand.to_gtf(self.strand)
    f.phase = '.'
    f.attrs = {'graph_id': graph_id,
               'sign': str(cp.sign),
               'pvalue': str(cp.pvalue),
               'foldchange': str(cp.foldchange)}
    features.append(f)
    # interval feature covering the full region tested for the change point
    f = GTF.Feature()
    f.seqid = self.chrom
    f.source = 'taco'
    f.feature = 'changeinterval'
    f.start = cp.start
    f.end = cp.end
    f.score = 0
    f.strand = Strand.to_gtf(self.strand)
    f.phase = '.'
    f.attrs = {'graph_id': graph_id,
               'sign': str(cp.sign),
               'pvalue': str(cp.pvalue),
               'foldchange': str(cp.foldchange)}
    features.append(f)
    return features
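# Illustrative sketch (hypothetical helper, not part of the original code):
# shows how the features returned by get_change_point_gtf() might be written
# to an open GTF file handle. Assumes 'sgraph' is the splice graph object that
# defines the method above and that str(GTF.Feature) renders one GTF line.
def _example_write_change_point_gtf(sgraph, change_points, gtf_fh):
    for cp in change_points:
        for f in sgraph.get_change_point_gtf(cp):
            print >> gtf_fh, str(f)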
def to_gtf(self):
    strand_str = Strand.to_gtf(self.strand)
    # transcript feature spanning the full transfrag
    f = GTF.Feature()
    f.seqid = self.chrom
    f.source = 'taco'
    f.feature = 'transcript'
    f.start = self.start
    f.end = self.end
    f.score = 0.0
    f.strand = strand_str
    f.phase = '.'
    f.attrs = {GTF.Attr.TRANSCRIPT_ID: self._id,
               GTF.Attr.SAMPLE_ID: self.sample_id,
               GTF.Attr.EXPR: str(self.expr),
               GTF.Attr.REF: str(int(self.is_ref))}
    yield f
    # one exon feature per exon
    for e in self.exons:
        f = GTF.Feature()
        f.seqid = self.chrom
        f.source = 'taco'
        f.feature = 'exon'
        f.start = e.start
        f.end = e.end
        f.score = 0.0
        f.strand = strand_str
        f.phase = '.'
        f.attrs = {GTF.Attr.TRANSCRIPT_ID: self._id}
        yield f
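# Illustrative sketch (hypothetical helper, not part of the original code):
# serializes a collection of transfrag objects to a GTF file by consuming the
# to_gtf() generator defined above. Assumes each element of 'transfrags'
# defines to_gtf() and that str(GTF.Feature) renders one GTF line.
def _example_write_transfrag_gtf(transfrags, gtf_path):
    with open(gtf_path, 'w') as fh:
        for t in transfrags:
            for f in t.to_gtf():
                print >> fh, str(f)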
def get_node_gtf(self):
    graph_id = ('G_%s_%d_%d_%s' %
                (self.chrom, self.start, self.end,
                 Strand.to_gtf(self.strand)))
    # iterate through splice graph nodes and yield one GTF feature per node
    for n_id in self.G:
        n = self.get_node_interval(n_id)
        expr_data = self.get_expr_data(*n)
        ref_starts = _array_subset(self.ref_start_sites, *n)
        ref_stops = _array_subset(self.ref_stop_sites, *n)
        f = GTF.Feature()
        f.seqid = self.chrom
        f.source = 'taco'
        f.feature = 'node'
        f.start = n[0]
        f.end = n[1]
        f.score = 0
        f.strand = Strand.to_gtf(self.strand)
        f.phase = '.'
        f.attrs = {'graph_id': graph_id,
                   'expr_min': str(expr_data.min()),
                   'expr_max': str(expr_data.max()),
                   'expr_mean': str(expr_data.mean()),
                   'ref_starts': ','.join(map(str, ref_starts)),
                   'ref_stops': ','.join(map(str, ref_stops))}
        yield f
def get_gtf_features(chrom, strand, exons, locus_id, gene_id, tss_id,
                     transcript_id, expr, rel_frac, abs_frac):
    tx_start = exons[0].start
    tx_end = exons[-1].end
    strand_str = Strand.to_gtf(strand)
    attr_dict = {'locus_id': locus_id,
                 'gene_id': gene_id,
                 'tss_id': tss_id,
                 'transcript_id': transcript_id}
    # transcript feature spanning the full isoform
    f = GTF.Feature()
    f.seqid = chrom
    f.source = 'taco'
    f.feature = 'transcript'
    f.start = tx_start
    f.end = tx_end
    f.score = int(round(1000.0 * rel_frac))
    f.strand = strand_str
    f.phase = '.'
    f.attrs = {'expr': '%.3f' % expr,
               'rel_frac': '%.5f' % rel_frac,
               'abs_frac': '%.5f' % abs_frac}
    f.attrs.update(attr_dict)
    yield f
    # one exon feature per exon
    for e in exons:
        f = GTF.Feature()
        f.seqid = chrom
        f.source = 'taco'
        f.feature = 'exon'
        f.start = e.start
        f.end = e.end
        f.score = int(round(1000.0 * rel_frac))
        f.strand = strand_str
        f.phase = '.'
        f.attrs = {}
        f.attrs.update(attr_dict)
        yield f
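# Illustrative sketch (hypothetical call, not part of the original code): how
# get_gtf_features() might be invoked for one assembled isoform. All identifier
# and expression values here are made up; 'exons' is assumed to be a list of
# objects with .start/.end attributes ordered along the transcript.
def _example_isoform_to_gtf(chrom, strand, exons, gtf_fh):
    features = get_gtf_features(chrom, strand, exons,
                                locus_id='L1', gene_id='G1', tss_id='TSS1',
                                transcript_id='TU1', expr=12.5,
                                rel_frac=0.8, abs_frac=0.4)
    for f in features:
        print >> gtf_fh, str(f)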
def _make_transcript_feature(exon_features):
    # synthesize a 'transcript' feature spanning an ordered list of exon
    # features, copying attributes from the first exon
    f = GTF.Feature()
    f.seqid = exon_features[0].seqid
    f.source = exon_features[0].source
    f.feature = 'transcript'
    f.start = exon_features[0].start
    f.end = exon_features[-1].end
    f.score = exon_features[0].score
    f.strand = exon_features[0].strand
    f.phase = '.'
    f.attrs = exon_features[0].attrs.copy()
    if 'exon_number' in f.attrs:
        del f.attrs['exon_number']
    return f
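# Illustrative sketch (hypothetical helper, not part of the original code):
# prepends the synthesized 'transcript' feature built by
# _make_transcript_feature() to each group of exon features. Assumes the input
# is a dict mapping transcript id to a coordinate-ordered list of exon features.
def _example_add_transcript_features(exon_features_by_t_id):
    for t_id, exon_features in exon_features_by_t_id.iteritems():
        yield _make_transcript_feature(exon_features)
        for f in exon_features:
            yield f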
def _read_transfrags(sample, gtf_expr_attr, is_ref=False):
    '''
    Process an individual sample GTF file

    - Reads the entire GTF file into memory.
    - Renames "gene_id" and "transcript_id" attributes for consistency
      and to conserve space.
    '''
    t_dict = collections.OrderedDict()
    t_id_map = {}
    t_expr_map = {}
    cur_t_id = 1
    for f in GTF.parse(open(sample.gtf_file)):
        t_id = f.attrs[GTF.Attr.TRANSCRIPT_ID]
        if f.feature == 'transcript':
            # save expression
            expr = f.attrs[gtf_expr_attr]
            t_expr_map[t_id] = expr
            # rename transcript id
            if t_id not in t_id_map:
                new_t_id = "%s.T%d" % (sample._id, cur_t_id)
                t_id_map[t_id] = new_t_id
                cur_t_id += 1
                t_dict[new_t_id] = []  # init t_dict
        elif f.feature == 'exon':
            # lookup expression
            if is_ref:
                expr = 0.0
            else:
                expr = float(t_expr_map[t_id])
            new_t_id = t_id_map[t_id]
            # store exon feature
            attrs = ((GTF.Attr.TRANSCRIPT_ID, new_t_id),
                     (GTF.Attr.SAMPLE_ID, sample._id),
                     (GTF.Attr.REF, str(int(is_ref))),
                     (gtf_expr_attr, expr))
            f.attrs = collections.OrderedDict(attrs)
            t_dict[new_t_id].append(f)
    return t_dict
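# Illustrative sketch (hypothetical call, not part of the original code):
# loading one sample's GTF into the renamed-transcript dictionary returned by
# _read_transfrags(). 'sample' is assumed to carry the 'gtf_file' and '_id'
# attributes used above; 'FPKM' merely stands in for the expression attribute.
def _example_load_sample(sample):
    t_dict = _read_transfrags(sample, gtf_expr_attr='FPKM', is_ref=False)
    for t_id, exon_features in t_dict.iteritems():
        print '%s: %d exons' % (t_id, len(exon_features))
    return t_dict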
def assemble(**kwargs):
    '''
    kwargs: dict containing arguments and input/output file locations

    Configuration attributes:
    - guided_strand
    - guided_ends
    - guided_assembly
    - change_point
    - change_point_pvalue
    - change_point_fold_change
    - change_point_trim
    - path_graph_kmax
    - path_graph_loss_threshold
    - path_frac
    - max_paths
    - isoform_frac
    - max_isoforms

    Input file attributes:
    - transfrags_gtf_file
    - chrom_sizes_file

    Output file attributes:
    - unresolved_bg_files
    - resolved_bg_files
    - splice_bed_file
    - expr_h5_file
    - splice_graph_gtf_file
    - path_graph_stats_file
    - assembly_loss_gtf_file
    - assembly_gtf_file
    - assembly_bed_file
    '''
    config = Config(**kwargs)
    # setup bedgraph output files
    for s, filename in config.unresolved_bg_files:
        config.unresolved_bg_fhs.append(open(filename, 'w'))
    for s, filename in config.resolved_bg_files:
        config.resolved_bg_fhs.append(open(filename, 'w'))
    # setup junction bed file
    config.splice_bed_fh = Locus.open_splice_bed(config.splice_bed_file)
    # setup expression hdf5
    config.expr_h5fh = Locus.open_expression_hdf5(config.expr_h5_file,
                                                  config.chrom_sizes_file)
    # splice graph gtf file
    config.splice_graph_gtf_fh = open(config.splice_graph_gtf_file, 'w')
    # path graph stats file
    config.path_graph_stats_fh = open(config.path_graph_stats_file, 'w')
    fields = ['locus', 'k', 'kmax', 'transfrags', 'nodes', 'kmers',
              'short_transfrags', 'lost_kmers', 'tot_expr', 'lost_expr',
              'lost_expr_frac', 'valid']
    print >> config.path_graph_stats_fh, '\t'.join(fields)
    # assembly gtf and bed files
    config.assembly_loss_gtf_fh = open(config.assembly_loss_gtf_file, 'w')
    config.assembly_gtf_fh = open(config.assembly_gtf_file, 'w')
    config.assembly_bed_fh = open(config.assembly_bed_file, 'w')
    # parse gtf file locus by locus
    for interval, gtf_lines in GTF.parse_loci(
            open(config.transfrags_gtf_file)):
        chrom, start, end = interval
        logging.debug('Locus %s:%d-%d: ' % (chrom, start, end))
        assemble_locus(gtf_lines, config)
    # cleanup and close files
    config.assembly_gtf_fh.close()
    config.assembly_bed_fh.close()
    config.assembly_loss_gtf_fh.close()
    config.path_graph_stats_fh.close()
    config.splice_graph_gtf_fh.close()
    config.expr_h5fh.close()
    config.splice_bed_fh.close()
    Locus.close_bedgraphs(config.unresolved_bg_fhs)
    Locus.close_bedgraphs(config.resolved_bg_fhs)
def assemble_isoforms(sgraph, config):
    # create a path graph from the splice graph
    K, k = create_optimal_path_graph(
        sgraph,
        kmax=config.path_graph_kmax,
        loss_threshold=config.path_graph_loss_threshold,
        stats_fh=config.path_graph_stats_fh)
    if K is None:
        return []
    if len(K) == 0:
        return []
    # report lost nodes
    if config.assembly_loss_gtf_fh is not None:
        graph_id = ('L_%s:%d-%d[%s]' %
                    (sgraph.chrom, sgraph.start, sgraph.end,
                     Strand.to_gtf(sgraph.strand)))
        for n_id in get_lost_nodes(sgraph, K):
            n = sgraph.get_node_interval(n_id)
            expr_data = sgraph.get_node_expr_data(n_id)
            # write a gtf feature for each lost node
            f = GTF.Feature()
            f.seqid = sgraph.chrom
            f.source = 'taco'
            f.feature = 'lost_node'
            f.start = n[0]
            f.end = n[1]
            f.score = 0.0
            f.strand = Strand.to_gtf(sgraph.strand)
            f.phase = '.'
            f.attrs = {'graph_id': graph_id,
                       'expr': str(expr_data.mean())}
            print >> config.assembly_loss_gtf_fh, str(f)
    # smooth kmer graph
    smooth_graph(K)
    source_node = K.graph['source']
    source_expr = K.node[source_node][KMER_EXPR]
    logging.debug('%s:%d-%d[%s] finding paths in k=%d graph '
                  '(%d nodes) source_expr=%f' %
                  (sgraph.chrom, sgraph.start, sgraph.end,
                   Strand.to_gtf(sgraph.strand), k, len(K), source_expr))
    id_kmer_map = K.graph['id_kmer_map']
    paths = []
    for kmer_path, expr in find_paths(K, KMER_EXPR, config.path_frac,
                                      config.max_paths):
        path = reconstruct_path(kmer_path, id_kmer_map, sgraph)
        logging.debug("\texpr=%f length=%d" % (expr, len(path)))
        paths.append((path, expr))
    # build gene clusters
    clusters, filtered = Cluster.build(paths, min_frac=config.isoform_frac)
    logging.debug('\tclusters: %d filtered: %d' %
                  (len(clusters), len(filtered)))
    gene_isoforms = []
    for cluster in clusters:
        isoforms = []
        for path, expr, rel_frac, abs_frac in cluster.iterpaths():
            isoforms.append(
                Isoform(path=path, expr=expr, rel_frac=rel_frac,
                        abs_frac=abs_frac))
        # apply max isoforms limit (per cluster)
        if config.max_isoforms > 0:
            isoforms = isoforms[:config.max_isoforms]
        gene_isoforms.append(isoforms)
    return gene_isoforms
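# Illustrative sketch (hypothetical helper, not part of the original code):
# walks the nested list returned by assemble_isoforms(), which holds one list
# of Isoform objects per gene cluster. Assumes Isoform keeps its constructor
# arguments (path, expr, rel_frac, abs_frac) as attributes of the same name.
def _example_summarize_isoforms(gene_isoforms):
    for gene_num, isoforms in enumerate(gene_isoforms):
        for iso in isoforms:
            logging.debug('gene %d: expr=%f rel_frac=%f abs_frac=%f '
                          'path_nodes=%d' %
                          (gene_num, iso.expr, iso.rel_frac, iso.abs_frac,
                           len(iso.path)))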
def main():
    parser = argparse.ArgumentParser(
        description='Simple toolkit to test changes in RBP/TF binding '
                    'affinity caused by genetic variants.')
    parser.add_argument(dest='vcf', help='Path to the VCF file')
    parser.add_argument(dest='bed', help='Path to the BED file')
    parser.add_argument(dest='fasta', help='Path to the FASTA file')
    parser.add_argument(
        '--list', action='store_true',
        help='If set, the bed argument is a list of BED files to process '
             '(one per line)')
    parser.add_argument(
        '--chr', action='store_true',
        help='Input files should contain the "chr" string. If not found, '
             'a fix is attempted.')
    parser.add_argument(
        '--gtf',
        help='GTF file used to additionally take intron/exon boundaries '
             'into account. Canonical transcripts will be retrieved.')
    parser.add_argument(
        '--gtf_is_processed', action='store_true',
        help='If set, the \'--gtf\' argument is the processed dataframe '
             'derived from an original GTF file.')
    parser.add_argument(
        '-o', '--output', default=os.getcwd(),
        help='Output directory. Default: current directory')
    parser.add_argument(
        '-p', '--fromPickle',
        help='If given, the analysis starts from a previously serialized '
             'object; the input files are ignored.')
    args = parser.parse_args()

    osutils = OSutils()
    is_pickle = False
    if args.fromPickle:
        # resume from a previously pickled data structure
        osutils.is_pickled(args.fromPickle)
        with open(args.fromPickle, 'rb') as file_object:
            raw_data = file_object.read()
        deserialized = pickle.loads(raw_data)
        is_pickle = True
    else:
        Logger.print_advances('Validating input data')
        if args.gtf:
            gtf = GTF(args.gtf, args.gtf_is_processed, args.output)
        else:
            gtf = None
        mutation = Mutation(args.vcf, args.bed, args.fasta, gtf, args.list,
                            args.chr, args.output)
        deserialized = {}
        Logger.print_advances('Starting analysis')
        for name, bedobj in mutation.beds.items():
            Logger.print_advances('Processing {} peak file.'.format(name))
            Logger.log('Intersecting variants')
            fn = mutation.vcf_intersect(mutation.vcf_bed, bedobj, name)
            Logger.log('Extracting peak FASTA sequences')
            bed_seq = mutation.get_peak_sequence(bedobj, mutation.fasta)
            bed_peak_fasta = mutation.save_fasta_sequence(
                bed_seq, osutils.set_out_fn(mutation.outdir, name + '.fasta'))
            Logger.log('Mutating FASTA sequences')
            isec = Isec(fn)
            seqs_mut = mutation.mutate_fasta(bed_peak_fasta, isec)
            deserialized[name] = seqs_mut
            Logger.log('Done')
        Logger.log('Dumping data structure to {}'.format('data.pickle'))
        serialized = pickle.dumps(deserialized)
        with open(osutils.set_out_fn(mutation.outdir, 'data.pickle'),
                  'wb') as file_object:
            file_object.write(serialized)

    motdisrupt = PeaksMutated(deserialized, is_pickle, args.output)
    motdisrupt.list_beds()
    motdisrupt.write_object()
    pwm = PWMs()
    pwm.parse_cisBP_pwm()
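# Hypothetical entry-point guard (not shown in the listing above), with an
# illustrative invocation using made-up script and file names:
#
#   python rbp_affinity.py variants.vcf peaks.bed genome.fa \
#       --gtf annotation.gtf --chr -o results/
if __name__ == '__main__':
    main()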