Ejemplo n.º 1
0
 def get_change_point_gtf(self, cp):
     '''
     Build the GTF features describing a single change point.

     cp: object exposing 'pos', 'start', 'end', 'sign', 'pvalue' and
         'foldchange' attributes

     Returns a two-element list: a 'changept' feature covering
     [cp.pos, cp.pos + 1) followed by a 'changeinterval' feature
     covering [cp.start, cp.end). Both carry the same graph id and
     change point statistics, each in its own attribute dict.
     '''
     strand_str = Strand.to_gtf(self.strand)
     graph_id = 'G_%s_%d_%d_%s' % (self.chrom, self.start, self.end,
                                   strand_str)

     def _build(kind, start, end):
         # every feature gets a fresh attrs dict so callers may mutate
         # one record without affecting the other
         feat = GTF.Feature()
         feat.seqid = self.chrom
         feat.source = 'taco'
         feat.feature = kind
         feat.start = start
         feat.end = end
         feat.score = 0
         feat.strand = strand_str
         feat.phase = '.'
         feat.attrs = {'graph_id': graph_id,
                       'sign': str(cp.sign),
                       'pvalue': str(cp.pvalue),
                       'foldchange': str(cp.foldchange)}
         return feat

     return [
         _build('changept', cp.pos, cp.pos + 1),
         _build('changeinterval', cp.start, cp.end),
     ]
Ejemplo n.º 2
0
 def to_gtf(self):
     '''
     Generate the GTF representation of this transfrag.

     Yields one 'transcript' feature spanning self.start..self.end,
     then one 'exon' feature for each element of self.exons (in the
     order they are stored). All features share the same seqid, source,
     score and strand.
     '''
     strand_str = Strand.to_gtf(self.strand)

     def _build(kind, start, end, attrs):
         feat = GTF.Feature()
         feat.seqid = self.chrom
         feat.source = 'taco'
         feat.feature = kind
         feat.start = start
         feat.end = end
         feat.score = 0.0
         feat.strand = strand_str
         feat.phase = '.'
         feat.attrs = attrs
         return feat

     # transcript record carries the full set of attributes
     transcript_attrs = {
         GTF.Attr.TRANSCRIPT_ID: self._id,
         GTF.Attr.SAMPLE_ID: self.sample_id,
         GTF.Attr.EXPR: str(self.expr),
         GTF.Attr.REF: str(int(self.is_ref))
     }
     yield _build('transcript', self.start, self.end, transcript_attrs)
     # exon records only reference their parent transcript
     for exon in self.exons:
         yield _build('exon', exon.start, exon.end,
                      {GTF.Attr.TRANSCRIPT_ID: self._id})
Ejemplo n.º 3
0
 def get_node_gtf(self):
     '''
     Generate one 'node' GTF feature for every node id in self.G.

     Each feature spans the interval returned by get_node_interval()
     and records the min/max/mean of the expression data over that
     interval together with the comma-separated reference start and
     stop sites that _array_subset() selects for it.
     '''
     graph_id = 'G_%s_%d_%d_%s' % (self.chrom, self.start, self.end,
                                   Strand.to_gtf(self.strand))
     for node_id in self.G:
         interval = self.get_node_interval(node_id)
         node_expr = self.get_expr_data(*interval)
         starts = _array_subset(self.ref_start_sites, *interval)
         stops = _array_subset(self.ref_stop_sites, *interval)
         # describe this node as a gtf feature
         feat = GTF.Feature()
         feat.seqid = self.chrom
         feat.source = 'taco'
         feat.feature = 'node'
         feat.start = interval[0]
         feat.end = interval[1]
         feat.score = 0
         feat.strand = Strand.to_gtf(self.strand)
         feat.phase = '.'
         attrs = {'graph_id': graph_id}
         attrs['expr_min'] = str(node_expr.min())
         attrs['expr_max'] = str(node_expr.max())
         attrs['expr_mean'] = str(node_expr.mean())
         attrs['ref_starts'] = ','.join(str(x) for x in starts)
         attrs['ref_stops'] = ','.join(str(x) for x in stops)
         feat.attrs = attrs
         yield feat
Ejemplo n.º 4
0
def get_gtf_features(chrom, strand, exons, locus_id, gene_id, tss_id,
                     transcript_id, expr, rel_frac, abs_frac):
    '''
    Generate GTF features for one assembled transcript.

    Yields a 'transcript' feature spanning the start of the first exon
    to the end of the last exon, followed by one 'exon' feature per
    element of 'exons'. Every feature is scored with rel_frac scaled to
    an integer (1000 * rel_frac, rounded) and carries the locus, gene,
    tss and transcript ids; the transcript feature additionally carries
    the formatted expr, rel_frac and abs_frac values.

    'exons' must be non-empty (the first and last elements are indexed).
    '''
    tx_start, tx_end = exons[0].start, exons[-1].end
    strand_str = Strand.to_gtf(strand)
    score = int(round(1000.0 * rel_frac))
    shared_attrs = {
        'locus_id': locus_id,
        'gene_id': gene_id,
        'tss_id': tss_id,
        'transcript_id': transcript_id
    }

    def _build(kind, start, end, own_attrs):
        feat = GTF.Feature()
        feat.seqid = chrom
        feat.source = 'taco'
        feat.feature = kind
        feat.start = start
        feat.end = end
        feat.score = score
        feat.strand = strand_str
        feat.phase = '.'
        # feature-specific attributes first, shared identifiers after
        feat.attrs = own_attrs
        feat.attrs.update(shared_attrs)
        return feat

    yield _build('transcript', tx_start, tx_end, {
        'expr': '%.3f' % expr,
        'rel_frac': '%.5f' % rel_frac,
        'abs_frac': '%.5f' % abs_frac
    })
    for exon in exons:
        yield _build('exon', exon.start, exon.end, {})
Ejemplo n.º 5
0
def _make_transcript_feature(exon_features):
    '''
    Derive a 'transcript' GTF feature from a transcript's exon features.

    seqid, source, score, strand and attributes are taken from the first
    exon; the span runs from the first exon's start to the last exon's
    end. The attributes are a copy of the first exon's, minus any
    'exon_number' entry.

    'exon_features' must be non-empty.
    NOTE(review): presumably the exons are expected in coordinate order
    so that first/last give the transcript bounds - confirm with caller.
    '''
    first, last = exon_features[0], exon_features[-1]
    tx = GTF.Feature()
    tx.seqid = first.seqid
    tx.source = first.source
    tx.feature = 'transcript'
    tx.start = first.start
    tx.end = last.end
    tx.score = first.score
    tx.strand = first.strand
    tx.phase = '.'
    # copy so the exon's own attributes are left untouched
    tx.attrs = first.attrs.copy()
    tx.attrs.pop('exon_number', None)
    return tx
Ejemplo n.º 6
0
def _read_transfrags(sample, gtf_expr_attr, is_ref=False):
    '''
    Process individual sample GTF file
      - Reads entire GTF file into memory.
      - Renames "transcript_id" attributes to "<sample._id>.T<n>"
        (n counts transcripts in order of first appearance) for
        consistency and to conserve space.
      - Replaces each exon's attributes with an ordered set holding only
        the new transcript id, the sample id, the reference flag and the
        expression value.

    sample: object providing 'gtf_file' (path) and '_id'
    gtf_expr_attr: name of the GTF attribute holding transcript expression
    is_ref: when true, exons get an expression of 0.0 instead of the
            value read from their transcript record

    Returns an OrderedDict mapping new transcript id -> list of exon
    features. Transcript features themselves are not stored.

    Raises KeyError if a feature lacks a transcript id, if a transcript
    record lacks 'gtf_expr_attr', or if an exon appears before the
    transcript record it belongs to.
    '''
    t_dict = collections.OrderedDict()
    t_id_map = {}
    t_expr_map = {}
    cur_t_id = 1
    # context manager guarantees the input handle is released even when
    # parsing fails part-way through
    with open(sample.gtf_file) as gtf_fh:
        for f in GTF.parse(gtf_fh):
            t_id = f.attrs[GTF.Attr.TRANSCRIPT_ID]
            if f.feature == 'transcript':
                # save expression (kept as read; converted when the
                # exons are processed)
                t_expr_map[t_id] = f.attrs[gtf_expr_attr]
                # rename transcript id
                if t_id not in t_id_map:
                    new_t_id = "%s.T%d" % (sample._id, cur_t_id)
                    t_id_map[t_id] = new_t_id
                    cur_t_id += 1
                    t_dict[new_t_id] = []  # init t_dict
            elif f.feature == 'exon':
                # lookup expression
                if is_ref:
                    expr = 0.0
                else:
                    expr = float(t_expr_map[t_id])
                new_t_id = t_id_map[t_id]
                # store exon feature with a minimal, ordered attribute set
                f.attrs = collections.OrderedDict((
                    (GTF.Attr.TRANSCRIPT_ID, new_t_id),
                    (GTF.Attr.SAMPLE_ID, sample._id),
                    (GTF.Attr.REF, str(int(is_ref))),
                    (gtf_expr_attr, expr),
                ))
                t_dict[new_t_id].append(f)
    return t_dict
Ejemplo n.º 7
0
def assemble(**kwargs):
    '''
    Run the assembly over every locus of the transfrags GTF file.

    kwargs: dict containing arguments and input/output file locations

    Configuration attributes:
    - guided_strand
    - guided_ends
    - guided_assembly
    - change_point
    - change_point_pvalue
    - change_point_fold_change
    - change_point_trim
    - path_graph_kmax
    - path_graph_loss_threshold
    - path_frac
    - max_paths
    - isoform_frac
    - max_isoforms

    Input file attributes:
    - transfrags_gtf_file
    - chrom_sizes_file

    Output file attributes:
    - unresolved_bg_files
    - resolved_bg_files
    - splice_bed_file
    - expr_h5_file
    - splice_graph_gtf_file
    - path_graph_stats_file
    - assembly_loss_gtf_file
    - assembly_gtf_file
    - assembly_bed_file

    All output handles are stored on the Config object, handed to
    assemble_locus() for each locus, and closed before returning - also
    when processing a locus raises.
    '''
    config = Config(**kwargs)
    # setup bedgraph output files (only the filename of each pair is used)
    for _, filename in config.unresolved_bg_files:
        config.unresolved_bg_fhs.append(open(filename, 'w'))
    for _, filename in config.resolved_bg_files:
        config.resolved_bg_fhs.append(open(filename, 'w'))
    # setup junction bed file
    config.splice_bed_fh = Locus.open_splice_bed(config.splice_bed_file)
    # setup expression hdf5
    config.expr_h5fh = Locus.open_expression_hdf5(config.expr_h5_file,
                                                  config.chrom_sizes_file)
    # splice graph gtf file
    config.splice_graph_gtf_fh = open(config.splice_graph_gtf_file, 'w')
    # path graph stats file, starting with its header line
    config.path_graph_stats_fh = open(config.path_graph_stats_file, 'w')
    fields = [
        'locus', 'k', 'kmax', 'transfrags', 'nodes', 'kmers',
        'short_transfrags', 'lost_kmers', 'tot_expr', 'lost_expr',
        'lost_expr_frac', 'valid'
    ]
    config.path_graph_stats_fh.write('\t'.join(fields) + '\n')

    # assembly gtf and bed files
    config.assembly_loss_gtf_fh = open(config.assembly_loss_gtf_file, 'w')
    config.assembly_gtf_fh = open(config.assembly_gtf_file, 'w')
    config.assembly_bed_fh = open(config.assembly_bed_file, 'w')

    try:
        # parse gtf file; 'with' releases the input handle when done
        with open(config.transfrags_gtf_file) as gtf_fh:
            for interval, gtf_lines in GTF.parse_loci(gtf_fh):
                chrom, start, end = interval
                logging.debug('Locus %s:%d-%d: ', chrom, start, end)
                assemble_locus(gtf_lines, config)
    finally:
        # cleanup and close files so partial output is flushed even when
        # a locus fails
        config.assembly_gtf_fh.close()
        config.assembly_bed_fh.close()
        config.assembly_loss_gtf_fh.close()
        config.path_graph_stats_fh.close()
        config.splice_graph_gtf_fh.close()
        config.expr_h5fh.close()
        config.splice_bed_fh.close()
        Locus.close_bedgraphs(config.unresolved_bg_fhs)
        Locus.close_bedgraphs(config.resolved_bg_fhs)
Ejemplo n.º 8
0
def assemble_isoforms(sgraph, config):
    '''
    Assemble isoforms for one splice graph.

    Builds a path graph from 'sgraph', writes a 'lost_node' GTF record
    to config.assembly_loss_gtf_fh (when it is not None) for every node
    reported by get_lost_nodes(), smooths the path graph, enumerates
    paths, and groups them with Cluster.build().

    Returns a list with one entry per cluster, each entry being a list
    of Isoform objects truncated to config.max_isoforms when that limit
    is positive. Returns an empty list when no path graph could be
    created or it has no nodes.
    '''
    # create a path graph from the splice graph
    K, k = create_optimal_path_graph(
        sgraph,
        kmax=config.path_graph_kmax,
        loss_threshold=config.path_graph_loss_threshold,
        stats_fh=config.path_graph_stats_fh)
    if K is None or len(K) == 0:
        return []

    strand_str = Strand.to_gtf(sgraph.strand)

    # report lost nodes
    loss_fh = config.assembly_loss_gtf_fh
    if loss_fh is not None:
        graph_id = 'L_%s:%d-%d[%s]' % (sgraph.chrom, sgraph.start,
                                       sgraph.end, strand_str)
        for lost_id in get_lost_nodes(sgraph, K):
            interval = sgraph.get_node_interval(lost_id)
            lost_expr = sgraph.get_node_expr_data(lost_id)
            # one gtf record per lost node
            feat = GTF.Feature()
            feat.seqid = sgraph.chrom
            feat.source = 'taco'
            feat.feature = 'lost_node'
            feat.start = interval[0]
            feat.end = interval[1]
            feat.score = 0.0
            feat.strand = strand_str
            feat.phase = '.'
            feat.attrs = {'graph_id': graph_id,
                          'expr': str(lost_expr.mean())}
            print >> loss_fh, str(feat)

    # smooth kmer graph
    smooth_graph(K)

    source_expr = K.node[K.graph['source']][KMER_EXPR]
    debug_args = (sgraph.chrom, sgraph.start, sgraph.end, strand_str, k,
                  len(K), source_expr)
    logging.debug('%s:%d-%d[%s] finding paths in k=%d graph '
                  '(%d nodes) source_expr=%f' % debug_args)
    id_kmer_map = K.graph['id_kmer_map']

    # translate each kmer path back into a path over the splice graph
    paths = []
    path_iter = find_paths(K, KMER_EXPR, config.path_frac, config.max_paths)
    for kmer_path, path_expr in path_iter:
        node_path = reconstruct_path(kmer_path, id_kmer_map, sgraph)
        logging.debug("\texpr=%f length=%d" % (path_expr, len(node_path)))
        paths.append((node_path, path_expr))

    # build gene clusters
    clusters, filtered = Cluster.build(paths, min_frac=config.isoform_frac)
    logging.debug('\tclusters: %d filtered: %d' %
                  (len(clusters), len(filtered)))

    gene_isoforms = []
    for cluster in clusters:
        isoforms = [
            Isoform(path=p, expr=e, rel_frac=rel, abs_frac=ab)
            for p, e, rel, ab in cluster.iterpaths()
        ]
        # apply max isoforms limit (per cluster)
        if config.max_isoforms > 0:
            del isoforms[config.max_isoforms:]
        gene_isoforms.append(isoforms)
    return gene_isoforms
Ejemplo n.º 9
0
def main():
    """Command-line entry point.

    Parses the arguments, then either loads a previously serialized
    result (``--fromPickle``) or builds it: for every bed object held by
    ``Mutation`` it intersects the variants, extracts and saves the peak
    fasta sequences and mutates them, and finally pickles the collected
    results to ``data.pickle`` in the output directory. The result is
    then passed to ``PeaksMutated`` and the cisBP PWMs are parsed.
    """
    # NOTE: adjacent string literals are concatenated without a
    # separator, so every continued fragment carries its own space.
    parser = argparse.ArgumentParser(
        description="Simple toolkit to test changes in RBP/TF binding affinity"
        " caused by genetic variants.")
    parser.add_argument(dest='vcf', help='Path to the VCF file')
    parser.add_argument(dest='bed', help='Path to the bed file')
    parser.add_argument(dest='fasta', help='Path to the fasta file.')
    parser.add_argument(
        '--list',
        action='store_true',
        help='If bed argument represents a list of bed file to process'
        ' (one per line)')
    parser.add_argument(
        '--chr',
        action='store_true',
        help='Input files should contain chr string. If not found,'
        ' a fix is tried.')
    parser.add_argument(
        '--gtf',
        help='gtf file to further take into account intron/exon boundaries.'
        ' Canonical transcripts will be retrieved.')
    parser.add_argument(
        '--gtf_is_processed',
        action='store_true',
        help='If set \'--gtf\' argument represents the processed'
        ' dataframe from an original gtf file.')
    parser.add_argument('-o',
                        '--output',
                        default=os.getcwd(),
                        help='Output directory. Default: current directory')
    parser.add_argument(
        '-p',
        '--fromPickle',
        help="If given, analysis should start from previously serialized"
        " object. Input file will be ignored")
    args = parser.parse_args()

    osutils = OSutils()
    is_pickle = False
    if args.fromPickle:
        # NOTE(review): the return value is ignored; presumably
        # is_pickled() validates the file and aborts on failure - confirm.
        osutils.is_pickled(args.fromPickle)
        # SECURITY: unpickling executes arbitrary code embedded in the
        # file. Only pass --fromPickle files produced by a trusted run.
        with open(args.fromPickle, 'rb') as file_object:
            deserialized = pickle.load(file_object)
        is_pickle = True

    else:
        Logger.print_advances('Validating input data')
        if args.gtf:
            gtf = GTF(args.gtf, args.gtf_is_processed, args.output)
        else:
            gtf = None

        mutation = Mutation(args.vcf, args.bed, args.fasta, gtf, args.list,
                            args.chr, args.output)
        deserialized = {}
        Logger.print_advances('Starting analysis')
        for name, bedobj in mutation.beds.items():
            Logger.print_advances("Processing {} peak file.".format(name))
            Logger.log("Intercepting variants")
            fn = mutation.vcf_intersect(mutation.vcf_bed, bedobj, name)

            Logger.log('Extracting peaks fasta sequences')
            bed_seq = mutation.get_peak_sequence(bedobj, mutation.fasta)
            bed_peak_fasta = mutation.save_fasta_sequence(
                bed_seq, osutils.set_out_fn(mutation.outdir, name + ".fasta"))

            Logger.log("Mutating fasta sequences")
            isec = Isec(fn)
            seqs_mut = mutation.mutate_fasta(bed_peak_fasta, isec)
            deserialized[name] = seqs_mut
            Logger.log("Done")

        # persist the results so a later run can resume via --fromPickle
        Logger.log("Dumping data structure to {}".format("data.pickle"))
        pickle_path = osutils.set_out_fn(mutation.outdir, "data.pickle")
        with open(pickle_path, 'wb') as file_object:
            pickle.dump(deserialized, file_object)

    motdisrupt = PeaksMutated(deserialized, is_pickle, args.output)
    motdisrupt.list_beds()
    motdisrupt.write_object()
    pwm = PWMs()
    pwm.parse_cisBP_pwm()
Ejemplo n.º 10
0
def assemble(**kwargs):
    '''
    Run the assembly over every locus of the transfrags GTF file.

    kwargs: dict containing arguments and input/output file locations

    Configuration attributes:
    - guided_strand
    - guided_ends
    - guided_assembly
    - change_point
    - change_point_pvalue
    - change_point_fold_change
    - change_point_trim
    - path_graph_kmax
    - path_graph_loss_threshold
    - path_frac
    - max_paths
    - isoform_frac
    - max_isoforms

    Input file attributes:
    - transfrags_gtf_file
    - chrom_sizes_file

    Output file attributes:
    - unresolved_bg_files
    - resolved_bg_files
    - splice_bed_file
    - expr_h5_file
    - splice_graph_gtf_file
    - path_graph_stats_file
    - assembly_loss_gtf_file
    - assembly_gtf_file
    - assembly_bed_file

    All output handles are stored on the Config object, handed to
    assemble_locus() for each locus, and closed before returning - also
    when processing a locus raises.
    '''
    config = Config(**kwargs)
    # setup bedgraph output files (only the filename of each pair is used)
    for _, filename in config.unresolved_bg_files:
        config.unresolved_bg_fhs.append(open(filename, 'w'))
    for _, filename in config.resolved_bg_files:
        config.resolved_bg_fhs.append(open(filename, 'w'))
    # setup junction bed file
    config.splice_bed_fh = Locus.open_splice_bed(config.splice_bed_file)
    # setup expression hdf5
    config.expr_h5fh = Locus.open_expression_hdf5(config.expr_h5_file,
                                                  config.chrom_sizes_file)
    # splice graph gtf file
    config.splice_graph_gtf_fh = open(config.splice_graph_gtf_file, 'w')
    # path graph stats file, starting with its header line
    config.path_graph_stats_fh = open(config.path_graph_stats_file, 'w')
    fields = ['locus', 'k', 'kmax', 'transfrags', 'nodes', 'kmers',
              'short_transfrags', 'lost_kmers', 'tot_expr', 'lost_expr',
              'lost_expr_frac', 'valid']
    config.path_graph_stats_fh.write('\t'.join(fields) + '\n')

    # assembly gtf and bed files
    config.assembly_loss_gtf_fh = open(config.assembly_loss_gtf_file, 'w')
    config.assembly_gtf_fh = open(config.assembly_gtf_file, 'w')
    config.assembly_bed_fh = open(config.assembly_bed_file, 'w')

    try:
        # parse gtf file; 'with' releases the input handle when done
        with open(config.transfrags_gtf_file) as gtf_fh:
            for interval, gtf_lines in GTF.parse_loci(gtf_fh):
                chrom, start, end = interval
                logging.debug('Locus %s:%d-%d: ', chrom, start, end)
                assemble_locus(gtf_lines, config)
    finally:
        # cleanup and close files so partial output is flushed even when
        # a locus fails
        config.assembly_gtf_fh.close()
        config.assembly_bed_fh.close()
        config.assembly_loss_gtf_fh.close()
        config.path_graph_stats_fh.close()
        config.splice_graph_gtf_fh.close()
        config.expr_h5fh.close()
        config.splice_bed_fh.close()
        Locus.close_bedgraphs(config.unresolved_bg_fhs)
        Locus.close_bedgraphs(config.resolved_bg_fhs)