コード例 #1
0
def make_chimera(cluster_pair, cluster_shelve, transcript_dict,
                 genome_tx_trees, annotation_source):
    """Assemble a Chimera record describing one 5'/3' cluster pair.

    cluster_pair supplies the ids of the two clusters (id5p / id3p), the
    pair id used to name the chimera, and the discordant and spanning
    read names.  cluster_shelve is indexed by the stringified cluster id.
    transcript_dict, genome_tx_trees and annotation_source are passed
    through unchanged to the transcript lookup / annotation helpers.

    Returns the populated Chimera object.
    """
    # resolve both clusters from the shelve (keys are str(cluster id))
    upstream = cluster_shelve[str(cluster_pair.id5p)]
    downstream = cluster_shelve[str(cluster_pair.id3p)]

    # transcripts associated with each cluster
    upstream_txs = lookup_transcripts(upstream, transcript_dict,
                                      genome_tx_trees)
    downstream_txs = lookup_transcripts(downstream, transcript_dict,
                                        genome_tx_trees)

    # classify the pair and get the distance reported with that type
    kind, dist = get_chimera_type(upstream, downstream,
                                  upstream_txs, downstream_txs,
                                  transcript_dict, genome_tx_trees)

    # transcript names, gene names and biotypes for each side
    up_tx_names, up_genes, up_biotypes = get_transcript_info(
        upstream_txs, annotation_source)
    down_tx_names, down_genes, down_biotypes = get_transcript_info(
        downstream_txs, annotation_source)

    chimera = Chimera()

    # pair-level fields
    chimera.chimera_id = "CHIMERA%d" % (cluster_pair.pair_id)
    chimera.chimera_type = kind
    chimera.distance = dist
    # a read name present in both the discordant and the spanning
    # collections is counted only once in num_frags
    chimera.num_frags = len(
        set(cluster_pair.qnames).union(cluster_pair.spanning_qnames))
    chimera.num_discordant_frags = len(cluster_pair.qnames)
    chimera.num_spanning_frags = len(cluster_pair.spanning_qnames)

    # 5' side
    chimera.rname5p = upstream.rname
    chimera.start5p = upstream.start
    chimera.end5p = upstream.end
    chimera.strand5p = upstream.strand
    chimera.num_discordant_frags_5p = len(upstream.qnames)
    chimera.num_concordant_frags_5p = upstream.concordant_frags
    chimera.biotypes_5p = sorted(up_biotypes)
    chimera.genes_5p = sorted(up_genes)
    chimera.transcripts_5p = sorted(up_tx_names)

    # 3' side
    chimera.rname3p = downstream.rname
    chimera.start3p = downstream.start
    chimera.end3p = downstream.end
    chimera.strand3p = downstream.strand
    chimera.num_discordant_frags_3p = len(downstream.qnames)
    chimera.num_concordant_frags_3p = downstream.concordant_frags
    chimera.biotypes_3p = sorted(down_biotypes)
    chimera.genes_3p = sorted(down_genes)
    chimera.transcripts_3p = sorted(down_tx_names)

    return chimera
コード例 #2
0
ファイル: write_output.py プロジェクト: BioXiao/chimerascan
def make_chimera(cluster_pair,
                 cluster_shelve,
                 transcript_dict,
                 genome_tx_trees,
                 annotation_source):
    """Create the Chimera object for a pair of clusters.

    The 5' and 3' clusters are read from cluster_shelve using the
    string form of cluster_pair.id5p / cluster_pair.id3p.  Their
    transcripts, the chimera type/distance and the transcript
    annotation (names, genes, biotypes) come from the lookup helpers,
    which receive transcript_dict, genome_tx_trees and
    annotation_source as given.

    Returns a Chimera with the pair-level and per-side fields set.
    """
    # lookup 5' and 3' clusters
    c5 = cluster_shelve[str(cluster_pair.id5p)]
    c3 = cluster_shelve[str(cluster_pair.id3p)]
    # get 5' and 3' transcripts
    txs5 = lookup_transcripts(c5, transcript_dict, genome_tx_trees)
    txs3 = lookup_transcripts(c3, transcript_dict, genome_tx_trees)
    # lookup chimera type and distance
    chimera_type, distance = get_chimera_type(c5, c3, txs5, txs3,
                                              transcript_dict,
                                              genome_tx_trees)
    # format transcript information
    names5, genes5, biotypes5 = get_transcript_info(txs5, annotation_source)
    names3, genes3, biotypes3 = get_transcript_info(txs3, annotation_source)

    record = Chimera()

    # fields describing the pair as a whole
    record.chimera_id = "CHIMERA%d" % (cluster_pair.pair_id)
    record.chimera_type = chimera_type
    record.distance = distance
    # union of discordant and spanning read names, so shared names
    # are counted once
    all_qnames = set(cluster_pair.qnames)
    all_qnames |= set(cluster_pair.spanning_qnames)
    record.num_frags = len(all_qnames)
    record.num_discordant_frags = len(cluster_pair.qnames)
    record.num_spanning_frags = len(cluster_pair.spanning_qnames)

    # fields that exist once per side; the attribute names differ only
    # in their "5p" / "3p" suffix
    sides = (
        ("5p", c5, names5, genes5, biotypes5),
        ("3p", c3, names3, genes3, biotypes3),
    )
    for suffix, cluster, tx_names, gene_names, biotypes in sides:
        setattr(record, "rname" + suffix, cluster.rname)
        setattr(record, "start" + suffix, cluster.start)
        setattr(record, "end" + suffix, cluster.end)
        setattr(record, "strand" + suffix, cluster.strand)
        setattr(record, "num_discordant_frags_" + suffix,
                len(cluster.qnames))
        setattr(record, "num_concordant_frags_" + suffix,
                cluster.concordant_frags)
        setattr(record, "biotypes_" + suffix, sorted(biotypes))
        setattr(record, "genes_" + suffix, sorted(gene_names))
        setattr(record, "transcripts_" + suffix, sorted(tx_names))
    return record
コード例 #3
0
def write_output(input_file, bam_file, output_file, index_dir):
    """Group chimera isoforms and write one tab-delimited row per group.

    Parameters:
      input_file  -- chimera file handed to get_chimera_groups()
      bam_file    -- BAM file opened with pysam and passed to
                     get_wildtype_frags() for the isoform fractions
      output_file -- path of the tab-delimited file to write
      index_dir   -- directory containing config.TRANSCRIPT_FEATURE_FILE

    For every group yielded by get_chimera_groups() the chimera chosen
    by get_best_coverage_chimera() supplies the coordinates, counts and
    spanning reads, while the transcript ids, gene names and chimera
    names of all members of the group are collected into comma-joined
    columns.  Rows are sorted in descending order of unique alignment
    positions, then spanning fragments, then total fragments.

    Returns config.JOB_SUCCESS.
    """
    # read transcripts
    logging.debug("Reading transcripts")
    transcript_file = os.path.join(index_dir, config.TRANSCRIPT_FEATURE_FILE)
    # materialize the features inside the 'with' so the file is closed
    # as soon as parsing is finished
    with open(transcript_file) as transcript_fh:
        transcripts = list(TranscriptFeature.parse(transcript_fh))
    # build a lookup table to get genome coordinates from transcript
    # coordinates
    transcript_genome_map = build_transcript_genome_map(transcripts)
    tx_id_map = build_transcript_map(transcripts)
    genome_tx_trees = build_genome_transcript_trees(transcripts)
    # open BAM file for checking wild-type isoform
    bamfh = pysam.Samfile(bam_file, "rb")
    # group chimera isoforms together
    lines = []
    chimera_clusters = 0
    try:
        for _key, chimeras in get_chimera_groups(input_file, tx_id_map):
            txs5p = set()
            txs3p = set()
            genes5p = set()
            genes3p = set()
            names = set()
            for isoform in chimeras:
                # tx_end is decremented by one before being reported
                txs5p.add("%s:%d-%d" % (isoform.tx_name_5p,
                                        isoform.tx_start_5p,
                                        isoform.tx_end_5p - 1))
                txs3p.add("%s:%d-%d" % (isoform.tx_name_3p,
                                        isoform.tx_start_3p,
                                        isoform.tx_end_3p - 1))
                genes5p.add(isoform.gene_name_5p)
                genes3p.add(isoform.gene_name_3p)
                names.add(isoform.name)
            c = get_best_coverage_chimera(chimeras)
            # get chimera type and distance between genes
            chimera_type, distance = get_chimera_type(tx_id_map[c.tx_name_5p],
                                                      tx_id_map[c.tx_name_3p],
                                                      genome_tx_trees)
            # get genomic positions of chimera
            chrom5p, strand5p, start5p = transcript_to_genome_pos(
                c.tx_name_5p, c.tx_start_5p, transcript_genome_map)
            chrom5p, strand5p, end5p = transcript_to_genome_pos(
                c.tx_name_5p, c.tx_end_5p - 1, transcript_genome_map)
            # strand value 1 is reported as "-" below; swap the two
            # converted positions in that case
            if strand5p == 1:
                start5p, end5p = end5p, start5p
            chrom3p, strand3p, start3p = transcript_to_genome_pos(
                c.tx_name_3p, c.tx_start_3p, transcript_genome_map)
            chrom3p, strand3p, end3p = transcript_to_genome_pos(
                c.tx_name_3p, c.tx_end_3p - 1, transcript_genome_map)
            if strand3p == 1:
                start3p, end3p = end3p, start3p
            # get breakpoint spanning sequences, keeping only the first
            # read seen for each distinct sequence
            spanning_seqs = set()
            spanning_fasta_lines = []
            for dr in c.get_spanning_reads():
                if dr.seq in spanning_seqs:
                    continue
                spanning_seqs.add(dr.seq)
                spanning_fasta_lines.extend(
                    [">%s/%d;pos=%d;strand=%s" %
                     (dr.qname, dr.readnum + 1, dr.pos,
                      "-" if dr.is_reverse else "+"),
                     dr.seq])
            # get isoform fraction
            num_wt_frags_5p, num_wt_frags_3p = get_wildtype_frags(c, bamfh)
            num_chimeric_frags = c.get_num_frags()
            frac5p = (float(num_chimeric_frags) /
                      (num_chimeric_frags + num_wt_frags_5p))
            frac3p = (float(num_chimeric_frags) /
                      (num_chimeric_frags + num_wt_frags_3p))
            # setup fields of BEDPE file; the set-valued columns are
            # sorted so the column text does not depend on set
            # iteration order
            fields = [chrom5p, start5p, end5p,
                      chrom3p, start3p, end3p,
                      "CLUSTER%d" % (chimera_clusters),
                      num_chimeric_frags,
                      "+" if (strand5p == 0) else "-",
                      "+" if (strand3p == 0) else "-",
                      ','.join(sorted(txs5p)),
                      ','.join(sorted(txs3p)),
                      ','.join(sorted(genes5p)),
                      ','.join(sorted(genes3p)),
                      chimera_type, distance,
                      num_chimeric_frags,
                      c.get_num_spanning_frags(),
                      c.get_num_unique_positions(),
                      frac5p, frac3p,
                      ','.join(spanning_fasta_lines),
                      ','.join(sorted(names))]
            lines.append(fields)
            chimera_clusters += 1
    finally:
        # release the BAM handle even if a group fails to process
        bamfh.close()
    logging.debug("Clustered chimeras: %d", chimera_clusters)
    # sort: indexes 18, 17, 16 are unique positions, spanning frags and
    # total frags
    lines.sort(key=operator.itemgetter(18, 17, 16), reverse=True)
    header = ['#chrom5p', 'start5p', 'end5p',
              'chrom3p', 'start3p', 'end3p',
              'chimera_cluster_id', 'score',
              'strand5p', 'strand3p',
              'transcript_ids_5p', 'transcript_ids_3p',
              'genes5p', 'genes3p',
              'type', 'distance',
              'total_frags',
              'spanning_frags',
              'unique_alignment_positions',
              'isoform_fraction_5p',
              'isoform_fraction_3p',
              'breakpoint_spanning_reads',
              'chimera_ids']
    # f.write() emits the same bytes as the former 'print >>f' statement
    # and the 'with' block closes the file even when a write fails
    with open(output_file, "w") as f:
        f.write('\t'.join(header) + '\n')
        for fields in lines:
            f.write('\t'.join(map(str, fields)) + '\n')
    return config.JOB_SUCCESS