Ejemplo n.º 1
0
def main():
    '''Run the peak-to-motif workflow for the file named on the command line.

    ``sys.argv[1]`` is the input file handed to ``assign_peak_to_tx``; its
    base name (cut at the first '.') prefixes every output file.  When
    exactly one further argument is given it is used as a gene-list file:
    the peaks are split into an "in list" and an "other" table, each is
    written out, and ``call_meme`` is invoked with ``split=True``.
    '''
    gff3 = '/home/jordan/GENOMES/CNA3_all_transcripts.gff3'
    fasta = '/home/jordan/GENOMES/H99_fa.json'
    chrom_lengths = '/home/jordan/GENOMES/H99_chrom_lengths.json'

    peak_file = sys.argv[1]
    prefix = peak_file.split('/')[-1].split('.')[0]
    print(prefix)

    tx_dict = SP.build_transcript_dict(gff3)
    tx_by_chrom = sort_tx_by_chrom(tx_dict)
    int_dict = make_promoter_dict(tx_dict, chrom_lengths)
    peak_df = assign_peak_to_tx(tx_by_chrom, int_dict, peak_file, cutoff=2)
    peak_df = find_best_peaks(peak_df, int_dict, max_genes=300)

    # minsites is 75% of the number of rows, capped at this value.
    max_sites = 600
    split = len(sys.argv) == 3
    if split:
        in_list, other = split_by_gene(peak_df, sys.argv[2])
        in_list.to_csv(prefix + '_by_gene_in_list.csv')
        other.to_csv(prefix + '_by_gene_other.csv')
        generate_sequence_file(in_list, int_dict, fasta, prefix + '_in_list')
        generate_sequence_file(other, int_dict, fasta, prefix + '_other')
        minsites = [min(int(0.75 * len(in_list)), max_sites),
                    min(int(0.75 * len(other)), max_sites)]
    else:
        peak_df.to_csv(prefix + '_by_gene.csv')
        generate_sequence_file(peak_df, int_dict, fasta, prefix)
        minsites = min(int(0.75 * len(peak_df)), max_sites)
    call_meme(prefix, minsites, split=split)
Ejemplo n.º 2
0
def generate_all_ss_seqs(gff3, fasta_dict, organism):
    '''Collect the 8-character windows at both ends of every annotated intron.

    Parameters
    ----------
    gff3 : str
            Annotation file passed to the SP helpers
    fasta_dict : dict
            Chromosome name -> sequence (indexed by slicing)
    organism : str or None
            'pombe' selects the '.1' isoform suffix, anything else 'T0'

    Returns
    ------
    all_seq5, all_seq3 : list, list
            Windows taken around intron[0] and around intron[1]; windows on
            the '-' strand are reverse-complemented'''
    transcript_dict = SP.build_transcript_dict(gff3, organism=organism)
    ss, flag = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss)

    # Window start relative to the intron coordinate, per strand.
    start_offset5 = {'+': -1, '-': -6}
    start_offset3 = {'+': -5, '-': -2}
    width = 8

    all_seq5 = []
    all_seq3 = []
    for transcript, introns in ss_dict.iteritems():
        isoform = transcript + ('.1' if organism == 'pombe' else 'T0')
        strand = transcript_dict[isoform][2]
        chrom = transcript_dict[isoform][3]

        for intron in introns:
            # No else branch on purpose: for any other strand value the
            # previous seq5 / seq3 is kept, exactly as the if/elif did.
            if strand in start_offset5:
                begin = intron[0] + start_offset5[strand]
                seq5 = fasta_dict[chrom][begin:begin + width]
                if strand == '-':
                    seq5 = SP.reverse_complement(seq5)
            all_seq5.append(seq5)

            if strand in start_offset3:
                begin = intron[1] + start_offset3[strand]
                seq3 = fasta_dict[chrom][begin:begin + width]
                if strand == '-':
                    seq3 = SP.reverse_complement(seq3)
            all_seq3.append(seq3)
    return all_seq5, all_seq3
Ejemplo n.º 3
0
def generate_all_ss_seqs(gff3, fasta_dict, organism):
    '''Return the sequence windows flanking each end of every intron.

    The transcript dictionary and splice sites are read from ``gff3`` via the
    SP helpers.  For each intron an 8-character slice of ``fasta_dict[chrom]``
    is taken around ``intron[0]`` and another around ``intron[1]``; slices on
    the '-' strand are reverse-complemented.  Returns the two lists
    ``(all_seq5, all_seq3)``.
    '''
    transcript_dict = SP.build_transcript_dict(gff3, organism=organism)
    ss, flag = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss)

    if organism == 'pombe':
        isoform_suffix = '.1'
    else:
        isoform_suffix = 'T0'

    all_seq5, all_seq3 = [], []
    for transcript, introns in ss_dict.iteritems():
        isoform = transcript + isoform_suffix
        strand = transcript_dict[isoform][2]
        chrom = transcript_dict[isoform][3]
        plus = strand == '+'
        minus = strand == '-'

        for intron in introns:
            # Window around intron[0]
            if plus:
                left = intron[0] - 1
                seq5 = fasta_dict[chrom][left:left + 8]
            elif minus:
                left = intron[0] - 6
                seq5 = SP.reverse_complement(fasta_dict[chrom][left:left + 8])
            all_seq5.append(seq5)

            # Window around intron[1]
            if plus:
                left = intron[1] - 5
                seq3 = fasta_dict[chrom][left:left + 8]
            elif minus:
                left = intron[1] - 2
                seq3 = SP.reverse_complement(fasta_dict[chrom][left:left + 8])
            all_seq3.append(seq3)
    return all_seq5, all_seq3
Ejemplo n.º 4
0
def build_transcript_dict(gff3="/home/jordan/GENOMES/POMBE/schizosaccharomyces_pombe.chr.gff3", expand=False, convert_chroms=False):
    '''Build the pombe transcript dictionary, optionally renamed and padded.

    Parameters
    ----------
    gff3 : str
            Annotation file passed to SP.build_transcript_dict
    expand : bool
            When True, every transcript span is widened by 300 on each side,
            clipped at 0 and (for chromosomes with a listed length) at the
            chromosome end; empty CDS lists are replaced by the original
            transcript start / end
    convert_chroms : bool
            When True, chromosome names are mapped chr1/chr2/chr3/MT to
            I/II/III/MT

    Returns
    ------
    transcript_dict : dict
            transcript -> [start, end, strand, chromosome, CDS start, CDS end]'''
    transcript_dict = SP.build_transcript_dict(gff3, organism='pombe')

    lat_rom = {'chr1': 'I', 'chr2': 'II', 'chr3': 'III', 'MT': 'MT'}

    if convert_chroms is True:
        renamed = {}
        for name, fields in transcript_dict.items():
            start, end, strand, chrom, cds_start, cds_end = fields
            renamed[name] = [start, end, strand, lat_rom[chrom], cds_start, cds_end]
        transcript_dict = renamed

    chrom_lengths = {'I': 5818680, 'II': 4744158, 'III': 2598968,
                     'chr1': 5818680, 'chr2': 4744158, 'chr3': 2598968}

    if expand is True:
        padding = 300
        expanded_dict = {}
        for tx, info in transcript_dict.iteritems():
            new_start = max(info[0] - padding, 0)
            new_end = info[1] + padding
            chrom = info[3]
            if chrom in chrom_lengths:
                new_end = min(new_end, chrom_lengths[chrom])
            # The CDS fallbacks are written back into info itself.
            if len(info[4]) == 0:
                info[4] = [info[0]]
            if len(info[5]) == 0:
                info[5] = [info[1]]
            expanded_dict[tx] = [new_start, new_end, info[2], chrom, info[4], info[5]]
        transcript_dict = expanded_dict

    return transcript_dict
Ejemplo n.º 5
0
def main():
    '''Assign peaks to transcripts, write tables and sequences, call MEME.

    Reads the input file name from ``sys.argv[1]`` and, optionally, a
    gene-list file from ``sys.argv[2]``.  Without a gene list one table and
    one sequence file are written; with it the peaks are split in two and
    each half is written separately.
    '''
    gff3 = '/home/jordan/GENOMES/CNA3_all_transcripts.gff3'
    fasta = '/home/jordan/GENOMES/H99_fa.json'
    chrom_lengths = '/home/jordan/GENOMES/H99_chrom_lengths.json'

    input_path = sys.argv[1]
    prefix = input_path.split('/')[-1].split('.')[0]
    print(prefix)

    tx_dict = SP.build_transcript_dict(gff3)
    tx_by_chrom = sort_tx_by_chrom(tx_dict)
    int_dict = make_promoter_dict(tx_dict, chrom_lengths)
    peak_df = find_best_peaks(
        assign_peak_to_tx(tx_by_chrom, int_dict, input_path, cutoff=2),
        int_dict, max_genes=300)

    def site_count(frame):
        # 75% of the rows, never more than 600
        return min(int(0.75 * len(frame)), 600)

    if len(sys.argv) != 3:
        peak_df.to_csv(prefix + '_by_gene.csv')
        generate_sequence_file(peak_df, int_dict, fasta, prefix)
        call_meme(prefix, site_count(peak_df), split=False)
        return

    gene_list_file = sys.argv[2]
    in_list, other = split_by_gene(peak_df, gene_list_file)
    in_list.to_csv(prefix + '_by_gene_in_list.csv')
    other.to_csv(prefix + '_by_gene_other.csv')
    generate_sequence_file(in_list, int_dict, fasta, prefix + '_in_list')
    generate_sequence_file(other, int_dict, fasta, prefix + '_other')
    call_meme(prefix, [site_count(in_list), site_count(other)], split=True)
Ejemplo n.º 6
0
def get_sequence(coord_dict, gff3_file, fasta_file):
    '''Extract and classify the sequence window around each coordinate.

    Parameters
    ----------
    coord_dict : dict
            transcript -> pair of coordinate collections; element 0 is
            tested for a 5' motif, element 1 for a 3' motif
    gff3_file : str
            Annotation file; 'pombe' in the name selects organism='pombe'
    fasta_file : str or dict
            A str is loaded with make_fasta_dict, anything else is used
            directly as chromosome -> sequence

    Returns
    ------
    seq_dict : dict
            transcript -> list of (sequence, seq_type) where seq_type is
            "5'", "3'" or 'other' '''
    organism = 'pombe' if 'pombe' in gff3_file else None
    transcript_dict = SP.build_transcript_dict(gff3_file, organism=organism)
    fasta_dict = make_fasta_dict(fasta_file) if type(fasta_file) is str else fasta_file

    # (index into coord_sets, part of the window to inspect,
    #  accepted dinucleotides, label given on a match)
    site_rules = (
        (0, slice(10, 12), ('GT', 'GC'), "5'"),
        (1, slice(8, 10), ('AG',), "3'"),
    )

    seq_dict = {}
    for transcript, coord_sets in coord_dict.iteritems():
        seq_dict[transcript] = []
        tx_info = transcript_dict[transcript]
        chrom = tx_info[3]
        strand = tx_info[2]
        for set_index, motif_slice, motifs, label in site_rules:
            for coord in coord_sets[set_index]:
                if strand == "+":
                    sequence = fasta_dict[chrom][(coord - 9):(coord + 11)]
                elif strand == "-":
                    sequence = fasta_dict[chrom][(coord - 10):(coord + 10)]
                    sequence = SP.reverse_complement(sequence)

                seq_type = label if sequence[motif_slice] in motifs else 'other'
                seq_dict[transcript].append((sequence, seq_type))

    return seq_dict
Ejemplo n.º 7
0
def get_sequence(coord_dict, gff3_file, fasta_file):
    '''Pull a 20-character window around every coordinate and label it.

    Windows from ``coord_sets[0]`` are labelled "5'" when characters 10:12
    are GT or GC; windows from ``coord_sets[1]`` are labelled "3'" when
    characters 8:10 are AG; everything else is labelled 'other'.  Windows on
    the '-' strand are reverse-complemented before the check.

    Returns a dict mapping transcript -> list of (sequence, label) tuples.
    '''
    if 'pombe' in gff3_file:
        organism = 'pombe'
    else:
        organism = None
    transcript_dict = SP.build_transcript_dict(gff3_file, organism=organism)

    # A str is treated as a file to load; anything else is used as the
    # chromosome -> sequence mapping itself.
    fasta_dict = fasta_file
    if type(fasta_file) is str:
        fasta_dict = make_fasta_dict(fasta_file)

    seq_dict = {}
    for transcript, coord_sets in coord_dict.iteritems():
        entries = []
        seq_dict[transcript] = entries
        chrom = transcript_dict[transcript][3]
        strand = transcript_dict[transcript][2]

        for coord in coord_sets[0]:
            if strand == "+":
                sequence = fasta_dict[chrom][(coord - 9):(coord + 11)]
            elif strand == "-":
                sequence = SP.reverse_complement(
                    fasta_dict[chrom][(coord - 10):(coord + 10)])
            dinucleotide = sequence[10:12]
            is_five_prime = dinucleotide == 'GT' or dinucleotide == 'GC'
            entries.append((sequence, "5'" if is_five_prime else 'other'))

        for coord in coord_sets[1]:
            if strand == "+":
                sequence = fasta_dict[chrom][(coord - 9):(coord + 11)]
            elif strand == "-":
                sequence = SP.reverse_complement(
                    fasta_dict[chrom][(coord - 10):(coord + 10)])
            entries.append(
                (sequence, "3'" if sequence[8:10] == 'AG' else 'other'))

    return seq_dict
Ejemplo n.º 8
0
def build_tss_dict(gff3="/home/jordan/GENOMES/POMBE/schizosaccharomyces_pombe.chr.gff3", window=220):
    '''Build a window of +/- ``window`` around the start of each transcript.

    The start is field 0 of the transcript entry on the '+' strand and
    field 1 on the '-' strand; transcripts with any other strand value are
    left out.

    Returns
    ------
    tss_dict : dict
            transcript -> [start, end, strand, chromosome]'''
    transcript_dict = SP.build_transcript_dict(gff3, organism='pombe')

    tss_dict = {}
    for tx, info in transcript_dict.iteritems():
        strand = info[2]
        if strand == '+':
            tss = info[0]
        elif strand == '-':
            tss = info[1]
        else:
            continue
        tss_dict[tx] = [tss - window, tss + window, strand, info[3]]
    return tss_dict
Ejemplo n.º 9
0
def build_tss_dict(
        gff3="/home/jordan/GENOMES/POMBE/schizosaccharomyces_pombe.chr.gff3",
        window=220):
    '''Return transcript -> [start, end, strand, chromosome] around each TSS.

    ``start`` and ``end`` lie ``window`` below and above the transcript
    field used as TSS: field 0 for '+' transcripts, field 1 for '-'
    transcripts.  Other strand values produce no entry.
    '''
    transcript_dict = SP.build_transcript_dict(gff3, organism='pombe')

    # Which field of the transcript entry holds the TSS, per strand.
    tss_field = {'+': 0, '-': 1}

    tss_dict = {}
    for tx, info in transcript_dict.iteritems():
        if info[2] not in tss_field:
            continue
        centre = info[tss_field[info[2]]]
        start = centre - window
        end = centre + window
        tss_dict[tx] = [start, end, info[2], info[3]]
    return tss_dict
Ejemplo n.º 10
0
def peak_to_seq_pipeline(untagged_peak_file, tagged1_peak_file, tagged2_peak_file, gff3, fasta, junction_df=None, branch_df=None, cutoff=5, name='CP_peaks'):
    '''Go from three peak files to an annotated peak table with sequences.

    Steps: find peaks per gene in each file, compare the three sets, compare
    the result to the annotated splice sites, add a 'genome coord' column,
    attach sequences, and write ``name + '.bedgraph'``.

    Parameters
    ----------
    untagged_peak_file, tagged1_peak_file, tagged2_peak_file : str
            Peak files passed to CP_peaks_by_gene
    gff3 : str
            Annotation file; 'pombe' in the name selects organism='pombe'
    fasta : str or dict
            A str is loaded with SP.make_fasta_dict
    junction_df, branch_df
            Not used in this function
    cutoff : int
            Passed to CP_peaks_by_gene
    name : str
            Prefix of the bedgraph file

    Returns
    ------
    peak_seq_df : pandas.DataFrame
            Result of add_sequence_to_df'''
    organism = 'pombe' if 'pombe' in gff3 else None

    transcript_dict = SP.build_transcript_dict(gff3, organism=organism)
    print("Finding peaks in transcripts...")

    replicate_peaks = []
    for peak_file in (untagged_peak_file, tagged1_peak_file, tagged2_peak_file):
        print(peak_file)
        replicate_peaks.append(
            CP_peaks_by_gene(peak_file, transcript_dict, cutoff=cutoff))
    untagged, tagged1, tagged2 = replicate_peaks

    print("Comparing peaks between replicates...")
    peaks = CP_compare_reps(untagged, tagged1, tagged2)

    print("Checking peaks against annotation...")
    ss_dict, flag = SP.list_splice_sites(gff3, organism=organism)
    peak_df = CP_compare_to_annotation(peaks, ss_dict, transcript_dict)
    peak_df = collapse_unpredicted_peaks(peak_df)
    position_text = peak_df['position'].apply(int).apply(str)
    peak_df['genome coord'] = peak_df['chromosome'].str.cat(position_text, sep=':')

    if type(fasta) == str:
        fasta = SP.make_fasta_dict(fasta)
    print("Adding sequences...")
    peak_seq_df = add_sequence_to_df(peak_df, fasta, flag=flag)

    print("Writing bedgraph...")
    # NOTE(review): the newline is joined as a fifth field, so every row ends
    # with a tab before the line break, and '-' strand rows carry position-1
    # in the third column (smaller than the second) -- confirm that the
    # consumers of this file expect that layout.
    with open(name + '.bedgraph', 'w') as fout:
        for ix, r in peak_seq_df.iterrows():
            if r['strand'] == '+':
                position2 = r['position'] + 1
                height = r['height']
            elif r['strand'] == '-':
                position2 = r['position'] - 1
                height = r['height'] * -1
            fields = (r['chromosome'], r['position'], position2, height, '\n')
            fout.write('\t'.join(str(x) for x in fields))

    print("Completed")
    return peak_seq_df
Ejemplo n.º 11
0
def collect_intron_seq(gff3_file, fasta_file, ss_dict=None, junction_bed=None, gene_list=None, peak_df=None, organism=None):
    '''Collect the 15-character sequence just inside the start of each intron.

    The intron list comes from the first available source, in this order:
    ``ss_dict`` as given, junctions built from ``junction_bed``, positions in
    ``peak_df`` (rows whose 'type' does not contain 'prime'), or the
    annotated splice sites in ``gff3_file``.

    Parameters
    ----------
    gff3_file : str
            Annotation file
    fasta_file : dict or str
            A dict is used directly; a name ending in 'json' is loaded with
            json; anything else goes through make_fasta_dict
    ss_dict, junction_bed, peak_df
            Alternative intron sources, see above
    gene_list
            Passed to SP.list_splice_sites when the annotation is used
    organism : str or None
            'pombe' selects the '.1' isoform suffix, anything else 'T0'

    Returns
    ------
    seq_dict : dict
            'transcript-chrom:position' -> sequence ('-' strand sequences
            are reverse-complemented)'''
    transcript_dict = SP.build_transcript_dict(gff3_file, organism=organism)

    if type(fasta_file) == dict:
        fasta_dict = fasta_file
    elif fasta_file.endswith('json'):
        with open(fasta_file, 'r') as handle:
            fasta_dict = json.load(handle)
    else:
        fasta_dict = make_fasta_dict(fasta_file)

    if ss_dict is None:
        if junction_bed is not None:
            ss_dict = SP.build_junction_dict(junction_bed, gff3_file, transcript_dict, organism=organism)
        elif peak_df is not None:
            ss_dict = {}
            peak_df = peak_df[~peak_df['type'].str.contains('prime')]
            for ix, r in peak_df.iterrows():
                sites = ss_dict.setdefault(r['transcript'], [])
                if r['strand'] == '+':
                    sites.append((r['position'], r['position'] + 50))
                elif r['strand'] == '-':
                    sites.append((r['position'], r['position'] - 50))
        else:
            ss_dict, intron_flag = SP.list_splice_sites(gff3_file, gene_list=gene_list, organism=organism)
            ss_dict = SP.collapse_ss_dict(ss_dict)

    seq_dict = {}
    for transcript, introns in ss_dict.iteritems():
        # Keys from a junction bed are used as they are; all other sources
        # get the isoform suffix appended.
        if junction_bed is None:
            transcript = transcript + ('.1' if organism == 'pombe' else 'T0')
        strand = transcript_dict[transcript][2]
        chrom = transcript_dict[transcript][3]
        for intron in list(introns):
            if strand == '+':
                seq = fasta_dict[chrom][intron[0] + 2:intron[0] + 17]
                seq_dict[transcript + '-' + chrom + ':' + str(intron[0] + 1)] = seq
            elif strand == '-':
                seq = fasta_dict[chrom][intron[0] - 16:intron[0] - 1]
                seq_dict[transcript + '-' + chrom + ':' + str(intron[0])] = SP.reverse_complement(seq)
    return seq_dict
Ejemplo n.º 12
0
def create_branch_df(branch_dict, gff3, fa_dict, organism=None):
    '''Turn a dictionary of branch observations into a dataframe.

    Parameters
    ----------
    branch_dict : dict
            transcript -> list of entries; entry[0] is a 'chrom:position'
            string, entry[1] the branch positions and entry[2] the matching
            depths
    gff3 : str
            Annotation file
    fa_dict
            Passed to add_seq
    organism : str or None
            Passed to the SP helpers and find_3p_site

    Returns
    ------
    branch_df : pandas.DataFrame
            One row per branch that lies more than 5 and at most 1000 away
            from the 5p site with depth of at least 5, restricted to rows
            whose strand-aware distance is positive, after add_seq and
            find_3p_site'''
    tx_dict = SP.build_transcript_dict(gff3, organism=organism)

    transcripts, chroms, fives = [], [], []
    branches, depths, distances, strands = [], [], [], []

    for tx, five_sites in branch_dict.iteritems():
        for five_site in five_sites:
            location = five_site[0].split(':')
            chrom = location[0]
            pos = int(location[1])
            for n, branch in enumerate(five_site[1]):
                if 5 < abs(branch - pos) <= 1000 and five_site[2][n] >= 5:
                    chroms.append(chrom)
                    fives.append(pos)
                    transcripts.append(tx)
                    branches.append(branch)
                    depths.append(five_site[2][n])
                    strand = tx_dict[tx][2]
                    strands.append(strand)
                    # Distance is measured in the direction of transcription.
                    if strand == '+':
                        distances.append(branch - pos)
                    elif strand == '-':
                        distances.append(pos - branch)

    branch_df = pd.DataFrame(index=range(len(fives)))
    for column, values in (('transcript', transcripts),
                           ('chromosome', chroms),
                           ('5p splice site', fives),
                           ('branch site', branches),
                           ('depth', depths),
                           ('distance', distances),
                           ('strand', strands)):
        branch_df[column] = values

    branch_df = branch_df[branch_df['distance'] > 0]
    five_text = branch_df['5p splice site'].apply(int).apply(str)
    branch_df['genome coord'] = branch_df['chromosome'].str.cat(five_text, sep=':')
    branch_text = branch_df['branch site'].apply(int).apply(str)
    branch_df['branch coord'] = branch_df['chromosome'].str.cat(branch_text, sep=':')

    branch_df = add_seq(branch_df, fa_dict)
    branch_df = find_3p_site(branch_df, gff3, organism=organism)
    return branch_df
Ejemplo n.º 13
0
def build_transcript_dict(
        gff3="/home/jordan/GENOMES/POMBE/schizosaccharomyces_pombe.chr.gff3",
        expand=False,
        convert_chroms=False):
    '''Return the pombe transcript dictionary, with optional adjustments.

    ``convert_chroms=True`` rewrites the chromosome field through the
    chr1/chr2/chr3/MT -> I/II/III/MT table.  ``expand=True`` widens each
    transcript by 300 on both sides (not below 0, and not beyond the listed
    chromosome length when the chromosome is known) and fills empty CDS
    start / end lists with the original transcript start / end.  Both flags
    are compared with ``is True``.

    Each value has the layout
    [start, end, strand, chromosome, CDS start, CDS end].
    '''
    transcript_dict = SP.build_transcript_dict(gff3, organism='pombe')

    lat_rom = {'chr1': 'I', 'chr2': 'II', 'chr3': 'III', 'MT': 'MT'}

    if convert_chroms is True:
        transcript_dict = dict(
            (tx, [start, end, strand, lat_rom[chrom], cds_start, cds_end])
            for tx, (start, end, strand, chrom, cds_start, cds_end)
            in transcript_dict.items())

    chrom_lengths = {
        'I': 5818680,
        'II': 4744158,
        'III': 2598968,
        'chr1': 5818680,
        'chr2': 4744158,
        'chr3': 2598968
    }

    if expand is True:
        expanded_dict = {}
        for tx, info in transcript_dict.iteritems():
            left = info[0] - 300
            if left < 0:
                left = 0
            right = info[1] + 300
            limit = chrom_lengths.get(info[3])
            if limit is not None and right > limit:
                right = limit
            # Empty CDS lists are replaced on the entry itself.
            if len(info[4]) == 0:
                info[4] = [info[0]]
            if len(info[5]) == 0:
                info[5] = [info[1]]
            expanded_dict[tx] = [left, right] + [info[2], info[3], info[4], info[5]]
        transcript_dict = expanded_dict

    return transcript_dict
Ejemplo n.º 14
0
def create_branch_df(branch_dict, gff3, fa_dict, organism=None):
    '''Build the branch dataframe from ``branch_dict``.

    Every entry of ``branch_dict[tx]`` holds a 'chrom:position' string
    (index 0), branch positions (index 1) and their depths (index 2).  A
    branch becomes a row when it is more than 5 and at most 1000 away from
    the position and its depth is at least 5.  Rows with a non-positive
    strand-aware distance are dropped, coordinate strings are added, and the
    frame is passed through ``add_seq`` and ``find_3p_site``.
    '''
    tx_dict = SP.build_transcript_dict(gff3, organism=organism)

    chroms = []
    fives = []
    transcripts = []
    branches = []
    depths = []
    strands = []
    distances = []
    for tx, five_sites in branch_dict.iteritems():
        for five_site in five_sites:
            chrom, pos_text = five_site[0].split(':')[0], five_site[0].split(':')[1]
            pos = int(pos_text)
            candidates = five_site[1]
            for n in range(len(candidates)):
                gap = abs(candidates[n] - pos)
                # Skip branches that are too close, too far or too shallow.
                if not (gap > 5 and gap <= 1000) or not (five_site[2][n] >= 5):
                    continue
                chroms.append(chrom)
                fives.append(pos)
                transcripts.append(tx)
                branches.append(candidates[n])
                depths.append(five_site[2][n])
                strands.append(tx_dict[tx][2])
                if tx_dict[tx][2] == '+':
                    distances.append(candidates[n] - pos)
                elif tx_dict[tx][2] == '-':
                    distances.append(pos - candidates[n])

    branch_df = pd.DataFrame(index=range(len(fives)))
    branch_df['transcript'] = transcripts
    branch_df['chromosome'] = chroms
    branch_df['5p splice site'] = fives
    branch_df['branch site'] = branches
    branch_df['depth'] = depths
    branch_df['distance'] = distances
    branch_df['strand'] = strands

    keep = branch_df['distance'] > 0
    branch_df = branch_df[keep]
    for source, target in (('5p splice site', 'genome coord'),
                           ('branch site', 'branch coord')):
        branch_df[target] = branch_df['chromosome'].str.cat(
            branch_df[source].apply(int).apply(str), sep=':')

    branch_df = add_seq(branch_df, fa_dict)
    branch_df = find_3p_site(branch_df, gff3, organism=organism)
    return branch_df
Ejemplo n.º 15
0
def make_transcript_df(gff3):
    '''Creates a dataframe with all annotated transcripts from the gff3 file

    Parameters
    ----------
    gff3 : str
            Your favorite annotation file; 'pombe' anywhere in the name
            (case-insensitive) selects organism='pombe'

    Returns
    ------
    df : pandas.DataFrame
            Indexed by transcript name (sorted), with columns 'start', 'end',
            'strand', 'chromosome', 'CDS start' and 'CDS end'.  'CDS start'
            is the minimum of the transcript's CDS start list and 'CDS end'
            the maximum of its CDS end list (NaN when the list is empty).
            An annotation without transcripts gives an empty dataframe with
            the same columns.'''

    if 'pombe' in gff3.lower():
        organism = 'pombe'
    else:
        organism = None

    # Get transcript dictionary
    tx_dict = SP.build_transcript_dict(gff3, organism=organism)

    # Organize by transcript
    tx_dict = OrderedDict(sorted(tx_dict.items(), key=lambda t: t[0]))

    base_columns = ['start', 'end', 'strand', 'chromosome']
    if not tx_dict:
        # Nothing to transpose; indexing the empty transposition would raise.
        return pd.DataFrame(columns=base_columns + ['CDS start', 'CDS end'])

    # Transpose the per-transcript records once: fields[n] holds field n of
    # every transcript.  list() keeps this indexable where zip is lazy.
    fields = list(zip(*tx_dict.values()))

    # Convert to dataframe
    tx_df = pd.DataFrame(index=list(tx_dict.keys()), columns=base_columns)
    for n, col in enumerate(base_columns):
        tx_df.loc[:, col] = fields[n]

    # Add CDS starts and ends
    CDS_starts = [min(l) if len(l) > 0 else np.nan for l in fields[4]]
    CDS_ends = [max(l) if len(l) > 0 else np.nan for l in fields[5]]
    tx_df.loc[:, 'CDS start'] = CDS_starts
    tx_df.loc[:, 'CDS end'] = CDS_ends

    return tx_df
Ejemplo n.º 16
0
def collect_intron_seq(gff3_file,
                       fasta_file,
                       ss_dict=None,
                       junction_bed=None,
                       gene_list=None,
                       peak_df=None,
                       organism=None):
    '''Return the sequence at the start of each intron, keyed by location.

    Intron coordinates are taken from ``ss_dict`` if given, else from
    ``junction_bed``, else from ``peak_df`` (rows whose 'type' does not
    contain 'prime'; each position is paired with position +/- 50 depending
    on strand), else from the splice sites annotated in ``gff3_file``.

    ``fasta_file`` may be a dict (used as is), a path ending in 'json'
    (loaded with json) or any other path (loaded with make_fasta_dict).

    The result maps 'transcript-chrom:position' to a 15-character slice of
    the chromosome; slices for '-' strand transcripts are
    reverse-complemented.
    '''
    transcript_dict = SP.build_transcript_dict(gff3_file, organism=organism)

    if type(fasta_file) == dict:
        fasta_dict = fasta_file
    elif fasta_file.endswith('json'):
        with open(fasta_file, 'r') as json_file:
            fasta_dict = json.load(json_file)
    else:
        fasta_dict = make_fasta_dict(fasta_file)

    if ss_dict is not None:
        pass
    elif junction_bed is not None:
        ss_dict = SP.build_junction_dict(junction_bed,
                                         gff3_file,
                                         transcript_dict,
                                         organism=organism)
    elif peak_df is not None:
        ss_dict = {}
        peak_df = peak_df[~peak_df['type'].str.contains('prime')]
        for ix, r in peak_df.iterrows():
            tx_name = r['transcript']
            if tx_name not in ss_dict:
                ss_dict[tx_name] = []
            if r['strand'] == '+':
                ss_dict[tx_name].append((r['position'], r['position'] + 50))
            elif r['strand'] == '-':
                ss_dict[tx_name].append((r['position'], r['position'] - 50))
    else:
        ss_dict, intron_flag = SP.list_splice_sites(gff3_file,
                                                    gene_list=gene_list,
                                                    organism=organism)
        ss_dict = SP.collapse_ss_dict(ss_dict)

    if organism == 'pombe':
        isoform_suffix = '.1'
    else:
        isoform_suffix = 'T0'

    seq_dict = {}
    for transcript, introns in ss_dict.iteritems():
        # Only keys that did not come from a junction bed get the suffix.
        if junction_bed is None:
            transcript = transcript + isoform_suffix
        introns = list(introns)
        strand = transcript_dict[transcript][2]
        chrom = transcript_dict[transcript][3]
        for intron in introns:
            if strand == '+':
                window = fasta_dict[chrom][intron[0] + 2:intron[0] + 17]
                key = ''.join([transcript, '-', chrom, ':', str(intron[0] + 1)])
                seq_dict[key] = window
            elif strand == '-':
                window = fasta_dict[chrom][intron[0] - 16:intron[0] - 1]
                key = ''.join([transcript, '-', chrom, ':', str(intron[0])])
                seq_dict[key] = SP.reverse_complement(window)
    return seq_dict
Ejemplo n.º 17
0
def get_junction_sequence(df, gff3_file, fasta_file):
    '''Annotate a junction dataframe with gene, flanking and intron sequences.

    Parameters
    ----------
    df : pandas.DataFrame
            Needs the columns 'chr', 'coord_1', 'coord_2', 'strand' and
            'contained in'.  The coordinate, chromosome and strand values
            are stripped, so they are expected to be strings.  Rows are
            addressed by the labels 0..len(df)-1.
    gff3_file : str
            Annotation file passed to the SP helpers
    fasta_file : str or dict
            A str is loaded with make_fasta_dict, anything else is used
            directly as chromosome -> sequence

    Returns
    ------
    df : pandas.DataFrame
            Input sorted by 'chr' with the added columns 'Gene', 'intron'
            ('First', 'Last', 'Only' or 'Middle'), 'sequence1', 'sequence2'
            and 'intron sequence'; rows with an empty 'contained in' are
            removed and the index is reset.'''
    df = df.sort_values('chr', axis=0)

    #transcript_dict[transcript] = [start, end, strand, chromosome, CDS start, CDS end]
    transcript_dict = SP.build_transcript_dict(gff3_file)

    #splice_dict[transcript] = [[5'sites][3'sites]]
    splice_dict, flag = SP.list_splice_sites(gff3_file)

    #fasta_dict[chr] = sequence
    if type(fasta_file) is str:
        fasta_dict = make_fasta_dict(fasta_file)
    else:
        fasta_dict = fasta_file

    # Group transcript names by chromosome so each junction is only compared
    # with transcripts on its own chromosome.
    transcript_by_chr = {}
    for transcript, coords in transcript_dict.iteritems():
        transcript_by_chr.setdefault(coords[3], []).append(transcript)

    df['Gene'] = "Unknown"
    df['intron'] = "Middle"
    df['sequence1'] = ''
    df['sequence2'] = ''
    df['intron sequence'] = 'No sequence here'

    for n in range(len(df)):
        coord1 = int(df['coord_1'][n].strip())
        coord2 = int(df['coord_2'][n].strip())
        chrom = df['chr'][n].strip()
        strand = df['strand'][n].strip()
        transcripts = transcript_by_chr[chrom]

        # The last transcript on the same strand that contains the junction
        # is recorded as its gene.
        for transcript in transcripts:
            tx_strand = transcript_dict[transcript][2]
            start = transcript_dict[transcript][0]
            stop = transcript_dict[transcript][1]

            if strand == tx_strand and coord1 >= start and coord2 <= stop:
                df.loc[n,'Gene'] = transcript

        if strand == '+':
            sequence1 = fasta_dict[chrom][(coord1-3):(coord1+5)]
            sequence2 = fasta_dict[chrom][(coord2-6):(coord2+2)]
            all_seq = fasta_dict[chrom][(coord1-1):coord2]
        elif strand == '-':
            sequence1 = fasta_dict[chrom][(coord2-6):(coord2+2)]
            sequence1 = SP.reverse_complement(sequence1)
            sequence2 = fasta_dict[chrom][(coord1-3):(coord1+5)]
            sequence2 = SP.reverse_complement(sequence2)
            all_seq = fasta_dict[chrom][(coord1-1):coord2]
            all_seq = SP.reverse_complement(all_seq)

        df.loc[n,'sequence1'] = sequence1
        df.loc[n,'sequence2'] = sequence2
        df.loc[n,'intron sequence'] = all_seq

    # Label intron positions for every gene that was actually assigned.
    # This used to loop over the transcript list left over from the last row
    # handled above, so only one chromosome was ever labelled and an empty
    # dataframe raised a NameError.
    for transcript in df['Gene'].unique():
        if transcript not in transcript_dict:
            # e.g. the "Unknown" placeholder
            continue

        tx_df = df[df['Gene'] == transcript]
        s = tx_df['coord_1']
        # NOTE(review): s holds the raw column values.  If those are strings
        # (they are stripped above), min/max are lexicographic and the
        # "in range(...)" tests below compare a string with ints -- confirm
        # the intended dtype of 'coord_1'.
        min_idx = s.idxmin()
        first = int(s.min())
        max_idx = s.idxmax()
        last = int(s.max())

        if first == last:
            df.loc[min_idx,'intron'] = 'Only'
        else:
            if transcript_dict[transcript][2] == '+':
                df.loc[min_idx,'intron'] = 'First'
                df.loc[max_idx,'intron'] = 'Last'
            elif transcript_dict[transcript][2] == '-':
                df.loc[min_idx,'intron'] = 'Last'
                df.loc[max_idx,'intron'] = 'First'

        # Junctions within 10 of the first / last coordinate get the same
        # label as that first / last junction.
        for index, coord_1 in s.iteritems():
            if df['intron'][index] == 'Middle':
                if coord_1 in range(first-10, first+10):
                    df_idx = s[s == coord_1].index[0]
                    if transcript_dict[transcript][2] == '+':
                        df.loc[df_idx, 'intron'] = 'First'
                    elif transcript_dict[transcript][2] == '-':
                        df.loc[df_idx, 'intron'] = 'Last'
                elif coord_1 in range(last-10, last+10):
                    df_idx = s[s == coord_1].index[0]
                    if transcript_dict[transcript][2] == '+':
                        df.loc[df_idx, 'intron'] = 'Last'
                    elif transcript_dict[transcript][2] == '-':
                        df.loc[df_idx, 'intron'] = 'First'

    df = df[df['contained in'] != '']
    df = df.reset_index()
    return df
Ejemplo n.º 18
0
def backfill_splice_sites(df, gff3, fa_dict, PSSM, organism=None):
    '''Build a per-intron table for every transcript that appears in df.

    Every annotated splice site of each transcript in ``df['transcript']`` gets
    one row.  Introns whose 5' site lies within [peak - 5, peak + 5) of a
    position already in df reuse that row's scores and sequences
    (``peak == True``); all other introns are scored from the genome sequence
    with SP.simple_score_junction (``peak == False``).

    Parameters
    ----------
    df : pandas.DataFrame
            Must provide the columns 'transcript', 'strand', 'chromosome',
            'position', '3p score', '5p score', 'alt splicing', 'type', 'seq5'
            and 'seq3', and be indexed by '<chromosome>:<position>' strings
            (rows are looked up with that key).
    gff3 : annotation passed to SP.build_transcript_dict / SP.list_splice_sites
    fa_dict : dict, chromosome name -> sequence string
    PSSM : scoring matrix handed unchanged to SP.simple_score_junction
    organism : str, default `None`
            'pombe' selects the '.1' isoform suffix, anything else 'T0'

    Returns
    -------
    new_df : pandas.DataFrame indexed by '<chromosome>:<position>', one row per
            annotated intron.  Exon and transcript sizes are in kb.
    '''
    tx_dict = SP.build_transcript_dict(gff3, organism=organism)
    ss_dict, flag = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)

    column_dict = {
        'position': [],
        'transcript': [],
        'alt splicing': [],
        'type': [],
        'strand': [],
        'introns in transcript': [],
        'intron size': [],
        'chromosome': [],
        '5p score': [],
        '3p score': [],
        'intron position': [],
        'exon size (us)': [],
        'exon size (ds)': [],
        'transcript size': [],
        'peak': [],
        'seq5': [],
        'seq3': []
    }
    new_index = []

    for tx in set(df['transcript']):
        # Filter the dataframe once per transcript instead of once per intron.
        tx_df = df[df['transcript'] == tx]
        first_row = tx_df.iloc[0]
        strand = first_row['strand']
        peaks = tx_df['position']

        if organism == 'pombe':
            iso = tx + '.1'
        else:
            iso = tx + 'T0'

        # Order introns 5' -> 3' with respect to the transcript.
        splice_sites = ss_dict[tx]
        if strand == '+':
            splice_sites = sorted(splice_sites, key=lambda x: x[0])
        elif strand == '-':
            splice_sites = sorted(splice_sites,
                                  key=lambda x: x[0],
                                  reverse=True)
        n_sites = len(splice_sites)

        df_pos = None
        for n, (five_site, three_site) in enumerate(splice_sites):
            # Check if already in dataframe (window is [peak - 5, peak + 5))
            in_df = False
            for peak in peaks:
                if int(peak) - 5 <= five_site < int(peak) + 5:
                    in_df = True
                    df_pos = peak
                    break

            tx_start = tx_dict[iso][0]
            tx_end = tx_dict[iso][1]
            chrom = first_row['chromosome']

            column_dict['transcript'].append(tx)
            column_dict['intron size'].append(abs(three_site - five_site))
            column_dict['introns in transcript'].append(n_sites)
            column_dict['strand'].append(strand)
            column_dict['chromosome'].append(chrom)
            column_dict['transcript size'].append((tx_end - tx_start) / 1000.)

            # Check if first or last intron and add exon size
            if n == 0:
                column_dict['intron position'].append('First')
                if strand == '+':
                    column_dict['exon size (us)'].append(
                        (five_site - tx_start) / 1000.)
                    if n_sites > 1:
                        ds_length = (splice_sites[n + 1][0] -
                                     three_site) / 1000.
                        try:
                            # A negative size means the next annotated intron
                            # starts before this one ends; try the one after.
                            if ds_length < 0:
                                ds_length = (splice_sites[n + 2][0] -
                                             three_site) / 1000.
                        except IndexError:
                            ds_length = np.nan
                    else:
                        ds_length = (tx_end - three_site) / 1000.

                elif strand == '-':
                    column_dict['exon size (us)'].append(
                        (tx_end - five_site) / 1000.)
                    if n_sites > 1:
                        ds_length = (three_site -
                                     splice_sites[n + 1][0]) / 1000.
                        try:
                            if ds_length < 0:
                                ds_length = (three_site -
                                             splice_sites[n + 2][0]) / 1000.
                        except IndexError:
                            ds_length = np.nan
                    else:
                        ds_length = (three_site - tx_start) / 1000.
                column_dict['exon size (ds)'].append(ds_length)

            elif n == n_sites - 1:
                column_dict['intron position'].append('Last')
                column_dict['exon size (us)'].append(
                    (abs(five_site - splice_sites[n - 1][1]) - 1) / 1000.)

                if strand == '+':
                    column_dict['exon size (ds)'].append(
                        (tx_end - three_site) / 1000.)
                elif strand == '-':
                    column_dict['exon size (ds)'].append(
                        (three_site - tx_start) / 1000.)
            else:
                column_dict['intron position'].append('Middle')
                column_dict['exon size (us)'].append(
                    (abs(five_site - splice_sites[n - 1][1]) - 1) / 1000.)
                column_dict['exon size (ds)'].append(
                    abs(three_site - splice_sites[n + 1][0]) / 1000.)

            if in_df:
                # Intron was detected as a peak: copy its values from df.
                peak_index = chrom + ':' + str(int(df_pos))
                new_index.append(peak_index)
                column_dict['position'].append(df_pos)
                column_dict['3p score'].append(df.loc[peak_index, '3p score'])
                column_dict['5p score'].append(df.loc[peak_index, '5p score'])
                column_dict['alt splicing'].append(df.loc[peak_index,
                                                          'alt splicing'])
                column_dict['type'].append(df.loc[peak_index, 'type'])
                column_dict['peak'].append(True)
                column_dict['seq5'].append(df.loc[peak_index, 'seq5'])
                column_dict['seq3'].append(df.loc[peak_index, 'seq3'])
            else:
                column_dict['alt splicing'].append(False)
                column_dict['type'].append('5prime')
                column_dict['peak'].append(False)

                # Get position, index and sequence for scoring and position code
                if strand == '+':
                    column_dict['position'].append(five_site + 1)
                    new_index.append(chrom + ':' + str(five_site + 1))
                    sequence1 = fa_dict[chrom][(five_site - 1):(five_site + 7)]
                    sequence2 = fa_dict[chrom][(three_site - 5):(three_site +
                                                                 3)]

                elif strand == '-':
                    column_dict['position'].append(five_site - 1)
                    new_index.append(chrom + ':' + str(five_site - 1))
                    sequence1 = fa_dict[chrom][(five_site - 6):(five_site + 2)]
                    sequence1 = SP.reverse_complement(sequence1)
                    sequence2 = fa_dict[chrom][(three_site - 2):(three_site +
                                                                 6)]
                    sequence2 = SP.reverse_complement(sequence2)

                column_dict['seq5'].append(sequence1)
                column_dict['seq3'].append(sequence2)

                # Score sequences
                score_5, score_3 = SP.simple_score_junction(
                    sequence1, sequence2, PSSM)
                column_dict['3p score'].append(score_3)
                column_dict['5p score'].append(score_5)

    # Create new dataframe from column dictionary
    new_df = pd.DataFrame(columns=list(column_dict.keys()), index=new_index)
    for column, data in column_dict.items():
        new_df[column] = data

    return new_df
Ejemplo n.º 19
0
def count_reads_in_transcript(bam_files, df, gff3, organism=None):
    '''Count reads over each transcript and over each intron listed in df.

    Parameters
    ----------
    bam_files : list of BAM file paths (opened with pysam, so fetch() needs
            them to be indexed)
    df : pandas.DataFrame with 'transcript', 'position' and 'intron size'
            columns, one row per intron
    gff3 : annotation passed to SP.build_transcript_dict
    organism : str, default `None`
            'pombe' selects the '.1' isoform suffix and converts chr1-3 to
            I-III before fetching; anything else uses the 'T0' suffix

    Returns
    -------
    all_reads : dict, bam path -> pandas.DataFrame indexed like df with columns
            'total'  : counted reads per kb of transcript span
            'intron' : (intron reads / intron size) / (reads / transcript span)
            Both are NaN when a division by zero occurs (e.g. no reads); the
            offending index is printed.

    Only reads on the strand opposite to the transcript are counted
    (presumably a reverse-stranded library - confirm against the protocol).
    '''
    tx_dict = SP.build_transcript_dict(gff3, organism=organism)
    tx_suffix = '.1' if organism == 'pombe' else 'T0'
    lat_rom = {'chr1': 'I', 'chr2': 'II', 'chr3': 'III'}

    all_reads = {}
    for bam_file in bam_files:
        if bam_file in all_reads:
            # Same path listed twice: the result would be identical.
            continue

        reader = pysam.Samfile(bam_file)
        try:
            counts = pd.DataFrame(index=df.index, columns=['total', 'intron'])

            for tx in set(df['transcript']):
                tx_df = df[df['transcript'] == tx]

                (start, end, strand, CDS_start, CDS_end, exons,
                 chrom) = SP.tx_info(tx + tx_suffix, tx_dict)
                if organism == 'pombe':
                    chrom = lat_rom[chrom]

                tx_iter = reader.fetch(chrom, start, end)

                # index -> [window start, window end (exclusive), read count]
                introns = {}
                for ix, r in tx_df.iterrows():
                    if strand == '+':
                        intron_start = int(r['position'])
                        intron_end = int(r['position'] + r['intron size']) + 1
                    elif strand == '-':
                        intron_start = int(r['position'] - r['intron size'])
                        intron_end = int(r['position']) + 1
                    introns[ix] = [intron_start, intron_end, 0]

                reads = 0
                for read in tx_iter:
                    if read.is_reverse and strand == '+':
                        pos = read.reference_end
                    elif not read.is_reverse and strand == '-':
                        pos = read.reference_start
                    else:
                        continue

                    reads += 1
                    if pos is None:
                        continue
                    for bounds in introns.values():
                        if bounds[0] <= pos < bounds[1]:
                            bounds[2] += 1

                tx_length = float(end - start)
                for ix in introns:
                    try:
                        counts.loc[ix, 'total'] = reads / tx_length * 1000
                        counts.loc[ix, 'intron'] = (
                            (introns[ix][2] /
                             float(tx_df.loc[ix, 'intron size'])) /
                            (reads / tx_length))
                    except ZeroDivisionError:
                        counts.loc[ix, 'total'] = np.nan
                        counts.loc[ix, 'intron'] = np.nan
                        print(ix)

            all_reads[bam_file] = counts
        finally:
            # Release the file handle even if counting fails part-way.
            reader.close()

    return all_reads
Ejemplo n.º 20
0
def igv_plots_general(bam_list,
                      gene_list,
                      organism,
                      colors=None,
                      names=None,
                      save_dir=None,
                      unstranded=False,
                      end_only=False,
                      same_yaxis=False,
                      specific_range=None,
                      transcript_direction=True,
                      log_scale=False,
                      rpm=True,
                      PE=False,
                      plot_junctions=False):
    '''Plot IGV-style read coverage tracks, one figure per gene or intron.

    Parameters
    ----------
    bam_list : list, bam files in order of plotting (top to bottom)
    gene_list : list of transcripts to plot (should be genes not transcript isoforms)
            if a dict is passed, it is used as {name: [start, end, strand, chromosome]}
            if dataframe passed instead of list, will plot introns (must have
            'strand', 'chromosome', 'position', 'intron size' and 'transcript' columns)
    organism : str, pombe or crypto
    colors : list, default `None`
            list of colors to use, same length as bam_list, check matplotlib documentation for valid color names
    names : list, default `None`
            list of sample names to use instead of bam file names. Same length as bam_files
            (accepted for compatibility; not used by the current implementation)
    save_dir : str, default `None`
            directory to save eps files. If None, does not save files
    unstranded : bool or list, default `False`
            Use True for ChIP or DNA sequencing data (or unstranded RNAseq)
    end_only : bool or list, default `False`
            Whether to plot only the ends of reads. If different for each bam, make a list of bools same length as bam_list
    same_yaxis : bool, default `False`
            Whether all samples should be plotted on the same axis after normalizing to total number of aligned reads
    specific_range : tuple, default `None`
            Options: ('end', window)
                     ('start', window)
                     ([coordinate], window)
    transcript_direction : bool, default `True`
            If True, will plot in the direction of transcription, not in the direction of the DNA
    log_scale : bool or list, default `False`
            If True, the normalized signal is log2 transformed before plotting
    rpm : bool, default `True`
            If True, divide the signal by millions of mapped reads (counted with samtools)
    PE : bool, default `False`
            If True, build the coverage with SP.generate_read_series_PE
    plot_junctions : bool, default `False`
            If True, add one extra track per bam showing the output of get_junctions
    '''

    # Get all organism information (annotation etc.)
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)
    tx_dict = SP.build_transcript_dict(gff3, organism=organism)

    # Lookup used to toggle chromosome naming style and to flip strands
    fix_info = {
        'I': 'chr1',
        'II': 'chr2',
        'III': 'chr3',
        'chr1': 'I',
        'chr2': 'II',
        'chr3': 'III',
        'chr4': 'IV',
        'chr5': 'V',
        'chr6': 'VI',
        'chr7': 'VII',
        'chr8': 'VIII',
        'chr9': 'IX',
        'chr10': 'X',
        'chr11': 'XI',
        'chr12': 'XII',
        'chr13': 'XIII',
        'chr14': 'XIV',
        'chr15': 'XV',
        'chr16': 'XVI',
        '-': '+',
        '+': '-'
    }
    if organism == 'pombe':
        tx_suffix = '.1'
    else:
        tx_suffix = 'T0'

    # Set up range parameters if specific range is indicated
    if specific_range is not None:
        window = int(specific_range[1])
        new_tx_dict = {}
        for gene in gene_list:
            info = tx_dict[gene + tx_suffix]
            if specific_range[0] == 'end':
                if info[2] == '+':
                    start = info[1] - window
                    end = info[1] + window
                else:
                    start = info[0] - window
                    end = info[0] + window
            elif specific_range[0] == 'start':
                if info[2] == '-':
                    start = info[1] - window
                    end = info[1] + window
                else:
                    start = info[0] - window
                    end = info[0] + window
            else:
                start = int(specific_range[0]) - window
                end = int(specific_range[0]) + window
            new_tx_dict[gene + tx_suffix] = [start, end, info[2], info[3]]
    else:
        new_tx_dict = tx_dict

    open_bams = {}
    try:
        # Open bam files and count reads if rpm is True
        total_list = []
        for bam in bam_list:
            open_bams[bam] = pysam.Samfile(bam)
            if rpm is True:
                total = check_output(
                    ['samtools', 'view', '-F 0x04', '-c', bam]).strip()
                total_list.append(float(total) / 1000000.)
            else:
                total_list.append(1.)

        # Expand optional arguments to lists if necessary
        colors = list_from_arg(colors, len(bam_list))
        end_only = list_from_arg(end_only, len(bam_list))
        log_scale = list_from_arg(log_scale, len(bam_list))
        unstranded = list_from_arg(unstranded, len(bam_list))

        # Get gene_list from dataframe if gene_list is not a list
        df = None
        if isinstance(gene_list, dict):
            new_tx_dict = gene_list
            gene_list = list(gene_list.keys())

        elif not isinstance(gene_list, list):
            df = gene_list
            # Keep only the 'Peaks' block of a two-level column index
            if isinstance(df.columns, pd.MultiIndex):
                peak_columns = [x for x in df.columns if x[0] == 'Peaks']
                df = df[peak_columns]
                df.columns = [x[1] for x in peak_columns]
            gene_list = df.index

        # One axis per bam, one for the gene diagram, optionally one more per
        # bam for junctions
        num_ax = len(bam_list) + 1
        if plot_junctions is True:
            num_ax += len(bam_list)

        for tx in gene_list:
            fig, ax = plt.subplots(num_ax, figsize=(10, num_ax), sharex=True)
            fig.subplots_adjust(hspace=0)

            # Get transcript info from transcript_dictionary
            if df is None:
                try:
                    info = new_tx_dict[tx + tx_suffix]
                except KeyError:
                    info = new_tx_dict[tx]
                chrom = info[3]
                start = info[0]
                end = info[1]
                strand = info[2]

            # If dataframe was passed, get plotting information from dataframe instead
            else:
                strand = df.loc[tx, 'strand']
                chrom = df.loc[tx, 'chromosome']
                if strand == '+':
                    start = df.loc[tx, 'position'] - 100
                    end = (df.loc[tx, 'position'] +
                           df.loc[tx, 'intron size'] + 100)
                elif strand == '-':
                    start = (df.loc[tx, 'position'] -
                             df.loc[tx, 'intron size'] - 100)
                    end = df.loc[tx, 'position'] + 100
                start = int(start)
                end = int(end)

                tx = df.loc[tx, 'transcript']

            # Generate read series for each transcript
            max_y = 0
            junc_ymax = 0
            for n, bam in enumerate(bam_list):
                reader = open_bams[bam]
                try:
                    bam_iter = reader.fetch(chrom, start, end)
                except ValueError:
                    # Chromosome name not in this bam: try the other naming
                    chrom = fix_info[chrom]
                    bam_iter = reader.fetch(chrom, start, end)

                if end_only[n] is not False:
                    make_series = SP.generate_read_series_A
                    linewidth = 2
                elif PE is False:
                    make_series = SP.generate_read_series_B
                    linewidth = 1
                else:
                    make_series = SP.generate_read_series_PE
                    linewidth = 1
                s = make_series(bam_iter, chrom, start, end, strand)

                # Get reads from otherstrand if the library type is unstranded
                if unstranded[n] is True:
                    bam_iter = reader.fetch(chrom, start, end)
                    s2 = make_series(bam_iter, chrom, start, end,
                                     fix_info[strand])
                    s = s.add(s2)

                # Normalize to rpm (will just divide by 1 if rpm is False)
                s = s.divide(total_list[n])
                if log_scale[n] is True:
                    s = s.apply(np.log2)

                # Plot!
                ax[n].bar(s.index,
                          s,
                          linewidth=linewidth,
                          color=colors[n],
                          edgecolor=colors[n],
                          zorder=2)
                ax[n].tick_params(axis='both', which='major', labelsize=14)

                max_y = max([max_y, max(s)])

                if plot_junctions is True:
                    m = n + len(bam_list)
                    intron_dict = get_junctions(reader, chrom, start, end,
                                                strand)
                    ax[m].plot((start, end), (0, 0), '-', c='k')
                    for coords, heights in intron_dict.items():
                        ax[m].plot(coords,
                                   heights,
                                   '-',
                                   linewidth=2,
                                   color=colors[n])
                        ax[m].fill_between(coords,
                                           0,
                                           heights,
                                           facecolor=colors[n],
                                           interpolate=True,
                                           alpha=0.5)
                    # Track the tallest junction; skip regions without any
                    if same_yaxis is True and intron_dict:
                        junc_ymax = max(
                            [junc_ymax] +
                            [h[1] for h in intron_dict.values()])

            # Add diagram of gene below traces
            if tx in tx_dict:
                strand = gene_patches(tx, tx_dict, ax[-1])
                ax[-1].set_xlim(start, end)
            else:
                try:
                    new_tx = tx.split(' ')[0]
                    if new_tx[-2] == 'T' or new_tx[-2] == '.':
                        new_tx = new_tx[:-2]
                    strand = gene_patches(new_tx, tx_dict, ax[-1])
                    ax[-1].set_xlim(start, end)
                except KeyError:
                    print("Transcript unknown")

            # Flip minus strand transcripts if indicated.  The axes share x, so
            # every inversion below must honour the flag or it has no effect.
            flip = transcript_direction is True and strand == '-'
            if flip:
                ax[-1].invert_xaxis()

            # Set x and y limits
            for n in range(len(bam_list)):
                ax[n].set_xlim(start, end)
                if same_yaxis is True:
                    ax[n].set_ylim(0, max_y + 0.1 * max_y)

                    if plot_junctions is True:
                        ax[n + len(bam_list)].set_ylim(
                            0, junc_ymax + 0.1 * junc_ymax)

                if flip:
                    ax[n].invert_xaxis()

            ax[0].set_ylabel('RPM', fontsize=16)
            ax[0].set_title(tx, fontsize=16)
            #ax[0].get_xaxis().set_ticks([])
            plt.show()

            # Save if indicated
            if save_dir is not None:
                if not os.path.exists(save_dir):
                    os.makedirs(save_dir)
                fig.savefig(os.path.join(save_dir, tx + '.eps'), format='eps')

            # Close (not just clear) so figures do not pile up across genes
            plt.close(fig)
    finally:
        for reader in open_bams.values():
            reader.close()
Ejemplo n.º 21
0
def list_branch_points(sorted_bam_file, gff3_file, fasta_dict, organism=None):
    '''Collect branch-point read ends per intron and write them to a BED file.

    Read names are parsed for a transcript (the text after the last ':' in the
    part before '-chr') and a splice site (the last '-'-separated field, or the
    one before it when the last is shorter than 3 characters).  Only reads
    whose splice site starts with 'chr' and whose alignment strand matches the
    transcript strand are kept.  Read ends within 2 nt of an existing branch
    position are merged into it.

    Parameters
    ----------
    sorted_bam_file : str, bam file of branch reads; the output is written to
            the same name with '_sorted.bam' replaced by '.bed'
    gff3_file : annotation passed to SP.build_transcript_dict
    fasta_dict : unused; kept so existing callers do not break
    organism : str, default `None`, passed to SP.build_transcript_dict

    Returns
    -------
    new_branch_dict : dict, transcript -> list of
            [splice site, branch positions, read counts per position]

    Only branches 6-2000 nt from the splice site with at least 5 reads are
    written to the BED file; the returned dict is unfiltered.
    '''
    transcript_dict = SP.build_transcript_dict(gff3_file, organism=organism)

    branch_dict = {}
    read_counter = 0
    br_counter = 0
    for a in HTSeq.BAM_Reader(sorted_bam_file):
        read_counter += 1
        name_fields = a.read.name.split('-')
        transcript = a.read.name.split('-chr')[0].split(':')[-1]
        splice_site = name_fields[-1]
        if len(splice_site) < 3:
            splice_site = name_fields[-2]
        if not splice_site.startswith('chr'):
            continue

        site_reads = branch_dict.setdefault(transcript, {}).setdefault(
            splice_site, [])
        if a.iv is None:
            continue

        strand = a.iv.strand
        read_end = a.iv.start if strand == '-' else a.iv.end
        if strand == transcript_dict[transcript][2]:
            site_reads.append(read_end)
            br_counter += 1

    print("Reads analyzed: " + str(read_counter))
    print("Reads assigned as branches: " + str(br_counter))

    new_branch_dict = {}
    for transcript, introns in branch_dict.items():
        new_branch_dict[transcript] = []
        for intron, branches in introns.items():
            new_branch_list = []
            new_branch_counts = []
            # position -> index in new_branch_list (positions are unique)
            branch_index = {}
            for branch in branches:
                matched = False
                # NOTE(review): a read within 2 nt of two existing positions
                # increments both counts - confirm this is intended.
                for pos in range(branch - 2, branch + 3):
                    br_id = branch_index.get(pos)
                    if br_id is not None:
                        matched = True
                        new_branch_counts[br_id] += 1
                if not matched:
                    branch_index[branch] = len(new_branch_list)
                    new_branch_list.append(branch)
                    new_branch_counts.append(1)
            if len(new_branch_list) > 0:
                new_branch_dict[transcript].append(
                    [intron, new_branch_list, new_branch_counts])

    bed_path = '{0}.bed'.format(sorted_bam_file.split('_sorted.bam')[0])
    with open(bed_path, 'w') as fout:
        fout.write('track name=junctions description="TopHat junctions"\n')
        for transcript, introns in new_branch_dict.items():
            strand = transcript_dict[transcript][2]
            for site, positions, counts in introns:
                chrom = site.split(':')[0]
                start = int(site.split(':')[1])
                for n, (end, value) in enumerate(zip(positions, counts)):
                    distance = abs(end - start)
                    if distance > 2000 or distance <= 5 or value < 5:
                        continue
                    #[seqname] [start] [end] [id] [score] [strand] [thickStart] [thickEnd] [r,g,b][block_count] [block_sizes] [block_locations]
                    read_id = site + '-' + str(n)
                    block_size = '0,' + str(distance + 30)
                    line_list = [
                        chrom,
                        str(start - 1),
                        str(end + 1), read_id,
                        str(value), strand,
                        str(start - 1),
                        str(end + 1), '75,196,213', '2', '1,1', block_size,
                        '\n'
                    ]
                    fout.write('\t'.join(line_list))

    return new_branch_dict
Ejemplo n.º 22
0
def igv_plots_general(bam_list, gene_list, organism, colors=None, names=None, save_dir=None,
                      unstranded=False, end_only=False, same_yaxis=False, specific_range=None,
                      transcript_direction=True, log_scale=False, rpm=True, PE=False,
                      plot_junctions=False):
    '''Plot IGV-style read coverage tracks, one figure per gene or intron.

    Parameters
    ----------
    bam_list : list, bam files in order of plotting (top to bottom)
    gene_list : list of transcripts to plot (should be genes not transcript isoforms)
            if a dict is passed, it is used as {name: [start, end, strand, chromosome]}
            if dataframe passed instead of list, will plot introns (must have
            'strand', 'chromosome', 'position', 'intron size' and 'transcript' columns)
    organism : str, pombe or crypto
    colors : list, default `None`
            list of colors to use, same length as bam_list, check matplotlib documentation for valid color names
    names : list, default `None`
            list of sample names to use instead of bam file names. Same length as bam_files
            (accepted for compatibility; not used by the current implementation)
    save_dir : str, default `None`
            directory to save eps files. If None, does not save files
    unstranded : bool or list, default `False`
            Use True for ChIP or DNA sequencing data (or unstranded RNAseq)
    end_only : bool or list, default `False`
            Whether to plot only the ends of reads. If different for each bam, make a list of bools same length as bam_list
    same_yaxis : bool, default `False`
            Whether all samples should be plotted on the same axis after normalizing to total number of aligned reads
    specific_range : tuple, default `None`
            Options: ('end', window)
                     ('start', window)
                     ([coordinate], window)
    transcript_direction : bool, default `True`
            If True, will plot in the direction of transcription, not in the direction of the DNA
    log_scale : bool or list, default `False`
            If True, the normalized signal is log2 transformed before plotting
    rpm : bool, default `True`
            If True, divide the signal by millions of mapped reads (counted with samtools)
    PE : bool, default `False`
            If True, build the coverage with SP.generate_read_series_PE
    plot_junctions : bool, default `False`
            If True, add one extra track per bam showing the output of get_junctions
    '''

    # Get all organism information (annotation etc.)
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)
    tx_dict = SP.build_transcript_dict(gff3, organism=organism)

    # Lookup used to toggle chromosome naming style and to flip strands
    fix_info = {'I': 'chr1', 'II': 'chr2', 'III': 'chr3',
                'chr1': 'I', 'chr2': 'II', 'chr3': 'III', 'chr4': 'IV', 'chr5': 'V',
                'chr6': 'VI', 'chr7': 'VII', 'chr8': 'VIII', 'chr9': 'IX', 'chr10': 'X',
                'chr11': 'XI', 'chr12': 'XII', 'chr13': 'XIII', 'chr14': 'XIV',
                'chr15': 'XV', 'chr16': 'XVI',
                '-': '+', '+': '-'}
    if organism == 'pombe':
        tx_suffix = '.1'
    else:
        tx_suffix = 'T0'

    # Set up range parameters if specific range is indicated
    if specific_range is not None:
        window = int(specific_range[1])
        new_tx_dict = {}
        for gene in gene_list:
            info = tx_dict[gene+tx_suffix]
            if specific_range[0] == 'end':
                if info[2] == '+':
                    start = info[1]-window
                    end = info[1]+window
                else:
                    start = info[0]-window
                    end = info[0]+window
            elif specific_range[0] == 'start':
                if info[2] == '-':
                    start = info[1]-window
                    end = info[1]+window
                else:
                    start = info[0]-window
                    end = info[0]+window
            else:
                start = int(specific_range[0])-window
                end = int(specific_range[0])+window
            new_tx_dict[gene+tx_suffix] = [start, end, info[2], info[3]]
    else:
        new_tx_dict = tx_dict

    open_bams = {}
    try:
        # Open bam files and count reads if rpm is True
        total_list = []
        for bam in bam_list:
            open_bams[bam] = pysam.Samfile(bam)
            if rpm is True:
                total = check_output(['samtools','view','-F 0x04','-c',bam]).strip()
                total_list.append(float(total)/1000000.)
            else:
                total_list.append(1.)

        # Expand optional arguments to lists if necessary
        colors = list_from_arg(colors, len(bam_list))
        end_only = list_from_arg(end_only, len(bam_list))
        log_scale = list_from_arg(log_scale, len(bam_list))
        unstranded = list_from_arg(unstranded, len(bam_list))

        # Get gene_list from dataframe if gene_list is not a list
        df = None
        if isinstance(gene_list, dict):
            new_tx_dict = gene_list
            gene_list = list(gene_list.keys())

        elif not isinstance(gene_list, list):
            df = gene_list
            # Keep only the 'Peaks' block of a two-level column index
            if isinstance(df.columns, pd.MultiIndex):
                peak_columns = [x for x in df.columns if x[0] == 'Peaks']
                df = df[peak_columns]
                df.columns = [x[1] for x in peak_columns]
            gene_list = df.index

        # One axis per bam, one for the gene diagram, optionally one more per
        # bam for junctions
        num_ax = len(bam_list)+1
        if plot_junctions is True:
            num_ax += len(bam_list)

        for tx in gene_list:
            fig, ax = plt.subplots(num_ax, figsize=(10,num_ax), sharex=True)
            fig.subplots_adjust(hspace=0)

            # Get transcript info from transcript_dictionary
            if df is None:
                try:
                    info = new_tx_dict[tx+tx_suffix]
                except KeyError:
                    info = new_tx_dict[tx]
                chrom = info[3]
                start = info[0]
                end = info[1]
                strand = info[2]

            # If dataframe was passed, get plotting information from dataframe instead
            else:
                strand = df.loc[tx,'strand']
                chrom = df.loc[tx,'chromosome']
                if strand == '+':
                    start = df.loc[tx,'position']-100
                    end = df.loc[tx,'position'] + df.loc[tx,'intron size']+100
                elif strand == '-':
                    start = df.loc[tx,'position']-df.loc[tx,'intron size']-100
                    end = df.loc[tx,'position']+100
                start = int(start)
                end = int(end)

                tx = df.loc[tx,'transcript']

            # Generate read series for each transcript
            max_y = 0
            junc_ymax = 0
            for n, bam in enumerate(bam_list):
                reader = open_bams[bam]
                try:
                    bam_iter = reader.fetch(chrom, start, end)
                except ValueError:
                    # Chromosome name not in this bam: try the other naming
                    chrom = fix_info[chrom]
                    bam_iter = reader.fetch(chrom, start, end)

                if end_only[n] is not False:
                    make_series = SP.generate_read_series_A
                    linewidth = 2
                elif PE is False:
                    make_series = SP.generate_read_series_B
                    linewidth = 1
                else:
                    make_series = SP.generate_read_series_PE
                    linewidth = 1
                s = make_series(bam_iter, chrom, start, end, strand)

                # Get reads from otherstrand if the library type is unstranded
                if unstranded[n] is True:
                    bam_iter = reader.fetch(chrom, start, end)
                    s2 = make_series(bam_iter, chrom, start, end, fix_info[strand])
                    s = s.add(s2)

                # Normalize to rpm (will just divide by 1 if rpm is False)
                s = s.divide(total_list[n])
                if log_scale[n] is True:
                    s = s.apply(np.log2)

                # Plot!
                ax[n].bar(s.index, s, linewidth=linewidth, color=colors[n],
                          edgecolor=colors[n], zorder=2)
                ax[n].tick_params(axis='both', which='major', labelsize=14)

                max_y = max([max_y,max(s)])

                if plot_junctions is True:
                    m = n+len(bam_list)
                    intron_dict = get_junctions(reader, chrom, start, end, strand)
                    ax[m].plot((start, end),(0,0),'-',c='k')
                    for coords, heights in intron_dict.items():
                        ax[m].plot(coords, heights, '-', linewidth=2, color=colors[n])
                        ax[m].fill_between(coords, 0, heights, facecolor=colors[n],
                                           interpolate=True, alpha=0.5)
                    # Track the tallest junction; skip regions without any
                    if same_yaxis is True and intron_dict:
                        junc_ymax = max([junc_ymax] +
                                        [h[1] for h in intron_dict.values()])

            # Add diagram of gene below traces
            if tx in tx_dict:
                strand = gene_patches(tx, tx_dict, ax[-1])
                ax[-1].set_xlim(start, end)
            else:
                try:
                    new_tx = tx.split(' ')[0]
                    if new_tx[-2] == 'T' or new_tx[-2] == '.':
                        new_tx = new_tx[:-2]
                    strand = gene_patches(new_tx, tx_dict, ax[-1])
                    ax[-1].set_xlim(start, end)
                except KeyError:
                    print("Transcript unknown")

            # Flip minus strand transcripts if indicated.  The axes share x, so
            # every inversion below must honour the flag or it has no effect.
            flip = transcript_direction is True and strand == '-'
            if flip:
                ax[-1].invert_xaxis()

            # Set x and y limits
            for n in range(len(bam_list)):
                ax[n].set_xlim(start, end)
                if same_yaxis is True:
                    ax[n].set_ylim(0,max_y+0.1*max_y)

                    if plot_junctions is True:
                        ax[n+len(bam_list)].set_ylim(0,junc_ymax+0.1*junc_ymax)

                if flip:
                    ax[n].invert_xaxis()

            ax[0].set_ylabel('RPM', fontsize=16)
            ax[0].set_title(tx, fontsize=16)
            #ax[0].get_xaxis().set_ticks([])
            plt.show()

            # Save if indicated
            if save_dir is not None:
                if not os.path.exists(save_dir):
                    os.makedirs(save_dir)
                fig.savefig(os.path.join(save_dir, tx+'.eps'), format='eps')

            # Close (not just clear) so figures do not pile up across genes
            plt.close(fig)
    finally:
        for reader in open_bams.values():
            reader.close()
Ejemplo n.º 23
0
def backfill_splice_sites(df, gff3, fa_dict, PSSM, organism=None):
    '''Build a table with one row per annotated intron for every transcript in df.

    Introns whose 5' splice site lies close to a peak already present in df
    copy that peak's values; all other introns are "backfilled" with sequences
    taken from fa_dict and scored with SP.simple_score_junction.

    Parameters
    ----------
    df : pandas.DataFrame
            Peak table. The code reads the columns 'transcript', 'strand',
            'position', 'chromosome', '3p score', '5p score', 'alt splicing',
            'type', 'seq5' and 'seq3', and looks rows up with
            df.loc['<chromosome>:<position>', ...].
    gff3 : str
            Annotation passed to SP.build_transcript_dict and SP.list_splice_sites
    fa_dict : dict
            Sequences, sliced as fa_dict[chrom][start:end]
    PSSM : object
            Passed unchanged to SP.simple_score_junction
    organism : str, default None
            'pombe' selects the '.1' isoform suffix, anything else selects 'T0'

    Returns
    -------
    new_df : pandas.DataFrame indexed by '<chromosome>:<position>' with the
            columns listed in column_dict below
    '''
    tx_dict = SP.build_transcript_dict(gff3, organism=organism)
    ss_dict, flag = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)

    # One list per output column; every intron appends exactly one value to each list
    column_dict = {'position':[],'transcript':[],'alt splicing':[],'type':[],'strand':[], 'introns in transcript':[],
                   'intron size':[],'chromosome':[], '5p score':[], '3p score':[], 'intron position':[], 'exon size (us)':[],
                   'exon size (ds)':[],'transcript size':[], 'peak':[], 'seq5':[],'seq3':[]} 
    new_index = []
    
    for tx in set(df['transcript']):
        strand = df[df['transcript'] == tx].iloc[0]['strand']
        splice_sites = ss_dict[tx]
        # Order introns by their first coordinate in the direction of transcription
        if strand == '+':
            splice_sites = sorted(list(splice_sites), key=lambda x:x[0])
        elif strand == '-':
            splice_sites = sorted(list(splice_sites), key=lambda x:x[0], reverse=True)
        
        df_pos = None
        for n, (five_site, three_site) in enumerate(splice_sites):
            # Check if already in dataframe
            # (a peak counts as a match when five_site is in [peak-5, peak+5))
            in_df = False
            for peak in df[df['transcript'] == tx]['position']:
                if five_site in range(int(peak)-5,int(peak)+5):
                    in_df = True
                    df_pos = peak
                    break
            
            column_dict['transcript'].append(tx)
            # Key used in tx_dict: transcript name plus isoform suffix
            if organism == 'pombe':
                iso = tx+'.1'
            else: iso = tx+'T0'
            
            column_dict['intron size'].append(abs(three_site-five_site))
            column_dict['introns in transcript'].append(len(splice_sites))
            column_dict['strand'].append(strand)   
            chrom = df[df['transcript'] == tx].iloc[0]['chromosome']
            column_dict['chromosome'].append(chrom)
            # tx_dict[iso][0] and [1] are used as transcript start and end;
            # sizes are divided by 1000 (presumably bp -> kb)
            column_dict['transcript size'].append((tx_dict[iso][1]-tx_dict[iso][0])/1000.)

            # Check if first or last intron and add exon size
            if n == 0:
                column_dict['intron position'].append('First')
                if strand == '+':
                    column_dict['exon size (us)'].append((five_site-tx_dict[iso][0])/1000.)
                    if len(splice_sites) > 1:
                        ds_length = (splice_sites[n+1][0] - three_site)/1000.
                        # A negative length means the next 5' site lies before this 3' site;
                        # fall back to the site after it, or NaN if there is none
                        try:
                            if ds_length < 0:
                                ds_length = (splice_sites[n+2][0] - three_site)/1000.
                        except IndexError:
                            ds_length = np.NaN
                    else:
                        # Single intron: downstream exon runs to the transcript end
                        ds_length = (tx_dict[iso][1] - three_site)/1000.
                    
                elif strand == '-':
                    column_dict['exon size (us)'].append((tx_dict[iso][1]-five_site)/1000.)
                    if len(splice_sites) > 1:
                        ds_length = (three_site - splice_sites[n+1][0])/1000.
                        # Same fallback as for the plus strand, with coordinates mirrored
                        try:
                            if ds_length < 0:
                                ds_length = (three_site - splice_sites[n+2][0])/1000.
                        except IndexError:
                            ds_length = np.NaN
                    else:
                        ds_length = (three_site - tx_dict[iso][0])/1000.
                column_dict['exon size (ds)'].append(ds_length)
            
            elif n == len(splice_sites)-1:
                column_dict['intron position'].append('Last')
                # Upstream exon: distance from the previous intron's second coordinate
                column_dict['exon size (us)'].append((abs(five_site-splice_sites[n-1][1])-1)/1000.)
                
                if strand == '+':
                    column_dict['exon size (ds)'].append((tx_dict[iso][1]-three_site)/1000.)
                elif strand == '-':
                    column_dict['exon size (ds)'].append((three_site - tx_dict[iso][0])/1000.)
            else:
                column_dict['intron position'].append('Middle')
                column_dict['exon size (us)'].append((abs(five_site-splice_sites[n-1][1])-1)/1000.)
                column_dict['exon size (ds)'].append(abs(three_site - splice_sites[n+1][0])/1000.)

            # Intron matched an existing peak: copy that peak's values from df
            if in_df is True:
                peak_index = chrom+':'+str(int(df_pos))
                new_index.append(peak_index)
                column_dict['position'].append(df_pos)
                column_dict['3p score'].append(df.loc[peak_index,'3p score'])
                column_dict['5p score'].append(df.loc[peak_index,'5p score'])
                column_dict['alt splicing'].append(df.loc[peak_index,'alt splicing'])
                column_dict['type'].append(df.loc[peak_index,'type'])
                column_dict['peak'].append(True)
                column_dict['seq5'].append(df.loc[peak_index,'seq5'])
                column_dict['seq3'].append(df.loc[peak_index,'seq3'])

            # No matching peak: fill the row in from the annotation and the genome sequence
            if in_df is False:
                column_dict['alt splicing'].append(False)
                column_dict['type'].append('5prime')
                column_dict['peak'].append(False)
                
                # Get position, index and sequence for scoring and position code
                # (8-nt windows around each site; minus-strand windows are reverse complemented)
                if strand == '+':
                    column_dict['position'].append(five_site+1)
                    new_index.append(chrom+':'+str(five_site+1))
                    sequence1 = fa_dict[chrom][(five_site-1):(five_site+7)]
                    sequence2 = fa_dict[chrom][(three_site-5):(three_site+3)]
                
                elif strand == '-':
                    column_dict['position'].append(five_site-1)
                    new_index.append(chrom+':'+str(five_site-1))
                    sequence1 = fa_dict[chrom][(five_site-6):(five_site+2)]
                    sequence1 = SP.reverse_complement(sequence1)
                    sequence2 = fa_dict[chrom][(three_site-2):(three_site+6)]
                    sequence2 = SP.reverse_complement(sequence2)

                column_dict['seq5'].append(sequence1)
                column_dict['seq3'].append(sequence2)
                
                # Score sequences
                score_5, score_3 = SP.simple_score_junction(sequence1, sequence2, PSSM)
                column_dict['3p score'].append(score_3)
                column_dict['5p score'].append(score_5)
    
    # Create new dataframe from column dictionary
    new_df = pd.DataFrame(columns=column_dict.keys(), index=new_index)
    for column, data in column_dict.iteritems():
        new_df[column] = data
    
    return new_df
Ejemplo n.º 24
0
def list_branch_points(sorted_bam_file, gff3_file, fasta_dict, organism=None):
    '''Collect branch positions per intron from a BAM file and write them as a BED track.

    Each read name is parsed for a transcript and a splice-site label. Reads
    whose label starts with 'chr' and whose strand matches the transcript's
    strand contribute their end coordinate (start coordinate on the minus
    strand) as a branch position. Positions within 2 nt of an already recorded
    position are counted towards that position instead of being added.

    Parameters
    ----------
    sorted_bam_file : str
            BAM file read with HTSeq.BAM_Reader; also used to name the output
            ('<name before "_sorted.bam">.bed')
    gff3_file : str
            Annotation passed to SP.build_transcript_dict
    fasta_dict : dict or str
            A str is loaded as a JSON file; the result is not used further here
    organism : str, default None
            Passed to SP.build_transcript_dict

    Returns
    -------
    new_branch_dict : dict
            transcript -> list of [splice site label, branch positions, read counts]
    '''
    transcript_dict = SP.build_transcript_dict(gff3_file, organism=organism)

    if type(fasta_dict) == str:
        with open(fasta_dict, 'r') as handle:
            fasta_dict = json.load(handle)

    # Pass 1: gather raw branch positions, keyed by transcript then splice site
    branch_dict = {}
    read_counter = 0
    br_counter = 0
    for aln in HTSeq.BAM_Reader(sorted_bam_file):
        read_counter += 1
        read_name = aln.read.name
        transcript = read_name.split('-chr')[0].split(':')[-1]
        name_fields = read_name.split('-')
        splice_site = name_fields[-1]
        if len(splice_site) < 3:
            # Last field is too short to be a site label; use the one before it
            splice_site = name_fields[-2]
        if not splice_site.startswith('chr'):
            continue

        site_branches = branch_dict.setdefault(transcript, {}).setdefault(splice_site, [])
        if aln.iv is None:
            continue

        strand = aln.iv.strand
        read_end = aln.iv.start if strand == '-' else aln.iv.end
        if strand == transcript_dict[transcript][2]:
            site_branches.append(read_end)
            br_counter += 1

    print("Reads analyzed: "+str(read_counter))
    print("Reads assigned as branches: "+str(br_counter))

    # Pass 2: merge positions that fall within +/- 2 nt of a recorded position
    new_branch_dict = {}
    for transcript, introns in branch_dict.items():
        clustered = new_branch_dict[transcript] = []
        for intron, branches in introns.items():
            positions = []
            counts = []
            for branch in branches:
                matched = False
                for nearby in range(branch-2, branch+3):
                    if nearby in positions:
                        matched = True
                        counts[positions.index(nearby)] += 1
                if not matched:
                    positions.append(branch)
                    counts.append(1)
            if positions:
                clustered.append([intron, positions, counts])

    # Pass 3: write well-supported branches as a BED track
    bed_name = '{0}.bed'.format(sorted_bam_file.split('_sorted.bam')[0])
    with open(bed_name, 'w') as fout:
        fout.write('track name=junctions description="TopHat junctions"\n')
        for transcript, introns in new_branch_dict.items():
            strand = transcript_dict[transcript][2]
            for intron in introns:
                label_fields = intron[0].split(':')
                chrom = label_fields[0]
                start = int(label_fields[1])
                for n, (end, value) in enumerate(zip(intron[1], intron[2])):
                    span = abs(end-start)
                    # Keep spans of 6..2000 nt supported by at least 5 reads
                    if span > 2000 or span <= 5 or value < 5:
                        continue
                    #[seqname] [start] [end] [id] [score] [strand] [thickStart] [thickEnd] [r,g,b][block_count] [block_sizes] [block_locations]
                    fields = [chrom, str(start-1), str(end+1), intron[0]+'-'+str(n),
                              str(value), strand, str(start-1), str(end+1),
                              '75,196,213', '2', '1,1', '0,'+str(span+30), '\n']
                    fout.write('\t'.join(fields))

    return new_branch_dict
Ejemplo n.º 25
0
def _junction_window(chrom_seq, position, strand, five_prime):
    '''Return the 8-nt window around one junction end, in transcript orientation.

    Plus strand: [position-1, position+7) for the 5' end, [position-5, position+3)
    for the 3' end. Minus strand: the reverse complement of [position-6, position+2)
    for the 5' end and of [position-2, position+6) for the 3' end.
    '''
    if strand == '+':
        offset = -1 if five_prime else -5
        return chrom_seq[(position + offset):(position + offset + 8)]
    offset = -6 if five_prime else -2
    return SP.reverse_complement(chrom_seq[(position + offset):(position + offset + 8)])


def build_junction_df(junction_bed, gff3_file, fasta, organism=None):
    '''Build a DataFrame with one row per junction, including splice-site sequences.

    Parameters
    ----------
    junction_bed : str
            Junction file passed to build_junction_dict
    gff3_file : str
            Annotation passed to SP.build_transcript_dict and build_junction_dict
    fasta : dict or str
            Sequences indexed as fasta[chrom][start:end]; a str is converted
            with SP.make_fasta_dict
    organism : str, default None
            Passed to SP.build_transcript_dict and build_junction_dict

    Returns
    -------
    junction_df : pandas.DataFrame
            The junction columns from build_junction_dict plus 'sequence1',
            'sequence2', 'seq type1', 'seq type2', 'annotated sequence1',
            'annotated sequence2' and 'transcript'
    '''
    transcript_dict = SP.build_transcript_dict(gff3_file, organism=organism)
    if isinstance(fasta, str):
        fasta = SP.make_fasta_dict(fasta)
    junction_dict = build_junction_dict(junction_bed,
                                        gff3_file,
                                        transcript_dict,
                                        organism=organism)
    junction_count = sum(len(junctions) for junctions in junction_dict.values())

    junction_df = pd.DataFrame(index=range(junction_count),
                               columns=[
                                   'intron tuple', 'chromosome', 'start',
                                   'end', 'strand', 'depth', 'type', 'size',
                                   'annotated intron size',
                                   'annotated intron start',
                                   'annotated intron end'
                               ])
    n = 0
    for tx, junctions in junction_dict.items():
        for junction in junctions:
            # The index holds integer labels 0..junction_count-1, so label-based
            # .loc addresses the same row the deprecated .ix did
            junction_df.loc[n] = [tx] + junction
            n += 1

    sequence1 = []
    sequence2 = []
    ann_seq1 = []
    ann_seq2 = []
    seq_type1 = []
    seq_type2 = []
    df_tx = []
    for _, row in junction_df.iterrows():
        df_tx.append(row['intron tuple'][0])
        chrom = convert_chrom(row['chromosome'])
        strand = row['strand']

        if strand in ('+', '-'):
            chrom_seq = fasta[chrom]
            curr1 = _junction_window(chrom_seq, row['start'], strand, True)
            curr2 = _junction_window(chrom_seq, row['end'], strand, False)
            ann_start = row['annotated intron start']
            # pd.isnull catches both None and NaN placeholders
            if pd.isnull(ann_start):
                ann1 = ann2 = None
            else:
                ann1 = _junction_window(chrom_seq, ann_start, strand, True)
                ann2 = _junction_window(chrom_seq,
                                        row['annotated intron end'], strand,
                                        False)
        else:
            # Unknown strand: use placeholders for every sequence, including the
            # current ones, so the sequence type below is never derived from
            # the previous row's sequences
            curr1 = curr2 = ann1 = ann2 = 'NNNNNNNN'

        sequence1.append(curr1)
        sequence2.append(curr2)
        ann_seq1.append(ann1)
        ann_seq2.append(ann2)

        # Annotated ends get a fixed label; other ends are labelled with the
        # dinucleotide at window positions 2:4 (5' end) or 4:6 (3' end)
        if row['type'] == 'Annotated':
            seq_type1.append('5p annotated')
            seq_type2.append('3p annotated')
        elif row['type'] == '5p tethered':
            seq_type1.append('5p annotated')
            seq_type2.append(curr2[4:6])
        else:
            seq_type1.append(curr1[2:4])
            seq_type2.append(curr2[4:6])

    junction_df['sequence1'] = sequence1
    junction_df['sequence2'] = sequence2
    junction_df['seq type1'] = seq_type1
    junction_df['seq type2'] = seq_type2
    junction_df['annotated sequence1'] = ann_seq1
    junction_df['annotated sequence2'] = ann_seq2
    junction_df['transcript'] = df_tx

    return junction_df
Ejemplo n.º 26
0
def generate_consensus_matrix(gff3, fasta_dict, PSSM=False):
    '''Tabulate base usage in 8-nt windows around all annotated splice sites.

    Parameters
    ----------
    gff3 : str
            Annotation file; a name containing 'pombe' (case-insensitive)
            selects the pombe settings
    fasta_dict : dict
            Sequences indexed as fasta_dict[chrom][start:end]
    PSSM : bool, default False
            False - return base frequencies (counts / number of introns)
            True - return log2 of frequency over nuc_prob from gc_content,
            with zero counts replaced by 1

    Returns
    -------
    (5' matrix, 3' matrix) : two 4x8 numpy arrays with rows A, C, T, G
    '''
    # Load the annotation with organism-specific settings
    if 'pombe' in gff3.lower():
        transcript_dict = SP.build_transcript_dict(gff3, organism='pombe')
        ss, flag = SP.list_splice_sites(gff3, organism='pombe')
        isoform_suffix = '.1'
    else:
        transcript_dict = SP.build_transcript_dict(gff3)
        ss, flag = SP.list_splice_sites(gff3)
        isoform_suffix = 'T0'
    ss_dict = SP.collapse_ss_dict(ss)
    nuc_prob = gc_content(fasta_dict)

    row_of_base = {"A": 0, "C": 1, "T": 2, "G": 3}

    # Count matrices: one row per base (A, C, T, G), one column per window position
    pos_matrix_5prime = np.zeros([4, 8])
    pos_matrix_3prime = np.zeros([4, 8])

    intron_total = 0
    for transcript, introns in ss_dict.items():
        tx_info = transcript_dict[transcript + isoform_suffix]
        strand = tx_info[2]
        chrom = tx_info[3]

        for intron in introns:
            intron_total += 1

            # 5' splice site window
            if strand == '+':
                seq = fasta_dict[chrom][(intron[0] - 1):(intron[0] + 7)]
            elif strand == '-':
                seq = SP.reverse_complement(
                    fasta_dict[chrom][(intron[0] - 6):(intron[0] + 2)])
            for col, base in enumerate(seq):
                pos_matrix_5prime[row_of_base[base], col] += 1

            # 3' splice site window
            if strand == '+':
                seq = fasta_dict[chrom][(intron[1] - 5):(intron[1] + 3)]
            elif strand == '-':
                seq = SP.reverse_complement(
                    fasta_dict[chrom][(intron[1] - 2):(intron[1] + 6)])
            for col, base in enumerate(seq):
                pos_matrix_3prime[row_of_base[base], col] += 1

    np.set_printoptions(formatter={'float_kind': lambda x: "%.1f" % x})

    # Convert the counts cell by cell
    denominator = float(intron_total)
    for a, b in np.ndindex(4, 8):
        if PSSM is False:
            pos_matrix_5prime[a, b] = pos_matrix_5prime[a, b] / denominator
            pos_matrix_3prime[a, b] = pos_matrix_3prime[a, b] / denominator
        if PSSM is True:
            # Replace a zero count with 1 so the logarithm is defined
            if pos_matrix_5prime[a, b] == 0:
                pos_matrix_5prime[a, b] += 1
            if pos_matrix_3prime[a, b] == 0:
                pos_matrix_3prime[a, b] += 1
            pos_matrix_5prime[a, b] = np.log2(
                (pos_matrix_5prime[a, b] / denominator) / nuc_prob[a])
            pos_matrix_3prime[a, b] = np.log2(
                (pos_matrix_3prime[a, b] / denominator) / nuc_prob[a])

    return (pos_matrix_5prime, pos_matrix_3prime)
Ejemplo n.º 27
0
def get_junction_sequence(df, gff3_file, fasta_file):
    '''Annotate junctions with their gene, intron position and sequences.

    Parameters
    ----------
    df : pandas.DataFrame
            Junction table. The code reads the columns 'chr', 'coord_1',
            'coord_2' and 'strand' (as strings, they are stripped) and
            'contained in', and addresses rows by the labels 0..len(df)-1.
    gff3_file : str
            Annotation passed to SP.build_transcript_dict
    fasta_file : dict or str
            Sequences indexed as fasta[chrom][start:end]; a str is converted
            with make_fasta_dict

    Returns
    -------
    df : pandas.DataFrame
            Sorted by 'chr', with the added columns 'Gene', 'intron'
            ('First', 'Middle', 'Last' or 'Only'), 'sequence1', 'sequence2'
            and 'intron sequence'; rows with an empty 'contained in' value
            are dropped and the index is reset
    '''
    df = df.sort_values('chr', axis=0)

    #transcript_dict[transcript] = [start, end, strand, chromosome, CDS start, CDS end]
    transcript_dict = SP.build_transcript_dict(gff3_file)

    #fasta_dict[chr] = sequence
    if isinstance(fasta_file, str):
        fasta_dict = make_fasta_dict(fasta_file)
    else:
        fasta_dict = fasta_file

    # Group transcript names by chromosome so each junction only scans its own chromosome
    transcript_by_chr = {}
    for transcript, coords in transcript_dict.items():
        transcript_by_chr.setdefault(coords[3], []).append(transcript)

    df['Gene'] = "Unknown"
    df['intron'] = "Middle"
    df['sequence1'] = ''
    df['sequence2'] = ''
    df['intron sequence'] = 'No sequence here'

    for n in range(len(df)):
        coord1 = int(df['coord_1'][n].strip())
        coord2 = int(df['coord_2'][n].strip())
        chrom = df['chr'][n].strip()
        strand = df['strand'][n].strip()

        # Assign the junction to a same-strand transcript that fully contains it
        # (if several qualify, the last one scanned wins)
        for transcript in transcript_by_chr[chrom]:
            tx_start, tx_stop, tx_strand = transcript_dict[transcript][:3]
            if strand == tx_strand and coord1 >= tx_start and coord2 <= tx_stop:
                df.loc[n, 'Gene'] = transcript

        if strand == '+':
            sequence1 = fasta_dict[chrom][(coord1 - 3):(coord1 + 5)]
            sequence2 = fasta_dict[chrom][(coord2 - 6):(coord2 + 2)]
            all_seq = fasta_dict[chrom][(coord1 - 1):coord2]
        elif strand == '-':
            # Minus strand: the two ends swap roles and everything is reverse complemented
            sequence1 = SP.reverse_complement(
                fasta_dict[chrom][(coord2 - 6):(coord2 + 2)])
            sequence2 = SP.reverse_complement(
                fasta_dict[chrom][(coord1 - 3):(coord1 + 5)])
            all_seq = SP.reverse_complement(
                fasta_dict[chrom][(coord1 - 1):coord2])

        df.loc[n, 'sequence1'] = sequence1
        df.loc[n, 'sequence2'] = sequence2
        df.loc[n, 'intron sequence'] = all_seq

    # Label intron positions for EVERY gene that received junctions. Iterating
    # the genes found in df (rather than a variable left over from the loop
    # above) covers all chromosomes and is safe when df is empty.
    for transcript in df['Gene'].unique():
        if transcript not in transcript_dict:
            # "Unknown" placeholder: junction was not assigned to a transcript
            continue
        tx_strand = transcript_dict[transcript][2]
        tx_df = df[df['Gene'] == transcript]
        s = tx_df['coord_1']
        min_idx = s.idxmin()
        first = int(s.min())
        max_idx = s.idxmax()
        last = int(s.max())

        if first == last:
            df.loc[min_idx, 'intron'] = 'Only'
        elif tx_strand == '+':
            df.loc[min_idx, 'intron'] = 'First'
            df.loc[max_idx, 'intron'] = 'Last'
        elif tx_strand == '-':
            df.loc[min_idx, 'intron'] = 'Last'
            df.loc[max_idx, 'intron'] = 'First'

        # Junctions starting within 10 nt of the first/last start share its label
        for index, coord_1 in zip(s.index, s):
            if df['intron'][index] != 'Middle':
                continue
            if coord_1 in range(first - 10, first + 10):
                df_idx = s[s == coord_1].index[0]
                if tx_strand == '+':
                    df.loc[df_idx, 'intron'] = 'First'
                elif tx_strand == '-':
                    df.loc[df_idx, 'intron'] = 'Last'
            elif coord_1 in range(last - 10, last + 10):
                df_idx = s[s == coord_1].index[0]
                if tx_strand == '+':
                    df.loc[df_idx, 'intron'] = 'Last'
                elif tx_strand == '-':
                    df.loc[df_idx, 'intron'] = 'First'

    df = df[df['contained in'] != '']
    df = df.reset_index()
    return df
Ejemplo n.º 28
0
def peak_to_seq_pipeline(untagged_peak_file,
                         tagged1_peak_file,
                         tagged2_peak_file,
                         gff3,
                         fasta,
                         junction_df=None,
                         branch_df=None,
                         cutoff=5,
                         name='CP_peaks'):
    '''Run the peak pipeline: call, compare, annotate, add sequences, write bedgraph.

    Parameters
    ----------
    untagged_peak_file, tagged1_peak_file, tagged2_peak_file : str
            Peak files, each passed to CP_peaks_by_gene
    gff3 : str
            Annotation file; a name containing 'pombe' selects organism='pombe'
    fasta : dict or str
            Sequences; a str is converted with SP.make_fasta_dict
    junction_df, branch_df : default None
            Not used by this function
    cutoff : int, default 5
            Passed to CP_peaks_by_gene
    name : str, default 'CP_peaks'
            Prefix of the bedgraph file that is written (name + '.bedgraph')

    Returns
    -------
    peak_seq_df : DataFrame returned by add_sequence_to_df
    '''
    organism = 'pombe' if 'pombe' in gff3 else None

    transcript_dict = SP.build_transcript_dict(gff3, organism=organism)
    print("Finding peaks in transcripts...")

    # Same peak calling for the untagged sample and both tagged replicates
    called = []
    for peak_file in (untagged_peak_file, tagged1_peak_file, tagged2_peak_file):
        print(peak_file)
        called.append(
            CP_peaks_by_gene(peak_file, transcript_dict, cutoff=cutoff))
    untagged, tagged1, tagged2 = called

    print("Comparing peaks between replicates...")
    peaks = CP_compare_reps(untagged, tagged1, tagged2)

    print("Checking peaks against annotation...")
    ss_dict, flag = SP.list_splice_sites(gff3, organism=organism)
    peak_df = collapse_unpredicted_peaks(
        CP_compare_to_annotation(peaks, ss_dict, transcript_dict))
    position_labels = peak_df['position'].apply(int).apply(str)
    peak_df['genome coord'] = peak_df['chromosome'].str.cat(position_labels,
                                                            sep=':')

    if type(fasta) == str:
        fasta = SP.make_fasta_dict(fasta)
    print("Adding sequences...")
    peak_seq_df = add_sequence_to_df(peak_df, fasta, flag=flag)

    print("Writing bedgraph...")
    with open(name + '.bedgraph', 'w') as fout:
        for ix, r in peak_seq_df.iterrows():
            # Minus-strand peaks step to the left and get a negative height
            if r['strand'] == '+':
                position2 = r['position'] + 1
                height = r['height']
            elif r['strand'] == '-':
                position2 = r['position'] - 1
                height = r['height'] * -1
            fields = (r['chromosome'], r['position'], position2, height, '\n')
            fout.write('\t'.join(str(x) for x in fields))

    print("Completed")
    return peak_seq_df
Ejemplo n.º 29
0
def generate_consensus_matrix(gff3, fasta_dict, PSSM=False):
    '''Count bases in 8-nt windows at every annotated 5' and 3' splice site.

    Parameters
    ----------
    gff3 : str
            Annotation file; 'pombe' in the lower-cased name selects pombe settings
    fasta_dict : dict
            Sequences indexed as fasta_dict[chrom][start:end]
    PSSM : bool, default False
            False - counts are divided by the number of introns
            True - zero counts are set to 1, then log2((count / introns) / nuc_prob[row])

    Returns
    -------
    (pos_matrix_5prime, pos_matrix_3prime) : 4x8 numpy arrays, rows ordered A, C, T, G
    '''
    is_pombe = 'pombe' in gff3.lower()
    if is_pombe:
        transcript_dict = SP.build_transcript_dict(gff3, organism='pombe')
        ss, flag = SP.list_splice_sites(gff3, organism='pombe')
    else:
        transcript_dict = SP.build_transcript_dict(gff3)
        ss, flag = SP.list_splice_sites(gff3)
    ss_dict = SP.collapse_ss_dict(ss)
    nuc_prob = gc_content(fasta_dict)

    base_dict = {"A":0, "C":1, "T":2, "G":3}

    pos_matrix_5prime = np.zeros([4,8])
    pos_matrix_3prime = np.zeros([4,8])

    n_introns = 0
    for transcript, introns in ss_dict.items():
        isoform = transcript+'.1' if is_pombe else transcript+'T0'
        strand = transcript_dict[isoform][2]
        chrom = transcript_dict[isoform][3]

        for intron in introns:
            n_introns += 1
            # (matrix to fill, site coordinate, plus-strand window, minus-strand window)
            site_specs = ((pos_matrix_5prime, intron[0], (-1, 7), (-6, 2)),
                          (pos_matrix_3prime, intron[1], (-5, 3), (-2, 6)))
            for matrix, site, plus_win, minus_win in site_specs:
                if strand == '+':
                    seq = fasta_dict[chrom][(site+plus_win[0]):(site+plus_win[1])]
                elif strand == '-':
                    seq = fasta_dict[chrom][(site+minus_win[0]):(site+minus_win[1])]
                    seq = SP.reverse_complement(seq)

                for column, base in enumerate(seq):
                    matrix[base_dict[base],column] += 1

    float_formatter = lambda x: "%.1f" % x
    np.set_printoptions(formatter={'float_kind':float_formatter})

    total = float(n_introns)
    if PSSM is False:
        # Plain frequencies
        for a in range(4):
            for b in range(8):
                pos_matrix_5prime[a,b] = (pos_matrix_5prime[a,b])/total
                pos_matrix_3prime[a,b] = (pos_matrix_3prime[a,b])/total
    if PSSM is True:
        # Log-odds against nuc_prob, with a count of 1 substituted for zero counts
        for a in range(4):
            for b in range(8):
                if pos_matrix_5prime[a,b] == 0: pos_matrix_5prime[a,b] += 1
                if pos_matrix_3prime[a,b] == 0: pos_matrix_3prime[a,b] += 1
                pos_matrix_5prime[a,b] = np.log2((pos_matrix_5prime[a,b]/total)/nuc_prob[a])
                pos_matrix_3prime[a,b] = np.log2((pos_matrix_3prime[a,b]/total)/nuc_prob[a])

    return (pos_matrix_5prime, pos_matrix_3prime)
Ejemplo n.º 30
0
def get_peak_sequence3(input_file, fasta_file, gff3_file, gene_list,window=1000):
    '''Makes a fasta file of peak sequences based on an input file.
    Input file columns - 1: transcript, 2: chromosome, 3: peak center
    Remember to save the input file as an MS-DOS CSV file if exporting from Excel
    Rows whose third column is not an integer (e.g. a header line) are skipped.
    
    Parameters
    ----------
    input_file : str
            CSV file - see above
    fasta_file : str or dict
            .json dictionary of chromosome sequences or fasta file (.json will load faster),
            or an already loaded dictionary
    gff3_file : str
            gff3 file for your organism
    gene_list : str
            Newline-separated transcript names. Names are compared after 'T0' has been
            appended to the input file's names, so they must include that suffix.
    window : int, default 1000
            Size of sequence to retrieve (peak boundaries are window/2 on either side of peak summit)
            
    Outputs
    ------
    <input file name>_peak_sequences.fa : fasta file with the sequences of the
            transcripts named in gene_list

    Returns
    -------
    seq_list : list of (name, sequence) tuples for ALL peaks, not only those in gene_list
    '''

    tx_dict = SP.build_transcript_dict(gff3_file)
    if isinstance(fasta_file, dict):
        fa_dict = fasta_file
    elif fasta_file.endswith('json'):
        with open(fasta_file) as f:
            fa_dict = json.load(f)
    else:
        fa_dict = SP.make_fasta_dict(fasta_file)

    # Floor division keeps the boundaries integral under true division as well
    half_window = window // 2

    seq_list = []
    with open(input_file,'r') as csv_file:
        for row in csv.reader(csv_file, dialect=csv.excel):
            for tx in row[0].split(','):
                tx = tx+'T0'
                if tx.startswith('3P'): tx = tx.split('3P')[1]

                chrom = row[1]
                if not chrom.startswith('chr'):
                    chrom = 'chr'+str(chrom)
                try:
                    center = int(row[2])
                    start = center-half_window
                    end = center+half_window
                    if tx in tx_dict:
                        strand = tx_dict[tx][2]
                    else:
                        # Unknown transcript: assume plus strand, name the peak by location
                        print(tx+" not in GFF3 file")
                        strand = '+'
                        tx = chrom+':'+str(center)
                    seq = seq_simple(chrom, start, end, strand, fa_dict)
                    seq_list.append((tx,seq))

                except ValueError:
                    # Deliberate best effort: rows without an integer peak center are skipped
                    pass

    # A set makes the per-sequence membership test constant time
    genes_of_interest = set(gene_list.split("\n"))

    with open('{0}_peak_sequences.fa'.format(input_file.split('/')[-1].split('.')[0]),'w') as fout:
        for tx, seq in seq_list:
            if tx in genes_of_interest:
                fout.write('>'+tx+'\n')
                fout.write(seq+'\n')
    return seq_list
Ejemplo n.º 31
0
def _venn_pair(df_dict, genome_size, name_a, name_b, colors):
    '''Compare two gene tables: hypergeometric test, venn diagram and merged csv.'''
    df_a = df_dict[name_a]
    df_b = df_dict[name_b]

    # N = genome size
    # n = number of genes in analysis (so len(a)+len(b)-overlap)
    # K = number of genes in group 1 (len(a)), J = number in group 2 (len(b))
    # k = overlap (len(a&b))
    K = len(df_a)
    J = len(df_b)
    k = len(set(df_a.index).intersection(df_b.index))
    n = K + J - k

    p_value = hypergeometric(genome_size, n, K, J, k)
    if p_value is not None:
        venn_2sample(n, K, k, J, name_a, name_b, colors, p_value)
        merged = df_a.merge(df_b, right_index=True, left_index=True)
        merged.to_csv('{0}_{1}_overlap.csv'.format(name_a, name_b))


def gene_venn(csv_files, organism):
    '''Finds overlap between 2 or 3 lists of genes.
    
    Parameters
    ----------
    csv_files : list
               2 or 3 csv files where the first column is the gene name (make sure the gene name format matches).
               Comma- and tab-separated files are both accepted.
    organism : str
               Options are 'crypto', 'cerevisiae' or 'pombe'
    
    Output
    ------
    Venn diagrams (pairwise, drawn by venn_2sample) and merged csv files
    ('<name1>_<name2>_overlap.csv') containing the overlapping genes.

    Raises
    ------
    ValueError : if organism is not one of the recognised options'''

    organism_key = organism.lower()
    if 'pombe' in organism_key:
        gff3 = '/home/jordan/GENOMES/POMBE/schizosaccharomyces_pombe.chr.gff3'
        organism = 'pombe'
    elif 'crypto' in organism_key or 'h99' in organism_key:
        organism = None
        gff3 = '/home/jordan/GENOMES/CNA3_all_transcripts.gff3'
    elif 'cerev' in organism_key:
        organism = None
        gff3 = '/home/jordan/GENOMES/S288C/saccharomyces_cerevisiae_R64-2-1_20150113.gff3'
    else:
        raise ValueError("Unrecognised organism '{0}': options are 'crypto', "
                         "'cerevisiae' or 'pombe'".format(organism))

    tx_dict = SP.build_transcript_dict(gff3, organism=organism)
    # Gene name = transcript name without its 2-character isoform suffix
    genes = {tx[:-2] for tx in tx_dict}

    df_dict = {}
    names = []
    for csv_path in csv_files:
        name = csv_path.split('/')[-1].split('.')[0]
        names.append(name)
        # Peek at the first line to detect the delimiter; the handle is closed right away
        with open(csv_path) as handle:
            first_line = handle.readline()
        if len(first_line.split(',')) > 1:
            df_dict[name] = pd.read_csv(csv_path, index_col=0)
            df_dict[name] = add_col_level(df_dict[name], name)
        elif len(first_line.split('\t')) > 1:
            df_dict[name] = pd.read_csv(csv_path, index_col=0, sep='\t')
            df_dict[name] = add_col_level(df_dict[name], name)
        else:
            # Single-column file: gene names only
            df_dict[name] = pd.read_csv(csv_path, index_col=0)

    N = len(genes)

    ## Compare sample 1 to sample 2
    _venn_pair(df_dict, N, names[0], names[1],
               ['crimson', 'deepskyblue', 'darkorchid'])

    if len(names) == 3:
        ## Compare sample 1 to sample 3
        _venn_pair(df_dict, N, names[0], names[2],
                   ['crimson', 'gold', 'darkorange'])

        ## Compare sample 2 to sample 3
        _venn_pair(df_dict, N, names[1], names[2],
                   ['deepskyblue', 'gold', 'forestgreen'])
Ejemplo n.º 32
0
def get_peak_sequence3(input_file,
                       fasta_file,
                       gff3_file,
                       gene_list,
                       window=1000):
    '''Makes a fasta file of peak sequences for a selected set of genes.
    Input file columns - 1: transcript, 2: chromosome, 3: peak center
    Remember to save the input file as an MS-DOS CSV file if exporting from Excel
    Note: a sequence is retrieved for every usable row, but only the entries
    named in gene_list are written to the fasta file.
    
    Parameters
    ----------
    input_file : str
            CSV file - see above. The first column may hold several
            comma-separated transcript names. Rows with fewer than three
            columns, or whose third column is not an integer (e.g. a header
            row), are skipped.
    fasta_file : str or dict
            .json dictionary of chromosome sequences or fasta file (.json will load faster).
            An already loaded dictionary is used as is.
    gff3_file : str
            gff3 file for your organism
    gene_list: str
            Newline-separated names (the string itself is split on newlines, it is
            not opened as a file). Names are compared with the transcript name
            after 'T0' has been appended and a leading '3P' removed, or with
            'chromosome:center' for transcripts that are not in the gff3 file.
    window : int, default 1000
            Size of sequence to retrieve (peak boundaries are window/2 on either side of peak summit)
            
    Outputs
    ------
    peak_fasta : fasta file named <input file basename>_peak_sequences.fa, written
            to the current directory, holding the sequences named in gene_list

    Returns
    -------
    seq_list : list of (name, sequence) tuples for every retrieved peak,
            whether or not it was written to the fasta file
    '''

    tx_dict = SP.build_transcript_dict(gff3_file)
    if isinstance(fasta_file, dict):
        fa_dict = fasta_file
    elif fasta_file.endswith('json'):
        with open(fasta_file) as f:
            fa_dict = json.load(f)
    else:
        fa_dict = SP.make_fasta_dict(fasta_file)

    # Floor division keeps the coordinates integral under Python 2 and 3.
    half_window = window // 2

    seq_list = []
    with open(input_file, 'r') as csv_file:
        for row in csv.reader(csv_file, dialect=csv.excel):
            # Blank or truncated lines carry no peak information.
            if len(row) < 3:
                continue
            # Only the integer parse is guarded: a non-numeric center marks a
            # header or malformed row, which is skipped. Errors raised while
            # fetching the sequence are no longer silently swallowed.
            try:
                center = int(row[2])
            except ValueError:
                continue

            chrom = row[1]
            if not chrom.startswith('chr'):
                chrom = 'chr' + chrom
            start = center - half_window
            end = center + half_window

            for tx in row[0].split(','):
                tx = tx + 'T0'
                if tx.startswith('3P'):
                    # Drop only the leading marker; splitting on '3P' would
                    # also cut the name at any later occurrence.
                    tx = tx[len('3P'):]

                if tx in tx_dict:
                    strand = tx_dict[tx][2]
                else:
                    print(tx + " not in GFF3 file")
                    # Unknown transcript: use the plus strand and label the
                    # peak by its genomic position instead.
                    strand = '+'
                    tx = chrom + ':' + str(center)
                seq = seq_simple(chrom, start, end, strand, fa_dict)
                seq_list.append((tx, seq))

    # A set gives constant-time membership tests while writing.
    genes_of_interest = set(gene_list.split("\n"))

    out_name = '{0}_peak_sequences.fa'.format(
        input_file.split('/')[-1].split('.')[0])
    with open(out_name, 'w') as fout:
        for tx, seq in seq_list:
            if tx in genes_of_interest:
                fout.write('>' + tx + '\n')
                fout.write(seq + '\n')
    return seq_list
Ejemplo n.º 33
0
def _tally_reads(reads_iter, strand, intron_bounds):
    '''Count reads oriented opposite to the transcript strand and, per intron, how many of them fall in its window.'''
    total = 0
    hits = dict.fromkeys(intron_bounds, 0)
    for read in reads_iter:
        if read.is_reverse and strand == '+':
            pos = read.reference_end
        elif not read.is_reverse and strand == '-':
            pos = read.reference_start
        else:
            continue
        total += 1
        # A read without a usable coordinate still counts toward the total
        # but cannot fall inside any window.
        if pos is None:
            continue
        for ix, (lo, hi) in intron_bounds.items():
            if lo <= pos < hi:
                hits[ix] += 1
    return total, hits


def count_reads_in_transcript(bam_files, df, gff3, organism=None):
    '''Counts reads over each transcript and over the introns listed in df.

    Only reads whose orientation is opposite to the transcript strand are
    counted (reverse reads for '+' transcripts, forward reads for '-'
    transcripts) - presumably because the libraries are reverse-stranded;
    confirm against the sequencing protocol.

    Parameters
    ----------
    bam_files : list of str
            BAM files to count reads from
    df : pandas.DataFrame
            One row per intron. The columns 'transcript', 'position' and
            'intron size' are read.
    gff3 : str
            gff3 file for your organism
    organism : str, default None
            If 'pombe', '.1' is appended to the transcript name and chr1/chr2/chr3
            are translated to I/II/III before fetching reads. Otherwise 'T0'
            is appended to the transcript name.

    Returns
    -------
    all_reads : dict
            BAM file name -> DataFrame with the same index as df and the columns
            'total' (counted reads per 1000 bases of the transcript span) and
            'intron' (reads ending in the intron window divided by intron size,
            relative to counted reads divided by the transcript span).
            Both values are NaN when a division by zero occurs (no counted
            reads, zero intron size or zero-length transcript); the row index
            is printed in that case. Rows of transcripts whose strand is
            neither '+' nor '-' are left as NaN.
    '''
    tx_dict = SP.build_transcript_dict(gff3, organism=organism)
    is_pombe = organism == 'pombe'
    tx_suffix = '.1' if is_pombe else 'T0'
    lat_rom = {'chr1':'I','chr2':'II','chr3':'III'}

    all_reads = {}
    for bam_file in bam_files:
        counts = pd.DataFrame(index=df.index, columns=['total','intron'])
        reader = pysam.Samfile(bam_file)
        # Close the BAM handle even if counting fails part way through.
        try:
            for tx_name in set(df['transcript']):
                tx_df = df[df['transcript'] == tx_name]
                start, end, strand, CDS_start, CDS_end, exons, chrom = SP.tx_info(tx_name+tx_suffix, tx_dict)
                if is_pombe:
                    chrom = lat_rom[chrom]

                # Half-open [lo, hi) window per intron. Comparing against the
                # bounds replaces a scan over an explicit list of positions.
                intron_bounds = {}
                for ix, r in tx_df.iterrows():
                    if strand == '+':
                        lo = int(r['position'])
                        hi = int(r['position']+r['intron size'])+1
                    elif strand == '-':
                        lo = int(r['position']-r['intron size'])
                        hi = int(r['position'])+1
                    else:
                        # No window can be defined without a strand.
                        continue
                    intron_bounds[ix] = (lo, hi)

                reads, hits = _tally_reads(reader.fetch(chrom, start, end),
                                           strand, intron_bounds)

                for ix in intron_bounds:
                    try:
                        counts.loc[ix,'total'] = reads/float(end-start)*1000
                        counts.loc[ix,'intron'] = ((hits[ix]/float(tx_df.loc[ix,'intron size'])) /
                                                   (reads/float(end-start)))
                    except ZeroDivisionError:
                        counts.loc[ix,'total'] = np.nan
                        counts.loc[ix,'intron'] = np.nan
                        print(ix)
        finally:
            reader.close()
        all_reads[bam_file] = counts

    return all_reads
Ejemplo n.º 34
0
def build_junction_df(junction_bed, gff3_file, fasta, organism=None):
    '''Builds a DataFrame of splice junctions with the sequence around each end.

    Parameters
    ----------
    junction_bed : str
            Junction file passed on to build_junction_dict
    gff3_file : str
            gff3 file for your organism
    fasta : str or dict
            Fasta file name (loaded with SP.make_fasta_dict) or an already
            loaded dictionary of chromosome sequences
    organism : str, default None
            Passed on to SP.build_transcript_dict and build_junction_dict

    Returns
    -------
    junction_df : pandas.DataFrame
            One row per junction with the columns 'intron tuple', 'chromosome',
            'start', 'end', 'strand', 'depth', 'type', 'size',
            'annotated intron size', 'annotated intron start' and
            'annotated intron end', plus:
            sequence1, sequence2 : 8 base windows around the junction start and
                    end ('+' strand: [start-1, start+7) and [end-5, end+3);
                    '-' strand: reverse complement of [start-6, start+2) and
                    [end-2, end+6)); 'NNNNNNNN' for any other strand
            annotated sequence1, annotated sequence2 : the same windows taken at
                    the annotated intron start/end, or None when no annotated
                    intron start is recorded
            seq type1, seq type2 : '5p annotated' / '3p annotated' for
                    'Annotated' junctions; for '5p tethered' junctions
                    '5p annotated' and bases 4-5 of sequence2; otherwise
                    bases 2-3 of sequence1 and bases 4-5 of sequence2
                    (presumably the dinucleotides at the intron ends - confirm)
            transcript : first element of 'intron tuple'
    '''
    transcript_dict = SP.build_transcript_dict(gff3_file, organism=organism)
    if isinstance(fasta, str):
        fasta = SP.make_fasta_dict(fasta)
    junction_dict = build_junction_dict(junction_bed, gff3_file, transcript_dict, organism=organism)

    columns = ['intron tuple','chromosome','start','end','strand','depth','type','size','annotated intron size','annotated intron start','annotated intron end']
    # Build the frame in one step rather than row by row through the removed
    # .ix indexer. dtype=object keeps None (and the key stored in
    # 'intron tuple') as plain Python objects, which the "is None" checks
    # below rely on.
    rows = [[tx]+junction
            for tx, junctions in junction_dict.items()
            for junction in junctions]
    junction_df = pd.DataFrame(rows, columns=columns, dtype=object)

    unknown = 'NNNNNNNN'
    sequence1 = []
    sequence2 = []
    ann_seq1 = []
    ann_seq2 = []
    seq_type1 = []
    seq_type2 = []
    df_tx = []
    for index, row in junction_df.iterrows():
        df_tx.append(row['intron tuple'][0])
        chrom = convert_chrom(row['chromosome'])
        strand = row['strand']
        start = row['start']
        end = row['end']
        ann_start = row['annotated intron start']
        ann_end = row['annotated intron end']

        if strand == '+':
            curr1 = fasta[chrom][(start-1):(start+7)]
            curr2 = fasta[chrom][(end-5):(end+3)]
            if ann_start is None:
                ann1 = None
                ann2 = None
            else:
                ann1 = fasta[chrom][(ann_start-1):(ann_start+7)]
                ann2 = fasta[chrom][(ann_end-5):(ann_end+3)]
        elif strand == '-':
            curr1 = SP.reverse_complement(fasta[chrom][(start-6):(start+2)])
            curr2 = SP.reverse_complement(fasta[chrom][(end-2):(end+6)])
            if ann_start is None:
                ann1 = None
                ann2 = None
            else:
                ann1 = SP.reverse_complement(fasta[chrom][ann_start-6:ann_start+2])
                ann2 = SP.reverse_complement(fasta[chrom][ann_end-2:ann_end+6])
        else:
            # Reset the current windows as well, so the "seq type" columns
            # below are derived from this row and not from the previous one.
            curr1 = unknown
            curr2 = unknown
            ann1 = unknown
            ann2 = unknown

        sequence1.append(curr1)
        sequence2.append(curr2)
        ann_seq1.append(ann1)
        ann_seq2.append(ann2)

        if row['type'] == 'Annotated':
            seq_type1.append('5p annotated')
            seq_type2.append('3p annotated')
        elif row['type'] == '5p tethered':
            seq_type1.append('5p annotated')
            seq_type2.append(curr2[4:6])
        else:
            seq_type1.append(curr1[2:4])
            seq_type2.append(curr2[4:6])

    junction_df['sequence1'] = sequence1
    junction_df['sequence2'] = sequence2
    junction_df['seq type1'] = seq_type1
    junction_df['seq type2'] = seq_type2
    junction_df['annotated sequence1'] = ann_seq1
    junction_df['annotated sequence2'] = ann_seq2
    junction_df['transcript'] = df_tx

    return junction_df