def main():
    """Assign peaks to promoters and launch the motif search from the shell.

    ``sys.argv[1]`` is the peak file; its base name up to the first ``.``
    becomes the prefix of every output file.  When exactly one further
    argument is present (``sys.argv[2]``) it is handed to ``split_by_gene``
    as the gene list, and the two resulting groups are written and processed
    separately.
    """
    gff3 = '/home/jordan/GENOMES/CNA3_all_transcripts.gff3'
    fasta = '/home/jordan/GENOMES/H99_fa.json'
    chrom_lengths = '/home/jordan/GENOMES/H99_chrom_lengths.json'
    site_cap = 600  # upper bound applied to every minsites value

    def capped_minsites(frame):
        # 75% of the rows, truncated to an int and never above the cap
        return min(int(0.75 * len(frame)), site_cap)

    peak_file = sys.argv[1]
    prefix = peak_file.split('/')[-1].split('.')[0]
    print(prefix)

    tx_dict = SP.build_transcript_dict(gff3)
    tx_by_chrom = sort_tx_by_chrom(tx_dict)
    int_dict = make_promoter_dict(tx_dict, chrom_lengths)
    peak_df = assign_peak_to_tx(tx_by_chrom, int_dict, peak_file, cutoff=2)
    peak_df = find_best_peaks(peak_df, int_dict, max_genes=300)

    split = len(sys.argv) == 3
    if split:
        in_list, other = split_by_gene(peak_df, sys.argv[2])
        in_list.to_csv(prefix + '_by_gene_in_list.csv')
        other.to_csv(prefix + '_by_gene_other.csv')
        generate_sequence_file(in_list, int_dict, fasta, prefix + '_in_list')
        generate_sequence_file(other, int_dict, fasta, prefix + '_other')
        minsites = [capped_minsites(group) for group in (in_list, other)]
    else:
        peak_df.to_csv(prefix + '_by_gene.csv')
        generate_sequence_file(peak_df, int_dict, fasta, prefix)
        minsites = capped_minsites(peak_df)

    call_meme(prefix, minsites, split=split)
def generate_all_ss_seqs(gff3, fasta_dict, organism):
    """Collect the 8-nt window around every annotated 5' and 3' splice site.

    Parameters
    ----------
    gff3 : str
        Annotation file handed to the ``SP`` helpers.
    fasta_dict : dict
        Chromosome name -> sequence (anything that supports slicing).
    organism : str or None
        'pombe' selects the '.1' isoform suffix; any other value uses 'T0'.

    Returns
    -------
    all_seq5, all_seq3 : list, list
        Windows around the 5' and 3' sites respectively.  Minus-strand
        windows are reverse complemented.
    """
    transcript_dict = SP.build_transcript_dict(gff3, organism=organism)
    ss, _ = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss)

    suffix = '.1' if organism == 'pombe' else 'T0'
    all_seq5 = []
    all_seq3 = []
    for transcript, introns in ss_dict.items():
        isoform = transcript + suffix
        strand = transcript_dict[isoform][2]
        chrom = transcript_dict[isoform][3]
        for intron in introns:
            five_site, three_site = intron[0], intron[1]
            if strand == '+':
                seq5 = fasta_dict[chrom][(five_site - 1):(five_site + 7)]
                seq3 = fasta_dict[chrom][(three_site - 5):(three_site + 3)]
            elif strand == '-':
                seq5 = SP.reverse_complement(
                    fasta_dict[chrom][(five_site - 6):(five_site + 2)])
                seq3 = SP.reverse_complement(
                    fasta_dict[chrom][(three_site - 2):(three_site + 6)])
            all_seq5.append(seq5)
            all_seq3.append(seq3)
    return all_seq5, all_seq3
def generate_all_ss_seqs(gff3, fasta_dict, organism):
    """Collect the 8-nt window around every annotated 5' and 3' splice site.

    Parameters
    ----------
    gff3 : str
        Annotation file handed to the ``SP`` helpers.
    fasta_dict : dict
        Chromosome name -> sequence (anything that supports slicing).
    organism : str or None
        'pombe' selects the '.1' isoform suffix; any other value uses 'T0'.

    Returns
    -------
    all_seq5, all_seq3 : list, list
        Windows around the 5' and 3' sites respectively.  Minus-strand
        windows are reverse complemented.
    """
    transcript_dict = SP.build_transcript_dict(gff3, organism=organism)
    ss, _ = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss)

    suffix = '.1' if organism == 'pombe' else 'T0'
    all_seq5 = []
    all_seq3 = []
    for transcript, introns in ss_dict.items():
        isoform = transcript + suffix
        strand = transcript_dict[isoform][2]
        chrom = transcript_dict[isoform][3]
        for intron in introns:
            five_site, three_site = intron[0], intron[1]
            if strand == '+':
                seq5 = fasta_dict[chrom][(five_site - 1):(five_site + 7)]
                seq3 = fasta_dict[chrom][(three_site - 5):(three_site + 3)]
            elif strand == '-':
                seq5 = SP.reverse_complement(
                    fasta_dict[chrom][(five_site - 6):(five_site + 2)])
                seq3 = SP.reverse_complement(
                    fasta_dict[chrom][(three_site - 2):(three_site + 6)])
            all_seq5.append(seq5)
            all_seq3.append(seq3)
    return all_seq5, all_seq3
def build_transcript_dict(gff3="/home/jordan/GENOMES/POMBE/schizosaccharomyces_pombe.chr.gff3", expand=False, convert_chroms=False):
    """Build the pombe transcript dictionary, optionally renamed and padded.

    Parameters
    ----------
    gff3 : str
        Annotation file passed to ``SP.build_transcript_dict``.
    expand : bool, default False
        When ``True``, every transcript interval is widened by 300 nt on each
        side (clamped to 0 and, for chromosomes with a known length, to the
        chromosome end) and empty CDS start/end lists are replaced by the
        transcript start/end.
    convert_chroms : bool, default False
        When ``True``, 'chr1'/'chr2'/'chr3' are renamed to 'I'/'II'/'III'.
        Chromosome names without a mapping are kept unchanged.

    Returns
    -------
    dict
        transcript -> [start, end, strand, chromosome, CDS starts, CDS ends]
    """
    transcript_dict = SP.build_transcript_dict(gff3, organism='pombe')

    if convert_chroms is True:
        lat_rom = {'chr1': 'I', 'chr2': 'II', 'chr3': 'III', 'MT': 'MT'}
        # .get() keeps unmapped chromosome names instead of raising KeyError;
        # the expand branch below already tolerates unknown chromosomes.
        transcript_dict = {
            tx: [start, end, strand, lat_rom.get(chrom, chrom),
                 cds_start, cds_end]
            for tx, [start, end, strand, chrom, cds_start, cds_end]
            in transcript_dict.items()
        }

    if expand is True:
        chrom_lengths = {
            'I': 5818680, 'II': 4744158, 'III': 2598968,
            'chr1': 5818680, 'chr2': 4744158, 'chr3': 2598968,
        }
        flank = 300
        expanded_dict = {}
        for tx, info in transcript_dict.items():
            start, end, strand, chrom = info[0], info[1], info[2], info[3]
            new_start = max(start - flank, 0)
            new_end = end + flank
            if chrom in chrom_lengths:
                new_end = min(new_end, chrom_lengths[chrom])
            # Fall back to the transcript bounds when no CDS is annotated.
            # New lists are built so the lists returned by SP stay untouched.
            cds_starts = info[4] if len(info[4]) > 0 else [start]
            cds_ends = info[5] if len(info[5]) > 0 else [end]
            expanded_dict[tx] = [new_start, new_end, strand, chrom,
                                 cds_starts, cds_ends]
        transcript_dict = expanded_dict

    return transcript_dict
def main():
    """Assign peaks to promoters and launch the motif search from the shell.

    ``sys.argv[1]`` is the peak file; its base name up to the first ``.``
    becomes the prefix of every output file.  When exactly one further
    argument is present (``sys.argv[2]``) it is handed to ``split_by_gene``
    as the gene list, and the two resulting groups are written and processed
    separately.
    """
    gff3 = '/home/jordan/GENOMES/CNA3_all_transcripts.gff3'
    fasta = '/home/jordan/GENOMES/H99_fa.json'
    chrom_lengths = '/home/jordan/GENOMES/H99_chrom_lengths.json'
    site_cap = 600  # upper bound applied to every minsites value

    def capped_minsites(frame):
        # 75% of the rows, truncated to an int and never above the cap
        return min(int(0.75 * len(frame)), site_cap)

    peak_file = sys.argv[1]
    prefix = peak_file.split('/')[-1].split('.')[0]
    print(prefix)

    tx_dict = SP.build_transcript_dict(gff3)
    tx_by_chrom = sort_tx_by_chrom(tx_dict)
    int_dict = make_promoter_dict(tx_dict, chrom_lengths)
    peak_df = assign_peak_to_tx(tx_by_chrom, int_dict, peak_file, cutoff=2)
    peak_df = find_best_peaks(peak_df, int_dict, max_genes=300)

    split = len(sys.argv) == 3
    if split:
        in_list, other = split_by_gene(peak_df, sys.argv[2])
        in_list.to_csv(prefix + '_by_gene_in_list.csv')
        other.to_csv(prefix + '_by_gene_other.csv')
        generate_sequence_file(in_list, int_dict, fasta, prefix + '_in_list')
        generate_sequence_file(other, int_dict, fasta, prefix + '_other')
        minsites = [capped_minsites(group) for group in (in_list, other)]
    else:
        peak_df.to_csv(prefix + '_by_gene.csv')
        generate_sequence_file(peak_df, int_dict, fasta, prefix)
        minsites = capped_minsites(peak_df)

    call_meme(prefix, minsites, split=split)
def get_sequence(coord_dict, gff3_file, fasta_file):
    """Fetch the 20-nt genomic window around candidate splice-site coordinates.

    Parameters
    ----------
    coord_dict : dict
        transcript -> pair of coordinate collections; element 0 is checked
        for a 5' splice-site motif and element 1 for a 3' motif.
    gff3_file : str
        Annotation file; 'pombe' in the name selects the pombe organism.
    fasta_file : str or dict
        A path (loaded with ``make_fasta_dict``) or an already loaded
        chromosome -> sequence mapping.

    Returns
    -------
    dict
        transcript -> list of ``(sequence, seq_type)`` tuples.  ``seq_type``
        is "5'" when window[10:12] is GT or GC (first collection), "3'" when
        window[8:10] is AG (second collection), and 'other' otherwise.
    """
    organism = 'pombe' if 'pombe' in gff3_file else None
    transcript_dict = SP.build_transcript_dict(gff3_file, organism=organism)

    if type(fasta_file) is str:
        fasta_dict = make_fasta_dict(fasta_file)
    else:
        fasta_dict = fasta_file

    # (index into coord_sets, motif location in the window, motifs, label)
    site_rules = (
        (0, slice(10, 12), ('GT', 'GC'), "5'"),
        (1, slice(8, 10), ('AG',), "3'"),
    )

    seq_dict = {}
    for transcript, coord_sets in coord_dict.items():
        seq_dict[transcript] = []
        chrom = transcript_dict[transcript][3]
        strand = transcript_dict[transcript][2]

        for set_index, motif_at, motifs, label in site_rules:
            for coord in coord_sets[set_index]:
                if strand == "+":
                    sequence = fasta_dict[chrom][(coord - 9):(coord + 11)]
                elif strand == "-":
                    sequence = fasta_dict[chrom][(coord - 10):(coord + 10)]
                    sequence = SP.reverse_complement(sequence)

                seq_type = label if sequence[motif_at] in motifs else 'other'
                seq_dict[transcript].append((sequence, seq_type))

    return seq_dict
def get_sequence(coord_dict, gff3_file, fasta_file):
    """Fetch the 20-nt genomic window around candidate splice-site coordinates.

    Parameters
    ----------
    coord_dict : dict
        transcript -> pair of coordinate collections; element 0 is checked
        for a 5' splice-site motif and element 1 for a 3' motif.
    gff3_file : str
        Annotation file; 'pombe' in the name selects the pombe organism.
    fasta_file : str or dict
        A path (loaded with ``make_fasta_dict``) or an already loaded
        chromosome -> sequence mapping.

    Returns
    -------
    dict
        transcript -> list of ``(sequence, seq_type)`` tuples.  ``seq_type``
        is "5'" when window[10:12] is GT or GC (first collection), "3'" when
        window[8:10] is AG (second collection), and 'other' otherwise.
    """
    organism = 'pombe' if 'pombe' in gff3_file else None
    transcript_dict = SP.build_transcript_dict(gff3_file, organism=organism)

    if type(fasta_file) is str:
        fasta_dict = make_fasta_dict(fasta_file)
    else:
        fasta_dict = fasta_file

    # (index into coord_sets, motif location in the window, motifs, label)
    site_rules = (
        (0, slice(10, 12), ('GT', 'GC'), "5'"),
        (1, slice(8, 10), ('AG',), "3'"),
    )

    seq_dict = {}
    for transcript, coord_sets in coord_dict.items():
        seq_dict[transcript] = []
        chrom = transcript_dict[transcript][3]
        strand = transcript_dict[transcript][2]

        for set_index, motif_at, motifs, label in site_rules:
            for coord in coord_sets[set_index]:
                if strand == "+":
                    sequence = fasta_dict[chrom][(coord - 9):(coord + 11)]
                elif strand == "-":
                    sequence = fasta_dict[chrom][(coord - 10):(coord + 10)]
                    sequence = SP.reverse_complement(sequence)

                seq_type = label if sequence[motif_at] in motifs else 'other'
                seq_dict[transcript].append((sequence, seq_type))

    return seq_dict
def build_tss_dict(gff3="/home/jordan/GENOMES/POMBE/schizosaccharomyces_pombe.chr.gff3", window=220):
    """Build an interval of +/- ``window`` nt around each transcript's start.

    The interval is centred on the transcript start coordinate for '+'
    transcripts and on the end coordinate for '-' transcripts.  Transcripts
    with any other strand value are left out.

    Returns
    -------
    dict
        transcript -> [start, end, strand, chromosome]
    """
    transcript_dict = SP.build_transcript_dict(gff3, organism='pombe')

    tss_dict = {}
    for tx, info in transcript_dict.items():
        strand = info[2]
        if strand == '+':
            centre = info[0]
        elif strand == '-':
            centre = info[1]
        else:
            continue
        tss_dict[tx] = [centre - window, centre + window, strand, info[3]]
    return tss_dict
def build_tss_dict(
        gff3="/home/jordan/GENOMES/POMBE/schizosaccharomyces_pombe.chr.gff3",
        window=220):
    """Build an interval of +/- ``window`` nt around each transcript's start.

    The interval is centred on the transcript start coordinate for '+'
    transcripts and on the end coordinate for '-' transcripts.  Transcripts
    with any other strand value are left out.

    Returns
    -------
    dict
        transcript -> [start, end, strand, chromosome]
    """
    transcript_dict = SP.build_transcript_dict(gff3, organism='pombe')

    tss_dict = {}
    for tx, info in transcript_dict.items():
        strand = info[2]
        if strand == '+':
            centre = info[0]
        elif strand == '-':
            centre = info[1]
        else:
            continue
        tss_dict[tx] = [centre - window, centre + window, strand, info[3]]
    return tss_dict
def peak_to_seq_pipeline(untagged_peak_file, tagged1_peak_file, tagged2_peak_file, gff3, fasta, junction_df=None, branch_df=None, cutoff=5, name='CP_peaks'):
    """Run the peak pipeline: call, compare, annotate, add sequence, export.

    Peaks are collected per gene for the untagged sample and the two tagged
    replicates, compared between replicates, checked against the annotated
    splice sites and given sequences.  The result is also written to
    ``<name>.bedgraph``.

    Parameters
    ----------
    untagged_peak_file, tagged1_peak_file, tagged2_peak_file : str
        Peak files handed to ``CP_peaks_by_gene``.
    gff3 : str
        Annotation file; 'pombe' in the name selects the pombe organism.
    fasta : str or dict
        A path (loaded with ``SP.make_fasta_dict``) or a loaded fasta dict.
    junction_df, branch_df
        Accepted but not used by this function.
    cutoff : int, default 5
        Passed through to ``CP_peaks_by_gene``.
    name : str, default 'CP_peaks'
        Prefix of the bedgraph file.

    Returns
    -------
    pandas.DataFrame
        The peak table returned by ``add_sequence_to_df``.
    """
    organism = 'pombe' if 'pombe' in gff3 else None
    transcript_dict = SP.build_transcript_dict(gff3, organism=organism)

    print("Finding peaks in transcripts...")
    per_sample = []
    for peak_file in (untagged_peak_file, tagged1_peak_file,
                      tagged2_peak_file):
        print(peak_file)
        per_sample.append(
            CP_peaks_by_gene(peak_file, transcript_dict, cutoff=cutoff))
    untagged, tagged1, tagged2 = per_sample

    print("Comparing peaks between replicates...")
    peaks = CP_compare_reps(untagged, tagged1, tagged2)

    print("Checking peaks against annotation...")
    ss_dict, flag = SP.list_splice_sites(gff3, organism=organism)
    peak_df = CP_compare_to_annotation(peaks, ss_dict, transcript_dict)
    peak_df = collapse_unpredicted_peaks(peak_df)
    position_text = peak_df['position'].apply(int).apply(str)
    peak_df['genome coord'] = peak_df['chromosome'].str.cat(position_text,
                                                            sep=':')

    if type(fasta) == str:
        fasta = SP.make_fasta_dict(fasta)

    print("Adding sequences...")
    peak_seq_df = add_sequence_to_df(peak_df, fasta, flag=flag)

    print("Writing bedgraph...")
    with open(name + '.bedgraph', 'w') as fout:
        for ix, r in peak_seq_df.iterrows():
            if r['strand'] == '+':
                position2 = r['position'] + 1
                height = r['height']
            elif r['strand'] == '-':
                # minus-strand peaks extend downwards and get negative height
                position2 = r['position'] - 1
                height = r['height'] * -1
            fields = (r['chromosome'], r['position'], position2, height)
            # every record ends with a tab followed by the newline
            fout.write('\t'.join(str(x) for x in fields) + '\t\n')

    print("Completed")
    return peak_seq_df
def collect_intron_seq(gff3_file, fasta_file, ss_dict=None, junction_bed=None, gene_list=None, peak_df=None, organism=None):
    """Collect a 15-nt sequence just inside the 5' end of each intron.

    The introns come from the first source that is supplied, in this order:
    ``ss_dict``, ``junction_bed`` (via ``SP.build_junction_dict``),
    ``peak_df`` (rows whose 'type' does not contain 'prime', each turned into
    a 50-nt pseudo-intron starting at the peak), or the annotation itself.

    Parameters
    ----------
    gff3_file : str
        Annotation file handed to the ``SP`` helpers.
    fasta_file : dict or str
        Loaded fasta dict, path to a json file, or a path for
        ``make_fasta_dict``.
    ss_dict, junction_bed, gene_list, peak_df, organism
        See above; ``gene_list`` is only used when the annotation is the
        intron source, ``organism`` selects the isoform suffix
        ('.1' for 'pombe', otherwise 'T0').

    Returns
    -------
    dict
        '<transcript>-<chrom>:<coordinate>' -> sequence (reverse complemented
        on the minus strand).
    """
    transcript_dict = SP.build_transcript_dict(gff3_file, organism=organism)

    if type(fasta_file) == dict:
        fasta_dict = fasta_file
    elif fasta_file.endswith('json'):
        with open(fasta_file, 'r') as f:
            fasta_dict = json.load(f)
    else:
        fasta_dict = make_fasta_dict(fasta_file)

    if ss_dict is None:
        if junction_bed is not None:
            ss_dict = SP.build_junction_dict(junction_bed, gff3_file,
                                             transcript_dict,
                                             organism=organism)
        elif peak_df is not None:
            ss_dict = {}
            peak_df = peak_df[~peak_df['type'].str.contains('prime')]
            for ix, r in peak_df.iterrows():
                sites = ss_dict.setdefault(r['transcript'], [])
                if r['strand'] == '+':
                    sites.append((r['position'], r['position'] + 50))
                elif r['strand'] == '-':
                    sites.append((r['position'], r['position'] - 50))
        else:
            ss_dict, intron_flag = SP.list_splice_sites(
                gff3_file, gene_list=gene_list, organism=organism)
            ss_dict = SP.collapse_ss_dict(ss_dict)

    suffix = '.1' if organism == 'pombe' else 'T0'
    seq_dict = {}
    for transcript, introns in ss_dict.items():
        # Junction dictionaries are already keyed by isoform name
        if junction_bed is None:
            transcript = transcript + suffix
        strand = transcript_dict[transcript][2]
        chrom = transcript_dict[transcript][3]

        for intron in list(introns):
            five_site = intron[0]
            if strand == '+':
                seq = fasta_dict[chrom][five_site + 2:five_site + 17]
                label = transcript + '-' + chrom + ':' + str(five_site + 1)
                seq_dict[label] = seq
            elif strand == '-':
                seq = fasta_dict[chrom][five_site - 16:five_site - 1]
                label = transcript + '-' + chrom + ':' + str(five_site)
                seq_dict[label] = SP.reverse_complement(seq)
    return seq_dict
def create_branch_df(branch_dict, gff3, fa_dict, organism=None):
    """Turn a branch dictionary into a table of 5' site / branch pairs.

    Parameters
    ----------
    branch_dict : dict
        transcript -> list of entries where entry[0] is a 'chrom:position'
        string for the 5' splice site, entry[1] the branch positions and
        entry[2] the matching read depths.
    gff3 : str
        Annotation file.
    fa_dict : dict
        Passed to ``add_seq``.
    organism : str, default None
        Passed to the ``SP`` helpers and ``find_3p_site``.

    Returns
    -------
    pandas.DataFrame
        Branches that lie more than 5 and at most 1000 nt from the 5' site,
        have depth >= 5 and sit downstream of it (distance > 0), after
        ``add_seq`` and ``find_3p_site`` have been applied.
    """
    tx_dict = SP.build_transcript_dict(gff3, organism=organism)

    transcripts, chroms, fives, branches = [], [], [], []
    depths, distances, strands = [], [], []

    for tx, five_sites in branch_dict.items():
        for five_site in five_sites:
            coord_parts = five_site[0].split(':')
            chrom = coord_parts[0]
            pos = int(coord_parts[1])
            for n, branch in enumerate(five_site[1]):
                gap = abs(branch - pos)
                if not (gap > 5 and gap <= 1000 and five_site[2][n] >= 5):
                    continue
                chroms.append(chrom)
                fives.append(pos)
                transcripts.append(tx)
                branches.append(branch)
                depths.append(five_site[2][n])
                strand = tx_dict[tx][2]
                strands.append(strand)
                # Distance is positive when the branch is downstream
                if strand == '+':
                    distances.append(branch - pos)
                elif strand == '-':
                    distances.append(pos - branch)

    branch_df = pd.DataFrame(index=range(len(fives)))
    for column, values in (('transcript', transcripts),
                           ('chromosome', chroms),
                           ('5p splice site', fives),
                           ('branch site', branches),
                           ('depth', depths),
                           ('distance', distances),
                           ('strand', strands)):
        branch_df[column] = values

    branch_df = branch_df[branch_df['distance'] > 0]
    for column, source in (('genome coord', '5p splice site'),
                           ('branch coord', 'branch site')):
        branch_df[column] = branch_df['chromosome'].str.cat(
            branch_df[source].apply(int).apply(str), sep=':')

    branch_df = add_seq(branch_df, fa_dict)
    return find_3p_site(branch_df, gff3, organism=organism)
def build_transcript_dict(
        gff3="/home/jordan/GENOMES/POMBE/schizosaccharomyces_pombe.chr.gff3",
        expand=False,
        convert_chroms=False):
    """Build the pombe transcript dictionary, optionally renamed and padded.

    Parameters
    ----------
    gff3 : str
        Annotation file passed to ``SP.build_transcript_dict``.
    expand : bool, default False
        When ``True``, every transcript interval is widened by 300 nt on each
        side (clamped to 0 and, for chromosomes with a known length, to the
        chromosome end) and empty CDS start/end lists are replaced by the
        transcript start/end.
    convert_chroms : bool, default False
        When ``True``, 'chr1'/'chr2'/'chr3' are renamed to 'I'/'II'/'III'.
        Chromosome names without a mapping are kept unchanged.

    Returns
    -------
    dict
        transcript -> [start, end, strand, chromosome, CDS starts, CDS ends]
    """
    transcript_dict = SP.build_transcript_dict(gff3, organism='pombe')

    if convert_chroms is True:
        lat_rom = {'chr1': 'I', 'chr2': 'II', 'chr3': 'III', 'MT': 'MT'}
        # .get() keeps unmapped chromosome names instead of raising KeyError;
        # the expand branch below already tolerates unknown chromosomes.
        transcript_dict = {
            tx: [start, end, strand, lat_rom.get(chrom, chrom),
                 cds_start, cds_end]
            for tx, [start, end, strand, chrom, cds_start, cds_end]
            in transcript_dict.items()
        }

    if expand is True:
        chrom_lengths = {
            'I': 5818680, 'II': 4744158, 'III': 2598968,
            'chr1': 5818680, 'chr2': 4744158, 'chr3': 2598968,
        }
        flank = 300
        expanded_dict = {}
        for tx, info in transcript_dict.items():
            start, end, strand, chrom = info[0], info[1], info[2], info[3]
            new_start = max(start - flank, 0)
            new_end = end + flank
            if chrom in chrom_lengths:
                new_end = min(new_end, chrom_lengths[chrom])
            # Fall back to the transcript bounds when no CDS is annotated.
            # New lists are built so the lists returned by SP stay untouched.
            cds_starts = info[4] if len(info[4]) > 0 else [start]
            cds_ends = info[5] if len(info[5]) > 0 else [end]
            expanded_dict[tx] = [new_start, new_end, strand, chrom,
                                 cds_starts, cds_ends]
        transcript_dict = expanded_dict

    return transcript_dict
def create_branch_df(branch_dict, gff3, fa_dict, organism=None):
    """Turn a branch dictionary into a table of 5' site / branch pairs.

    Parameters
    ----------
    branch_dict : dict
        transcript -> list of entries where entry[0] is a 'chrom:position'
        string for the 5' splice site, entry[1] the branch positions and
        entry[2] the matching read depths.
    gff3 : str
        Annotation file.
    fa_dict : dict
        Passed to ``add_seq``.
    organism : str, default None
        Passed to the ``SP`` helpers and ``find_3p_site``.

    Returns
    -------
    pandas.DataFrame
        Branches that lie more than 5 and at most 1000 nt from the 5' site,
        have depth >= 5 and sit downstream of it (distance > 0), after
        ``add_seq`` and ``find_3p_site`` have been applied.
    """
    tx_dict = SP.build_transcript_dict(gff3, organism=organism)

    transcripts, chroms, fives, branches = [], [], [], []
    depths, distances, strands = [], [], []

    for tx, five_sites in branch_dict.items():
        for five_site in five_sites:
            coord_parts = five_site[0].split(':')
            chrom = coord_parts[0]
            pos = int(coord_parts[1])
            for n, branch in enumerate(five_site[1]):
                gap = abs(branch - pos)
                if not (gap > 5 and gap <= 1000 and five_site[2][n] >= 5):
                    continue
                chroms.append(chrom)
                fives.append(pos)
                transcripts.append(tx)
                branches.append(branch)
                depths.append(five_site[2][n])
                strand = tx_dict[tx][2]
                strands.append(strand)
                # Distance is positive when the branch is downstream
                if strand == '+':
                    distances.append(branch - pos)
                elif strand == '-':
                    distances.append(pos - branch)

    branch_df = pd.DataFrame(index=range(len(fives)))
    for column, values in (('transcript', transcripts),
                           ('chromosome', chroms),
                           ('5p splice site', fives),
                           ('branch site', branches),
                           ('depth', depths),
                           ('distance', distances),
                           ('strand', strands)):
        branch_df[column] = values

    branch_df = branch_df[branch_df['distance'] > 0]
    for column, source in (('genome coord', '5p splice site'),
                           ('branch coord', 'branch site')):
        branch_df[column] = branch_df['chromosome'].str.cat(
            branch_df[source].apply(int).apply(str), sep=':')

    branch_df = add_seq(branch_df, fa_dict)
    return find_3p_site(branch_df, gff3, organism=organism)
def make_transcript_df(gff3):
    '''Creates a dataframe with all annotated transcripts from the gff3 file

    Parameters
    ----------
    gff3 : str
            Your favorite annotation file.  If the name contains 'pombe'
            (case-insensitive) the annotation is read as pombe.

    Returns
    ------
    df : pandas.DataFrame
            One row per transcript, ordered by transcript name, with the
            columns 'start', 'end', 'strand', 'chromosome', 'CDS start'
            (smallest annotated CDS start) and 'CDS end' (largest annotated
            CDS end).  The CDS columns are NaN for transcripts without CDS
            coordinates.  An empty annotation gives an empty frame with the
            same columns.'''
    organism = 'pombe' if 'pombe' in gff3.lower() else None

    # Get transcript dictionary
    tx_dict = SP.build_transcript_dict(gff3, organism=organism)

    # Organize by transcript; names and records are split once so the
    # dictionary is not re-transposed for every column
    ordered = sorted(tx_dict.items(), key=lambda item: item[0])
    names = [name for name, _ in ordered]
    records = [info for _, info in ordered]

    columns = ['start', 'end', 'strand', 'chromosome']
    if not records:
        return pd.DataFrame(columns=columns + ['CDS start', 'CDS end'])

    # Convert to dataframe
    tx_df = pd.DataFrame(index=names, columns=columns)
    for n, col in enumerate(columns):
        tx_df.loc[:, col] = [info[n] for info in records]

    # Add CDS starts and ends
    tx_df.loc[:, 'CDS start'] = [
        min(info[4]) if len(info[4]) > 0 else np.nan for info in records]
    tx_df.loc[:, 'CDS end'] = [
        max(info[5]) if len(info[5]) > 0 else np.nan for info in records]

    return tx_df
def collect_intron_seq(gff3_file,
                       fasta_file,
                       ss_dict=None,
                       junction_bed=None,
                       gene_list=None,
                       peak_df=None,
                       organism=None):
    """Collect a 15-nt sequence just inside the 5' end of each intron.

    The introns come from the first source that is supplied, in this order:
    ``ss_dict``, ``junction_bed`` (via ``SP.build_junction_dict``),
    ``peak_df`` (rows whose 'type' does not contain 'prime', each turned into
    a 50-nt pseudo-intron starting at the peak), or the annotation itself.

    Parameters
    ----------
    gff3_file : str
        Annotation file handed to the ``SP`` helpers.
    fasta_file : dict or str
        Loaded fasta dict, path to a json file, or a path for
        ``make_fasta_dict``.
    ss_dict, junction_bed, gene_list, peak_df, organism
        See above; ``gene_list`` is only used when the annotation is the
        intron source, ``organism`` selects the isoform suffix
        ('.1' for 'pombe', otherwise 'T0').

    Returns
    -------
    dict
        '<transcript>-<chrom>:<coordinate>' -> sequence (reverse complemented
        on the minus strand).
    """
    transcript_dict = SP.build_transcript_dict(gff3_file, organism=organism)

    if type(fasta_file) == dict:
        fasta_dict = fasta_file
    elif fasta_file.endswith('json'):
        with open(fasta_file, 'r') as f:
            fasta_dict = json.load(f)
    else:
        fasta_dict = make_fasta_dict(fasta_file)

    if ss_dict is None:
        if junction_bed is not None:
            ss_dict = SP.build_junction_dict(junction_bed, gff3_file,
                                             transcript_dict,
                                             organism=organism)
        elif peak_df is not None:
            ss_dict = {}
            peak_df = peak_df[~peak_df['type'].str.contains('prime')]
            for ix, r in peak_df.iterrows():
                sites = ss_dict.setdefault(r['transcript'], [])
                if r['strand'] == '+':
                    sites.append((r['position'], r['position'] + 50))
                elif r['strand'] == '-':
                    sites.append((r['position'], r['position'] - 50))
        else:
            ss_dict, intron_flag = SP.list_splice_sites(
                gff3_file, gene_list=gene_list, organism=organism)
            ss_dict = SP.collapse_ss_dict(ss_dict)

    suffix = '.1' if organism == 'pombe' else 'T0'
    seq_dict = {}
    for transcript, introns in ss_dict.items():
        # Junction dictionaries are already keyed by isoform name
        if junction_bed is None:
            transcript = transcript + suffix
        strand = transcript_dict[transcript][2]
        chrom = transcript_dict[transcript][3]

        for intron in list(introns):
            five_site = intron[0]
            if strand == '+':
                seq = fasta_dict[chrom][five_site + 2:five_site + 17]
                label = transcript + '-' + chrom + ':' + str(five_site + 1)
                seq_dict[label] = seq
            elif strand == '-':
                seq = fasta_dict[chrom][five_site - 16:five_site - 1]
                label = transcript + '-' + chrom + ':' + str(five_site)
                seq_dict[label] = SP.reverse_complement(seq)
    return seq_dict
def get_junction_sequence(df, gff3_file, fasta_file):
    """Annotate junctions with gene, flanking sequence and intron rank.

    Parameters
    ----------
    df : pandas.DataFrame
        One junction per row.  'chr', 'strand', 'coord_1' and 'coord_2' hold
        strings (they are stripped before use) and a 'contained in' column
        must be present.  Rows are addressed by the labels 0..len(df)-1.
    gff3_file : str
        Annotation file passed to ``SP.build_transcript_dict``.
    fasta_file : str or dict
        A path (loaded with ``make_fasta_dict``) or a loaded
        chromosome -> sequence mapping.

    Returns
    -------
    pandas.DataFrame
        The rows whose 'contained in' is not empty, re-indexed, with the
        added columns:

        * 'Gene' - transcript on the same strand that spans the junction
          ("Unknown" when there is none).
        * 'sequence1', 'sequence2' - 8-nt windows at the two junction ends,
          in transcript orientation; 'intron sequence' - the full span.
          Only filled for rows with a gene.
        * 'intron' - 'Only', 'First', 'Last' or 'Middle', ranked per gene by
          numeric 'coord_1' in the direction of transcription.  Junctions
          whose 'coord_1' lies within 10 nt of the first/last one share its
          label.
    """
    df = df.sort_values('chr', axis=0)

    # transcript_dict[transcript] = [start, end, strand, chromosome, CDS start, CDS end]
    transcript_dict = SP.build_transcript_dict(gff3_file)

    # fasta_dict[chr] = sequence
    if type(fasta_file) is str:
        fasta_dict = make_fasta_dict(fasta_file)
    else:
        fasta_dict = fasta_file

    transcript_by_chr = {}
    for transcript, coords in transcript_dict.items():
        transcript_by_chr.setdefault(coords[3], []).append(transcript)

    df['Gene'] = "Unknown"
    df['intron'] = "Middle"
    df['sequence1'] = ''
    df['sequence2'] = ''
    df['intron sequence'] = 'No sequence here'

    for n in range(len(df)):
        coord1 = int(df['coord_1'][n].strip())
        coord2 = int(df['coord_2'][n].strip())
        chrom = df['chr'][n].strip()
        strand = df['strand'][n].strip()

        # A chromosome without annotated transcripts leaves the row "Unknown"
        matched = False
        for transcript in transcript_by_chr.get(chrom, []):
            start, stop, tx_strand = transcript_dict[transcript][:3]
            if strand == tx_strand and coord1 >= start and coord2 <= stop:
                df.loc[n, 'Gene'] = transcript
                matched = True
        if not matched:
            continue

        left = fasta_dict[chrom][(coord1 - 3):(coord1 + 5)]
        right = fasta_dict[chrom][(coord2 - 6):(coord2 + 2)]
        span = fasta_dict[chrom][(coord1 - 1):coord2]
        if strand == '+':
            sequence1, sequence2, all_seq = left, right, span
        elif strand == '-':
            # On the minus strand the right-hand end comes first
            sequence1 = SP.reverse_complement(right)
            sequence2 = SP.reverse_complement(left)
            all_seq = SP.reverse_complement(span)
        else:
            continue
        df.loc[n, 'sequence1'] = sequence1
        df.loc[n, 'sequence2'] = sequence2
        df.loc[n, 'intron sequence'] = all_seq

    # Rank the junctions of every gene that was actually assigned, on any
    # chromosome, once all rows have their gene.
    for transcript in set(df['Gene']):
        if transcript == "Unknown":
            continue
        # Compare coordinates as numbers; the column itself holds strings
        s = df.loc[df['Gene'] == transcript, 'coord_1'].apply(
            lambda c: int(c.strip()))
        min_idx, max_idx = s.idxmin(), s.idxmax()
        first, last = int(s.min()), int(s.max())

        if first == last:
            df.loc[min_idx, 'intron'] = 'Only'
            continue

        tx_strand = transcript_dict[transcript][2]
        if tx_strand == '+':
            low_label, high_label = 'First', 'Last'
        elif tx_strand == '-':
            low_label, high_label = 'Last', 'First'
        else:
            continue
        df.loc[min_idx, 'intron'] = low_label
        df.loc[max_idx, 'intron'] = high_label

        # Alternative junctions close to the first/last one share its label
        for index, coord in s.items():
            if df.loc[index, 'intron'] != 'Middle':
                continue
            if first - 10 <= coord < first + 10:
                df.loc[index, 'intron'] = low_label
            elif last - 10 <= coord < last + 10:
                df.loc[index, 'intron'] = high_label

    df = df[df['contained in'] != '']
    df = df.reset_index()
    return df
def backfill_splice_sites(df, gff3, fa_dict, PSSM, organism=None):
    """Build a table with one row per annotated intron of every transcript in df.

    For each transcript named in ``df['transcript']`` the annotated introns
    are walked in the direction of transcription.  An intron whose 5' site
    lies in ``[peak - 5, peak + 5)`` of a peak position in ``df`` takes its
    position, scores, type, alt-splicing flag and sequences from that ``df``
    row (looked up as ``'<chromosome>:<position>'``); every other intron is
    added with ``peak=False``, type ``'5prime'``, sequences cut from
    ``fa_dict`` and scores from ``SP.simple_score_junction``.

    Parameters
    ----------
    df : pandas.DataFrame
        Peak table.  The code reads the columns 'transcript', 'strand',
        'chromosome', 'position', '3p score', '5p score', 'alt splicing',
        'type', 'seq5' and 'seq3', and indexes rows by
        ``'<chromosome>:<int(position)>'``.
    gff3 : str
        Annotation file passed to the ``SP`` helpers.
    fa_dict : dict
        Chromosome -> sequence.
    PSSM
        Passed unchanged to ``SP.simple_score_junction``.
    organism : str, default None
        'pombe' selects the '.1' isoform suffix, anything else 'T0'.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``'<chromosome>:<position>'``.  Size columns
        ('transcript size', 'exon size (us)', 'exon size (ds)') are divided
        by 1000.
    """
    tx_dict = SP.build_transcript_dict(gff3, organism=organism)
    ss_dict, flag = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)

    # Parallel lists: every intron must append exactly one value to each list
    column_dict = {
        'position': [],
        'transcript': [],
        'alt splicing': [],
        'type': [],
        'strand': [],
        'introns in transcript': [],
        'intron size': [],
        'chromosome': [],
        '5p score': [],
        '3p score': [],
        'intron position': [],
        'exon size (us)': [],
        'exon size (ds)': [],
        'transcript size': [],
        'peak': [],
        'seq5': [],
        'seq3': []
    }
    new_index = []

    for tx in set(df['transcript']):
        strand = df[df['transcript'] == tx].iloc[0]['strand']
        splice_sites = ss_dict[tx]
        # Order the introns 5' -> 3' along the transcript
        if strand == '+':
            splice_sites = sorted(list(splice_sites), key=lambda x: x[0])
        elif strand == '-':
            splice_sites = sorted(list(splice_sites),
                                  key=lambda x: x[0],
                                  reverse=True)

        df_pos = None
        for n, (five_site, three_site) in enumerate(splice_sites):
            # Check if already in dataframe
            in_df = False
            for peak in df[df['transcript'] == tx]['position']:
                if five_site in range(int(peak) - 5, int(peak) + 5):
                    in_df = True
                    df_pos = peak
                    break

            column_dict['transcript'].append(tx)
            if organism == 'pombe':
                iso = tx + '.1'
            else:
                iso = tx + 'T0'

            column_dict['intron size'].append(abs(three_site - five_site))
            column_dict['introns in transcript'].append(len(splice_sites))
            column_dict['strand'].append(strand)
            chrom = df[df['transcript'] == tx].iloc[0]['chromosome']
            column_dict['chromosome'].append(chrom)
            column_dict['transcript size'].append(
                (tx_dict[iso][1] - tx_dict[iso][0]) / 1000.)

            # Check if first or last intron and add exon size
            if n == 0:
                column_dict['intron position'].append('First')
                if strand == '+':
                    column_dict['exon size (us)'].append(
                        (five_site - tx_dict[iso][0]) / 1000.)
                    if len(splice_sites) > 1:
                        ds_length = (splice_sites[n + 1][0] -
                                     three_site) / 1000.
                        # A negative length means the next intron starts
                        # before this one ends; try the one after it
                        try:
                            if ds_length < 0:
                                ds_length = (splice_sites[n + 2][0] -
                                             three_site) / 1000.
                        except IndexError:
                            ds_length = np.NaN
                    else:
                        # Single intron: downstream exon runs to transcript end
                        ds_length = (tx_dict[iso][1] - three_site) / 1000.

                elif strand == '-':
                    column_dict['exon size (us)'].append(
                        (tx_dict[iso][1] - five_site) / 1000.)
                    if len(splice_sites) > 1:
                        ds_length = (three_site -
                                     splice_sites[n + 1][0]) / 1000.
                        try:
                            if ds_length < 0:
                                ds_length = (three_site -
                                             splice_sites[n + 2][0]) / 1000.
                        except IndexError:
                            ds_length = np.NaN
                    else:
                        ds_length = (three_site - tx_dict[iso][0]) / 1000.

                column_dict['exon size (ds)'].append(ds_length)

            elif n == len(splice_sites) - 1:
                column_dict['intron position'].append('Last')
                column_dict['exon size (us)'].append(
                    (abs(five_site - splice_sites[n - 1][1]) - 1) / 1000.)

                if strand == '+':
                    column_dict['exon size (ds)'].append(
                        (tx_dict[iso][1] - three_site) / 1000.)
                elif strand == '-':
                    column_dict['exon size (ds)'].append(
                        (three_site - tx_dict[iso][0]) / 1000.)
            else:
                column_dict['intron position'].append('Middle')
                column_dict['exon size (us)'].append(
                    (abs(five_site - splice_sites[n - 1][1]) - 1) / 1000.)
                column_dict['exon size (ds)'].append(
                    abs(three_site - splice_sites[n + 1][0]) / 1000.)

            # Intron already has a peak: copy the per-peak values from df
            if in_df is True:
                peak_index = chrom + ':' + str(int(df_pos))
                new_index.append(peak_index)
                column_dict['position'].append(df_pos)
                column_dict['3p score'].append(df.loc[peak_index, '3p score'])
                column_dict['5p score'].append(df.loc[peak_index, '5p score'])
                column_dict['alt splicing'].append(df.loc[peak_index,
                                                          'alt splicing'])
                column_dict['type'].append(df.loc[peak_index, 'type'])
                column_dict['peak'].append(True)
                column_dict['seq5'].append(df.loc[peak_index, 'seq5'])
                column_dict['seq3'].append(df.loc[peak_index, 'seq3'])

            # Intron without a peak: derive position, sequences and scores
            if in_df is False:
                column_dict['alt splicing'].append(False)
                column_dict['type'].append('5prime')
                column_dict['peak'].append(False)

                # Get position, index and sequence for scoring and position code
                if strand == '+':
                    column_dict['position'].append(five_site + 1)
                    new_index.append(chrom + ':' + str(five_site + 1))
                    sequence1 = fa_dict[chrom][(five_site - 1):(five_site + 7)]
                    sequence2 = fa_dict[chrom][(three_site - 5):(three_site +
                                                                 3)]
                elif strand == '-':
                    column_dict['position'].append(five_site - 1)
                    new_index.append(chrom + ':' + str(five_site - 1))
                    sequence1 = fa_dict[chrom][(five_site - 6):(five_site + 2)]
                    sequence1 = SP.reverse_complement(sequence1)
                    sequence2 = fa_dict[chrom][(three_site - 2):(three_site +
                                                                 6)]
                    sequence2 = SP.reverse_complement(sequence2)

                column_dict['seq5'].append(sequence1)
                column_dict['seq3'].append(sequence2)

                # Score sequences
                score_5, score_3 = SP.simple_score_junction(
                    sequence1, sequence2, PSSM)
                column_dict['3p score'].append(score_3)
                column_dict['5p score'].append(score_5)

    # Create new dataframe from column dictionary
    new_df = pd.DataFrame(columns=column_dict.keys(), index=new_index)
    for column, data in column_dict.iteritems():
        new_df[column] = data

    return new_df
def count_reads_in_transcript(bam_files, df, gff3, organism=None):
    """Count reads per transcript and per intron for each bam file.

    Only reads on the strand opposite to the transcript are counted
    (reverse reads for '+' transcripts, forward reads for '-' transcripts).
    A read is assigned to an intron when its ``reference_end`` ('+') or
    ``reference_start`` ('-') falls inside the intron interval.

    Parameters
    ----------
    bam_files : iterable of str
        Paths opened with ``pysam.Samfile``.
    df : pandas.DataFrame
        One intron per row with the columns 'transcript', 'position' and
        'intron size'.
    gff3 : str
        Annotation file passed to ``SP.build_transcript_dict``.
    organism : str, default None
        'pombe' selects the '.1' isoform suffix and converts chr1-3 to
        I-III before fetching; anything else uses the 'T0' suffix.

    Returns
    -------
    dict
        bam file -> DataFrame indexed like ``df`` with the columns 'total'
        (reads per kb of transcript) and 'intron' (intron read density
        divided by transcript read density).  Both are NaN, and the row
        label is printed, when a denominator is zero.
    """
    tx_dict = SP.build_transcript_dict(gff3, organism=organism)
    tx_suffix = '.1' if organism == 'pombe' else 'T0'
    lat_rom = {'chr1': 'I', 'chr2': 'II', 'chr3': 'III'}

    all_reads = {}
    for bam in bam_files:
        if bam in all_reads:
            continue  # the same file listed twice is counted once
        counts = pd.DataFrame(index=df.index, columns=['total', 'intron'])
        all_reads[bam] = counts

        reader = pysam.Samfile(bam)
        try:
            for tx in set(df['transcript']):
                tx_df = df[df['transcript'] == tx]
                start, end, strand, CDS_start, CDS_end, exons, chrom = \
                    SP.tx_info(tx + tx_suffix, tx_dict)
                if organism == 'pombe':
                    chrom = lat_rom[chrom]

                # Half-open [start, end) interval and read tally per intron
                intron_bounds = {}
                intron_counts = {}
                for ix, r in tx_df.iterrows():
                    if strand == '+':
                        intron_start = int(r['position'])
                        intron_end = int(r['position'] + r['intron size']) + 1
                    elif strand == '-':
                        intron_start = int(r['position'] - r['intron size'])
                        intron_end = int(r['position']) + 1
                    intron_bounds[ix] = (intron_start, intron_end)
                    intron_counts[ix] = 0

                reads = 0
                for read in reader.fetch(chrom, start, end):
                    if read.is_reverse and strand == '+':
                        site = read.reference_end
                    elif not read.is_reverse and strand == '-':
                        site = read.reference_start
                    else:
                        continue
                    reads += 1
                    if site is None:
                        continue
                    # Bounds comparison instead of scanning a range() list
                    for ix, (lower, upper) in intron_bounds.items():
                        if lower <= site < upper:
                            intron_counts[ix] += 1

                for ix in intron_bounds:
                    try:
                        tx_density = reads / float(end - start)
                        intron_density = (intron_counts[ix] /
                                          float(tx_df.loc[ix, 'intron size']))
                        total = tx_density * 1000
                        intron = intron_density / tx_density
                    except ZeroDivisionError:
                        total = np.nan
                        intron = np.nan
                        print(ix)
                    counts.loc[ix, 'total'] = total
                    counts.loc[ix, 'intron'] = intron
        finally:
            # Release the file handle even if counting fails part-way
            reader.close()

    return all_reads
def igv_plots_general(bam_list,
                      gene_list,
                      organism,
                      colors=None,
                      names=None,
                      save_dir=None,
                      unstranded=False,
                      end_only=False,
                      same_yaxis=False,
                      specific_range=None,
                      transcript_direction=True,
                      log_scale=False,
                      rpm=True,
                      PE=False,
                      plot_junctions=False):
    '''Plot read coverage tracks (one per bam file) above a gene diagram,
    one figure per entry of gene_list.

    Parameters
    ----------
    bam_list : list, bam files in order of plotting (top to bottom)
    gene_list : list of transcripts to plot (should be genes not transcript isoforms)
            if dataframe passed instead of list, will plot introns (must have intron information in datafame)
            if a dict is passed, it is used as the transcript dictionary and its keys are plotted
    organism : str, pombe or crypto
    colors : list, default `None`
            list of colors to use, same length as bam_list, check matplotlib documentation for valid color names
    names : list, default `None`
            list of sample names to use instead of bam file names. Same length as bam_files
            (not read anywhere in this function)
    save_dir : str, default `None`
            directory to save eps files. If None, does not save files.
            The file name is built as save_dir + tx + '.eps' with no path separator added.
    unstranded : bool, default `False`
            Use True for ChIP or DNA sequencing data (or unstranded RNAseq)
    end_only : bool or list, default `False`
            Whether to plot only the ends of reads. If different for each bam, make a list of bools same length as bam_list
    same_yaxis : bool, default `False`
            Whether all samples should be plotted on the same axis after normalizing to total number of aligned reads
    specific_range : str, default `None`
            Options: ('end', window)
                     ('start', window)
                     ([coordinate], window)
    transcript_direction : bool, default `True`
            If True, will plot in the direction of transcription, not in the direction of the DNA
    log_scale : bool or list, default `False`
            When True for a bam, np.log2 is applied to its read series before plotting
    rpm : bool, default `True`
            When True, each series is divided by the count from
            `samtools view -F 0x04 -c` in millions; otherwise by 1
    PE : bool, default `False`
            When not False, SP.generate_read_series_PE is used instead of
            SP.generate_read_series_B for full-read coverage
    plot_junctions : bool, default `False`
            When True, one extra axis per bam shows what get_junctions returns for the region
    '''

    # Get all organism information (annotation etc.)
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)
    tx_dict = SP.build_transcript_dict(gff3, organism=organism)
    # Lookup used both to swap chromosome naming schemes and to flip the strand.
    # 'chr1' and 'chr2' appear twice in the literal with the same value.
    fix_info = {
        'I': 'chr1',
        'II': 'chr2',
        'III': 'chr3',
        'chr1': 'I',
        'chr2': 'II',
        'chr4': 'IV',
        'chr5': 'V',
        'chr6': 'VI',
        'chr7': 'VII',
        'chr8': 'VIII',
        'chr9': 'IX',
        'chr10': 'X',
        'chr11': 'XI',
        'chr12': 'XII',
        'chr13': 'XIII',
        'chr14': 'XIV',
        'chr15': 'XV',
        'chr16': 'XVI',
        '-': '+',
        '+': '-',
        'chr1': 'I',
        'chr2': 'II',
        'chr3': 'III'
    }
    if organism == 'pombe':
        tx_suffix = '.1'
    else:
        tx_suffix = 'T0'

    # Set up range parameters if specific range is indicated
    # NOTE(review): this loop iterates gene_list before the dict/dataframe
    # handling further down, so it appears to expect a plain list of gene
    # names when specific_range is given - confirm.
    if specific_range is not None:
        window = int(specific_range[1])
        new_tx_dict = {}
        for gene in gene_list:
            info = tx_dict[gene + tx_suffix]
            if specific_range[0] == 'end':
                if info[2] == '+':
                    start = info[1] - window
                    end = info[1] + window
                else:
                    start = info[0] - window
                    end = info[0] + window
            elif specific_range[0] == 'start':
                if info[2] == '-':
                    start = info[1] - window
                    end = info[1] + window
                else:
                    start = info[0] - window
                    end = info[0] + window
            else:
                start = int(specific_range[0]) - window
                end = int(specific_range[0]) + window
            new_tx_dict[gene + tx_suffix] = [start, end, info[2], info[3]]
    else:
        new_tx_dict = tx_dict

    # Open bam files and count reads if rpm is True
    open_bams = {}
    total_list = []
    for bam in bam_list:
        open_bams[bam] = pysam.Samfile(bam)
        if rpm is True:
            # -F 0x04 excludes unmapped reads, -c prints only the count
            total = check_output(['samtools', 'view', '-F 0x04', '-c',
                                  bam]).strip()
            total = float(total) / 1000000.
            total_list.append(total)
        else:
            total_list.append(1.)

    # Expand optional arguments to lists if necessary
    colors = list_from_arg(colors, len(bam_list))
    end_only = list_from_arg(end_only, len(bam_list))
    log_scale = list_from_arg(log_scale, len(bam_list))
    unstranded = list_from_arg(unstranded, len(bam_list))

    # Get gene_list from dataframe if gene_list is not a list
    df = None
    if type(gene_list) == dict:
        new_tx_dict = gene_list
        gene_list = gene_list.keys()
    elif type(gene_list) != list:
        df = gene_list
        gene_list = df.index

    for tx in gene_list:
        # One axis per bam, one more per bam for junctions, plus the gene diagram
        num_ax = len(bam_list) + 1
        if plot_junctions is True:
            num_ax += len(bam_list)
        fig, ax = plt.subplots(num_ax, figsize=(10, num_ax), sharex=True)
        fig.subplots_adjust(hspace=0)

        # Get transcript info from transcript_dictionary
        if df is None:
            try:
                info = new_tx_dict[tx + tx_suffix]
            except KeyError:
                info = new_tx_dict[tx]
            chrom = info[3]
            start = info[0]
            end = info[1]
            strand = info[2]

        # If dataframe was passed, get plotting information from dataframe instead
        else:
            if isinstance(df.columns, pd.core.index.MultiIndex):
                new_columns = [x[1] for x in df.columns if x[0] == 'Peaks']
                df = df[[x for x in df.columns if x[0] == 'Peaks']]
                df.columns = new_columns
            strand = df.loc[tx, 'strand']
            chrom = df.loc[tx, 'chromosome']
            # Intron plus 100 nt of flank on either side
            if strand == '+':
                start = df.loc[tx, 'position'] - 100
                end = df.loc[tx, 'position'] + df.loc[tx, 'intron size'] + 100
            elif strand == '-':
                start = df.loc[tx,
                               'position'] - df.loc[tx, 'intron size'] - 100
                end = df.loc[tx, 'position'] + 100
            start = int(start)
            end = int(end)
            tx = df.loc[tx, 'transcript']

        # Generate read series for each transcript
        max_y = 0
        junc_ymax = 0
        for n, bam in enumerate(bam_list):
            # Retry with the other chromosome naming scheme if fetch rejects the name
            try:
                bam_iter = open_bams[bam].fetch(chrom, start, end)
            except ValueError:
                chrom = fix_info[chrom]
                bam_iter = open_bams[bam].fetch(chrom, start, end)
            if end_only[n] is not False:
                s = SP.generate_read_series_A(bam_iter, chrom, start, end,
                                              strand)
                linewidth = 2
            else:
                if PE is False:
                    s = SP.generate_read_series_B(bam_iter, chrom, start, end,
                                                  strand)
                else:
                    s = SP.generate_read_series_PE(bam_iter, chrom, start,
                                                   end, strand)
                linewidth = 1

            # Get reads from otherstrand if the library type is unstranded
            if unstranded[n] is True:
                bam_iter = open_bams[bam].fetch(chrom, start, end)
                if end_only[n] is not False:
                    s2 = SP.generate_read_series_A(bam_iter, chrom, start, end,
                                                   fix_info[strand])
                    linewidth = 2
                else:
                    if PE is False:
                        s2 = SP.generate_read_series_B(bam_iter, chrom, start,
                                                       end, fix_info[strand])
                    else:
                        s2 = SP.generate_read_series_PE(
                            bam_iter, chrom, start, end, fix_info[strand])
                    linewidth = 1
                s = s.add(s2)

            # Normalize to rpm (will just divide by 1 if rpm is False)
            s = s.divide(total_list[n])
            if log_scale[n] is True:
                s = s.apply(np.log2)

            # Plot!
            ax[n].bar(s.index,
                      s,
                      linewidth=linewidth,
                      color=colors[n],
                      edgecolor=colors[n],
                      zorder=2)
            ax[n].tick_params(axis='both', which='major', labelsize=14)

            max_y = max([max_y, max(s)])

            if plot_junctions is True:
                # Junction axes follow the coverage axes in the same bam order
                m = n + len(bam_list)
                intron_dict = get_junctions(open_bams[bam], chrom, start, end,
                                            strand)
                ax[m].plot((start, end), (0, 0), '-', c='k')
                for coords, heights in intron_dict.iteritems():
                    ax[m].plot(coords,
                               heights,
                               '-',
                               linewidth=2,
                               color=colors[n])
                    ax[m].fill_between(coords,
                                       0,
                                       heights,
                                       facecolor=colors[n],
                                       interpolate=True,
                                       alpha=0.5)
                # NOTE(review): placed after the loop over intron_dict; at this
                # level an empty intron_dict makes the [1] index fail - confirm
                # the intended nesting.
                if same_yaxis is True:
                    junc_ymax = max(
                        [junc_ymax,
                         max(zip(*intron_dict.values())[1])])

        # Add diagram of gene below traces
        if tx in tx_dict:
            strand = gene_patches(tx, tx_dict, ax[-1])
            ax[-1].set_xlim(start, end)
        else:
            try:
                # Use the text before the first space and drop a trailing
                # two-character isoform suffix ('T0'-style or '.1'-style)
                new_tx = tx.split(' ')[0]
                if new_tx[-2] == 'T' or new_tx[-2] == '.':
                    new_tx = new_tx[:-2]
                strand = gene_patches(new_tx, tx_dict, ax[-1])
                ax[-1].set_xlim(start, end)
            except KeyError:
                print "Transcript unknown"

        # Flip minus strand transcripts if indicated
        if transcript_direction is True:
            if strand == '-':
                ax[-1].invert_xaxis()

        # Set x and y limits
        for n in range(len(bam_list)):
            ax[n].set_xlim(start, end)
            if same_yaxis is True:
                ax[n].set_ylim(0, max_y + 0.1 * max_y)
                if plot_junctions is True:
                    ax[n + len(bam_list)].set_ylim(0,
                                                   junc_ymax + 0.1 * junc_ymax)
            # NOTE(review): this inversion does not check transcript_direction,
            # unlike the gene-diagram axis above - confirm that is intended.
            if strand == '-':
                ax[n].invert_xaxis()

        ax[0].set_ylabel('RPM', fontsize=16)
        ax[0].set_title(tx, fontsize=16)
        #ax[0].get_xaxis().set_ticks([])
        plt.show()

        # Save if indicated
        if save_dir is not None:
            if not os.path.exists(save_dir):
                os.makedirs(save_dir)
            fig.savefig(save_dir + tx + '.eps', format='eps')

        plt.clf()
def list_branch_points(sorted_bam_file, gff3_file, fasta_dict, organism=None):
    '''Collect branch positions from reads whose names encode their intron.

    Parameters
    ----------
    sorted_bam_file : str
            BAM file read with HTSeq.BAM_Reader. The output bed file name is
            this path truncated at '_sorted.bam'.
    gff3_file : str
            Annotation passed to SP.build_transcript_dict.
    fasta_dict : dict or str
            If a str, it is opened and loaded as json. The sequences are not
            used further in this function.
    organism : str, default `None`
            Passed through to SP.build_transcript_dict.

    Returns
    -------
    new_branch_dict : dict
            {transcript: [[splice_site, [positions], [counts]], ...]}

    Side effects
    ------------
    Prints read counts and writes a bed file with one line per branch
    position that has at least 5 reads and lies more than 5 but at most
    2000 nt from the splice site.
    '''
    transcript_dict = SP.build_transcript_dict(gff3_file, organism=organism)
    if type(fasta_dict) == str:
        with open(fasta_dict, 'r') as handle:
            fasta_dict = json.load(handle)

    # Pass 1: bucket read ends by transcript and splice site
    branch_dict = {}
    read_counter = 0
    br_counter = 0
    for aln in HTSeq.BAM_Reader(sorted_bam_file):
        read_counter += 1
        read_name = aln.read.name
        transcript = read_name.split('-chr')[0].split(':')[-1]
        name_parts = read_name.split('-')
        splice_site = name_parts[-1]
        if len(splice_site) < 3:
            # Last field is too short to be a coordinate, use the one before
            splice_site = name_parts[-2]
        if not splice_site.startswith('chr'):
            continue

        site_reads = branch_dict.setdefault(transcript, {}).setdefault(splice_site, [])
        if aln.iv is None:
            continue
        strand = aln.iv.strand
        read_end = aln.iv.start if strand == '-' else aln.iv.end
        # Only keep reads on the same strand as the transcript
        if strand == transcript_dict[transcript][2]:
            site_reads.append(read_end)
            br_counter += 1

    print("Reads analyzed: " + str(read_counter))
    print("Reads assigned as branches: " + str(br_counter))

    # Pass 2: merge read ends that fall within 2 nt of a recorded position
    new_branch_dict = {}
    for transcript, introns in branch_dict.items():
        collapsed = []
        new_branch_dict[transcript] = collapsed
        for intron, branches in introns.items():
            positions = []
            counts = []
            for branch in branches:
                matched = False
                # Scanning an empty list finds nothing, so no length guard is needed
                for pos in range(branch - 2, branch + 3):
                    if pos in positions:
                        matched = True
                        counts[positions.index(pos)] += 1
                if not matched:
                    positions.append(branch)
                    counts.append(1)
            if positions:
                collapsed.append([intron, positions, counts])

    # Pass 3: write the well supported branches as a junction-style bed track
    bed_name = '{0}.bed'.format(sorted_bam_file.split('_sorted.bam')[0])
    with open(bed_name, 'w') as fout:
        fout.write('track name=junctions description="TopHat junctions"\n')
        for transcript, introns in new_branch_dict.items():
            strand = transcript_dict[transcript][2]
            for intron in introns:
                site, positions, counts = intron
                chrom = site.split(':')[0]
                start = int(site.split(':')[1])
                for n, (end, value) in enumerate(zip(positions, counts)):
                    span = abs(end - start)
                    if span > 2000 or span <= 5 or value < 5:
                        continue
                    #[seqname] [start] [end] [id] [score] [strand] [thickStart] [thickEnd] [r,g,b][block_count] [block_sizes] [block_locations]
                    fields = [
                        chrom,
                        str(start - 1),
                        str(end + 1),
                        site + '-' + str(n),
                        str(value),
                        strand,
                        str(start - 1),
                        str(end + 1),
                        '75,196,213',
                        '2',
                        '1,1',
                        '0,' + str(span + 30),
                        '\n',
                    ]
                    fout.write('\t'.join(fields))
    return new_branch_dict
def igv_plots_general(bam_list, gene_list, organism, colors=None, names=None, save_dir=None,
                      unstranded=False, end_only=False, same_yaxis=False, specific_range=None, transcript_direction=True,
                      log_scale=False, rpm=True, PE=False, plot_junctions=False):
    '''Draw one genome-browser style figure per transcript: a read coverage
    track for each bam file stacked above a diagram of the gene.

    Parameters
    ----------
    bam_list : list, bam files in order of plotting (top to bottom)
    gene_list : list, dict or dataframe
            list : transcripts to plot (should be genes not transcript isoforms)
            dict : used directly in place of the transcript dictionary; values are
                   read as [start, end, strand, chromosome]
            dataframe : will plot introns; must have 'strand', 'chromosome', 'position',
                   'intron size' and 'transcript' columns (if the columns are a
                   MultiIndex only the 'Peaks' level is used)
    organism : str, pombe or crypto
    colors : list, default `None`
            list of colors to use, same length as bam_list, check matplotlib documentation for valid color names
    names : list, default `None`
            list of sample names to use instead of bam file names. Same length as bam_files
            (not referenced anywhere in the body of this function)
    save_dir : str, default `None`
            directory to save eps files. If None, does not save files.
            The file name is save_dir+transcript+'.eps', so include the trailing separator.
    unstranded : bool or list, default `False`
            Use True for ChIP or DNA sequencing data (or unstranded RNAseq)
    end_only : bool or list, default `False`
            Whether to plot only the ends of reads. If different for each bam, make a list of bools same length as bam_list
    same_yaxis : bool, default `False`
            Whether all samples should be plotted on the same axis after normalizing to total number of aligned reads
    specific_range : tuple, default `None`
            Options: ('end', window)
                     ('start', window)
                     ([coordinate], window)
    transcript_direction : bool, default `True`
            If True, will plot in the direction of transcription, not in the direction of the DNA
    log_scale : bool or list, default `False`
            If True, np.log2 is applied to the normalized read series before plotting
    rpm : bool, default `True`
            If True, each series is divided by the number of reads (in millions)
            counted with `samtools view -F 0x04 -c`
    PE : bool, default `False`
            If True, whole-read coverage is built with SP.generate_read_series_PE
            instead of SP.generate_read_series_B
    plot_junctions : bool, default `False`
            If True, adds one extra axis per bam showing what get_junctions returns

    Output
    ------
    Shows each figure with plt.show() and, if save_dir is given, saves it as eps.
    '''

    # Get all organism information (annotation etc.)
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)
    tx_dict = SP.build_transcript_dict(gff3, organism=organism)
    # Lookup used to retry with the other chromosome naming style and to flip strands.
    # Roman numerals above III only appear as values, so only chrN -> Roman is covered for those.
    fix_info = {'I':'chr1','II':'chr2','III':'chr3','chr1':'I','chr2':'II','chr4':'IV','chr5':'V','chr6':'VI',
                'chr7':'VII','chr8':'VIII','chr9':'IX','chr10':'X','chr11':'XI','chr12':'XII','chr13':'XIII',
                'chr14':'XIV','chr15':'XV','chr16':'XVI','-':'+','+':'-','chr1':'I','chr2':'II','chr3':'III'}
    if organism == 'pombe':
        tx_suffix = '.1'
    else:
        tx_suffix = 'T0'

    # Set up range parameters if specific range is indicated
    if specific_range is not None:
        window = int(specific_range[1])
        new_tx_dict = {}
        for gene in gene_list:
            info = tx_dict[gene+tx_suffix]
            # info is [start, end, strand, chromosome, ...]; the transcript 'end'
            # is info[1] on the plus strand and info[0] on the minus strand
            if specific_range[0] == 'end':
                if info[2] == '+':
                    start = info[1]-window
                    end = info[1]+window
                else:
                    start = info[0]-window
                    end = info[0]+window
            elif specific_range[0] == 'start':
                if info[2] == '-':
                    start = info[1]-window
                    end = info[1]+window
                else:
                    start = info[0]-window
                    end = info[0]+window
            else:
                # Anything else is treated as an explicit coordinate
                start = int(specific_range[0])-window
                end = int(specific_range[0])+window
            new_tx_dict[gene+tx_suffix] = [start, end, info[2], info[3]]
    else:
        new_tx_dict = tx_dict

    # Open bam files and count reads if rpm is True
    open_bams = {}
    total_list = []
    for bam in bam_list:
        open_bams[bam] = pysam.Samfile(bam)
        if rpm is True:
            total = check_output(['samtools','view','-F 0x04','-c',bam]).strip()
            total = float(total)/1000000.
            total_list.append(total)
        else:
            total_list.append(1.)

    # Expand optional arguments to lists if necessary
    colors = list_from_arg(colors, len(bam_list))
    end_only = list_from_arg(end_only, len(bam_list))
    log_scale = list_from_arg(log_scale, len(bam_list))
    unstranded = list_from_arg(unstranded, len(bam_list))

    # Get gene_list from dataframe if gene_list is not a list
    df = None
    if type(gene_list) == dict:
        new_tx_dict = gene_list
        gene_list = gene_list.keys()
    elif type(gene_list) != list:
        df = gene_list
        gene_list = df.index

    for tx in gene_list:
        # One axis per bam, one for the gene diagram, plus one per bam for junctions
        num_ax = len(bam_list)+1
        if plot_junctions is True:
            num_ax += len(bam_list)
        fig, ax = plt.subplots(num_ax, figsize=(10,num_ax), sharex=True)
        fig.subplots_adjust(hspace=0)

        # Get transcript info from transcript_dictionary
        if df is None:
            try:
                info = new_tx_dict[tx+tx_suffix]
            except KeyError:
                # Name was already an isoform name (or a key of a user supplied dict)
                info = new_tx_dict[tx]
            chrom = info[3]
            start = info[0]
            end = info[1]
            strand = info[2]

        # If dataframe was passed, get plotting information from dataframe instead
        else:
            if isinstance(df.columns, pd.core.index.MultiIndex):
                new_columns = [x[1] for x in df.columns if x[0] == 'Peaks']
                df = df[[x for x in df.columns if x[0] == 'Peaks']]
                df.columns = new_columns
            strand = df.loc[tx,'strand']
            chrom = df.loc[tx,'chromosome']
            # Plot the intron plus 100 nt on either side
            if strand == '+':
                start = df.loc[tx,'position']-100
                end = df.loc[tx,'position'] + df.loc[tx,'intron size']+100
            elif strand == '-':
                start = df.loc[tx,'position']-df.loc[tx,'intron size']-100
                end = df.loc[tx,'position']+100
            start = int(start)
            end = int(end)
            # From here on tx is the transcript name rather than the dataframe index
            tx = df.loc[tx,'transcript']

        # Generate read series for each transcript
        max_y = 0
        junc_ymax = 0
        for n, bam in enumerate(bam_list):
            try:
                bam_iter = open_bams[bam].fetch(chrom, start, end)
            except ValueError:
                # Retry with the other chromosome naming style
                chrom = fix_info[chrom]
                bam_iter = open_bams[bam].fetch(chrom, start, end)
            if end_only[n] is not False:
                s = SP.generate_read_series_A(bam_iter, chrom, start, end, strand)
                linewidth = 2
            else:
                if PE is False:
                    s = SP.generate_read_series_B(bam_iter, chrom, start, end, strand)
                else:
                    s = SP.generate_read_series_PE(bam_iter, chrom, start, end, strand)
                linewidth = 1

            # Get reads from otherstrand if the library type is unstranded
            if unstranded[n] is True:
                # A new fetch is made here rather than reusing bam_iter
                bam_iter = open_bams[bam].fetch(chrom, start, end)
                if end_only[n] is not False:
                    s2 = SP.generate_read_series_A(bam_iter, chrom, start, end, fix_info[strand])
                    linewidth = 2
                else:
                    if PE is False:
                        s2 = SP.generate_read_series_B(bam_iter, chrom, start, end, fix_info[strand])
                    else:
                        s2 = SP.generate_read_series_PE(bam_iter, chrom, start, end, fix_info[strand])
                    linewidth = 1
                s = s.add(s2)

            # Normalize to rpm (will just divide by 1 if rpm is False)
            s = s.divide(total_list[n])
            if log_scale[n] is True:
                s = s.apply(np.log2)

            # Plot!
            ax[n].bar(s.index, s, linewidth=linewidth, color=colors[n], edgecolor=colors[n], zorder=2)
            ax[n].tick_params(axis='both', which='major', labelsize=14)
            max_y = max([max_y,max(s)])

            if plot_junctions is True:
                # Junction axes sit below the coverage axes, in the same bam order
                m = n+len(bam_list)
                intron_dict = get_junctions(open_bams[bam], chrom, start, end, strand)
                ax[m].plot((start, end),(0,0),'-',c='k')
                for coords, heights in intron_dict.iteritems():
                    ax[m].plot(coords, heights, '-', linewidth=2, color=colors[n])
                    ax[m].fill_between(coords, 0, heights, facecolor=colors[n], interpolate=True, alpha=0.5)
                if same_yaxis is True:
                    # NOTE(review): indexing the result of zip() only works on Python 2
                    junc_ymax = max([junc_ymax, max(zip(*intron_dict.values())[1])])

        # Add diagram of gene below traces
        if tx in tx_dict:
            strand = gene_patches(tx, tx_dict, ax[-1])
            ax[-1].set_xlim(start, end)
        else:
            try:
                # Strip anything after a space and a trailing isoform suffix before retrying
                new_tx = tx.split(' ')[0]
                if new_tx[-2] == 'T' or new_tx[-2] == '.':
                    new_tx = new_tx[:-2]
                strand = gene_patches(new_tx, tx_dict, ax[-1])
                ax[-1].set_xlim(start, end)
            except KeyError:
                print "Transcript unknown"

        # Flip minus strand transcripts if indicated
        if transcript_direction is True:
            if strand == '-':
                ax[-1].invert_xaxis()

        # Set x and y limits
        for n in range(len(bam_list)):
            ax[n].set_xlim(start, end)
            if same_yaxis is True:
                # Leave 10% headroom above the tallest trace
                ax[n].set_ylim(0,max_y+0.1*max_y)
                if plot_junctions is True:
                    ax[n+len(bam_list)].set_ylim(0,junc_ymax+0.1*junc_ymax)
            # NOTE(review): this inversion does not check transcript_direction and the
            # axes are created with sharex=True - confirm the intended orientation
            if strand == '-':
                ax[n].invert_xaxis()

        ax[0].set_ylabel('RPM', fontsize=16)
        ax[0].set_title(tx, fontsize=16)
        #ax[0].get_xaxis().set_ticks([])
        plt.show()

        # Save if indicated
        if save_dir is not None:
            if not os.path.exists(save_dir):
                os.makedirs(save_dir)
            fig.savefig(save_dir+tx+'.eps', format='eps')
        plt.clf()
def backfill_splice_sites(df, gff3, fa_dict, PSSM, organism=None):
    '''Build a dataframe with one row per annotated intron for every
    transcript that appears in df, reusing the values already in df for
    introns that have a peak and computing them for introns that do not.

    Parameters
    ----------
    df : dataframe
            Must have 'transcript', 'strand', 'chromosome', 'position', '3p score',
            '5p score', 'alt splicing', 'type', 'seq5' and 'seq3' columns and be
            indexed by chromosome+':'+position.
    gff3 : str
            Annotation file passed to SP.build_transcript_dict and SP.list_splice_sites
    fa_dict : dict
            Chromosome sequences keyed by chromosome name
    PSSM : passed through to SP.simple_score_junction
    organism : str, default `None`
            'pombe' uses the '.1' isoform suffix, anything else uses 'T0'

    Returns
    -------
    new_df : dataframe indexed by chromosome+':'+position, with a 'peak' column
            that is True for introns found in df and False for backfilled ones.
    '''
    tx_dict = SP.build_transcript_dict(gff3, organism=organism)
    ss_dict, flag = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)

    # Parallel lists: every intron appends exactly one value to each column
    column_dict = {'position':[],'transcript':[],'alt splicing':[],'type':[],'strand':[], 'introns in transcript':[],
                   'intron size':[],'chromosome':[], '5p score':[], '3p score':[], 'intron position':[], 'exon size (us)':[],
                   'exon size (ds)':[],'transcript size':[], 'peak':[], 'seq5':[],'seq3':[]}
    new_index = []

    for tx in set(df['transcript']):
        strand = df[df['transcript'] == tx].iloc[0]['strand']
        splice_sites = ss_dict[tx]
        # Order introns in the direction of transcription
        if strand == '+':
            splice_sites = sorted(list(splice_sites), key=lambda x:x[0])
        elif strand == '-':
            splice_sites = sorted(list(splice_sites), key=lambda x:x[0], reverse=True)

        df_pos = None
        for n, (five_site, three_site) in enumerate(splice_sites):
            # Check if already in dataframe
            # (a peak matches if the 5' site is within peak-5 to peak+4)
            in_df = False
            for peak in df[df['transcript'] == tx]['position']:
                if five_site in range(int(peak)-5,int(peak)+5):
                    in_df = True
                    df_pos = peak
                    break

            column_dict['transcript'].append(tx)
            if organism == 'pombe':
                iso = tx+'.1'
            else:
                iso = tx+'T0'

            column_dict['intron size'].append(abs(three_site-five_site))
            column_dict['introns in transcript'].append(len(splice_sites))
            column_dict['strand'].append(strand)
            chrom = df[df['transcript'] == tx].iloc[0]['chromosome']
            column_dict['chromosome'].append(chrom)
            # Sizes are stored in kb
            column_dict['transcript size'].append((tx_dict[iso][1]-tx_dict[iso][0])/1000.)

            # Check if first or last intron and add exon size
            # (a transcript with a single intron is labelled 'First')
            if n == 0:
                column_dict['intron position'].append('First')
                if strand == '+':
                    column_dict['exon size (us)'].append((five_site-tx_dict[iso][0])/1000.)
                    if len(splice_sites) > 1:
                        ds_length = (splice_sites[n+1][0] - three_site)/1000.
                        try:
                            # A negative length presumably means the next listed intron
                            # overlaps this one, so the one after it is tried - TODO confirm
                            if ds_length < 0:
                                ds_length = (splice_sites[n+2][0] - three_site)/1000.
                        except IndexError:
                            ds_length = np.NaN
                    else:
                        ds_length = (tx_dict[iso][1] - three_site)/1000.

                elif strand == '-':
                    column_dict['exon size (us)'].append((tx_dict[iso][1]-five_site)/1000.)
                    if len(splice_sites) > 1:
                        ds_length = (three_site - splice_sites[n+1][0])/1000.
                        try:
                            if ds_length < 0:
                                ds_length = (three_site - splice_sites[n+2][0])/1000.
                        except IndexError:
                            ds_length = np.NaN
                    else:
                        ds_length = (three_site - tx_dict[iso][0])/1000.
                column_dict['exon size (ds)'].append(ds_length)

            elif n == len(splice_sites)-1:
                column_dict['intron position'].append('Last')
                column_dict['exon size (us)'].append((abs(five_site-splice_sites[n-1][1])-1)/1000.)

                # Downstream exon runs to the end of the transcript
                if strand == '+':
                    column_dict['exon size (ds)'].append((tx_dict[iso][1]-three_site)/1000.)
                elif strand == '-':
                    column_dict['exon size (ds)'].append((three_site - tx_dict[iso][0])/1000.)
            else:
                column_dict['intron position'].append('Middle')
                column_dict['exon size (us)'].append((abs(five_site-splice_sites[n-1][1])-1)/1000.)
                column_dict['exon size (ds)'].append(abs(three_site - splice_sites[n+1][0])/1000.)

            # Intron already has a peak: copy its values from the input dataframe
            if in_df is True:
                peak_index = chrom+':'+str(int(df_pos))
                new_index.append(peak_index)
                column_dict['position'].append(df_pos)
                column_dict['3p score'].append(df.loc[peak_index,'3p score'])
                column_dict['5p score'].append(df.loc[peak_index,'5p score'])
                column_dict['alt splicing'].append(df.loc[peak_index,'alt splicing'])
                column_dict['type'].append(df.loc[peak_index,'type'])
                column_dict['peak'].append(True)
                column_dict['seq5'].append(df.loc[peak_index,'seq5'])
                column_dict['seq3'].append(df.loc[peak_index,'seq3'])

            # Intron has no peak: fill in the same columns from the annotation
            if in_df is False:
                column_dict['alt splicing'].append(False)
                column_dict['type'].append('5prime')
                column_dict['peak'].append(False)

                # Get position, index and sequence for scoring and position code
                if strand == '+':
                    column_dict['position'].append(five_site+1)
                    new_index.append(chrom+':'+str(five_site+1))
                    sequence1 = fa_dict[chrom][(five_site-1):(five_site+7)]
                    sequence2 = fa_dict[chrom][(three_site-5):(three_site+3)]
                elif strand == '-':
                    column_dict['position'].append(five_site-1)
                    new_index.append(chrom+':'+str(five_site-1))
                    sequence1 = fa_dict[chrom][(five_site-6):(five_site+2)]
                    sequence1 = SP.reverse_complement(sequence1)
                    sequence2 = fa_dict[chrom][(three_site-2):(three_site+6)]
                    sequence2 = SP.reverse_complement(sequence2)

                column_dict['seq5'].append(sequence1)
                column_dict['seq3'].append(sequence2)

                # Score sequences
                score_5, score_3 = SP.simple_score_junction(sequence1, sequence2, PSSM)
                column_dict['3p score'].append(score_3)
                column_dict['5p score'].append(score_5)

    # Create new dataframe from column dictionary
    new_df = pd.DataFrame(columns=column_dict.keys(), index=new_index)
    for column, data in column_dict.iteritems():
        new_df[column] = data

    return new_df
def list_branch_points(sorted_bam_file, gff3_file, fasta_dict, organism=None):
    '''Collect branch positions from reads whose names encode their intron.

    Parameters
    ----------
    sorted_bam_file : str
            BAM file read with HTSeq.BAM_Reader. The output bed file name is
            this path truncated at '_sorted.bam'.
    gff3_file : str
            Annotation passed to SP.build_transcript_dict.
    fasta_dict : dict or str
            If a str, it is opened and loaded as json. The sequences are not
            used further in this function.
    organism : str, default `None`
            Passed through to SP.build_transcript_dict.

    Returns
    -------
    new_branch_dict : dict
            {transcript: [[splice_site, [positions], [counts]], ...]}

    Side effects
    ------------
    Prints read counts and writes a bed file with one line per branch
    position that has at least 5 reads and lies more than 5 but at most
    2000 nt from the splice site.
    '''
    transcript_dict = SP.build_transcript_dict(gff3_file, organism=organism)
    if type(fasta_dict) == str:
        with open(fasta_dict, 'r') as handle:
            fasta_dict = json.load(handle)

    # Pass 1: bucket read ends by transcript and splice site
    branch_dict = {}
    read_counter = 0
    br_counter = 0
    for aln in HTSeq.BAM_Reader(sorted_bam_file):
        read_counter += 1
        read_name = aln.read.name
        transcript = read_name.split('-chr')[0].split(':')[-1]
        name_parts = read_name.split('-')
        splice_site = name_parts[-1]
        if len(splice_site) < 3:
            # Last field is too short to be a coordinate, use the one before
            splice_site = name_parts[-2]
        if not splice_site.startswith('chr'):
            continue

        site_reads = branch_dict.setdefault(transcript, {}).setdefault(splice_site, [])
        if aln.iv is None:
            continue
        strand = aln.iv.strand
        read_end = aln.iv.start if strand == '-' else aln.iv.end
        # Only keep reads on the same strand as the transcript
        if strand == transcript_dict[transcript][2]:
            site_reads.append(read_end)
            br_counter += 1

    print("Reads analyzed: "+str(read_counter))
    print("Reads assigned as branches: "+str(br_counter))

    # Pass 2: merge read ends that fall within 2 nt of a recorded position
    new_branch_dict = {}
    for transcript, introns in branch_dict.items():
        collapsed = []
        new_branch_dict[transcript] = collapsed
        for intron, branches in introns.items():
            positions = []
            counts = []
            for branch in branches:
                matched = False
                # Scanning an empty list finds nothing, so no length guard is needed
                for pos in range(branch-2, branch+3):
                    if pos in positions:
                        matched = True
                        counts[positions.index(pos)] += 1
                if not matched:
                    positions.append(branch)
                    counts.append(1)
            if positions:
                collapsed.append([intron, positions, counts])

    # Pass 3: write the well supported branches as a junction-style bed track
    bed_name = '{0}.bed'.format(sorted_bam_file.split('_sorted.bam')[0])
    with open(bed_name, 'w') as fout:
        fout.write('track name=junctions description="TopHat junctions"\n')
        for transcript, introns in new_branch_dict.items():
            strand = transcript_dict[transcript][2]
            for intron in introns:
                site, positions, counts = intron
                chrom = site.split(':')[0]
                start = int(site.split(':')[1])
                for n, (end, value) in enumerate(zip(positions, counts)):
                    span = abs(end-start)
                    if span > 2000 or span <= 5 or value < 5:
                        continue
                    #[seqname] [start] [end] [id] [score] [strand] [thickStart] [thickEnd] [r,g,b][block_count] [block_sizes] [block_locations]
                    fields = [chrom, str(start-1), str(end+1), site+'-'+str(n), str(value), strand,
                              str(start-1), str(end+1), '75,196,213', '2', '1,1', '0,'+str(span+30), '\n']
                    fout.write('\t'.join(fields))
    return new_branch_dict
def _junction_site_seqs(chrom_seq, start, end, strand):
    '''Return the 8 nt windows at both ends of a junction, read 5' to 3' on `strand`.'''
    if strand == '+':
        return chrom_seq[(start - 1):(start + 7)], chrom_seq[(end - 5):(end + 3)]
    # Minus strand: shifted windows, reverse complemented
    return (SP.reverse_complement(chrom_seq[(start - 6):(start + 2)]),
            SP.reverse_complement(chrom_seq[(end - 2):(end + 6)]))


def build_junction_df(junction_bed, gff3_file, fasta, organism=None):
    '''Build a dataframe of junctions with the sequences at both ends.

    Parameters
    ----------
    junction_bed : passed to build_junction_dict
    gff3_file : str
            Annotation passed to SP.build_transcript_dict and build_junction_dict
    fasta : dict or str
            Chromosome sequences keyed by chromosome name. A str is converted
            with SP.make_fasta_dict.
    organism : str, default `None`
            Passed through to SP.build_transcript_dict and build_junction_dict

    Returns
    -------
    junc_seq_df : dataframe
            One row per junction with the columns from build_junction_dict plus
            'sequence1', 'sequence2', 'seq type1', 'seq type2',
            'annotated sequence1', 'annotated sequence2' and 'transcript'.
            Rows whose strand is neither '+' nor '-' get 'NNNNNNNN' for all
            four sequences.
    '''
    transcript_dict = SP.build_transcript_dict(gff3_file, organism=organism)
    if isinstance(fasta, str):
        fasta = SP.make_fasta_dict(fasta)

    junction_dict = build_junction_dict(junction_bed, gff3_file, transcript_dict, organism=organism)
    junction_count = sum(len(junctions) for junctions in junction_dict.values())

    junction_df = pd.DataFrame(index=range(junction_count), columns=[
        'intron tuple', 'chromosome', 'start', 'end', 'strand', 'depth',
        'type', 'size', 'annotated intron size', 'annotated intron start',
        'annotated intron end'
    ])

    n = 0
    for tx, junctions in junction_dict.iteritems():
        for junction in junctions:
            # The index is 0..junction_count-1, so label based .loc addresses
            # the same row the removed .ix indexer did
            junction_df.loc[n] = [tx] + junction
            n += 1

    sequence1 = []
    sequence2 = []
    ann_seq1 = []
    ann_seq2 = []
    seq_type1 = []
    seq_type2 = []
    df_tx = []
    for index, row in junction_df.iterrows():
        df_tx.append(row['intron tuple'][0])
        chrom = convert_chrom(row['chromosome'])
        strand = row['strand']
        if strand == '+' or strand == '-':
            curr1, curr2 = _junction_site_seqs(fasta[chrom], row['start'], row['end'], strand)
            if row['annotated intron start'] is None:
                ann1, ann2 = None, None
            else:
                ann1, ann2 = _junction_site_seqs(fasta[chrom],
                                                 row['annotated intron start'],
                                                 row['annotated intron end'],
                                                 strand)
        else:
            # Define curr1/curr2 here too so the seq types below never reuse
            # the previous row's sequences
            curr1 = curr2 = ann1 = ann2 = 'NNNNNNNN'

        sequence1.append(curr1)
        sequence2.append(curr2)
        ann_seq1.append(ann1)
        ann_seq2.append(ann2)

        # Annotated ends are labelled, other ends get the dinucleotide at that end
        if row['type'] == 'Annotated':
            seq_type1.append('5p annotated')
            seq_type2.append('3p annotated')
        elif row['type'] == '5p tethered':
            seq_type1.append('5p annotated')
            seq_type2.append(curr2[4:6])
        else:
            seq_type1.append(curr1[2:4])
            seq_type2.append(curr2[4:6])

    junc_seq_df = junction_df
    junc_seq_df['sequence1'] = sequence1
    junc_seq_df['sequence2'] = sequence2
    junc_seq_df['seq type1'] = seq_type1
    junc_seq_df['seq type2'] = seq_type2
    junc_seq_df['annotated sequence1'] = ann_seq1
    junc_seq_df['annotated sequence2'] = ann_seq2
    junc_seq_df['transcript'] = df_tx

    return junc_seq_df
def generate_consensus_matrix(gff3, fasta_dict, PSSM=False):
    '''Tabulate base usage over the 8 nt window at every annotated 5' and 3' splice site.

    Parameters
    ----------
    gff3 : str
            Annotation file. If 'pombe' appears in the name (any case) the
            pombe settings are used.
    fasta_dict : dict
            Chromosome sequences keyed by chromosome name
    PSSM : bool, default `False`
            False : each cell is count / number of introns
            True : each cell is log2((count / number of introns) / nuc_prob[row]),
                   with counts of 0 replaced by 1 first

    Returns
    -------
    (pos_matrix_5prime, pos_matrix_3prime) : tuple of 4x8 arrays
            Rows are A, C, T, G; columns are positions in the window.

    Also sets the numpy print options to one decimal place for floats.
    '''
    #Populate gene dictionary and build genome
    organism = 'pombe' if 'pombe' in gff3.lower() else None
    if organism == 'pombe':
        transcript_dict = SP.build_transcript_dict(gff3, organism='pombe')
        ss, flag = SP.list_splice_sites(gff3, organism='pombe')
    else:
        transcript_dict = SP.build_transcript_dict(gff3)
        ss, flag = SP.list_splice_sites(gff3)
    ss_dict = SP.collapse_ss_dict(ss)

    # presumably ordered A, C, T, G like the matrix rows - verify against gc_content
    nuc_prob = gc_content(fasta_dict)

    #Row of each base in the matrices: 1st row is A counts, second row is C, third row is T, fourth row is G.
    row_of = {"A": 0, "C": 1, "T": 2, "G": 3}
    five_prime = np.zeros([4, 8])
    three_prime = np.zeros([4, 8])

    suffix = '.1' if organism == 'pombe' else 'T0'
    intron_total = 0
    for transcript, introns in ss_dict.iteritems():
        isoform = transcript + suffix
        strand = transcript_dict[isoform][2]
        chrom = transcript_dict[isoform][3]
        chrom_seq_lookup = fasta_dict
        for intron in introns:
            intron_total += 1

            # 5' splice site window
            if strand == '+':
                seq = chrom_seq_lookup[chrom][(intron[0] - 1):(intron[0] + 7)]
            elif strand == '-':
                seq = chrom_seq_lookup[chrom][(intron[0] - 6):(intron[0] + 2)]
                seq = SP.reverse_complement(seq)
            for col, base in enumerate(seq):
                five_prime[row_of[base], col] += 1

            # 3' splice site window
            if strand == '+':
                seq = chrom_seq_lookup[chrom][(intron[1] - 5):(intron[1] + 3)]
            elif strand == '-':
                seq = chrom_seq_lookup[chrom][(intron[1] - 2):(intron[1] + 6)]
                seq = SP.reverse_complement(seq)
            for col, base in enumerate(seq):
                three_prime[row_of[base], col] += 1

    float_formatter = lambda x: "%.1f" % x
    np.set_printoptions(formatter={'float_kind': float_formatter})

    # Convert the raw counts cell by cell
    total = float(intron_total)
    for row in range(4):
        for col in range(8):
            if PSSM is False:
                five_prime[row, col] = (five_prime[row, col]) / total
                three_prime[row, col] = (three_prime[row, col]) / total
            if PSSM is True:
                # Pseudocount so that log2 is never taken of zero
                if five_prime[row, col] == 0:
                    five_prime[row, col] += 1
                if three_prime[row, col] == 0:
                    three_prime[row, col] += 1
                five_prime[row, col] = np.log2(
                    (five_prime[row, col] / total) / nuc_prob[row])
                three_prime[row, col] = np.log2(
                    (three_prime[row, col] / total) / nuc_prob[row])

    return (five_prime, three_prime)
def get_junction_sequence(df, gff3_file, fasta_file):
    '''Annotate a junction dataframe with the gene each junction falls in,
    the sequences at both junction ends, the full intron sequence and
    whether the intron is the First, Middle, Last or Only one seen for its gene.

    Parameters
    ----------
    df : dataframe
            Must have 'chr', 'strand', 'coord_1', 'coord_2' and 'contained in' columns.
            The coordinate, chromosome and strand values are stripped as strings,
            and rows are addressed by the labels 0..len(df)-1.
    gff3_file : str
            Annotation passed to SP.build_transcript_dict and SP.list_splice_sites
    fasta_file : dict or str
            Chromosome sequences keyed by chromosome name. A str is converted
            with make_fasta_dict.

    Returns
    -------
    df : dataframe
            Rows with an empty 'contained in' value removed and the index reset, with
            added 'Gene', 'intron', 'sequence1', 'sequence2' and 'intron sequence' columns.
    '''
    df = df.sort_values('chr', axis=0)

    #transcript_dict[transcript] = [start, end, strand, chromosome, CDS start, CDS end]
    transcript_dict = SP.build_transcript_dict(gff3_file)

    #splice_dict[transcipt] = [[5'sites][3'sites]]
    # (not used further in this function)
    splice_dict, flag = SP.list_splice_sites(gff3_file)

    #fasta_dict[chr] = sequence
    if type(fasta_file) is str:
        fasta_dict = make_fasta_dict(fasta_file)
    else:
        fasta_dict = fasta_file

    # Group transcript names by chromosome so each junction only scans its own chromosome
    transcript_by_chr = {}
    for transcript, coords in transcript_dict.iteritems():
        chromosome = coords[3]
        if chromosome in transcript_by_chr:
            transcript_by_chr[chromosome].append(transcript)
        else:
            transcript_by_chr[chromosome] = []
            transcript_by_chr[chromosome].append(transcript)

    # Defaults for rows that are never matched below
    df['Gene'] = "Unknown"
    df['intron'] = "Middle"
    df['sequence1'] = ''
    df['sequence2'] = ''
    df['intron sequence'] = 'No sequence here'

    n = 0
    for n in range(len(df)):
        coord1 = int(df['coord_1'][n].strip())
        coord2 = int(df['coord_2'][n].strip())
        chrom = df['chr'][n].strip()
        strand = df['strand'][n].strip()
        transcripts = transcript_by_chr[chrom]

        # Assign the junction to a transcript on the same strand that contains it
        # (if several match, the last one scanned wins)
        for transcript in transcripts:
            tx_strand = transcript_dict[transcript][2]
            start = transcript_dict[transcript][0]
            stop = transcript_dict[transcript][1]

            if strand == tx_strand and coord1 >= start and coord2 <= stop:
                df.loc[n, 'Gene'] = transcript

        # 8 nt windows at each end; on the minus strand the ends swap roles
        # and everything is reverse complemented
        if strand == '+':
            sequence1 = fasta_dict[chrom][(coord1 - 3):(coord1 + 5)]
            sequence2 = fasta_dict[chrom][(coord2 - 6):(coord2 + 2)]
            all_seq = fasta_dict[chrom][(coord1 - 1):coord2]
        elif strand == '-':
            sequence1 = fasta_dict[chrom][(coord2 - 6):(coord2 + 2)]
            sequence1 = SP.reverse_complement(sequence1)
            sequence2 = fasta_dict[chrom][(coord1 - 3):(coord1 + 5)]
            sequence2 = SP.reverse_complement(sequence2)
            all_seq = fasta_dict[chrom][(coord1 - 1):coord2]
            all_seq = SP.reverse_complement(all_seq)

        df.loc[n, 'sequence1'] = sequence1
        df.loc[n, 'sequence2'] = sequence2
        df.loc[n, 'intron sequence'] = all_seq

    # Label intron order within each gene.
    # NOTE(review): as laid out here `transcripts` still holds the list for the
    # chromosome of the last row processed above - confirm whether every
    # chromosome was meant to be labelled.
    for transcript in transcripts:
        if transcript in df['Gene'].tolist():
            tx_df = df[df['Gene'] == transcript]
            s = tx_df['coord_1']
            min_idx = s.idxmin()
            first = int(s.min())
            #print transcript_dict[transcript][2]
            #print first
            max_idx = s.idxmax()
            last = int(s.max())
            #print last

            if first == last:
                df.loc[min_idx, 'intron'] = 'Only'
            else:
                # Lowest coordinate is the first intron on '+' and the last on '-'
                if transcript_dict[transcript][2] == '+':
                    df.loc[min_idx, 'intron'] = 'First'
                    df.loc[max_idx, 'intron'] = 'Last'
                elif transcript_dict[transcript][2] == '-':
                    df.loc[min_idx, 'intron'] = 'Last'
                    df.loc[max_idx, 'intron'] = 'First'

                # Junctions within 10 nt of the first/last one get the same label.
                # NOTE(review): coord_1 is stripped as a string above; if the column
                # holds strings these `in range(...)` tests cannot match - confirm dtype.
                for index, coord_1 in s.iteritems():
                    if df['intron'][index] == 'Middle':
                        if coord_1 in range(first - 10, first + 10):
                            df_idx = s[s == coord_1].index[0]
                            if transcript_dict[transcript][2] == '+':
                                df.loc[df_idx, 'intron'] = 'First'
                            elif transcript_dict[transcript][2] == '-':
                                df.loc[df_idx, 'intron'] = 'Last'
                        elif coord_1 in range(last - 10, last + 10):
                            df_idx = s[s == coord_1].index[0]
                            if transcript_dict[transcript][2] == '+':
                                df.loc[df_idx, 'intron'] = 'Last'
                            elif transcript_dict[transcript][2] == '-':
                                df.loc[df_idx, 'intron'] = 'First'

    df = df[df['contained in'] != '']
    df = df.reset_index()
    return df
def peak_to_seq_pipeline(untagged_peak_file, tagged1_peak_file, tagged2_peak_file, gff3, fasta,
                         junction_df=None, branch_df=None, cutoff=5, name='CP_peaks'):
    '''Run the peak pipeline: call peaks per gene in three samples, compare
    them, check them against the annotation, add sequences and write a bedgraph.

    Parameters
    ----------
    untagged_peak_file, tagged1_peak_file, tagged2_peak_file : str
            Peak files passed to CP_peaks_by_gene, in this order
    gff3 : str
            Annotation file. If 'pombe' appears in the name the pombe settings are used.
    fasta : dict or str
            Chromosome sequences. A str is converted with SP.make_fasta_dict.
    junction_df, branch_df : default `None`
            Not used in this function
    cutoff : int, default 5
            Passed to CP_peaks_by_gene
    name : str, default 'CP_peaks'
            Prefix of the bedgraph file that is written (name+'.bedgraph')

    Returns
    -------
    peak_seq_df : dataframe returned by add_sequence_to_df
    '''
    organism = 'pombe' if 'pombe' in gff3 else None
    transcript_dict = SP.build_transcript_dict(gff3, organism=organism)

    print("Finding peaks in transcripts...")
    called = []
    for peak_file in (untagged_peak_file, tagged1_peak_file, tagged2_peak_file):
        print(peak_file)
        called.append(CP_peaks_by_gene(peak_file, transcript_dict, cutoff=cutoff))
    untagged, tagged1, tagged2 = called

    print("Comparing peaks between replicates...")
    peaks = CP_compare_reps(untagged, tagged1, tagged2)

    print("Checking peaks against annotation...")
    ss_dict, flag = SP.list_splice_sites(gff3, organism=organism)
    peak_df = CP_compare_to_annotation(peaks, ss_dict, transcript_dict)
    peak_df = collapse_unpredicted_peaks(peak_df)
    position_labels = peak_df['position'].apply(int).apply(str)
    peak_df['genome coord'] = peak_df['chromosome'].str.cat(position_labels, sep=':')

    if type(fasta) == str:
        fasta = SP.make_fasta_dict(fasta)

    print("Adding sequences...")
    peak_seq_df = add_sequence_to_df(peak_df, fasta, flag=flag)

    print("Writing bedgraph...")
    with open(name + '.bedgraph', 'w') as fout:
        for ix, r in peak_seq_df.iterrows():
            # Minus strand peaks extend downwards and get a negative height
            if r['strand'] == '+':
                position2 = r['position'] + 1
                height = r['height']
            elif r['strand'] == '-':
                position2 = r['position'] - 1
                height = r['height'] * -1
            fields = (r['chromosome'], r['position'], position2, height, '\n')
            fout.write('\t'.join([str(x) for x in fields]))

    print("Completed")
    return peak_seq_df
def generate_consensus_matrix(gff3, fasta_dict, PSSM=False):
    '''Tabulate base usage over the 8 nt window at every annotated 5' and 3' splice site.

    Parameters
    ----------
    gff3 : str
            Annotation file. If 'pombe' appears in the name (any case) the
            pombe settings are used.
    fasta_dict : dict
            Chromosome sequences keyed by chromosome name
    PSSM : bool, default `False`
            False : each cell is count / number of introns
            True : each cell is log2((count / number of introns) / nuc_prob[row]),
                   with counts of 0 replaced by 1 first

    Returns
    -------
    (pos_matrix_5prime, pos_matrix_3prime) : tuple of 4x8 arrays
            Rows are A, C, T, G; columns are positions in the window.

    Also sets the numpy print options to one decimal place for floats.
    '''
    #Populate gene dictionary and build genome
    organism = 'pombe' if 'pombe' in gff3.lower() else None
    if organism == 'pombe':
        transcript_dict = SP.build_transcript_dict(gff3, organism='pombe')
        ss, flag = SP.list_splice_sites(gff3, organism='pombe')
    else:
        transcript_dict = SP.build_transcript_dict(gff3)
        ss, flag = SP.list_splice_sites(gff3)
    ss_dict = SP.collapse_ss_dict(ss)

    # presumably ordered A, C, T, G like the matrix rows - verify against gc_content
    nuc_prob = gc_content(fasta_dict)

    #Row of each base in the matrices: 1st row is A counts, second row is C, third row is T, fourth row is G.
    row_of = {"A":0, "C":1, "T":2, "G":3}
    five_prime = np.zeros([4,8])
    three_prime = np.zeros([4,8])

    suffix = '.1' if organism == 'pombe' else 'T0'
    intron_total = 0
    for transcript, introns in ss_dict.iteritems():
        isoform = transcript+suffix
        strand = transcript_dict[isoform][2]
        chrom = transcript_dict[isoform][3]

        for intron in introns:
            intron_total += 1

            # 5' splice site window
            if strand == '+':
                seq = fasta_dict[chrom][(intron[0]-1):(intron[0]+7)]
            elif strand == '-':
                seq = fasta_dict[chrom][(intron[0]-6):(intron[0]+2)]
                seq = SP.reverse_complement(seq)
            for col, base in enumerate(seq):
                five_prime[row_of[base],col] += 1

            # 3' splice site window
            if strand == '+':
                seq = fasta_dict[chrom][(intron[1]-5):(intron[1]+3)]
            elif strand == '-':
                seq = fasta_dict[chrom][(intron[1]-2):(intron[1]+6)]
                seq = SP.reverse_complement(seq)
            for col, base in enumerate(seq):
                three_prime[row_of[base],col] += 1

    float_formatter = lambda x: "%.1f" % x
    np.set_printoptions(formatter={'float_kind':float_formatter})

    # Convert the raw counts cell by cell
    total = float(intron_total)
    for row in range(4):
        for col in range(8):
            if PSSM is False:
                five_prime[row,col] = (five_prime[row,col])/total
                three_prime[row,col] = (three_prime[row,col])/total
            if PSSM is True:
                # Pseudocount so that log2 is never taken of zero
                if five_prime[row,col] == 0:
                    five_prime[row,col] += 1
                if three_prime[row,col] == 0:
                    three_prime[row,col] += 1
                five_prime[row,col] = np.log2((five_prime[row,col]/total)/nuc_prob[row])
                three_prime[row,col] = np.log2((three_prime[row,col]/total)/nuc_prob[row])

    return (five_prime, three_prime)
def get_peak_sequence3(input_file, fasta_file, gff3_file, gene_list,window=1000):
    '''Makes a fasta file of peak sequences based on an input file.
    Input file columns - 1: transcript, 2: chromosome, 3: peak center
    Remember to save the input file as an MS-DOS CSV file if exporting from Excel

    Only records whose name appears in gene_list are written to the fasta file,
    but every retrieved sequence is returned.

    Parameters
    ----------
    input_file : str
                CSV file - see above. The first column may hold several comma
                separated transcripts. Rows whose third column is not an
                integer (e.g. a header) are skipped.
    fasta_file : dict or str
                Dictionary of chromosome sequences, or the path to a .json
                dictionary or fasta file (.json will load faster)
    gff3_file : str
                gff3 file for your organism
    gene_list : str
                Names to write out, separated by newlines. Names are compared
                after 'T0' has been appended to the transcript; peaks whose
                transcript is not in the gff3 are named chromosome:center.
    window : int, default 1000
                Size of sequence to retrieve (peak boundaries are window/2 on either side of peak summit)

    Outputs
    ------
    peak_fasta : fasta file with the selected peak sequences, named after input_file

    Returns
    -------
    seq_list : list of (name, sequence) tuples for all peaks
    '''
    tx_dict = SP.build_transcript_dict(gff3_file)

    # Resolve the genome into a dictionary of sequences
    if type(fasta_file) == dict:
        fa_dict = fasta_file
    elif fasta_file.endswith('json'):
        with open(fasta_file) as handle:
            fa_dict = json.load(handle)
    else:
        fa_dict = SP.make_fasta_dict(fasta_file)

    seq_list = []
    with open(input_file,'r') as csv_file:
        for row in csv.reader(csv_file, dialect=csv.excel):
            for gene in row[0].split(','):
                tx = gene+'T0'
                if tx.startswith('3P'):
                    tx = tx.split('3P')[1]

                chrom = row[1]
                if not chrom.startswith('chr'):
                    chrom = 'chr'+str(chrom)

                try:
                    center = int(row[2])
                    half_window = window/2
                    start = center-half_window
                    end = center+half_window

                    if tx not in tx_dict:
                        # Unknown transcript: assume plus strand and name by location
                        print(tx+" not in GFF3 file")
                        strand = '+'
                        tx = chrom+':'+str(center)
                    else:
                        strand = tx_dict[tx][2]

                    seq_list.append((tx, seq_simple(chrom, start, end, strand, fa_dict)))
                except ValueError:
                    pass

    genes_of_interest = gene_list.split("\n")
    out_name = '{0}_peak_sequences.fa'.format(input_file.split('/')[-1].split('.')[0])
    with open(out_name,'w') as fout:
        for tx, seq in seq_list:
            if tx not in genes_of_interest:
                continue
            fout.write('>'+tx+'\n')
            fout.write(seq+'\n')
    return seq_list
def _compare_gene_lists(df_dict, name_a, name_b, genome_size, colors):
    '''Draw the Venn diagram for one pair of gene tables and write their overlap to csv.'''
    df_a = df_dict[name_a]
    df_b = df_dict[name_b]

    # N = genome size
    # n = number of genes in analysis (so len(a)+len(b) minus the overlap)
    # K = number of genes in group 1 (len(a))
    # J = number of genes in group 2 (len(b))
    # k = overlap(len(a&b))
    K = len(df_a)
    J = len(df_b)
    k = len(set(df_a.index).intersection(df_b.index))
    n = K + J - k

    p_value = hypergeometric(genome_size, n, K, J, k)
    if p_value is not None:
        venn_2sample(n, K, k, J, name_a, name_b, colors, p_value)

    # NOTE(review): the overlap csv is written whether or not a p-value was
    # returned - confirm this matches the intended behaviour.
    merged = df_a.merge(df_b, right_index=True, left_index=True)
    merged.to_csv('{0}_{1}_overlap.csv'.format(name_a, name_b))


def gene_venn(csv_files, organism):
    '''Finds overlap between 2 or 3 lists of genes.

    Parameters
    ----------
    csv_files : list
               2 or 3 csv files where the first column is the gene name (make sure the gene name format matches).
               Comma- and tab-separated files are both accepted. All three
               pairs are compared only when exactly 3 files are given;
               otherwise only the first two files are compared.
    organism : str
               Options are 'crypto', 'cerevisiae' or 'pombe'

    Raises
    ------
    ValueError : if fewer than 2 files are given or the organism is not recognised

    Output
    ------
    PDF files of venn diagrams (pairwise) and merged csv files containing the overlapping genes.'''

    if len(csv_files) < 2:
        raise ValueError('gene_venn needs at least 2 csv files')

    org = organism.lower()
    if 'pombe' in org:
        gff3 = '/home/jordan/GENOMES/POMBE/schizosaccharomyces_pombe.chr.gff3'
        organism = 'pombe'
    elif 'crypto' in org or 'h99' in org:
        organism = None
        gff3 = '/home/jordan/GENOMES/CNA3_all_transcripts.gff3'
    elif 'cerev' in org:
        organism = None
        gff3 = '/home/jordan/GENOMES/S288C/saccharomyces_cerevisiae_R64-2-1_20150113.gff3'
    else:
        raise ValueError('Unrecognised organism: {0}'.format(organism))

    tx_dict = SP.build_transcript_dict(gff3, organism=organism)
    # Gene name = transcript name without its last two characters
    genes = set(tx[:-2] for tx in tx_dict)

    df_dict = {}
    names = []
    for csv_file in csv_files:
        name = csv_file.split('/')[-1].split('.')[0]
        names.append(name)

        # Peek at the first line to work out the delimiter
        with open(csv_file) as handle:
            first_line = next(handle)

        if len(first_line.split(',')) > 1:
            df_dict[name] = pd.read_csv(csv_file, index_col=0)
            df_dict[name] = add_col_level(df_dict[name], name)
        elif len(first_line.split('\t')) > 1:
            df_dict[name] = pd.read_csv(csv_file, index_col=0, sep='\t')
            df_dict[name] = add_col_level(df_dict[name], name)
        else:
            # Single column file: only the index (gene names) is read
            df_dict[name] = pd.read_csv(csv_file, index_col=0)

    genome_size = len(genes)

    _compare_gene_lists(df_dict, names[0], names[1], genome_size,
                        ['crimson', 'deepskyblue', 'darkorchid'])

    if len(names) == 3:
        ## Compare sample 1 to sample 3
        _compare_gene_lists(df_dict, names[0], names[2], genome_size,
                            ['crimson', 'gold', 'darkorange'])

        ## Compare sample 2 to sample 3
        _compare_gene_lists(df_dict, names[1], names[2], genome_size,
                            ['deepskyblue', 'gold', 'forestgreen'])
def get_peak_sequence3(input_file, fasta_file, gff3_file, gene_list, window=1000):
    '''Retrieve the sequence around every peak and write the selected ones to a fasta file.

    Input file columns - 1: transcript, 2: chromosome, 3: peak center
    Remember to save the input file as an MS-DOS CSV file if exporting from Excel

    NOTE(review): an earlier definition of get_peak_sequence3 exists higher up
    in this file; this later definition is the one in effect after the module
    is loaded.

    Parameters
    ----------
    input_file : str
            CSV file - see above. The first column may hold several
            comma-separated transcript names; each gets its own entry.
    fasta_file : str or dict
            .json dictionary of chromosome sequences or fasta file (.json will load faster).
            An already loaded dictionary is used as is.
    gff3_file : str
            gff3 file for your organism
    gene_list : str
            Newline-separated names to write out (the text itself, not a path).
            Names are compared after 'T0' has been appended to the transcript,
            or as 'chromosome:center' for transcripts missing from the gff3.
    window : int, default 1000
            Size of sequence to retrieve (peak boundaries are window/2 on either side of peak summit)

    Returns
    -------
    seq_list : list of (name, sequence) tuples for every peak, whether or not
            the name is in gene_list

    Outputs
    ------
    <input file base name>_peak_sequences.fa : fasta file, written to the
            current directory, with the peak sequences named in gene_list
    '''
    tx_dict = SP.build_transcript_dict(gff3_file)

    if isinstance(fasta_file, dict):
        fa_dict = fasta_file
    elif fasta_file.endswith('json'):
        with open(fasta_file) as f:
            fa_dict = json.load(f)
    else:
        fa_dict = SP.make_fasta_dict(fasta_file)

    # Floor division keeps the slice bounds integral regardless of the
    # division semantics in effect.
    half_window = window // 2

    seq_list = []
    with open(input_file, 'r') as csv_file:
        for row in csv.reader(csv_file, dialect=csv.excel):
            if not row:
                # Blank line in the CSV
                continue

            chrom = row[1]
            if not chrom.startswith('chr'):
                chrom = 'chr' + str(chrom)

            try:
                center = int(row[2])
            except ValueError:
                # Non-numeric peak center (e.g. a header row): nothing to retrieve
                continue
            start = center - half_window
            end = center + half_window

            for tx in row[0].split(','):
                tx = tx + 'T0'
                if tx.startswith('3P'):
                    # Drop only the leading marker, even if '3P' occurs again later
                    tx = tx[len('3P'):]

                if tx in tx_dict:
                    strand = tx_dict[tx][2]
                else:
                    print(tx + " not in GFF3 file")
                    strand = '+'
                    tx = chrom + ':' + str(center)

                seq = seq_simple(chrom, start, end, strand, fa_dict)
                seq_list.append((tx, seq))

    # splitlines + strip tolerates DOS line endings and stray whitespace;
    # a set makes each membership test O(1).
    genes_of_interest = set(name.strip() for name in gene_list.splitlines()
                            if name.strip())

    out_name = '{0}_peak_sequences.fa'.format(input_file.split('/')[-1].split('.')[0])
    with open(out_name, 'w') as fout:
        for tx, seq in seq_list:
            if tx in genes_of_interest:
                fout.write('>' + tx + '\n')
                fout.write(seq + '\n')
    return seq_list
def _intron_read_counts(reader, chrom, start, end, strand, tx_df):
    '''Count opposite-strand reads over one transcript and those ending in each intron window.

    Returns (reads, windows) where windows maps each row label of tx_df to
    [window start, window end (exclusive), intron size, read count].
    '''
    windows = {}
    for ix, r in tx_df.iterrows():
        if strand == '+':
            lo = int(r['position'])
            hi = int(r['position'] + r['intron size']) + 1
        elif strand == '-':
            lo = int(r['position'] - r['intron size'])
            hi = int(r['position']) + 1
        else:
            # Unrecognised strand: no read is counted for it below either,
            # so the row is left at its initial NaN.
            continue
        windows[ix] = [lo, hi, r['intron size'], 0]

    reads = 0
    for read in reader.fetch(chrom, start, end):
        # Only reads on the strand opposite to the transcript are counted
        if strand == '+' and read.is_reverse:
            pos = read.reference_end
        elif strand == '-' and not read.is_reverse:
            pos = read.reference_start
        else:
            continue
        reads += 1
        if pos is None:
            continue
        for window in windows.values():
            # Plain comparison instead of building/scanning a range per read
            if window[0] <= pos < window[1]:
                window[3] += 1
    return reads, windows


def count_reads_in_transcript(bam_files, df, gff3, organism=None):
    '''Count reads over each transcript in df and reads ending within each of its introns.

    Parameters
    ----------
    bam_files : list
            Paths of the BAM files to count from
    df : pandas.DataFrame
            One row per intron with the columns 'transcript', 'position' and
            'intron size'. Transcript names are given without the isoform
            suffix ('.1' is appended for pombe, 'T0' otherwise).
    gff3 : str
            gff3 file for your organism
    organism : str, default None
            'pombe' switches the isoform suffix and converts chr1-chr3 to
            I-III before fetching reads

    Returns
    -------
    all_reads : dict
            BAM file path -> DataFrame indexed like df with the columns
            'total' (counted reads per 1000 nt of transcript) and 'intron'
            (reads per nt of intron window divided by reads per nt of
            transcript). Rows where a division by zero occurs are set to NaN
            and their label is printed.
    '''
    tx_dict = SP.build_transcript_dict(gff3, organism=organism)

    is_pombe = organism == 'pombe'
    tx_suffix = '.1' if is_pombe else 'T0'
    lat_rom = {'chr1': 'I', 'chr2': 'II', 'chr3': 'III'}
    transcripts = set(df['transcript'])

    all_reads = {}
    for bam_file in bam_files:
        counts = pd.DataFrame(index=df.index, columns=['total', 'intron'])
        all_reads[bam_file] = counts

        reader = pysam.Samfile(bam_file)
        try:
            for tx in transcripts:
                tx_df = df[df['transcript'] == tx]
                (start, end, strand, _cds_start, _cds_end,
                 _exons, chrom) = SP.tx_info(tx + tx_suffix, tx_dict)
                if is_pombe:
                    chrom = lat_rom[chrom]

                reads, windows = _intron_read_counts(reader, chrom, start, end,
                                                     strand, tx_df)

                for ix, (_lo, _hi, intron_size, in_intron) in windows.items():
                    try:
                        tx_density = reads / float(end - start)
                        counts.loc[ix, 'total'] = tx_density * 1000
                        counts.loc[ix, 'intron'] = ((in_intron / float(intron_size)) /
                                                    tx_density)
                    except ZeroDivisionError:
                        counts.loc[ix, 'total'] = np.nan
                        counts.loc[ix, 'intron'] = np.nan
                        print(ix)
        finally:
            # Release the BAM handle even if counting fails part way
            reader.close()
    return all_reads
def _splice_site_windows(chrom_seq, strand, start, end):
    '''Return the 8 nt windows at the 5' and 3' splice sites of an intron on the '+' or '-' strand.'''
    if strand == '+':
        return chrom_seq[(start - 1):(start + 7)], chrom_seq[(end - 5):(end + 3)]
    five_prime = SP.reverse_complement(chrom_seq[(start - 6):(start + 2)])
    three_prime = SP.reverse_complement(chrom_seq[(end - 2):(end + 6)])
    return five_prime, three_prime


def build_junction_df(junction_bed, gff3_file, fasta, organism=None):
    '''Build a table of splice junctions with the sequence at both splice sites.

    Parameters
    ----------
    junction_bed : str
            Junction file, passed on to build_junction_dict
    gff3_file : str
            gff3 file for your organism
    fasta : str or dict
            Fasta file, or an already loaded dictionary of chromosome sequences
    organism : str, default None
            Passed on to SP.build_transcript_dict and build_junction_dict

    Returns
    -------
    junc_seq_df : pandas.DataFrame
            One row per junction with the columns produced by
            build_junction_dict plus:
            'sequence1', 'sequence2' - 8 nt around the 5' and 3' splice site
            'annotated sequence1', 'annotated sequence2' - the same for the
                annotated intron (None when there is no annotated intron)
            'seq type1', 'seq type2' - '5p annotated' / '3p annotated' or the
                dinucleotide found at the site
            'transcript' - first element of 'intron tuple'
            Junctions whose strand is neither '+' nor '-' get 'NNNNNNNN'
            sequences.
    '''
    transcript_dict = SP.build_transcript_dict(gff3_file, organism=organism)
    if isinstance(fasta, str):
        fasta = SP.make_fasta_dict(fasta)

    junction_dict = build_junction_dict(junction_bed, gff3_file, transcript_dict,
                                        organism=organism)

    columns = ['intron tuple', 'chromosome', 'start', 'end', 'strand', 'depth',
               'type', 'size', 'annotated intron size',
               'annotated intron start', 'annotated intron end']
    rows = []
    for tx, junctions in junction_dict.items():
        for junction in junctions:
            rows.append([tx] + junction)
    # Build the table in one go. dtype=object keeps None and the plain ints
    # intact, which the "is None" test and the slicing below rely on.
    junction_df = pd.DataFrame(rows, columns=columns, dtype=object)

    sequence1 = []
    sequence2 = []
    ann_seq1 = []
    ann_seq2 = []
    seq_type1 = []
    seq_type2 = []
    df_tx = []
    for index, row in junction_df.iterrows():
        df_tx.append(row['intron tuple'][0])
        chrom = convert_chrom(row['chromosome'])
        strand = row['strand']

        if strand in ('+', '-'):
            chrom_seq = fasta[chrom]
            curr1, curr2 = _splice_site_windows(chrom_seq, strand,
                                                row['start'], row['end'])
            if row['annotated intron start'] is None:
                ann1, ann2 = None, None
            else:
                ann1, ann2 = _splice_site_windows(chrom_seq, strand,
                                                  row['annotated intron start'],
                                                  row['annotated intron end'])
        else:
            # Also reset curr1/curr2 so the type labels below are not taken
            # from the previous junction.
            curr1 = curr2 = ann1 = ann2 = 'NNNNNNNN'

        sequence1.append(curr1)
        sequence2.append(curr2)
        ann_seq1.append(ann1)
        ann_seq2.append(ann2)

        if row['type'] == 'Annotated':
            seq_type1.append('5p annotated')
            seq_type2.append('3p annotated')
        elif row['type'] == '5p tethered':
            seq_type1.append('5p annotated')
            seq_type2.append(curr2[4:6])
        else:
            seq_type1.append(curr1[2:4])
            seq_type2.append(curr2[4:6])

    junc_seq_df = junction_df
    junc_seq_df['sequence1'] = sequence1
    junc_seq_df['sequence2'] = sequence2
    junc_seq_df['seq type1'] = seq_type1
    junc_seq_df['seq type2'] = seq_type2
    junc_seq_df['annotated sequence1'] = ann_seq1
    junc_seq_df['annotated sequence2'] = ann_seq2
    junc_seq_df['transcript'] = df_tx

    return junc_seq_df