def peak_to_seq_pipeline(untagged_peak_file, tagged1_peak_file, tagged2_peak_file, gff3, fasta,
                         junction_df=None, branch_df=None, cutoff=5, name='CP_peaks'):
    '''Find peaks in three samples, compare and annotate them, add sequences, write a bedgraph.

    Peaks are collected per transcript from each peak file, compared between
    samples, checked against the annotated splice sites, collapsed, and given
    sequences. The resulting table is also written to ``<name>.bedgraph``.

    Parameters
    ----------
    untagged_peak_file, tagged1_peak_file, tagged2_peak_file : str
        Peak files handed to CP_peaks_by_gene (format is defined there).
    gff3 : str
        GFF3 annotation path. If the path contains 'pombe', organism='pombe'
        is passed to the SPTools helpers; otherwise organism=None.
    fasta : str or other
        A str is passed to SP.make_fasta_dict; any other object is handed to
        add_sequence_to_df unchanged (presumably a chromosome -> sequence
        dict; confirm against add_sequence_to_df).
    junction_df, branch_df : optional
        Accepted for interface compatibility; not used by this function.
    cutoff : int, default 5
        Forwarded to CP_peaks_by_gene.
    name : str, default 'CP_peaks'
        Prefix of the bedgraph file that is written.

    Returns
    -------
    peak_seq_df : the value returned by add_sequence_to_df
    '''
    # The organism is inferred only from the GFF3 file name.
    organism = 'pombe' if 'pombe' in gff3 else None
    transcript_dict = SP.build_transcript_dict(gff3, organism=organism)

    print("Finding peaks in transcripts...")
    print(untagged_peak_file)
    untagged = CP_peaks_by_gene(untagged_peak_file, transcript_dict, cutoff=cutoff)
    print(tagged1_peak_file)
    tagged1 = CP_peaks_by_gene(tagged1_peak_file, transcript_dict, cutoff=cutoff)
    print(tagged2_peak_file)
    tagged2 = CP_peaks_by_gene(tagged2_peak_file, transcript_dict, cutoff=cutoff)

    print("Comparing peaks between replicates...")
    peaks = CP_compare_reps(untagged, tagged1, tagged2)

    print("Checking peaks against annotation...")
    ss_dict, flag = SP.list_splice_sites(gff3, organism=organism)
    peak_df = CP_compare_to_annotation(peaks, ss_dict, transcript_dict)
    peak_df = collapse_unpredicted_peaks(peak_df)
    # "chromosome:position" label, with the position forced to an integer string.
    peak_df['genome coord'] = peak_df['chromosome'].str.cat(
        peak_df['position'].apply(int).apply(str), sep=':')

    if isinstance(fasta, str):
        fasta = SP.make_fasta_dict(fasta)

    print("Adding sequences...")
    peak_seq_df = add_sequence_to_df(peak_df, fasta, flag=flag)

    print("Writing bedgraph...")
    with open(name + '.bedgraph', 'w') as fout:
        for ix, r in peak_seq_df.iterrows():
            strand = r['strand']
            if strand == '+':
                position2 = r['position'] + 1
                height = r['height']
            elif strand == '-':
                # Minus-strand peaks get a negative height.
                # NOTE(review): the second coordinate is position - 1, i.e.
                # smaller than the first one - confirm downstream tools accept it.
                position2 = r['position'] - 1
                height = r['height'] * -1
            else:
                # Without a recognised strand there is no coordinate/height to
                # write; skip the row instead of reusing the previous row's
                # values (or failing with NameError on the first row).
                continue
            # NOTE(review): '\n' is joined as a field, so every line ends with
            # a tab before the newline. Kept to leave the file format unchanged.
            fields = [r['chromosome'], r['position'], position2, height, '\n']
            fout.write('\t'.join([str(x) for x in fields]))

    print("Completed")
    return peak_seq_df
'''Usage: python process_juncbase.py juncbase_output gff3_file fasta_file prefix

Please note: uses chr# format for chromosome names.'''
import sys
# Candidate locations of the RNA-is-awesome checkout that provides SPTools.
sys.path.insert(0, '/Users/jordanburke/RNA-is-awesome/')
sys.path.insert(0, '/home/jordan/RNA-is-awesome/')
sys.path.insert(0, '/home/jordan/CodeBase/RNA-is-awesome')
import SPTools as SP

# Fail with a readable message instead of a bare IndexError when arguments
# are missing. Extra trailing arguments are still tolerated, as before.
if len(sys.argv) < 5:
    sys.exit('Usage: python process_juncbase.py juncbase_output gff3_file fasta_file prefix')

juncbase_output = sys.argv[1]
gff3_file = sys.argv[2]
print(gff3_file)
fasta_file = sys.argv[3]
print(fasta_file)
prefix = sys.argv[4]

fasta_dict = SP.make_fasta_dict(fasta_file)
junc_df, sample_list = SP.read_juncbase_output(juncbase_output)
seq_df = SP.get_junction_sequence(junc_df, gff3_file, fasta_dict)

# Position matrices for the 5' and 3' sites, then score the junction sequences
# against them.
pos_matrix_5prime, pos_matrix_3prime = SP.generate_consensus_matrix(gff3_file, fasta_dict, PSSM=True)
scored_df = SP.score_new_sites(seq_df, pos_matrix_5prime, pos_matrix_3prime, PSSM=True)
filt_df1, filt_df2 = SP.reformat_df(scored_df, sample_list)

# Two tab-separated tables, floats written with two decimals.
filt_df1.to_csv('{0}_seq_score.tsv'.format(prefix), sep='\t', float_format='%.2f')
filt_df2.to_csv('{0}_PSI.tsv'.format(prefix), sep='\t', float_format='%.2f')
def get_peak_sequence3(input_file, fasta_file, gff3_file, gene_list, window=1000):
    '''Collect the sequence around each peak and write the selected ones to a fasta file.

    Input file columns - 1: transcript(s), 2: chromosome, 3: peak center
    Remember to save the input file as an MS-DOS CSV file if exporting from Excel.

    Parameters
    ----------
    input_file : str
        CSV file - see above. Column 1 may hold several comma-separated
        transcript names. Rows with fewer than three columns, or whose peak
        center is not an integer (e.g. a header row), are skipped.
    fasta_file : str or dict
        Dictionary of chromosome sequences, or a path to a .json dump of one,
        or a fasta file (.json will load faster).
    gff3_file : str
        gff3 file for your organism
    gene_list : str
        Newline-separated names of the records to write. Names are compared
        with the record names used here, i.e. the transcript name with 'T0'
        appended (and a leading '3P' removed), or 'chromosome:center' when the
        transcript is not in the gff3 file.
    window : int, default 1000
        Size of sequence to retrieve (peak boundaries are window//2 on either
        side of the peak center).

    Returns
    -------
    seq_list : list of (name, sequence) tuples for every peak that was read,
        whether or not it is in gene_list.

    Outputs
    -------
    <input file base name>_peak_sequences.fa : fasta file with the sequences
        of the records named in gene_list.
    '''
    tx_dict = SP.build_transcript_dict(gff3_file)

    if isinstance(fasta_file, dict):
        fa_dict = fasta_file
    elif fasta_file.endswith('json'):
        with open(fasta_file) as f:
            fa_dict = json.load(f)
    else:
        fa_dict = SP.make_fasta_dict(fasta_file)

    # Floor division keeps the coordinates integral on Python 3 as well.
    half_window = window // 2
    prefix = '3P'
    seq_list = []
    with open(input_file, 'r') as csv_file:
        for row in csv.reader(csv_file, dialect=csv.excel):
            if len(row) < 3:
                # Blank or truncated line: nothing to look up.
                continue
            try:
                center = int(row[2])
            except ValueError:
                # Non-numeric peak center (e.g. header row).
                continue

            chrom = row[1]
            if not chrom.startswith('chr'):
                chrom = 'chr' + str(chrom)
            start = center - half_window
            end = center + half_window

            for tx in row[0].split(','):
                tx = tx + 'T0'
                if tx.startswith(prefix):
                    # Drop only the leading prefix, even if '3P' occurs again
                    # later in the name.
                    tx = tx[len(prefix):]

                if tx in tx_dict:
                    strand = tx_dict[tx][2]
                else:
                    print(tx + " not in GFF3 file")
                    # Fall back to the plus strand and a coordinate-based name.
                    strand = '+'
                    tx = chrom + ':' + str(center)

                try:
                    seq = seq_simple(chrom, start, end, strand, fa_dict)
                except ValueError:
                    # Kept from the original, which ignored ValueError raised
                    # anywhere while handling a peak.
                    # NOTE(review): confirm whether seq_simple can raise this.
                    continue
                seq_list.append((tx, seq))

    # A set gives O(1) membership tests; splitlines/strip also cope with
    # Windows line endings and stray whitespace.
    genes_of_interest = set(g.strip() for g in gene_list.splitlines())
    out_name = '{0}_peak_sequences.fa'.format(input_file.split('/')[-1].split('.')[0])
    with open(out_name, 'w') as fout:
        for tx, seq in seq_list:
            if tx in genes_of_interest:
                fout.write('>' + tx + '\n')
                fout.write(seq + '\n')
    return seq_list
def get_peak_sequence3(input_file, fasta_file, gff3_file, gene_list, window=1000):
    '''Collect the sequence around each peak and write the selected ones to a fasta file.

    Input file columns - 1: transcript(s), 2: chromosome, 3: peak center
    Remember to save the input file as an MS-DOS CSV file if exporting from Excel.

    Parameters
    ----------
    input_file : str
        CSV file - see above. Column 1 may hold several comma-separated
        transcript names. Rows with fewer than three columns, or whose peak
        center is not an integer (e.g. a header row), are skipped.
    fasta_file : str or dict
        Dictionary of chromosome sequences, or a path to a .json dump of one,
        or a fasta file (.json will load faster).
    gff3_file : str
        gff3 file for your organism
    gene_list : str
        Newline-separated names of the records to write. Names are compared
        with the record names used here, i.e. the transcript name with 'T0'
        appended (and a leading '3P' removed), or 'chromosome:center' when the
        transcript is not in the gff3 file.
    window : int, default 1000
        Size of sequence to retrieve (peak boundaries are window//2 on either
        side of the peak center).

    Returns
    -------
    seq_list : list of (name, sequence) tuples for every peak that was read,
        whether or not it is in gene_list.

    Outputs
    -------
    <input file base name>_peak_sequences.fa : fasta file with the sequences
        of the records named in gene_list.
    '''
    tx_dict = SP.build_transcript_dict(gff3_file)

    if isinstance(fasta_file, dict):
        fa_dict = fasta_file
    elif fasta_file.endswith('json'):
        with open(fasta_file) as f:
            fa_dict = json.load(f)
    else:
        fa_dict = SP.make_fasta_dict(fasta_file)

    # Floor division keeps the coordinates integral on Python 3 as well.
    half_window = window // 2
    prefix = '3P'
    seq_list = []
    with open(input_file, 'r') as csv_file:
        for row in csv.reader(csv_file, dialect=csv.excel):
            if len(row) < 3:
                # Blank or truncated line: nothing to look up.
                continue
            try:
                center = int(row[2])
            except ValueError:
                # Non-numeric peak center (e.g. header row).
                continue

            chrom = row[1]
            if not chrom.startswith('chr'):
                chrom = 'chr' + str(chrom)
            start = center - half_window
            end = center + half_window

            for tx in row[0].split(','):
                tx = tx + 'T0'
                if tx.startswith(prefix):
                    # Drop only the leading prefix, even if '3P' occurs again
                    # later in the name.
                    tx = tx[len(prefix):]

                if tx in tx_dict:
                    strand = tx_dict[tx][2]
                else:
                    print(tx + " not in GFF3 file")
                    # Fall back to the plus strand and a coordinate-based name.
                    strand = '+'
                    tx = chrom + ':' + str(center)

                try:
                    seq = seq_simple(chrom, start, end, strand, fa_dict)
                except ValueError:
                    # Kept from the original, which ignored ValueError raised
                    # anywhere while handling a peak.
                    # NOTE(review): confirm whether seq_simple can raise this.
                    continue
                seq_list.append((tx, seq))

    # A set gives O(1) membership tests; splitlines/strip also cope with
    # Windows line endings and stray whitespace.
    genes_of_interest = set(g.strip() for g in gene_list.splitlines())
    out_name = '{0}_peak_sequences.fa'.format(input_file.split('/')[-1].split('.')[0])
    with open(out_name, 'w') as fout:
        for tx, seq in seq_list:
            if tx in genes_of_interest:
                fout.write('>' + tx + '\n')
                fout.write(seq + '\n')
    return seq_list
def build_junction_df(junction_bed, gff3_file, fasta, organism=None):
    '''Build a table of junctions with the 8-nt sequences around both junction ends.

    Parameters
    ----------
    junction_bed : str
        Junction file handed to build_junction_dict (format is defined there).
    gff3_file : str
        gff3 file for your organism
    fasta : str or dict-like
        A str is passed to SP.make_fasta_dict; anything else is indexed as
        fasta[chromosome] and sliced, so it should map chromosome names (as
        returned by convert_chrom) to sequences.
    organism : str, default None
        Forwarded to SP.build_transcript_dict and build_junction_dict.

    Returns
    -------
    junc_seq_df : pandas.DataFrame
        One row per junction with the columns filled from build_junction_dict
        plus 'sequence1', 'sequence2', 'seq type1', 'seq type2',
        'annotated sequence1', 'annotated sequence2' and 'transcript'.
        Rows whose strand is neither '+' nor '-' get 'NNNNNNNN' sequences.
    '''
    transcript_dict = SP.build_transcript_dict(gff3_file, organism=organism)
    if isinstance(fasta, str):
        fasta = SP.make_fasta_dict(fasta)
    junction_dict = build_junction_dict(junction_bed, gff3_file, transcript_dict, organism=organism)

    unknown = 'NNNNNNNN'

    def site_windows(chrom, start, end, strand):
        # 8-nt windows around a junction's start and end, read on `strand`.
        # The chromosome is only looked up for a recognised strand.
        if strand == '+':
            return (fasta[chrom][(start - 1):(start + 7)],
                    fasta[chrom][(end - 5):(end + 3)])
        if strand == '-':
            return (SP.reverse_complement(fasta[chrom][(start - 6):(start + 2)]),
                    SP.reverse_complement(fasta[chrom][(end - 2):(end + 6)]))
        return unknown, unknown

    junction_count = sum(len(junctions) for junctions in junction_dict.values())
    junction_df = pd.DataFrame(index=range(junction_count), columns=[
        'intron tuple', 'chromosome', 'start', 'end', 'strand', 'depth', 'type', 'size',
        'annotated intron size', 'annotated intron start', 'annotated intron end'
    ])

    # The index is 0..junction_count-1, so label-based .loc addresses the same
    # rows the removed .ix indexer did.
    n = 0
    for tx, junctions in junction_dict.items():
        for junction in junctions:
            junction_df.loc[n] = [tx] + junction
            n += 1

    sequence1 = []
    sequence2 = []
    ann_seq1 = []
    ann_seq2 = []
    seq_type1 = []
    seq_type2 = []
    df_tx = []
    for index, row in junction_df.iterrows():
        df_tx.append(row['intron tuple'][0])
        chrom = convert_chrom(row['chromosome'])
        strand = row['strand']

        # For an unknown strand this yields the N placeholders, so the
        # 'seq type' columns below never reuse the previous row's sequences.
        curr1, curr2 = site_windows(chrom, row['start'], row['end'], strand)
        sequence1.append(curr1)
        sequence2.append(curr2)

        # pd.isnull covers both None and NaN as "no annotated intron".
        if strand in ('+', '-') and pd.isnull(row['annotated intron start']):
            ann_seq1.append(None)
            ann_seq2.append(None)
        else:
            ann1, ann2 = site_windows(chrom, row['annotated intron start'],
                                      row['annotated intron end'], strand)
            ann_seq1.append(ann1)
            ann_seq2.append(ann2)

        # Annotated ends are labelled; otherwise the label is a dinucleotide
        # taken from the window (positions 2:4 of sequence1, 4:6 of sequence2).
        if row['type'] == 'Annotated':
            seq_type1.append('5p annotated')
            seq_type2.append('3p annotated')
        elif row['type'] == '5p tethered':
            seq_type1.append('5p annotated')
            seq_type2.append(curr2[4:6])
        else:
            seq_type1.append(curr1[2:4])
            seq_type2.append(curr2[4:6])

    # Same object as junction_df: the new columns are added in place.
    junc_seq_df = junction_df
    junc_seq_df['sequence1'] = sequence1
    junc_seq_df['sequence2'] = sequence2
    junc_seq_df['seq type1'] = seq_type1
    junc_seq_df['seq type2'] = seq_type2
    junc_seq_df['annotated sequence1'] = ann_seq1
    junc_seq_df['annotated sequence2'] = ann_seq2
    junc_seq_df['transcript'] = df_tx
    return junc_seq_df
def peak_to_seq_pipeline(untagged_peak_file, tagged1_peak_file, tagged2_peak_file, gff3, fasta,
                         junction_df=None, branch_df=None, cutoff=5, name='CP_peaks'):
    '''Find peaks in three samples, compare and annotate them, add sequences, write a bedgraph.

    Peaks are collected per transcript from each peak file, compared between
    samples, checked against the annotated splice sites, collapsed, and given
    sequences. The resulting table is also written to ``<name>.bedgraph``.

    Parameters
    ----------
    untagged_peak_file, tagged1_peak_file, tagged2_peak_file : str
        Peak files handed to CP_peaks_by_gene (format is defined there).
    gff3 : str
        GFF3 annotation path. If the path contains 'pombe', organism='pombe'
        is passed to the SPTools helpers; otherwise organism=None.
    fasta : str or other
        A str is passed to SP.make_fasta_dict; any other object is handed to
        add_sequence_to_df unchanged (presumably a chromosome -> sequence
        dict; confirm against add_sequence_to_df).
    junction_df, branch_df : optional
        Accepted for interface compatibility; not used by this function.
    cutoff : int, default 5
        Forwarded to CP_peaks_by_gene.
    name : str, default 'CP_peaks'
        Prefix of the bedgraph file that is written.

    Returns
    -------
    peak_seq_df : the value returned by add_sequence_to_df
    '''
    # The organism is inferred only from the GFF3 file name.
    organism = 'pombe' if 'pombe' in gff3 else None
    transcript_dict = SP.build_transcript_dict(gff3, organism=organism)

    print("Finding peaks in transcripts...")
    print(untagged_peak_file)
    untagged = CP_peaks_by_gene(untagged_peak_file, transcript_dict, cutoff=cutoff)
    print(tagged1_peak_file)
    tagged1 = CP_peaks_by_gene(tagged1_peak_file, transcript_dict, cutoff=cutoff)
    print(tagged2_peak_file)
    tagged2 = CP_peaks_by_gene(tagged2_peak_file, transcript_dict, cutoff=cutoff)

    print("Comparing peaks between replicates...")
    peaks = CP_compare_reps(untagged, tagged1, tagged2)

    print("Checking peaks against annotation...")
    ss_dict, flag = SP.list_splice_sites(gff3, organism=organism)
    peak_df = CP_compare_to_annotation(peaks, ss_dict, transcript_dict)
    peak_df = collapse_unpredicted_peaks(peak_df)
    # "chromosome:position" label, with the position forced to an integer string.
    peak_df['genome coord'] = peak_df['chromosome'].str.cat(
        peak_df['position'].apply(int).apply(str), sep=':')

    if isinstance(fasta, str):
        fasta = SP.make_fasta_dict(fasta)

    print("Adding sequences...")
    peak_seq_df = add_sequence_to_df(peak_df, fasta, flag=flag)

    print("Writing bedgraph...")
    with open(name + '.bedgraph', 'w') as fout:
        for ix, r in peak_seq_df.iterrows():
            strand = r['strand']
            if strand == '+':
                position2 = r['position'] + 1
                height = r['height']
            elif strand == '-':
                # Minus-strand peaks get a negative height.
                # NOTE(review): the second coordinate is position - 1, i.e.
                # smaller than the first one - confirm downstream tools accept it.
                position2 = r['position'] - 1
                height = r['height'] * -1
            else:
                # Without a recognised strand there is no coordinate/height to
                # write; skip the row instead of reusing the previous row's
                # values (or failing with NameError on the first row).
                continue
            # NOTE(review): '\n' is joined as a field, so every line ends with
            # a tab before the newline. Kept to leave the file format unchanged.
            fields = [r['chromosome'], r['position'], position2, height, '\n']
            fout.write('\t'.join([str(x) for x in fields]))

    print("Completed")
    return peak_seq_df
def build_junction_df(junction_bed, gff3_file, fasta, organism=None):
    '''Build a table of junctions with the 8-nt sequences around both junction ends.

    Parameters
    ----------
    junction_bed : str
        Junction file handed to build_junction_dict (format is defined there).
    gff3_file : str
        gff3 file for your organism
    fasta : str or dict-like
        A str is passed to SP.make_fasta_dict; anything else is indexed as
        fasta[chromosome] and sliced, so it should map chromosome names (as
        returned by convert_chrom) to sequences.
    organism : str, default None
        Forwarded to SP.build_transcript_dict and build_junction_dict.

    Returns
    -------
    junc_seq_df : pandas.DataFrame
        One row per junction with the columns filled from build_junction_dict
        plus 'sequence1', 'sequence2', 'seq type1', 'seq type2',
        'annotated sequence1', 'annotated sequence2' and 'transcript'.
        Rows whose strand is neither '+' nor '-' get 'NNNNNNNN' sequences.
    '''
    transcript_dict = SP.build_transcript_dict(gff3_file, organism=organism)
    if isinstance(fasta, str):
        fasta = SP.make_fasta_dict(fasta)
    junction_dict = build_junction_dict(junction_bed, gff3_file, transcript_dict, organism=organism)

    unknown = 'NNNNNNNN'

    def site_windows(chrom, start, end, strand):
        # 8-nt windows around a junction's start and end, read on `strand`.
        # The chromosome is only looked up for a recognised strand.
        if strand == '+':
            return (fasta[chrom][(start - 1):(start + 7)],
                    fasta[chrom][(end - 5):(end + 3)])
        if strand == '-':
            return (SP.reverse_complement(fasta[chrom][(start - 6):(start + 2)]),
                    SP.reverse_complement(fasta[chrom][(end - 2):(end + 6)]))
        return unknown, unknown

    junction_count = sum(len(junctions) for junctions in junction_dict.values())
    junction_df = pd.DataFrame(index=range(junction_count), columns=[
        'intron tuple', 'chromosome', 'start', 'end', 'strand', 'depth', 'type', 'size',
        'annotated intron size', 'annotated intron start', 'annotated intron end'
    ])

    # The index is 0..junction_count-1, so label-based .loc addresses the same
    # rows the removed .ix indexer did.
    n = 0
    for tx, junctions in junction_dict.items():
        for junction in junctions:
            junction_df.loc[n] = [tx] + junction
            n += 1

    sequence1 = []
    sequence2 = []
    ann_seq1 = []
    ann_seq2 = []
    seq_type1 = []
    seq_type2 = []
    df_tx = []
    for index, row in junction_df.iterrows():
        df_tx.append(row['intron tuple'][0])
        chrom = convert_chrom(row['chromosome'])
        strand = row['strand']

        # For an unknown strand this yields the N placeholders, so the
        # 'seq type' columns below never reuse the previous row's sequences.
        curr1, curr2 = site_windows(chrom, row['start'], row['end'], strand)
        sequence1.append(curr1)
        sequence2.append(curr2)

        # pd.isnull covers both None and NaN as "no annotated intron".
        if strand in ('+', '-') and pd.isnull(row['annotated intron start']):
            ann_seq1.append(None)
            ann_seq2.append(None)
        else:
            ann1, ann2 = site_windows(chrom, row['annotated intron start'],
                                      row['annotated intron end'], strand)
            ann_seq1.append(ann1)
            ann_seq2.append(ann2)

        # Annotated ends are labelled; otherwise the label is a dinucleotide
        # taken from the window (positions 2:4 of sequence1, 4:6 of sequence2).
        if row['type'] == 'Annotated':
            seq_type1.append('5p annotated')
            seq_type2.append('3p annotated')
        elif row['type'] == '5p tethered':
            seq_type1.append('5p annotated')
            seq_type2.append(curr2[4:6])
        else:
            seq_type1.append(curr1[2:4])
            seq_type2.append(curr2[4:6])

    # Same object as junction_df: the new columns are added in place.
    junc_seq_df = junction_df
    junc_seq_df['sequence1'] = sequence1
    junc_seq_df['sequence2'] = sequence2
    junc_seq_df['seq type1'] = seq_type1
    junc_seq_df['seq type2'] = seq_type2
    junc_seq_df['annotated sequence1'] = ann_seq1
    junc_seq_df['annotated sequence2'] = ann_seq2
    junc_seq_df['transcript'] = df_tx
    return junc_seq_df