Ejemplo n.º 1
0
def peak_to_seq_pipeline(untagged_peak_file, tagged1_peak_file, tagged2_peak_file, gff3, fasta, junction_df=None, branch_df=None, cutoff=5, name='CP_peaks'):
    '''Find peaks in three samples, compare them, annotate them and add sequence.

    Each peak file is passed through CP_peaks_by_gene, the three results are
    combined with CP_compare_reps, compared to the splice sites returned by
    SP.list_splice_sites, collapsed with collapse_unpredicted_peaks and then
    handed to add_sequence_to_df. A bedgraph of the resulting peaks is written
    to name + '.bedgraph'.

    Parameters
    ----------
    untagged_peak_file, tagged1_peak_file, tagged2_peak_file : str
            Peak files, each passed to CP_peaks_by_gene (which defines the format)
    gff3 : str
            gff3 file for your organism. If the string contains 'pombe' the
            organism is set to 'pombe', otherwise it is left as None.
    fasta : str or already loaded sequences
            A str is converted with SP.make_fasta_dict; anything else is passed
            to add_sequence_to_df unchanged.
    junction_df, branch_df : default None
            Not used by this function; kept so existing calls keep working.
    cutoff : default 5
            Forwarded to CP_peaks_by_gene
    name : str, default 'CP_peaks'
            Prefix of the bedgraph file that is written

    Returns
    -------
    peak_seq_df : the data frame returned by add_sequence_to_df
    '''

    if 'pombe' in gff3:
        organism = 'pombe'
    else:
        organism = None

    transcript_dict = SP.build_transcript_dict(gff3, organism=organism)
    print("Finding peaks in transcripts...")

    print(untagged_peak_file)
    untagged = CP_peaks_by_gene(untagged_peak_file, transcript_dict, cutoff=cutoff)

    print(tagged1_peak_file)
    tagged1 = CP_peaks_by_gene(tagged1_peak_file, transcript_dict, cutoff=cutoff)

    print(tagged2_peak_file)
    tagged2 = CP_peaks_by_gene(tagged2_peak_file, transcript_dict, cutoff=cutoff)

    print("Comparing peaks between replicates...")
    peaks = CP_compare_reps(untagged, tagged1, tagged2)

    print("Checking peaks against annotation...")
    ss_dict, flag = SP.list_splice_sites(gff3, organism=organism)
    peak_df = CP_compare_to_annotation(peaks, ss_dict, transcript_dict)
    peak_df = collapse_unpredicted_peaks(peak_df)
    # "chromosome:position" label, with the position forced to an integer first
    peak_df['genome coord'] = peak_df['chromosome'].str.cat(peak_df['position'].apply(int).apply(str), sep=':')

    if isinstance(fasta, str):
        fasta = SP.make_fasta_dict(fasta)
    print("Adding sequences...")
    peak_seq_df = add_sequence_to_df(peak_df, fasta, flag=flag)

    print("Writing bedgraph...")
    with open(name + '.bedgraph', 'w') as fout:
        for _, r in peak_seq_df.iterrows():
            if r['strand'] == '+':
                position2 = r['position'] + 1
                height = r['height']
            elif r['strand'] == '-':
                position2 = r['position'] - 1
                height = r['height'] * -1
            else:
                # No offset or sign is defined for any other strand value.
                # Skip the row instead of reusing the previous row's values
                # (or raising NameError when this is the first row).
                continue
            # NOTE(review): '\n' is joined as a field, so every line ends with
            # a tab before the newline, and '-' rows have a third column that
            # is smaller than the second. Both are left unchanged so the file
            # stays identical for existing consumers - confirm they are intended.
            fields = [r['chromosome'], r['position'], position2, height, '\n']
            fout.write('\t'.join([str(x) for x in fields]))

    print("Completed")
    return peak_seq_df
Ejemplo n.º 2
0
'''Usage: python process_juncbase.py juncbase_output gff3_file fasta_file prefix
Please note: uses chr# format for chromosome names.'''

import sys
sys.path.insert(0, '/Users/jordanburke/RNA-is-awesome/')
sys.path.insert(0, '/home/jordan/RNA-is-awesome/')
sys.path.insert(0, '/home/jordan/CodeBase/RNA-is-awesome')
import SPTools as SP

# Stop with the usage line instead of an IndexError when an argument is missing.
# Extra arguments are still ignored, as before.
if len(sys.argv) < 5:
    sys.stderr.write('Usage: python process_juncbase.py juncbase_output gff3_file fasta_file prefix\n')
    sys.exit(1)

juncbase_output = sys.argv[1]
gff3_file = sys.argv[2]
print(gff3_file)
fasta_file = sys.argv[3]
print(fasta_file)
prefix = sys.argv[4]

# Load the sequences and the juncbase table, attach junction sequences, then
# score them against the consensus matrices generated from the annotation.
fasta_dict = SP.make_fasta_dict(fasta_file)
junc_df, sample_list = SP.read_juncbase_output(juncbase_output)
seq_df = SP.get_junction_sequence(junc_df, gff3_file, fasta_dict)
pos_matrix_5prime, pos_matrix_3prime = SP.generate_consensus_matrix(gff3_file, fasta_dict, PSSM=True)
scored_df = SP.score_new_sites(seq_df, pos_matrix_5prime, pos_matrix_3prime, PSSM=True)
filt_df1, filt_df2 = SP.reformat_df(scored_df, sample_list)

# NOTE(review): the disabled lines below refer to `filt_df`, which is not
# defined in this script (reformat_df returns filt_df1 and filt_df2); they are
# kept for reference only.
#intron_ret_df = filt_df[filt_df['as_event_type'] == 'intron_retention']
#intron_ret_df = intron_ret_df.reset_index(drop=True)
#alt_donor = filt_df[filt_df['as_event_type'] == 'alternative_donor']
#alt_acceptor = filt_df[filt_df['as_event_type'] == 'alternative_acceptor']

# Two tab-separated tables named after the prefix, floats written with two decimals.
filt_df1.to_csv('{0}_seq_score.tsv'.format(prefix), sep='\t', float_format='%.2f')
filt_df2.to_csv('{0}_PSI.tsv'.format(prefix), sep='\t', float_format='%.2f')
Ejemplo n.º 3
0
def get_peak_sequence3(input_file, fasta_file, gff3_file, gene_list, window=1000):
    '''Collects the sequence around each peak and writes the selected ones to a fasta file.
    Input file columns - 1: transcript, 2: chromosome, 3: peak center
    Remember to save the input file as an MS-DOS CSV file if exporting from Excel
    Note: column 1 may hold several comma-separated transcripts. Each gets 'T0'
    appended and a leading '3P' removed before it is looked up in the transcript
    dictionary. Transcripts that are not found are reported, treated as '+'
    strand and named "chromosome:center" instead.

    Parameters
    ----------
    input_file : str
            CSV file - see above. Rows with fewer than 3 columns or a
            non-integer peak center (e.g. a header) are skipped.
    fasta_file : str or dict
            .json dictionary of chromosome sequences or fasta file (.json will load faster).
            A dict is used as is.
    gff3_file : str
            gff3 file for your organism
    gene_list : str
            Newline-separated names to write to the fasta file. A name must match
            the name a peak ends up with (transcript with 'T0' appended, or
            "chromosome:center").
    window : int, default 1000
            Size of sequence to retrieve (peak boundaries are window/2 on either side of peak summit)

    Outputs
    ------
    <input file name up to its first '.'>_peak_sequences.fa : fasta file, written
            to the working directory, with the peaks whose name is in gene_list

    Returns
    -------
    seq_list : list of (name, sequence) tuples for every peak, whether or not
            it was written to the fasta file
    '''

    tx_dict = SP.build_transcript_dict(gff3_file)
    if isinstance(fasta_file, dict):
        fa_dict = fasta_file
    elif fasta_file.endswith('json'):
        with open(fasta_file) as f:
            fa_dict = json.load(f)
    else:
        fa_dict = SP.make_fasta_dict(fasta_file)

    # Floor division keeps the boundaries integral on every Python version.
    half_window = window // 2
    seq_list = []
    with open(input_file, 'r') as csv_file:
        for row in csv.reader(csv_file, dialect=csv.excel):
            # Blank or truncated rows cannot describe a peak.
            if len(row) < 3:
                continue
            try:
                center = int(row[2])
            except ValueError:
                # Non-numeric peak center: skip the whole row.
                continue

            chrom = row[1]
            if not chrom.startswith('chr'):
                chrom = 'chr' + str(chrom)
            start = center - half_window
            end = center + half_window

            for tx in row[0].split(','):
                tx = tx + 'T0'
                if tx.startswith('3P'):
                    # Drop only the prefix; a later '3P' in the name is kept.
                    tx = tx[len('3P'):]

                if tx in tx_dict:
                    strand = tx_dict[tx][2]
                else:
                    print(tx + " not in GFF3 file")
                    strand = '+'
                    tx = chrom + ':' + str(center)
                try:
                    seq = seq_simple(chrom, start, end, strand, fa_dict)
                except ValueError:
                    # Kept from the original best-effort behaviour: a peak whose
                    # sequence lookup raises ValueError is left out.
                    continue
                seq_list.append((tx, seq))

    # A set gives constant-time membership tests while writing.
    genes_of_interest = set(gene_list.split("\n"))

    out_name = '{0}_peak_sequences.fa'.format(input_file.split('/')[-1].split('.')[0])
    with open(out_name, 'w') as fout:
        for tx, seq in seq_list:
            if tx in genes_of_interest:
                fout.write('>' + tx + '\n')
                fout.write(seq + '\n')
    return seq_list
Ejemplo n.º 4
0
def get_peak_sequence3(input_file,
                       fasta_file,
                       gff3_file,
                       gene_list,
                       window=1000):
    '''Collects the sequence around each peak and writes the selected ones to a fasta file.
    Input file columns - 1: transcript, 2: chromosome, 3: peak center
    Remember to save the input file as an MS-DOS CSV file if exporting from Excel
    Note: column 1 may hold several comma-separated transcripts. Each gets 'T0'
    appended and a leading '3P' removed before it is looked up in the transcript
    dictionary. Transcripts that are not found are reported, treated as '+'
    strand and named "chromosome:center" instead.

    Parameters
    ----------
    input_file : str
            CSV file - see above. Rows with fewer than 3 columns or a
            non-integer peak center (e.g. a header) are skipped.
    fasta_file : str or dict
            .json dictionary of chromosome sequences or fasta file (.json will load faster).
            A dict is used as is.
    gff3_file : str
            gff3 file for your organism
    gene_list : str
            Newline-separated names to write to the fasta file. A name must match
            the name a peak ends up with (transcript with 'T0' appended, or
            "chromosome:center").
    window : int, default 1000
            Size of sequence to retrieve (peak boundaries are window/2 on either side of peak summit)

    Outputs
    ------
    <input file name up to its first '.'>_peak_sequences.fa : fasta file, written
            to the working directory, with the peaks whose name is in gene_list

    Returns
    -------
    seq_list : list of (name, sequence) tuples for every peak, whether or not
            it was written to the fasta file
    '''

    tx_dict = SP.build_transcript_dict(gff3_file)
    if isinstance(fasta_file, dict):
        fa_dict = fasta_file
    elif fasta_file.endswith('json'):
        with open(fasta_file) as f:
            fa_dict = json.load(f)
    else:
        fa_dict = SP.make_fasta_dict(fasta_file)

    # Floor division keeps the boundaries integral on every Python version.
    half_window = window // 2
    seq_list = []
    with open(input_file, 'r') as csv_file:
        for row in csv.reader(csv_file, dialect=csv.excel):
            # Blank or truncated rows cannot describe a peak.
            if len(row) < 3:
                continue
            try:
                center = int(row[2])
            except ValueError:
                # Non-numeric peak center: skip the whole row.
                continue

            chrom = row[1]
            if not chrom.startswith('chr'):
                chrom = 'chr' + str(chrom)
            start = center - half_window
            end = center + half_window

            for tx in row[0].split(','):
                tx = tx + 'T0'
                if tx.startswith('3P'):
                    # Drop only the prefix; a later '3P' in the name is kept.
                    tx = tx[len('3P'):]

                if tx in tx_dict:
                    strand = tx_dict[tx][2]
                else:
                    print(tx + " not in GFF3 file")
                    strand = '+'
                    tx = chrom + ':' + str(center)
                try:
                    seq = seq_simple(chrom, start, end, strand, fa_dict)
                except ValueError:
                    # Kept from the original best-effort behaviour: a peak whose
                    # sequence lookup raises ValueError is left out.
                    continue
                seq_list.append((tx, seq))

    # A set gives constant-time membership tests while writing.
    genes_of_interest = set(gene_list.split("\n"))

    out_name = '{0}_peak_sequences.fa'.format(
        input_file.split('/')[-1].split('.')[0])
    with open(out_name, 'w') as fout:
        for tx, seq in seq_list:
            if tx in genes_of_interest:
                fout.write('>' + tx + '\n')
                fout.write(seq + '\n')
    return seq_list
Ejemplo n.º 5
0
def _junction_site_seqs(chrom_seq, start, end, strand):
    '''Return the sequence windows sliced around start and end for a '+' or '-' strand.'''
    if strand == '+':
        return (chrom_seq[(start - 1):(start + 7)],
                chrom_seq[(end - 5):(end + 3)])
    # '-' strand: the windows are shifted and reverse complemented.
    return (SP.reverse_complement(chrom_seq[(start - 6):(start + 2)]),
            SP.reverse_complement(chrom_seq[(end - 2):(end + 6)]))


def build_junction_df(junction_bed, gff3_file, fasta, organism=None):
    '''Build a data frame of junctions with the sequence around both of their ends.

    Junctions come from build_junction_dict. One row is made per junction,
    holding the dictionary key followed by the junction's own values, and the
    sequence columns described below are then added.

    Parameters
    ----------
    junction_bed :
            Passed to build_junction_dict
    gff3_file : str
            gff3 file for your organism
    fasta : str or already loaded sequences
            A str is converted with SP.make_fasta_dict; anything else is
            indexed as fasta[chromosome][start:end].
    organism : default None
            Forwarded to SP.build_transcript_dict and build_junction_dict

    Returns
    -------
    junc_seq_df : data frame with one row per junction and the added columns
            'sequence1', 'sequence2' : windows around 'start' and 'end'
            'seq type1', 'seq type2' : '5p annotated' / '3p annotated' when
                    the junction type says so, otherwise two characters taken
                    from the matching window
            'annotated sequence1', 'annotated sequence2' : the same windows
                    around the annotated intron start and end, or None when
                    there is no annotated intron start
            'transcript' : first element of the 'intron tuple' value
            A strand other than '+' or '-' gives 'NNNNNNNN' for all four
            sequence columns.
    '''
    transcript_dict = SP.build_transcript_dict(gff3_file, organism=organism)
    if isinstance(fasta, str):
        fasta = SP.make_fasta_dict(fasta)
    junction_dict = build_junction_dict(junction_bed,
                                        gff3_file,
                                        transcript_dict,
                                        organism=organism)

    columns = [
        'intron tuple', 'chromosome', 'start', 'end', 'strand', 'depth',
        'type', 'size', 'annotated intron size', 'annotated intron start',
        'annotated intron end'
    ]
    junction_count = sum(len(junctions) for junctions in junction_dict.values())

    # The frame is pre-allocated empty and filled row by row so cell values are
    # stored as given; the `is None` test below relies on that.
    junction_df = pd.DataFrame(index=range(junction_count), columns=columns)
    n = 0
    for tx, junctions in junction_dict.items():
        for junction in junctions:
            # .loc is the label-based replacement for the removed .ix indexer;
            # the index is range(junction_count), so n is a valid label.
            junction_df.loc[n] = [tx] + junction
            n += 1

    sequence1 = []
    sequence2 = []
    ann_seq1 = []
    ann_seq2 = []
    seq_type1 = []
    seq_type2 = []
    df_tx = []
    for index, row in junction_df.iterrows():
        df_tx.append(row['intron tuple'][0])
        chrom = convert_chrom(row['chromosome'])
        strand = row['strand']
        if strand in ('+', '-'):
            curr1, curr2 = _junction_site_seqs(fasta[chrom], row['start'],
                                               row['end'], strand)
            if row['annotated intron start'] is None:
                ann1 = ann2 = None
            else:
                ann1, ann2 = _junction_site_seqs(
                    fasta[chrom], row['annotated intron start'],
                    row['annotated intron end'], strand)
        else:
            # Unknown strand: use the placeholder for the current windows too,
            # so the seq type below is never taken from a previous row.
            curr1 = curr2 = ann1 = ann2 = 'NNNNNNNN'
        sequence1.append(curr1)
        sequence2.append(curr2)
        ann_seq1.append(ann1)
        ann_seq2.append(ann2)

        if row['type'] == 'Annotated':
            seq_type1.append('5p annotated')
            seq_type2.append('3p annotated')
        elif row['type'] == '5p tethered':
            seq_type1.append('5p annotated')
            seq_type2.append(curr2[4:6])
        else:
            seq_type1.append(curr1[2:4])
            seq_type2.append(curr2[4:6])

    # junc_seq_df is the same object as junction_df; the columns are added in place.
    junc_seq_df = junction_df
    junc_seq_df['sequence1'] = sequence1
    junc_seq_df['sequence2'] = sequence2
    junc_seq_df['seq type1'] = seq_type1
    junc_seq_df['seq type2'] = seq_type2
    junc_seq_df['annotated sequence1'] = ann_seq1
    junc_seq_df['annotated sequence2'] = ann_seq2
    junc_seq_df['transcript'] = df_tx

    return junc_seq_df
Ejemplo n.º 6
0
def peak_to_seq_pipeline(untagged_peak_file,
                         tagged1_peak_file,
                         tagged2_peak_file,
                         gff3,
                         fasta,
                         junction_df=None,
                         branch_df=None,
                         cutoff=5,
                         name='CP_peaks'):
    '''Find peaks in three samples, compare them, annotate them and add sequence.

    Each peak file is passed through CP_peaks_by_gene, the three results are
    combined with CP_compare_reps, compared to the splice sites returned by
    SP.list_splice_sites, collapsed with collapse_unpredicted_peaks and then
    handed to add_sequence_to_df. A bedgraph of the resulting peaks is written
    to name + '.bedgraph'.

    Parameters
    ----------
    untagged_peak_file, tagged1_peak_file, tagged2_peak_file : str
            Peak files, each passed to CP_peaks_by_gene (which defines the format)
    gff3 : str
            gff3 file for your organism. If the string contains 'pombe' the
            organism is set to 'pombe', otherwise it is left as None.
    fasta : str or already loaded sequences
            A str is converted with SP.make_fasta_dict; anything else is passed
            to add_sequence_to_df unchanged.
    junction_df, branch_df : default None
            Not used by this function; kept so existing calls keep working.
    cutoff : default 5
            Forwarded to CP_peaks_by_gene
    name : str, default 'CP_peaks'
            Prefix of the bedgraph file that is written

    Returns
    -------
    peak_seq_df : the data frame returned by add_sequence_to_df
    '''

    if 'pombe' in gff3:
        organism = 'pombe'
    else:
        organism = None

    transcript_dict = SP.build_transcript_dict(gff3, organism=organism)
    print("Finding peaks in transcripts...")

    print(untagged_peak_file)
    untagged = CP_peaks_by_gene(untagged_peak_file,
                                transcript_dict,
                                cutoff=cutoff)

    print(tagged1_peak_file)
    tagged1 = CP_peaks_by_gene(tagged1_peak_file,
                               transcript_dict,
                               cutoff=cutoff)

    print(tagged2_peak_file)
    tagged2 = CP_peaks_by_gene(tagged2_peak_file,
                               transcript_dict,
                               cutoff=cutoff)

    print("Comparing peaks between replicates...")
    peaks = CP_compare_reps(untagged, tagged1, tagged2)

    print("Checking peaks against annotation...")
    ss_dict, flag = SP.list_splice_sites(gff3, organism=organism)
    peak_df = CP_compare_to_annotation(peaks, ss_dict, transcript_dict)
    peak_df = collapse_unpredicted_peaks(peak_df)
    # "chromosome:position" label, with the position forced to an integer first
    peak_df['genome coord'] = peak_df['chromosome'].str.cat(
        peak_df['position'].apply(int).apply(str), sep=':')

    if isinstance(fasta, str):
        fasta = SP.make_fasta_dict(fasta)
    print("Adding sequences...")
    peak_seq_df = add_sequence_to_df(peak_df, fasta, flag=flag)

    print("Writing bedgraph...")
    with open(name + '.bedgraph', 'w') as fout:
        for _, r in peak_seq_df.iterrows():
            if r['strand'] == '+':
                position2 = r['position'] + 1
                height = r['height']
            elif r['strand'] == '-':
                position2 = r['position'] - 1
                height = r['height'] * -1
            else:
                # No offset or sign is defined for any other strand value.
                # Skip the row instead of reusing the previous row's values
                # (or raising NameError when this is the first row).
                continue
            # NOTE(review): '\n' is joined as a field, so every line ends with
            # a tab before the newline, and '-' rows have a third column that
            # is smaller than the second. Both are left unchanged so the file
            # stays identical for existing consumers - confirm they are intended.
            fields = [
                r['chromosome'], r['position'], position2, height, '\n'
            ]
            fout.write('\t'.join([str(x) for x in fields]))

    print("Completed")
    return peak_seq_df
Ejemplo n.º 7
0
def _junction_site_seqs(chrom_seq, start, end, strand):
    '''Return the sequence windows sliced around start and end for a '+' or '-' strand.'''
    if strand == '+':
        return (chrom_seq[(start-1):(start+7)],
                chrom_seq[(end-5):(end+3)])
    # '-' strand: the windows are shifted and reverse complemented.
    return (SP.reverse_complement(chrom_seq[(start-6):(start+2)]),
            SP.reverse_complement(chrom_seq[(end-2):(end+6)]))


def build_junction_df(junction_bed, gff3_file, fasta, organism=None):
    '''Build a data frame of junctions with the sequence around both of their ends.

    Junctions come from build_junction_dict. One row is made per junction,
    holding the dictionary key followed by the junction's own values, and the
    sequence columns described below are then added.

    Parameters
    ----------
    junction_bed :
            Passed to build_junction_dict
    gff3_file : str
            gff3 file for your organism
    fasta : str or already loaded sequences
            A str is converted with SP.make_fasta_dict; anything else is
            indexed as fasta[chromosome][start:end].
    organism : default None
            Forwarded to SP.build_transcript_dict and build_junction_dict

    Returns
    -------
    junc_seq_df : data frame with one row per junction and the added columns
            'sequence1', 'sequence2' : windows around 'start' and 'end'
            'seq type1', 'seq type2' : '5p annotated' / '3p annotated' when
                    the junction type says so, otherwise two characters taken
                    from the matching window
            'annotated sequence1', 'annotated sequence2' : the same windows
                    around the annotated intron start and end, or None when
                    there is no annotated intron start
            'transcript' : first element of the 'intron tuple' value
            A strand other than '+' or '-' gives 'NNNNNNNN' for all four
            sequence columns.
    '''
    transcript_dict = SP.build_transcript_dict(gff3_file, organism=organism)
    if isinstance(fasta, str):
        fasta = SP.make_fasta_dict(fasta)
    junction_dict = build_junction_dict(junction_bed, gff3_file, transcript_dict, organism=organism)

    columns = ['intron tuple', 'chromosome', 'start', 'end', 'strand', 'depth',
               'type', 'size', 'annotated intron size',
               'annotated intron start', 'annotated intron end']
    junction_count = sum(len(junctions) for junctions in junction_dict.values())

    # The frame is pre-allocated empty and filled row by row so cell values are
    # stored as given; the `is None` test below relies on that.
    junction_df = pd.DataFrame(index=range(junction_count), columns=columns)
    n = 0
    for tx, junctions in junction_dict.items():
        for junction in junctions:
            # .loc is the label-based replacement for the removed .ix indexer;
            # the index is range(junction_count), so n is a valid label.
            junction_df.loc[n] = [tx]+junction
            n += 1

    sequence1 = []
    sequence2 = []
    ann_seq1 = []
    ann_seq2 = []
    seq_type1 = []
    seq_type2 = []
    df_tx = []
    for index, row in junction_df.iterrows():
        df_tx.append(row['intron tuple'][0])
        chrom = convert_chrom(row['chromosome'])
        strand = row['strand']
        if strand in ('+', '-'):
            curr1, curr2 = _junction_site_seqs(fasta[chrom], row['start'], row['end'], strand)
            if row['annotated intron start'] is None:
                ann1 = ann2 = None
            else:
                ann1, ann2 = _junction_site_seqs(fasta[chrom], row['annotated intron start'], row['annotated intron end'], strand)
        else:
            # Unknown strand: use the placeholder for the current windows too,
            # so the seq type below is never taken from a previous row.
            curr1 = curr2 = ann1 = ann2 = 'NNNNNNNN'
        sequence1.append(curr1)
        sequence2.append(curr2)
        ann_seq1.append(ann1)
        ann_seq2.append(ann2)

        if row['type'] == 'Annotated':
            seq_type1.append('5p annotated')
            seq_type2.append('3p annotated')
        elif row['type'] == '5p tethered':
            seq_type1.append('5p annotated')
            seq_type2.append(curr2[4:6])
        else:
            seq_type1.append(curr1[2:4])
            seq_type2.append(curr2[4:6])

    # junc_seq_df is the same object as junction_df; the columns are added in place.
    junc_seq_df = junction_df
    junc_seq_df['sequence1'] = sequence1
    junc_seq_df['sequence2'] = sequence2
    junc_seq_df['seq type1'] = seq_type1
    junc_seq_df['seq type2'] = seq_type2
    junc_seq_df['annotated sequence1'] = ann_seq1
    junc_seq_df['annotated sequence2'] = ann_seq2
    junc_seq_df['transcript'] = df_tx

    return junc_seq_df