Ejemplo n.º 1
0
def peaks_only(config_file, untagged, organism):
    '''Call peaks and quantitate them using only the peak and quant entries
    of a config file (no junction or branch data).

    Parameters
    ----------
    config_file : path to a text file with one entry per line. The line
        containing `untagged` is used as the untagged peak file; other lines
        containing "changepoint" or "peak" (any case) are collected as
        changepoint output files; lines containing "quant" are read as
        "file,quant,label" and stored as label -> file.
    untagged : substring that identifies the untagged sample line
    organism : organism name handed to SP.find_organism_files

    Output
    ------
    Writes <prefix>_all_peaks.pickle, <prefix>_quantitation.pickle and
    <prefix>_quantitation.csv, where <prefix> is the directory of the config
    file plus its file name up to "_config", and then calls
    SP.SP_pipeline_scatters on the quantitation table. Returns None.

    Raises
    ------
    ValueError : if no line matches `untagged` or fewer than two
        changepoint/peak files are listed.
    '''
    CP_untagged = None
    CP_out = []
    quant_bams = {}
    with open(config_file, 'r') as config:
        for line in config:
            entry = line.strip()
            lowered = line.lower()
            if untagged in line:
                CP_untagged = entry
            elif 'changepoint' in lowered or 'peak' in lowered:
                CP_out.append(entry)
            # bam files for quantitation should be file,quant,A1
            elif 'quant' in line:
                fields = line.split(',')
                quant_bams[fields[-1].strip()] = fields[0]

    # Fail with a clear message instead of a NameError/IndexError later on.
    if CP_untagged is None:
        raise ValueError("No line in {0} contains the untagged sample name "
                         "{1!r}".format(config_file, untagged))
    if len(CP_out) < 2:
        raise ValueError("Expected at least two changepoint/peak files in "
                         "{0}, found {1}".format(config_file, len(CP_out)))

    # Split on the last '/' so that a prefix which also appears in a
    # directory name cannot truncate the output directory.
    directory, sep, file_name = config_file.rpartition('/')
    name = file_name.split('_config')[0]
    base_dir = (directory + sep) or './'
    prefix = base_dir + name
    # Single-argument print() behaves the same on Python 2 and Python 3.
    print("Output file location and prefix: " + prefix)

    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)

    peak_df = SP.peak_to_seq_pipeline(CP_untagged, CP_out[0], CP_out[1],
                                      gff3, fa_dict, name=name + '_CP_peaks')
    peak_df.to_pickle(prefix + '_all_peaks.pickle')

    quant_df = SP.quant_from_peak_df(peak_df, gff3, fa_dict, organism=organism)
    quant_df = SP.quantitate_junction_df(quant_bams, quant_df, gff3,
                                         organism=organism)

    quant_df.to_pickle(prefix + '_quantitation.pickle')
    quant_df.to_csv(prefix + '_quantitation.csv')

    SP.SP_pipeline_scatters(quant_df, base_dir, name)
    
Ejemplo n.º 2
0
def peaks_only(config_file, untagged, organism):
    '''Call peaks and quantitate them using only the peak and quant entries
    of a config file (no junction or branch data).

    Parameters
    ----------
    config_file : path to a text file with one entry per line. The line
        containing `untagged` is used as the untagged peak file; other lines
        containing "changepoint" or "peak" (any case) are collected as
        changepoint output files; lines containing "quant" are read as
        "file,quant,label" and stored as label -> file.
    untagged : substring that identifies the untagged sample line
    organism : organism name handed to SP.find_organism_files

    Output
    ------
    Writes <prefix>_all_peaks.pickle, <prefix>_quantitation.pickle and
    <prefix>_quantitation.csv, where <prefix> is the directory of the config
    file plus its file name up to "_config", and then calls
    SP.SP_pipeline_scatters on the quantitation table. Returns None.

    Raises
    ------
    ValueError : if no line matches `untagged` or fewer than two
        changepoint/peak files are listed.
    '''
    CP_untagged = None
    CP_out = []
    quant_bams = {}
    with open(config_file, 'r') as config:
        for line in config:
            entry = line.strip()
            lowered = line.lower()
            if untagged in line:
                CP_untagged = entry
            elif 'changepoint' in lowered or 'peak' in lowered:
                CP_out.append(entry)
            # bam files for quantitation should be file,quant,A1
            elif 'quant' in line:
                fields = line.split(',')
                quant_bams[fields[-1].strip()] = fields[0]

    # Fail with a clear message instead of a NameError/IndexError later on.
    if CP_untagged is None:
        raise ValueError("No line in {0} contains the untagged sample name "
                         "{1!r}".format(config_file, untagged))
    if len(CP_out) < 2:
        raise ValueError("Expected at least two changepoint/peak files in "
                         "{0}, found {1}".format(config_file, len(CP_out)))

    # Split on the last '/' so that a prefix which also appears in a
    # directory name cannot truncate the output directory.
    directory, sep, file_name = config_file.rpartition('/')
    name = file_name.split('_config')[0]
    base_dir = (directory + sep) or './'
    prefix = base_dir + name
    # Single-argument print() behaves the same on Python 2 and Python 3.
    print("Output file location and prefix: " + prefix)

    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)

    peak_df = SP.peak_to_seq_pipeline(CP_untagged,
                                      CP_out[0],
                                      CP_out[1],
                                      gff3,
                                      fa_dict,
                                      name=name + '_CP_peaks')
    peak_df.to_pickle(prefix + '_all_peaks.pickle')

    quant_df = SP.quant_from_peak_df(peak_df, gff3, fa_dict, organism=organism)
    quant_df = SP.quantitate_junction_df(quant_bams,
                                         quant_df,
                                         gff3,
                                         organism=organism)

    quant_df.to_pickle(prefix + '_quantitation.pickle')
    quant_df.to_csv(prefix + '_quantitation.csv')

    SP.SP_pipeline_scatters(quant_df, base_dir, name)
Ejemplo n.º 3
0
def main():
    '''Usage: run SP_pipeline.py config_file untagged_sample_name organism
    config file : file that lists all branch, junction and peak files
    untagged_sample_name : prefix for untagged sample
    organism : pombe, crypto or cerevisiae

    Reads the three arguments from sys.argv. Intermediate and final tables
    are written next to the config file using the config file name up to
    "_config" as prefix; a step is skipped and its pickle reloaded when its
    output file already exists. Prints a message and returns None when the
    arguments or the config file are unusable.'''
    if len(sys.argv) < 4:
        # Show the usage text rather than failing with an IndexError.
        print(main.__doc__)
        return None

    config_file = sys.argv[1]
    untagged = sys.argv[2]

    junc_beds = []
    branch_bams = []
    CP_out = []
    CP_untagged = None
    quant_bams = {}

    # Read configuration file
    with open(config_file, 'r') as config:
        for line in config:
            entry = line.strip()
            lowered = line.lower()
            if 'junctions.bed' in lowered:
                junc_beds.append(entry)
            elif 'branch' in lowered:
                branch_bams.append(entry)
            elif untagged in line:
                CP_untagged = entry
            elif 'changepoint' in lowered or 'peak' in lowered:
                CP_out.append(entry)
            # bam files for quantitation should be file,quant,A1
            elif 'quant' in line:
                fields = line.split(',')
                quant_bams[fields[-1].strip()] = fields[0]

    # Split on the last '/' so that a prefix which also appears in a
    # directory name cannot truncate the output directory.
    directory, sep, file_name = config_file.rpartition('/')
    name = file_name.split('_config')[0]
    base_dir = (directory + sep) or './'
    prefix = base_dir + name
    # Single-argument print() behaves the same on Python 2 and Python 3.
    print("Output file location and prefix: " + prefix)

    print("\nJunction bed files")
    print(junc_beds)
    print("\nBranch bam files")

    if len(branch_bams) == 2:
        print(branch_bams)
        use_branches = True
    elif len(branch_bams) == 0:
        print("No data for branches, continuing with only junctions")
        use_branches = False
    else:
        # Any other count used to leave use_branches undefined.
        print("\n Error: expected 0 or 2 branch bam files, found " +
              str(len(branch_bams)))
        return None

    print("\nUntagged peaks")
    print(CP_untagged)
    print("\nChangepoint peaks")
    print(CP_out)
    print('')

    if CP_untagged is None:
        print("\n Error: no untagged file indicated")
        return None
    if len(CP_out) < 2:
        print("\n Error: expected at least two changepoint/peak files, found "
              + str(len(CP_out)))
        return None

    organism = sys.argv[3]
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)

    def output_exists(suffix):
        # True when <prefix><suffix> is already present in the output folder.
        return os.path.exists(prefix + suffix)

    #### Generate peak df
    # Only needed when at least one of the comparison tables must be built.
    if not (output_exists('_peaks_w_branch.csv')
            and output_exists('_peaks_w_junc.csv')):
        if not output_exists('_all_peaks.pickle'):
            peak_df = SP.peak_to_seq_pipeline(CP_untagged,
                                              CP_out[0],
                                              CP_out[1],
                                              gff3,
                                              fa_dict,
                                              name=name + '_CP_peaks')
            peak_df.to_pickle(prefix + '_all_peaks.pickle')
        else:
            peak_df = pd.read_pickle(prefix + '_all_peaks.pickle')

    #### Junction to peak comparison
    if not output_exists('_peaks_w_junc.csv'):
        print("Generating peaks vs. junctions dataframe...")
        peaks_w_junc = peak_junction_analysis(peak_df, junc_beds, gff3,
                                              fa_dict, organism, base_dir,
                                              name)
    else:
        # NOTE(review): existence is checked on the .csv but the .pickle is
        # loaded; assumes peak_junction_analysis writes both - confirm.
        peaks_w_junc = pd.read_pickle(prefix + '_peaks_w_junc.pickle')
        print("Peaks vs. junction dataframe already exists")

    #### Branch to peak comparison
    # NOTE(review): None stands in for the branch table when no branch data
    # was supplied, so the junction-only path no longer hits a NameError;
    # whether SP.make_quant_df / SP.find_score_branches_ppy accept None is
    # not visible here - confirm.
    peaks_w_branch = None
    if use_branches:
        if not output_exists('_peaks_w_branch.csv'):
            print("Generating peaks vs. branches dataframe...")
            peaks_w_branch = peak_branch_analysis(peak_df, branch_bams, gff3,
                                                  fa_dict, organism, base_dir,
                                                  name)
        else:
            peaks_w_branch = pd.read_pickle(prefix + '_peaks_w_branch.pickle')
            print("Peaks vs. branches dataframe already exists")

    #### Clean up dataframe for quantitation
    if not output_exists('_quantitation.csv'):
        quant_df, lariat_df = SP.make_quant_df(peaks_w_junc,
                                               peaks_w_branch,
                                               gff3,
                                               fa_dict,
                                               organism=organism)
        quant_df = SP.find_score_branches_ppy(quant_df, peaks_w_branch,
                                              fa_dict)
        print("Counting reads in transcripts and at peaks...")
        quant_df = SP.quantitate_junction_df(quant_bams,
                                             quant_df,
                                             gff3,
                                             organism=organism)

        quant_df.to_pickle(prefix + '_quantitation.pickle')
        quant_df.to_csv(prefix + '_quantitation.csv')
        lariat_df.to_pickle(prefix + '_lariats.pickle')
        lariat_df.to_csv(prefix + '_lariats.csv')
    else:
        quant_df = pd.read_pickle(prefix + '_quantitation.pickle')

    # The scatter plots are produced for fresh and reloaded tables alike.
    SP.SP_pipeline_scatters(quant_df, base_dir, name)

    print("\n****Finished****")
Ejemplo n.º 4
0
def main():
    '''Usage: run SP_pipeline.py config_file untagged_sample_name organism
    config file : file that lists all branch, junction and peak files
    untagged_sample_name : prefix for untagged sample
    organism : pombe, crypto or cerevisiae

    Reads the three arguments from sys.argv. Intermediate and final tables
    are written next to the config file using the config file name up to
    "_config" as prefix; a step is skipped and its pickle reloaded when its
    output file already exists. Prints a message and returns None when the
    arguments or the config file are unusable.'''
    if len(sys.argv) < 4:
        # Show the usage text rather than failing with an IndexError.
        print(main.__doc__)
        return None

    config_file = sys.argv[1]
    untagged = sys.argv[2]

    junc_beds = []
    branch_bams = []
    CP_out = []
    CP_untagged = None
    quant_bams = {}

    # Read configuration file
    with open(config_file, 'r') as config:
        for line in config:
            entry = line.strip()
            lowered = line.lower()
            if 'junctions.bed' in lowered:
                junc_beds.append(entry)
            elif 'branch' in lowered:
                branch_bams.append(entry)
            elif untagged in line:
                CP_untagged = entry
            elif 'changepoint' in lowered or 'peak' in lowered:
                CP_out.append(entry)
            # bam files for quantitation should be file,quant,A1
            elif 'quant' in line:
                fields = line.split(',')
                quant_bams[fields[-1].strip()] = fields[0]

    # Split on the last '/' so that a prefix which also appears in a
    # directory name cannot truncate the output directory.
    directory, sep, file_name = config_file.rpartition('/')
    name = file_name.split('_config')[0]
    base_dir = (directory + sep) or './'
    prefix = base_dir + name
    # Single-argument print() behaves the same on Python 2 and Python 3.
    print("Output file location and prefix: " + prefix)

    print("\nJunction bed files")
    print(junc_beds)
    print("\nBranch bam files")

    if len(branch_bams) == 2:
        print(branch_bams)
        use_branches = True
    elif len(branch_bams) == 0:
        print("No data for branches, continuing with only junctions")
        use_branches = False
    else:
        # Any other count used to leave use_branches undefined.
        print("\n Error: expected 0 or 2 branch bam files, found " +
              str(len(branch_bams)))
        return None

    print("\nUntagged peaks")
    print(CP_untagged)
    print("\nChangepoint peaks")
    print(CP_out)
    print('')

    if CP_untagged is None:
        print("\n Error: no untagged file indicated")
        return None
    if len(CP_out) < 2:
        print("\n Error: expected at least two changepoint/peak files, found "
              + str(len(CP_out)))
        return None

    organism = sys.argv[3]
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)

    def output_exists(suffix):
        # True when <prefix><suffix> is already present in the output folder.
        return os.path.exists(prefix + suffix)

    #### Generate peak df
    # Only needed when at least one of the comparison tables must be built.
    if not (output_exists('_peaks_w_branch.csv')
            and output_exists('_peaks_w_junc.csv')):
        if not output_exists('_all_peaks.pickle'):
            peak_df = SP.peak_to_seq_pipeline(CP_untagged, CP_out[0],
                                              CP_out[1], gff3, fa_dict,
                                              name=name + '_CP_peaks')
            peak_df.to_pickle(prefix + '_all_peaks.pickle')
        else:
            peak_df = pd.read_pickle(prefix + '_all_peaks.pickle')

    #### Junction to peak comparison
    if not output_exists('_peaks_w_junc.csv'):
        print("Generating peaks vs. junctions dataframe...")
        peaks_w_junc = peak_junction_analysis(peak_df, junc_beds, gff3,
                                              fa_dict, organism, base_dir,
                                              name)
    else:
        # NOTE(review): existence is checked on the .csv but the .pickle is
        # loaded; assumes peak_junction_analysis writes both - confirm.
        peaks_w_junc = pd.read_pickle(prefix + '_peaks_w_junc.pickle')
        print("Peaks vs. junction dataframe already exists")

    #### Branch to peak comparison
    # NOTE(review): None stands in for the branch table when no branch data
    # was supplied, so the junction-only path no longer hits a NameError;
    # whether SP.make_quant_df / SP.find_score_branches_ppy accept None is
    # not visible here - confirm.
    peaks_w_branch = None
    if use_branches:
        if not output_exists('_peaks_w_branch.csv'):
            print("Generating peaks vs. branches dataframe...")
            peaks_w_branch = peak_branch_analysis(peak_df, branch_bams, gff3,
                                                  fa_dict, organism, base_dir,
                                                  name)
        else:
            peaks_w_branch = pd.read_pickle(prefix + '_peaks_w_branch.pickle')
            print("Peaks vs. branches dataframe already exists")

    #### Clean up dataframe for quantitation
    if not output_exists('_quantitation.csv'):
        quant_df, lariat_df = SP.make_quant_df(peaks_w_junc, peaks_w_branch,
                                               gff3, fa_dict,
                                               organism=organism)
        quant_df = SP.find_score_branches_ppy(quant_df, peaks_w_branch,
                                              fa_dict)
        print("Counting reads in transcripts and at peaks...")
        quant_df = SP.quantitate_junction_df(quant_bams, quant_df, gff3,
                                             organism=organism)

        quant_df.to_pickle(prefix + '_quantitation.pickle')
        quant_df.to_csv(prefix + '_quantitation.csv')
        lariat_df.to_pickle(prefix + '_lariats.pickle')
        lariat_df.to_csv(prefix + '_lariats.csv')
    else:
        quant_df = pd.read_pickle(prefix + '_quantitation.pickle')

    # The scatter plots are produced for fresh and reloaded tables alike.
    SP.SP_pipeline_scatters(quant_df, base_dir, name)

    print("\n****Finished****")
Ejemplo n.º 5
0
def _run_shell(command):
    '''Run a shell command string and raise RuntimeError on a non-zero exit.'''
    status = call(command, shell=True)
    if status != 0:
        raise RuntimeError("Command failed with exit status {0}: {1}".format(
            status, command))


def _bam_to_fastq(bam_path):
    '''Convert bam_path to fastq with bamToFastq; return the fastq path.'''
    import os
    fastq_path = os.path.splitext(bam_path)[0] + '.fq'
    # An argument list is required when no shell is involved; it also keeps
    # paths containing spaces intact.
    status = call(['bamToFastq', '-i', bam_path, '-fq', fastq_path])
    if status != 0:
        raise RuntimeError("bamToFastq failed with exit status {0} for "
                           "{1}".format(status, bam_path))
    return fastq_path


def _align_and_index(prefix, threads, bowtie_index):
    '''Align <prefix>_split.fa with bowtie, then convert the sam output to a
    sorted, indexed bam (<prefix>_sorted.bam).'''
    print("Aligning split reads to the genome with Bowtie")
    _run_shell('bowtie -p{0} -v1 -M1 --best {1} -f {2}_split.fa --sam '
               '{2}.sam'.format(threads, bowtie_index, prefix))

    # sort and index
    print("Sorting and indexing bam file")
    _run_shell('samtools view -Sbo {0}.bam {0}.sam'.format(prefix))
    _run_shell('samtools sort {0}.bam -o {0}_sorted.bam'.format(prefix))
    _run_shell('samtools index {0}_sorted.bam'.format(prefix))


def main():
    '''Usage: run SPBranch.py unmapped1 unmapped2 threads organism [config_file] [untagged]

    Parameters
    -----------
    unmapped1 : bam or fastq file of unmapped reads from tophat or bowtie
    unmapped2 : bam or fastq file of unmapped reads from tophat or bowtie
    threads : number of processors to use
    organism : 'pombe' or 'crypto'
    config_file : if using peaks to call - list of changepoint output file names and where to find them
    untagged : untagged sample name (must be in file name)

    Output
    ------
    bam files with aligned reads. Will be interpreted by SP_pipeline.

    Arguments are read from sys.argv. Peak-based calling is enabled only
    when both config_file and untagged are given. Prints a message and
    returns None when the arguments or the config file are unusable; raises
    RuntimeError when an external tool exits with a non-zero status.
    '''
    import shutil

    if len(sys.argv) < 5:
        # Show the usage text rather than failing with an IndexError.
        # Single-argument print() behaves the same on Python 2 and Python 3.
        print(main.__doc__)
        return None

    unmapped1 = sys.argv[1]
    unmapped2 = sys.argv[2]
    threads = int(sys.argv[3])
    organism = sys.argv[4]

    # Read the optional peak configuration before any expensive step so that
    # a bad config is reported immediately.
    peaks = len(sys.argv) == 7
    CP_untagged = None
    CP_out = []
    if peaks:
        with open(sys.argv[5], 'r') as config:
            for line in config:
                if sys.argv[6] in line:
                    CP_untagged = line.strip()
                elif 'changepoint' in line.lower() or 'peak' in line.lower():
                    CP_out.append(line.strip())
        if CP_untagged is None:
            print("\n Error: no untagged file indicated")
            return None
        if len(CP_out) < 2:
            print("\n Error: expected at least two changepoint/peak files, "
                  "found " + str(len(CP_out)))
            return None

    if unmapped1.endswith('bam'):
        unmapped1 = _bam_to_fastq(unmapped1)
    if unmapped2.endswith('bam'):
        unmapped2 = _bam_to_fastq(unmapped2)

    # Concatenate both read files without going through the shell, so file
    # names are never interpreted as shell syntax.
    with open('unmapped_all.fq', 'wb') as merged:
        for reads_path in (unmapped1, unmapped2):
            with open(reads_path, 'rb') as reads:
                shutil.copyfileobj(reads, merged)

    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)

    if peaks:
        peak_df = SP.peak_to_seq_pipeline(CP_untagged, CP_out[0], CP_out[1],
                                          gff3, fa_dict, name='CP_peaks')

    ann_seqs = collect_intron_seq(gff3, fa_dict)

    print("Finding unaligned reads with annotated 5' splice sites")
    find_split_reads('unmapped_all.fq', ann_seqs, 'Ann_branches',
                     threads=threads)
    _align_and_index('Ann_branches', threads, bowtie_index)

    if peaks:
        print("Finding unaligned reads with unpredicted splicing events")
        peak_seqs = collect_intron_seq(gff3, fa_dict, peak_df=peak_df)
        find_split_reads('Ann_branches_unsplit.fa', peak_seqs,
                         'Peak_branches', threads=threads)
        _align_and_index('Peak_branches', threads, bowtie_index)
Ejemplo n.º 6
0
def _run_shell(command):
    '''Run a shell command string and raise RuntimeError on a non-zero exit.'''
    status = call(command, shell=True)
    if status != 0:
        raise RuntimeError("Command failed with exit status {0}: {1}".format(
            status, command))


def _bam_to_fastq(bam_path):
    '''Convert bam_path to fastq with bamToFastq; return the fastq path.'''
    import os
    fastq_path = os.path.splitext(bam_path)[0] + '.fq'
    # An argument list is required when no shell is involved; it also keeps
    # paths containing spaces intact.
    status = call(['bamToFastq', '-i', bam_path, '-fq', fastq_path])
    if status != 0:
        raise RuntimeError("bamToFastq failed with exit status {0} for "
                           "{1}".format(status, bam_path))
    return fastq_path


def _align_and_index(prefix, threads, bowtie_index):
    '''Align <prefix>_split.fa with bowtie, then convert the sam output to a
    sorted, indexed bam (<prefix>_sorted.bam).'''
    print("Aligning split reads to the genome with Bowtie")
    _run_shell('bowtie -p{0} -v1 -M1 --best {1} -f {2}_split.fa --sam '
               '{2}.sam'.format(threads, bowtie_index, prefix))

    # sort and index
    print("Sorting and indexing bam file")
    _run_shell('samtools view -Sbo {0}.bam {0}.sam'.format(prefix))
    _run_shell('samtools sort {0}.bam -o {0}_sorted.bam'.format(prefix))
    _run_shell('samtools index {0}_sorted.bam'.format(prefix))


def main():
    '''Usage: run SPBranch.py unmapped1 unmapped2 threads organism [config_file] [untagged]

    Parameters
    -----------
    unmapped1 : bam or fastq file of unmapped reads from tophat or bowtie
    unmapped2 : bam or fastq file of unmapped reads from tophat or bowtie
    threads : number of processors to use
    organism : 'pombe' or 'crypto'
    config_file : if using peaks to call - list of changepoint output file names and where to find them
    untagged : untagged sample name (must be in file name)

    Output
    ------
    bam files with aligned reads. Will be interpreted by SP_pipeline.

    Arguments are read from sys.argv. Peak-based calling is enabled only
    when both config_file and untagged are given. Prints a message and
    returns None when the arguments or the config file are unusable; raises
    RuntimeError when an external tool exits with a non-zero status.
    '''
    import shutil

    if len(sys.argv) < 5:
        # Show the usage text rather than failing with an IndexError.
        # Single-argument print() behaves the same on Python 2 and Python 3.
        print(main.__doc__)
        return None

    unmapped1 = sys.argv[1]
    unmapped2 = sys.argv[2]
    threads = int(sys.argv[3])
    organism = sys.argv[4]

    # Read the optional peak configuration before any expensive step so that
    # a bad config is reported immediately.
    peaks = len(sys.argv) == 7
    CP_untagged = None
    CP_out = []
    if peaks:
        with open(sys.argv[5], 'r') as config:
            for line in config:
                if sys.argv[6] in line:
                    CP_untagged = line.strip()
                elif 'changepoint' in line.lower() or 'peak' in line.lower():
                    CP_out.append(line.strip())
        if CP_untagged is None:
            print("\n Error: no untagged file indicated")
            return None
        if len(CP_out) < 2:
            print("\n Error: expected at least two changepoint/peak files, "
                  "found " + str(len(CP_out)))
            return None

    if unmapped1.endswith('bam'):
        unmapped1 = _bam_to_fastq(unmapped1)
    if unmapped2.endswith('bam'):
        unmapped2 = _bam_to_fastq(unmapped2)

    # Concatenate both read files without going through the shell, so file
    # names are never interpreted as shell syntax.
    with open('unmapped_all.fq', 'wb') as merged:
        for reads_path in (unmapped1, unmapped2):
            with open(reads_path, 'rb') as reads:
                shutil.copyfileobj(reads, merged)

    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)

    if peaks:
        peak_df = SP.peak_to_seq_pipeline(CP_untagged,
                                          CP_out[0],
                                          CP_out[1],
                                          gff3,
                                          fa_dict,
                                          name='CP_peaks')

    ann_seqs = collect_intron_seq(gff3, fa_dict)

    print("Finding unaligned reads with annotated 5' splice sites")
    find_split_reads('unmapped_all.fq',
                     ann_seqs,
                     'Ann_branches',
                     threads=threads)
    _align_and_index('Ann_branches', threads, bowtie_index)

    if peaks:
        print("Finding unaligned reads with unpredicted splicing events")
        peak_seqs = collect_intron_seq(gff3, fa_dict, peak_df=peak_df)
        find_split_reads('Ann_branches_unsplit.fa',
                         peak_seqs,
                         'Peak_branches',
                         threads=threads)
        _align_and_index('Peak_branches', threads, bowtie_index)