Esempio n. 1
0
def peaks_only(config_file, untagged, organism):
    '''Build the peak and quantitation tables from a config file.

    config_file : path to a text file with one input per line. Its file name
        up to '_config' is used as the prefix of every output file, and the
        outputs are written to the directory that holds it.
    untagged : text identifying the untagged sample; the (last) config line
        containing it is used as the untagged peak file.
    organism : organism identifier handed to SP.find_organism_files
        (presumably pombe, crypto or cerevisiae -- confirm against SP).

    Writes <prefix>_all_peaks.pickle, <prefix>_quantitation.pickle and
    <prefix>_quantitation.csv, then calls SP.SP_pipeline_scatters.
    Returns None. If the config lacks the untagged line or has fewer than
    two changepoint/peak lines, an error is printed and nothing is run.
    '''
    # Start from None so a missing untagged line is reported cleanly instead
    # of surfacing later as an UnboundLocalError.
    CP_untagged = None
    CP_out = []
    quant_bams = {}
    with open(config_file, 'r') as config:
        for line in config:
            entry = line.strip()
            lowered = line.lower()
            if untagged in line:
                CP_untagged = entry
            elif 'changepoint' in lowered or 'peak' in lowered:
                CP_out.append(entry)
            # bam files for quantitation should be file,quant,A1
            elif 'quant' in line:
                fields = line.split(',')
                quant_bams[fields[-1].strip()] = fields[0]

    # Split on the LAST '/' only. Splitting the whole path on the sample name
    # cut the directory short whenever the name also appeared in a folder
    # name (e.g. 'exp1/exp1_config.txt').
    directory, slash, file_name = config_file.rpartition('/')
    name = file_name.split('_config')[0]
    base_dir = directory + slash
    if base_dir == '':
        base_dir = './'
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Output file location and prefix: " + base_dir + name)

    if CP_untagged is None:
        print("\n Error: no untagged file indicated")
        return None
    if len(CP_out) < 2:
        print("\n Error: two changepoint/peak files are required, found "
              + str(len(CP_out)))
        return None

    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)

    peak_df = SP.peak_to_seq_pipeline(CP_untagged, CP_out[0], CP_out[1],
                                      gff3, fa_dict, name=name + '_CP_peaks')
    peak_df.to_pickle(base_dir + name + '_all_peaks.pickle')

    quant_df = SP.quant_from_peak_df(peak_df, gff3, fa_dict, organism=organism)
    quant_df = SP.quantitate_junction_df(quant_bams, quant_df, gff3,
                                         organism=organism)

    quant_df.to_pickle(base_dir + name + '_quantitation.pickle')
    quant_df.to_csv(base_dir + name + '_quantitation.csv')

    SP.SP_pipeline_scatters(quant_df, base_dir, name)
    
Esempio n. 2
0
def _parse_peaks_config(config_file, untagged):
    '''Return (CP_untagged, CP_out, quant_bams) read from config_file.

    CP_untagged is the stripped last line containing `untagged`, or None
    when no line does. CP_out lists the remaining stripped lines that
    mention 'changepoint' or 'peak' (case-insensitive). quant_bams maps the
    last comma-separated field of each remaining 'quant' line to its first.
    '''
    CP_untagged = None
    CP_out = []
    quant_bams = {}
    with open(config_file, 'r') as config:
        for line in config:
            lowered = line.lower()
            if untagged in line:
                CP_untagged = line.strip()
            elif 'changepoint' in lowered or 'peak' in lowered:
                CP_out.append(line.strip())
            # bam files for quantitation should be file,quant,A1
            elif 'quant' in line:
                fields = line.split(',')
                quant_bams[fields[-1].strip()] = fields[0]
    return CP_untagged, CP_out, quant_bams


def _output_location(config_file):
    '''Return (base_dir, name): the config's directory and output prefix.'''
    # Only the final '/' separates directory from file name; the sample name
    # may legitimately occur earlier in the path.
    directory, slash, file_name = config_file.rpartition('/')
    base_dir = directory + slash
    if base_dir == '':
        base_dir = './'
    return base_dir, file_name.split('_config')[0]


def peaks_only(config_file, untagged, organism):
    '''Run peak calling and quantitation without junction/branch analysis.

    config_file : path to the text file listing the input files, one per
        line; outputs are written beside it, prefixed with its file name up
        to '_config'.
    untagged : text identifying the line that names the untagged sample.
    organism : organism identifier passed to SP.find_organism_files.

    Writes <prefix>_all_peaks.pickle, <prefix>_quantitation.pickle and
    <prefix>_quantitation.csv and calls SP.SP_pipeline_scatters.
    Returns None. Prints an error and stops early when the untagged line is
    missing or fewer than two changepoint/peak files are listed.
    '''
    CP_untagged, CP_out, quant_bams = _parse_peaks_config(config_file,
                                                          untagged)
    base_dir, name = _output_location(config_file)
    prefix = base_dir + name
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Output file location and prefix: " + prefix)

    # Validate before any expensive work; previously a missing untagged line
    # raised UnboundLocalError and a short CP_out raised IndexError.
    if CP_untagged is None:
        print("\n Error: no untagged file indicated")
        return None
    if len(CP_out) < 2:
        print("\n Error: two changepoint/peak files are required, found "
              + str(len(CP_out)))
        return None

    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)

    peak_df = SP.peak_to_seq_pipeline(CP_untagged,
                                      CP_out[0],
                                      CP_out[1],
                                      gff3,
                                      fa_dict,
                                      name=name + '_CP_peaks')
    peak_df.to_pickle(prefix + '_all_peaks.pickle')

    quant_df = SP.quant_from_peak_df(peak_df, gff3, fa_dict, organism=organism)
    quant_df = SP.quantitate_junction_df(quant_bams,
                                         quant_df,
                                         gff3,
                                         organism=organism)

    quant_df.to_pickle(prefix + '_quantitation.pickle')
    quant_df.to_csv(prefix + '_quantitation.csv')

    SP.SP_pipeline_scatters(quant_df, base_dir, name)
Esempio n. 3
0
def _read_pipeline_config(config_path, untagged_name):
    '''Sort the config file's lines into the input groups main() uses.

    Returns (junc_beds, branch_bams, CP_untagged, CP_out, quant_bams).
    Each line is tested in this order and lands in the first group it
    matches: 'junctions.bed', 'branch' (both case-insensitive), the untagged
    sample name, 'changepoint'/'peak' (case-insensitive), then 'quant'.
    '''
    junc_beds = []
    branch_bams = []
    CP_out = []
    CP_untagged = None
    quant_bams = {}
    with open(config_path, 'r') as config:
        for line in config:
            lowered = line.lower()
            entry = line.strip()
            if 'junctions.bed' in lowered:
                junc_beds.append(entry)
            elif 'branch' in lowered:
                branch_bams.append(entry)
            elif untagged_name in line:
                CP_untagged = entry
            elif 'changepoint' in lowered or 'peak' in lowered:
                CP_out.append(entry)
            # bam files for quantitation should be file,quant,A1
            elif 'quant' in line:
                fields = line.split(',')
                quant_bams[fields[-1].strip()] = fields[0]
    return junc_beds, branch_bams, CP_untagged, CP_out, quant_bams


def _config_prefix(config_path):
    '''Return (base_dir, name): the config's directory and output prefix.'''
    # Split on the last '/' only; splitting the whole path on the sample
    # name cut the directory short when a folder shared that name.
    directory, slash, file_name = config_path.rpartition('/')
    base_dir = directory + slash
    if base_dir == '':
        base_dir = './'
    return base_dir, file_name.split('_config')[0]


def main():
    '''Usage: run SP_pipeline.py config_file untagged_sample_name organism
    config file : file that lists all branch, junction and peak files
    untagged_sample_name : prefix for untagged sample
    organism : pombe, crypto or cerevisiae

    Arguments are read from sys.argv. Intermediate and final tables are
    cached next to the config file and reused on later runs. Returns None;
    on bad input an error is printed and the run stops early.'''
    if len(sys.argv) < 4:
        print("Usage: run SP_pipeline.py config_file untagged_sample_name "
              "organism")
        return None
    config_path, untagged_name, organism = sys.argv[1:4]

    # Read configuration file
    (junc_beds, branch_bams, CP_untagged,
     CP_out, quant_bams) = _read_pipeline_config(config_path, untagged_name)

    base_dir, name = _config_prefix(config_path)
    prefix = base_dir + name
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Output file location and prefix: " + prefix)

    print("\nJunction bed files")
    print(junc_beds)
    print("\nBranch bam files")

    if len(branch_bams) == 2:
        print(branch_bams)
        use_branches = True
    elif len(branch_bams) == 0:
        print("No data for branches, continuing with only junctions")
        use_branches = False
    else:
        # Previously use_branches stayed unbound here and the run crashed
        # only after the expensive peak and junction steps.
        print(branch_bams)
        print("\n Error: expected 0 or 2 branch bam files, found "
              + str(len(branch_bams)))
        return None

    print("\nUntagged peaks")
    print(CP_untagged)
    print("\nChangepoint peaks")
    print(CP_out)
    print('')

    if CP_untagged is None:
        print("\n Error: no untagged file indicated")
        return None

    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)

    def cached(suffix):
        # True when <prefix><suffix> was left behind by an earlier run.
        return os.path.exists(prefix + suffix)

    #### Generate peak df
    if not cached('_peaks_w_branch.csv') or not cached('_peaks_w_junc.csv'):
        if not cached('_all_peaks.pickle'):
            if len(CP_out) < 2:
                print("\n Error: two changepoint/peak files are required, "
                      "found " + str(len(CP_out)))
                return None
            peak_df = SP.peak_to_seq_pipeline(CP_untagged,
                                              CP_out[0],
                                              CP_out[1],
                                              gff3,
                                              fa_dict,
                                              name=name + '_CP_peaks')
            peak_df.to_pickle(prefix + '_all_peaks.pickle')
        else:
            peak_df = pd.read_pickle(prefix + '_all_peaks.pickle')

    #### Junction to peak comparison
    if not cached('_peaks_w_junc.csv'):
        print("Generating peaks vs. junctions dataframe...")
        peaks_w_junc = peak_junction_analysis(peak_df, junc_beds, gff3,
                                              fa_dict, organism, base_dir,
                                              name)
    else:
        # The .csv marks the step as done; the .pickle is presumably written
        # with it by peak_junction_analysis -- confirm.
        peaks_w_junc = pd.read_pickle(prefix + '_peaks_w_junc.pickle')
        print("Peaks vs. junction dataframe already exists")

    #### Branch to peak comparison
    # NOTE(review): without branch data this stays None so the steps below no
    # longer hit a NameError; confirm that SP.make_quant_df and
    # SP.find_score_branches_ppy accept None for the branch dataframe.
    peaks_w_branch = None
    if use_branches:
        if not cached('_peaks_w_branch.csv'):
            print("Generating peaks vs. branches dataframe...")
            peaks_w_branch = peak_branch_analysis(peak_df, branch_bams, gff3,
                                                  fa_dict, organism, base_dir,
                                                  name)
        else:
            peaks_w_branch = pd.read_pickle(prefix + '_peaks_w_branch.pickle')
            print("Peaks vs. branches dataframe already exists")

    #### Clean up dataframe for quantitation
    if not cached('_quantitation.csv'):
        quant_df, lariat_df = SP.make_quant_df(peaks_w_junc,
                                               peaks_w_branch,
                                               gff3,
                                               fa_dict,
                                               organism=organism)
        quant_df = SP.find_score_branches_ppy(quant_df, peaks_w_branch,
                                              fa_dict)
        print("Counting reads in transcripts and at peaks...")
        quant_df = SP.quantitate_junction_df(quant_bams,
                                             quant_df,
                                             gff3,
                                             organism=organism)

        quant_df.to_pickle(prefix + '_quantitation.pickle')
        quant_df.to_csv(prefix + '_quantitation.csv')
        lariat_df.to_pickle(prefix + '_lariats.pickle')
        lariat_df.to_csv(prefix + '_lariats.csv')
    else:
        quant_df = pd.read_pickle(prefix + '_quantitation.pickle')

    # Scatter plots are produced whether the table was built or reloaded.
    SP.SP_pipeline_scatters(quant_df, base_dir, name)

    print("\n****Finished****")
Esempio n. 4
0
def main():
    '''Usage: run SP_pipeline.py config_file untagged_sample_name organism
    config file : file that lists all branch, junction and peak files
    untagged_sample_name : prefix for untagged sample
    organism : pombe, crypto or cerevisiae

    Reads its three arguments from sys.argv. Each stage writes its table
    next to the config file and is skipped on later runs when that output
    already exists. Returns None; invalid input prints an error and stops
    the run before any analysis starts.'''
    if len(sys.argv) < 4:
        print("Usage: run SP_pipeline.py config_file untagged_sample_name organism")
        return None

    junc_beds = []
    branch_bams = []
    CP_out = []
    CP_untagged = None
    quant_bams = {}

    # Read configuration file
    with open(sys.argv[1], 'r') as config:
        for line in config:
            lowered = line.lower()
            if 'junctions.bed' in lowered:
                junc_beds.append(line.strip())
            elif 'branch' in lowered:
                branch_bams.append(line.strip())
            elif sys.argv[2] in line:
                CP_untagged = line.strip()
            elif 'changepoint' in lowered or 'peak' in lowered:
                CP_out.append(line.strip())
            #bam files for quantitation should be file,quant,A1
            elif 'quant' in line:
                fields = line.split(',')
                quant_bams[fields[-1].strip()] = fields[0]

    # Everything up to and including the last '/' is the output directory.
    # Splitting the path on the sample name (as before) returned a truncated
    # directory whenever a folder in the path contained that name.
    directory, slash, file_name = sys.argv[1].rpartition('/')
    name = file_name.split('_config')[0]
    base_dir = directory + slash
    if base_dir == '':
        base_dir = './'
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Output file location and prefix: " + base_dir + name)

    print("\nJunction bed files")
    print(junc_beds)
    print("\nBranch bam files")

    if len(branch_bams) == 2:
        print(branch_bams)
        use_branches = True
    elif len(branch_bams) == 0:
        print("No data for branches, continuing with only junctions")
        use_branches = False
    else:
        # Any other count used to leave use_branches undefined and crash
        # much later, after the peak and junction stages had already run.
        print(branch_bams)
        print("\n Error: expected 0 or 2 branch bam files, found " + str(len(branch_bams)))
        return None

    print("\nUntagged peaks")
    print(CP_untagged)
    print("\nChangepoint peaks")
    print(CP_out)
    print('')

    if CP_untagged is None:
        print("\n Error: no untagged file indicated")
        return None

    organism = sys.argv[3]
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)

    # Paths of the cached outputs that decide which stages can be skipped.
    all_peaks_pickle = base_dir + name + '_all_peaks.pickle'
    junc_csv = base_dir + name + '_peaks_w_junc.csv'
    branch_csv = base_dir + name + '_peaks_w_branch.csv'
    quant_csv = base_dir + name + '_quantitation.csv'
    quant_pickle = base_dir + name + '_quantitation.pickle'

    #### Generate peak df
    if not os.path.exists(branch_csv) or not os.path.exists(junc_csv):
        if not os.path.exists(all_peaks_pickle):
            if len(CP_out) < 2:
                print("\n Error: two changepoint/peak files are required, found " + str(len(CP_out)))
                return None
            peak_df = SP.peak_to_seq_pipeline(CP_untagged, CP_out[0], CP_out[1], gff3, fa_dict, name=name + '_CP_peaks')
            peak_df.to_pickle(all_peaks_pickle)
        else:
            peak_df = pd.read_pickle(all_peaks_pickle)

    #### Junction to peak comparison
    if not os.path.exists(junc_csv):
        print("Generating peaks vs. junctions dataframe...")
        peaks_w_junc = peak_junction_analysis(peak_df, junc_beds, gff3, fa_dict, organism, base_dir, name)
    else:
        # The .csv is the "done" marker; the matching .pickle is presumably
        # written alongside it by peak_junction_analysis -- confirm.
        peaks_w_junc = pd.read_pickle(base_dir + name + '_peaks_w_junc.pickle')
        print("Peaks vs. junction dataframe already exists")

    #### Branch to peak comparison
    # NOTE(review): None stands in for "no branch data" so the quantitation
    # stage no longer fails with a NameError; verify that SP.make_quant_df
    # and SP.find_score_branches_ppy handle a None branch dataframe.
    peaks_w_branch = None
    if use_branches:
        if not os.path.exists(branch_csv):
            print("Generating peaks vs. branches dataframe...")
            peaks_w_branch = peak_branch_analysis(peak_df, branch_bams, gff3, fa_dict, organism, base_dir, name)
        else:
            peaks_w_branch = pd.read_pickle(base_dir + name + '_peaks_w_branch.pickle')
            print("Peaks vs. branches dataframe already exists")

    #### Clean up dataframe for quantitation
    if not os.path.exists(quant_csv):
        quant_df, lariat_df = SP.make_quant_df(peaks_w_junc, peaks_w_branch, gff3, fa_dict, organism=organism)
        quant_df = SP.find_score_branches_ppy(quant_df, peaks_w_branch, fa_dict)
        print("Counting reads in transcripts and at peaks...")
        quant_df = SP.quantitate_junction_df(quant_bams, quant_df, gff3, organism=organism)

        quant_df.to_pickle(quant_pickle)
        quant_df.to_csv(quant_csv)
        lariat_df.to_pickle(base_dir + name + '_lariats.pickle')
        lariat_df.to_csv(base_dir + name + '_lariats.csv')
    else:
        quant_df = pd.read_pickle(quant_pickle)

    # Plots are drawn for both a freshly built and a reloaded table.
    SP.SP_pipeline_scatters(quant_df, base_dir, name)

    print("\n****Finished****")