コード例 #1
0
def _simulate_test_data(setsize=50):
    """Draw the simulated dataset used by the debug mode of main().

    Two blocks of 500 rows and ``2 * setsize`` columns are drawn with
    ``nbinom.rvs``: an event count block whose first 10 rows use a different
    parameter in the second half of the columns, and a gene expression block
    whose first 20 rows use a different parameter in the first half. Both
    blocks are placed side by side, scaled by 10000 and multiplied by random
    size factors drawn with ``npr.uniform``.

    The order of the random draws is part of the simulation; main() seeds
    ``npr`` right before calling this helper.

    Args:
        setsize: number of simulated samples per condition.

    Returns:
        Tuple ``(cov, dmatrix0, dmatrix1, sf)`` ready to be handed to
        run_testing().
    """
    n_samples = 2 * setsize

    ### diff event counts
    cov = sp.zeros((500, n_samples), dtype='int')
    for i in range(10):
        cov[i, :setsize] = nbinom.rvs(30, 0.8, size=setsize)
        cov[i, setsize:] = nbinom.rvs(10, 0.8, size=setsize)
    for i in range(10, cov.shape[0]):
        cov[i, :] = nbinom.rvs(30, 0.8, size=n_samples)

    ### diff gene expression
    cov2 = sp.zeros((500, n_samples), dtype='int')
    for i in range(20):
        cov2[i, :setsize] = nbinom.rvs(2000, 0.2, size=setsize)
        cov2[i, setsize:] = nbinom.rvs(2000, 0.3, size=setsize)
    for i in range(20, cov2.shape[0]):
        cov2[i, :] = nbinom.rvs(2000, 0.3, size=n_samples)

    ### event columns first, gene expression columns second
    cov = sp.c_[cov, cov2] * 10000

    tidx = sp.arange(setsize)

    ### the same size factors are used for the event and the gene columns
    sf = npr.uniform(0, 5, n_samples)
    sf = sp.r_[sf, sf]

    dmatrix1 = sp.zeros((cov.shape[1], 4), dtype='float')
    dmatrix1[:, 0] = 1
    dmatrix1[tidx, 1] = 1
    dmatrix1[tidx + n_samples, 2] = 1
    dmatrix1[n_samples:, 3] = 1
    ### the null model drops column 1 of the full model
    dmatrix0 = dmatrix1[:, [0, 2, 3]]

    cov = cov * sf

    return cov, dmatrix0, dmatrix1, sf


def _load_expression_counts(fname, verbose):
    """Read raw counts, strain names and gene ids from an expression HDF5 file.

    Args:
        fname: path of the HDF5 file holding the datasets ``raw_count``,
            ``strains`` and ``genes``.
        verbose: if true, announce the file being loaded on stdout.

    Returns:
        Tuple ``(gene_counts, gene_strains, gene_ids)`` with the full content
        of the three datasets.
    """
    if verbose:
        print('Loading expression counts from %s' % fname)
    ### the context manager closes the file even if a dataset is missing
    with h5py.File(fname, 'r') as IN:
        gene_counts = IN['raw_count'][:]
        gene_strains = IN['strains'][:]
        gene_ids = IN['genes'][:]
    return gene_counts, gene_strains, gene_ids


def main():
    """Entry point of the differential testing script.

    Parses the command line, creates the output directory and then either

    * (``CFG['debug']``) runs the test on a simulated dataset and drops into
      the debugger, or
    * loads (or computes) gene expression counts, and for every event type in
      ``CFG['event_types']`` quantifies the events, filters them for coverage,
      runs the test and writes ``test_results_C<confidence>_<event_type>.tsv``
      into the output directory.

    Takes no arguments (options are read from ``sys.argv``) and returns None.
    """

    ### get command line options
    options = parse_options(sys.argv)

    ### parse parameters from options object
    CFG = settings.parse_args(options, identity='test')
    CFG['use_exon_counts'] = False

    ### generate output directory
    outdir = os.path.join(options.outdir, 'testing')
    if options.timestamp == 'y':
        outdir = '%s_%s' % (outdir, str(datetime.datetime.now()).replace(
            ' ', '_'))
    if CFG['diagnose_plots']:
        CFG['plot_dir'] = os.path.join(options.outdir, 'plots')
        if not os.path.exists(CFG['plot_dir']):
            os.makedirs(CFG['plot_dir'])

    ### the label suffix is only added if both labels differ from their defaults
    if options.labelA != 'condA' and options.labelB != 'condB':
        outdir = '%s_%s_vs_%s' % (outdir, options.labelA, options.labelB)
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    if CFG['debug']:

        print('Generating simulated dataset')

        npr.seed(23)
        CFG['is_matlab'] = False

        cov, dmatrix0, dmatrix1, sf = _simulate_test_data(setsize=50)

        pvals = run_testing(cov, dmatrix0, dmatrix1, sf, CFG)
        pvals_adj = adj_pval(pvals, CFG)
        ### debug mode deliberately ends in the debugger to inspect the results
        pdb.set_trace()
        return

    val_tag = '.validated' if CFG['validate_splicegraphs'] else ''

    ### all graph related files share the same directory and name stem
    spladder_dir = os.path.join(CFG['out_dirname'], 'spladder')
    graph_base = 'genes_graph_conf%i.%s%s' % (
        CFG['confidence_level'], CFG['merge_strategy'], val_tag)

    if CFG['is_matlab']:
        CFG['fname_genes'] = os.path.join(spladder_dir, graph_base + '.mat')
        CFG['fname_count_in'] = os.path.join(spladder_dir,
                                             graph_base + '.count.mat')
    else:
        CFG['fname_genes'] = os.path.join(spladder_dir, graph_base + '.pickle')
        CFG['fname_count_in'] = os.path.join(spladder_dir,
                                             graph_base + '.count.hdf5')

    ### prefer the expression file over all samples; only if it is missing and
    ### sample subsetting was requested, switch to the subset specific file
    condition_strains = None
    CFG['fname_exp_hdf5'] = os.path.join(spladder_dir,
                                         graph_base + '.gene_exp.hdf5')
    if not os.path.exists(
            CFG['fname_exp_hdf5']) and options.subset_samples == 'y':
        condition_strains = sp.unique(
            sp.r_[sp.array(CFG['conditionA']),
                  sp.array(CFG['conditionB'])])
        ### NOTE(review): the file name depends on hash() of the strain names,
        ### so it is only reproducible if hash() is stable between runs
        CFG['fname_exp_hdf5'] = os.path.join(
            spladder_dir, '%s.gene_exp.%i.hdf5' %
            (graph_base, hash(tuple(sp.unique(condition_strains))) * -1))

    if os.path.exists(CFG['fname_exp_hdf5']):
        gene_counts, gene_strains, gene_ids = _load_expression_counts(
            CFG['fname_exp_hdf5'], CFG['verbose'])
    else:
        gene_counts, gene_strains, gene_ids = get_gene_expression(
            CFG, fn_out=CFG['fname_exp_hdf5'], strain_subset=condition_strains)

    ### keep only the part after the first ':' of each strain name (if any)
    gene_strains = sp.array(
        [x.split(':')[1] if ':' in x else x for x in gene_strains])

    ### estimate size factors for library size normalization
    sf_ge = get_size_factors(gene_counts, CFG)

    ### get index of samples for difftest
    idx1 = sp.where(sp.in1d(gene_strains, CFG['conditionA']))[0]
    idx2 = sp.where(sp.in1d(gene_strains, CFG['conditionB']))[0]
    n_a = idx1.shape[0]
    n_b = idx2.shape[0]
    test_idx = sp.r_[idx1, idx2]

    ### subset expression counts to tested samples
    ### (condition A columns first, condition B columns second)
    gene_counts = gene_counts[:, test_idx]
    sf_ge = sf_ge[test_idx]

    ### test each event type individually
    for event_type in CFG['event_types']:

        if CFG['verbose']:
            print('Testing %s events' % event_type)

        CFG['fname_events'] = os.path.join(
            CFG['out_dirname'], 'merge_graphs_%s_C%i.counts.hdf5' %
            (event_type, CFG['confidence_level']))

        ### quantify events
        (cov, gene_idx, event_idx, event_ids,
         event_strains) = quantify.quantify_from_counted_events(
             CFG['fname_events'], test_idx, event_type, CFG)

        ### estimate size factors
        sf_ev = get_size_factors(sp.vstack(cov), CFG)

        sf = sp.r_[sf_ev, sf_ge]

        assert sp.all(gene_strains == event_strains), \
            'strains of gene expression and event counts do not match'

        ### map gene expression to event order
        curr_gene_counts = gene_counts[gene_idx, :]

        ### filter for min expression: at least one of the two count matrices
        ### needs a sufficiently small fraction of zero entries
        max_0_frac = CFG['max_0_frac']
        keep = (sp.mean(cov[0] == 0, axis=1) < max_0_frac) | \
               (sp.mean(cov[1] == 0, axis=1) < max_0_frac)
        if event_type != 'intron_retention':
            ### additionally require coverage within each condition; columns
            ### [0, n_a) belong to condition A and columns [n_a, n_a + n_b) to
            ### condition B, which is the layout the design matrix below uses.
            ### (Slicing the second group at n_b instead of n_a selected the
            ### wrong columns whenever the two groups differ in size.)
            keep &= sp.mean(sp.c_[cov[0][:, :n_a], cov[1][:, :n_a]] == 0,
                            axis=1) < max_0_frac
            keep &= sp.mean(sp.c_[cov[0][:, n_a:], cov[1][:, n_a:]] == 0,
                            axis=1) < max_0_frac
        k_idx = sp.where(keep)[0]

        if CFG['verbose']:
            print(
                'Exclude %i of %i %s events (%.2f percent) from testing due to low coverage'
                % (cov[0].shape[0] - k_idx.shape[0], cov[0].shape[0],
                   event_type,
                   (1 - float(k_idx.shape[0]) / cov[0].shape[0]) * 100))
        if k_idx.shape[0] == 0:
            print(
                'All events of type %s were filtered out due to low coverage. Please try re-running with less stringent filter criteria'
                % event_type)
            continue

        cov[0] = cov[0][k_idx, :]
        cov[1] = cov[1][k_idx, :]
        curr_gene_counts = curr_gene_counts[k_idx, :]
        event_idx = event_idx[k_idx]
        gene_idx = gene_idx[k_idx]
        event_ids = [x[k_idx] for x in event_ids]

        ### append the gene expression columns to both count matrices and
        ### stack the two matrices on top of each other
        cov[0] = sp.around(sp.hstack([cov[0], curr_gene_counts]))
        cov[1] = sp.around(sp.hstack([cov[1], curr_gene_counts]))
        cov = sp.vstack(cov)
        event_ids = sp.hstack(event_ids)

        tidx = sp.arange(n_a)

        ### build design matrix for testing
        dmatrix1 = sp.zeros((cov.shape[1], 4), dtype='bool')
        dmatrix1[:, 0] = 1  # intercept
        dmatrix1[tidx, 1] = 1  # delta a
        dmatrix1[tidx, 2] = 1  # delta g
        dmatrix1[tidx + (n_a + n_b), 2] = 1  # delta g
        dmatrix1[(n_a + n_b):, 3] = 1  # is g
        ### the null model drops the "delta a" column
        dmatrix0 = dmatrix1[:, [0, 2, 3]]

        ### report the number of unique event splice forms (the test itself
        ### is run on all rows of cov)
        if CFG['verbose']:
            print('Consider %i unique event splice forms for testing' %
                  sp.unique(event_ids).shape[0])

        ### run testing
        pvals = run_testing(cov, dmatrix0, dmatrix1, sf, CFG)
        pvals_adj = adj_pval(pvals, CFG)

        ### write output, sorted by adjusted p-value
        out_fname = os.path.join(
            outdir,
            'test_results_C%i_%s.tsv' % (options.confidence, event_type))
        if CFG['verbose']:
            print('Writing test results to %s' % out_fname)
        s_idx = sp.argsort(pvals_adj)
        header = sp.array(['event_id', 'gene', 'p_val', 'p_val_adj'])
        event_names = sp.array(
            ['%s_%i' % (event_type, i + 1) for i in event_idx], dtype='str')
        ### in the matlab case the first column of gene_ids is used
        if CFG['is_matlab']:
            gene_names = gene_ids[gene_idx[s_idx], 0]
        else:
            gene_names = gene_ids[gene_idx[s_idx]]
        data_out = sp.c_[event_names[s_idx], gene_names,
                         pvals[s_idx].astype('str'),
                         pvals_adj[s_idx].astype('str')]
        data_out = sp.r_[header[sp.newaxis, :], data_out]
        sp.savetxt(out_fname, data_out, delimiter='\t', fmt='%s')
コード例 #2
0
def _store_chrm_summary(OUT, prefix, summary):
    """Write one result of summarize_chr() into an open HDF5 file.

    Args:
        OUT: open, writable h5py file.
        prefix: string that is prepended to all dataset names.
        summary: result of summarize_chr(); element 1 is stored as its
            ``row``/``col``/``data``/``shape`` components, elements 2 and 3
            are stored as ``_introns_m`` and ``_introns_p``.
    """
    reads = summary[1]
    OUT.create_dataset(name=(prefix + '_reads_row'),
                       data=reads.row.astype('uint8'),
                       compression='gzip')
    OUT.create_dataset(name=(prefix + '_reads_col'),
                       data=reads.col,
                       compression='gzip')
    OUT.create_dataset(name=(prefix + '_reads_dat'),
                       data=reads.data,
                       compression='gzip')
    OUT.create_dataset(name=(prefix + '_reads_shp'), data=reads.shape)
    OUT.create_dataset(name=(prefix + '_introns_m'),
                       data=summary[2],
                       compression='gzip')
    OUT.create_dataset(name=(prefix + '_introns_p'),
                       data=summary[3],
                       compression='gzip')


def _bams_to_sparse(CFG, out_suffix, exists_msg, use_read_filter=False):
    """Convert every BAM file in CFG['bam_fnames'] into an HDF5 summary.

    For each file name ending in 'bam' whose output file does not exist yet,
    summarize_chr() is run for every chromosome in CFG['chrm_lookup'] (in a
    worker pool if CFG['parallel'] > 1) and the results are stored next to
    the BAM file.

    Args:
        CFG: configuration dictionary.
        out_suffix: suffix that replaces the BAM extension of the input file
            name ('.hdf5' or '.filt.hdf5').
        exists_msg: message template (one '%s' for the BAM file name) that is
            printed in verbose mode when a file is not converted.
        use_read_filter: if true, CFG['read_filter'] is passed to
            summarize_chr() as ``filter``.

    Returns:
        The (possibly replaced) configuration dictionary; it gains the
        chromosome lookup when that had to be read from a BAM header.
    """
    for bfn in CFG['bam_fnames']:
        ### NOTE(review): the '.' in the pattern is unescaped; it is left
        ### unchanged because other modules presumably derive the same name
        fn_out = re.sub(r'.bam$', '', bfn) + out_suffix
        if bfn.endswith('bam') and not os.path.exists(fn_out):

            if not 'chrm_lookup' in CFG:
                IN = pysam.Samfile(bfn, 'rb')
                try:
                    CFG = append_chrms(
                        [x['SN'] for x in parse_header(IN.text)['SQ']], CFG)
                finally:
                    IN.close()

            summarize_kwargs = dict()
            if use_read_filter:
                summarize_kwargs['filter'] = CFG['read_filter']

            ### the context manager closes the file even if a chromosome fails
            with h5py.File(fn_out, 'w') as OUT:
                if CFG['parallel'] > 1:
                    import multiprocessing as mp
                    pool = mp.Pool(processes=CFG['parallel'])
                    try:
                        result = [
                            pool.apply_async(summarize_chr,
                                             args=(bfn, str(chrm), CFG),
                                             kwds=summarize_kwargs)
                            for chrm in sorted(CFG['chrm_lookup'])
                        ]
                        ### pop results one by one so that each summary can be
                        ### released as soon as it is written
                        while result:
                            tmp = result.pop(0).get()
                            _store_chrm_summary(OUT, tmp[0], tmp)
                            del tmp
                    finally:
                        ### one pool is created per BAM file, so the workers
                        ### have to be released here
                        pool.terminate()
                        pool.join()
                else:
                    ### same chromosome order as in the parallel case
                    for chrm in sorted(CFG['chrm_lookup']):
                        tmp = summarize_chr(bfn, str(chrm), CFG,
                                            **summarize_kwargs)
                        _store_chrm_summary(OUT, chrm, tmp)
                        del tmp
        elif CFG['verbose']:
            sys.stdout.write(exists_msg % bfn + '\n')

    return CFG


def _prepare_annotation(CFG):
    """Make sure CFG['anno_fname'] points to a pickled annotation.

    If the annotation file name does not end in 'pickle', the GFF/GFF3 or GTF
    file is parsed into '<anno_fname>.pickle' (unless that file exists
    already) and CFG['anno_fname'] is redirected to it. The process exits
    with status 1 on an unknown annotation format.

    Returns:
        The (possibly replaced) configuration dictionary.
    """
    fn_anno = CFG['anno_fname']
    if fn_anno.split('.')[-1] == 'pickle':
        return CFG

    fn_pickle = fn_anno + '.pickle'
    if not os.path.exists(fn_pickle):
        anno_format = fn_anno.split('.')[-1].lower()
        if anno_format in ['gff', 'gff3']:
            (_genes, CFG) = init.init_genes_gff3(fn_anno, CFG, fn_pickle)
        elif anno_format in ['gtf']:
            (_genes, CFG) = init.init_genes_gtf(fn_anno, CFG, fn_pickle)
        else:
            sys.stderr.write(
                'ERROR: Unknown annotation format. File needs to end in gtf or gff/gff3\nCurrent file: %s'
                % fn_anno + '\n')
            sys.exit(1)
    CFG['anno_fname'] += '.pickle'
    return CFG


def spladder():
    """Entry point of the splicing graph construction pipeline.

    Parses the command line and then, depending on the configuration:

    1. builds the splicing graphs (per sample or merged) unless an input
       graph was given or the merged result exists already,
    2. optionally writes a validated copy of the merged graph,
    3. optionally converts the input BAM files into HDF5 summaries,
    4. counts graph (and intron) coverage, and
    5. collects and analyzes the alternative splicing events.

    Takes no arguments (options are read from ``sys.argv``) and returns None.
    """

    ### get command line options
    options = parse_options(sys.argv)

    ### parse parameters from options object
    CFG = settings.parse_args(options)

    ### load confidence level settings
    if not CFG['no_reset_conf']:
        CFG = settings.set_confidence_level(CFG)

    ### do not compute components of merged set, if result file already exists
    fn_out_merge = get_filename('fn_out_merge', CFG)
    fn_out_merge_val = get_filename('fn_out_merge_val', CFG)

    if not 'spladder_infile' in CFG and not os.path.exists(fn_out_merge):
        ### iterate over files, if merge strategy is single
        if CFG['merge_strategy'] in ['single', 'merge_graphs']:
            idxs = range(len(CFG['samples']))
        else:
            idxs = [0]

        ### handles of submitted jobs (only filled if rproc is used)
        jobinfo = []

        ### create out-directory including the spladder sub-directory
        spladder_dir = os.path.join(CFG['out_dirname'], 'spladder')
        if not os.path.exists(spladder_dir):
            os.makedirs(spladder_dir)

        ### pre-process annotation, if necessary
        CFG = _prepare_annotation(CFG)

        ### add anotation contigs into lookup table
        if not 'genes' in CFG:
            ### pickles are binary data, read them in binary mode
            with open(CFG['anno_fname'], 'rb') as fh:
                genes = cPickle.load(fh)
        else:
            genes = CFG['genes']
        CFG = init.append_chrms(
            sp.unique(sp.array([x.chr for x in genes], dtype='str')), CFG)
        del genes

        ### convert input BAMs to sparse arrays - filtered case
        if CFG['bam_to_sparse']:
            CFG = _bams_to_sparse(
                CFG,
                '.filt.hdf5',
                'Filtered sparse BAM representation for %s already exists.',
                use_read_filter=True)

        ### build individual graphs
        for idx in idxs:
            ### entries of CFG that are narrowed down to the current sample
            ### and restored at the end of the iteration
            CFG_ = dict()
            if CFG['merge_strategy'] != 'merge_bams':
                CFG_['bam_fnames'] = CFG['bam_fnames']
                CFG_['samples'] = CFG['samples']
                CFG['bam_fnames'] = CFG['bam_fnames'][idx]
                CFG['samples'] = CFG['samples'][idx]
                graph_tag = CFG['samples']
            else:
                graph_tag = CFG['merge_strategy']
            CFG['out_fname'] = '%s/spladder/genes_graph_conf%i.%s.pickle' % (
                CFG['out_dirname'], CFG['confidence_level'], graph_tag)

            ### assemble out filename to check if we are already done
            fn_out = CFG['out_fname']
            if CFG['do_prune']:
                fn_out = re.sub('.pickle$', '_pruned.pickle', fn_out)
            if CFG['do_gen_isoforms']:
                fn_out = re.sub('.pickle$', '_with_isoforms.pickle', fn_out)

            if os.path.exists(fn_out):
                sys.stdout.write('%s - All result files already exist.' %
                                 fn_out + '\n')
            elif CFG['rproc']:
                jobinfo.append(
                    rp.rproc('spladder_core', CFG, 15000,
                             CFG['options_rproc'], 60 * 60))
            else:
                spladder_core(CFG)

            ### restore the full sample lists (copied where the type allows)
            for key in CFG_:
                try:
                    CFG[key] = CFG_[key].copy()
                except AttributeError:
                    CFG[key] = CFG_[key]

        ### collect results after parallelization
        if CFG['rproc']:
            rp.rproc_wait(jobinfo, 30, 1.0, -1)

        ### merge parts if necessary
        if CFG['merge_strategy'] == 'merge_graphs':
            run_merge(CFG)

    ### write validated version of the merged graph
    if not 'spladder_infile' in CFG and CFG[
            'merge_strategy'] == 'merge_graphs' and CFG[
                'validate_splicegraphs'] and not os.path.exists(
                    fn_out_merge_val):
        with open(fn_out_merge, 'rb') as fh:
            (genes, inserted) = cPickle.load(fh)
        genes = filter_by_edgecount(genes, CFG)
        ### protocol -1 is a binary format, so the file is opened in binary
        ### mode; the context manager makes sure the data is flushed
        with open(fn_out_merge_val, 'wb') as fh:
            cPickle.dump((genes, inserted), fh, -1)
        del genes

    ### convert input BAMs to sparse arrays - unfiltered case
    if CFG['bam_to_sparse']:
        CFG = _bams_to_sparse(
            CFG, '.hdf5', 'Sparse BAM representation for %s already exists.')

    is_single = (CFG['merge_strategy'] == 'single')
    if is_single:
        idxs = range(len(CFG['samples']))
    else:
        idxs = [0]

    for idx in idxs:
        ### the sample index is only passed on for per-sample graphs
        sample_kwargs = dict()
        if is_single:
            sample_kwargs['sample_idx'] = idx

        ### get count output file
        fn_in_count = get_filename('fn_count_in', CFG, **sample_kwargs)
        fn_out_count = get_filename('fn_count_out', CFG, **sample_kwargs)

        ### count segment graph
        if CFG['run_as_analysis'] or CFG['count_segment_graph']:
            if not os.path.exists(fn_out_count):
                count_graph_coverage_wrapper(fn_in_count, fn_out_count, CFG,
                                             **sample_kwargs)

        ### count intron coverage phenotype
        if CFG['count_intron_cov']:
            fn_out_intron_count = fn_out_count.replace('pickle',
                                                       'introns.pickle')
            count_intron_coverage_wrapper(fn_in_count, fn_out_intron_count,
                                          CFG)

    ### handle alternative splicing part
    if CFG['run_as_analysis']:
        collect_events(CFG)

        for event_type in CFG['event_types']:
            analyze_events(CFG, event_type)
コード例 #3
0
def main():

    ### get command line options
    options = parse_options(sys.argv)

    ### parse parameters from options object
    CFG = settings.parse_args(options, identity='test')
    CFG['use_exon_counts'] = False

    ### generate output directory
    outdir = os.path.join(options.outdir, 'testing')
    if options.timestamp == 'y':
        outdir = '%s_%s' % (outdir, str(datetime.datetime.now()).replace(' ', '_'))
    if CFG['diagnose_plots']:
        CFG['plot_dir'] = os.path.join(options.outdir, 'plots')
        if not os.path.exists(CFG['plot_dir']):
            os.makedirs(CFG['plot_dir'])

    if options.labelA != 'condA' and options.labelB != 'condB':
        outdir = '%s_%s_vs_%s' % (outdir, options.labelA, options.labelB)
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    if CFG['debug']:

        print "Generating simulated dataset"

        npr.seed(23)
        CFG['is_matlab'] = False
        #cov = npr.permutation(20000-20).astype('float').reshape(999, 20)
        #cov = sp.r_[cov, sp.c_[sp.ones((1, 10)) *10, sp.ones((1, 10)) * 500000] + npr.normal(10, 1, 20)]
        #sf = sp.ones((cov.shape[1], ), dtype='float')

        setsize = 50
        ### diff event counts
        cov = sp.zeros((500, 2 * setsize), dtype='int')
        for i in range(10):
            cov[i, :setsize] = nbinom.rvs(30, 0.8, size=setsize)
            cov[i, setsize:] = nbinom.rvs(10, 0.8, size=setsize)
        for i in range(10, cov.shape[0]):
            cov[i, :] = nbinom.rvs(30, 0.8, size=2*setsize)

        ### diff gene expression
        cov2 = sp.zeros((500, 2 * setsize), dtype='int')
        for i in range(20):
            cov2[i, :setsize] = nbinom.rvs(2000, 0.2, size=setsize)
            cov2[i, setsize:] = nbinom.rvs(2000, 0.3, size=setsize)
        for i in range(20, cov2.shape[0]):
            cov2[i, :] = nbinom.rvs(2000, 0.3, size=2*setsize)

        cov = sp.c_[cov, cov2] * 10000

        tidx = sp.arange(setsize)

        sf = npr.uniform(0, 5, 2*setsize)
        sf = sp.r_[sf, sf]

        #dmatrix0 = sp.ones((cov.shape[1], 3), dtype='bool')
        dmatrix1 = sp.zeros((cov.shape[1], 4), dtype='float')
        dmatrix1[:, 0] = 1
        dmatrix1[tidx, 1] = 1
        #dmatrix1[tidx, 2] = 1
        dmatrix1[tidx + (2*setsize), 2] = 1
        dmatrix1[(2*setsize):, 3] = 1
        #dmatrix1[:, 4] = sp.log(sf)
        dmatrix0 = dmatrix1[:, [0, 2, 3]]

        cov = cov * sf
        #sf = sp.ones((cov.shape[1], ), dtype='float')

        pvals = run_testing(cov, dmatrix0, dmatrix1, sf, CFG)
        pvals_adj = adj_pval(pvals, CFG) 
        pdb.set_trace()
    else:
        val_tag = ''
        if CFG['validate_splicegraphs']:
            val_tag = '.validated'

        if CFG['is_matlab']:
            CFG['fname_genes'] = os.path.join(CFG['out_dirname'], 'spladder', 'genes_graph_conf%i.%s%s.mat' % (CFG['confidence_level'], CFG['merge_strategy'], val_tag))
            CFG['fname_count_in'] = os.path.join(CFG['out_dirname'], 'spladder', 'genes_graph_conf%i.%s%s.count.mat' % (CFG['confidence_level'], CFG['merge_strategy'], val_tag))
        else:
            CFG['fname_genes'] = os.path.join(CFG['out_dirname'], 'spladder', 'genes_graph_conf%i.%s%s.pickle' % (CFG['confidence_level'], CFG['merge_strategy'], val_tag))
            CFG['fname_count_in'] = os.path.join(CFG['out_dirname'], 'spladder', 'genes_graph_conf%i.%s%s.count.hdf5' % (CFG['confidence_level'], CFG['merge_strategy'], val_tag))

        condition_strains = None
        CFG['fname_exp_hdf5'] = os.path.join(CFG['out_dirname'], 'spladder', 'genes_graph_conf%i.%s%s.gene_exp.hdf5' % (CFG['confidence_level'], CFG['merge_strategy'], val_tag))
        if os.path.exists(CFG['fname_exp_hdf5']):
            if CFG['verbose']:
                print 'Loading expression counts from %s' % CFG['fname_exp_hdf5']
            IN = h5py.File(CFG['fname_exp_hdf5'], 'r')
            gene_counts = IN['raw_count'][:]
            gene_strains = IN['strains'][:]
            gene_ids = IN['genes'][:]
            IN.close()
        else:
            if options.subset_samples == 'y':
                condition_strains = sp.unique(sp.r_[sp.array(CFG['conditionA']), sp.array(CFG['conditionB'])])
                CFG['fname_exp_hdf5'] = os.path.join(CFG['out_dirname'], 'spladder', 'genes_graph_conf%i.%s%s.gene_exp.%i.hdf5' % (CFG['confidence_level'], CFG['merge_strategy'], val_tag, hash(tuple(sp.unique(condition_strains))) * -1))
            if os.path.exists(CFG['fname_exp_hdf5']):
                if CFG['verbose']:
                    print 'Loading expression counts from %s' % CFG['fname_exp_hdf5']
                IN = h5py.File(CFG['fname_exp_hdf5'], 'r')
                gene_counts = IN['raw_count'][:]
                gene_strains = IN['strains'][:]
                gene_ids = IN['genes'][:]
                IN.close()
            else:
                gene_counts, gene_strains, gene_ids = get_gene_expression(CFG, fn_out=CFG['fname_exp_hdf5'], strain_subset=condition_strains)

        gene_strains = sp.array([x.split(':')[1] if ':' in x else x for x in gene_strains])

        ### estimate size factors for library size normalization
        sf_ge = get_size_factors(gene_counts, CFG)

        ### get index of samples for difftest
        idx1 = sp.where(sp.in1d(gene_strains, CFG['conditionA']))[0]
        idx2 = sp.where(sp.in1d(gene_strains, CFG['conditionB']))[0]

        ### for TESTING
        #setsize = 100
        #idx1 = sp.arange(0, setsize / 2)
        #idx2 = sp.arange(setsize / 2, setsize)

        ### subset expression counts to tested samples
        gene_counts = gene_counts[:, sp.r_[idx1, idx2]]
        sf_ge = sf_ge[sp.r_[idx1, idx2]]
        #sf = sp.r_[sf, sf]

        ### test each event type individually
        for event_type in CFG['event_types']:

            if CFG['verbose']:
                print 'Testing %s events' % event_type

            CFG['fname_events'] = os.path.join(CFG['out_dirname'], 'merge_graphs_%s_C%i.counts.hdf5' % (event_type, CFG['confidence_level']))

            ### quantify events
            (cov, gene_idx, event_idx, event_ids, event_strains) = quantify.quantify_from_counted_events(CFG['fname_events'], sp.r_[idx1, idx2], event_type, CFG)

            ### estimate size factors
            sf_ev = get_size_factors(sp.vstack(cov), CFG)

            sf = sp.r_[sf_ev, sf_ge]

            assert(sp.all(gene_strains == event_strains))

            ### map gene expression to event order
            curr_gene_counts = gene_counts[gene_idx, :]

            ### filter for min expression
            if event_type == 'intron_retention':
                k_idx = sp.where((sp.mean(cov[0] == 0, axis=1) < CFG['max_0_frac']) | \
                                 (sp.mean(cov[1] == 0, axis=1) < CFG['max_0_frac']))[0]
            else:
                k_idx = sp.where(((sp.mean(cov[0] == 0, axis=1) < CFG['max_0_frac']) | \
                                  (sp.mean(cov[1] == 0, axis=1) < CFG['max_0_frac'])) & \
                                 (sp.mean(sp.c_[cov[0][:, :idx1.shape[0]], cov[1][:, :idx1.shape[0]]] == 0, axis=1) < CFG['max_0_frac']) & \
                                 (sp.mean(sp.c_[cov[0][:, idx2.shape[0]:], cov[1][:, idx2.shape[0]:]] == 0, axis=1) < CFG['max_0_frac']))[0]
            if CFG['verbose']:
                print 'Exclude %i of %i %s events (%.2f percent) from testing due to low coverage' % (cov[0].shape[0] - k_idx.shape[0], cov[0].shape[0], event_type, (1 - float(k_idx.shape[0]) / cov[0].shape[0]) * 100)
            if k_idx.shape[0] == 0:
                print 'All events of type %s were filtered out due to low coverage. Please try re-running with less stringent filter criteria' % event_type
                continue
           # k_idx = sp.where((sp.mean(sp.c_[cov[0], cov[1]], axis=1) > 2))[0]
           # k_idx = sp.where((sp.mean(cov[0], axis=1) > 2) & (sp.mean(cov[1], axis=1) > 2))[0]
            cov[0] = cov[0][k_idx, :]
            cov[1] = cov[1][k_idx, :]
            curr_gene_counts = curr_gene_counts[k_idx, :]
            event_idx = event_idx[k_idx]
            gene_idx = gene_idx[k_idx]
            event_ids = [x[k_idx] for x in event_ids]

            cov[0] = sp.around(sp.hstack([cov[0], curr_gene_counts]))
            cov[1] = sp.around(sp.hstack([cov[1], curr_gene_counts]))
            cov = sp.vstack(cov)
            event_ids = sp.hstack(event_ids)

            tidx = sp.arange(idx1.shape[0])

        #if CFG['debug']:
        #    for i in range(cov.shape[0]):
        #        fig = plt.figure(figsize=(8, 6), dpi=100)
        #        ax = fig.add_subplot(111)
        #        ax.hist(cov[i, :] * sf, 50, histtype='bar', rwidth=0.8)
        #        #ax.plot(sp.arange(cov.shape[1]), sorted(cov[i, :]), 'bo')
        #        ax.set_title('Count Distribution - Sample %i' % i )
        #        plt.savefig('count_dist.%i.pdf' % i, format='pdf', bbox_inches='tight')
        #        plt.close(fig)

            ### build design matrix for testing
            dmatrix1 = sp.zeros((cov.shape[1], 4), dtype='bool')
            dmatrix1[:, 0] = 1                      # intercept
            dmatrix1[tidx, 1] = 1                   # delta a
            dmatrix1[tidx, 2] = 1                   # delta g
            dmatrix1[tidx + (idx1.shape[0] + idx2.shape[0]), 2] = 1         # delta g
            dmatrix1[(idx1.shape[0] + idx2.shape[0]):, 3] = 1         # is g
            dmatrix0 = dmatrix1[:, [0, 2, 3]]

            ### make event splice forms unique to prevent unnecessary tests
            event_ids, u_idx, r_idx = sp.unique(event_ids, return_index=True, return_inverse=True)
            if CFG['verbose']:
                print 'Consider %i unique event splice forms for testing' % u_idx.shape[0]

            ### run testing
            #pvals = run_testing(cov[u_idx, :], dmatrix0, dmatrix1, sf, CFG, r_idx)
            pvals = run_testing(cov, dmatrix0, dmatrix1, sf, CFG)
            pvals_adj = adj_pval(pvals, CFG) 

            ### write output
            out_fname = os.path.join(outdir, 'test_results_C%i_%s.tsv' % (options.confidence, event_type))
            if CFG['verbose']:
                print 'Writing test results to %s' % out_fname
            s_idx = sp.argsort(pvals_adj)
            header = sp.array(['event_id', 'gene', 'p_val', 'p_val_adj']) 
            event_ids = sp.array(['%s_%i' % (event_type, i + 1) for i in event_idx], dtype='str')
            if CFG['is_matlab']:
                data_out = sp.c_[event_ids[s_idx], gene_ids[gene_idx[s_idx], 0], pvals[s_idx].astype('str'), pvals_adj[s_idx].astype('str')]
            else:
                data_out = sp.c_[event_ids[s_idx], gene_ids[gene_idx[s_idx]], pvals[s_idx].astype('str'), pvals_adj[s_idx].astype('str')]
            data_out = sp.r_[header[sp.newaxis, :], data_out]
            sp.savetxt(out_fname, data_out, delimiter='\t', fmt='%s')
コード例 #4
0
ファイル: spladder.py プロジェクト: bowhan/spladder
def spladder():
    """Command line entry point driving the complete SplAdder run.

    The steps are executed in this order:

    1. Parse command line options into the ``CFG`` dictionary and, unless
       ``CFG['no_reset_conf']`` is set, apply the confidence level settings.
    2. Unless an input graph was given (``'spladder_infile'`` in ``CFG``) or
       the merged graph file already exists, pre-process the annotation and
       call ``spladder_core`` once per sample (merge strategies ``single``
       and ``merge_graphs``) or once in total, either directly or as
       ``rproc`` jobs, then merge the graphs if requested.
    3. Optionally write an edge-count filtered copy of the merged graph.
    4. Optionally convert the input BAM files into compressed ``.npz`` files.
    5. Optionally count the segment graph and intron coverage, and run the
       event collection / analysis for every configured event type.

    Takes no arguments (options are read from ``sys.argv``) and returns
    nothing; calls ``sys.exit(1)`` on an unknown annotation file extension.
    """

    ### get command line options
    options = parse_options(sys.argv)

    ### parse parameters from options object
    CFG = settings.parse_args(options)

    ### load confidence level settings
    if not CFG['no_reset_conf']:
        CFG = settings.set_confidence_level(CFG)

    ### do not compute components of merged set, if result file already exists
    fn_out_merge = get_filename('fn_out_merge', CFG)
    fn_out_merge_val = get_filename('fn_out_merge_val', CFG)

    if 'spladder_infile' not in CFG and not os.path.exists(fn_out_merge):
        ### iterate over files, if merge strategy is single
        if CFG['merge_strategy'] in ['single', 'merge_graphs']:
            idxs = range(len(CFG['samples']))
        else:
            idxs = [0]

        ### set parallelization
        if CFG['rproc']:
            jobinfo = []

        ### create out-directory
        if not os.path.exists(CFG['out_dirname']):
            os.makedirs(CFG['out_dirname'])

        ### create spladder sub-directory
        spladder_dir = os.path.join(CFG['out_dirname'], 'spladder')
        if not os.path.exists(spladder_dir):
            os.makedirs(spladder_dir)

        ### pre-process annotation, if necessary
        anno_ext = CFG['anno_fname'].split('.')[-1]
        if anno_ext != 'pickle':
            anno_pickle = CFG['anno_fname'] + '.pickle'
            if not os.path.exists(anno_pickle):
                if anno_ext in ['gff', 'gff3']:
                    (genes, CFG) = init.init_genes_gff3(CFG['anno_fname'], CFG, anno_pickle)
                elif anno_ext in ['gtf']:
                    (genes, CFG) = init.init_genes_gtf(CFG['anno_fname'], CFG, anno_pickle)
                else:
                    print >> sys.stderr, 'ERROR: Unknown annotation format. File needs to end in gtf or gff/gff3\nCurrent file: %s' % CFG['anno_fname']
                    sys.exit(1)
            CFG['anno_fname'] += '.pickle'

        ### add anotation contigs into lookup table
        if 'genes' not in CFG:
            # pickles are read in binary mode and the handle is closed right away
            with open(CFG['anno_fname'], 'rb') as anno_file:
                genes = cPickle.load(anno_file)
        else:
            genes = CFG['genes']
        CFG = init.append_chrms(sp.unique(sp.array([x.chr for x in genes], dtype='str')), CFG)
        del genes

        for idx in idxs:
            ### CFG_ keeps the full lists while CFG temporarily holds a single sample
            CFG_ = dict()
            if CFG['merge_strategy'] != 'merge_bams':
                CFG_['bam_fnames'] = CFG['bam_fnames']
                CFG_['samples'] = CFG['samples']
                CFG['bam_fnames'] = CFG['bam_fnames'][idx]
                CFG['samples'] = CFG['samples'][idx]
                CFG['out_fname'] = '%s/spladder/genes_graph_conf%i.%s.pickle' % (CFG['out_dirname'], CFG['confidence_level'], CFG['samples'])
            else:
                CFG['out_fname'] = '%s/spladder/genes_graph_conf%i.%s.pickle' % (CFG['out_dirname'], CFG['confidence_level'], CFG['merge_strategy'])

            ### assemble out filename to check if we are already done
            fn_out = CFG['out_fname']
            if CFG['do_prune']:
                fn_out = re.sub(r'\.pickle$', '_pruned.pickle', fn_out)
            if CFG['do_gen_isoforms']:
                fn_out = re.sub(r'\.pickle$', '_with_isoforms.pickle', fn_out)

            if os.path.exists(fn_out):
                print >> sys.stdout, '%s - All result files already exist.' % fn_out
            else:
                if CFG['rproc']:
                    jobinfo.append(rp.rproc('spladder_core', CFG, 15000, CFG['options_rproc'], 60*60))
                else:
                    spladder_core(CFG)

            ### restore the entries that were narrowed down to a single sample
            for key in CFG_:
                try:
                    CFG[key] = CFG_[key].copy()
                except AttributeError:
                    CFG[key] = CFG_[key]

        ### collect results after parallelization
        if CFG['rproc']:
            rp.rproc_wait(jobinfo, 30, 1.0, -1)

        ### merge parts if necessary
        if CFG['merge_strategy'] == 'merge_graphs':
            run_merge(CFG)

    if 'spladder_infile' not in CFG and CFG['validate_splicegraphs'] and not os.path.exists(fn_out_merge_val):
        with open(fn_out_merge, 'rb') as merge_file:
            (genes, inserted) = cPickle.load(merge_file)
        genes = filter_by_edgecount(genes, CFG)
        # protocol -1 is a binary pickle format, so the file has to be opened in binary mode
        with open(fn_out_merge_val, 'wb') as val_file:
            cPickle.dump((genes, inserted), val_file, -1)
        del genes

    ### get count output file
    fn_in_count = get_filename('fn_count_in', CFG)
    fn_out_count = get_filename('fn_count_out', CFG)

    ### convert input BAMs to sparse arrays
    if CFG['bam_to_sparse']:
        for bfn in CFG['bam_fnames']:
            if bfn.endswith('bam') and not os.path.exists(re.sub(r'.bam$', '', bfn) + '.npz'):
                cnts = dict()

                if 'chrm_lookup' not in CFG:
                    IN = pysam.Samfile(bfn, 'rb')
                    CFG = append_chrms([x['SN'] for x in parse_header(IN.text)['SQ']], CFG)
                    IN.close()

                if CFG['parallel'] > 1:
                    import multiprocessing as mp
                    pool = mp.Pool(processes=CFG['parallel'])
                    result = [pool.apply_async(summarize_chr, args=(bfn, str(chrm), CFG,)) for chrm in sorted(CFG['chrm_lookup'])]
                    while result:
                        tmp = result.pop(0).get()
                        cnts[tmp[0] + '_reads_row'] = tmp[1].row.astype('uint8')
                        cnts[tmp[0] + '_reads_col'] = tmp[1].col
                        cnts[tmp[0] + '_reads_dat'] = tmp[1].data
                        cnts[tmp[0] + '_reads_shp'] = tmp[1].shape
                        cnts[tmp[0] + '_introns_m'] = tmp[2]
                        cnts[tmp[0] + '_introns_p'] = tmp[3]
                    # all results are collected - release the worker processes
                    pool.close()
                    pool.join()
                else:
                    for chrm in CFG['chrm_lookup']:
                        tmp = summarize_chr(bfn, str(chrm), CFG)
                        cnts[chrm + '_reads_row'] = tmp[1].row.astype('uint8')
                        cnts[chrm + '_reads_col'] = tmp[1].col
                        cnts[chrm + '_reads_dat'] = tmp[1].data
                        cnts[chrm + '_reads_shp'] = tmp[1].shape
                        cnts[chrm + '_introns_m'] = tmp[2]
                        cnts[chrm + '_introns_p'] = tmp[3]
                sp.savez_compressed(re.sub(r'.bam$', '', bfn), **cnts)
            elif CFG['verbose']:
                print >> sys.stdout, 'Sparse BAM representation for %s already exists.' % bfn

    ### count segment graph
    if CFG['run_as_analysis'] or CFG['count_segment_graph']:
        if not os.path.exists(fn_out_count):
            count_graph_coverage_wrapper(fn_in_count, fn_out_count, CFG)

    ### count intron coverage phenotype
    if CFG['count_intron_cov']:
        fn_out_intron_count = fn_out_count.replace('mat', 'introns.pickle')
        count_intron_coverage_wrapper(fn_in_count, fn_out_intron_count, CFG)

    ### handle alternative splicing part
    if CFG['run_as_analysis']:
        collect_events(CFG)

        for event_type in CFG['event_types']:
            analyze_events(CFG, event_type)
コード例 #5
0
ファイル: spladder_viz.py プロジェクト: arpankbasak/spladder
def spladder_viz():
    """Main visualization code"""

    ### parse command line parameters
    options = parse_options(sys.argv)

    ### parse parameters from options object
    CFG = settings.parse_args(options, identity='viz')

    ### create plot directory if it does not exist yet
    if options.testdir != '-':
        dirname = options.testdir
    else:
        dirname = CFG['out_dirname']
    if not os.path.exists(os.path.join(dirname, 'plots')):
        os.mkdir(os.path.join(dirname, 'plots'))

    if options.format == 'd3':
        try:
            import mpld3
            from mpld3 import plugins
        except ImportError:
            sys.stderr.write(
                "ERROR: missing package for output format d3. Package mpld3 required"
            )
            sys.exit(1)

    ### load gene information
    gene_names = get_gene_names(CFG)

    rows = get_plot_len(CFG)
    gs = gridspec.GridSpec(rows, 1)

    ### set color maps
    cmap_cov = plt.get_cmap('jet')
    cmap_edg = plt.get_cmap('jet')

    ### plot log scale?
    log_tag = ''
    if options.log:
        log_tag = '.log'
    event_tag = ''

    ### did we get any labels?
    if CFG['plot_labels']:
        CFG['plot_labels'] = CFG['plot_labels'].strip(',').split(',')
        assert len(CFG['plot_labels']) == len(
            CFG['bam_fnames']
        ), "The number of given labels (%i) needs to match the number of given bam file groups (%i)" % (
            len(CFG['plot_labels']), len(CFG['bam_fnames']))

    ### the user chose a specific gene for plotting
    ### create pairs of gene ids and an event_id (the latter is None by default)
    if options.gene_name is not None:
        #gid = sp.where(sp.array([x.split('.')[0] for x in gene_names]) == options.gene_name.split('.')[0])[0]
        gids = [[
            sp.where(sp.array(gene_names) == options.gene_name)[0][0],
            options.event_id
        ]]
        if gids.shape[0] == 0:
            sys.stderr.write(
                'ERROR: provided gene ID %s could not be found, please check for correctness\n'
                % options.gene_name)
            sys.exit(1)
    ### the plotting happens on the results of spladder test
    ### the user chooses to plot the top k significant events
    ### this requires the event type to be specified
    elif options.test_result > 0:
        gene_names = []
        for event_type in CFG['event_types']:
            ### the testing script should generate a setup file for the test
            ### SETUP is structured as follows:
            ###  [gene_strains, event_strains, dmatrix0, dmatrix1, event_type, options, CFG]
            labels = options.test_labels.split(':')
            options.labels = labels
            if options.testdir != '-':
                testdir = dirname
            else:
                testdir = os.path.join(
                    dirname, 'testing_%s_vs_%s' % (labels[0], labels[1]))
            SETUP = cPickle.load(
                open(
                    os.path.join(
                        testdir, 'test_setup_C%i_%s.pickle' %
                        (CFG['confidence_level'], event_type)), 'r'))

            ### get strains to plot
            idx1 = sp.where(sp.in1d(SETUP[0], SETUP[6]['conditionA']))[0]
            idx2 = sp.where(sp.in1d(SETUP[0], SETUP[6]['conditionB']))[0]

            ### load test results
            for l, line in enumerate(
                    open(
                        os.path.join(
                            testdir, 'test_results_C%i_%s.tsv' %
                            (CFG['confidence_level'], event_type)), 'r')):
                if l == 0:
                    continue
                if l > options.test_result:
                    break
                sl = line.strip().split('\t')
                gene_names.append([sl[1], sl[0]])
        gids = get_gene_ids(CFG, gene_names)
    ### no gene specified but result provided - plot all genes with confirmed events
    ### if an event_id is provided, only the associated gene will be plotted
    else:
        gids = get_gene_ids(CFG)

    ### iterate over genes to plot
    for gid in gids:

        ### gather information about the gene we plot
        gene = load_genes(CFG, idx=[gid[0]])[0]
        if CFG['verbose']:
            print 'plotting information for gene %s' % gene.name
        gene.from_sparse()

        ### event to plot is specified with the gene id list
        if gid[1] is not None:
            event_info = [
                x[::-1]
                for x in re.split(r'[._]', gid[1][::-1], maxsplit=1)[::-1]
            ]
            event_info[1] = int(event_info[1]) - 1
            event_info = sp.array(event_info, dtype='str')[sp.newaxis, :]
            event_tag = '.%s' % gid[1]
        ### get all significant events of the current gene
        else:
            event_info = get_conf_events(CFG, gid[0])

        ### go over different plotting options
        axes = []
        ### plot result of testing
        if options.test_result > 0:
            fig = plt.figure(figsize=(9, 5), dpi=200)
            gs = gridspec.GridSpec(2, 1, height_ratios=[4, 1])
            _add_ax(fig, axes, gs)
            _add_ax(fig, axes, gs)
            _plot_event(CFG, event_info, fig, axes[1], gs, None, padding=100)
            start, stop = axes[1].get_xlim()
            plot_bam(options,
                     gene,
                     CFG['bam_fnames'],
                     fig,
                     axes[0],
                     gs,
                     None,
                     cmap_cov,
                     cmap_edg,
                     single=False,
                     sharex=axes[1],
                     start=int(start),
                     stop=int(stop))

        ### plot custom layout
        elif options.user == 'y':

            if options.format == 'd3':
                fig = plt.figure(figsize=(12, 2 * rows), dpi=100)
            else:
                fig = plt.figure(figsize=(18, 3 * rows), dpi=200)

            ### plot splicing graph
            if options.splicegraph == 'y':
                _plot_splicegraph(gene, fig, axes, gs)
                xlim = axes[-1].get_xlim()

            ### plot annotated transcripts
            if CFG['plot_transcripts']:
                sharex = None if len(axes) == 0 else axes[0]
                axes.append(fig.add_subplot(gs[len(axes), 0], sharex=sharex))
                multiple(gene.exons, ax=axes[-1], x_range=xlim)
                axes[-1].set_title('Annotated Transcripts')

            ### plot coverage information for a set of given samples
            if len(CFG['bam_fnames']) > 0:
                plot_bam(options, gene, CFG['bam_fnames'], fig, axes, gs, xlim,
                         cmap_cov, cmap_edg)

                ### plot all the samples in a single plot
                if len(CFG['bam_fnames']) > 1:
                    plot_bam(options,
                             gene,
                             CFG['bam_fnames'],
                             fig,
                             axes,
                             gs,
                             xlim,
                             cmap_cov,
                             cmap_edg,
                             single=False)

            ### plot segment counts
            if len(CFG['bam_fnames']
                   ) == 0 or False:  # add option for segment plots
                if options.test_result > 0:
                    _plot_segments(CFG, gid, fig, axes, gs, options,
                                   [idx1, idx2])
                else:
                    _plot_segments(CFG, gid, fig, axes, gs, options)

            ### plot structure of a single given event
            _plot_event(CFG, event_info, fig, axes, gs, xlim)

        ### we only need to adapt the xoom for one axis object - as we share the x
        zoom_x = [float(x) for x in options.zoom_x.split(',')]
        xlim = axes[0].get_xlim()
        xdiff = xlim[1] - xlim[0]
        axes[0].set_xlim(
            [xlim[0] + (zoom_x[0] * xdiff), xlim[0] + (zoom_x[1] * xdiff)])

        for ax in axes:
            vax.clean_axis(ax)

        plt.tight_layout()
        ### save plot into file
        if options.format == 'd3':
            out_fname = os.path.join(
                dirname, 'plots', 'gene_overview_C%i_%s%s%s.html' %
                (options.confidence, gene.name, event_tag, log_tag))
            plugins.clear(fig)
            plugins.connect(fig, plugins.Zoom(enabled=True))
            mpld3.save_html(fig, open(out_fname, 'w'))
        else:
            if options.test_result > 0:
                out_fname = os.path.join(
                    dirname, 'plots', 'gene_overview_C%i_%s%s%s.%s' %
                    (options.confidence, gene.name, event_tag, log_tag,
                     options.format))
            else:
                out_fname = os.path.join(
                    dirname, 'plots', 'gene_overview_C%i_%s%s%s.%s' %
                    (options.confidence, gene.name, event_tag, log_tag,
                     options.format))
            plt.savefig(out_fname, format=options.format, bbox_inches='tight')
        plt.close(fig)
コード例 #6
0
ファイル: spladder.py プロジェクト: ccwang12/spladder
def spladder():
    """Command line entry point driving the complete SplAdder run.

    The steps are executed in this order:

    1. Parse command line options into the ``CFG`` dictionary and, unless
       ``CFG['no_reset_conf']`` is set, apply the confidence level settings.
    2. Unless an input graph was given (``'spladder_infile'`` in ``CFG``) or
       the merged graph file already exists, pre-process the annotation and
       call ``spladder_core`` once per sample (merge strategies ``single``
       and ``merge_graphs``) or once in total, either directly or as
       ``rproc`` jobs, then merge the graphs if requested.
    3. Derive the count input/output file names and count the segment graph
       if the count file does not exist yet.
    4. Optionally count intron coverage and run the event collection /
       analysis for every configured event type.

    Takes no arguments (options are read from ``sys.argv``) and returns
    nothing; calls ``sys.exit(1)`` on an unknown annotation file extension.
    """

    ### get command line options
    options = parse_options(sys.argv)

    ### parse parameters from options object
    CFG = settings.parse_args(options)

    ### load confidence level settings
    if not CFG['no_reset_conf']:
        CFG = settings.set_confidence_level(CFG)

    ### do not compute components of merged set, if result file already exists
    ### (fn_out_merge stays empty - and thus "non-existent" - for all other strategies)
    fn_out_merge = ''
    prune_tag = ''
    if CFG['do_prune']:
        prune_tag = '_pruned'
    if CFG['merge_strategy'] == 'merge_graphs':
        fn_out_merge = '%s/spladder/genes_graph_conf%i.%s%s.pickle' % (CFG['out_dirname'], CFG['confidence_level'], CFG['merge_strategy'], prune_tag)

    if 'spladder_infile' not in CFG and not os.path.exists(fn_out_merge):
        ### iterate over files, if merge strategy is single
        if CFG['merge_strategy'] in ['single', 'merge_graphs']:
            idxs = range(len(CFG['samples']))
        else:
            idxs = [0]

        ### set parallelization
        if CFG['rproc']:
            jobinfo = []

        ### create out-directory
        if not os.path.exists(CFG['out_dirname']):
            os.makedirs(CFG['out_dirname'])

        ### create spladder sub-directory
        spladder_dir = os.path.join(CFG['out_dirname'], 'spladder')
        if not os.path.exists(spladder_dir):
            os.makedirs(spladder_dir)

        ### pre-process annotation, if necessary
        anno_ext = CFG['anno_fname'].split('.')[-1]
        if anno_ext != 'pickle':
            anno_pickle = CFG['anno_fname'] + '.pickle'
            if not os.path.exists(anno_pickle):
                if anno_ext in ['gff', 'gff3']:
                    (genes, CFG) = init.init_genes_gff3(CFG['anno_fname'], CFG, anno_pickle)
                elif anno_ext in ['gtf']:
                    (genes, CFG) = init.init_genes_gtf(CFG['anno_fname'], CFG, anno_pickle)
                else:
                    print >> sys.stderr, 'ERROR: Unknown annotation format. File needs to end in gtf or gff/gff3\nCurrent file: %s' % CFG['anno_fname']
                    sys.exit(1)
            CFG['anno_fname'] += '.pickle'

        ### add anotation contigs into lookup table
        if 'genes' not in CFG:
            # pickles are read in binary mode and the handle is closed right away
            with open(CFG['anno_fname'], 'rb') as anno_file:
                genes = cPickle.load(anno_file)
        else:
            genes = CFG['genes']
        CFG = init.append_chrms(sp.unique(sp.array([x.chr for x in genes], dtype='str')), CFG)
        del genes

        for idx in idxs:
            ### CFG_ keeps the full lists while CFG temporarily holds a single sample
            CFG_ = dict()
            if CFG['merge_strategy'] != 'merge_bams':
                CFG_['bam_fnames'] = CFG['bam_fnames']
                CFG_['samples'] = CFG['samples']
                CFG['bam_fnames'] = CFG['bam_fnames'][idx]
                CFG['samples'] = CFG['samples'][idx]
                CFG['out_fname'] = '%s/spladder/genes_graph_conf%i.%s.pickle' % (CFG['out_dirname'], CFG['confidence_level'], CFG['samples'])
            else:
                CFG['out_fname'] = '%s/spladder/genes_graph_conf%i.%s.pickle' % (CFG['out_dirname'], CFG['confidence_level'], CFG['merge_strategy'])

            ### assemble out filename to check if we are already done
            fn_out = CFG['out_fname']
            if CFG['do_prune']:
                fn_out = re.sub(r'\.pickle$', '_pruned.pickle', fn_out)
            if CFG['do_gen_isoforms']:
                fn_out = re.sub(r'\.pickle$', '_with_isoforms.pickle', fn_out)

            if os.path.exists(fn_out):
                print >> sys.stdout, 'All result files already exist.'
            else:
                if CFG['rproc']:
                    jobinfo.append(rp.rproc('spladder_core', CFG, 15000, CFG['options_rproc'], 40*60))
                else:
                    spladder_core(CFG)

            ### restore the entries that were narrowed down to a single sample
            for key in CFG_:
                try:
                    CFG[key] = CFG_[key].copy()
                except AttributeError:
                    CFG[key] = CFG_[key]

        ### collect results after parallelization
        if CFG['rproc']:
            rp.rproc_wait(jobinfo, 30, 1.0, -1)

        ### merge parts if necessary
        if CFG['merge_strategy'] == 'merge_graphs':
            run_merge(CFG)

    ### determine count output file
    if 'spladder_infile' not in CFG:
        if CFG['validate_splicegraphs']:
            fn_in_count = '%s/spladder/genes_graph_conf%i.%s%s.validated.pickle' % (CFG['out_dirname'], CFG['confidence_level'], CFG['merge_strategy'], prune_tag)
        else:
            fn_in_count = '%s/spladder/genes_graph_conf%i.%s%s.pickle' % (CFG['out_dirname'], CFG['confidence_level'], CFG['merge_strategy'], prune_tag)
    else:
        fn_in_count = CFG['spladder_infile']
    fn_out_count = fn_in_count.replace('.pickle', '') + '.count.pickle'

    ### count segment graph
    if not os.path.exists(fn_out_count):
        count_graph_coverage_wrapper(fn_in_count, fn_out_count, CFG)

    ### count intron coverage phenotype
    if CFG['count_intron_cov']:
        fn_out_intron_count = fn_out_count.replace('mat', 'introns.pickle')
        count_intron_coverage_wrapper(fn_in_count, fn_out_intron_count, CFG)

    ### handle alternative splicing part
    if CFG['run_as_analysis']:
        collect_events(CFG)

        for event_type in CFG['event_types']:
            analyze_events(CFG, event_type)