Ejemplo n.º 1
0
def main():
    """Relate TE overlaps to cuffdiff fold stats and write a corrected table.

    Expects two positional command-line arguments: a reference GTF and a
    cuffdiff .diff file.  The output directory is deleted and recreated,
    transcripts are optionally length-filtered, ``compute_stats`` is run on
    the TE -> gene and gene -> diff hashes, and ``table.txt`` is written with
    each table line followed by its ``fdr.ben_hoch`` corrected value.
    """
    usage = 'usage: %prog [options] <gtf> <diff>'
    parser = OptionParser(usage)
    parser.add_option('-m', dest='max_stat', default=None, type='float', help='Maximum stat for plotting [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='te_diff', help='Output directory [Default: %default]')
    parser.add_option('-c', dest='scale', default=1, type='float', help='CDF plot scale [Default: %default]')
    parser.add_option('-t', dest='te_gff', default='%s/hg19.fa.out.tp.gff'%os.environ['MASK'])

    parser.add_option('-s', dest='spread_factor', default=None, type='float', help='Allow multiplicative factor between the shortest and longest transcripts, used to filter [Default: %default]')
    parser.add_option('-l', dest='spread_lower', default=None, type='float', help='Allow multiplicative factor between median and shortest transcripts [Default: %default]')
    parser.add_option('-u', dest='spread_upper', default=None, type='float', help='Allow multiplicative factor between median and longest transcripts [Default: %default]')

    (options,args) = parser.parse_args()

    # parser.error() prints the usage and exits, so no else branch is needed
    if len(args) != 2:
        parser.error('Must provide .gtf and .diff files')
    ref_gtf, diff_file = args

    # clean plot directory
    if os.path.isdir(options.out_dir):
        shutil.rmtree(options.out_dir)
    os.mkdir(options.out_dir)

    # decide once whether a temporary length-filtered GTF is needed
    filter_lengths = bool(options.spread_factor or options.spread_lower or options.spread_upper)

    if filter_lengths:
        # filter for similar length

        if options.spread_factor:
            # split the overall factor evenly around the median
            options.spread_lower = math.sqrt(options.spread_factor)
            options.spread_upper = math.sqrt(options.spread_factor)

        spread_gtf = '%s/spread_factor.gtf' % options.out_dir
        # pass the upper bound as well; it used to be spread_lower twice,
        # which silently ignored -u
        # NOTE(review): if only one of -l/-u is given the other is None;
        # confirm gff.length_filter accepts that.
        gff.length_filter(ref_gtf, spread_gtf, options.spread_lower, options.spread_upper)

        ref_gtf = spread_gtf

    ##################################################
    # hash TEs -> genes
    ##################################################
    te_genes = te.hash_repeats_genes(ref_gtf, options.te_gff, gene_key='transcript_id', add_star=True, stranded=True)

    ##################################################
    # hash genes -> RIP diff
    ##################################################
    gene_diff = cuffdiff.hash_diff(diff_file, stat='fold', max_stat=options.max_stat, sample_first='input')

    ##################################################
    # compute stats and make plots
    ##################################################
    table_lines, pvals = compute_stats(te_genes, gene_diff, ref_gtf, options.out_dir, options.scale)

    # perform multiple hypothesis correction
    qvals = fdr.ben_hoch(pvals)

    # the context manager closes the table even if formatting fails
    with open('%s/table.txt' % options.out_dir, 'w') as table_out:
        for line, qval in zip(table_lines, qvals):
            table_out.write('%s %10.2e\n' % (line, qval))

    # remove the temporary filtered GTF
    if filter_lengths:
        os.remove(spread_gtf)
Ejemplo n.º 2
0
def main():
    """Regress cuffdiff fold stats on stranded TE presence, per sample pair.

    Expects two positional command-line arguments: a reference GTF and a
    cuffdiff .diff file.  The output directory is deleted and recreated.
    For every sample pair an OLS model ``diff ~ <fam>_fwd + <fam>_rev + ...``
    is fit over the families in ``regression_tes``; its summary is written
    to ``<sample1>-<sample2>.txt`` and one row per TE column is collected
    for ``table.txt``, where each row is followed by its ``fdr.ben_hoch``
    corrected value.
    """
    usage = 'usage: %prog [options] <gtf> <diff>'
    parser = OptionParser(usage)
    parser.add_option('-o', dest='out_dir', default='te_diff_regress', help='Output directory to print regression summaries [Default: %default]')
    parser.add_option('-t', dest='te_gff', default='%s/hg19.fa.out.tp.gff'%os.environ['MASK'])
    parser.add_option('-m', dest='max_stat', default=None, type='float', help='Maximum stat for plotting [Default: %default]')

    parser.add_option('-s', dest='spread_factor', default=None, type='float', help='Allow multiplicative factor between the shortest and longest transcripts, used to filter [Default: %default]')
    parser.add_option('-l', dest='spread_lower', default=None, type='float', help='Allow multiplicative factor between median and shortest transcripts [Default: %default]')
    parser.add_option('-u', dest='spread_upper', default=None, type='float', help='Allow multiplicative factor between median and longest transcripts [Default: %default]')

    (options,args) = parser.parse_args()

    # parser.error() prints the usage and exits, so no else branch is needed
    if len(args) != 2:
        parser.error('Must provide .gtf and .diff files')
    ref_gtf, diff_file = args

    # clean output directory
    if os.path.isdir(options.out_dir):
        shutil.rmtree(options.out_dir)
    os.mkdir(options.out_dir)

    # decide once whether a temporary length-filtered GTF is needed
    filter_lengths = bool(options.spread_factor or options.spread_lower or options.spread_upper)

    if filter_lengths:
        # filter for similar length

        if options.spread_factor:
            # split the overall factor evenly around the median
            options.spread_lower = math.sqrt(options.spread_factor)
            options.spread_upper = math.sqrt(options.spread_factor)

        spread_gtf = '%s/spread_factor.gtf' % options.out_dir
        # pass the upper bound as well; it used to be spread_lower twice,
        # which silently ignored -u
        # NOTE(review): if only one of -l/-u is given the other is None;
        # confirm gff.length_filter accepts that.
        gff.length_filter(ref_gtf, spread_gtf, options.spread_lower, options.spread_upper, verbose=True)

        ref_gtf = spread_gtf

    # hash genes -> TEs
    gene_tes = te.hash_genes_repeats(ref_gtf, options.te_gff, gene_key='transcript_id', add_star=True, stranded=True)

    # hash diffs stats
    gene_diffs = cuffdiff.hash_diff(diff_file, stat='fold', max_stat=options.max_stat, sample_first='input')

    # derive the (family, strand, data frame column) triples once; '/' and '-'
    # are stripped from the family name before it is used in the formula
    te_columns = []
    for fam in regression_tes:
        fam_key = fam.replace('/','_').replace('-','')
        te_columns.append((fam, '+', '%s_fwd' % fam_key))
        te_columns.append((fam, '-', '%s_rev' % fam_key))

    covariate_str = ' + '.join([te_key for _, _, te_key in te_columns])
    row_format = '%-17s  %1s  %-10s  %-10s  %6d  %8.3f  %8.3f  %10.2e'

    table_lines = []
    pvals = []

    for spair in gene_diffs:
        sample1, sample2 = spair

        # construct data frame over genes present in both hashes
        gene_list = list(set(gene_tes.keys()) & set(gene_diffs[spair].keys()))
        df = pd.DataFrame({'diff': [gene_diffs[spair][gene_id] for gene_id in gene_list]})

        # 1.0/0.0 indicator per gene for each family and strand
        for fam, orient, te_key in te_columns:
            df[te_key] = [float(('*',fam,orient) in gene_tes.get(gene_id,[])) for gene_id in gene_list]

        # regress
        mod = smf.ols(formula='diff ~ %s' % covariate_str, data=df).fit()

        # output model
        with open('%s/%s-%s.txt' % (options.out_dir, sample1, sample2), 'w') as mod_out:
            mod_out.write(str(mod.summary()) + '\n')

        # save table lines
        for fam, orient, te_key in te_columns:
            # NOTE(review): dividing by 0.5 doubles the model p-value; confirm
            # this is intended rather than halving it for a one-sided test.
            pval = mod.pvalues[te_key]/0.5
            cols = (fam, orient, sample1, sample2, sum(df[te_key]), mod.params[te_key], mod.tvalues[te_key], pval)
            table_lines.append(row_format % cols)
            pvals.append(pval)

    # perform multiple hypothesis correction
    qvals = fdr.ben_hoch(pvals)

    # the context manager closes the table even if formatting fails
    with open('%s/table.txt' % options.out_dir, 'w') as table_out:
        for line, qval in zip(table_lines, qvals):
            table_out.write('%s %10.2e\n' % (line, qval))

    # remove the temporary filtered GTF
    if filter_lengths:
        os.remove(spread_gtf)
Ejemplo n.º 3
0
def main():
    """Relate TE overlaps to cuffdiff fold stats and write a corrected table.

    Expects two positional command-line arguments: a reference GTF and a
    cuffdiff .diff file.  The output directory is deleted and recreated,
    transcripts are optionally length-filtered, ``compute_stats`` is run on
    the TE -> gene and gene -> diff hashes, and ``table.txt`` is written with
    each table line followed by its ``fdr.ben_hoch`` corrected value.
    """
    usage = 'usage: %prog [options] <gtf> <diff>'
    parser = OptionParser(usage)
    parser.add_option('-m',
                      dest='max_stat',
                      default=None,
                      type='float',
                      help='Maximum stat for plotting [Default: %default]')
    parser.add_option('-o',
                      dest='out_dir',
                      default='te_diff',
                      help='Output directory [Default: %default]')
    parser.add_option('-c',
                      dest='scale',
                      default=1,
                      type='float',
                      help='CDF plot scale [Default: %default]')
    parser.add_option('-t',
                      dest='te_gff',
                      default='%s/hg19.fa.out.tp.gff' % os.environ['MASK'])

    parser.add_option(
        '-s',
        dest='spread_factor',
        default=None,
        type='float',
        help=
        'Allow multiplicative factor between the shortest and longest transcripts, used to filter [Default: %default]'
    )
    parser.add_option(
        '-l',
        dest='spread_lower',
        default=None,
        type='float',
        help=
        'Allow multiplicative factor between median and shortest transcripts [Default: %default]'
    )
    parser.add_option(
        '-u',
        dest='spread_upper',
        default=None,
        type='float',
        help=
        'Allow multiplicative factor between median and longest transcripts [Default: %default]'
    )

    (options, args) = parser.parse_args()

    # parser.error() prints the usage and exits, so no else branch is needed
    if len(args) != 2:
        parser.error('Must provide .gtf and .diff files')
    ref_gtf, diff_file = args

    # clean plot directory
    if os.path.isdir(options.out_dir):
        shutil.rmtree(options.out_dir)
    os.mkdir(options.out_dir)

    # decide once whether a temporary length-filtered GTF is needed
    filter_lengths = bool(options.spread_factor or options.spread_lower
                          or options.spread_upper)

    if filter_lengths:
        # filter for similar length

        if options.spread_factor:
            # split the overall factor evenly around the median
            options.spread_lower = math.sqrt(options.spread_factor)
            options.spread_upper = math.sqrt(options.spread_factor)

        spread_gtf = '%s/spread_factor.gtf' % options.out_dir
        # pass the upper bound as well; it used to be spread_lower twice,
        # which silently ignored -u
        # NOTE(review): if only one of -l/-u is given the other is None;
        # confirm gff.length_filter accepts that.
        gff.length_filter(ref_gtf, spread_gtf, options.spread_lower,
                          options.spread_upper)

        ref_gtf = spread_gtf

    ##################################################
    # hash TEs -> genes
    ##################################################
    te_genes = te.hash_repeats_genes(ref_gtf,
                                     options.te_gff,
                                     gene_key='transcript_id',
                                     add_star=True,
                                     stranded=True)

    ##################################################
    # hash genes -> RIP diff
    ##################################################
    gene_diff = cuffdiff.hash_diff(diff_file,
                                   stat='fold',
                                   max_stat=options.max_stat,
                                   sample_first='input')

    ##################################################
    # compute stats and make plots
    ##################################################
    table_lines, pvals = compute_stats(te_genes, gene_diff, ref_gtf,
                                       options.out_dir, options.scale)

    # perform multiple hypothesis correction
    qvals = fdr.ben_hoch(pvals)

    # the context manager closes the table even if formatting fails
    with open('%s/table.txt' % options.out_dir, 'w') as table_out:
        for line, qval in zip(table_lines, qvals):
            table_out.write('%s %10.2e\n' % (line, qval))

    # remove the temporary filtered GTF
    if filter_lengths:
        os.remove(spread_gtf)
Ejemplo n.º 4
0
def main():
    """Plot cuffdiff fold stat quartiles against per-transcript TE counts.

    Expects two positional command-line arguments: a reference GTF and a
    cuffdiff .diff file.  For every sample pair, TE family in ``count_tes``
    and (with -r) orientation, the diff values are binned by TE count and
    the 25/50/75% quantiles of each bin are collected, stopping at the first
    bin holding 12 or fewer values.  When more than one bin survives, the
    table is handed to ``ggplot.plot`` with the ``te_diff_count.r`` script
    under ``$RDIR``, together with .pdf and .df paths in the output
    directory.
    """
    usage = 'usage: %prog [options] <gtf> <diff>'
    parser = OptionParser(usage)
    parser.add_option('-o', dest='out_dir', default='te_diff_regress', help='Output directory to print regression summaries [Default: %default]')
    parser.add_option('-c', dest='scale', default=1, type='float', help='Plot scale [Default: %default]')

    parser.add_option('-t', dest='te_gff', default='%s/hg19.fa.out.tp.gff'%os.environ['MASK'])
    parser.add_option('-r', dest='orientation', default=False, action='store_true', help='Split TEs by orientation [Default: %default]')

    parser.add_option('-m', dest='max_stat', default=None, type='float', help='Maximum stat for plotting [Default: %default]')

    parser.add_option('-s', dest='spread_factor', default=None, type='float', help='Allow multiplicative factor between the shortest and longest transcripts, used to filter [Default: %default]')
    parser.add_option('-l', dest='spread_lower', default=None, type='float', help='Allow multiplicative factor between median and shortest transcripts [Default: %default]')
    parser.add_option('-u', dest='spread_upper', default=None, type='float', help='Allow multiplicative factor between median and longest transcripts [Default: %default]')

    (options,args) = parser.parse_args()

    # parser.error() prints the usage and exits, so no else branch is needed
    if len(args) != 2:
        parser.error('Must provide .gtf and .diff files')
    ref_gtf, diff_file = args

    # make output directory
    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    # decide once whether a temporary length-filtered GTF is needed
    filter_lengths = bool(options.spread_factor or options.spread_lower or options.spread_upper)

    if filter_lengths:
        # filter for similar length

        if options.spread_factor:
            # split the overall factor evenly around the median
            options.spread_lower = math.sqrt(options.spread_factor)
            options.spread_upper = math.sqrt(options.spread_factor)

        spread_gtf = '%s/spread_factor.gtf' % options.out_dir
        # pass the upper bound as well; it used to be spread_lower twice,
        # which silently ignored -u
        # NOTE(review): if only one of -l/-u is given the other is None;
        # confirm gff.length_filter accepts that.
        gff.length_filter(ref_gtf, spread_gtf, options.spread_lower, options.spread_upper, verbose=True)

        ref_gtf = spread_gtf

    # hash genes -> TEs -> occurence num
    gene_te_num = te.hash_genes_repeats_num(ref_gtf, options.te_gff, gene_key='transcript_id', add_star=True, stranded=options.orientation)

    # hash diffs stats
    gene_diffs = cuffdiff.hash_diff(diff_file, stat='fold', max_stat=options.max_stat, sample_first='input')

    # without -r a single pass is made and the orientation is left out of
    # the lookup key and the file names
    if options.orientation:
        orients = ['+','-']
    else:
        orients = ['+']

    for spair in gene_diffs:
        sample1, sample2 = spair
        pair_diffs = gene_diffs[spair]

        for fam in count_tes:
            for orient in orients:
                if options.orientation:
                    te_key = ('*',fam,orient)
                else:
                    te_key = ('*',fam)

                # hash diff values by TE count; count_diff[c] collects the
                # stats of genes carrying exactly c TEs
                # NOTE(review): every gene in the diff hash is binned, and
                # those missing from gene_te_num count as 0 TEs; confirm this
                # is intended when length filtering is active.
                count_diff = []
                for gene_id in pair_diffs:
                    count = gene_te_num.get(gene_id,{}).get(te_key, 0)

                    while count >= len(count_diff):
                        count_diff.append([])
                    count_diff[count].append(pair_diffs[gene_id])

                df = {'TEs':[], 'stat_low':[], 'stat_mid':[], 'stat_hi':[]}
                for c, diffs in enumerate(count_diff):
                    # stop at the first bin holding 12 or fewer values
                    if len(diffs) <= 12:
                        break

                    stat_low, stat_mid, stat_hi = stats.quantile(diffs, [.25, .5, .75])
                    df['TEs'].append(c)
                    df['stat_low'].append(stat_low)
                    df['stat_mid'].append(stat_mid)
                    df['stat_hi'].append(stat_hi)

                # only plot when more than one count bin survived
                if len(df['TEs']) > 1:
                    # keep the part of the family name after the first '/'
                    fam_plot = fam[fam.find('/')+1:]

                    if options.orientation:
                        out_base = '%s/%s-%s_%s_%s' % (options.out_dir, sample1, sample2, fam_plot, orient)
                    else:
                        out_base = '%s/%s-%s_%s' % (options.out_dir, sample1, sample2, fam_plot)
                    out_pdf = '%s.pdf' % out_base
                    out_df = '%s.df' % out_base

                    ggplot.plot('%s/te_diff_count.r' % os.environ['RDIR'], df, [out_pdf, options.scale], df_file=out_df)

    # remove the temporary filtered GTF
    if filter_lengths:
        os.remove(spread_gtf)
Ejemplo n.º 5
0
def main():
    """Plot cuffdiff fold stat quartiles against per-transcript TE counts.

    Expects two positional command-line arguments: a reference GTF and a
    cuffdiff .diff file.  For every sample pair, TE family in ``count_tes``
    and (with -r) orientation, the diff values are binned by TE count and
    the 25/50/75% quantiles of each bin are collected, stopping at the first
    bin holding 12 or fewer values.  When more than one bin survives, the
    table is handed to ``ggplot.plot`` with the ``te_diff_count.r`` script
    under ``$RDIR``, together with .pdf and .df paths in the output
    directory.
    """
    usage = 'usage: %prog [options] <gtf> <diff>'
    parser = OptionParser(usage)
    parser.add_option(
        '-o',
        dest='out_dir',
        default='te_diff_regress',
        help=
        'Output directory to print regression summaries [Default: %default]')
    parser.add_option('-c',
                      dest='scale',
                      default=1,
                      type='float',
                      help='Plot scale [Default: %default]')

    parser.add_option('-t',
                      dest='te_gff',
                      default='%s/hg19.fa.out.tp.gff' % os.environ['MASK'])
    parser.add_option('-r',
                      dest='orientation',
                      default=False,
                      action='store_true',
                      help='Split TEs by orientation [Default: %default]')

    parser.add_option('-m',
                      dest='max_stat',
                      default=None,
                      type='float',
                      help='Maximum stat for plotting [Default: %default]')

    parser.add_option(
        '-s',
        dest='spread_factor',
        default=None,
        type='float',
        help=
        'Allow multiplicative factor between the shortest and longest transcripts, used to filter [Default: %default]'
    )
    parser.add_option(
        '-l',
        dest='spread_lower',
        default=None,
        type='float',
        help=
        'Allow multiplicative factor between median and shortest transcripts [Default: %default]'
    )
    parser.add_option(
        '-u',
        dest='spread_upper',
        default=None,
        type='float',
        help=
        'Allow multiplicative factor between median and longest transcripts [Default: %default]'
    )

    (options, args) = parser.parse_args()

    # parser.error() prints the usage and exits, so no else branch is needed
    if len(args) != 2:
        parser.error('Must provide .gtf and .diff files')
    ref_gtf, diff_file = args

    # make output directory
    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    # decide once whether a temporary length-filtered GTF is needed
    filter_lengths = bool(options.spread_factor or options.spread_lower
                          or options.spread_upper)

    if filter_lengths:
        # filter for similar length

        if options.spread_factor:
            # split the overall factor evenly around the median
            options.spread_lower = math.sqrt(options.spread_factor)
            options.spread_upper = math.sqrt(options.spread_factor)

        spread_gtf = '%s/spread_factor.gtf' % options.out_dir
        # pass the upper bound as well; it used to be spread_lower twice,
        # which silently ignored -u
        # NOTE(review): if only one of -l/-u is given the other is None;
        # confirm gff.length_filter accepts that.
        gff.length_filter(ref_gtf,
                          spread_gtf,
                          options.spread_lower,
                          options.spread_upper,
                          verbose=True)

        ref_gtf = spread_gtf

    # hash genes -> TEs -> occurence num
    gene_te_num = te.hash_genes_repeats_num(ref_gtf,
                                            options.te_gff,
                                            gene_key='transcript_id',
                                            add_star=True,
                                            stranded=options.orientation)

    # hash diffs stats
    gene_diffs = cuffdiff.hash_diff(diff_file,
                                    stat='fold',
                                    max_stat=options.max_stat,
                                    sample_first='input')

    # without -r a single pass is made and the orientation is left out of
    # the lookup key and the file names
    if options.orientation:
        orients = ['+', '-']
    else:
        orients = ['+']

    for spair in gene_diffs:
        sample1, sample2 = spair
        pair_diffs = gene_diffs[spair]

        for fam in count_tes:
            for orient in orients:
                if options.orientation:
                    te_key = ('*', fam, orient)
                else:
                    te_key = ('*', fam)

                # hash diff values by TE count; count_diff[c] collects the
                # stats of genes carrying exactly c TEs
                # NOTE(review): every gene in the diff hash is binned, and
                # those missing from gene_te_num count as 0 TEs; confirm this
                # is intended when length filtering is active.
                count_diff = []
                for gene_id in pair_diffs:
                    count = gene_te_num.get(gene_id, {}).get(te_key, 0)

                    while count >= len(count_diff):
                        count_diff.append([])
                    count_diff[count].append(pair_diffs[gene_id])

                df = {'TEs': [], 'stat_low': [], 'stat_mid': [], 'stat_hi': []}
                for c, diffs in enumerate(count_diff):
                    # stop at the first bin holding 12 or fewer values
                    if len(diffs) <= 12:
                        break

                    stat_low, stat_mid, stat_hi = stats.quantile(
                        diffs, [.25, .5, .75])
                    df['TEs'].append(c)
                    df['stat_low'].append(stat_low)
                    df['stat_mid'].append(stat_mid)
                    df['stat_hi'].append(stat_hi)

                # only plot when more than one count bin survived
                if len(df['TEs']) > 1:
                    # keep the part of the family name after the first '/'
                    fam_plot = fam[fam.find('/') + 1:]

                    if options.orientation:
                        out_base = '%s/%s-%s_%s_%s' % (options.out_dir,
                                                       sample1, sample2,
                                                       fam_plot, orient)
                    else:
                        out_base = '%s/%s-%s_%s' % (options.out_dir, sample1,
                                                    sample2, fam_plot)
                    out_pdf = '%s.pdf' % out_base
                    out_df = '%s.df' % out_base

                    ggplot.plot('%s/te_diff_count.r' % os.environ['RDIR'],
                                df, [out_pdf, options.scale],
                                df_file=out_df)

    # remove the temporary filtered GTF
    if filter_lengths:
        os.remove(spread_gtf)