Esempio n. 1
0
def main():
    """Compute TE-associated differential expression stats and write a q-value table.

    Command line: ``[options] <gtf> <diff>``.

    Steps, as visible here:
      1. Hash transcripts to the TEs they overlap (te.hash_genes_repeats).
      2. Give every transcript without a TE entry a placeholder family
         ('-', '-', '*') so that it still takes part in the statistics.
      3. Collect diff statistics (get_diff_stats) and compute per-TE stats
         (compute_stats), which also receives the output directory.
      4. Correct the p-values with fdr.ben_hoch and write
         ``<out_dir>/table.txt``, one table line plus q-value per row.

    The output directory is deleted and recreated on every run.
    """
    usage = 'usage: %prog [options] <gtf> <diff>'
    parser = OptionParser(usage)
    parser.add_option('-o', dest='out_dir', default='te_diff', help='Output directory [Default: %default]')
    # The MASK-based default is resolved after parsing so that an explicit -t
    # works even when the MASK environment variable is not set.
    parser.add_option('-t', dest='te_gff', default=None, help='TE annotation GFF [Default: $MASK/hg19.fa.out.tpf.gff]')
    (options, args) = parser.parse_args()

    # parser.error() exits, so no else branch is needed
    if len(args) != 2:
        parser.error('Must provide .gtf and .diff files')
    gtf_file, diff_file = args

    if options.te_gff is None:
        if 'MASK' not in os.environ:
            parser.error('Must provide -t or set the MASK environment variable')
        options.te_gff = '%s/hg19.fa.out.tpf.gff' % os.environ['MASK']

    # hash genes -> TEs
    gene_tes = te.hash_genes_repeats(gtf_file, options.te_gff, gene_key='transcript_id', add_star=True, stranded=True)

    # create a fake family for unrepetitive genes
    with open(gtf_file) as gtf_in:
        for line in gtf_in:
            a = line.split('\t')
            gene_id = gff.gtf_kv(a[8])['transcript_id']
            if gene_id not in gene_tes:
                gene_tes[gene_id] = set([('-', '-', '*')])

    # get diffs stats
    gene_diffs, te_diffs = get_diff_stats(diff_file, gene_tes)

    # clean plot directory
    if os.path.isdir(options.out_dir):
        shutil.rmtree(options.out_dir)
    os.mkdir(options.out_dir)

    # stats
    table_lines, pvals = compute_stats(te_diffs, gene_diffs, options.out_dir)

    # perform multiple hypothesis correction
    qvals = fdr.ben_hoch(pvals)

    # one row per table line, with its corrected value appended
    with open('%s/table.txt' % options.out_dir, 'w') as table_out:
        for table_line, qval in zip(table_lines, qvals):
            table_out.write('%s %10.2e\n' % (table_line, qval))
Esempio n. 2
0
def main():
    """Compute per-TE differential expression stats/plots and write a q-value table.

    Command line: ``[options] <gtf> <diff>``.

    Steps, as visible here:
      1. Optionally filter the reference GTF by transcript length
         (gff.length_filter) when any of -s/-l/-u is given; -s sets both
         the lower and the upper spread to sqrt(spread_factor).
      2. Hash TEs to the transcripts they overlap (te.hash_repeats_genes).
      3. Hash transcripts to their fold-change statistic (cuffdiff.hash_diff).
      4. Compute statistics and plots (compute_stats), correct the p-values
         with fdr.ben_hoch and write ``<out_dir>/table.txt``.

    The output directory is deleted and recreated on every run; the
    temporary length-filtered GTF is removed at the end.
    """
    usage = 'usage: %prog [options] <gtf> <diff>'
    parser = OptionParser(usage)
    parser.add_option('-m', dest='max_stat', default=None, type='float', help='Maximum stat for plotting [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='te_diff', help='Output directory [Default: %default]')
    parser.add_option('-c', dest='scale', default=1, type='float', help='CDF plot scale [Default: %default]')
    # The MASK-based default is resolved after parsing so that an explicit -t
    # works even when the MASK environment variable is not set.
    parser.add_option('-t', dest='te_gff', default=None, help='TE annotation GFF [Default: $MASK/hg19.fa.out.tp.gff]')

    parser.add_option('-s', dest='spread_factor', default=None, type='float', help='Allow multiplicative factor between the shortest and longest transcripts, used to filter [Default: %default]')
    parser.add_option('-l', dest='spread_lower', default=None, type='float', help='Allow multiplicative factor between median and shortest transcripts [Default: %default]')
    parser.add_option('-u', dest='spread_upper', default=None, type='float', help='Allow multiplicative factor between median and longest transcripts [Default: %default]')

    (options, args) = parser.parse_args()

    # parser.error() exits, so no else branch is needed
    if len(args) != 2:
        parser.error('Must provide .gtf and .diff files')
    ref_gtf, diff_file = args

    if options.te_gff is None:
        if 'MASK' not in os.environ:
            parser.error('Must provide -t or set the MASK environment variable')
        options.te_gff = '%s/hg19.fa.out.tp.gff' % os.environ['MASK']

    # clean plot directory
    if os.path.isdir(options.out_dir):
        shutil.rmtree(options.out_dir)
    os.mkdir(options.out_dir)

    # path of the temporary length-filtered GTF, or None when no filter is requested
    spread_gtf = None
    if options.spread_factor or options.spread_lower or options.spread_upper:
        # filter for similar length

        if options.spread_factor:
            options.spread_lower = math.sqrt(options.spread_factor)
            options.spread_upper = math.sqrt(options.spread_factor)

        spread_gtf = '%s/spread_factor.gtf' % options.out_dir
        # Pass the upper spread as the upper bound; the lower spread used to be
        # passed twice, which silently ignored -u.
        # NOTE(review): when only one of -l/-u is given the other bound is
        # None -- confirm gff.length_filter accepts that.
        gff.length_filter(ref_gtf, spread_gtf, options.spread_lower, options.spread_upper)

        ref_gtf = spread_gtf

    ##################################################
    # hash TEs -> genes
    ##################################################
    te_genes = te.hash_repeats_genes(ref_gtf, options.te_gff, gene_key='transcript_id', add_star=True, stranded=True)

    ##################################################
    # hash genes -> RIP diff
    ##################################################
    gene_diff = cuffdiff.hash_diff(diff_file, stat='fold', max_stat=options.max_stat, sample_first='input')

    ##################################################
    # compute stats and make plots
    ##################################################
    table_lines, pvals = compute_stats(te_genes, gene_diff, ref_gtf, options.out_dir, options.scale)

    # perform multiple hypothesis correction
    qvals = fdr.ben_hoch(pvals)

    with open('%s/table.txt' % options.out_dir, 'w') as table_out:
        for table_line, qval in zip(table_lines, qvals):
            table_out.write('%s %10.2e\n' % (table_line, qval))

    if spread_gtf is not None:
        os.remove(spread_gtf)
Esempio n. 3
0
def main():
    """Test TE (repeat, family) classes for enrichment of features with a binomial model.

    Command line: ``[options] <feature gff>``.

    Steps, as visible here:
      1. With -g, merge the filter GFF (sortBed | mergeBed) and restrict both
         the repeats and the features to it with intersectBed, using
         temporary files.
      2. Compute feature statistics, the size of the search space and the
         per-TE target lengths (feature_stats, count_bed/count_hg19,
         te_target_size).
      3. Intersect repeats with features and collect, per (repeat, family),
         per ('*', family) and per ('*', '*'), the set of overlapping
         features. With -s the repeat name gets a '+' suffix when TE and
         feature strands agree and '-' otherwise.
      4. For every class compute observed and expected counts, fold change
         and a binomial tail p-value, correct with fdr.ben_hoch and print
         one row per class to stdout.

    Temporary files are removed on every exit path, including the early
    "Zero features" exit.
    """
    usage = 'usage: %prog [options] <feature gff>'
    parser = OptionParser(usage)
    parser.add_option('-g', dest='filter_gff', help='Filter the TEs by overlap with genes in the given gff file [Default: %default]')
    parser.add_option('-r', dest='repeats_gff', default='%s/research/common/data/genomes/hg19/annotation/repeatmasker/hg19.fa.out.tp.gff' % os.environ['HOME'])
    parser.add_option('-s', dest='strand_split', default=False, action='store_true', help='Split statistics by strand [Default: %default]')
    (options, args) = parser.parse_args()

    # parser.error() exits, so no else branch is needed
    if len(args) != 1:
        parser.error('Must provide a gff file for the feature of interest.')
    feature_gff = args[0]

    # (fd, path) pairs returned by mkstemp; released in the finally block
    temp_files = []
    try:
        ############################################
        # GFF filter
        ############################################
        # filter TEs and features by gff file
        if options.filter_gff:
            filter_merged_bed_fd, filter_merged_bed_file = tempfile.mkstemp()
            temp_files.append((filter_merged_bed_fd, filter_merged_bed_file))
            subprocess.call('sortBed -i %s | mergeBed -i - > %s' % (options.filter_gff, filter_merged_bed_file), shell=True)

            # filter TE GFF
            te_gff_fd, te_gff_file = tempfile.mkstemp()
            temp_files.append((te_gff_fd, te_gff_file))
            subprocess.call('intersectBed -a %s -b %s > %s' % (options.repeats_gff, filter_merged_bed_file, te_gff_file), shell=True)
            options.repeats_gff = te_gff_file

            # filter feature GFF
            feature_gff_gff_fd, feature_gff_gff_file = tempfile.mkstemp()
            temp_files.append((feature_gff_gff_fd, feature_gff_gff_file))
            subprocess.call('intersectBed -u -f 0.5 -a %s -b %s > %s' % (feature_gff, filter_merged_bed_file, feature_gff_gff_file), shell=True)
            feature_gff = feature_gff_gff_file

        ############################################
        # lengths
        ############################################
        # compute feature length
        feature_len, feature_num = feature_stats(feature_gff)

        if feature_num == 0:
            sys.stderr.write('Zero features\n')
            # exit status stays 0, as before; the finally block cleans up
            sys.exit()

        # compute size of search space
        if options.filter_gff:
            genome_length = count_bed(filter_merged_bed_file, feature_len)
        else:
            genome_length = count_hg19()

        # hash counted repeat genomic bp
        te_lengths = te_target_size(options.repeats_gff, feature_len)

        ############################################
        # hash TE/feature overlaps
        ############################################
        # initialize every class with an empty set so that classes without
        # any overlap are still reported
        te_features = {}
        for rep, fam in te_lengths:
            if options.strand_split:
                te_features[(rep+'+', fam)] = set()
                te_features[('*+', fam)] = set()
                te_features[('*+', '*')] = set()
                te_features[(rep+'-', fam)] = set()
                te_features[('*-', fam)] = set()
                te_features[('*-', '*')] = set()
            else:
                te_features[(rep, fam)] = set()
                te_features[('*', fam)] = set()
                te_features[('*', '*')] = set()

        p = subprocess.Popen('intersectBed -wo -a %s -b %s' % (options.repeats_gff, feature_gff), shell=True, stdout=subprocess.PIPE)
        for line in p.stdout:
            a = line.split('\t')

            # columns 0-8 are the repeat record, columns 9+ the feature record
            kv = gff.gtf_kv(a[8])
            rep = kv['repeat']
            fam = kv['family']

            feature = (a[9], int(a[12]), int(a[13]))

            rep_star = '*'
            if options.strand_split:
                # '+' marks same-strand overlaps, '-' opposite-strand ones
                orient = '+' if a[6] == a[15] else '-'
                rep += orient
                rep_star += orient

            te_features[(rep, fam)].add(feature)
            te_features[(rep_star, fam)].add(feature)
            te_features[(rep_star, '*')].add(feature)

        p.communicate()

        ############################################
        # compute stats and print
        ############################################
        lines = []
        p_vals = []
        for (rep, fam), features in te_features.items():
            if options.strand_split:
                # drop the orientation suffix to look up the length; the
                # probability is halved for each orientation
                te_len = te_lengths[(rep[:-1], fam)]
                te_p = float(te_len) / (2*genome_length)
            else:
                te_len = te_lengths[(rep, fam)]
                te_p = float(te_len) / genome_length

            te_count = len(features)
            exp_count = te_p * feature_num

            fold_change = te_count / exp_count

            # upper tail for enrichment, lower tail for depletion
            if fold_change > 1:
                p_val = binom.sf(te_count-1, feature_num, te_p)
            else:
                p_val = binom.cdf(te_count, feature_num, te_p)

            p_vals.append(p_val)

            cols = (rep, fam, te_len, te_count, exp_count, fold_change, p_val)
            lines.append('%-18s %-18s %9d %8d %8.1f %8.2f %10.2e' % cols)

        # correct for multiple hypotheses correction
        q_vals = fdr.ben_hoch(p_vals)
        for stat_line, q_val in zip(lines, q_vals):
            sys.stdout.write(stat_line + ' %10.2e' % q_val + '\n')

    finally:
        ############################################
        # clean
        ############################################
        for temp_fd, temp_file in temp_files:
            os.close(temp_fd)
            os.remove(temp_file)
Esempio n. 4
0
def main():
    """Test TE classes for enrichment of feature bp against a shuffled null.

    Command line: ``[options] <feature gff/bed>``. The feature file format is
    taken from the last three characters of its name ('gtf', 'gff' or 'bed').

    Steps, as visible here:
      1. With -g, restrict both the repeats and the features to the given
         GFF with intersectBed, using temporary files.
      2. Compute the search space size, feature statistics and the genomic
         bp per TE (count_gff/count_hg19, feature_stats, hash_te).
      3. Convert the features to BED when needed and shuffle them
         ``null_iterations`` times with shuffleBed, hashing the TE overlap
         of every shuffle (intersect_hash) to build a null distribution.
      4. Hash the real overlap, compute the fold change and a normal tail
         p-value against the null mean/sd per TE, correct with fdr.ben_hoch
         and print one row per TE to stdout.

    Temporary files are removed on every exit path, including the early
    "Zero features" exit.
    """
    usage = 'usage: %prog [options] <feature gff/bed>'
    parser = OptionParser(usage)
    parser.add_option('-g', dest='gff_file', help='Filter the TEs by overlap with genes in the given gff file [Default: %default]')
    parser.add_option('-r', dest='repeats_gff', default='%s/research/common/data/genomes/hg19/annotation/repeatmasker/hg19.fa.out.tp.gff' % home_dir)
    parser.add_option('-n', dest='null_iterations', type=int, default=50, help='Number of shuffles to perform to estimate null distribution [Default: %default]')
    (options, args) = parser.parse_args()

    # parser.error() exits, so no else branch is needed
    if len(args) != 1:
        parser.error('Must provide a gff file for the feature of interest.')
    feature_gff = args[0]

    # Record the format from the user's file name now: with -g, feature_gff is
    # replaced below by a suffix-less temp file, which used to make the format
    # check fail every time. Checking up front also avoids doing work (and
    # creating temp files) for an unsupported input.
    feature_suffix = feature_gff[-3:]
    if feature_suffix not in ('gtf', 'gff', 'bed'):
        parser.error('Cannot recognize gff format suffix')

    # (fd, path) pairs returned by mkstemp; released in the finally block
    temp_files = []
    try:
        ############################################
        # GFF filter
        ############################################
        # filter TEs and features by gff file
        if options.gff_file:
            # filter TE GFF
            te_gff_fd, te_gff_file = tempfile.mkstemp()
            temp_files.append((te_gff_fd, te_gff_file))
            subprocess.call('intersectBed -a %s -b %s > %s' % (options.repeats_gff, options.gff_file, te_gff_file), shell=True)
            options.repeats_gff = te_gff_file

            # filter feature GFF
            feature_gff_gff_fd, feature_gff_gff_file = tempfile.mkstemp()
            temp_files.append((feature_gff_gff_fd, feature_gff_gff_file))
            subprocess.call('intersectBed -s -u -f 0.5 -a %s -b %s > %s' % (feature_gff, options.gff_file, feature_gff_gff_file), shell=True)
            feature_gff = feature_gff_gff_file

        ############################################
        # lengths
        ############################################
        # compute size of search space
        if options.gff_file:
            genome_length = count_gff(options.gff_file)
        else:
            genome_length = count_hg19()

        # compute feature length
        feature_len, feature_num = feature_stats(feature_gff)

        if feature_num == 0:
            sys.stderr.write('Zero features\n')
            # exit status stays 0, as before; the finally block cleans up
            sys.exit()

        # hash counted repeat genomic bp
        with open(options.repeats_gff) as te_in:
            genome_te_bp = hash_te(te_in)

        ############################################
        # convert feature gff to bed
        ############################################
        if feature_suffix == 'bed':
            feature_bed_file = feature_gff
        else:
            converter = {'gtf': 'gtf2bed.py', 'gff': 'gff2bed.py'}[feature_suffix]
            feature_bed_fd, feature_bed_file = tempfile.mkstemp()
            temp_files.append((feature_bed_fd, feature_bed_file))
            subprocess.call('%s %s > %s' % (converter, feature_gff, feature_bed_file), shell=True)

        ############################################
        # null distribution
        ############################################
        shuffle_bed_fd, shuffle_bed_file = tempfile.mkstemp()
        temp_files.append((shuffle_bed_fd, shuffle_bed_file))

        # TE -> list of overlap bp, one entry per shuffle
        te_null_bp = {}
        for ni in range(options.null_iterations):
            # progress indicator
            sys.stderr.write('%s\n' % ni)

            # shuffle feature bed
            subprocess.call('shuffleBed -i %s -g %s/research/common/data/genomes/hg19/assembly/human.hg19.genome -excl %s/research/common/data/genomes/hg19/assembly/hg19_gaps.bed > %s' % (feature_bed_file, home_dir, home_dir, shuffle_bed_file), shell=True)

            # intersect w/ TEs and hash overlaps
            te_tmp_bp = intersect_hash(options.repeats_gff, shuffle_bed_file)
            for te in genome_te_bp:
                te_null_bp.setdefault(te, []).append(te_tmp_bp.get(te, 0))

        ############################################
        # actual
        ############################################
        te_bp = intersect_hash(options.repeats_gff, feature_gff)

        ############################################
        # compute stats and print
        ############################################
        lines = []
        p_vals = []
        for te in genome_te_bp:
            # TEs without any overlap count as 0 bp
            te_obs_bp = te_bp.get(te, 0)

            feature_freq = float(te_obs_bp)/feature_len
            genome_freq = float(genome_te_bp[te])/genome_length
            fold_change = feature_freq / genome_freq

            null_u, null_sd = stats.mean_sd(te_null_bp[te])
            if null_sd == 0:
                # avoid a zero scale in the normal distribution
                null_sd = 1.0

            # upper tail for enrichment, lower tail for depletion
            if fold_change > 1:
                p_val = norm.sf(te_obs_bp-1, loc=null_u, scale=null_sd)
            else:
                p_val = norm.cdf(te_obs_bp, loc=null_u, scale=null_sd)

            p_vals.append(p_val)

            cols = (te[0], te[1], te_obs_bp, feature_freq, genome_freq, fold_change, p_val)
            lines.append('%-18s %-18s %8d %11.2e %11.2e %9.2f %10.2e' % cols)

        # correct for multiple hypotheses correction
        q_vals = fdr.ben_hoch(p_vals)
        for stat_line, q_val in zip(lines, q_vals):
            sys.stdout.write(stat_line + ' %10.2e' % q_val + '\n')

    finally:
        ############################################
        # clean
        ############################################
        for temp_fd, temp_file in temp_files:
            os.close(temp_fd)
            os.remove(temp_file)
Esempio n. 5
0
def main():
    """Regress transcript differential expression on TE family presence.

    Command line: ``[options] <gtf> <diff>``.

    Steps, as visible here:
      1. Optionally filter the reference GTF by transcript length
         (gff.length_filter) when any of -s/-l/-u is given; -s sets both
         the lower and the upper spread to sqrt(spread_factor).
      2. Hash transcripts to TEs (te.hash_genes_repeats) and to fold-change
         statistics per sample pair (cuffdiff.hash_diff).
      3. For every sample pair, build a data frame with one 0/1 column per
         family in ``regression_tes`` and strand ('<family>_fwd',
         '<family>_rev'), fit an OLS model ``diff ~ <columns>`` and write
         its summary to ``<out_dir>/<sample1>-<sample2>.txt``.
      4. Collect one table line per family/strand/sample pair, correct the
         p-values with fdr.ben_hoch and write ``<out_dir>/table.txt``.

    The output directory is deleted and recreated on every run; the
    temporary length-filtered GTF is removed at the end.
    """
    usage = 'usage: %prog [options] <gtf> <diff>'
    parser = OptionParser(usage)
    parser.add_option('-o', dest='out_dir', default='te_diff_regress', help='Output directory to print regression summaries [Default: %default]')
    # The MASK-based default is resolved after parsing so that an explicit -t
    # works even when the MASK environment variable is not set.
    parser.add_option('-t', dest='te_gff', default=None, help='TE annotation GFF [Default: $MASK/hg19.fa.out.tp.gff]')
    parser.add_option('-m', dest='max_stat', default=None, type='float', help='Maximum stat for plotting [Default: %default]')

    parser.add_option('-s', dest='spread_factor', default=None, type='float', help='Allow multiplicative factor between the shortest and longest transcripts, used to filter [Default: %default]')
    parser.add_option('-l', dest='spread_lower', default=None, type='float', help='Allow multiplicative factor between median and shortest transcripts [Default: %default]')
    parser.add_option('-u', dest='spread_upper', default=None, type='float', help='Allow multiplicative factor between median and longest transcripts [Default: %default]')

    (options, args) = parser.parse_args()

    # parser.error() exits, so no else branch is needed
    if len(args) != 2:
        parser.error('Must provide .gtf and .diff files')
    ref_gtf, diff_file = args

    if options.te_gff is None:
        if 'MASK' not in os.environ:
            parser.error('Must provide -t or set the MASK environment variable')
        options.te_gff = '%s/hg19.fa.out.tp.gff' % os.environ['MASK']

    # clean output directory
    if os.path.isdir(options.out_dir):
        shutil.rmtree(options.out_dir)
    os.mkdir(options.out_dir)

    # path of the temporary length-filtered GTF, or None when no filter is requested
    spread_gtf = None
    if options.spread_factor or options.spread_lower or options.spread_upper:
        # filter for similar length

        if options.spread_factor:
            options.spread_lower = math.sqrt(options.spread_factor)
            options.spread_upper = math.sqrt(options.spread_factor)

        spread_gtf = '%s/spread_factor.gtf' % options.out_dir
        # Pass the upper spread as the upper bound; the lower spread used to be
        # passed twice, which silently ignored -u.
        # NOTE(review): when only one of -l/-u is given the other bound is
        # None -- confirm gff.length_filter accepts that.
        gff.length_filter(ref_gtf, spread_gtf, options.spread_lower, options.spread_upper, verbose=True)

        ref_gtf = spread_gtf

    # hash genes -> TEs
    gene_tes = te.hash_genes_repeats(ref_gtf, options.te_gff, gene_key='transcript_id', add_star=True, stranded=True)

    # hash diffs stats
    gene_diffs = cuffdiff.hash_diff(diff_file, stat='fold', max_stat=options.max_stat, sample_first='input')

    table_lines = []
    pvals = []

    for spair in gene_diffs:
        sample1, sample2 = spair

        # construct data frame from the transcripts present in both hashes
        gene_list = list(set(gene_tes.keys()) & set(gene_diffs[spair].keys()))
        df = pd.DataFrame({'diff': [gene_diffs[spair][gene_id] for gene_id in gene_list]})

        # (family, strand, column name) in fwd/rev order per family
        covariates = []
        for fam in regression_tes:
            # column names must be usable inside the regression formula
            fam_key = fam.replace('/', '_').replace('-', '')
            for strand, suffix in (('+', 'fwd'), ('-', 'rev')):
                te_key = '%s_%s' % (fam_key, suffix)
                df[te_key] = [1.0*(('*', fam, strand) in gene_tes.get(gene_id, [])) for gene_id in gene_list]
                covariates.append((fam, strand, te_key))

        covariate_str = ' + '.join([te_key for _, _, te_key in covariates])

        # regress
        mod = smf.ols(formula='diff ~ %s' % covariate_str, data=df).fit()

        # output model
        with open('%s/%s-%s.txt' % (options.out_dir, sample1, sample2), 'w') as mod_out:
            mod_out.write('%s\n' % mod.summary())

        # save table lines
        for fam, strand, te_key in covariates:
            # NOTE(review): dividing by 0.5 doubles the p-value reported by the
            # fit, so it can exceed 1 -- kept as is, confirm this is intended.
            cols = (fam, strand, sample1, sample2, sum(df[te_key]), mod.params[te_key], mod.tvalues[te_key], mod.pvalues[te_key]/0.5)
            table_lines.append('%-17s  %1s  %-10s  %-10s  %6d  %8.3f  %8.3f  %10.2e' % cols)
            pvals.append(cols[-1])

    # perform multiple hypothesis correction
    qvals = fdr.ben_hoch(pvals)

    with open('%s/table.txt' % options.out_dir, 'w') as table_out:
        for table_line, qval in zip(table_lines, qvals):
            table_out.write('%s %10.2e\n' % (table_line, qval))

    if spread_gtf is not None:
        os.remove(spread_gtf)
Esempio n. 6
0
def main():
    """Test TE (repeat, family) classes for enrichment of features with a binomial model.

    Command line: ``[options] <feature gff>``.

    Steps, as visible here:
      1. With -g, merge the filter GFF (sortBed | mergeBed) and restrict both
         the repeats and the features to it with intersectBed, using
         temporary files.
      2. Compute feature statistics, the size of the search space and the
         per-TE target lengths (feature_stats, count_bed/count_hg19,
         te_target_size).
      3. Intersect repeats with features and collect, per (repeat, family),
         per ('*', family) and per ('*', '*'), the set of overlapping
         features. With -s the repeat name gets a '+' suffix when TE and
         feature strands agree and '-' otherwise.
      4. For every class compute observed and expected counts, fold change
         and a binomial tail p-value, correct with fdr.ben_hoch and print
         one row per class to stdout.

    Temporary files are removed on every exit path, including the early
    "Zero features" exit.
    """
    usage = 'usage: %prog [options] <feature gff>'
    parser = OptionParser(usage)
    parser.add_option(
        '-g',
        dest='filter_gff',
        help=
        'Filter the TEs by overlap with genes in the given gff file [Default: %default]'
    )
    parser.add_option(
        '-r',
        dest='repeats_gff',
        default=
        '%s/research/common/data/genomes/hg19/annotation/repeatmasker/hg19.fa.out.tp.gff'
        % os.environ['HOME'])
    parser.add_option('-s',
                      dest='strand_split',
                      default=False,
                      action='store_true',
                      help='Split statistics by strand [Default: %default]')
    (options, args) = parser.parse_args()

    # parser.error() exits, so no else branch is needed
    if len(args) != 1:
        parser.error('Must provide a gff file for the feature of interest.')
    feature_gff = args[0]

    # (fd, path) pairs returned by mkstemp; released in the finally block
    temp_files = []
    try:
        ############################################
        # GFF filter
        ############################################
        # filter TEs and features by gff file
        if options.filter_gff:
            filter_merged_bed_fd, filter_merged_bed_file = tempfile.mkstemp()
            temp_files.append((filter_merged_bed_fd, filter_merged_bed_file))
            subprocess.call('sortBed -i %s | mergeBed -i - > %s' %
                            (options.filter_gff, filter_merged_bed_file),
                            shell=True)

            # filter TE GFF
            te_gff_fd, te_gff_file = tempfile.mkstemp()
            temp_files.append((te_gff_fd, te_gff_file))
            subprocess.call(
                'intersectBed -a %s -b %s > %s' %
                (options.repeats_gff, filter_merged_bed_file, te_gff_file),
                shell=True)
            options.repeats_gff = te_gff_file

            # filter feature GFF
            feature_gff_gff_fd, feature_gff_gff_file = tempfile.mkstemp()
            temp_files.append((feature_gff_gff_fd, feature_gff_gff_file))
            subprocess.call(
                'intersectBed -u -f 0.5 -a %s -b %s > %s' %
                (feature_gff, filter_merged_bed_file, feature_gff_gff_file),
                shell=True)
            feature_gff = feature_gff_gff_file

        ############################################
        # lengths
        ############################################
        # compute feature length
        feature_len, feature_num = feature_stats(feature_gff)

        if feature_num == 0:
            sys.stderr.write('Zero features\n')
            # exit status stays 0, as before; the finally block cleans up
            sys.exit()

        # compute size of search space
        if options.filter_gff:
            genome_length = count_bed(filter_merged_bed_file, feature_len)
        else:
            genome_length = count_hg19()

        # hash counted repeat genomic bp
        te_lengths = te_target_size(options.repeats_gff, feature_len)

        ############################################
        # hash TE/feature overlaps
        ############################################
        # initialize every class with an empty set so that classes without
        # any overlap are still reported
        te_features = {}
        for rep, fam in te_lengths:
            if options.strand_split:
                te_features[(rep + '+', fam)] = set()
                te_features[('*+', fam)] = set()
                te_features[('*+', '*')] = set()
                te_features[(rep + '-', fam)] = set()
                te_features[('*-', fam)] = set()
                te_features[('*-', '*')] = set()
            else:
                te_features[(rep, fam)] = set()
                te_features[('*', fam)] = set()
                te_features[('*', '*')] = set()

        p = subprocess.Popen('intersectBed -wo -a %s -b %s' %
                             (options.repeats_gff, feature_gff),
                             shell=True,
                             stdout=subprocess.PIPE)
        for line in p.stdout:
            a = line.split('\t')

            # columns 0-8 are the repeat record, columns 9+ the feature record
            kv = gff.gtf_kv(a[8])
            rep = kv['repeat']
            fam = kv['family']

            feature = (a[9], int(a[12]), int(a[13]))

            rep_star = '*'
            if options.strand_split:
                # '+' marks same-strand overlaps, '-' opposite-strand ones
                orient = '+' if a[6] == a[15] else '-'
                rep += orient
                rep_star += orient

            te_features[(rep, fam)].add(feature)
            te_features[(rep_star, fam)].add(feature)
            te_features[(rep_star, '*')].add(feature)

        p.communicate()

        ############################################
        # compute stats and print
        ############################################
        lines = []
        p_vals = []
        for (rep, fam), features in te_features.items():
            if options.strand_split:
                # drop the orientation suffix to look up the length; the
                # probability is halved for each orientation
                te_len = te_lengths[(rep[:-1], fam)]
                te_p = float(te_len) / (2 * genome_length)
            else:
                te_len = te_lengths[(rep, fam)]
                te_p = float(te_len) / genome_length

            te_count = len(features)
            exp_count = te_p * feature_num

            fold_change = te_count / exp_count

            # upper tail for enrichment, lower tail for depletion
            if fold_change > 1:
                p_val = binom.sf(te_count - 1, feature_num, te_p)
            else:
                p_val = binom.cdf(te_count, feature_num, te_p)

            p_vals.append(p_val)

            cols = (rep, fam, te_len, te_count, exp_count, fold_change, p_val)
            lines.append('%-18s %-18s %9d %8d %8.1f %8.2f %10.2e' % cols)

        # correct for multiple hypotheses correction
        q_vals = fdr.ben_hoch(p_vals)
        for stat_line, q_val in zip(lines, q_vals):
            sys.stdout.write(stat_line + ' %10.2e' % q_val + '\n')

    finally:
        ############################################
        # clean
        ############################################
        for temp_fd, temp_file in temp_files:
            os.close(temp_fd)
            os.remove(temp_file)
Esempio n. 7
0
def main():
    """Compute per-TE differential expression stats/plots and write a q-value table.

    Command line: ``[options] <gtf> <diff>``.

    Steps, as visible here:
      1. Optionally filter the reference GTF by transcript length
         (gff.length_filter) when any of -s/-l/-u is given; -s sets both
         the lower and the upper spread to sqrt(spread_factor).
      2. Hash TEs to the transcripts they overlap (te.hash_repeats_genes).
      3. Hash transcripts to their fold-change statistic (cuffdiff.hash_diff).
      4. Compute statistics and plots (compute_stats), correct the p-values
         with fdr.ben_hoch and write ``<out_dir>/table.txt``.

    The output directory is deleted and recreated on every run; the
    temporary length-filtered GTF is removed at the end.
    """
    usage = 'usage: %prog [options] <gtf> <diff>'
    parser = OptionParser(usage)
    parser.add_option('-m',
                      dest='max_stat',
                      default=None,
                      type='float',
                      help='Maximum stat for plotting [Default: %default]')
    parser.add_option('-o',
                      dest='out_dir',
                      default='te_diff',
                      help='Output directory [Default: %default]')
    parser.add_option('-c',
                      dest='scale',
                      default=1,
                      type='float',
                      help='CDF plot scale [Default: %default]')
    # The MASK-based default is resolved after parsing so that an explicit -t
    # works even when the MASK environment variable is not set.
    parser.add_option(
        '-t',
        dest='te_gff',
        default=None,
        help='TE annotation GFF [Default: $MASK/hg19.fa.out.tp.gff]')

    parser.add_option(
        '-s',
        dest='spread_factor',
        default=None,
        type='float',
        help=
        'Allow multiplicative factor between the shortest and longest transcripts, used to filter [Default: %default]'
    )
    parser.add_option(
        '-l',
        dest='spread_lower',
        default=None,
        type='float',
        help=
        'Allow multiplicative factor between median and shortest transcripts [Default: %default]'
    )
    parser.add_option(
        '-u',
        dest='spread_upper',
        default=None,
        type='float',
        help=
        'Allow multiplicative factor between median and longest transcripts [Default: %default]'
    )

    (options, args) = parser.parse_args()

    # parser.error() exits, so no else branch is needed
    if len(args) != 2:
        parser.error('Must provide .gtf and .diff files')
    ref_gtf, diff_file = args

    if options.te_gff is None:
        if 'MASK' not in os.environ:
            parser.error(
                'Must provide -t or set the MASK environment variable')
        options.te_gff = '%s/hg19.fa.out.tp.gff' % os.environ['MASK']

    # clean plot directory
    if os.path.isdir(options.out_dir):
        shutil.rmtree(options.out_dir)
    os.mkdir(options.out_dir)

    # path of the temporary length-filtered GTF, or None when no filter is requested
    spread_gtf = None
    if options.spread_factor or options.spread_lower or options.spread_upper:
        # filter for similar length

        if options.spread_factor:
            options.spread_lower = math.sqrt(options.spread_factor)
            options.spread_upper = math.sqrt(options.spread_factor)

        spread_gtf = '%s/spread_factor.gtf' % options.out_dir
        # Pass the upper spread as the upper bound; the lower spread used to be
        # passed twice, which silently ignored -u.
        # NOTE(review): when only one of -l/-u is given the other bound is
        # None -- confirm gff.length_filter accepts that.
        gff.length_filter(ref_gtf, spread_gtf, options.spread_lower,
                          options.spread_upper)

        ref_gtf = spread_gtf

    ##################################################
    # hash TEs -> genes
    ##################################################
    te_genes = te.hash_repeats_genes(ref_gtf,
                                     options.te_gff,
                                     gene_key='transcript_id',
                                     add_star=True,
                                     stranded=True)

    ##################################################
    # hash genes -> RIP diff
    ##################################################
    gene_diff = cuffdiff.hash_diff(diff_file,
                                   stat='fold',
                                   max_stat=options.max_stat,
                                   sample_first='input')

    ##################################################
    # compute stats and make plots
    ##################################################
    table_lines, pvals = compute_stats(te_genes, gene_diff, ref_gtf,
                                       options.out_dir, options.scale)

    # perform multiple hypothesis correction
    qvals = fdr.ben_hoch(pvals)

    with open('%s/table.txt' % options.out_dir, 'w') as table_out:
        for table_line, qval in zip(table_lines, qvals):
            table_out.write('%s %10.2e\n' % (table_line, qval))

    if spread_gtf is not None:
        os.remove(spread_gtf)
Esempio n. 8
0
def main():
    """Regress transcript differential expression on TE family presence.

    Command line: ``[options] <gtf> <diff>``.

    Steps, as visible here:
      1. Hash transcripts to TEs (te.hash_genes_repeats) and to diff
         statistics per sample pair (hash_diff).
      2. For every sample pair, build a data frame with one 0/1 column per
         family in ``regression_tes`` and strand ('<family>_fwd',
         '<family>_rev'), fit an OLS model ``diff ~ <columns>`` and write
         its summary to ``<out_dir>/<sample1>-<sample2>.txt``.
      3. Collect one table line per family/strand/sample pair, correct the
         p-values with fdr.ben_hoch and write ``<out_dir>/table.txt``.

    The output directory is deleted and recreated on every run.
    """
    usage = 'usage: %prog [options] <gtf> <diff>'
    parser = OptionParser(usage)
    parser.add_option('-o', dest='out_dir', default='te_diff_regress', help='Output directory to print regression summaries [Default: %default]')
    # The MASK-based default is resolved after parsing so that an explicit -t
    # works even when the MASK environment variable is not set.
    parser.add_option('-t', dest='te_gff', default=None, help='TE annotation GFF [Default: $MASK/hg19.fa.out.tpf.gff]')
    (options, args) = parser.parse_args()

    # parser.error() exits, so no else branch is needed
    if len(args) != 2:
        parser.error('Must provide .gtf and .diff files')
    gtf_file, diff_file = args

    if options.te_gff is None:
        if 'MASK' not in os.environ:
            parser.error('Must provide -t or set the MASK environment variable')
        options.te_gff = '%s/hg19.fa.out.tpf.gff' % os.environ['MASK']

    # hash genes -> TEs
    gene_tes = te.hash_genes_repeats(gtf_file, options.te_gff, gene_key='transcript_id', add_star=True, stranded=True)

    # hash diffs stats
    gene_diffs = hash_diff(diff_file)

    # clean output directory
    if os.path.isdir(options.out_dir):
        shutil.rmtree(options.out_dir)
    os.mkdir(options.out_dir)

    table_lines = []
    pvals = []

    for spair in gene_diffs:
        sample1, sample2 = spair

        # construct data frame from the transcripts present in both hashes
        gene_list = list(set(gene_tes.keys()) & set(gene_diffs[spair].keys()))
        df = pd.DataFrame({'diff': [gene_diffs[spair][gene_id] for gene_id in gene_list]})

        # (family, strand, column name) in fwd/rev order per family
        covariates = []
        for fam in regression_tes:
            # column names must be usable inside the regression formula
            fam_key = fam.replace('/', '_').replace('-', '')
            for strand, suffix in (('+', 'fwd'), ('-', 'rev')):
                te_key = '%s_%s' % (fam_key, suffix)
                df[te_key] = [1.0*(('*', fam, strand) in gene_tes.get(gene_id, [])) for gene_id in gene_list]
                covariates.append((fam, strand, te_key))

        covariate_str = ' + '.join([te_key for _, _, te_key in covariates])

        # regress
        mod = smf.ols(formula='diff ~ %s' % covariate_str, data=df).fit()

        # output model
        with open('%s/%s-%s.txt' % (options.out_dir, sample1, sample2), 'w') as mod_out:
            mod_out.write('%s\n' % mod.summary())

        # save table lines
        for fam, strand, te_key in covariates:
            # NOTE(review): dividing by 0.5 doubles the p-value reported by the
            # fit, so it can exceed 1 -- kept as is, confirm this is intended.
            cols = (fam, strand, sample1, sample2, sum(df[te_key]), mod.params[te_key], mod.tvalues[te_key], mod.pvalues[te_key]/0.5)
            table_lines.append('%-17s  %1s  %-10s  %-10s  %6d  %8.3f  %8.3f  %10.2e' % cols)
            pvals.append(cols[-1])

    # perform multiple hypothesis correction
    qvals = fdr.ben_hoch(pvals)

    with open('%s/table.txt' % options.out_dir, 'w') as table_out:
        for table_line, qval in zip(table_lines, qvals):
            table_out.write('%s %10.2e\n' % (table_line, qval))