def main():
    """Associate genes' TE overlaps with differential-expression stats.

    Reads a GTF and a diff file, maps transcripts to overlapping TEs
    (transcripts with no TE get a placeholder entry), computes per-TE
    stats, and writes a Benjamini-Hochberg-corrected table to the
    output directory.
    """
    usage = 'usage: %prog [options] <gtf> <diff>'
    parser = OptionParser(usage)
    parser.add_option('-o', dest='out_dir', default='te_diff', help='Output directory [Default: %default]')
    parser.add_option('-t', dest='te_gff', default='%s/hg19.fa.out.tpf.gff'%os.environ['MASK'])
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide .gtf and .diff files')
    else:
        gtf_file, diff_file = args

    # map each transcript to the set of TEs it overlaps
    gene_tes = te.hash_genes_repeats(gtf_file, options.te_gff, gene_key='transcript_id', add_star=True, stranded=True)

    # transcripts without any TE overlap get a placeholder "family"
    # so they still participate in the downstream stats
    for gtf_line in open(gtf_file):
        tid = gff.gtf_kv(gtf_line.split('\t')[8])['transcript_id']
        if tid not in gene_tes:
            gene_tes[tid] = set([('-','-','*')])

    # collect differential stats per gene and per TE
    gene_diffs, te_diffs = get_diff_stats(diff_file, gene_tes)

    # start from an empty output directory
    if os.path.isdir(options.out_dir):
        shutil.rmtree(options.out_dir)
    os.mkdir(options.out_dir)

    # per-TE statistics and plot generation
    table_lines, pvals = compute_stats(te_diffs, gene_diffs, options.out_dir)

    # correct for multiple hypotheses
    qvals = fdr.ben_hoch(pvals)

    table_out = open('%s/table.txt' % options.out_dir, 'w')
    for tline, qval in zip(table_lines, qvals):
        print >> table_out, '%s %10.2e' % (tline, qval)
    table_out.close()
def main():
    """Compare TE overlap with RIP differential stats and plot per-TE CDFs.

    Optionally pre-filters the GTF to transcripts of similar length
    (spread options), hashes TEs -> genes and genes -> diff stats, then
    computes per-TE statistics with BH multiple-hypothesis correction.
    """
    usage = 'usage: %prog [options] <gtf> <diff>'
    parser = OptionParser(usage)
    parser.add_option('-m', dest='max_stat', default=None, type='float', help='Maximum stat for plotting [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='te_diff', help='Output directory [Default: %default]')
    parser.add_option('-c', dest='scale', default=1, type='float', help='CDF plot scale [Default: %default]')
    parser.add_option('-t', dest='te_gff', default='%s/hg19.fa.out.tp.gff'%os.environ['MASK'])
    parser.add_option('-s', dest='spread_factor', default=None, type='float', help='Allow multiplicative factor between the shortest and longest transcripts, used to filter [Default: %default]')
    parser.add_option('-l', dest='spread_lower', default=None, type='float', help='Allow multiplicative factor between median and shortest transcripts [Default: %default]')
    parser.add_option('-u', dest='spread_upper', default=None, type='float', help='Allow multiplicative factor between median and longest transcripts [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide .gtf and .diff files')
    else:
        ref_gtf = args[0]
        diff_file = args[1]

    # clean plot directory
    if os.path.isdir(options.out_dir):
        shutil.rmtree(options.out_dir)
    os.mkdir(options.out_dir)

    if options.spread_factor or options.spread_lower or options.spread_upper:
        # filter transcripts for similar length
        if options.spread_factor:
            # split the overall factor symmetrically around the median
            options.spread_lower = math.sqrt(options.spread_factor)
            options.spread_upper = math.sqrt(options.spread_factor)

        spread_gtf = '%s/spread_factor.gtf' % options.out_dir
        # BUG FIX: the upper bound was previously passed as
        # options.spread_lower twice, so -u/--spread_upper was ignored.
        gff.length_filter(ref_gtf, spread_gtf, options.spread_lower, options.spread_upper)

        ref_gtf = spread_gtf

    ##################################################
    # hash TEs -> genes
    ##################################################
    te_genes = te.hash_repeats_genes(ref_gtf, options.te_gff, gene_key='transcript_id', add_star=True, stranded=True)

    ##################################################
    # hash genes -> RIP diff
    ##################################################
    gene_diff = cuffdiff.hash_diff(diff_file, stat='fold', max_stat=options.max_stat, sample_first='input')

    ##################################################
    # compute stats and make plots
    ##################################################
    table_lines, pvals = compute_stats(te_genes, gene_diff, ref_gtf, options.out_dir, options.scale)

    # perform multiple hypothesis correction
    qvals = fdr.ben_hoch(pvals)

    table_out = open('%s/table.txt' % options.out_dir, 'w')
    for i in range(len(table_lines)):
        print >> table_out, '%s %10.2e' % (table_lines[i],qvals[i])
    table_out.close()

    # remove the temporary length-filtered GTF
    if options.spread_factor or options.spread_lower or options.spread_upper:
        os.remove(spread_gtf)
def main():
    """Test TE repeats/families for enrichment among a feature set.

    For each repeat/family (optionally split by strand), count features
    overlapping it, compare against a binomial expectation derived from
    the TE's share of the searchable genome, and print one row per TE
    with a BH-corrected q-value appended.
    """
    usage = 'usage: %prog [options] <feature gff>'
    parser = OptionParser(usage)
    parser.add_option('-g', dest='filter_gff', help='Filter the TEs by overlap with genes in the given gff file [Default: %default]')
    parser.add_option('-r', dest='repeats_gff', default='%s/research/common/data/genomes/hg19/annotation/repeatmasker/hg19.fa.out.tp.gff' % os.environ['HOME'])
    parser.add_option('-s', dest='strand_split', default=False, action='store_true', help='Split statistics by strand [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide a gff file for the feature of interest.')
    else:
        feature_gff = args[0]

    ############################################
    # GFF filter
    ############################################
    # filter TEs and features by gff file
    if options.filter_gff:
        # merge the filter regions into disjoint BED intervals
        filter_merged_bed_fd, filter_merged_bed_file = tempfile.mkstemp()
        subprocess.call('sortBed -i %s | mergeBed -i - > %s' % (options.filter_gff, filter_merged_bed_file), shell=True)

        # filter TE GFF: keep only TE pieces within the filter regions
        te_gff_fd, te_gff_file = tempfile.mkstemp()
        subprocess.call('intersectBed -a %s -b %s > %s' % (options.repeats_gff, filter_merged_bed_file, te_gff_file), shell=True)
        options.repeats_gff = te_gff_file

        # filter feature GFF: keep features with >= 50% overlap (-u -f 0.5)
        feature_gff_gff_fd, feature_gff_gff_file = tempfile.mkstemp()
        subprocess.call('intersectBed -u -f 0.5 -a %s -b %s > %s' % (feature_gff, filter_merged_bed_file, feature_gff_gff_file), shell=True)
        feature_gff = feature_gff_gff_file

    ############################################
    # lengths
    ############################################
    # compute feature length
    feature_len, feature_num = feature_stats(feature_gff)

    if feature_num == 0:
        print >> sys.stderr, 'Zero features'
        exit()

    # compute size of search space
    if options.filter_gff:
        genome_length = count_bed(filter_merged_bed_file, feature_len)
    else:
        genome_length = count_hg19()

    # hash counted repeat genomic bp
    te_lengths = te_target_size(options.repeats_gff, feature_len)

    ############################################
    # hash TE/feature overlaps
    ############################################
    # initialize sets; with strand_split, every key carries a +/- suffix
    # and '*' aggregates across repeats ('*', fam) and families ('*','*')
    te_features = {}
    for rep, fam in te_lengths:
        if options.strand_split:
            te_features[(rep+'+',fam)] = set()
            te_features[('*+',fam)] = set()
            te_features[('*+','*')] = set()
            te_features[(rep+'-',fam)] = set()
            te_features[('*-',fam)] = set()
            te_features[('*-','*')] = set()
        else:
            te_features[(rep,fam)] = set()
            te_features[('*',fam)] = set()
            te_features[('*','*')] = set()

    # stream TE/feature overlaps; -wo emits the 9 TE GFF columns followed
    # by the feature GFF columns, so a[9]/a[12]/a[13] are the feature's
    # chrom/start/end and a[6]/a[15] the TE/feature strands
    p = subprocess.Popen('intersectBed -wo -a %s -b %s' % (options.repeats_gff,feature_gff), shell=True, stdout=subprocess.PIPE)
    for line in p.stdout:
        a = line.split('\t')
        kv = gff.gtf_kv(a[8])
        rep = kv['repeat']
        fam = kv['family']

        fchrom = a[9]
        fstart = int(a[12])
        fend = int(a[13])

        rep_star = '*'
        if options.strand_split:
            tstrand = a[6]
            fstrand = a[15]
            # '+' marks sense (same-strand) overlap, '-' antisense
            if tstrand == fstrand:
                rep += '+'
                rep_star += '+'
            else:
                rep += '-'
                rep_star += '-'

        # sets deduplicate a feature hit by multiple copies of a TE
        te_features[(rep,fam)].add((fchrom,fstart,fend))
        te_features[(rep_star,fam)].add((fchrom,fstart,fend))
        te_features[(rep_star,'*')].add((fchrom,fstart,fend))

    p.communicate()

    ############################################
    # compute stats and print
    ############################################
    lines = []
    p_vals = []
    for te in te_features:
        rep, fam = te
        if options.strand_split:
            # strip the strand suffix to look up the TE's length;
            # halve the per-strand hit probability
            te_len = te_lengths[(rep[:-1],fam)]
            te_p = float(te_len) / (2*genome_length)
        else:
            te_len = te_lengths[(rep,fam)]
            te_p = float(te_len) / genome_length

        te_count = len(te_features.get(te,[]))
        exp_count = te_p * feature_num
        fold_change = te_count / exp_count

        # one-sided binomial p-value in the direction of the change
        if fold_change > 1:
            p_val = binom.sf(te_count-1, feature_num, te_p)
        else:
            p_val = binom.cdf(te_count, feature_num, te_p)
        p_vals.append(p_val)

        cols = (rep, fam, te_len, te_count, exp_count, fold_change, p_val)
        lines.append('%-18s %-18s %9d %8d %8.1f %8.2f %10.2e' % cols)

    # correct for multiple hypotheses
    q_vals = fdr.ben_hoch(p_vals)
    for i in range(len(lines)):
        qline = lines[i] + ' %10.2e' % q_vals[i]
        print qline

    ############################################
    # clean
    ############################################
    if options.filter_gff:
        os.close(filter_merged_bed_fd)
        os.remove(filter_merged_bed_file)
        os.close(te_gff_fd)
        os.remove(te_gff_file)
        os.close(feature_gff_gff_fd)
        os.remove(feature_gff_gff_file)
def main():
    """Test TE enrichment among features against a shuffled null.

    Estimates a null distribution of TE-overlap bp by shuffling the
    feature set across the genome `null_iterations` times, then scores
    the observed overlap per TE with a normal approximation to the null
    and prints BH-corrected rows.
    """
    usage = 'usage: %prog [options] <feature gff/bed>'
    parser = OptionParser(usage)
    parser.add_option('-g', dest='gff_file', help='Filter the TEs by overlap with genes in the given gff file [Default: %default]')
    parser.add_option('-r', dest='repeats_gff', default='%s/research/common/data/genomes/hg19/annotation/repeatmasker/hg19.fa.out.tp.gff' % home_dir)
    parser.add_option('-n', dest='null_iterations', type=int, default=50, help='Number of shuffles to perform to estimate null distribution [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide a gff file for the feature of interest.')
    else:
        feature_gff = args[0]

    ############################################
    # GFF filter
    ############################################
    # filter TEs and features by gff file
    if options.gff_file:
        # filter TE GFF to pieces overlapping the filter regions
        te_gff_fd, te_gff_file = tempfile.mkstemp()
        subprocess.call('intersectBed -a %s -b %s > %s' % (options.repeats_gff, options.gff_file, te_gff_file), shell=True)
        options.repeats_gff = te_gff_file

        # filter feature GFF: same-strand (-s), >=50% overlap (-u -f 0.5)
        feature_gff_gff_fd, feature_gff_gff_file = tempfile.mkstemp()
        subprocess.call('intersectBed -s -u -f 0.5 -a %s -b %s > %s' % (feature_gff, options.gff_file, feature_gff_gff_file), shell=True)
        feature_gff = feature_gff_gff_file

    ############################################
    # lengths
    ############################################
    # compute size of search space
    if options.gff_file:
        genome_length = count_gff(options.gff_file)
    else:
        genome_length = count_hg19()

    # compute feature length
    feature_len, feature_num = feature_stats(feature_gff)

    if feature_num == 0:
        print >> sys.stderr, 'Zero features'
        exit()

    # hash counted repeat genomic bp per TE
    te_in = open(options.repeats_gff)
    genome_te_bp = hash_te(te_in)
    te_in.close()

    ############################################
    # convert feature gff to bed
    ############################################
    # shuffleBed below needs BED input; convert by file suffix
    if feature_gff[-3:] == 'gtf':
        feature_bed_fd, feature_bed_file = tempfile.mkstemp()
        subprocess.call('gtf2bed.py %s > %s' % (feature_gff,feature_bed_file), shell=True)
    elif feature_gff[-3:] == 'gff':
        feature_bed_fd, feature_bed_file = tempfile.mkstemp()
        subprocess.call('gff2bed.py %s > %s' % (feature_gff,feature_bed_file), shell=True)
    elif feature_gff[-3:] == 'bed':
        feature_bed_file = feature_gff
    else:
        parser.error('Cannot recognize gff format suffix')

    ############################################
    # null distribution
    ############################################
    shuffle_bed_fd, shuffle_bed_file = tempfile.mkstemp()
    te_null_bp = {}
    for ni in range(options.null_iterations):
        print >> sys.stderr, ni

        # shuffle feature bed across the genome, avoiding assembly gaps
        subprocess.call('shuffleBed -i %s -g %s/research/common/data/genomes/hg19/assembly/human.hg19.genome -excl %s/research/common/data/genomes/hg19/assembly/hg19_gaps.bed > %s' % (feature_bed_file, home_dir, home_dir, shuffle_bed_file), shell=True)

        # intersect w/ TEs and hash overlap bp per TE for this shuffle
        te_tmp_bp = intersect_hash(options.repeats_gff, shuffle_bed_file)
        for te in genome_te_bp:
            te_null_bp.setdefault(te,[]).append(te_tmp_bp.get(te,0))

    ############################################
    # actual
    ############################################
    # NOTE(review): the observed overlap is computed on feature_gff rather
    # than feature_bed_file, while the null uses the shuffled BED — confirm
    # intersect_hash treats both formats identically
    te_bp = intersect_hash(options.repeats_gff, feature_gff)

    ############################################
    # compute stats and print
    ############################################
    lines = []
    p_vals = []
    for te in genome_te_bp:
        feature_freq = float(te_bp.get(te,0))/feature_len
        genome_freq = float(genome_te_bp[te])/genome_length
        fold_change = feature_freq / genome_freq

        #print te, stats.mean(te_null_bp[te]), stats.sd(te_null_bp[te])
        null_u, null_sd = stats.mean_sd(te_null_bp[te])
        # guard against a degenerate (zero-variance) null
        if null_sd == 0:
            null_sd = 1.0

        # one-sided normal p-value against the shuffled null
        if fold_change > 1:
            p = norm.sf(te_bp[te]-1, loc=null_u, scale=null_sd)
        else:
            p = norm.cdf(te_bp.get(te,0), loc=null_u, scale=null_sd)
        p_vals.append(p)

        cols = (te[0], te[1], te_bp.get(te,0), feature_freq, genome_freq, fold_change, p)
        lines.append('%-18s %-18s %8d %11.2e %11.2e %9.2f %10.2e' % cols)

    # correct for multiple hypotheses
    q_vals = fdr.ben_hoch(p_vals)
    for i in range(len(lines)):
        qline = lines[i] + ' %10.2e' % q_vals[i]
        print qline

    ############################################
    # clean
    ############################################
    os.close(shuffle_bed_fd)
    os.remove(shuffle_bed_file)
    if feature_gff[-3:] != 'bed':
        os.close(feature_bed_fd)
        os.remove(feature_bed_file)
    if options.gff_file:
        os.close(te_gff_fd)
        os.remove(te_gff_file)
        os.close(feature_gff_gff_fd)
        os.remove(feature_gff_gff_file)
def main():
    """Regress expression differences onto TE family indicator covariates.

    Optionally length-filters the GTF (spread options), hashes genes to
    TEs and to cuffdiff stats, fits an OLS model per sample pair with
    forward/reverse TE-family indicators, and writes per-pair summaries
    plus a BH-corrected coefficient table.
    """
    usage = 'usage: %prog [options] <gtf> <diff>'
    parser = OptionParser(usage)
    parser.add_option('-o', dest='out_dir', default='te_diff_regress', help='Output directory to print regression summaries [Default: %default]')
    parser.add_option('-t', dest='te_gff', default='%s/hg19.fa.out.tp.gff'%os.environ['MASK'])
    parser.add_option('-m', dest='max_stat', default=None, type='float', help='Maximum stat for plotting [Default: %default]')
    parser.add_option('-s', dest='spread_factor', default=None, type='float', help='Allow multiplicative factor between the shortest and longest transcripts, used to filter [Default: %default]')
    parser.add_option('-l', dest='spread_lower', default=None, type='float', help='Allow multiplicative factor between median and shortest transcripts [Default: %default]')
    parser.add_option('-u', dest='spread_upper', default=None, type='float', help='Allow multiplicative factor between median and longest transcripts [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide .gtf and .diff files')
    else:
        ref_gtf = args[0]
        diff_file = args[1]

    # clean output directory
    if os.path.isdir(options.out_dir):
        shutil.rmtree(options.out_dir)
    os.mkdir(options.out_dir)

    if options.spread_factor or options.spread_lower or options.spread_upper:
        # filter transcripts for similar length
        if options.spread_factor:
            # split the overall factor symmetrically around the median
            options.spread_lower = math.sqrt(options.spread_factor)
            options.spread_upper = math.sqrt(options.spread_factor)

        spread_gtf = '%s/spread_factor.gtf' % options.out_dir
        # BUG FIX: the upper bound was previously passed as
        # options.spread_lower twice, so -u/--spread_upper was ignored.
        gff.length_filter(ref_gtf, spread_gtf, options.spread_lower, options.spread_upper, verbose=True)

        ref_gtf = spread_gtf

    # hash genes -> TEs
    gene_tes = te.hash_genes_repeats(ref_gtf, options.te_gff, gene_key='transcript_id', add_star=True, stranded=True)

    # hash diffs stats
    gene_diffs = cuffdiff.hash_diff(diff_file, stat='fold', max_stat=options.max_stat, sample_first='input')

    table_lines = []
    pvals = []

    for spair in gene_diffs:
        sample1, sample2 = spair

        # construct data frame over genes present in both hashes
        gene_list = list(set(gene_tes.keys()) & set(gene_diffs[spair].keys()))
        df = pd.DataFrame({'diff': [gene_diffs[spair][gene_id] for gene_id in gene_list]})

        # one fwd and one rev indicator column per TE family
        covariate_str = ''
        for fam in regression_tes:
            te_key = '%s_fwd' % fam.replace('/','_').replace('-','')
            df[te_key] = [1.0*(('*',fam,'+') in gene_tes.get(gene_id,[])) for gene_id in gene_list]
            if len(covariate_str) == 0:
                covariate_str = te_key
            else:
                covariate_str += ' + %s' % te_key

            te_key = '%s_rev' % fam.replace('/','_').replace('-','')
            df[te_key] = [1.0*(('*',fam,'-') in gene_tes.get(gene_id,[])) for gene_id in gene_list]
            covariate_str += ' + %s' % te_key

        # regress
        mod = smf.ols(formula='diff ~ %s' % covariate_str, data=df).fit()

        # output full model summary for this sample pair
        mod_out = open('%s/%s-%s.txt' % (options.out_dir, sample1, sample2), 'w')
        print >> mod_out, mod.summary()
        mod_out.close()

        # save one table row per family/strand coefficient
        for fam in regression_tes:
            te_key = '%s_fwd' % fam.replace('/','_').replace('-','')
            cols = (fam, '+', sample1, sample2, sum(df[te_key]), mod.params[te_key], mod.tvalues[te_key], mod.pvalues[te_key]/0.5)
            table_lines.append('%-17s %1s %-10s %-10s %6d %8.3f %8.3f %10.2e' % cols)
            pvals.append(cols[-1])

            te_key = '%s_rev' % fam.replace('/','_').replace('-','')
            cols = (fam, '-', sample1, sample2, sum(df[te_key]), mod.params[te_key], mod.tvalues[te_key], mod.pvalues[te_key]/0.5)
            table_lines.append('%-17s %1s %-10s %-10s %6d %8.3f %8.3f %10.2e' % cols)
            pvals.append(cols[-1])

    # perform multiple hypothesis correction
    qvals = fdr.ben_hoch(pvals)

    table_out = open('%s/table.txt' % options.out_dir, 'w')
    for i in range(len(table_lines)):
        print >> table_out, '%s %10.2e' % (table_lines[i],qvals[i])
    table_out.close()

    # remove the temporary length-filtered GTF
    if options.spread_factor or options.spread_lower or options.spread_upper:
        os.remove(spread_gtf)
def main():
    """Test TE repeats/families for enrichment among a feature set.

    For each repeat/family (optionally split by strand), count features
    overlapping it, compare against a binomial expectation derived from
    the TE's share of the searchable genome, and print one row per TE
    with a BH-corrected q-value appended.
    """
    usage = 'usage: %prog [options] <feature gff>'
    parser = OptionParser(usage)
    parser.add_option(
        '-g',
        dest='filter_gff',
        help=
        'Filter the TEs by overlap with genes in the given gff file [Default: %default]'
    )
    parser.add_option(
        '-r',
        dest='repeats_gff',
        default=
        '%s/research/common/data/genomes/hg19/annotation/repeatmasker/hg19.fa.out.tp.gff'
        % os.environ['HOME'])
    parser.add_option('-s',
                      dest='strand_split',
                      default=False,
                      action='store_true',
                      help='Split statistics by strand [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide a gff file for the feature of interest.')
    else:
        feature_gff = args[0]

    ############################################
    # GFF filter
    ############################################
    # filter TEs and features by gff file
    if options.filter_gff:
        # merge the filter regions into disjoint BED intervals
        filter_merged_bed_fd, filter_merged_bed_file = tempfile.mkstemp()
        subprocess.call('sortBed -i %s | mergeBed -i - > %s' %
                        (options.filter_gff, filter_merged_bed_file),
                        shell=True)

        # filter TE GFF: keep only TE pieces within the filter regions
        te_gff_fd, te_gff_file = tempfile.mkstemp()
        subprocess.call(
            'intersectBed -a %s -b %s > %s' %
            (options.repeats_gff, filter_merged_bed_file, te_gff_file),
            shell=True)
        options.repeats_gff = te_gff_file

        # filter feature GFF: keep features with >= 50% overlap (-u -f 0.5)
        feature_gff_gff_fd, feature_gff_gff_file = tempfile.mkstemp()
        subprocess.call(
            'intersectBed -u -f 0.5 -a %s -b %s > %s' %
            (feature_gff, filter_merged_bed_file, feature_gff_gff_file),
            shell=True)
        feature_gff = feature_gff_gff_file

    ############################################
    # lengths
    ############################################
    # compute feature length
    feature_len, feature_num = feature_stats(feature_gff)

    if feature_num == 0:
        print >> sys.stderr, 'Zero features'
        exit()

    # compute size of search space
    if options.filter_gff:
        genome_length = count_bed(filter_merged_bed_file, feature_len)
    else:
        genome_length = count_hg19()

    # hash counted repeat genomic bp
    te_lengths = te_target_size(options.repeats_gff, feature_len)

    ############################################
    # hash TE/feature overlaps
    ############################################
    # initialize sets; with strand_split, every key carries a +/- suffix
    # and '*' aggregates across repeats ('*', fam) and families ('*','*')
    te_features = {}
    for rep, fam in te_lengths:
        if options.strand_split:
            te_features[(rep + '+', fam)] = set()
            te_features[('*+', fam)] = set()
            te_features[('*+', '*')] = set()
            te_features[(rep + '-', fam)] = set()
            te_features[('*-', fam)] = set()
            te_features[('*-', '*')] = set()
        else:
            te_features[(rep, fam)] = set()
            te_features[('*', fam)] = set()
            te_features[('*', '*')] = set()

    # stream TE/feature overlaps; -wo emits the 9 TE GFF columns followed
    # by the feature GFF columns, so a[9]/a[12]/a[13] are the feature's
    # chrom/start/end and a[6]/a[15] the TE/feature strands
    p = subprocess.Popen('intersectBed -wo -a %s -b %s' %
                         (options.repeats_gff, feature_gff),
                         shell=True,
                         stdout=subprocess.PIPE)
    for line in p.stdout:
        a = line.split('\t')
        kv = gff.gtf_kv(a[8])
        rep = kv['repeat']
        fam = kv['family']

        fchrom = a[9]
        fstart = int(a[12])
        fend = int(a[13])

        rep_star = '*'
        if options.strand_split:
            tstrand = a[6]
            fstrand = a[15]
            # '+' marks sense (same-strand) overlap, '-' antisense
            if tstrand == fstrand:
                rep += '+'
                rep_star += '+'
            else:
                rep += '-'
                rep_star += '-'

        # sets deduplicate a feature hit by multiple copies of a TE
        te_features[(rep, fam)].add((fchrom, fstart, fend))
        te_features[(rep_star, fam)].add((fchrom, fstart, fend))
        te_features[(rep_star, '*')].add((fchrom, fstart, fend))

    p.communicate()

    ############################################
    # compute stats and print
    ############################################
    lines = []
    p_vals = []
    for te in te_features:
        rep, fam = te
        if options.strand_split:
            # strip the strand suffix to look up the TE's length;
            # halve the per-strand hit probability
            te_len = te_lengths[(rep[:-1], fam)]
            te_p = float(te_len) / (2 * genome_length)
        else:
            te_len = te_lengths[(rep, fam)]
            te_p = float(te_len) / genome_length

        te_count = len(te_features.get(te, []))
        exp_count = te_p * feature_num
        fold_change = te_count / exp_count

        # one-sided binomial p-value in the direction of the change
        if fold_change > 1:
            p_val = binom.sf(te_count - 1, feature_num, te_p)
        else:
            p_val = binom.cdf(te_count, feature_num, te_p)
        p_vals.append(p_val)

        cols = (rep, fam, te_len, te_count, exp_count, fold_change, p_val)
        lines.append('%-18s %-18s %9d %8d %8.1f %8.2f %10.2e' % cols)

    # correct for multiple hypotheses
    q_vals = fdr.ben_hoch(p_vals)
    for i in range(len(lines)):
        qline = lines[i] + ' %10.2e' % q_vals[i]
        print qline

    ############################################
    # clean
    ############################################
    if options.filter_gff:
        os.close(filter_merged_bed_fd)
        os.remove(filter_merged_bed_file)
        os.close(te_gff_fd)
        os.remove(te_gff_file)
        os.close(feature_gff_gff_fd)
        os.remove(feature_gff_gff_file)
def main():
    """Compare TE overlap with RIP differential stats and plot per-TE CDFs.

    Optionally pre-filters the GTF to transcripts of similar length
    (spread options), hashes TEs -> genes and genes -> diff stats, then
    computes per-TE statistics with BH multiple-hypothesis correction.
    """
    usage = 'usage: %prog [options] <gtf> <diff>'
    parser = OptionParser(usage)
    parser.add_option('-m',
                      dest='max_stat',
                      default=None,
                      type='float',
                      help='Maximum stat for plotting [Default: %default]')
    parser.add_option('-o',
                      dest='out_dir',
                      default='te_diff',
                      help='Output directory [Default: %default]')
    parser.add_option('-c',
                      dest='scale',
                      default=1,
                      type='float',
                      help='CDF plot scale [Default: %default]')
    parser.add_option('-t',
                      dest='te_gff',
                      default='%s/hg19.fa.out.tp.gff' % os.environ['MASK'])
    parser.add_option(
        '-s',
        dest='spread_factor',
        default=None,
        type='float',
        help=
        'Allow multiplicative factor between the shortest and longest transcripts, used to filter [Default: %default]'
    )
    parser.add_option(
        '-l',
        dest='spread_lower',
        default=None,
        type='float',
        help=
        'Allow multiplicative factor between median and shortest transcripts [Default: %default]'
    )
    parser.add_option(
        '-u',
        dest='spread_upper',
        default=None,
        type='float',
        help=
        'Allow multiplicative factor between median and longest transcripts [Default: %default]'
    )
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide .gtf and .diff files')
    else:
        ref_gtf = args[0]
        diff_file = args[1]

    # clean plot directory
    if os.path.isdir(options.out_dir):
        shutil.rmtree(options.out_dir)
    os.mkdir(options.out_dir)

    if options.spread_factor or options.spread_lower or options.spread_upper:
        # filter transcripts for similar length
        if options.spread_factor:
            # split the overall factor symmetrically around the median
            options.spread_lower = math.sqrt(options.spread_factor)
            options.spread_upper = math.sqrt(options.spread_factor)

        spread_gtf = '%s/spread_factor.gtf' % options.out_dir
        # BUG FIX: the upper bound was previously passed as
        # options.spread_lower twice, so -u/--spread_upper was ignored.
        gff.length_filter(ref_gtf, spread_gtf, options.spread_lower,
                          options.spread_upper)

        ref_gtf = spread_gtf

    ##################################################
    # hash TEs -> genes
    ##################################################
    te_genes = te.hash_repeats_genes(ref_gtf,
                                     options.te_gff,
                                     gene_key='transcript_id',
                                     add_star=True,
                                     stranded=True)

    ##################################################
    # hash genes -> RIP diff
    ##################################################
    gene_diff = cuffdiff.hash_diff(diff_file,
                                   stat='fold',
                                   max_stat=options.max_stat,
                                   sample_first='input')

    ##################################################
    # compute stats and make plots
    ##################################################
    table_lines, pvals = compute_stats(te_genes, gene_diff, ref_gtf,
                                       options.out_dir, options.scale)

    # perform multiple hypothesis correction
    qvals = fdr.ben_hoch(pvals)

    table_out = open('%s/table.txt' % options.out_dir, 'w')
    for i in range(len(table_lines)):
        print >> table_out, '%s %10.2e' % (table_lines[i], qvals[i])
    table_out.close()

    # remove the temporary length-filtered GTF
    if options.spread_factor or options.spread_lower or options.spread_upper:
        os.remove(spread_gtf)
def main():
    """Regress expression differences onto TE-family membership indicators.

    For every sample pair in the diff file, builds a design matrix of
    forward/reverse TE-family indicator columns over the genes shared by
    both hashes, fits an OLS model, writes the per-pair model summary,
    and emits a BH-corrected coefficient table.
    """
    usage = 'usage: %prog [options] <gtf> <diff>'
    parser = OptionParser(usage)
    parser.add_option('-o', dest='out_dir', default='te_diff_regress', help='Output directory to print regression summaries [Default: %default]')
    parser.add_option('-t', dest='te_gff', default='%s/hg19.fa.out.tpf.gff'%os.environ['MASK'])
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide .gtf and .diff files')
    else:
        gtf_file, diff_file = args

    # map transcripts to their overlapping TEs
    gene_tes = te.hash_genes_repeats(gtf_file, options.te_gff, gene_key='transcript_id', add_star=True, stranded=True)

    # map sample pair -> gene -> diff stat
    gene_diffs = hash_diff(diff_file)

    # start from a fresh output directory
    if os.path.isdir(options.out_dir):
        shutil.rmtree(options.out_dir)
    os.mkdir(options.out_dir)

    table_lines = []
    pvals = []

    for spair in gene_diffs:
        sample1, sample2 = spair

        # genes present in both hashes define the regression rows
        gene_list = list(set(gene_tes.keys()) & set(gene_diffs[spair].keys()))
        df = pd.DataFrame({'diff': [gene_diffs[spair][gene_id] for gene_id in gene_list]})

        # one forward and one reverse indicator column per TE family
        covariates = []
        for fam in regression_tes:
            fam_base = fam.replace('/','_').replace('-','')
            for strand, suffix in (('+','fwd'), ('-','rev')):
                te_key = '%s_%s' % (fam_base, suffix)
                df[te_key] = [1.0*(('*',fam,strand) in gene_tes.get(gene_id,[])) for gene_id in gene_list]
                covariates.append(te_key)

        # fit ordinary least squares on the family indicators
        mod = smf.ols(formula='diff ~ %s' % ' + '.join(covariates), data=df).fit()

        # write the full model summary for this sample pair
        mod_out = open('%s/%s-%s.txt' % (options.out_dir, sample1, sample2), 'w')
        print >> mod_out, mod.summary()
        mod_out.close()

        # collect one table row per family/strand coefficient
        for fam in regression_tes:
            fam_base = fam.replace('/','_').replace('-','')
            for strand, suffix in (('+','fwd'), ('-','rev')):
                te_key = '%s_%s' % (fam_base, suffix)
                cols = (fam, strand, sample1, sample2, sum(df[te_key]), mod.params[te_key], mod.tvalues[te_key], mod.pvalues[te_key]/0.5)
                table_lines.append('%-17s %1s %-10s %-10s %6d %8.3f %8.3f %10.2e' % cols)
                pvals.append(cols[-1])

    # correct for multiple hypotheses
    qvals = fdr.ben_hoch(pvals)

    table_out = open('%s/table.txt' % options.out_dir, 'w')
    for tline, qval in zip(table_lines, qvals):
        print >> table_out, '%s %10.2e' % (tline, qval)
    table_out.close()