def main():
    """Test TE enrichment in a feature set against a shuffled null.

    Command line: ``%prog [options] <feature gff/bed>``

    Options:
        -g  GFF file. When given, the repeats annotation and the features
            are both restricted to it with intersectBed, and the search
            space size is taken from count_gff() instead of count_hg19().
        -r  Repeats GFF (defaults to the hg19 RepeatMasker file under
            home_dir).
        -n  Number of shuffleBed iterations used to build the null
            distribution (default 50).

    For every key returned by hash_te() one row is printed to stdout: the
    first two key fields, the feature overlap value, feature frequency,
    genome frequency, fold change, a normal-approximation p-value and the
    fdr.ben_hoch() corrected value.

    All temporary files are closed and removed on every exit path,
    including the early exits for zero features and for an unrecognized
    file suffix.
    """
    usage = 'usage: %prog [options] <feature gff/bed>'
    parser = OptionParser(usage)
    parser.add_option('-g', dest='gff_file',
                      help='Filter the TEs by overlap with genes in the given gff file [Default: %default]')
    parser.add_option('-r', dest='repeats_gff',
                      default='%s/research/common/data/genomes/hg19/annotation/repeatmasker/hg19.fa.out.tp.gff' % home_dir)
    parser.add_option('-n', dest='null_iterations', type=int, default=50,
                      help='Number of shuffles to perform to estimate null distribution [Default: %default]')
    (options, args) = parser.parse_args()

    # parser.error() prints the message and exits, so no else branch is needed
    if len(args) != 1:
        parser.error('Must provide a gff file for the feature of interest.')
    feature_gff = args[0]

    # (fd, path) of every temp file created below; released in the finally
    temp_files = []

    try:
        ############################################
        # GFF filter
        ############################################
        # filter TEs and features by gff file
        if options.gff_file:
            # filter TE GFF
            te_gff_fd, te_gff_file = tempfile.mkstemp()
            temp_files.append((te_gff_fd, te_gff_file))
            subprocess.call('intersectBed -a %s -b %s > %s'
                            % (options.repeats_gff, options.gff_file, te_gff_file),
                            shell=True)
            options.repeats_gff = te_gff_file

            # filter feature GFF; keep the input's extension on the temp
            # file because the format dispatch below keys on the last
            # three characters of the feature path
            feature_suffix = os.path.splitext(feature_gff)[1]
            feature_gff_gff_fd, feature_gff_gff_file = tempfile.mkstemp(suffix=feature_suffix)
            temp_files.append((feature_gff_gff_fd, feature_gff_gff_file))
            subprocess.call('intersectBed -s -u -f 0.5 -a %s -b %s > %s'
                            % (feature_gff, options.gff_file, feature_gff_gff_file),
                            shell=True)
            feature_gff = feature_gff_gff_file

        ############################################
        # lengths
        ############################################
        # compute size of search space
        if options.gff_file:
            genome_length = count_gff(options.gff_file)
        else:
            genome_length = count_hg19()

        # compute feature length
        feature_len, feature_num = feature_stats(feature_gff)

        if feature_num == 0:
            sys.stderr.write('Zero features\n')
            sys.exit()

        # hash counted repeat genomic bp
        with open(options.repeats_gff) as te_in:
            genome_te_bp = hash_te(te_in)

        ############################################
        # convert feature gff to bed
        ############################################
        feature_format = feature_gff[-3:]
        if feature_format == 'gtf':
            feature_bed_fd, feature_bed_file = tempfile.mkstemp()
            temp_files.append((feature_bed_fd, feature_bed_file))
            subprocess.call('gtf2bed.py %s > %s' % (feature_gff, feature_bed_file),
                            shell=True)
        elif feature_format == 'gff':
            feature_bed_fd, feature_bed_file = tempfile.mkstemp()
            temp_files.append((feature_bed_fd, feature_bed_file))
            subprocess.call('gff2bed.py %s > %s' % (feature_gff, feature_bed_file),
                            shell=True)
        elif feature_format == 'bed':
            feature_bed_file = feature_gff
        else:
            parser.error('Cannot recognize gff format suffix')

        ############################################
        # null distribution
        ############################################
        shuffle_bed_fd, shuffle_bed_file = tempfile.mkstemp()
        temp_files.append((shuffle_bed_fd, shuffle_bed_file))

        shuffle_cmd = ('shuffleBed -i %s'
                       ' -g %s/research/common/data/genomes/hg19/assembly/human.hg19.genome'
                       ' -excl %s/research/common/data/genomes/hg19/assembly/hg19_gaps.bed'
                       ' > %s'
                       % (feature_bed_file, home_dir, home_dir, shuffle_bed_file))

        te_null_bp = {}
        for ni in range(options.null_iterations):
            # progress indicator
            sys.stderr.write('%s\n' % ni)

            # shuffle feature bed (the same temp file is overwritten each round)
            subprocess.call(shuffle_cmd, shell=True)

            # intersect w/ TEs and hash overlaps
            te_tmp_bp = intersect_hash(options.repeats_gff, shuffle_bed_file)
            for te in genome_te_bp:
                te_null_bp.setdefault(te, []).append(te_tmp_bp.get(te, 0))

        ############################################
        # actual
        ############################################
        te_bp = intersect_hash(options.repeats_gff, feature_gff)

        ############################################
        # compute stats and print
        ############################################
        lines = []
        p_vals = []
        for te in genome_te_bp:
            te_count = te_bp.get(te, 0)

            feature_freq = float(te_count) / feature_len
            genome_freq = float(genome_te_bp[te]) / genome_length
            fold_change = feature_freq / genome_freq

            null_u, null_sd = stats.mean_sd(te_null_bp[te])
            # a constant null would give scale=0; fall back to unit scale
            if null_sd == 0:
                null_sd = 1.0

            # one-sided tail in the direction of the observed change
            if fold_change > 1:
                p = norm.sf(te_count - 1, loc=null_u, scale=null_sd)
            else:
                p = norm.cdf(te_count, loc=null_u, scale=null_sd)
            p_vals.append(p)

            cols = (te[0], te[1], te_count, feature_freq, genome_freq, fold_change, p)
            lines.append('%-18s %-18s %8d %11.2e %11.2e %9.2f %10.2e' % cols)

        # multiple hypothesis correction
        q_vals = fdr.ben_hoch(p_vals)
        for line, q_val in zip(lines, q_vals):
            print(line + ' %10.2e' % q_val)

    finally:
        ############################################
        # clean
        ############################################
        for temp_fd, temp_path in temp_files:
            os.close(temp_fd)
            os.remove(temp_path)
def main():
    """Test TE enrichment among features with a binomial model.

    Command line: ``%prog [options] <feature gff>``

    Options:
        -g  GFF file. When given it is sorted and merged into a temporary
            BED, the repeats annotation and the features are both
            restricted to it with intersectBed, and the search space size
            is taken from count_bed() instead of count_hg19().
        -r  Repeats GFF (defaults to the hg19 RepeatMasker file under
            $HOME).
        -s  Split statistics by strand: a TE/feature overlap is labelled
            '+' when the two strand columns are equal and '-' otherwise.

    For every (repeat, family) entry one row is printed to stdout: repeat,
    family, target length, number of distinct overlapping features,
    expected count, fold change, binomial p-value and the fdr.ben_hoch()
    corrected value.

    All temporary files are closed and removed on every exit path,
    including the early exit taken when there are zero features.
    """
    usage = 'usage: %prog [options] <feature gff>'
    parser = OptionParser(usage)
    parser.add_option('-g', dest='filter_gff',
                      help='Filter the TEs by overlap with genes in the given gff file [Default: %default]')
    parser.add_option('-r', dest='repeats_gff',
                      default='%s/research/common/data/genomes/hg19/annotation/repeatmasker/hg19.fa.out.tp.gff' % os.environ['HOME'])
    parser.add_option('-s', dest='strand_split', default=False, action='store_true',
                      help='Split statistics by strand [Default: %default]')
    (options, args) = parser.parse_args()

    # parser.error() prints the message and exits, so no else branch is needed
    if len(args) != 1:
        parser.error('Must provide a gff file for the feature of interest.')
    feature_gff = args[0]

    # (fd, path) of every temp file created below; released in the finally
    temp_files = []

    try:
        ############################################
        # GFF filter
        ############################################
        # filter TEs and features by gff file
        if options.filter_gff:
            filter_merged_bed_fd, filter_merged_bed_file = tempfile.mkstemp()
            temp_files.append((filter_merged_bed_fd, filter_merged_bed_file))
            subprocess.call('sortBed -i %s | mergeBed -i - > %s'
                            % (options.filter_gff, filter_merged_bed_file),
                            shell=True)

            # filter TE GFF
            te_gff_fd, te_gff_file = tempfile.mkstemp()
            temp_files.append((te_gff_fd, te_gff_file))
            subprocess.call('intersectBed -a %s -b %s > %s'
                            % (options.repeats_gff, filter_merged_bed_file, te_gff_file),
                            shell=True)
            options.repeats_gff = te_gff_file

            # filter feature GFF
            feature_gff_gff_fd, feature_gff_gff_file = tempfile.mkstemp()
            temp_files.append((feature_gff_gff_fd, feature_gff_gff_file))
            subprocess.call('intersectBed -u -f 0.5 -a %s -b %s > %s'
                            % (feature_gff, filter_merged_bed_file, feature_gff_gff_file),
                            shell=True)
            feature_gff = feature_gff_gff_file

        ############################################
        # lengths
        ############################################
        # compute feature length
        feature_len, feature_num = feature_stats(feature_gff)

        if feature_num == 0:
            sys.stderr.write('Zero features\n')
            sys.exit()

        # compute size of search space
        if options.filter_gff:
            genome_length = count_bed(filter_merged_bed_file, feature_len)
        else:
            genome_length = count_hg19()

        # hash counted repeat genomic bp
        te_lengths = te_target_size(options.repeats_gff, feature_len)

        ############################################
        # hash TE/feature overlaps
        ############################################
        # initialize: one empty set per (repeat, family) key, plus the '*'
        # wildcard keys, each duplicated per strand label under -s
        strand_labels = ('+', '-') if options.strand_split else ('',)
        te_features = {}
        for rep, fam in te_lengths:
            for label in strand_labels:
                te_features[(rep + label, fam)] = set()
                te_features[('*' + label, fam)] = set()
                te_features[('*' + label, '*')] = set()

        p = subprocess.Popen('intersectBed -wo -a %s -b %s' % (options.repeats_gff, feature_gff),
                             shell=True, stdout=subprocess.PIPE)
        for line in p.stdout:
            a = line.split('\t')

            # columns 0-8 are read as the TE record, 9 onward as the feature
            kv = gff.gtf_kv(a[8])
            rep = kv['repeat']
            fam = kv['family']

            feature_key = (a[9], int(a[12]), int(a[13]))

            rep_star = '*'
            if options.strand_split:
                tstrand = a[6]
                fstrand = a[15]
                label = '+' if tstrand == fstrand else '-'
                rep += label
                rep_star += label

            # sets de-duplicate features hit by several TEs of one class
            te_features[(rep, fam)].add(feature_key)
            te_features[(rep_star, fam)].add(feature_key)
            te_features[(rep_star, '*')].add(feature_key)

        p.communicate()

        ############################################
        # compute stats and print
        ############################################
        lines = []
        p_vals = []
        for te in te_features:
            rep, fam = te
            if options.strand_split:
                # drop the strand label to look up the length; the
                # per-feature probability is halved for each strand label
                te_len = te_lengths[(rep[:-1], fam)]
                te_p = float(te_len) / (2 * genome_length)
            else:
                te_len = te_lengths[(rep, fam)]
                te_p = float(te_len) / genome_length

            te_count = len(te_features[te])
            exp_count = te_p * feature_num
            fold_change = te_count / exp_count

            # one-sided tail in the direction of the observed change
            if fold_change > 1:
                p_val = binom.sf(te_count - 1, feature_num, te_p)
            else:
                p_val = binom.cdf(te_count, feature_num, te_p)
            p_vals.append(p_val)

            cols = (rep, fam, te_len, te_count, exp_count, fold_change, p_val)
            lines.append('%-18s %-18s %9d %8d %8.1f %8.2f %10.2e' % cols)

        # multiple hypothesis correction
        q_vals = fdr.ben_hoch(p_vals)
        for line, q_val in zip(lines, q_vals):
            print(line + ' %10.2e' % q_val)

    finally:
        ############################################
        # clean
        ############################################
        for temp_fd, temp_path in temp_files:
            os.close(temp_fd)
            os.remove(temp_path)
def main():
    """Test TE enrichment among features with a binomial model.

    Command line: ``%prog [options] <feature gff>``

    Options:
        -g  GFF file. When given it is sorted and merged into a temporary
            BED, the repeats annotation and the features are both
            restricted to it with intersectBed, and the search space size
            is taken from count_bed() instead of count_hg19().
        -r  Repeats GFF (defaults to the hg19 RepeatMasker file under
            $HOME).
        -s  Split statistics by strand: a TE/feature overlap is labelled
            '+' when the two strand columns are equal and '-' otherwise.

    For every (repeat, family) entry one row is printed to stdout: repeat,
    family, target length, number of distinct overlapping features,
    expected count, fold change, binomial p-value and the fdr.ben_hoch()
    corrected value.

    All temporary files are closed and removed on every exit path,
    including the early exit taken when there are zero features.
    """
    usage = 'usage: %prog [options] <feature gff>'
    parser = OptionParser(usage)
    parser.add_option(
        '-g',
        dest='filter_gff',
        help='Filter the TEs by overlap with genes in the given gff file [Default: %default]')
    parser.add_option(
        '-r',
        dest='repeats_gff',
        default='%s/research/common/data/genomes/hg19/annotation/repeatmasker/hg19.fa.out.tp.gff'
        % os.environ['HOME'])
    parser.add_option(
        '-s',
        dest='strand_split',
        default=False,
        action='store_true',
        help='Split statistics by strand [Default: %default]')
    (options, args) = parser.parse_args()

    # parser.error() prints the message and exits, so no else branch is needed
    if len(args) != 1:
        parser.error('Must provide a gff file for the feature of interest.')
    feature_gff = args[0]

    # (fd, path) of every temp file created below; released in the finally
    temp_files = []

    try:
        ############################################
        # GFF filter
        ############################################
        # filter TEs and features by gff file
        if options.filter_gff:
            filter_merged_bed_fd, filter_merged_bed_file = tempfile.mkstemp()
            temp_files.append((filter_merged_bed_fd, filter_merged_bed_file))
            subprocess.call(
                'sortBed -i %s | mergeBed -i - > %s'
                % (options.filter_gff, filter_merged_bed_file),
                shell=True)

            # filter TE GFF
            te_gff_fd, te_gff_file = tempfile.mkstemp()
            temp_files.append((te_gff_fd, te_gff_file))
            subprocess.call(
                'intersectBed -a %s -b %s > %s'
                % (options.repeats_gff, filter_merged_bed_file, te_gff_file),
                shell=True)
            options.repeats_gff = te_gff_file

            # filter feature GFF
            feature_gff_gff_fd, feature_gff_gff_file = tempfile.mkstemp()
            temp_files.append((feature_gff_gff_fd, feature_gff_gff_file))
            subprocess.call(
                'intersectBed -u -f 0.5 -a %s -b %s > %s'
                % (feature_gff, filter_merged_bed_file, feature_gff_gff_file),
                shell=True)
            feature_gff = feature_gff_gff_file

        ############################################
        # lengths
        ############################################
        # compute feature length
        feature_len, feature_num = feature_stats(feature_gff)

        if feature_num == 0:
            sys.stderr.write('Zero features\n')
            sys.exit()

        # compute size of search space
        if options.filter_gff:
            genome_length = count_bed(filter_merged_bed_file, feature_len)
        else:
            genome_length = count_hg19()

        # hash counted repeat genomic bp
        te_lengths = te_target_size(options.repeats_gff, feature_len)

        ############################################
        # hash TE/feature overlaps
        ############################################
        # initialize: one empty set per (repeat, family) key, plus the '*'
        # wildcard keys, each duplicated per strand label under -s
        if options.strand_split:
            strand_labels = ('+', '-')
        else:
            strand_labels = ('',)

        te_features = {}
        for rep, fam in te_lengths:
            for label in strand_labels:
                te_features[(rep + label, fam)] = set()
                te_features[('*' + label, fam)] = set()
                te_features[('*' + label, '*')] = set()

        p = subprocess.Popen(
            'intersectBed -wo -a %s -b %s' % (options.repeats_gff, feature_gff),
            shell=True,
            stdout=subprocess.PIPE)
        for line in p.stdout:
            a = line.split('\t')

            # columns 0-8 are read as the TE record, 9 onward as the feature
            kv = gff.gtf_kv(a[8])
            rep = kv['repeat']
            fam = kv['family']

            fchrom = a[9]
            fstart = int(a[12])
            fend = int(a[13])
            feature_key = (fchrom, fstart, fend)

            rep_star = '*'
            if options.strand_split:
                tstrand = a[6]
                fstrand = a[15]
                if tstrand == fstrand:
                    label = '+'
                else:
                    label = '-'
                rep += label
                rep_star += label

            # sets de-duplicate features hit by several TEs of one class
            te_features[(rep, fam)].add(feature_key)
            te_features[(rep_star, fam)].add(feature_key)
            te_features[(rep_star, '*')].add(feature_key)

        p.communicate()

        ############################################
        # compute stats and print
        ############################################
        lines = []
        p_vals = []
        for te in te_features:
            rep, fam = te
            if options.strand_split:
                # drop the strand label to look up the length; the
                # per-feature probability is halved for each strand label
                te_len = te_lengths[(rep[:-1], fam)]
                te_p = float(te_len) / (2 * genome_length)
            else:
                te_len = te_lengths[(rep, fam)]
                te_p = float(te_len) / genome_length

            te_count = len(te_features[te])
            exp_count = te_p * feature_num
            fold_change = te_count / exp_count

            # one-sided tail in the direction of the observed change
            if fold_change > 1:
                p_val = binom.sf(te_count - 1, feature_num, te_p)
            else:
                p_val = binom.cdf(te_count, feature_num, te_p)
            p_vals.append(p_val)

            cols = (rep, fam, te_len, te_count, exp_count, fold_change, p_val)
            lines.append('%-18s %-18s %9d %8d %8.1f %8.2f %10.2e' % cols)

        # multiple hypothesis correction
        q_vals = fdr.ben_hoch(p_vals)
        for line, q_val in zip(lines, q_vals):
            print(line + ' %10.2e' % q_val)

    finally:
        ############################################
        # clean
        ############################################
        for temp_fd, temp_path in temp_files:
            os.close(temp_fd)
            os.remove(temp_path)