def main():
    """Validate the command line and run the selected transcript analysis.

    Checks that 'pfam_scan.pl', 'hmmscan' and 'signalp' are on PATH, that
    the Pfam directory passes ``check_pfam_dir`` and that the genome FASTA
    and GTF files exist, creates the output directory when it is missing,
    then dispatches on ``--mode``:

    * ``full``      -> ``full_transcript_analysis``
    * ``orf``       -> ``orf_analysis`` with ``first_orf_only`` False
    * ``first_orf`` -> ``orf_analysis`` with ``first_orf_only`` True

    Returns:
        The value returned by the analysis function that was run.
    """
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    # Both paths are mandatory: without them the existence checks below
    # would be handed None and fail with a TypeError instead of a usage
    # message.
    parser.add_argument('--pfam', dest='pfam_dir', required=True)
    parser.add_argument('--genome-fasta', dest='genome_fasta_file',
                        required=True)
    parser.add_argument('--min-orf-length', dest='min_orf_length',
                        type=int, default=30)
    parser.add_argument('-o', '--output-dir', dest='output_dir',
                        default='out')
    parser.add_argument('-p', '--num-processes', dest='num_processes',
                        type=int, default=1)
    parser.add_argument('--mode', dest='mode',
                        choices=['orf', 'first_orf', 'full'],
                        default='orf')
    parser.add_argument('gtf_file')
    args = parser.parse_args()
    # get args
    pfam_dir = args.pfam_dir
    genome_fasta_file = args.genome_fasta_file
    gtf_file = args.gtf_file
    min_orf_length = args.min_orf_length
    num_processes = args.num_processes
    output_dir = args.output_dir
    mode = args.mode
    # check command line parameters
    if which('pfam_scan.pl') is None:
        parser.error("'pfam_scan.pl' not found in PATH")
    if which('hmmscan') is None:
        parser.error("'hmmscan' not found in PATH")
    if which('signalp') is None:
        parser.error("'signalp' not found in PATH")
    if not check_pfam_dir(pfam_dir):
        parser.error("Required Pfam-A and Pfam-B files not found at '%s'" %
                     (pfam_dir))
    if not os.path.exists(genome_fasta_file):
        parser.error("Genome FASTA file '%s' not found" %
                     (genome_fasta_file))
    if not os.path.exists(gtf_file):
        parser.error("GTF file '%s' not found" % (gtf_file))
    # create output dir (an existing directory is reused, not rejected)
    if not os.path.exists(output_dir):
        logging.info("Creating output directory '%s'" % (output_dir))
        os.makedirs(output_dir)
    if mode == 'full':
        return full_transcript_analysis(gtf_file, genome_fasta_file,
                                        pfam_dir, output_dir,
                                        num_processes)
    first_orf_only = (mode == 'first_orf')
    return orf_analysis(gtf_file, genome_fasta_file, pfam_dir,
                        output_dir, min_orf_length, first_orf_only,
                        num_processes)
def main():
    """Validate the command line and run the selected transcript analysis.

    Checks that 'pfam_scan.pl', 'hmmscan' and 'signalp' are on PATH, that
    the Pfam directory passes ``check_pfam_dir`` and that the genome FASTA
    and GTF files exist, creates the output directory when it is missing,
    then dispatches on ``--mode``:

    * ``full``      -> ``full_transcript_analysis``
    * ``orf``       -> ``orf_analysis`` with ``first_orf_only`` False
    * ``first_orf`` -> ``orf_analysis`` with ``first_orf_only`` True

    Returns:
        The value returned by the analysis function that was run.
    """
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    # Both paths are mandatory: without them the existence checks below
    # would be handed None and fail with a TypeError instead of a usage
    # message.
    parser.add_argument('--pfam', dest='pfam_dir', required=True)
    parser.add_argument('--genome-fasta', dest='genome_fasta_file',
                        required=True)
    parser.add_argument('--min-orf-length', dest='min_orf_length',
                        type=int, default=30)
    parser.add_argument('-o', '--output-dir', dest='output_dir',
                        default='out')
    parser.add_argument('-p', '--num-processes', dest='num_processes',
                        type=int, default=1)
    parser.add_argument('--mode', dest='mode',
                        choices=['orf', 'first_orf', 'full'],
                        default='orf')
    parser.add_argument('gtf_file')
    args = parser.parse_args()
    # get args
    pfam_dir = args.pfam_dir
    genome_fasta_file = args.genome_fasta_file
    gtf_file = args.gtf_file
    min_orf_length = args.min_orf_length
    num_processes = args.num_processes
    output_dir = args.output_dir
    mode = args.mode
    # check command line parameters
    if which('pfam_scan.pl') is None:
        parser.error("'pfam_scan.pl' not found in PATH")
    if which('hmmscan') is None:
        parser.error("'hmmscan' not found in PATH")
    if which('signalp') is None:
        parser.error("'signalp' not found in PATH")
    if not check_pfam_dir(pfam_dir):
        parser.error("Required Pfam-A and Pfam-B files not found at '%s'" %
                     (pfam_dir))
    if not os.path.exists(genome_fasta_file):
        parser.error("Genome FASTA file '%s' not found" %
                     (genome_fasta_file))
    if not os.path.exists(gtf_file):
        parser.error("GTF file '%s' not found" % (gtf_file))
    # create output dir (an existing directory is reused, not rejected)
    if not os.path.exists(output_dir):
        logging.info("Creating output directory '%s'" % (output_dir))
        os.makedirs(output_dir)
    if mode == 'full':
        return full_transcript_analysis(gtf_file, genome_fasta_file,
                                        pfam_dir, output_dir,
                                        num_processes)
    first_orf_only = (mode == 'first_orf')
    return orf_analysis(gtf_file, genome_fasta_file, pfam_dir,
                        output_dir, min_orf_length, first_orf_only,
                        num_processes)
def main():
    """Measure GWAS SNP overlap of an assembly and of shuffled assemblies.

    Steps, all using the scratch directory 'GWAS_TMPS':

    1. Write one interval per locus of the GTF file (chrom, min start,
       max end, running index) to 'locus_intervals.bed'.
    2. Extend every interval of the assembly BED file by ``--flank`` bases
       on both sides, clamped to [0, chromosome length].
    3. Run ``bedtools intersect`` of the GWAS SNPs and of the SNP universe
       against the flanked assembly and count the distinct values found in
       column ``RSIDCOL`` of each result.
    4. Run ``shuffle_imap`` ``--shuffs`` times in a process pool and print
       a header line followed by each returned line to stdout.

    The scratch directory is removed before returning.

    Returns:
        0 on success, 1 when no SNP of the SNP universe overlaps the
        assembly (the overlap fraction is then undefined).
    """
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--assembly", dest='assembly_bed', default=intergenic_assembly_bed,
        help='Assembly file used for shuffling and snp overlap intersection')
    parser.add_argument("--snps", dest='snps', default=snp_bed,
                        help='SNP universe bed file')
    parser.add_argument("--excl", dest='excl', default=excl_file,
                        help='Exclusion file used for shuffling')
    parser.add_argument("--chrom", dest='chrom', default=chrom_sizes_file,
                        help='Chrom size file used for shuffling')
    parser.add_argument(
        "--gtf", dest='gtf', default=gtf_file,
        help='GTF file used to generate shuffle (should match assembly_bed)')
    parser.add_argument("--gwas", dest='gwas', default=gwas_bed,
                        help='GWAS bed file file used for intersection')
    parser.add_argument("--shuffs", dest='shuffs', type=int, default=100,
                        help='number of shuffles to perform')
    parser.add_argument("-p", dest='proc', type=int, default=4,
                        help='number of processors to use')
    parser.add_argument("--flank", dest='flank', type=int, default=0,
                        help='number of flanking bases to add to bed files')
    args = parser.parse_args()
    logging.info('Output is printed to stdout, to save use \'>\' <filename>')

    # check command line parameters
    if which('bedtools') is None:
        parser.error('bedtools binary not found in PATH')
    # Validate the file that is actually going to be read (--chrom), not
    # only the module-level default.
    if not os.path.exists(args.chrom):
        parser.error('chrom sizes file %s not found' % (args.chrom))

    prefix = 'GWAS_TMPS'
    if not os.path.isdir(prefix):
        os.mkdir(prefix)
    locus_intervals_file = os.path.join(prefix, 'locus_intervals.bed')
    intersect_file = os.path.join(prefix, 'intersect.txt')
    assembly_flank = os.path.join(prefix, 'flank.bed')

    def count_overlapping_snps(snp_bed_path, extra_options=()):
        """Intersect a SNP BED file with the flanked assembly; count rsIDs."""
        cmd = ['bedtools', 'intersect', '-a', snp_bed_path,
               '-b', assembly_flank, '-wa', '-wb']
        cmd.extend(extra_options)
        with open(intersect_file, 'w') as out_fh:
            retcode = subprocess.call(cmd, stdout=out_fh)
        if retcode != 0:
            logging.warning('bedtools intersect exited with status %d'
                            % retcode)
        with open(intersect_file) as in_fh:
            return len(set(row.strip().split('\t')[RSIDCOL]
                           for row in in_fh))

    logging.info('Parsing GTF file')
    with open(args.gtf) as gtf_fh:
        with open(locus_intervals_file, 'w') as out_fh:
            for locus_id, locus_transcripts in enumerate(parse_gtf(gtf_fh)):
                # find borders of locus
                locus_chrom = locus_transcripts[0].chrom
                locus_start = min(t.start for t in locus_transcripts)
                locus_end = max(t.end for t in locus_transcripts)
                out_fh.write('\t'.join(map(str, [locus_chrom, locus_start,
                                                 locus_end, locus_id]))
                             + '\n')

    # read chrom file to make sure flanks added do not enter chrom ends;
    # lengths must be integers for the min() clamp below to be meaningful
    chrom_length = {}
    with open(args.chrom) as chrom_fh:
        for row in chrom_fh:
            fields = row.strip().split('\t')
            chrom_length[fields[0]] = int(fields[1])

    # apply flank to the bed file
    with open(args.assembly_bed) as in_fh:
        with open(assembly_flank, 'w') as out_fh:
            for row in in_fh:
                fields = row.strip().split('\t')
                chrom = fields[0]
                fields[1] = max(0, int(fields[1]) - args.flank)
                fields[2] = min(chrom_length[chrom],
                                int(fields[2]) + args.flank)
                out_fh.write('\t'.join(map(str, fields)) + '\n')

    # GWAS snps: intersections for real data
    logging.info('Intersecting assembly with GWAS snps')
    gwas_overlap = count_overlapping_snps(args.gwas)

    # snp universe: intersections for real data
    logging.info('Intersecting assembly with snp universe')
    snp_overlap = count_overlapping_snps(args.snps, ['-sorted'])

    if snp_overlap == 0:
        logging.error('No snps from the snp universe overlap the assembly; '
                      'cannot compute the overlap fraction')
        shutil.rmtree(prefix)
        return 1
    frac_real = float(gwas_overlap) / snp_overlap
    logging.info('%d GWAS snps overlap compendia genes' % gwas_overlap)
    logging.info('%d snps (from \"snp universe\") overlap compendia genes'
                 % snp_overlap)
    logging.info('Frac: %f' % frac_real)

    # loop the shuffle to generate a distribution of nulls for number of
    # snps hit by random intergenic genes
    pool = multiprocessing.Pool(args.proc)
    shuff_args = (args.snps, args.gwas, args.excl, locus_intervals_file,
                  args.chrom, args.gtf, frac_real, gwas_overlap,
                  snp_overlap, args.shuffs, prefix, args.flank)
    # each task is the shuffle index followed by the shared arguments
    tasks = [(i,) + shuff_args for i in range(args.shuffs)]
    header = [
        'gwas_shuff',
        'snp_shuff',
        'frac_shuff',
        'gwas_real',
        'snp_real',
        'frac_real',
        'OR'
    ]
    print('\t'.join(header))
    for line in pool.imap_unordered(shuffle_imap, tasks):
        print(line)
    pool.close()
    pool.join()
    shutil.rmtree(prefix)
    return 0
def main():
    """Shuffle gene positions with bedtools and print repositioned transcripts.

    Positional arguments: gtf_file, excl_file, chrom_sizes, output_prefix.

    Steps:

    1. Write one BED line per gene interval (from ``get_gene_intervals``)
       to '<prefix>.gene_intervals.bed'.
    2. Run ``bedtools shuffle`` on that file, excluding ``excl_file``, and
       store the result in '<prefix>.gene_intervals.shuffle.bed'.
    3. Re-read the GTF file, shift every transcript and its exons by the
       offset between the original and shuffled gene start, and print one
       BED line per transcript (from ``write_bed``) to stdout. Genes that
       are missing from the shuffled file are skipped with a warning.
    4. Call ``sort_gtf`` on '<prefix>.shuffle.gtf'.

    NOTE(review): '<prefix>.shuffle.gtf' is opened for writing but nothing
    is written to it (the GTF output lines are commented out and BED goes
    to stdout instead), so ``sort_gtf`` receives an empty file. Confirm
    whether that is intended.
    """
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('gtf_file')
    parser.add_argument('excl_file')
    parser.add_argument('chrom_sizes')
    parser.add_argument("output_prefix")
    args = parser.parse_args()
    prefix = args.output_prefix
    excl_file = args.excl_file
    chrom_sizes_file = args.chrom_sizes
    gtf_file = args.gtf_file
    # check command line parameters
    if which('bedtools') is None:
        parser.error('bedtools binary not found in PATH')
    if not os.path.exists(chrom_sizes_file):
        parser.error('chrom sizes file %s not found' % (chrom_sizes_file))
    gene_intervals_file = prefix + '.gene_intervals.bed'
    gene_intervals_shuffled_file = prefix + '.gene_intervals.shuffle.bed'
    shuffled_gtf_file = prefix + '.shuffle.gtf'
    sorted_shuffled_gtf_file = prefix + '.shuffle.srt.gtf'

    logging.info('Parsing GTF file')
    with open(gtf_file) as gtf_fh:
        with open(gene_intervals_file, 'w') as out_fh:
            for locus_transcripts in parse_gtf(gtf_fh):
                # find borders of locus
                locus_chrom = locus_transcripts[0].chrom
                locus_start = min(t.start for t in locus_transcripts)
                locus_end = max(t.end for t in locus_transcripts)
                logging.debug("[LOCUS] %s:%d-%d %d transcripts" %
                              (locus_chrom, locus_start, locus_end,
                               len(locus_transcripts)))
                for g in get_gene_intervals(locus_transcripts):
                    out_fh.write('\t'.join(map(str, [g.chrom, g.start,
                                                     g.end, g.gene_id]))
                                 + '\n')

    # randomly shuffle genes
    logging.info("Shuffling genes")
    shuffle_cmd = ['bedtools', 'shuffle',
                   '-excl', excl_file,
                   '-i', gene_intervals_file,
                   '-g', chrom_sizes_file]
    with open(gene_intervals_shuffled_file, 'w') as fileh:
        retcode = subprocess.call(shuffle_cmd, stdout=fileh)
    if retcode != 0:
        # Processing continues; genes absent from the output are reported
        # individually below.
        logging.error('bedtools shuffle exited with status %d' % retcode)

    # read new gene positions
    logging.info("Reading shuffled gene intervals")
    shuffle_gene_map = {}
    with open(gene_intervals_shuffled_file) as fileh:
        for line in fileh:
            fields = line.strip().split('\t')
            gene_id = fields[3]
            shuffle_gene_map[gene_id] = (fields[0], int(fields[1]),
                                         int(fields[2]))

    # reposition transcripts
    logging.info("Repositioning transcripts")
    with open(gtf_file) as gtf_fh:
        # Opening in 'w' mode creates/truncates the file that sort_gtf
        # reads afterwards; see the NOTE(review) in the docstring.
        with open(shuffled_gtf_file, 'w'):
            for locus_transcripts in parse_gtf(gtf_fh):
                # get original positions
                orig_gene_map = {}
                for g in get_gene_intervals(locus_transcripts):
                    orig_gene_map[g.gene_id] = (g.chrom, g.start, g.end)
                for t in locus_transcripts:
                    gene_id = t.attrs['gene_id']
                    orig_chrom, orig_start, orig_end = orig_gene_map[gene_id]
                    if gene_id not in shuffle_gene_map:
                        logging.warning(
                            'Gene %s [%s:%d-%d] could not be shuffled' %
                            (gene_id, orig_chrom, orig_start, orig_end))
                        continue
                    new_chrom, new_start, new_end = shuffle_gene_map[gene_id]
                    # reposition transcript, keeping its offset from the
                    # gene start
                    t.chrom = new_chrom
                    t.start = new_start + (t.start - orig_start)
                    t.end = new_start + (t.end - orig_start)
                    for e in t.exons:
                        e.start = new_start + (e.start - orig_start)
                        e.end = new_start + (e.end - orig_start)
                    fields = write_bed(t.chrom, t.attrs['transcript_id'],
                                       t.strand, 1000, t.exons)
                    print('\t'.join(fields))
    logging.info("Sorting GTF file")
    sort_gtf(shuffled_gtf_file, sorted_shuffled_gtf_file)
def main():
    """Measure GWAS and random SNP overlap of a GTF assembly and its shuffles.

    Steps, all using the scratch directory 'GWAS_TMPS':

    1. Read chromosome lengths from the ``--chrom`` file.
    2. Parse the ``--gtf`` file, writing one BED line per transcript (from
       ``write_bed``) to 'assembly.bed' and one interval per locus to
       'locus_intervals.bed'.
    3. Extend every 'assembly.bed' interval by ``--flank`` bases on both
       sides, clamped to [0, chromosome length], and sort the result with
       ``sort -k1,1 -k2,2n``.
    4. Run ``bedtools intersect -sorted`` (plus ``-split`` with ``--exon``)
       of the GWAS SNPs, the random SNPs and the SNP universe against the
       sorted file, counting the distinct values found in column
       ``RSIDCOL`` of each result.
    5. Run ``shuffle_imap`` ``--shuffs`` times in a process pool and print
       a header line followed by each returned line to stdout.

    The scratch directory is removed before returning.

    NOTE(review): ``args.flank`` is passed to ``write_bed`` and is also
    applied again in step 3; whether the flank is therefore added twice
    depends on ``write_bed``, which is not visible here.

    Returns:
        0 on success, 1 when no SNP of the SNP universe overlaps the
        assembly (the overlap fractions are then undefined).
    """
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--rand_snp", dest='rand_snp', default=rand_snps_file,
        help='Bed file of random snps to use as negative control for analyses')
    parser.add_argument("--snps", dest='snps', default=snp_bed,
                        help='SNP universe bed file')
    parser.add_argument("--excl", dest='excl', default=excl_file,
                        help='Exclusion file used for shuffling')
    parser.add_argument("--chrom", dest='chrom', default=chrom_sizes_file,
                        help='Chrom size file used for shuffling')
    # No default exists for the GTF file, so it has to be supplied.
    parser.add_argument("--gtf", dest='gtf', required=True,
                        help='GTF file used to generate shuffle')
    parser.add_argument("--gwas", dest='gwas', default=gwas_bed_file,
                        help='GWAS bed file file used for intersection')
    parser.add_argument("--shuffs", dest='shuffs', type=int, default=100,
                        help='number of shuffles to perform')
    parser.add_argument("-p", dest='proc', type=int, default=4,
                        help='number of processors to use')
    parser.add_argument("--flank", dest='flank', type=int, default=0,
                        help='number of flanking bases to add to bed files')
    parser.add_argument(
        "--exon", dest="exon", action="store_true", default=False,
        help="Perform analysis looking only at exonic overlap")
    args = parser.parse_args()
    logging.info('Output is printed to stdout')
    if args.exon:
        logging.info('Looking at exonic overlap only')

    # check command line parameters
    if which('bedtools') is None:
        parser.error('bedtools binary not found in PATH')
    # Validate the file that is actually going to be read (--chrom), not
    # only the module-level default.
    if not os.path.exists(args.chrom):
        parser.error('chrom sizes file %s not found' % (args.chrom))

    prefix = 'GWAS_TMPS'
    if not os.path.isdir(prefix):
        os.mkdir(prefix)
    locus_intervals_file = os.path.join(prefix, 'locus_intervals.bed')
    intersect_file = os.path.join(prefix, 'intersect.txt')
    assembly_flank = os.path.join(prefix, 'flank.bed')
    # Kept inside the scratch directory so that rmtree() removes it too.
    assembly_flank_sorted = os.path.join(prefix, 'flank.sorted.bed')
    assembly_bed = os.path.join(prefix, 'assembly.bed')

    def count_overlapping_snps(snp_bed_path):
        """Intersect a SNP BED file with the sorted assembly; count rsIDs."""
        cmd = ['bedtools', 'intersect', '-a', snp_bed_path,
               '-b', assembly_flank_sorted, '-sorted', '-wa', '-wb']
        if args.exon:
            cmd.append('-split')
        with open(intersect_file, 'w') as out_fh:
            retcode = subprocess.call(cmd, stdout=out_fh)
        if retcode != 0:
            logging.warning('bedtools intersect exited with status %d'
                            % retcode)
        with open(intersect_file) as in_fh:
            return len(set(row.strip().split('\t')[RSIDCOL]
                           for row in in_fh))

    # read chrom file to make sure flanks added do not enter chrom ends.
    # Values stay as the raw strings from the file because this mapping is
    # handed to write_bed unchanged; they are converted to int where they
    # are compared below.
    chrom_length = {}
    with open(args.chrom) as chrom_fh:
        for row in chrom_fh:
            fields = row.strip().split('\t')
            chrom_length[fields[0]] = fields[1]

    # convert GTF file to BED for initial intersections and get locus
    # intervals
    logging.info('Parsing GTF: converting to BED and obtaining locus '
                 'intervals')
    with open(args.gtf) as gtf_fh:
        with open(assembly_bed, 'w') as bed_fh:
            with open(locus_intervals_file, 'w') as locus_fh:
                for locus_id, locus_transcripts in enumerate(
                        parse_gtf(gtf_fh)):
                    if (locus_id % 2500) == 0:
                        logging.debug('Finished %d/%d loci'
                                      % (locus_id, 35000))
                    for t in locus_transcripts:
                        name = t.attrs['transcript_id']
                        fields = write_bed(t.chrom, name, t.strand, 1000,
                                           t.exons, args.flank,
                                           chrom_length)
                        bed_fh.write('\t'.join(fields) + '\n')
                    # find borders of locus
                    locus_chrom = locus_transcripts[0].chrom
                    locus_start = min(t.start for t in locus_transcripts)
                    locus_end = max(t.end for t in locus_transcripts)
                    locus_fh.write('\t'.join(map(str, [locus_chrom,
                                                       locus_start,
                                                       locus_end,
                                                       locus_id]))
                                   + '\n')

    # apply flank to the bed file
    with open(assembly_bed) as in_fh:
        with open(assembly_flank, 'w') as out_fh:
            for row in in_fh:
                fields = row.strip().split('\t')
                chrom = fields[0]
                # int() is required: comparing the raw string with an int
                # would never clamp at the chromosome end.
                chrom_end = int(chrom_length[chrom])
                fields[1] = max(0, int(fields[1]) - args.flank)
                fields[2] = min(chrom_end, int(fields[2]) + args.flank)
                out_fh.write('\t'.join(map(str, fields)) + '\n')
    args_sort = ['sort', '-k1,1', '-k2,2n', assembly_flank]
    with open(assembly_flank_sorted, 'w') as fileh:
        subprocess.call(args_sort, stdout=fileh)

    # GWAS snps
    logging.info('Intersecting assembly with GWAS snps')
    gwas_overlap = count_overlapping_snps(args.gwas)
    # Random snps
    logging.info('Intersecting assembly with random snps')
    rand_overlap = count_overlapping_snps(args.rand_snp)
    # snp universe
    logging.info('Intersecting assembly with snp universe')
    snp_overlap = count_overlapping_snps(args.snps)

    if snp_overlap == 0:
        logging.error('No snps from the snp universe overlap the assembly; '
                      'cannot compute the overlap fractions')
        shutil.rmtree(prefix)
        return 1
    frac_real = float(gwas_overlap) / snp_overlap
    frac_rand = float(rand_overlap) / snp_overlap
    if args.exon:
        logging.info('%d GWAS snps overlap compendia exons' % gwas_overlap)
        logging.info('%d random snps overlap compendia exons' % rand_overlap)
        logging.info('%d total snps overlap compendia exons' % snp_overlap)
    else:
        logging.info('%d GWAS snps overlap compendia genes' % gwas_overlap)
        logging.info('%d random snps overlap compendia genes' % rand_overlap)
        logging.info('%d total snps overlap compendia genes' % snp_overlap)
    logging.info('Frac_gwas: %f' % frac_real)
    logging.info('Frac_rand: %f' % frac_rand)

    # loop the shuffle to generate a distribution of nulls for number of
    # snps hit by random intergenic genes
    pool = multiprocessing.Pool(args.proc)
    shuff_args = (args.snps, args.gwas, args.rand_snp, args.excl,
                  locus_intervals_file, args.chrom, args.gtf,
                  frac_real, frac_rand, gwas_overlap, rand_overlap,
                  snp_overlap, args.shuffs, prefix, args.flank, args.exon)
    header = [
        'gwas_shuff_overlap',
        'rand_snp_shuff_overlap',
        'all_snp_shuff_overlap',
        'frac_gwas_shuff',
        'frac_rand_shuff',
        'gwas_overlap',
        'rand_snp_overlap',
        'all_snp_overlap',
        'frac_gwas',
        'frac_rand',
        'OR_gwas',
        'OR_rand'
    ]
    print('\t'.join(header))
    logging.info("Beginning shuffles")
    # each task is the shuffle index followed by the shared arguments
    tasks = [(i,) + shuff_args for i in range(args.shuffs)]
    for line in pool.imap_unordered(shuffle_imap, tasks):
        print(line)
    pool.close()
    pool.join()
    shutil.rmtree(prefix)
    return 0
def main():
    """Report GWAS SNP overlap of a flanked assembly as one line on stdout.

    Steps, all using the scratch directory 'GWAS_TMPS':

    1. Write one BED line per gene interval of the GTF file (from
       ``get_gene_intervals``) to 'gene_intervals.bed'.
    2. Extend every interval of the assembly BED file by ``--flank`` bases
       on both sides, clamped to [0, chromosome length].
    3. Run ``bedtools intersect`` of the GWAS SNPs and of the SNP universe
       against the flanked assembly and count the distinct values found in
       column ``RSIDCOL`` of each result.
    4. Print flank, GWAS overlap, SNP-universe overlap and their ratio as
       one tab-separated line.

    The scratch directory is removed before returning.

    NOTE(review): 'gene_intervals.bed' is written but not read again in
    this function before the scratch directory is deleted.

    Returns:
        0 on success, 1 when no SNP of the SNP universe overlaps the
        assembly (the overlap fraction is then undefined).
    """
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--assembly", dest='assembly_bed', default=intergenic_assembly_bed,
        help='Assembly file used for shuffling and snp overlap intersection')
    parser.add_argument("--snps", dest='snps', default=snp_bed,
                        help='SNP universe bed file')
    parser.add_argument("--excl", dest='excl', default=excl_file,
                        help='Exclusion file used for shuffling')
    parser.add_argument("--chrom", dest='chrom', default=chrom_sizes_file,
                        help='Chrom size file used for shuffling')
    parser.add_argument(
        "--gtf", dest='gtf', default=gtf_file,
        help='GTF file used to generate shuffle (should match assembly_bed)')
    parser.add_argument("--gwas", dest='gwas', default=gwas_bed,
                        help='GWAS bed file file used for intersection')
    parser.add_argument("--flank", dest='flank', type=int, default=0,
                        help='number of flanking bases to add to bed files')
    args = parser.parse_args()
    logging.info('Output is printed to stdout, to save use \'>\' <filename>')

    # check command line parameters
    if which('bedtools') is None:
        parser.error('bedtools binary not found in PATH')
    # Validate the file that is actually going to be read (--chrom), not
    # only the module-level default.
    if not os.path.exists(args.chrom):
        parser.error('chrom sizes file %s not found' % (args.chrom))

    prefix = 'GWAS_TMPS'
    if not os.path.isdir(prefix):
        os.mkdir(prefix)
    gene_intervals_file = os.path.join(prefix, 'gene_intervals.bed')
    intersect_file = os.path.join(prefix, 'intersect.txt')
    assembly_flank = os.path.join(prefix, 'flank.bed')

    def count_overlapping_snps(snp_bed_path, extra_options=()):
        """Intersect a SNP BED file with the flanked assembly; count rsIDs."""
        cmd = ['bedtools', 'intersect', '-a', snp_bed_path,
               '-b', assembly_flank, '-wa', '-wb']
        cmd.extend(extra_options)
        with open(intersect_file, 'w') as out_fh:
            retcode = subprocess.call(cmd, stdout=out_fh)
        if retcode != 0:
            logging.warning('bedtools intersect exited with status %d'
                            % retcode)
        with open(intersect_file) as in_fh:
            return len(set(row.strip().split('\t')[RSIDCOL]
                           for row in in_fh))

    logging.info('Parsing GTF file')
    with open(args.gtf) as gtf_fh:
        with open(gene_intervals_file, 'w') as out_fh:
            for locus_transcripts in parse_gtf(gtf_fh):
                for g in get_gene_intervals(locus_transcripts):
                    out_fh.write('\t'.join(map(str, [g.chrom, g.start,
                                                     g.end, g.gene_id]))
                                 + '\n')

    # read chrom file to make sure flanks added do not enter chrom ends;
    # lengths must be integers for the min() clamp below to be meaningful
    chrom_length = {}
    with open(args.chrom) as chrom_fh:
        for row in chrom_fh:
            fields = row.strip().split('\t')
            chrom_length[fields[0]] = int(fields[1])

    # apply flank to the bed file
    with open(args.assembly_bed) as in_fh:
        with open(assembly_flank, 'w') as out_fh:
            for row in in_fh:
                fields = row.strip().split('\t')
                chrom = fields[0]
                fields[1] = max(0, int(fields[1]) - args.flank)
                fields[2] = min(chrom_length[chrom],
                                int(fields[2]) + args.flank)
                out_fh.write('\t'.join(map(str, fields)) + '\n')

    # GWAS snps: intersections for real data
    logging.info('Intersecting assembly with GWAS snps')
    gwas_overlap = count_overlapping_snps(args.gwas)

    # snp universe: intersections for real data
    logging.info('Intersecting assembly with snp universe')
    snp_overlap = count_overlapping_snps(args.snps, ['-sorted'])

    if snp_overlap == 0:
        logging.error('No snps from the snp universe overlap the assembly; '
                      'cannot compute the overlap fraction')
        shutil.rmtree(prefix)
        return 1
    frac_real = float(gwas_overlap) / snp_overlap
    logging.info('%d GWAS snps overlap compendia genes' % gwas_overlap)
    logging.info('%d snps (from \"snp universe\") overlap compendia genes'
                 % snp_overlap)
    logging.info('Frac: %f' % frac_real)
    print('\t'.join(map(str, [args.flank, gwas_overlap, snp_overlap,
                              frac_real])))
    shutil.rmtree(prefix)
    return 0
def main():
    """Shuffle gene positions with bedtools and print repositioned transcripts.

    Positional arguments: gtf_file, excl_file, chrom_sizes, output_prefix.

    Steps:

    1. Write one BED line per gene interval (from ``get_gene_intervals``)
       to '<prefix>.gene_intervals.bed'.
    2. Run ``bedtools shuffle`` on that file, excluding ``excl_file``, and
       store the result in '<prefix>.gene_intervals.shuffle.bed'.
    3. Re-read the GTF file, shift every transcript and its exons by the
       offset between the original and shuffled gene start, and print one
       BED line per transcript (from ``write_bed``) to stdout. Genes that
       are missing from the shuffled file are skipped with a warning.
    4. Call ``sort_gtf`` on '<prefix>.shuffle.gtf'.

    NOTE(review): '<prefix>.shuffle.gtf' is opened for writing but nothing
    is written to it (the GTF output lines are commented out and BED goes
    to stdout instead), so ``sort_gtf`` receives an empty file. Confirm
    whether that is intended.
    """
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('gtf_file')
    parser.add_argument('excl_file')
    parser.add_argument('chrom_sizes')
    parser.add_argument("output_prefix")
    args = parser.parse_args()
    prefix = args.output_prefix
    excl_file = args.excl_file
    chrom_sizes_file = args.chrom_sizes
    gtf_file = args.gtf_file
    # check command line parameters
    if which('bedtools') is None:
        parser.error('bedtools binary not found in PATH')
    if not os.path.exists(chrom_sizes_file):
        parser.error('chrom sizes file %s not found' % (chrom_sizes_file))
    gene_intervals_file = prefix + '.gene_intervals.bed'
    gene_intervals_shuffled_file = prefix + '.gene_intervals.shuffle.bed'
    shuffled_gtf_file = prefix + '.shuffle.gtf'
    sorted_shuffled_gtf_file = prefix + '.shuffle.srt.gtf'

    logging.info('Parsing GTF file')
    with open(gtf_file) as gtf_fh:
        with open(gene_intervals_file, 'w') as out_fh:
            for locus_transcripts in parse_gtf(gtf_fh):
                # find borders of locus
                locus_chrom = locus_transcripts[0].chrom
                locus_start = min(t.start for t in locus_transcripts)
                locus_end = max(t.end for t in locus_transcripts)
                logging.debug("[LOCUS] %s:%d-%d %d transcripts" %
                              (locus_chrom, locus_start, locus_end,
                               len(locus_transcripts)))
                for g in get_gene_intervals(locus_transcripts):
                    out_fh.write('\t'.join(map(str, [g.chrom, g.start,
                                                     g.end, g.gene_id]))
                                 + '\n')

    # randomly shuffle genes
    logging.info("Shuffling genes")
    shuffle_cmd = ['bedtools', 'shuffle',
                   '-excl', excl_file,
                   '-i', gene_intervals_file,
                   '-g', chrom_sizes_file]
    with open(gene_intervals_shuffled_file, 'w') as fileh:
        retcode = subprocess.call(shuffle_cmd, stdout=fileh)
    if retcode != 0:
        # Processing continues; genes absent from the output are reported
        # individually below.
        logging.error('bedtools shuffle exited with status %d' % retcode)

    # read new gene positions
    logging.info("Reading shuffled gene intervals")
    shuffle_gene_map = {}
    with open(gene_intervals_shuffled_file) as fileh:
        for line in fileh:
            fields = line.strip().split('\t')
            gene_id = fields[3]
            shuffle_gene_map[gene_id] = (fields[0], int(fields[1]),
                                         int(fields[2]))

    # reposition transcripts
    logging.info("Repositioning transcripts")
    with open(gtf_file) as gtf_fh:
        # Opening in 'w' mode creates/truncates the file that sort_gtf
        # reads afterwards; see the NOTE(review) in the docstring.
        with open(shuffled_gtf_file, 'w'):
            for locus_transcripts in parse_gtf(gtf_fh):
                # get original positions
                orig_gene_map = {}
                for g in get_gene_intervals(locus_transcripts):
                    orig_gene_map[g.gene_id] = (g.chrom, g.start, g.end)
                for t in locus_transcripts:
                    gene_id = t.attrs['gene_id']
                    orig_chrom, orig_start, orig_end = orig_gene_map[gene_id]
                    if gene_id not in shuffle_gene_map:
                        logging.warning(
                            'Gene %s [%s:%d-%d] could not be shuffled' %
                            (gene_id, orig_chrom, orig_start, orig_end))
                        continue
                    new_chrom, new_start, new_end = shuffle_gene_map[gene_id]
                    # reposition transcript, keeping its offset from the
                    # gene start
                    t.chrom = new_chrom
                    t.start = new_start + (t.start - orig_start)
                    t.end = new_start + (t.end - orig_start)
                    for e in t.exons:
                        e.start = new_start + (e.start - orig_start)
                        e.end = new_start + (e.end - orig_start)
                    fields = write_bed(t.chrom, t.attrs['transcript_id'],
                                       t.strand, 1000, t.exons)
                    print('\t'.join(fields))
    logging.info("Sorting GTF file")
    sort_gtf(shuffled_gtf_file, sorted_shuffled_gtf_file)