def _main(data, output_folder, num_threads, overwrite=False):
    """Drive the whole CNS identification pipeline over every aligned chromosome.

    The pipeline state lives in ``data``; it is checkpointed through a
    ``JSON_saver`` bound to the "record" json created under ``output_folder``.

    Args:
        data: pipeline state dict. ``data['ref_aligned_chroms']`` maps each
            chromosome name to a dict holding at least 'chrom_seq_maf' and
            'chrom_conservation_wig'.
        output_folder: output location, stored as ``data['out']``. Sub-folder
            names are appended by plain string concatenation, so it
            presumably ends with a path separator -- TODO confirm against
            ``create_path``.
        num_threads: forwarded unchanged to ``chrom_cns_identify``.
        overwrite: forwarded to every ``create_path`` call and sub-step.

    Returns:
        The state dict returned by ``combine_cns``.
    """
    data['out'] = output_folder
    record_path = create_path(data['out'], "record", "json", overwrite=overwrite)
    datasaver = JSON_saver(record_path)
    datasaver.save(data)

    chrom_total = len(data["ref_aligned_chroms"])
    header_print(
        "Running full CNS identification pipeline on %s alignment files" % chrom_total,
        h_type=1)

    data['genome_beds'] = create_path(data['out'] + "genome_beds", overwrite=overwrite)
    data = create_genome_beds(data, data['genome_beds'], overwrite=overwrite)

    for chromosome in sorted(data['ref_aligned_chroms'].keys()):
        header_print("Identify CNS on %s" % chromosome, h_type=2)

        # Per-chromosome view of the state: every top-level entry except the
        # chromosome table itself (values are shared, not deep-copied).
        chromDat = dict(
            (key, value) for key, value in data.items()
            if key != "ref_aligned_chroms")
        chrom_inputs = data['ref_aligned_chroms'][chromosome]
        chromDat['chrom_seq_maf'] = chrom_inputs['chrom_seq_maf']
        chromDat['chrom_conservation_wig'] = chrom_inputs['chrom_conservation_wig']
        chromDat['out'] = create_path(
            data['out'] + "chrom/" + chromosome, overwrite=overwrite)

        chromDat = chrom_cns_identify(
            chromDat, chromDat['out'], num_threads,
            overwrite=overwrite, chrom_name=chromosome)

        # Store the chromosome's outcome, leaving out the "ref_"-prefixed keys.
        chrom_record = {}
        for key in chromDat:
            if not key.startswith("ref_"):
                chrom_record[key] = chromDat[key]
        data['ref_aligned_chroms'][chromosome] = chrom_record
        datasaver.save(data)

    data = combine_cns(data, data['out'], overwrite=overwrite)
    datasaver.save(data)
    return data
def _main(data, output_folder, overwrite=False):
    """Merge the per-chromosome .cns result files into one combined .cns file.

    Every entry read from a chromosome's result file gets its ``cns_ID``
    rewritten to "<chrom>:<original id>", and the new ID is copied onto each
    sequence object held in ``entry.sequences``.

    Args:
        data: pipeline state dict. ``data['ref_aligned_chroms'][chrom]['results']``
            must name the .cns file produced for that chromosome.
        output_folder: folder in which the record json and the combined file
            are created (via ``create_path``).
        overwrite: forwarded to ``create_path``.

    Returns:
        ``data`` with ``data['combined_cns']`` set to the combined file path.
    """
    datasaver = JSON_saver(create_path(output_folder, "record", "json", overwrite=overwrite))
    datasaver.save(data)

    chrom_table = data['ref_aligned_chroms']
    header_print("Combining %s CNS files" % len(chrom_table))
    data["combined_cns"] = create_path(
        output_folder, "combined_identified", "cns", overwrite=overwrite)

    # Progress output. The single-argument call form prints the same text on
    # Python 2 and is also valid syntax on Python 3.
    print(len(chrom_table))
    cns = Cns()
    for i, chrom in enumerate(chrom_table, 1):
        print(i)
        chrom_cns = Cns(file_name=chrom_table[chrom]['results'])
        for entry in chrom_cns.entries:
            # Prefix with the chromosome so IDs from different files stay distinct.
            entry.cns_ID = "%s:%s" % (chrom, str(entry.cns_ID))
            for genome in entry.sequences:
                for seq in entry.sequences[genome]:
                    seq.cns_ID = entry.cns_ID
            cns.entries.append(entry)

    cns.save_file(data["combined_cns"])
    datasaver.save(data)
    return data
def _main(data, output_folder, overwrite=False):
    """Write .bed files derived from the GFF3 annotation of every genome.

    Two outputs are produced and recorded in ``data``:
      * ``data['ref_coding_bed']`` -- 'CDS' features of the reference genome,
        converted with ``gff3_to_bed`` using "<ref_genome>:" as the sequence
        prefix;
      * ``data['genomes'][genome]['annot_bed']`` -- 'gene' features of each
        genome, stored under ``data['genome_annot_beds_folder']``.

    Args:
        data: pipeline state dict with 'ref_genome' and 'genomes' (each
            genome entry needs 'annot_gff3').
        output_folder: output location; the sub-folder name is appended by
            string concatenation.
        overwrite: forwarded to every ``create_path`` call.

    Returns:
        The updated ``data`` dict.
    """
    record_path = create_path(output_folder, "record", "json", overwrite=overwrite)
    datasaver = JSON_saver(record_path)
    datasaver.save(data)

    # Coding regions of the reference genome.
    header_print("Convert coding regions to .bed:")
    data['ref_coding_bed'] = create_path(
        output_folder, "ref_coding", "bed", overwrite=overwrite)
    genomes = data['genomes']
    ref_name = data['ref_genome']
    gff3_to_bed(gff3_file=genomes[ref_name]['annot_gff3'],
                bed_out=data['ref_coding_bed'],
                type_list=['CDS'],
                sequence_prefix=ref_name + ":")
    datasaver.save(data)

    # Gene regions of every genome.
    header_print("Convert per-genome gene regions to .bed:")
    data['genome_annot_beds_folder'] = create_path(
        output_folder + "genome_annot_beds", overwrite=overwrite)
    for genome in data['genomes']:
        genome_record = data['genomes'][genome]
        genome_record['annot_bed'] = create_path(
            data['genome_annot_beds_folder'], genome + "_annot", "bed",
            overwrite=overwrite)
        annotation = Gff3(file_name=genome_record['annot_gff3'])
        gene_bed = annotation.to_bed(type_list=['gene'], genome=genome)
        gene_bed.save_file(genome_record['annot_bed'])
    datasaver.save(data)
    return data
def run(config_file, output_folder, num_threads, overwrite=False):
    """Load a JSON configuration file and start the pipeline with it.

    Args:
        config_file: path of the JSON file parsed into the pipeline state.
        output_folder: output location; passed through ``create_path`` before
            being handed to ``_main``.
        num_threads: forwarded unchanged to ``_main``.
        overwrite: forwarded to ``create_path`` and ``_main``.

    Returns:
        None; the value returned by ``_main`` is discarded.
    """
    # The configuration is read first, so an unreadable file fails before
    # any output path is created.
    with open(config_file) as config_handle:
        config = json.load(config_handle)
    prepared_folder = create_path(output_folder, overwrite=overwrite)
    _main(config, prepared_folder, num_threads, overwrite=overwrite)
def _main(data,output_folder,num_threads,overwrite=False,chrom_name=None):
    """Identify conserved non-coding sequences (CNS) for one chromosome alignment.

    Every step stores the path of its output in ``data`` and checkpoints
    ``data`` through a ``JSON_saver`` bound to the "record" json created under
    ``output_folder``.

    Args:
        data: pipeline state dict. Read here: 'chrom_seq_maf', 'ref_genome',
            'ref_coding_bed', 'chrom_conservation_wig' and 'genomes' (each
            genome entry needs 'annot_bed').
        output_folder: output location. Sub-folder names are appended by plain
            string concatenation, so it presumably ends with a path
            separator -- TODO confirm against ``create_path``.
        num_threads: accepted for interface compatibility; not used here.
        overwrite: forwarded to every ``create_path`` call.
        chrom_name: chromosome label appended to the per-genome CNS .bed file
            names. When None, the names carry no chromosome suffix.

    Returns:
        ``data`` extended with 'ref_seq_bed', 'aligned_noncoding_bed',
        'best_conserved_bed', 'cns_bed', 'cns_maf', 'genome_cns_beds_folder',
        'gene_proximity_beds_folder', 'results', plus 'cns_bed' and
        'gene_proximity_bed' in every genome entry.

    Raises:
        subprocess.CalledProcessError: a bedtools command exited with a
            non-zero status.
    """
    datasaver = JSON_saver(create_path(output_folder,"record","json",overwrite=overwrite))
    datasaver.save(data)

    # Aligned blocks of the reference genome -> .bed
    header_print("Convert aligned sequences to .bed:")
    data['ref_seq_bed'] = create_path(output_folder,"ref_seq","bed",overwrite=overwrite)
    maf_to_bed(maf_file = data['chrom_seq_maf'],
               bed_out = data['ref_seq_bed'],
               ref_genome = data['ref_genome'],
               index_tag = "chrom_maf_index")
    datasaver.save(data)

    # NOTE: a former "bedtools intersect" step of data['ref_seq_bed'] against
    # data['chrom_conserved_bed'] (output data['conserved_bed']) is disabled.

    # Remove coding regions from the aligned regions.
    header_print("Subtract coding regions from aligned regions:")
    data['aligned_noncoding_bed'] = create_path(output_folder,"aligned_noncoding_bed","bed",overwrite=overwrite)
    cmd = "bedtools subtract -a %s -b %s > %s" % (data['ref_seq_bed'],data['ref_coding_bed'],data['aligned_noncoding_bed'])
    _run_shell(cmd, "Running bedtools subtract")
    datasaver.save(data)

    # Conservation wiggle track -> .bed
    header_print("Converting especially conserved regions in wiggle file to bed")
    data['best_conserved_bed'] = create_path(output_folder,"best_conserved","bed",overwrite=overwrite)
    wiggle_to_bed(wig_file=data['chrom_conservation_wig'],
                  out_file=data['best_conserved_bed'],
                  genome_name=data['ref_genome'])
    datasaver.save(data)

    # Keep only the non-coding aligned regions that overlap the conserved ones.
    header_print("Intersecting wiggle bed with the potential cns bed")
    data['cns_bed'] = create_path(output_folder,"cns","bed",overwrite=overwrite)
    cmd = "bedtools intersect -a %s -b %s > %s" % (data['aligned_noncoding_bed'],data['best_conserved_bed'],data['cns_bed'])
    _run_shell(cmd, "Running bedtools intersect")
    datasaver.save(data)

    # Cut the alignment down to the CNS regions.
    header_print("Slice multi-alignment file based on identified conserved non-coding regions:")
    data['cns_maf'] = create_path(output_folder,"cns","maf",overwrite=overwrite)
    slice_maf_by_bed(maf_file = data['chrom_seq_maf'],
                     bed_file = data['cns_bed'],
                     index_tag = "chrom_maf_index",
                     ref_genome = data['ref_genome'],
                     out_file = data['cns_maf'],
                     max_N_ratio = 0.5,
                     max_gap_ratio = 0.5,
                     min_len = 15)
    datasaver.save(data)

    # One CNS .bed per genome.
    header_print("Convert per-genome CNS regions to .bed:")
    data['genome_cns_beds_folder'] = create_path(output_folder+"genome_cns_beds",overwrite=overwrite)
    # chrom_name defaults to None; concatenating None to a str raised
    # TypeError, so the suffix is only added when a name was supplied.
    cns_suffix = "_cns" if chrom_name is None else "_cns_"+chrom_name
    cns_maf = Maf(file_name=data['cns_maf'])
    for genome in data['genomes']:
        data['genomes'][genome]['cns_bed'] = create_path(data['genome_cns_beds_folder'],genome+cns_suffix,"bed",overwrite=overwrite)
        bed = cns_maf.to_bed(genome_name=genome,index_tag="cns_maf_index")
        bed.save_file(data['genomes'][genome]['cns_bed'])
    del cns_maf
    datasaver.save(data)

    # Closest annotated gene for every CNS region, per genome.
    header_print("Find closest gene for each CNS region:")
    data['gene_proximity_beds_folder'] = create_path(output_folder+"gene_proximity_beds",overwrite=overwrite)
    for genome in data['genomes']:
        data['genomes'][genome]['gene_proximity_bed'] = \
            create_path(data['gene_proximity_beds_folder'],genome+"_proxim","bed",overwrite=overwrite)
        cmd = "bedtools closest -D a -a %s -b %s > %s" % \
            (data['genomes'][genome]['cns_bed'],
             data['genomes'][genome]['annot_bed'],
             data['genomes'][genome]['gene_proximity_bed'])
        _run_shell(cmd)
    datasaver.save(data)

    # Combine the proximity .bed files and the CNS alignment into a .cns file.
    header_print("Process proximity and maf files into .cns file:")
    data['results'] = create_path(output_folder,"identified_CNSs","cns",overwrite=overwrite)
    cns_proxim_beds = {genome:Bed13(data['genomes'][genome]['gene_proximity_bed']) for genome in data['genomes']}
    Maf(file_name=data['cns_maf'])\
        .cns_from_proxim_beds(cns_proxim_beds,"cns_maf_index")\
        .save_file(data['results'])
    datasaver.save(data)
    return data


def _run_shell(cmd, progress_label=None):
    """Run ``cmd`` through the shell and raise if it exits with a non-zero status.

    When ``progress_label`` is given, a ``Progress_tracker`` is displayed while
    the command runs and marked done once the process has ended.
    """
    tracker = None
    if progress_label is not None:
        tracker = Progress_tracker(progress_label,1).estimate(False).display()
    process = subprocess.Popen(cmd, shell=True)
    exit_status = process.wait()
    if tracker is not None:
        tracker.done()
    # The exit status used to be dropped, letting a failed command feed
    # empty files to the following steps.
    if exit_status != 0:
        raise subprocess.CalledProcessError(exit_status, cmd)
def _main(data, output_folder, num_threads, overwrite=False, chrom_name=None):
    """Run the CNS identification steps on a single chromosome alignment.

    The function chains eight steps; after each one the output path is stored
    in ``data`` and ``data`` is saved through a ``JSON_saver`` writing to the
    "record" json created under ``output_folder``.

    Args:
        data: pipeline state dict. Keys read: 'chrom_seq_maf', 'ref_genome',
            'ref_coding_bed', 'chrom_conservation_wig', 'genomes' (every
            genome entry must provide 'annot_bed').
        output_folder: output location; sub-folder names are concatenated to
            it directly (presumably it ends with a path separator -- verify
            against ``create_path``).
        num_threads: unused by this function; kept so callers need not change.
        overwrite: passed to every ``create_path`` call.
        chrom_name: optional chromosome label for the per-genome CNS .bed
            file names; omitted from the names when None.

    Returns:
        The same ``data`` dict, now also holding 'ref_seq_bed',
        'aligned_noncoding_bed', 'best_conserved_bed', 'cns_bed', 'cns_maf',
        'genome_cns_beds_folder', 'gene_proximity_beds_folder' and 'results',
        with 'cns_bed' and 'gene_proximity_bed' added to each genome entry.

    Raises:
        subprocess.CalledProcessError: when a bedtools command returns a
            non-zero exit status.
    """

    def run_checked(command, label=None):
        # Execute through the shell (the commands rely on ">" redirection)
        # and fail loudly: the exit status was previously ignored, so a
        # broken bedtools call silently produced empty intermediate files.
        tracker = None
        if label is not None:
            tracker = Progress_tracker(label, 1).estimate(False).display()
        status = subprocess.Popen(command, shell=True).wait()
        if tracker is not None:
            tracker.done()
        if status != 0:
            raise subprocess.CalledProcessError(status, command)

    datasaver = JSON_saver(
        create_path(output_folder, "record", "json", overwrite=overwrite))
    datasaver.save(data)

    # 1. maf_to_bed: reference-genome alignment blocks as .bed
    info = "Convert aligned sequences to .bed:"
    header_print(info)
    data['ref_seq_bed'] = create_path(
        output_folder, "ref_seq", "bed", overwrite=overwrite)
    maf_to_bed(maf_file=data['chrom_seq_maf'],
               bed_out=data['ref_seq_bed'],
               ref_genome=data['ref_genome'],
               index_tag="chrom_maf_index")
    datasaver.save(data)

    # (Disabled step: "bedtools intersect" of data['ref_seq_bed'] with
    # data['chrom_conserved_bed'] into data['conserved_bed'].)

    # 2. bedtools subtract: drop coding regions
    info = "Subtract coding regions from aligned regions:"
    header_print(info)
    data['aligned_noncoding_bed'] = create_path(
        output_folder, "aligned_noncoding_bed", "bed", overwrite=overwrite)
    run_checked(
        "bedtools subtract -a %s -b %s > %s" % (
            data['ref_seq_bed'],
            data['ref_coding_bed'],
            data['aligned_noncoding_bed']),
        label="Running bedtools subtract")
    datasaver.save(data)

    # 3. wiggle_to_bed: conservation track as .bed
    info = "Converting especially conserved regions in wiggle file to bed"
    header_print(info)
    data['best_conserved_bed'] = create_path(
        output_folder, "best_conserved", "bed", overwrite=overwrite)
    wiggle_to_bed(wig_file=data['chrom_conservation_wig'],
                  out_file=data['best_conserved_bed'],
                  genome_name=data['ref_genome'])
    datasaver.save(data)

    # 4. bedtools intersect: non-coding regions overlapping conserved ones
    info = "Intersecting wiggle bed with the potential cns bed"
    header_print(info)
    data['cns_bed'] = create_path(
        output_folder, "cns", "bed", overwrite=overwrite)
    run_checked(
        "bedtools intersect -a %s -b %s > %s" % (
            data['aligned_noncoding_bed'],
            data['best_conserved_bed'],
            data['cns_bed']),
        label="Running bedtools intersect")
    datasaver.save(data)

    # 5. slice_maf_by_bed: restrict the alignment to the CNS regions
    info = "Slice multi-alignment file based on identified conserved non-coding regions:"
    header_print(info)
    data['cns_maf'] = create_path(
        output_folder, "cns", "maf", overwrite=overwrite)
    slice_maf_by_bed(maf_file=data['chrom_seq_maf'],
                     bed_file=data['cns_bed'],
                     index_tag="chrom_maf_index",
                     ref_genome=data['ref_genome'],
                     out_file=data['cns_maf'],
                     max_N_ratio=0.5,
                     max_gap_ratio=0.5,
                     min_len=15)
    datasaver.save(data)

    # 6. per-genome CNS .bed files
    info = "Convert per-genome CNS regions to .bed:"
    header_print(info)
    data['genome_cns_beds_folder'] = create_path(
        output_folder + "genome_cns_beds", overwrite=overwrite)
    # With the default chrom_name=None the old "..._cns_" + chrom_name
    # concatenation raised TypeError; only append a label that was given.
    if chrom_name is None:
        name_tail = "_cns"
    else:
        name_tail = "_cns_" + chrom_name
    cns_maf = Maf(file_name=data['cns_maf'])
    for genome in data['genomes']:
        genome_entry = data['genomes'][genome]
        genome_entry['cns_bed'] = create_path(
            data['genome_cns_beds_folder'], genome + name_tail, "bed",
            overwrite=overwrite)
        genome_bed = cns_maf.to_bed(genome_name=genome,
                                    index_tag="cns_maf_index")
        genome_bed.save_file(genome_entry['cns_bed'])
    del cns_maf
    datasaver.save(data)

    # 7. bedtools closest: nearest annotated gene per CNS region
    info = "Find closest gene for each CNS region:"
    header_print(info)
    data['gene_proximity_beds_folder'] = create_path(
        output_folder + "gene_proximity_beds", overwrite=overwrite)
    for genome in data['genomes']:
        genome_entry = data['genomes'][genome]
        genome_entry['gene_proximity_bed'] = create_path(
            data['gene_proximity_beds_folder'], genome + "_proxim", "bed",
            overwrite=overwrite)
        run_checked(
            "bedtools closest -D a -a %s -b %s > %s" % (
                genome_entry['cns_bed'],
                genome_entry['annot_bed'],
                genome_entry['gene_proximity_bed']))
    datasaver.save(data)

    # 8. proximity .bed files + CNS alignment -> .cns
    info = "Process proximity and maf files into .cns file:"
    header_print(info)
    data['results'] = create_path(
        output_folder, "identified_CNSs", "cns", overwrite=overwrite)
    cns_proxim_beds = {}
    for genome in data['genomes']:
        cns_proxim_beds[genome] = Bed13(
            data['genomes'][genome]['gene_proximity_bed'])
    cns_alignment = Maf(file_name=data['cns_maf'])
    identified = cns_alignment.cns_from_proxim_beds(
        cns_proxim_beds, "cns_maf_index")
    identified.save_file(data['results'])
    datasaver.save(data)
    return data