def setUp(self):
    """Prepare the NGS and annotation inputs used by the tests.

    Sets ``self.ann_dir`` and ``self.size`` to the fixture paths, wraps
    the per-sample sorted BAM files as ``SeqFile`` objects tagged 'NGS',
    wraps the result of ``bed2bam`` as a ``SeqFile`` tagged 'ANN', and
    finally calls ``load_seqinfo`` on the NGS files.
    """
    self.ann_dir = "tests/data/ann.bed12"
    self.size = "tests/data/chrom.sizes"

    # One sorted BAM per sample name, all located under tests/data.
    sample_bams = []
    for sample in ('Bract', 'Cotyledon'):
        sample_bams.append(
            os.path.join('tests/data', "%s.sort.bam" % sample))
    self.ngs = [SeqFile(bam_path, 'NGS') for bam_path in sample_bams]

    # bed2bam receives the annotation, the chrom sizes and an output
    # location; whatever it returns is wrapped as the annotation SeqFile.
    ann_bam = bed2bam(self.ann_dir, self.size, "/tmp/igia")
    self.ann = SeqFile(ann_bam, 'ANN')

    load_seqinfo(self.ngs)
def setUp(self):
    """Build the NGS/TGS inputs and the two intervals used by the tests.

    After this runs the instance holds:

    * ``self.ngs`` / ``self.tgs`` -- ``SeqFile`` lists tagged 'NGS' / 'TGS'
    * ``self.ival_rev`` / ``self.ival_fwd`` -- ``Interval`` objects on
      Chr01 ('-' and '+' strand) with coverage built from ``self.ngs``
    * ``self.rev_intron`` / ``self.fwd_intron`` -- the results of
      ``JunctionGraphNgs(...).identify_intron(self.ngs)`` per interval
    * ``self.genome`` -- a ``GenomeFile`` for tests/data/genome.fa
    """
    sample_bams = [
        os.path.join('tests/data', "%s.sort.bam" % sample)
        for sample in ('Bract', 'Cotyledon')
    ]
    self.ngs = [SeqFile(bam_path, 'NGS') for bam_path in sample_bams]
    load_seqinfo(self.ngs)
    self.tgs = [SeqFile('tests/data/all_fixed_star.sort.bam', 'TGS')]

    def introns_of(interval):
        # Coverage is built before the junction graph is created,
        # matching the order both strands need.
        interval.build_cov(self.ngs)
        return JunctionGraphNgs(interval).identify_intron(self.ngs)

    # Reverse-strand region.
    self.ival_rev = Interval("Chr01", 93000, 96500, "-")
    self.rev_intron = introns_of(self.ival_rev)

    # Forward-strand region.
    self.ival_fwd = Interval("Chr01", 485500, 493000, "+")
    self.fwd_intron = introns_of(self.ival_fwd)

    self.genome = GenomeFile(os.path.join("tests", "data", "genome.fa"))
def main(args):
    """Main entry point allowing external calls

    Parses the command line, loads the NGS/TGS/annotation inputs, builds
    the linkage over all inputs and then, for every linkage region,
    identifies elements and transcripts and writes them through
    ``OutputHandle``.

    When ``args.time_out`` is set, a SIGALRM timer is armed for each
    region. A region that raises ``TimeOutError`` is reported on stdout
    and appended to ``igia_debug_timeout.log`` in the output directory,
    and processing continues with the next region.

    Args:
      args ([str]): command line parameter list
    """
    args = parse_args(args)
    setup_logging(args.loglevel)
    check_paraclu(args)
    _logger.debug("Starting IGIA ...")

    ngs_obj_list = [SeqFile(x, "NGS") for x in args.ngs_file]
    tgs_obj_list = [SeqFile(x, "TGS") for x in args.tgs_file]
    ext_tss_list = load_txs(args.tss)
    ext_tes_list = load_txs(args.tes)
    out_dir = args.out_dir
    ann = load_ann(args.ann, args.size, out_dir, "ANN")

    # Update Global variables.
    GVAR.RULE = args.rule
    GVAR.TXS_DIFF = args.dtxs
    GVAR.SPLICED_INTRON_PIR_CUTOFF = args.pir

    f_genome = args.f_genome
    paraclu_path = args.paraclu_path
    load_seqinfo(ngs_obj_list)

    _logger.info("Start building linkage ...")
    bam_list = ngs_obj_list + tgs_obj_list
    if ann is not None:
        bam_list += [ann]
    linkage = find_linkage(bam_list)
    _logger.info("Finish building linkage")

    use_timeout = args.time_out is not None
    cluster_indx = 0
    with OutputHandle(out_dir) as f_out:
        for chrom, start, end in linkage.iterlinkage():
            try:
                if use_timeout:
                    signal.signal(signal.SIGALRM, time_out_handler)
                    signal.alarm(args.time_out)

                _logger.debug(
                    "Start identifying elements in {0}:{1}-{2}".format(
                        chrom, start, end))
                gene_cluster_list = identify_element(
                    chrom, start, end, ngs_obj_list, tgs_obj_list,
                    ext_tss_list, ext_tes_list, ann, f_genome, paraclu_path)
                _logger.debug(
                    "Finish identifying elements in {0}:{1}-{2}".format(
                        chrom, start, end))

                for gene_cluster in gene_cluster_list:
                    # list of gene cluster without any common exon
                    if not gene_cluster.has_element():
                        continue
                    cluster_indx += 1
                    cluster_name = "c_{0}".format(cluster_indx)
                    gene_cluster.write_element2bed6(
                        *f_out.element_handles(), cluster_name)

                    _logger.debug(
                        "Start identifying transcript for {0}".format(
                            gene_cluster))
                    trans = identify_transcript(gene_cluster, ann)
                    trans.write2bed12(cluster_name, *f_out.isoform_handles())
                    _logger.debug(
                        "Finish identifying transcript for {0}".format(
                            gene_cluster))

                # Disarm inside the try so that an alarm firing right at
                # the end of the region is still handled as a timeout.
                if use_timeout:
                    signal.alarm(0)

            except TimeOutError:
                print("TimeOut: {0}\t{1}\t{2}\n".format(chrom, start, end))
                with open(os.path.join(args.out_dir,
                                       "igia_debug_timeout.log"), "a") as f:
                    f.write("TimeOut ({0}s): {1}\t{2}\t{3}\n".format(
                        args.time_out, chrom, start, end))
            finally:
                # Safety net: if any other exception escapes the region,
                # the timer must not stay armed and fire later in
                # unrelated code. A no-op when already disarmed or fired.
                if use_timeout:
                    signal.alarm(0)

    _logger.info("End")
def main(args):
    """Main entry point allowing external calls

    MPI variant. Rank 0 parses the command line and broadcasts the
    parsed arguments (including a per-run temporary directory under the
    output directory) to every rank. Rank 0 then acts as the master for
    two load-balanced phases -- scanning linkage and identifying
    transcripts -- while the remaining ranks act as workers. Finally
    rank 0 concatenates the per-node result files found in
    ``<tmp_dir>/node_<rank>`` into the output directory and removes those
    per-node directories.

    Args:
      args ([str]): command line parameter list

    Raises:
      ValueError: on rank 0, if an annotation file is given without a
        chrom size file.
    """
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    rank_size = comm.Get_size()

    if rank == 0:
        args = parse_args(args)
        check_paraclu(args)
        out_dir = args.out_dir
        # Per-run sub-directory name derived from the clock and a random
        # number.
        rand_seq = str(hash(time.time() * 100000 + random.random()))
        tmp_dir = os.path.join(out_dir, rand_seq)
        args.tmp_dir = tmp_dir
        setup_logging(args.loglevel)
        _logger.info("Starting IGIA in MPI mode ...")
    else:
        args = None

    # Every rank continues with the arguments parsed on rank 0.
    args = comm.bcast(args, root=0)
    out_dir = args.out_dir
    tmp_dir = args.tmp_dir

    ngs_obj_list = [SeqFile(x, "NGS") for x in args.ngs_file]
    tgs_obj_list = [SeqFile(x, "TGS") for x in args.tgs_file]
    ext_tss_list = load_txs(args.tss)
    ext_tes_list = load_txs(args.tes)
    f_ann = args.ann
    size = args.size
    f_genome = args.f_genome
    paraclu_path = args.paraclu_path

    ann = None
    if f_ann:
        # Only rank 0 runs bed2bam; its result is shared with all ranks.
        if rank == 0:
            _logger.info("Loading annotation transcripts ...")
            if not size:
                raise ValueError("Error: missing Chrom Size file")
            ann_bam = bed2bam(f_ann, size, tmp_dir)
        else:
            ann_bam = None
        ann_bam = comm.bcast(ann_bam, root=0)
        ann = SeqFile(ann_bam, "ANN")

    # Update Global variables
    GVAR.RULE = args.rule
    GVAR.TXS_DIFF = args.dtxs
    GVAR.SPLICED_INTRON_PIR_CUTOFF = args.pir

    load_seqinfo(ngs_obj_list)

    _logger.info("Start building linkage ... ")
    bam_list = ngs_obj_list + tgs_obj_list
    if ann is not None:
        bam_list += [ann]

    # Scatter linkage scan tasks
    if rank == 0:
        chrom_size_list = bam_list[0].chromsize()
        # One task per (chromosome size entry, input file index) pair.
        scan_linkage_infos = [(chrom_size, seq_obj_indx)
                              for chrom_size in chrom_size_list
                              for seq_obj_indx in range(len(bam_list))]
        master = LBMaster(comm, GVAR.MAX_QUEUE_LEN, master_tag=0,
                          worker_tag=1, sleep=GVAR.SLEEP_TIME)
        linkages = master.do(scan_linkage_infos)
        linkage = Linkage()
        for sub_linkage in linkages:
            linkage.add_linkage(sub_linkage)
    else:
        worker = LBScanLinkageWorker(comm, master_tag=0, worker_tag=1)
        worker.do(bam_list)
        linkage = None
    _logger.info("Finish building linkage ... ")

    # Broadcast a token from rank 0 so every rank passes through the same
    # collective call before the next phase starts.
    if rank == 0:
        data = "do"
    else:
        data = None
    data = comm.bcast(data, root=0)
    assert data == "do"

    # Identify transcripts
    if rank == 0:
        _logger.info("Start identifying transcripts")
        linkage_region_list = list(linkage.iterlinkage())
        master = LBMaster(comm, GVAR.MAX_QUEUE_LEN, master_tag=2,
                          worker_tag=3, sleep=GVAR.SLEEP_TIME)
        master.do(linkage_region_list)
    else:
        worker = LBIdentifyIsoWorker(comm, master_tag=2, worker_tag=3)
        worker.do(tmp_dir, ngs_obj_list, tgs_obj_list, ext_tss_list,
                  ext_tes_list, ann, f_genome, paraclu_path)

    # Merge results
    if rank == 0:
        _logger.info("Finish identifying transcripts")
        outfiles = [
            "intron.bed6", "internal_exon.bed6", "tss_exon.bed6",
            "tes_exon.bed6", "isoF.bed12", "isoA.bed12", "isoR.bed12",
            "isoM.bed12", "isoC.bed12", "isoP.bed12"
        ]
        # Result directories are looked up for ranks 1..rank_size-1.
        res_dirs = [
            os.path.join(tmp_dir, "node_{0}".format(x))
            for x in range(1, rank_size)
        ]
        for name in outfiles:
            part_files = [os.path.join(res_dir, name) for res_dir in res_dirs]
            # Arguments are passed as a list and the redirect is done via
            # a file handle, so paths containing spaces or shell
            # metacharacters are handled literally (no shell involved).
            with open(os.path.join(out_dir, name), "wb") as merged:
                subprocess.call(["cat"] + part_files, stdout=merged)
        for res_dir in res_dirs:
            subprocess.call(["rm", "-r", res_dir])

    _logger.info("End")