def setup_command(args):
    """Run the ``setup`` sub-command selected by ``args.setupcmd``.

    * ``'add'``    -- opens ``GluttonParameters(args.project, create=True)``,
      calls ``add()`` with the sample details, then ``flush()``.
    * ``'remove'`` -- opens ``GluttonParameters(args.project, create=False)``,
      calls ``remove(args.sample)``, then ``flush()``.
    * ``'list'``   -- opens the project with ``create=False`` and delegates
      to ``GluttonParameters.list()``.

    After ``add`` and ``remove`` a one-line summary containing the sample
    name, the project and ``gp.count()`` is written to stderr.  Any other
    value of ``args.setupcmd`` does nothing.

    Args:
        args: parsed command-line namespace.  ``setupcmd`` is always read;
            ``project`` is read by every recognised sub-command; ``add``
            also reads ``contigs``, ``sample``, ``species``, ``bam``,
            ``assembler`` and ``copy``; ``remove`` also reads ``sample``.

    Returns:
        Always 0.
    """
    command = args.setupcmd

    if command == 'list':
        GluttonParameters(args.project, create=False).list()
        return 0

    if command == 'add':
        gp = GluttonParameters(args.project, create=True)
        gp.add(args.contigs, args.sample, args.species, args.bam,
               args.assembler, copy=args.copy)
        verb = 'added'
    elif command == 'remove':
        gp = GluttonParameters(args.project, create=False)
        gp.remove(args.sample)
        verb = 'removed'
    else:
        # unrecognised sub-command: nothing to do, still report success
        return 0

    # flush first so that count() is read after flush(), as it was
    # in the original statement order
    gp.flush()

    # stderr.write() rather than the Python-2-only "print >> stderr, ..."
    # statement: same output on Python 2, and it does not raise on Python 3.
    stderr.write("%s %s (%s contains %d samples)\n"
                 % (verb, args.sample, args.project, gp.count()))
    return 0
class Aligner(object) : def __init__(self, top_level_directory, reference_fname, min_length, min_hitidentity, min_hitlength, max_evalue, batch_size, min_alignidentity, min_alignoverlap) : self.directory = join(top_level_directory, 'alignments') self.min_length = min_length # glutton self.min_hitidentity = min_hitidentity # blast self.min_hitlength = min_hitlength # blast self.max_evalue = max_evalue # blast self.min_alignidentity = min_alignidentity # pagan self.min_alignoverlap = min_alignoverlap # pagan check_dir(self.directory, create=True) self.search = All_vs_all_search(batch_size) self.cleanup_files = [] self.q = None self.lock = threading.Lock() self.complete_jobs = 0 self.total_jobs = 0 self.log = get_log() self.param = GluttonParameters(top_level_directory) self.db = GluttonDB(reference_fname) self.param.set_reference(self.db) self.resume = self.param.able_to_resume() self.info = GluttonInformation(self.directory, self.param, self.db, resume=self.resume) self.param.set_full_checksum() def _read_contigs(self) : contigs = {} for label in self.param.get_sample_ids() : accepted = 0 rejected = { 'length' : 0, 'ambiguous' : 0 } fname = self.param.get_contigs(label) for r in SeqIO.parse(fname, 'fasta') : if len(r) < self.min_length : rejected['length'] += 1 continue #if 'N' in r : # rejected['ambiguous'] += 1 # continue qid = self.info.get_query_from_contig(label, r.description) contigs[qid] = biopy_to_gene(r, qid) accepted += 1 self.log.info("%s: read %d contigs (rejected %d due to length < %d)" % #and %d due to 'N's)" % (fname, accepted, rejected['length'], self.min_length)) #, rejected['ambiguous'])) return contigs def stop(self) : self.search.stop() self.info.update_query_gene_mapping(self.search.get_intermediate_results()) if self.q : self.q.stop() rm_f(self.cleanup_files) self.info.flush() self.param.flush() def _correct_strand(self, contig, strand) : if strand == '-' : contig.reverse_complement() return contig def align(self) : self.log.info("starting 
alignment procedure") # convert the names of the contigs to something no program can complain about # + filter out the ones that could never have a long enough alignment contigs = self._read_contigs() pending_contigs = [ contigs[i] for i in self.info.pending_queries() ] self.log.info("%d contigs have not been assigned to genes..." % len(pending_contigs)) # depending on when the program was terminated this step may be complete or partially # complete if pending_contigs : db_fname = self.db.extract_all() self.cleanup_files.append(db_fname) # do an all vs all search of contigs vs database of transcripts # return a dict of tmp ids with gene ids self.info.update_query_gene_mapping( self.search.process( db_fname, pending_contigs, self.db.nucleotide, self.min_hitidentity, self.min_hitlength, self.max_evalue) ) rm_f(db_fname) # save intermediate results self.info.flush() # use the database to convert the mapping from tmp id -> gene # to gene family -> list of (tmp id, strands) genefamily_contig_map = self.info.build_genefamily2contigs() self.log.info("%d contigs assigned to %d gene families" % (sum([ len(i) for i in genefamily_contig_map.values() ]), len(genefamily_contig_map))) self.log.info("(%d have already been run)" % self.info.len_genefamily2filename()) if self.info.len_genefamily2filename() == len(genefamily_contig_map) : self.log.info("alignment already done, exiting early...") return else : self.log.info("starting alignments...") # queue all the alignments up using a work queue and pagan self.q = WorkQueue() self.total_jobs = len(genefamily_contig_map) - self.info.len_genefamily2filename() self.complete_jobs = -1 self._progress() for famid in self.sort_keys_by_complexity(genefamily_contig_map) : # ignore the jobs that have already been run if self.info.in_genefamily2filename(famid) : continue try : # get the alignment and tree from the database alignment = self.db.get_alignment(famid) tree = alignment.get_tree() # get contigs job_contigs = [ 
self._correct_strand(contigs[contigid], strand) for contigid,strand in genefamily_contig_map[famid] ] # queue the job self.q.enqueue( PaganJob( self.job_callback, job_contigs, famid, alignment, tree, self.min_alignidentity, self.min_alignoverlap) ) # avoid the split code later in the loop... continue except GluttonDBError, gde : # this means we have never heard of this gene family self.log.warn(str(gde)) continue except GluttonDBFileError, gdfe : # this means we have heard of the gene family, but the # alignment files were missing... self.log.warn(str(gdfe)) # okay, the gene family was not aligned for some reason # instead we will split the gene family into constituent genes # and handle each one separately... self.log.warn("gene family was not aligned, breaking down into separate genes...") self.total_jobs += (len(genefamily_contig_map[famid]) - 1) # collect contigs by gene gene2contigs = collections.defaultdict(list) for contigid,strand in genefamily_contig_map[famid] : try : geneid = self.info.query_to_gene(contigid) except KeyError : # this should be impossible self.log.warn("no gene assignment for %s" % contigid) continue gene2contigs[geneid].append((contigid, strand)) # run each gene separately for geneid in gene2contigs : try : alignment = [ self.db.get_gene(geneid) ] except GluttonDBError, gde : self.log.warn(str(gde)) continue # queue the job self.q.enqueue( PaganJob( self.job_callback, [ self._correct_strand(contigs[contigid], strand) for contigid,strand in gene2contigs[geneid] ], geneid, alignment, None, self.min_alignidentity, self.min_alignoverlap) )