Example #1
0
    def __init__(self, top_level_directory, reference_fname, min_length, min_hitidentity, min_hitlength, max_evalue, batch_size, min_alignidentity, min_alignoverlap) :
        """
        Prepare an alignment run rooted at top_level_directory.

        The threshold arguments are stored unchanged on the instance.
        Output goes to an 'alignments' subdirectory of top_level_directory,
        which is passed to check_dir with create=True.  The parameter store,
        reference database and bookkeeping object are constructed here, and
        whether a previous run can be resumed is recorded in self.resume.
        """
        # thresholds, tagged with the stage each one is labelled for
        self.min_length = min_length                  # glutton
        self.min_hitidentity = min_hitidentity        # blast
        self.min_hitlength = min_hitlength            # blast
        self.max_evalue = max_evalue                  # blast
        self.min_alignidentity = min_alignidentity    # pagan
        self.min_alignoverlap = min_alignoverlap      # pagan

        # everything this object writes lives below <top level>/alignments
        self.directory = alignment_dir = join(top_level_directory, 'alignments')
        check_dir(alignment_dir, create=True)

        # search engine, temporary files to delete later, work queue (none yet)
        self.search = All_vs_all_search(batch_size)
        self.cleanup_files = []
        self.q = None

        # job counters and the lock stored alongside them
        self.lock = threading.Lock()
        self.total_jobs = 0
        self.complete_jobs = 0

        self.log = get_log()

        # the reference database has to be registered with the parameters
        # before asking whether a resume is possible
        self.param = parameters = GluttonParameters(top_level_directory)
        self.db = reference_db = GluttonDB(reference_fname)
        parameters.set_reference(reference_db)

        self.resume = can_resume = parameters.able_to_resume()

        self.info = GluttonInformation(alignment_dir, parameters, reference_db, resume=can_resume)
        parameters.set_full_checksum()
Example #2
0
class Aligner(object) :
    def __init__(self, top_level_directory, reference_fname, min_length, min_hitidentity, min_hitlength, max_evalue, batch_size, min_alignidentity, min_alignoverlap) :
        """
        Prepare an alignment run rooted at top_level_directory.

        The threshold arguments are stored unchanged on the instance.
        Output goes to an 'alignments' subdirectory of top_level_directory,
        which is passed to check_dir with create=True.  The parameter store,
        reference database and bookkeeping object are constructed here, and
        whether a previous run can be resumed is recorded in self.resume.
        """
        # thresholds, tagged with the stage each one is labelled for
        self.min_length = min_length                  # glutton
        self.min_hitidentity = min_hitidentity        # blast
        self.min_hitlength = min_hitlength            # blast
        self.max_evalue = max_evalue                  # blast
        self.min_alignidentity = min_alignidentity    # pagan
        self.min_alignoverlap = min_alignoverlap      # pagan

        # everything this object writes lives below <top level>/alignments
        self.directory = alignment_dir = join(top_level_directory, 'alignments')
        check_dir(alignment_dir, create=True)

        # search engine, temporary files to delete later, work queue (none yet)
        self.search = All_vs_all_search(batch_size)
        self.cleanup_files = []
        self.q = None

        # job counters and the lock stored alongside them
        self.lock = threading.Lock()
        self.total_jobs = 0
        self.complete_jobs = 0

        self.log = get_log()

        # the reference database has to be registered with the parameters
        # before asking whether a resume is possible
        self.param = parameters = GluttonParameters(top_level_directory)
        self.db = reference_db = GluttonDB(reference_fname)
        parameters.set_reference(reference_db)

        self.resume = can_resume = parameters.able_to_resume()

        self.info = GluttonInformation(alignment_dir, parameters, reference_db, resume=can_resume)
        parameters.set_full_checksum()

    def _read_contigs(self) :
        """
        Load the contigs of every sample into one dict.

        For each sample id known to self.param, the sample's FASTA file is
        parsed and every record at least self.min_length long is converted
        with biopy_to_gene and stored under the query id that self.info
        returns for (sample id, record description).  Shorter records are
        only counted.  One summary line is logged per file.

        Returns a dict mapping query id -> converted contig.
        """
        kept = {}

        for sample in self.param.get_sample_ids() :
            contig_fname = self.param.get_contigs(sample)
            n_kept = 0
            n_short = 0

            for record in SeqIO.parse(contig_fname, 'fasta') :
                if len(record) < self.min_length :
                    n_short += 1
                else :
                    # NOTE: records containing 'N' are not filtered out here
                    query_id = self.info.get_query_from_contig(sample, record.description)
                    kept[query_id] = biopy_to_gene(record, query_id)
                    n_kept += 1

            summary = "%s: read %d contigs (rejected %d due to length < %d)" % (
                contig_fname, n_kept, n_short, self.min_length)
            self.log.info(summary)

        return kept

    def stop(self) :
        """
        Shut the run down and persist what has been computed so far.

        Stops the search, hands its intermediate results to self.info,
        stops the work queue if one is set, removes the files listed in
        self.cleanup_files and finally flushes self.info and self.param.
        """
        searcher = self.search
        searcher.stop()

        # keep whatever hits the search produced before it was stopped
        partial_hits = searcher.get_intermediate_results()
        self.info.update_query_gene_mapping(partial_hits)

        # self.q stays None until a work queue has been created
        work_queue = self.q
        if work_queue :
            work_queue.stop()

        rm_f(self.cleanup_files)

        for store in (self.info, self.param) :
            store.flush()

    def _correct_strand(self, contig, strand) :
        if strand == '-' :
            contig.reverse_complement()

        return contig

    def align(self) :
        """
        Assign contigs to reference genes and queue the alignment jobs.

        Steps visible in this method, in order:
          1. read and length-filter the contigs (_read_contigs)
          2. search the contigs that self.info still lists as pending
             against the extracted reference database and record the
             resulting query -> gene mapping
          3. flush self.info, then group the assigned contigs by gene family
          4. enqueue one PaganJob per gene family that has not been run yet;
             if the family's alignment cannot be loaded because of a
             GluttonDBFileError, the family is split up and one job is
             queued per gene instead

        Returns None.  Returns early, before a work queue is created, when
        self.info reports as many finished gene families as there are
        gene families to align.
        """
        self.log.info("starting alignment procedure")

        # convert the names of the contigs to something no program can complain about
        # + filter out the ones that could never have a long enough alignment
        contigs = self._read_contigs()

        pending_contigs = [ contigs[i] for i in self.info.pending_queries() ]

        self.log.info("%d contigs have not been assigned to genes..." % len(pending_contigs))

        # depending on when the program was terminated this step may be complete or partially
        # complete
        if pending_contigs :
            db_fname = self.db.extract_all()
            # registered for cleanup so that stop() can remove the file as well
            self.cleanup_files.append(db_fname)

            # do an all vs all search of contigs vs database of transcripts
            # return a dict of tmp ids with gene ids
            self.info.update_query_gene_mapping(
                self.search.process(
                    db_fname,
                    pending_contigs,
                    self.db.nucleotide,
                    self.min_hitidentity,
                    self.min_hitlength,
                    self.max_evalue)
                )

            rm_f(db_fname)

        # save intermediate results
        self.info.flush()

        # use the database to convert the mapping from tmp id -> gene
        # to gene family -> list of (tmp id, strands)
        genefamily_contig_map = self.info.build_genefamily2contigs()

        self.log.info("%d contigs assigned to %d gene families" %
                (sum(len(i) for i in genefamily_contig_map.values()), len(genefamily_contig_map)))
        self.log.info("(%d have already been run)" % self.info.len_genefamily2filename())

        if self.info.len_genefamily2filename() == len(genefamily_contig_map) :
            self.log.info("alignment already done, exiting early...")
            return
        else :
            self.log.info("starting alignments...")


        # queue all the alignments up using a work queue and pagan
        self.q = WorkQueue()

        self.total_jobs = len(genefamily_contig_map) - self.info.len_genefamily2filename()
        # start at -1 and call _progress() once before any job is queued
        # NOTE(review): presumably _progress() increments complete_jobs, which
        # would bring it to 0 here -- confirm against _progress()
        self.complete_jobs = -1
        self._progress()

        for famid in self.sort_keys_by_complexity(genefamily_contig_map) :
            # ignore the jobs that have already been run
            if self.info.in_genefamily2filename(famid) :
                continue

            try :
                # get the alignment and tree from the database
                alignment = self.db.get_alignment(famid)
                tree = alignment.get_tree()

                # get contigs
                job_contigs = [ self._correct_strand(contigs[contigid], strand) for contigid,strand in genefamily_contig_map[famid] ]

                # queue the job
                self.q.enqueue(
                    PaganJob(
                        self.job_callback,
                        job_contigs,
                        famid,
                        alignment,
                        tree,
                        self.min_alignidentity,
                        self.min_alignoverlap)
                    )

                # avoid the split code later in the loop...
                continue

            # 'except X as e' is accepted by Python 2.6+ and Python 3,
            # unlike the older 'except X, e' spelling
            except GluttonDBError as gde :
                # this means we have never heard of this gene family
                self.log.warn(str(gde))
                continue

            except GluttonDBFileError as gdfe :
                # this means we have heard of the gene family, but the
                # alignment files were missing...
                self.log.warn(str(gdfe))

            # okay, the gene family was not aligned for some reason
            # instead we will split the gene family into constituent genes
            # and handle each one separately...

            self.log.warn("gene family was not aligned, breaking down into separate genes...")
            # the single family job is replaced by several jobs; the count is
            # adjusted by the number of contigs in the family minus one
            self.total_jobs += (len(genefamily_contig_map[famid]) - 1)

            # collect contigs by gene
            gene2contigs = collections.defaultdict(list)

            for contigid,strand in genefamily_contig_map[famid] :
                try :
                    geneid = self.info.query_to_gene(contigid)

                except KeyError : # this should be impossible
                    self.log.warn("no gene assignment for %s" % contigid)
                    continue

                gene2contigs[geneid].append((contigid, strand))

            # run each gene separately
            for geneid in gene2contigs :
                try :
                    # a one-element list stands in for the family alignment
                    alignment = [ self.db.get_gene(geneid) ]

                except GluttonDBError as gde :
                    self.log.warn(str(gde))
                    continue

                # queue the job (no tree is available for a single gene)
                self.q.enqueue(
                    PaganJob(
                        self.job_callback,
                        [ self._correct_strand(contigs[contigid], strand) for contigid,strand in gene2contigs[geneid] ],
                        geneid,
                        alignment,
                        None,
                        self.min_alignidentity,
                        self.min_alignoverlap)
                    )