Ejemplo n.º 1
0
class Aligner(object) :
    def __init__(self, top_level_directory, reference_fname, min_length, min_hitidentity, min_hitlength, max_evalue, batch_size, min_alignidentity, min_alignoverlap) :
        """Prepare the alignment stage for the project in top_level_directory.

        Stores the filtering thresholds, makes sure the 'alignments'
        directory below top_level_directory exists, and creates the search,
        parameter, reference database and bookkeeping objects.
        """
        # thresholds, tagged with the tool each one applies to
        self.min_length = min_length                # glutton
        self.min_hitidentity = min_hitidentity      # blast
        self.min_hitlength = min_hitlength          # blast
        self.max_evalue = max_evalue                # blast
        self.min_alignidentity = min_alignidentity  # pagan
        self.min_alignoverlap = min_alignoverlap    # pagan

        # job counters, files to delete on exit, and the (not yet created) work queue
        self.total_jobs = self.complete_jobs = 0
        self.cleanup_files = []
        self.q = None

        self.directory = join(top_level_directory, 'alignments')
        check_dir(self.directory, create=True)

        self.search = All_vs_all_search(batch_size)
        self.lock = threading.Lock()
        self.log = get_log()

        # the reference database has to be registered with the parameters
        # before asking whether a previous run can be resumed
        self.param = GluttonParameters(top_level_directory)
        self.db = GluttonDB(reference_fname)
        self.param.set_reference(self.db)

        self.resume = self.param.able_to_resume()

        self.info = GluttonInformation(self.directory, self.param, self.db, resume=self.resume)
        self.param.set_full_checksum()

    def _read_contigs(self) :
        """Read the contigs of every sample, dropping those that are too short.

        Each accepted FASTA record is converted with biopy_to_gene and stored
        under the query id returned by self.info.get_query_from_contig.
        One summary line is logged per sample file.

        Returns a dict of query id -> converted contig.
        """
        contigs = {}

        for label in self.param.get_sample_ids() :
            fname = self.param.get_contigs(label)
            n_accepted = 0
            n_too_short = 0

            for record in SeqIO.parse(fname, 'fasta') :
                if len(record) < self.min_length :
                    n_too_short += 1
                else :
                    # note: records containing 'N' are not filtered out
                    qid = self.info.get_query_from_contig(label, record.description)
                    contigs[qid] = biopy_to_gene(record, qid)
                    n_accepted += 1

            self.log.info("%s: read %d contigs (rejected %d due to length < %d)" %
                (fname, n_accepted, n_too_short, self.min_length))

        return contigs

    def stop(self) :
        """Stop the search and the work queue, keeping whatever was finished.

        The assignments the search had produced so far are merged into
        self.info, registered cleanup files are removed, and the info and
        parameter objects are flushed.
        """
        searcher = self.search
        searcher.stop()

        # keep the partial query -> gene assignments from the interrupted search
        self.info.update_query_gene_mapping(searcher.get_intermediate_results())

        # the queue only exists once align() has started queueing jobs
        if self.q :
            self.q.stop()

        rm_f(self.cleanup_files)

        for store in (self.info, self.param) :
            store.flush()

    def _correct_strand(self, contig, strand) :
        if strand == '-' :
            contig.reverse_complement()

        return contig

    def align(self) :
        """Assign contigs to genes, then queue one alignment job per gene family.

        Steps visible in this method:
          1. read and length-filter the contigs,
          2. run the all-vs-all search for contigs that have no gene
             assignment yet and record the resulting mapping,
          3. group contigs by gene family and return early if every family
             already has an output file,
          4. enqueue a PaganJob per remaining family; if the family's
             alignment files are missing (GluttonDBFileError) the family is
             split and one job is queued per gene instead. Families unknown
             to the database (GluttonDBError) are skipped with a warning.

        Returns None.
        """
        self.log.info("starting alignment procedure")

        # convert the names of the contigs to something no program can complain about
        # + filter out the ones that could never have a long enough alignment
        contigs = self._read_contigs()

        pending_contigs = [ contigs[i] for i in self.info.pending_queries() ]

        self.log.info("%d contigs have not been assigned to genes..." % len(pending_contigs))

        # depending on when the program was terminated this step may be complete or partially
        # complete 
        if pending_contigs :
            db_fname = self.db.extract_all()
            # registered so that stop() can delete it if we are interrupted
            self.cleanup_files.append(db_fname)

            # do an all vs all search of contigs vs database of transcripts
            # return a dict of tmp ids with gene ids
            self.info.update_query_gene_mapping(
                self.search.process(
                    db_fname, 
                    pending_contigs,
                    self.db.nucleotide,
                    self.min_hitidentity,
                    self.min_hitlength,
                    self.max_evalue)
                )

            rm_f(db_fname)

        # save intermediate results
        self.info.flush()

        # use the database to convert the mapping from tmp id -> gene
        # to gene family -> list of (tmp id, strands)
        genefamily_contig_map = self.info.build_genefamily2contigs()
        
        self.log.info("%d contigs assigned to %d gene families" % 
                (sum(len(i) for i in genefamily_contig_map.values()), len(genefamily_contig_map)))
        self.log.info("(%d have already been run)" % self.info.len_genefamily2filename())

        if self.info.len_genefamily2filename() == len(genefamily_contig_map) :
            self.log.info("alignment already done, exiting early...")
            return
        else :
            self.log.info("starting alignments...")


        # queue all the alignments up using a work queue and pagan
        self.q = WorkQueue()

        self.total_jobs = len(genefamily_contig_map) - self.info.len_genefamily2filename()
        # _progress() is defined outside this view; presumably it increments
        # complete_jobs, so starting at -1 makes the first report show 0 - confirm
        self.complete_jobs = -1
        self._progress()

        for famid in self.sort_keys_by_complexity(genefamily_contig_map) :
            # ignore the jobs that have already been run
            if self.info.in_genefamily2filename(famid) :
                continue

            try :
                # get the alignment and tree from the database
                alignment = self.db.get_alignment(famid)
                tree = alignment.get_tree()

                # get contigs
                job_contigs = [ self._correct_strand(contigs[contigid], strand) for contigid,strand in genefamily_contig_map[famid] ]

                # queue the job
                self.q.enqueue(
                    PaganJob(
                        self.job_callback,
                        job_contigs,
                        famid,
                        alignment,
                        tree,
                        self.min_alignidentity,
                        self.min_alignoverlap)
                    )

                # avoid the split code later in the loop...
                continue

            # 'except X as e' is accepted by Python 2.6+ and Python 3,
            # the older 'except X, e' form is a syntax error on Python 3
            except GluttonDBError as gde :
                # this means we have never heard of this gene family
                self.log.warn(str(gde))
                continue

            except GluttonDBFileError as gdfe :
                # this means we have heard of the gene family, but the
                # alignment files were missing...
                self.log.warn(str(gdfe))

            # okay, the gene family was not aligned for some reason
            # instead we will split the gene family into constituent genes
            # and handle each one separately...

            self.log.warn("gene family was not aligned, breaking down into separate genes...")
            # NOTE(review): this adds (number of contigs - 1) to the total, but
            # the loop below enqueues one job per *gene* - confirm the progress
            # total is meant to be counted this way
            self.total_jobs += (len(genefamily_contig_map[famid]) - 1)

            # collect contigs by gene
            gene2contigs = collections.defaultdict(list)

            for contigid,strand in genefamily_contig_map[famid] :
                try :
                    geneid = self.info.query_to_gene(contigid)

                except KeyError : # this should be impossible
                    self.log.warn("no gene assignment for %s" % contigid)
                    continue

                gene2contigs[geneid].append((contigid, strand))

            # run each gene separately
            for geneid in gene2contigs :
                try :
                    # a single gene stands in for the family alignment
                    alignment = [ self.db.get_gene(geneid) ]

                except GluttonDBError as gde :
                    self.log.warn(str(gde))
                    continue

                # queue the job (no tree is available for a single gene)
                self.q.enqueue(
                    PaganJob(
                        self.job_callback,
                        [ self._correct_strand(contigs[contigid], strand) for contigid,strand in gene2contigs[geneid] ],
                        geneid,
                        alignment,
                        None,
                        self.min_alignidentity,
                        self.min_alignoverlap)
                    )
Ejemplo n.º 2
0
Archivo: db.py Proyecto: ajm/glutton
class GluttonDB(object) :
    def __init__(self, fname=None) :
        """Create a database object, loading it from fname when one is given.

        With no fname the object starts empty (metadata, data and seq2famid
        are None) and is expected to be filled in later, e.g. by build().
        A warning is logged when a loaded file is not complete.
        """
        self.fname = fname
        self.compression = ZIP_DEFLATED

        self.metadata = None
        self.data = None        # dict of famid -> GeneFamily obj (list of Genes)
        self.seq2famid = None   # dict of geneid -> famid
        self.dirty = False

        self.lock = threading.Lock()
        self.total_jobs = self.complete_jobs = 0

        self.log = get_log()

        # nothing to load for an in-memory / not-yet-built database
        if not self.fname :
            return

        self._read()

        if not self.is_complete() :
            self.log.warn("%s is not complete!" % self.fname)

    @property
    def species(self) :
        return self.metadata['species-name']

    @property
    def release(self) :
        return self.metadata['species-release']

    @property
    def nucleotide(self) :
        return self.metadata['nucleotide']

    @property
    def download_time(self) :
        return self.metadata['download-time']

    @property
    def version(self) :
        return self.metadata['glutton-version']

    @property
    def database(self) :
        return self.metadata['database-name']

    @property
    def filename(self) :
        return self.fname

    @property
    def checksum(self) :
        """Result of calling md5() on the archive path; recomputed on every access."""
        path = self.fname
        return md5(path)

    def stop(self) :
        if hasattr(self, "q") :
            self.q.stop()

    def flush(self) :
        if self.dirty :
            self._write()

    # read manifest to get data and mapping files
    # read data file to get seqID to gluttonID mapping
    # read mapping file to get gluttonID to file name mapping 
    def _read(self) :
        """Load the manifest and the gene family data from the archive at self.fname.

        Sets self.metadata from the manifest, self.data from the data file
        the manifest names, and self.seq2famid (gene id -> family id) from
        that data.

        Raises GluttonImportantFileNotFoundError if the manifest, or the
        data file it names, is not a member of the archive.
        """
        global MANIFEST_FNAME

        def _load_json(archive, member) :
            # close the member handle even when the JSON cannot be parsed
            handle = archive.open(member)
            try :
                return json.load(handle)
            finally :
                handle.close()

        z = ZipFile(self.fname, 'r', compression=self.compression)

        # try/finally so the archive is closed on every exit path, not only
        # on success and on the two "file not found" errors
        try :
            # without the manifest all is lost
            # we need this to get the names of the other files
            if MANIFEST_FNAME not in z.namelist() :
                raise GluttonImportantFileNotFoundError('manifest not found in %s' % self.fname)

            self.metadata = _load_json(z, MANIFEST_FNAME)

            # NOTE(review): '%.1f' requires the stored version to be numeric - confirm
            self.log.info("read manifest - created on %s using glutton version %.1f" % \
                (time.strftime('%d/%m/%y at %H:%M:%S', time.localtime(self.download_time)), \
                 self.version))

            # the data file is the raw data grouped into gene families
            # when we do a local alignment we need to get the gene id
            # of the best hit and find out which gene family it belongs to
            if self.metadata['data-file'] not in z.namelist() :
                raise GluttonImportantFileNotFoundError(
                    'data file (%s) not found in %s' % (self.metadata['data-file'], self.fname))

            self.data = json_to_glutton(_load_json(z, self.metadata['data-file']))
            self.seq2famid = self._create_lookup_table(self.data)

            self.log.info("read %d gene families (%d genes)" % (len(self.data), len(self.seq2famid)))

        finally :
            z.close()

    def _create_lookup_table(self, families) :
        tmp = {}

        for fid in families :
            for gene in families[fid] :
                tmp[gene.id] = fid
        
        return tmp

    def _valid_manifest(self, m) :
        """Return True when every key listed in metadata_keys is present in m."""
        global metadata_keys

        return all(key in m for key in metadata_keys)

    def _write_to_archive(self, data, zfile, zname) :
        """Serialise data as JSON and add it to the open archive zfile as zname.

        The JSON is staged in a temporary file (from tmpfile()) which is
        removed again afterwards, whether or not the write succeeded.
        """
        fname = tmpfile()

        try :
            # 'with' closes the handle even if serialising or writing fails
            with open(fname, 'w') as f :
                f.write(json.dumps(data))

            zfile.write(fname, arcname=zname)

        finally :
            # do not leave the staging file behind on failure; the existence
            # check avoids masking the original error if it was never created
            if os.path.exists(fname) :
                os.remove(fname)

    def _write(self) :
        """Append the manifest and the gene family data to the archive at self.fname.

        Clears self.dirty once both members have been written. The manifest
        is checked with _valid_manifest() first (via assert, so the check is
        skipped under python -O).
        """
        global MANIFEST_FNAME

        assert self._valid_manifest(self.metadata)

        # NOTE(review): the archive is opened in append mode, so writing a
        # member name that already exists presumably adds a second entry
        # rather than replacing the first - confirm this is intended
        z = ZipFile(self.fname, 'a', compression=self.compression)

        # try/finally so the archive handle is released even if a write fails
        try :
            self._write_to_archive(self.metadata,               z, MANIFEST_FNAME)
            self._write_to_archive(glutton_to_json(self.data),  z, self.metadata['data-file'])

        finally :
            z.close()

        # only reached when both writes succeeded
        self.dirty = False

    def _default_datafile(self, species, release) :
        return "%s_%d_data.json" % (species, release)

    def build(self, fname, species, release=None, database_name='ensembl', nucleotide=False, download_only=False) :
        """Create, or resume building, the database file for species.

        If fname names an existing file the build resumes from it. Otherwise
        the release is looked up when not given, a default file name
        ('<species>_<release>.glt') is chosen when fname is empty, and the
        database is downloaded unless that file already exists.

        The file is then read into memory, mismatches between the arguments
        and the stored metadata are logged as warnings, and - unless the
        database is already complete or download_only is set - the missing
        alignments are computed and the result is written to disk.
        """
        self.fname = fname

        resuming = self.fname and exists(self.fname)

        if not resuming :
            # release not specified
            if not release :
                self.log.info("release not provided, getting latest release...")
                release = EnsemblDownloader().get_latest_release(species, database_name)
                self.log.info("latest release is %d" % release) 

            # default name if it was not defined
            if not self.fname :
                self.fname = "%s_%d.glt" % (species, release)
                self.log.info("database filename not specified, using '%s'" % self.fname)

            # the default name may belong to an earlier run, so check again
            # before starting from scratch
            if not exists(self.fname) :
                self.log.info("%s does not exist, starting from scratch..." % self.fname)
                self._initialise_db(species, release, database_name, nucleotide)

        else :
            # the download already took place; database name and release
            # come from the metadata stored in the file
            self.log.info("%s exists, resuming..." % self.fname)

        # either way, read contents into memory
        self._read()

        # not strictly necessary: warn when the command line disagrees with the file
        if self.species != species :
            self.log.warn("species from CLI (%s) and glutton file (%s) do not match!" % (species, self.species))

        if release and (self.release != release) :
            self.log.warn("release from CLI (%d) and glutton file (%d) do not match!" % (release, self.release))

        if self.nucleotide != nucleotide :
            cli_kind = "nucleotide" if nucleotide else "protein"
            file_kind = "nucleotide" if self.nucleotide else "protein"
            self.log.warn("nucleotide/protein from CLI (%s) and glutton file (%s) do not match!" % \
                (cli_kind, file_kind))

        # no work to do
        if self.is_complete() : 
            self.log.info("%s is already complete!" % self.fname)
            return

        # don't do the analysis, just exit
        if download_only :
            self.log.info("download complete")
            return

        # build db, then write to disk
        self._perform_alignments()
        self._write()

        self.log.info("finished building %s/%s" % (self.species, self.release))

    def _get_unaligned_families(self) :
        unaligned = []

        z = ZipFile(self.fname, 'r', compression=self.compression)
        aligned = set([ i.split('.')[0] for i in z.namelist() if i.endswith('.tree') ])
        z.close()

        for i in self.data :
            if (i not in aligned) and (len(self.data[i]) > 1) :
                unaligned.append(i)

        self.log.info("found %d unaligned gene families" % len(unaligned))

        return unaligned

    def _perform_alignments(self) :
        """Queue a PrankJob for every unaligned gene family and wait for all of them."""
        pending = self._get_unaligned_families()

        # create the work queue lazily; stop() relies on the attribute
        # only existing once there is a queue to stop
        if not hasattr(self, "q") :
            self.q = WorkQueue()

        self.total_jobs = len(pending)
        # _progress() is defined outside this view; presumably it increments
        # complete_jobs, so starting at -1 makes the first report show 0 - confirm
        self.complete_jobs = -1
        self._progress()

        for famid in pending :
            job = PrankJob(self.job_callback, self.data[famid])
            self.q.enqueue(job)

        self.log.debug("waiting for job queue to drain...")

        self.q.join()

    def _initialise_db(self, species, release, database_name, nucleotide) :
        """Download the gene families for species/release and write a fresh database file.

        Fills self.data with the downloaded families and self.metadata with
        the manifest, then writes both to self.fname.

        Exits the process with status 1 if the download fails. Raises
        GluttonDBBuildError when every downloaded family consists of a
        single gene.
        """
        # local import: only needed for the fatal-error exit below
        import sys

        e = EnsemblDownloader()
        self.log.info("downloading %s/%d" % (species, release))
        
        try :
            self.data = ensembl_to_glutton(e.download(species, release, database_name, nucleotide))

        # 'except X as e' is accepted by Python 2.6+ and Python 3
        except EnsemblDownloadError as ede :
            # str() instead of the deprecated .message attribute
            self.log.fatal(str(ede))
            # sys.exit rather than the site-provided exit(), which is not
            # guaranteed to exist (e.g. python -S)
            sys.exit(1)

        # drosophila melanogastor - nucleotide - ensembl-main
        # contains transcripts, but not gene families
        count = sum(1 for famid in self.data if len(self.data[famid]) == 1)
        
        if count == len(self.data) :
            raise GluttonDBBuildError("downloaded %d gene families composed of a single gene each ('sql' method will do this on some species that do not contain all transcripts (e.g. drosophila_melanogaster in ensembl-main))" % count)

        self.metadata = {}
        self.metadata['download-time']      = time.time()

        # glutton metadata
        self.metadata['glutton-version']    = glutton.__version__
        self.metadata['program-name']       = Prank().name
        self.metadata['program-version']    = Prank().version
        self.metadata['species-name']       = species
        self.metadata['species-release']    = release
        self.metadata['nucleotide']         = nucleotide
        self.metadata['database-name']      = database_name
        self.metadata['download-method']    = get_ensembl_download_method()

        # name of the archive member holding the gene family data
        self.metadata['data-file']          = self._default_datafile(species, release)
        
        self.dirty = True
        self._write()
Ejemplo n.º 3
0
class All_vs_all_search(object):
    def __init__(self, batch_size=100):
        """Create an idle search that hands queries to blast batch_size at a time.

        The hit filters (nucleotide flag, identity, length, e-value) are
        placeholders here and are filled in by process().
        """
        self.batch_size = batch_size

        # filters, set on each call to process()
        self.nucleotide = False
        self.min_hitidentity = None
        self.min_hitlength = None
        self.max_evalue = None

        # results (query id -> (gene id, strand) or None) and the lock
        # held by job_callback while it updates them
        self.gene_assignments = {}
        self.lock = threading.Lock()

        # work queue, files to delete when done, and progress counters
        self.q = None
        self.cleanup_files = []
        self.total_jobs = self.complete_jobs = 0

        self.log = get_log()

    def _batch(self, x):
        tmp = []

        for i in x:
            tmp.append(i)

            if len(tmp) == self.batch_size:
                yield tmp
                tmp = []

        if not tmp:
            raise StopIteration

        yield tmp

    def process(self, db, queries, nucleotide, min_hitidentity, min_hitlength, max_evalue):
        """Run blastx for all *queries* against *db* and return the gene assignments.

        Builds the blast index next to *db*, enqueues one BlastJob per batch
        of queries, waits for the queue to drain and removes the index
        files. Hits are filtered in job_callback using the thresholds given
        here.

        Returns self.gene_assignments, which job_callback fills in.
        """
        self.nucleotide = nucleotide
        self.max_evalue = max_evalue
        self.min_hitlength = min_hitlength
        self.min_hitidentity = min_hitidentity

        # the index files are shared by all blastx jobs, so they are
        # registered for cleanup here rather than per job
        index_suffixes = (".phr", ".pin", ".psq")
        self.cleanup_files.extend([db + suffix for suffix in index_suffixes])

        # creates db + {phr,pin,psq} in same dir as db
        self.log.info("creating blast db...")
        Blast.makedb(db)  # XXX always a protein db, because we want to run blastx

        self.log.info("starting local alignments...")
        self.q = WorkQueue()

        # start one batch below zero: the _progress() call adds a batch,
        # so the first report shows 0 completed
        self.total_jobs = len(queries)
        self.complete_jobs = -self.batch_size
        self._progress()

        for batch in self._batch(queries):
            job = BlastJob(self.job_callback, db, batch, "blastx")
            self.q.enqueue(job)

        self.log.debug("waiting for job queue to drain...")
        self.q.join()

        rm_f(self.cleanup_files)

        return self.gene_assignments

    def stop(self):
        """Stop the work queue, if any, and delete the registered cleanup files."""
        queue = self.q

        # q stays None until process() has been called
        if queue:
            queue.stop()

        rm_f(self.cleanup_files)

    def get_intermediate_results(self):
        return self.gene_assignments

    def _progress(self):
        self.complete_jobs += self.batch_size

        if self.complete_jobs > self.total_jobs:
            self.complete_jobs = self.total_jobs

        sys.stderr.write("\rProgress: %d / %d blastx alignments " % (self.complete_jobs, self.total_jobs))

        if self.complete_jobs == self.total_jobs:
            sys.stderr.write("\n")
            sys.stderr.flush()

    def job_callback(self, job):
        self.log.debug("%d blast results returned" % len(job.results))

        self.lock.acquire()

        self._progress()

        if job.success():
            qlen = dict([(q.id, len(q)) for q in job.input])

            for br in job.results:
                # length = max(br.qstart, br.qend) - min(br.qstart, br.qend)
                strand = "+" if br.qstart < br.qend else "-"

                if (
                    (br.qseqid in self.gene_assignments)
                    or (self.max_evalue < br.evalue)
                    or (self.min_hitidentity > br.pident)
                    or (self.min_hitlength > br.length)
                ):
                    continue

                self.gene_assignments[br.qseqid] = (br.sseqid, strand)

        for q in job.input:
            if q.id not in self.gene_assignments:
                self.gene_assignments[q.id] = None

        self.lock.release()