def process(self, db, queries, nucleotide, min_hitidentity, min_hitlength, max_evalue):
    """Search ``queries`` against ``db`` with batched blastx jobs.

    The filtering thresholds are stored on the instance, a blast database
    is built from ``db``, one ``BlastJob`` is queued per batch of queries
    and the call blocks until the work queue has drained.  The files
    registered in ``self.cleanup_files`` are removed before returning.

    Returns ``self.gene_assignments``.
    """
    self.nucleotide = nucleotide
    self.min_hitidentity = min_hitidentity
    self.min_hitlength = min_hitlength
    self.max_evalue = max_evalue

    # the index files are dealt with here (rather than by the jobs)
    # because all of the blastx jobs need them
    index_files = [db + suffix for suffix in (".phr", ".pin", ".psq")]
    self.cleanup_files.extend(index_files)

    # creates db + {phr,pin,psq} in same dir as db
    self.log.info("creating blast db...")
    Blast.makedb(db)  # XXX THIS IS ALWAYS PROTEIN, BECAUSE WE WANT TO RUN BLASTX

    # queue up the jobs
    self.log.info("starting local alignments...")
    self.q = WorkQueue()

    self.total_jobs = len(queries)
    # start one batch "behind" and let _progress() do the first update
    self.complete_jobs = -self.batch_size
    self._progress()

    for batch in self._batch(queries):
        job = BlastJob(self.job_callback, db, batch, "blastx")
        self.q.enqueue(job)

    self.log.debug("waiting for job queue to drain...")
    self.q.join()

    rm_f(self.cleanup_files)

    return self.gene_assignments
def _perform_alignments(self) :
    """Queue one PrankJob per unaligned gene family and wait for them all.

    The work queue is created on first use and stored as ``self.q``; the
    job counters are reset before anything is queued.
    """
    pending = self._get_unaligned_families()

    # create the queue lazily, the attribute does not exist before this
    if not hasattr(self, "q") :
        self.q = WorkQueue()

    self.total_jobs = len(pending)
    self.complete_jobs = -1
    self._progress()

    for famid in pending :
        job = PrankJob(self.job_callback, self.data[famid])
        self.q.enqueue(job)

    self.log.debug("waiting for job queue to drain...")
    self.q.join()
class GluttonDB(object) :
    """Gene family database stored in a zip archive.

    The archive holds a JSON manifest (``MANIFEST_FNAME``) and the JSON data
    file named by the manifest's ``'data-file'`` entry.  Archive members
    ending in ``.tree`` mark gene families that have already been aligned.
    """

    def __init__(self, fname=None) :
        """Create the database object, loading ``fname`` if one is given."""
        self.fname = fname
        self.compression = ZIP_DEFLATED
        self.metadata = None
        self.data = None        # dict of famid -> GeneFamily obj (list of Genes)
        self.seq2famid = None   # dict of geneid -> famid
        self.dirty = False
        self.lock = threading.Lock()

        self.complete_jobs = 0
        self.total_jobs = 0

        self.log = get_log()

        if self.fname :
            self._read()

            if not self.is_complete() :
                self.log.warn("%s is not complete!" % self.fname)

    @property
    def species(self) :
        return self.metadata['species-name']

    @property
    def release(self) :
        return self.metadata['species-release']

    @property
    def nucleotide(self) :
        return self.metadata['nucleotide']

    @property
    def download_time(self) :
        return self.metadata['download-time']

    @property
    def version(self) :
        return self.metadata['glutton-version']

    @property
    def database(self) :
        return self.metadata['database-name']

    @property
    def filename(self) :
        return self.fname

    @property
    def checksum(self) :
        return md5(self.fname)

    def stop(self) :
        """Stop the work queue, if one has been created."""
        if hasattr(self, "q") :
            self.q.stop()

    def flush(self) :
        """Write the database to disk if it has unsaved changes."""
        if self.dirty :
            self._write()

    # read manifest to get data and mapping files
    # read data file to get seqID to gluttonID mapping
    # read mapping file to get gluttonID to file name mapping
    def _read(self) :
        """Load the manifest and the data file from the archive.

        Raises GluttonImportantFileNotFoundError when either the manifest
        or the data file named by the manifest is missing.
        """
        z = ZipFile(self.fname, 'r', compression=self.compression)

        # try/finally so the archive is closed on every exit path,
        # including a JSON parsing failure
        try :
            members = z.namelist()

            # without the manifest all is lost
            # we need this to get the names of the other
            # XML files
            if MANIFEST_FNAME not in members :
                raise GluttonImportantFileNotFoundError('manifest not found in %s' % self.fname)

            self.metadata = json.load(z.open(MANIFEST_FNAME))

            self.log.info("read manifest - created on %s using glutton version %.1f" % \
                (time.strftime('%d/%m/%y at %H:%M:%S', time.localtime(self.download_time)), \
                 self.version))

            # the data file is the raw data grouped into gene families
            # when we do a local alignment we need to get the gene id
            # of the best hit and find out which gene family it belongs to
            datafile = self.metadata['data-file']

            if datafile not in members :
                raise GluttonImportantFileNotFoundError(
                    'data file (%s) not found in %s' % (datafile, self.fname))

            self.data = json_to_glutton(json.load(z.open(datafile)))
            self.seq2famid = self._create_lookup_table(self.data)

            self.log.info("read %d gene families (%d genes)" % (len(self.data), len(self.seq2famid)))

        finally :
            z.close()

    def _create_lookup_table(self, families) :
        """Return a dict mapping each gene id to the id of its family."""
        lookup = {}

        for fid in families :
            for gene in families[fid] :
                lookup[gene.id] = fid

        return lookup

    def _valid_manifest(self, m) :
        """Return True when ``m`` contains every key in ``metadata_keys``."""
        return all(k in m for k in metadata_keys)

    def _write_to_archive(self, data, zfile, zname) :
        """Serialise ``data`` as JSON and add it to ``zfile`` as ``zname``."""
        fname = tmpfile()

        # the temporary file is removed even if serialisation or the
        # archive write fails
        try :
            with open(fname, 'w') as f :
                f.write(json.dumps(data))

            zfile.write(fname, arcname=zname)

        finally :
            if exists(fname) :
                os.remove(fname)

    def _write(self) :
        """Append the manifest and the data file to the archive."""
        assert self._valid_manifest(self.metadata)

        # NOTE(review): the archive is opened in append mode, so every call
        # adds the manifest and data file again under the same names -
        # confirm that duplicate members are acceptable here
        z = ZipFile(self.fname, 'a', compression=self.compression)

        try :
            self._write_to_archive(self.metadata, z, MANIFEST_FNAME)
            self._write_to_archive(glutton_to_json(self.data), z, self.metadata['data-file'])
        finally :
            z.close()

        self.dirty = False

    def _default_datafile(self, species, release) :
        """Return the default archive member name for the data file."""
        return "%s_%d_data.json" % (species, release)

    def build(self, fname, species, release=None, database_name='ensembl', nucleotide=False, download_only=False) :
        """Download (if necessary) and align the gene families of ``species``.

        fname         -- archive to create or resume, a default name is
                         derived from species and release when falsy
        species       -- species name passed to the downloader
        release       -- release number, the latest is looked up when falsy
        database_name -- passed to the downloader
        nucleotide    -- passed to the downloader and stored in the manifest
        download_only -- return after the download, skipping the alignments
        """
        self.fname = fname

        # if the name is specified and the file exists, then that means the
        # download already took place and we should get:
        #   - database_name
        #   - release
        # from the metadata
        if self.fname and exists(self.fname) :
            self.log.info("%s exists, resuming..." % self.fname)
        else :
            # release not specified
            if not release :
                self.log.info("release not provided, getting latest release...")
                release = EnsemblDownloader().get_latest_release(species, database_name)
                self.log.info("latest release is %d" % release)

            # default name if it was not defined
            if not self.fname :
                self.fname = "%s_%d.glt" % (species, release)
                self.log.info("database filename not specified, using '%s'" % self.fname)

        # are we resuming or starting fresh?
        if not exists(self.fname) :
            self.log.info("%s does not exist, starting from scratch..." % self.fname)
            self._initialise_db(species, release, database_name, nucleotide)

        # either way, read contents into memory
        self._read()

        # not really necessary, but check that the species from cli and in the file
        # are the same + nucleotide
        if self.species != species :
            self.log.warn("species from CLI (%s) and glutton file (%s) do not match!" % (species, self.species))

        if release and (self.release != release) :
            self.log.warn("release from CLI (%d) and glutton file (%d) do not match!" % (release, self.release))

        if self.nucleotide != nucleotide :
            self.log.warn("nucleotide/protein from CLI (%s) and glutton file (%s) do not match!" % \
                ("nucleotide" if nucleotide else "protein", "nucleotide" if self.nucleotide else "protein"))

        # no work to do
        if self.is_complete() :
            self.log.info("%s is already complete!" % self.fname)
            return

        # don't do the analysis, just exit
        if download_only :
            self.log.info("download complete")
            return

        # build db
        self._perform_alignments()

        # write to disk
        self._write()

        self.log.info("finished building %s/%s" % (self.species, self.release))

    def _get_unaligned_families(self) :
        """Return the ids of multi-gene families without a ``.tree`` member."""
        z = ZipFile(self.fname, 'r', compression=self.compression)

        try :
            aligned = set(name.split('.')[0] for name in z.namelist() if name.endswith('.tree'))
        finally :
            z.close()

        # families made of a single gene are never counted as unaligned
        unaligned = [ famid for famid in self.data
                      if (famid not in aligned) and (len(self.data[famid]) > 1) ]

        self.log.info("found %d unaligned gene families" % len(unaligned))

        return unaligned

    def _perform_alignments(self) :
        """Queue one PrankJob per unaligned family and wait for completion."""
        unaligned = self._get_unaligned_families()

        if not hasattr(self, "q") :
            self.q = WorkQueue()

        self.total_jobs = len(unaligned)
        self.complete_jobs = -1
        self._progress()

        for famid in unaligned :
            self.q.enqueue(PrankJob(self.job_callback, self.data[famid]))

        self.log.debug("waiting for job queue to drain...")
        self.q.join()

    def _initialise_db(self, species, release, database_name, nucleotide) :
        """Download the gene families and write a fresh archive.

        Exits the process with status 1 when the download fails and raises
        GluttonDBBuildError when every downloaded family contains exactly
        one gene.
        """
        e = EnsemblDownloader()
        self.log.info("downloading %s/%d" % (species, release))

        try :
            self.data = ensembl_to_glutton(e.download(species, release, database_name, nucleotide))

        except EnsemblDownloadError as ede :
            # 'message' is kept where the exception provides it, str() is
            # the fallback for interpreters without that attribute
            self.log.fatal(getattr(ede, 'message', str(ede)))
            exit(1)

        # drosophila melanogastor - nucleotide - ensembl-main
        # contains transcripts, but not gene families
        count = sum(1 for famid in self.data if len(self.data[famid]) == 1)

        if count == len(self.data) :
            raise GluttonDBBuildError("downloaded %d gene families composed of a single gene each ('sql' method will do this on some species that do not contain all transcripts (e.g. drosophila_melanogaster in ensembl-main))" % count)

        prank = Prank()

        self.metadata = {}
        self.metadata['download-time'] = time.time()

        # glutton metadata
        self.metadata['glutton-version'] = glutton.__version__
        self.metadata['program-name'] = prank.name
        self.metadata['program-version'] = prank.version
        self.metadata['species-name'] = species
        self.metadata['species-release'] = release
        self.metadata['nucleotide'] = nucleotide
        self.metadata['database-name'] = database_name
        self.metadata['download-method'] = get_ensembl_download_method()

        # other xml files
        self.metadata['data-file'] = self._default_datafile(species, release)

        self.dirty = True
        self._write()
def align(self) :
    """Assign contigs to genes, then queue one PaganJob per gene family.

    Contigs that have no gene assignment yet are searched against the
    extracted database first.  Families already recorded as run are
    skipped, and families whose alignment cannot be fetched from the
    database are logged and skipped.
    """
    self.log.info("starting alignment procedure")

    # convert the names of the contigs to something no program can complain about
    # + filter out the ones that could never have a long enough alignment
    contigs = self._read_contigs()
    pending_contigs = [ contigs[qid] for qid in self.info.pending_queries() ]

    self.log.info("%d contigs have not been assigned to genes..." % len(pending_contigs))

    # depending on when the program was terminated this step may be complete or partially
    # complete
    if pending_contigs :
        db_fname = self.db.extract_all()
        self.cleanup_files.append(db_fname)

        # do an all vs all search of contigs vs database of transcripts
        # return a dict of tmp ids with gene ids
        assignments = self.search.process(db_fname,
                                          pending_contigs,
                                          self.db.nucleotide,
                                          self.min_hitidentity,
                                          self.min_hitlength,
                                          self.max_evalue)
        self.info.update_query_gene_mapping(assignments)

        rm_f(db_fname)

    # save intermediate results
    self.info.flush()

    # use the database to convert the mapping from tmp id -> gene
    # to gene family -> list of (tmp id, strands)
    genefamily_contig_map = self.info.build_genefamily2contigs()

    assigned = sum([ len(pairs) for pairs in genefamily_contig_map.values() ])
    self.log.info("%d contigs assigned to %d gene families" % (assigned, len(genefamily_contig_map)))
    self.log.info("(%d have already been run)" % self.info.len_genefamily2filename())

    if self.info.len_genefamily2filename() == len(genefamily_contig_map) :
        self.log.info("alignment already done, exiting early...")
        return

    self.log.info("starting alignments...")

    # queue all the alignments up using a work queue and pagan
    self.q = WorkQueue()

    self.total_jobs = len(genefamily_contig_map) - self.info.len_genefamily2filename()
    self.complete_jobs = -1
    self._progress()

    for famid in self.sort_keys_by_complexity(genefamily_contig_map) :
        # ignore the jobs that have already been run
        if self.info.in_genefamily2filename(famid) :
            continue

        try :
            # get the alignment and tree from the database
            alignment = self.db.get_alignment(famid)
            tree = alignment.get_tree()

            # get contigs
            job_contigs = []
            for contigid, strand in genefamily_contig_map[famid] :
                job_contigs.append(self._correct_strand(contigs[contigid], strand))

            # queue the job
            job = PaganJob(self.job_callback,
                           job_contigs,
                           famid,
                           alignment,
                           tree,
                           self.min_alignidentity,
                           self.min_alignoverlap)
            self.q.enqueue(job)

        except GluttonDBError as gde :
            # this means we have never heard of this gene family
            self.log.warn(str(gde))

        except GluttonDBFileError as gdfe :
            # this means we have heard of the gene family, but the
            # alignment files were missing...
            self.log.warn(str(gdfe))
class Aligner(object) :
    """Assigns contigs to reference genes and queues one alignment job per gene family."""

    def __init__(self, top_level_directory, reference_fname, min_length, min_hitidentity, min_hitlength, max_evalue, batch_size, min_alignidentity, min_alignoverlap) :
        """Store the thresholds and set up the search, parameters, reference database and run information."""
        self.directory = join(top_level_directory, 'alignments')

        # glutton
        self.min_length = min_length

        # blast
        self.min_hitidentity = min_hitidentity
        self.min_hitlength = min_hitlength
        self.max_evalue = max_evalue

        # pagan
        self.min_alignidentity = min_alignidentity
        self.min_alignoverlap = min_alignoverlap

        check_dir(self.directory, create=True)

        self.search = All_vs_all_search(batch_size)
        self.cleanup_files = []

        self.q = None
        self.lock = threading.Lock()
        self.complete_jobs = 0
        self.total_jobs = 0

        self.log = get_log()

        self.param = GluttonParameters(top_level_directory)
        self.db = GluttonDB(reference_fname)
        self.param.set_reference(self.db)

        self.resume = self.param.able_to_resume()

        self.info = GluttonInformation(self.directory, self.param, self.db, resume=self.resume)
        self.param.set_full_checksum()

    def _read_contigs(self) :
        """Read the contigs of every sample, keyed by query id.

        Records shorter than ``self.min_length`` are skipped.
        """
        contigs = {}

        for label in self.param.get_sample_ids() :
            fname = self.param.get_contigs(label)
            accepted = 0
            # 'ambiguous' is never incremented, sequences containing 'N'
            # are not filtered out
            rejected = { 'length' : 0, 'ambiguous' : 0 }

            for record in SeqIO.parse(fname, 'fasta') :
                if len(record) < self.min_length :
                    rejected['length'] += 1
                    continue

                qid = self.info.get_query_from_contig(label, record.description)
                contigs[qid] = biopy_to_gene(record, qid)
                accepted += 1

            self.log.info("%s: read %d contigs (rejected %d due to length < %d)" %
                (fname, accepted, rejected['length'], self.min_length))

        return contigs

    def stop(self) :
        """Stop the search and the job queue, keep what was found so far and clean up."""
        self.search.stop()
        self.info.update_query_gene_mapping(self.search.get_intermediate_results())

        if self.q :
            self.q.stop()

        rm_f(self.cleanup_files)

        self.info.flush()
        self.param.flush()

    def _correct_strand(self, contig, strand) :
        """Call ``reverse_complement()`` on ``contig`` when ``strand`` is '-' and return ``contig``."""
        if strand == '-' :
            contig.reverse_complement()

        return contig

    def align(self) :
        """Assign contigs to genes, then queue one PaganJob per gene family.

        Contigs that have no gene assignment yet are searched against the
        extracted database first.  Families already recorded as run are
        skipped.  A family whose alignment files are missing is split up
        and queued gene by gene instead.
        """
        self.log.info("starting alignment procedure")

        # convert the names of the contigs to something no program can complain about
        # + filter out the ones that could never have a long enough alignment
        contigs = self._read_contigs()
        pending_contigs = [ contigs[qid] for qid in self.info.pending_queries() ]

        self.log.info("%d contigs have not been assigned to genes..." % len(pending_contigs))

        # depending on when the program was terminated this step may be complete or partially
        # complete
        if pending_contigs :
            db_fname = self.db.extract_all()
            self.cleanup_files.append(db_fname)

            # do an all vs all search of contigs vs database of transcripts
            # return a dict of tmp ids with gene ids
            assignments = self.search.process(db_fname,
                                              pending_contigs,
                                              self.db.nucleotide,
                                              self.min_hitidentity,
                                              self.min_hitlength,
                                              self.max_evalue)
            self.info.update_query_gene_mapping(assignments)

            rm_f(db_fname)

        # save intermediate results
        self.info.flush()

        # use the database to convert the mapping from tmp id -> gene
        # to gene family -> list of (tmp id, strands)
        genefamily_contig_map = self.info.build_genefamily2contigs()

        assigned = sum([ len(pairs) for pairs in genefamily_contig_map.values() ])
        self.log.info("%d contigs assigned to %d gene families" % (assigned, len(genefamily_contig_map)))
        self.log.info("(%d have already been run)" % self.info.len_genefamily2filename())

        if self.info.len_genefamily2filename() == len(genefamily_contig_map) :
            self.log.info("alignment already done, exiting early...")
            return

        self.log.info("starting alignments...")

        # queue all the alignments up using a work queue and pagan
        self.q = WorkQueue()

        self.total_jobs = len(genefamily_contig_map) - self.info.len_genefamily2filename()
        self.complete_jobs = -1
        self._progress()

        for famid in self.sort_keys_by_complexity(genefamily_contig_map) :
            # ignore the jobs that have already been run
            if self.info.in_genefamily2filename(famid) :
                continue

            try :
                # get the alignment and tree from the database
                alignment = self.db.get_alignment(famid)
                tree = alignment.get_tree()

                # get contigs
                job_contigs = []
                for contigid, strand in genefamily_contig_map[famid] :
                    job_contigs.append(self._correct_strand(contigs[contigid], strand))

                # queue the job
                job = PaganJob(self.job_callback,
                               job_contigs,
                               famid,
                               alignment,
                               tree,
                               self.min_alignidentity,
                               self.min_alignoverlap)
                self.q.enqueue(job)

                # avoid the split code later in the loop...
                continue

            except GluttonDBError as gde :
                # this means we have never heard of this gene family
                self.log.warn(str(gde))
                continue

            except GluttonDBFileError as gdfe :
                # this means we have heard of the gene family, but the
                # alignment files were missing...
                self.log.warn(str(gdfe))

            # okay, the gene family was not aligned for some reason
            # instead we will split the gene family into constituent genes
            # and handle each one separately...
            self.log.warn("gene family was not aligned, breaking down into separate genes...")

            # NOTE(review): this adds the number of contigs, while the jobs
            # queued below are one per gene - confirm this is intended
            self.total_jobs += (len(genefamily_contig_map[famid]) - 1)

            # collect contigs by gene
            gene2contigs = collections.defaultdict(list)

            for contigid, strand in genefamily_contig_map[famid] :
                try :
                    geneid = self.info.query_to_gene(contigid)

                except KeyError :
                    # this should be impossible
                    self.log.warn("no gene assignment for %s" % contigid)
                    continue

                gene2contigs[geneid].append((contigid, strand))

            # run each gene separately
            for geneid in gene2contigs :
                try :
                    alignment = [ self.db.get_gene(geneid) ]

                except GluttonDBError as gde :
                    self.log.warn(str(gde))
                    continue

                gene_contigs = [ self._correct_strand(contigs[contigid], strand)
                                 for contigid, strand in gene2contigs[geneid] ]

                # queue the job
                job = PaganJob(self.job_callback,
                               gene_contigs,
                               geneid,
                               alignment,
                               None,
                               self.min_alignidentity,
                               self.min_alignoverlap)
                self.q.enqueue(job)
class All_vs_all_search(object):
    """Runs batched blastx jobs and records the first acceptable hit per query."""

    def __init__(self, batch_size=100):
        """Create an idle search that submits ``batch_size`` queries per job."""
        self.nucleotide = False
        self.min_hitidentity = None
        self.min_hitlength = None
        self.max_evalue = None
        self.batch_size = batch_size
        self.log = get_log()
        self.cleanup_files = []
        self.gene_assignments = {}
        self.lock = threading.Lock()
        self.q = None
        self.total_jobs = 0
        self.complete_jobs = 0

    def _batch(self, x):
        """Yield the items of ``x`` in lists of at most ``self.batch_size``.

        Nothing is yielded for an empty input, and no empty trailing list
        is produced when the input divides evenly into batches.
        """
        tmp = []

        for i in x:
            tmp.append(i)

            if len(tmp) == self.batch_size:
                yield tmp
                tmp = []

        # the generator simply ends when nothing is left over; raising
        # StopIteration here would surface as RuntimeError under PEP 479
        if tmp:
            yield tmp

    def process(self, db, queries, nucleotide, min_hitidentity, min_hitlength, max_evalue):
        """Search ``queries`` against ``db`` and return the gene assignments.

        The thresholds are stored on the instance, a blast database is
        built from ``db``, one ``BlastJob`` is queued per batch of queries
        and the call blocks until the work queue has drained.

        Returns ``self.gene_assignments``.
        """
        self.nucleotide = nucleotide
        self.min_hitidentity = min_hitidentity
        self.min_hitlength = min_hitlength
        self.max_evalue = max_evalue

        # we need to deal with the index files here because
        # all of the blastx jobs need them
        self.cleanup_files += [db + i for i in [".phr", ".pin", ".psq"]]

        # creates db + {phr,pin,psq} in same dir as db
        self.log.info("creating blast db...")
        Blast.makedb(db)  # XXX THIS IS ALWAYS PROTEIN, BECAUSE WE WANT TO RUN BLASTX

        # queue up the jobs
        self.log.info("starting local alignments...")
        self.q = WorkQueue()

        self.total_jobs = len(queries)
        # start one batch behind so the first _progress() call reports zero
        self.complete_jobs = -self.batch_size
        self._progress()

        for query in self._batch(queries):
            self.q.enqueue(BlastJob(self.job_callback, db, query, "blastx"))

        self.log.debug("waiting for job queue to drain...")
        self.q.join()

        rm_f(self.cleanup_files)

        return self.gene_assignments

    def stop(self):
        """Stop the work queue, if there is one, and remove the registered files."""
        if self.q:
            self.q.stop()

        rm_f(self.cleanup_files)

    def get_intermediate_results(self):
        """Return the assignments collected so far."""
        return self.gene_assignments

    def _progress(self):
        """Advance the counter by one batch and write the progress line to stderr."""
        # the counter never exceeds the total
        self.complete_jobs = min(self.complete_jobs + self.batch_size, self.total_jobs)

        sys.stderr.write("\rProgress: %d / %d blastx alignments " % (self.complete_jobs, self.total_jobs))

        if self.complete_jobs == self.total_jobs:
            sys.stderr.write("\n")

        sys.stderr.flush()

    def job_callback(self, job):
        """Record the results of a finished job.

        For a successful job the first hit of each query that passes the
        e-value, identity and length thresholds is stored as
        ``(sseqid, strand)``; queries of that job left without a hit are
        stored as ``None``.
        """
        self.log.debug("%d blast results returned" % len(job.results))

        # the context manager releases the lock even if something below
        # raises, so later callbacks cannot block forever
        with self.lock:
            self._progress()

            # NOTE(review): the unmatched-query pass is kept inside the
            # success branch - confirm failed jobs should stay unassigned
            if job.success():
                for br in job.results:
                    # length = max(br.qstart, br.qend) - min(br.qstart, br.qend)
                    strand = "+" if br.qstart < br.qend else "-"

                    if (
                        (br.qseqid in self.gene_assignments)
                        or (self.max_evalue < br.evalue)
                        or (self.min_hitidentity > br.pident)
                        or (self.min_hitlength > br.length)
                    ):
                        continue

                    self.gene_assignments[br.qseqid] = (br.sseqid, strand)

                for q in job.input:
                    if q.id not in self.gene_assignments:
                        self.gene_assignments[q.id] = None