def __get_binned_data(self, chr, start, end, type):
    """Collect the gene or transcript records stored in the bins covering a region.

    The parameter names ``chr`` and ``type`` shadow builtins; they are kept
    because they are part of the method's signature.

    Args:
        chr: Chromosome key used to index the binned data (e.g. ``"M"``).
        start: Region start, passed straight to ``region2bins``.
        end: Region end, passed straight to ``region2bins``.
        type: ``"gene"`` to search ``self.Genes`` or ``"transcript"`` to
            search ``self.Transcripts``.

    Returns:
        list: Every record found under the bins returned by
        ``region2bins(start, end)`` for the chromosome. An empty list is
        returned when the chromosome is not present in the data.

    Raises:
        ValueError: If ``type`` is neither ``"gene"`` nor ``"transcript"``.
        GafInvalidChromosomeValue: If ``chr`` is ``"M"`` and the data has
            neither an ``"M"`` nor an ``"M_rCRS"`` entry.
    """
    # NOTE(review): an identical definition of this method follows directly
    # after this one in the file. If both live in the same scope, the later
    # one is what gets bound -- consider removing one of them.
    if type == "gene":
        data = self.Genes
    elif type == "transcript":
        data = self.Transcripts
    else:
        # Previously this fell through and failed later with UnboundLocalError.
        raise ValueError(
            "Unsupported type for Gaf search: %r (expected 'gene' or 'transcript')" % (type,)
        )

    # GAF uses M_rCRS whereas mutations are often using M for the chromosome field.
    if chr == "M" and "M" not in data:
        if "M_rCRS" not in data:
            # Both values belong inside the formatting tuple; with only one
            # supplied, the '%' operator itself raised TypeError.
            raise GafInvalidChromosomeValue(
                "Unable to process mitochondria mutation with chr: %s .... data keys are %s"
                % (str(chr), str(data.keys()))
            )
        # TODO: Verify that it is okay to do this.
        chr = "M_rCRS"

    if chr not in data:
        # An unknown chromosome is logged and treated as "no hits" rather
        # than raised as an error.
        self.logger.warning("Invalid chromosome value for Gaf search: %s", str(chr))
        return []

    chrom_bins = data[chr]
    records = []
    for b in region2bins(start, end):
        records.extend(chrom_bins.get(b, []))
    return records
def __get_binned_data(self, chr, start, end, type):
    """Collect the gene or transcript records stored in the bins covering a region.

    The parameter names ``chr`` and ``type`` shadow builtins; they are kept
    because they are part of the method's signature.

    Args:
        chr: Chromosome key used to index the binned data (e.g. ``'M'``).
        start: Region start, passed straight to ``region2bins``.
        end: Region end, passed straight to ``region2bins``.
        type: ``'gene'`` to search ``self.Genes`` or ``'transcript'`` to
            search ``self.Transcripts``.

    Returns:
        list: Every record found under the bins returned by
        ``region2bins(start, end)`` for the chromosome. An empty list is
        returned when the chromosome is not present in the data.

    Raises:
        ValueError: If ``type`` is neither ``'gene'`` nor ``'transcript'``.
        GafInvalidChromosomeValue: If ``chr`` is ``'M'`` and the data has
            neither an ``'M'`` nor an ``'M_rCRS'`` entry.
    """
    # NOTE(review): an identical definition of this method appears directly
    # before this one in the file. If both live in the same scope, this later
    # one is what gets bound -- consider removing one of them.
    if type == 'gene':
        data = self.Genes
    elif type == 'transcript':
        data = self.Transcripts
    else:
        # Previously this fell through and failed later with UnboundLocalError.
        raise ValueError(
            "Unsupported type for Gaf search: %r (expected 'gene' or 'transcript')" % (type,))

    # GAF uses M_rCRS whereas mutations are often using M for the chromosome field.
    if chr == 'M' and 'M' not in data:
        if 'M_rCRS' not in data:
            # Both values belong inside the formatting tuple; with only one
            # supplied, the '%' operator itself raised TypeError.
            raise GafInvalidChromosomeValue(
                "Unable to process mitochondria mutation with chr: %s .... data keys are %s"
                % (str(chr), str(data.keys())))
        # TODO: Verify that it is okay to do this.
        chr = 'M_rCRS'

    if chr not in data:
        # An unknown chromosome is logged and treated as "no hits" rather
        # than raised as an error.
        self.logger.warning("Invalid chromosome value for Gaf search: %s", str(chr))
        return []

    chrom_bins = data[chr]
    records = []
    for b in region2bins(start, end):
        records.extend(chrom_bins.get(b, []))
    return records
def _get_binned_transcripts_given_index(self, chr, start, end, index_dict):
    """Return the unique transcripts indexed under the bins overlapping a region.

    Each bin produced by ``region2bins`` is looked up in ``index_dict`` under
    the key ``<chr>_<bin>``. Bins with no entry are skipped.

    Args:
        chr: Chromosome string used as the key prefix.
        start: Region start; converted with ``int()`` before binning.
        end: Region end; converted with ``int()`` before binning.
        index_dict: Mapping from ``"<chr>_<bin>"`` keys to iterables of
            transcripts.

    Returns:
        set: All transcripts found across the looked-up bins, de-duplicated.
    """
    collected = []
    for bin_id in region2bins(int(start), int(end)):
        try:
            collected.extend(index_dict[chr + "_" + str(bin_id)])
        except KeyError:
            # Nothing indexed under this bin; move on to the next one.
            continue
    return set(collected)
def test_build_ensembl_transcripts_by_genomic_location_index(self):
    """Test that we can get an ensembl transcript from a genomic position"""
    protocol = "file"
    transcript_index_filename = "out/test_ensemble_gtf_for_gp.db"
    output_filename = "out/test_ensemble_gtf_for_gp.db.idx"
    # Remove any index left over from a previous run.
    shutil.rmtree(output_filename, ignore_errors=True)

    ensembl_input_gtf = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf"
    ensembl_input_fasta = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa"

    # Build the transcript index first, then the genomic-location index on top of it.
    genome_build_factory = GenomeBuildFactory()
    genome_build_factory.build_ensembl_transcript_index(
        [ensembl_input_gtf], [ensembl_input_fasta], transcript_index_filename, protocol=protocol)
    genome_build_factory.build_ensembl_transcripts_by_genomic_location_index(
        transcript_index_filename, output_filename, protocol=protocol)

    # Now load the index and look something up.
    gp_index = Shove(protocol + "://" + output_filename)
    gt_transcript_id = "YAL067C"

    # NOTE(review): if none of the bin keys below exist in the index, no
    # assertion runs and the test passes without checking anything. Confirm
    # that these coordinates hit at least one indexed bin.
    indexed_keys = gp_index.keys()
    for bin_id in region2bins(1496172, 1496400):
        key = 'I_' + str(bin_id)
        if key in indexed_keys:
            # assertEqual reports both values on failure, unlike assertTrue(a == b).
            self.assertEqual(gp_index[key], gt_transcript_id)