def __init__(self, seq_repo_path=None, regions_preload=None, preload_pos_margin=500, assembly_name=None):
    '''
    Configure the sequence fetcher, the assembly accession map and any preloaded regions.

    :param seq_repo_path: Path to local seqrepo directory. If None, read HGVS_SEQREPO_DIR environment variable.
        If neither is set, sequences are fetched with the remote provider (seqfetcher.fetch_seq).
    :param regions_preload: Iterable[ChrInterval], optionally preload these genomic regions
    :param preload_pos_margin: adding margin at the end of a preloaded genome in order to have data to verify
        structural variants across the end of a gene
    :param assembly_name: assembly name handed to assemblies.make_name_ac_map; falls back to
        self.DEFAULT_ASSY_NAME when empty or None
    '''
    if not seq_repo_path:
        seq_repo_path = os.environ.get("HGVS_SEQREPO_DIR")

    if seq_repo_path:
        seq_repo = SeqRepo(seq_repo_path)
        self.seq_repo_fetcher = seq_repo.fetch
    else:
        # logging.warn is a deprecated alias; logging.warning is the supported spelling.
        logging.warning("Using remote sequence provider.")
        self.seq_repo_fetcher = seqfetcher.fetch_seq

    self.assembly_name = assembly_name or self.DEFAULT_ASSY_NAME
    self.assy_map = assemblies.make_name_ac_map(self.assembly_name)

    self.preloaded_regions = {}
    if regions_preload:
        # Each region is fetched with preload_pos_margin extra positions past its end.
        self.preloaded_regions = build_interval_trees_by_chr(
            regions_preload,
            lambda c, s, e: self._fetch_seq(c, s, e + preload_pos_margin))
def __init__(self):
    """Pick the sequence backend from the environment.

    Precedence:

    1. ``HGVS_SEQREPO_DIR`` is set: a local SeqRepo is used for *all* sequences.
    2. ``HGVS_SEQREPO_URL`` is set: an instance of seqrepo-rest-service is used
       for *all* sequences
       (see https://github.com/biocommons/seqrepo-rest-service for more info).
    3. Otherwise: fall back to remote sequence fetching.

    Sets ``self.sr``, ``self.fetcher`` and ``self.source`` and logs the
    selected source.
    """
    seqrepo_dir = os.environ.get("HGVS_SEQREPO_DIR")
    seqrepo_url = os.environ.get("HGVS_SEQREPO_URL")

    if seqrepo_dir:
        from biocommons.seqrepo import SeqRepo

        self.sr = SeqRepo(seqrepo_dir)
        self.fetcher = lambda ac, start_i=None, end_i=None: self.sr.fetch(
            ac, start_i, end_i)
        self.source = "SeqRepo ({})".format(seqrepo_dir)
    elif seqrepo_url:
        from biocommons.seqrepo.dataproxy import SeqRepoRESTDataProxy

        self.sr = SeqRepoRESTDataProxy(seqrepo_url)

        def _fetch_seq_rest(ac, start_i=None, end_i=None):
            return self.sr.get_sequence(ac, start_i, end_i)

        self.fetcher = _fetch_seq_rest
        self.source = f"SeqRepo REST ({seqrepo_url})"
    else:
        self.sr = None
        self.fetcher = bioutils.seqfetcher.fetch_seq
        self.source = "bioutils.seqfetcher (network fetching)"

    _logger.info("Fetching sequences with " + self.source)
def test_refseq_translation(tmpdir_factory):
    """Check the namespace reported for an alias stored under "NCBI".

    With ``translate_ncbi_namespace=False`` the alias is reported as "NCBI";
    with ``translate_ncbi_namespace=True`` it is reported as "RefSeq".
    """
    root = str(tmpdir_factory.mktemp('seqrepo'))

    writer = SeqRepo(root, writeable=True)
    writer.store("NCBISEQUENCE", [{"namespace": "NCBI", "alias": "ncbiac"}])
    writer.commit()
    del writer  # drop the writeable instance before reopening read-only

    for translate, expected_namespace in ((False, "NCBI"), (True, "RefSeq")):
        reader = SeqRepo(root, writeable=False, translate_ncbi_namespace=translate)
        found = list(reader.aliases.find_aliases(alias="ncbiac"))
        assert len(found) == 1
        assert found[0]["namespace"] == expected_namespace
def test_seqrepo_dir_not_exist(tmpdir_factory):
    """Opening a seqrepo directory that does not exist must raise OSError."""
    existing = tmpdir_factory.mktemp('seqrepo')
    # Appending a suffix to a freshly created temp dir yields a path that was never created.
    missing = str(existing) + "-IDONTEXIST"

    with pytest.raises(OSError) as excinfo:
        SeqRepo(missing, writeable=False)

    message = str(excinfo.value)
    assert "Unable to open SeqRepo directory" in message
def get_seqrepo(self, seqrepo_dir) -> SeqRepo:
    """Return SeqRepo instance if seqrepo_dir is an existing directory.

    :param Path seqrepo_dir: Path to seqrepo directory
    :return: SeqRepo instance
    :raises NotADirectoryError: if ``seqrepo_dir`` does not exist, or exists
        but is not a directory
    """
    root = Path(seqrepo_dir)
    if not root.exists():
        raise NotADirectoryError(f"Could not find {seqrepo_dir}")
    # exists() alone lets a regular file through; reject it here with the
    # exception type this method already advertises.
    if not root.is_dir():
        raise NotADirectoryError(f"{seqrepo_dir} is not a directory")
    return SeqRepo(seqrepo_dir)
def __init__(self):
    """Pick the sequence backend from the environment.

    When ``HGVS_SEQREPO_DIR`` is defined, a local SeqRepo is used for *all*
    sequences; otherwise remote sequence fetching is used.  Sets ``self.sr``,
    ``self.fetcher`` and ``self.source`` and logs the selected source.
    """
    seqrepo_dir = os.environ.get("HGVS_SEQREPO_DIR")

    if not seqrepo_dir:
        self.sr = None
        self.fetcher = bioutils.seqfetcher.fetch_seq
        self.source = "bioutils.seqfetcher (network fetching)"
    else:
        from biocommons.seqrepo import SeqRepo

        self.sr = SeqRepo(seqrepo_dir)
        self.fetcher = lambda ac, start_i=None, end_i=None: self.sr.fetch(
            ac, start_i, end_i)
        self.source = "SeqRepo ({})".format(seqrepo_dir)

    _logger.info("Fetching sequences with " + self.source)
def make_seqrepo(writeable):
    """Create the SeqRepo at /tmp/sr, store one sequence, and return an instance.

    The repository is always opened writeable first so the sequence can be
    stored.  Only when ``writeable`` is exactly ``False`` is that instance
    dropped and the repository reopened with ``writeable=False``.
    """
    rose_aliases = [
        {"namespace": "en", "alias": "rose"},
        {"namespace": "fr", "alias": "rose"},
    ]

    repo = SeqRepo("/tmp/sr", writeable=True)
    repo.store("SMELLASSWEET", rose_aliases)

    if writeable is False:
        del repo
        repo = SeqRepo("/tmp/sr", writeable=writeable)

    print("pid {pid} created {sr}".format(pid=os.getpid(), sr=repo))
    return repo
def __init__(self):
    """Set ``self.fetcher`` according to the environment.

    When ``HGVS_SEQREPO_DIR`` is defined, a local SeqRepo is used for *all*
    sequences; otherwise remote sequence fetching is used.
    """
    seqrepo_dir = os.environ.get("HGVS_SEQREPO_DIR")

    if not seqrepo_dir:
        self.fetcher = bioutils.seqfetcher.fetch_seq
        logger.info("Using remote sequence fetching")
        return

    from biocommons.seqrepo import SeqRepo

    # The repository lives only in this closure; it is not stored on self.
    repo = SeqRepo(seqrepo_dir)
    self.fetcher = lambda ac, start_i=None, end_i=None: repo.fetch(
        ac, start_i, end_i)
    logger.info(
        "Using SeqRepo({}) sequence fetching".format(seqrepo_dir))
class SeqFetcher(object):
    """Mixin that gives HGVS data providers access to sequence data.

    Intended primarily for data providers that don't otherwise have access
    to sequences.  The backend is chosen once, at construction time: a local
    SeqRepo when HGVS_SEQREPO_DIR is defined, otherwise
    bioutils.seqfetcher.fetch_seq (network fetching).

    >> sf = SeqFetcher()
    >> sf.fetch_seq('NP_056374.2',0,10)
    'MESRETLSSS'

    """

    def __init__(self):
        """Select the backend and record it in ``sr``, ``fetcher`` and ``source``."""
        seqrepo_dir = os.environ.get("HGVS_SEQREPO_DIR")

        if not seqrepo_dir:
            # No local repository configured: fall back to remote fetching.
            self.sr = None
            self.fetcher = bioutils.seqfetcher.fetch_seq
            self.source = "bioutils.seqfetcher (network fetching)"
        else:
            # A local repository is configured: use it for *all* sequences.
            from biocommons.seqrepo import SeqRepo

            self.sr = SeqRepo(seqrepo_dir)
            self.fetcher = lambda ac, start_i=None, end_i=None: self.sr.fetch(
                ac, start_i, end_i)
            self.source = "SeqRepo ({})".format(seqrepo_dir)

        _logger.info("Fetching sequences with " + self.source)

    def fetch_seq(self, ac, start_i=None, end_i=None):
        """Return what the configured fetcher yields for ``ac``, ``start_i``, ``end_i``.

        Any exception raised by the fetcher is re-raised as
        HGVSDataNotAvailableError, with the accession, the source and the
        original error in the message.
        """
        try:
            fetched = self.fetcher(ac, start_i, end_i)
        except Exception as ex:
            raise HGVSDataNotAvailableError(
                "Failed to fetch {ac} from {self.source} ({ex})".format(
                    ac=ac, ex=ex, self=self))
        return fetched
def seqrepo_keepcase(tmpdir_factory):
    """Return a writeable SeqRepo in a fresh temp directory, opened with ``upcase=False``."""
    root = tmpdir_factory.mktemp('seqrepo')
    return SeqRepo(str(root), upcase=False, writeable=True)
def seqrepo_ro(tmpdir_factory):
    """Return a SeqRepo opened with default arguments on a freshly created repository.

    The repository is first opened writeable in a new temp directory, then
    that instance is dropped and the directory is reopened.
    """
    root = str(tmpdir_factory.mktemp('seqrepo'))

    initializer = SeqRepo(root, writeable=True)
    del initializer  # close it

    return SeqRepo(root)
def _get_seqrepo(cf):
    """Open the SeqRepo named by ``cf.get("sequences", "seqrepo")``.

    The repository is opened with ``translate_ncbi_namespace=True`` and the
    opened instance is logged before being returned.
    """
    # NOTE(review): cf looks like a ConfigParser-style object (section, option) -- confirm.
    repo = SeqRepo(
        root_dir=cf.get("sequences", "seqrepo"),
        translate_ncbi_namespace=True,
    )
    logger.info("Opened {sr}".format(sr=repo))
    return repo
def create_text(self, defn):
    """Build a ``models.Text`` for ``defn`` and set its ``_id`` from ``ga4gh_identify``."""
    text_vo = models.Text(definition=defn)
    text_vo._id = ga4gh_identify(text_vo)
    return text_vo


if __name__ == "__main__":
    # Roundtrip check: translate an HGVS allele, store it, read it back and
    # require that the stored and retrieved objects compare equal.
    import os
    from biocommons.seqrepo import SeqRepo
    from ga4gh.vrs.dataproxy import SeqRepoRESTDataProxy, SeqRepoDataProxy

    if "SEQREPO_DIR" not in os.environ:
        # No local repository configured: use the REST data proxy.
        seqrepo_url = os.environ.get(
            "GA4GH_VRS_DATAPROXY_URI",
            "https://services.genomicmedlab.org/seqrepo")
        data_proxy = SeqRepoRESTDataProxy(base_url=seqrepo_url)
    else:
        seqrepo_dir = os.environ.get("SEQREPO_DIR",
                                     "/usr/local/share/seqrepo/latest")
        data_proxy = SeqRepoDataProxy(SeqRepo(root_dir=seqrepo_dir))

    object_store = {}
    av = AnyVar(data_proxy=data_proxy, object_store=object_store)

    v = av.translate_allele("NM_000551.3:c.1A>T", fmt="hgvs")
    vid = av.put_object(v)
    v2 = av.get_object(vid, deref=True)
    assert v == v2  # roundtrip test
def search():
    """Return the slice ``[780000:780020]`` of ``NC_000001.11`` from the local SeqRepo.

    The repository is opened from the fixed path
    ``/usr/local/share/seqrepo/latest`` on every call.
    """
    from biocommons.seqrepo import SeqRepo

    sr = SeqRepo("/usr/local/share/seqrepo/latest")
    # Alternative left by the original author: return every sequence, ordered by key.
    #return [sr[key] for key in sorted(sr.keys())]
    # The original expression was missing its closing bracket (syntax error).
    return sr["NC_000001.11"][780000:780020]
def _get_seqrepo(cf):
    """Open the SeqRepo named by ``cf.get("sequences", "seqrepo")``.

    Logs the directory after the repository has been opened and returns the
    SeqRepo instance.
    """
    # NOTE(review): cf looks like a ConfigParser-style object (section, option) -- confirm.
    root_dir = cf.get("sequences", "seqrepo")
    repo = SeqRepo(root_dir=root_dir)
    logger.info("Opened sequence directory " + root_dir)
    return repo
def seqrepo(tmpdir_factory):
    """Return a writeable SeqRepo rooted in a fresh temp directory."""
    root = tmpdir_factory.mktemp('seqrepo')
    return SeqRepo(str(root), writeable=True)
def _get_seqrepo(cf):
    """Open the SeqRepo named by ``cf.get("sequences", "seqrepo")``.

    The opened instance is logged and then returned.
    """
    # NOTE(review): cf looks like a ConfigParser-style object (section, option) -- confirm.
    repo = SeqRepo(root_dir=cf.get("sequences", "seqrepo"))
    logger.info("Opened {sr}".format(sr=repo))
    return repo