def get_acs_for_protein_seq(self, seq):
    """
    Return the protein accessions associated with the sequence `seq`.

    The sequence is digested with `seq_md5`, and the digest is used as
    the sole parameter of the 'acs_for_protein_md5' query. The 'ac'
    field of every row returned by that query is collected, in row
    order. A synthetic MD5-based accession (MD5_01234abc...def56789) is
    always appended as the last element, so the returned list is never
    empty.
    """
    digest = seq_md5(seq)
    rows = self._fetchall(self._queries['acs_for_protein_md5'], [digest])
    accessions = []
    for row in rows:
        accessions.append(row['ac'])
    # The MD5-derived accession always comes last.
    accessions.append('MD5_' + digest)
    return accessions
def get_acs_for_protein_seq(self, seq):
    """
    Return the protein accessions associated with the sequence `seq`.

    The sequence is digested with `seq_md5`; a cursor obtained from
    `self._get_cursor()` then executes the 'acs_for_protein_md5'
    statement with the digest as its only parameter. The 'ac' field of
    every fetched row is collected, in row order. A synthetic MD5-based
    accession (MD5_01234abc...def56789) is always appended as the last
    element, so the returned list is never empty.
    """
    digest = seq_md5(seq)
    cursor = self._get_cursor()
    cursor.execute(self.sql['acs_for_protein_md5'], [digest])
    accessions = [row['ac'] for row in cursor.fetchall()]
    # The MD5-derived accession always comes last.
    accessions.append('MD5_' + digest)
    return accessions
def load_txinfo(session, opts, cf):
    """
    Load transcript info records from a gzipped file into the database.

    For each record produced by `ufti.TxInfoReader`:

    1. Fetch the `usam.Transcript` with the record's accession, or create
       it. If an existing transcript has different CDS coordinates, it is
       renamed to "<ac>/<cds_start_i>..<cds_end_i>" and a fresh transcript
       is created under the original accession. A changed HGNC symbol is
       updated in place.
    2. Upsert the transcript's self-alignment exon set through
       `_upsert_exon_set_record`.

    Records with an empty `exons_se_i` are skipped with a warning. Work is
    committed every `update_period` records, when the expected last row is
    reached, and once more after the loop if anything is still pending.

    :param session: database session; `query`, `execute`, `flush`, `add`
        and `commit` are called on it.
    :param opts: mapping whose "FILE" entry is the path of the gzipped
        transcript info file.
    :param cf: configuration object; `cf.get("uta", "admin_role")` names
        the role to assume, and `cf` is passed to `_get_seqfetcher`.
    :raises NoResultFound: if a record's origin name matches no
        `usam.Origin` row (logged, then re-raised).
    :raises Exception: if a transcript with CDS coordinates is missing
        from the sequence database.
    :raises AssertionError: if more than one transcript exists for an
        accession, or the CDS start / CDS md5 invariant is violated.
    """
    self_aln_method = "transcript"
    update_period = 250
    sf = None  # sequence fetcher; established on first use, below

    @lru_cache(maxsize=100)
    def _fetch_origin_by_name(name):
        # Cached: the same few origin names recur across many records.
        try:
            return session.query(usam.Origin).filter(
                usam.Origin.name == name).one()
        except NoResultFound:
            logger.error("No origin for " + name)
            raise

    # Count lines by streaming rather than materializing the whole file;
    # one line is subtracted (presumably the header row) to get the
    # expected number of records used for progress reporting.
    with gzip.open(opts["FILE"]) as count_fh:
        n_rows = sum(1 for _ in count_fh) - 1

    n_new = 0
    n_unchanged = 0
    n_cds_changed = 0
    n_exons_changed = 0
    pending = False  # True while there is work not yet committed

    with gzip.open(opts["FILE"]) as tx_fh:
        tir = ufti.TxInfoReader(tx_fh)
        logger.info("opened " + opts["FILE"])

        session.execute("set role {admin_role};".format(
            admin_role=cf.get("uta", "admin_role")))
        session.execute("set search_path = " + usam.schema_name)

        for i_ti, ti in enumerate(tir):
            if ti.exons_se_i == "":
                logger.warning(ti.ac + ": no exons?!; skipping.")
                continue

            if ti.cds_se_i:
                cds_start_i, cds_end_i = map(int, ti.cds_se_i.split(","))
            else:
                cds_start_i = cds_end_i = None

            pending = True

            # 1. Fetch or make the Transcript record
            existing = session.query(usam.Transcript).filter(
                usam.Transcript.ac == ti.ac,
            )
            n_existing = existing.count()
            assert n_existing <= 1, "Expected max 1 existing transcripts with accession {ti.ac}".format(ti=ti)

            u_tx = None
            if n_existing == 1:
                u_tx = existing[0]
                if (u_tx.cds_start_i, u_tx.cds_end_i) != (cds_start_i, cds_end_i):
                    # Rename the stale transcript out of the way; the
                    # flush happens before a replacement with the
                    # original accession is added below.
                    u_tx.ac = "{u_tx.ac}/{u_tx.cds_start_i}..{u_tx.cds_end_i}".format(u_tx=u_tx)
                    logger.warning("Transcript {ti.ac}: CDS coordinates changed!; renamed to {u_tx.ac}".format(ti=ti, u_tx=u_tx))
                    session.flush()
                    u_tx = None
                    n_cds_changed += 1

            # state: u_tx is set if a transcript was found and was
            # unchanged, or None if 1) no such was found or 2) was found
            # and had updated CDS coords.
            if u_tx is None:
                ori = _fetch_origin_by_name(ti.origin)

                if ti.cds_se_i:
                    if sf is None:
                        sf = _get_seqfetcher(cf)
                    try:
                        cds_seq = sf.fetch(ti.ac, cds_start_i, cds_end_i)
                    except KeyError:
                        raise Exception("{ac}: not in sequence database".format(ac=ti.ac))
                    cds_md5 = seq_md5(cds_seq)
                else:
                    cds_md5 = None

                assert (cds_start_i is not None) ^ (cds_md5 is None), "failed: cds_start_i is None i.f.f. cds_md5_is None"
                u_tx = usam.Transcript(
                    ac=ti.ac,
                    origin=ori,
                    hgnc=ti.hgnc,
                    cds_start_i=cds_start_i,
                    cds_end_i=cds_end_i,
                    cds_md5=cds_md5,
                )
                session.add(u_tx)

            if u_tx.hgnc != ti.hgnc:
                logger.warning("{ti.ac}: HGNC symbol changed from {u_tx.hgnc} to {ti.hgnc}".format(
                    u_tx=u_tx, ti=ti))
                u_tx.hgnc = ti.hgnc

            # state: transcript now exists, either existing or freshly-created

            # 2. Upsert an ExonSet attached to the Transcript
            new_es, old_es = _upsert_exon_set_record(
                session, ti.ac, ti.ac, 1, self_aln_method, ti.exons_se_i)

            outcome = (new_es is not None, old_es is not None)
            if outcome == (True, False):
                n_new += 1
            elif outcome == (True, True):
                logger.warning("Transcript {ti.ac} exon structure changed".format(ti=ti))
                n_exons_changed += 1
            elif outcome == (False, True):
                logger.debug("Transcript {ti.ac} exon structure unchanged".format(ti=ti))
                n_unchanged += 1

            if i_ti % update_period == 0 or i_ti + 1 == n_rows:
                session.commit()
                pending = False
                logger.info("{i_ti}/{n_rows} {p:.1f}%; {n_new} new, {n_unchanged} unchanged, "
                            "{n_cds_changed} cds changed, {n_exons_changed} exons changed; commited".format(
                                i_ti=i_ti, n_rows=n_rows,
                                n_new=n_new, n_unchanged=n_unchanged,
                                n_cds_changed=n_cds_changed, n_exons_changed=n_exons_changed,
                                p=(i_ti + 1) / n_rows * 100))

    # The in-loop commit is skipped when the final record is skipped or
    # when the line count differs from the number of records read; make
    # sure the trailing batch is not left uncommitted.
    if pending:
        session.commit()
        logger.info("final commit: {n_new} new, {n_unchanged} unchanged, "
                    "{n_cds_changed} cds changed, {n_exons_changed} exons changed".format(
                        n_new=n_new, n_unchanged=n_unchanged,
                        n_cds_changed=n_cds_changed, n_exons_changed=n_exons_changed))