def load_uniprot_mappings(self, ac_types=None, bi=False, ncbi_tax_id=None):

    ncbi_tax_id = self.get_tax_id(ncbi_tax_id)
    tables = self.tables[ncbi_tax_id]
    # make a real list so we can safely remove items below
    # (`dict.keys()` returns a view in Python 3):
    ac_types = (
        list(ac_types)
        if ac_types is not None
        else list(self.name_types.keys())
    )

    # creating empty MappingTable objects:
    for ac_typ in ac_types:
        tables[(ac_typ, 'uniprot')] = MappingTable(
            ac_typ,
            'uniprot',
            'protein',
            ac_typ,
            None,
            ncbi_tax_id,
            None,
            log=self.ownlog,
        )

    # attempting to load them from pickle dumps in the cache;
    # iterate over a copy, as we remove the successfully loaded types:
    for ac_typ in list(ac_types):
        md5ac = common.md5((ac_typ, 'uniprot', bi, ncbi_tax_id))
        cachefile = os.path.join('cache', md5ac)
        if self.cache and os.path.isfile(cachefile):
            with open(cachefile, 'rb') as fp:
                tables[(ac_typ, 'uniprot')].mapping = pickle.load(fp)
            ac_types.remove(ac_typ)
            tables[(ac_typ, 'uniprot')].mid = md5ac

    # loading the remaining ones from the big UniProt mapping file:
    if len(ac_types) > 0:
        url = urls.urls['uniprot_idmap_ftp']['url']
        c = curl.Curl(url, silent=False, large=True)
        prg = progress.Progress(c.size, 'Processing ID conversion list', 99)
        for l in c.result:
            prg.step(len(l))
            l = l.decode('ascii').strip().split('\t')
            for ac_typ in ac_types:
                if len(l) > 2 and self.name_types[ac_typ] == l[1]:
                    # strip the version number from the foreign ID
                    # and the isoform number from the UniProt AC:
                    other = l[2].split('.')[0]
                    uniprot = l[0].split('-')[0]
                    # look up by the stripped ID, the same key we store under:
                    if other not in tables[(ac_typ, 'uniprot')].mapping['to']:
                        tables[(ac_typ, 'uniprot')].mapping['to'][other] = []
                    tables[(ac_typ, 'uniprot')].mapping['to'][other].append(
                        uniprot)
                    if bi:
                        if uniprot not in tables[(
                                ac_typ, 'uniprot')].mapping['from']:
                            tables[(ac_typ,
                                    'uniprot')].mapping['from'][uniprot] = []
                        tables[(ac_typ, 'uniprot')].mapping['from'][
                            uniprot].append(other)
        prg.terminate()

        if self.cache:
            for ac_typ in ac_types:
                # dump under the same key used for loading above:
                md5ac = common.md5((ac_typ, 'uniprot', bi, ncbi_tax_id))
                cachefile = os.path.join('cache', md5ac)
                with open(cachefile, 'wb') as fp:
                    pickle.dump(tables[(ac_typ, 'uniprot')].mapping, fp)
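# A minimal usage sketch, assuming the owner object exposes the `tables`,
# `name_types` and `get_tax_id` members used above; the instance name
# `mapper` and the ID type key 'genesymbol' are hypothetical:
#
#     mapper.load_uniprot_mappings(
#         ac_types=['genesymbol'],
#         bi=True,           # also build the uniprot -> genesymbol direction
#         ncbi_tax_id=9606,  # human
#     )
#     table = mapper.tables[9606][('genesymbol', 'uniprot')]
#     uniprots = table.mapping['to'].get('EGFR', [])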
def load(self, key):

    cachefile = common.md5(json.dumps(key))
    cachefile = os.path.join(self.cachedir, cachefile)

    if os.path.exists(cachefile):
        with open(cachefile, 'rb') as fp:
            self.lists[key] = pickle.load(fp)
        self._log(
            'Reference list for ID type `%s` for organism `%u` '
            'has been loaded from `%s`.' % (key + (cachefile, ))
        )
    else:
        self.lists[key] = self._load(key)
        with open(cachefile, 'wb') as fp:
            pickle.dump(self.lists[key], fp)
        self._log(
            'Reference list for ID type `%s` for organism `%u` '
            'has been saved to `%s`.' % (key + (cachefile, ))
        )
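# Usage sketch: from the `%s`/`%u` placeholders in the log messages above,
# `key` is assumed to be an `(id_type, ncbi_tax_id)` tuple; the instance
# name `reflists` is hypothetical:
#
#     reflists.load(('uniprot', 9606))
#     human_uniprots = reflists.lists[('uniprot', 9606)]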
def __init__(self,
             one,
             two,
             typ,
             source,
             param,
             ncbi_tax_id,
             mysql=None,
             log=None,
             cache=False,
             cachedir='cache',
             uniprots=None):
    '''
    When initializing ID conversion tables for the first time,
    data is downloaded from UniProt and read into dictionaries.
    This takes a couple of seconds. The data is saved to pickle
    dumps, so later the tables load much faster.
    '''
    self.param = param
    self.one = one
    self.two = two
    self.typ = typ
    self.maxlOne = None
    self.maxlTwo = None
    self.mysql = mysql
    self.cache = cache
    self.cachedir = cachedir
    self.mapping = {'to': {}, 'from': {}}

    # create our own logger unless we received one:
    if log.__class__.__name__ != 'logw':
        self.session = common.gen_session_id()
        self.ownlog = logn.logw(self.session, 'INFO')
    else:
        self.ownlog = log

    if param is not None:
        self.mid = common.md5((one, two, self.param.bi, ncbi_tax_id))
        md5param = common.md5(json.dumps(self.param.__dict__))
        self.cachefile = os.path.join(self.cachedir, md5param)

        if self.cache and os.path.isfile(self.cachefile):
            # loading the table from an earlier pickle dump:
            with open(self.cachefile, 'rb') as fp:
                self.mapping = pickle.load(fp)
        elif len(self.mapping['to']) == 0 or (
                param.bi and len(self.mapping['from']) == 0):
            # the mapping is empty, hence we build it from the source:
            if os.path.exists(self.cachefile):
                os.remove(self.cachefile)
            if source == 'mysql':
                self.read_mapping_mysql(param, ncbi_tax_id)
            elif source == 'file':
                self.read_mapping_file(param, ncbi_tax_id)
            elif source == 'pickle':
                self.read_mapping_pickle(param, ncbi_tax_id)
            elif source == 'uniprot':
                self.read_mapping_uniprot(param, ncbi_tax_id)
            elif source == 'uniprotlist':
                self.read_mapping_uniprot_list(
                    param,
                    uniprots=uniprots,
                    ncbi_tax_id=ncbi_tax_id,
                )
            # dump to the cache only if the table is complete:
            if len(self.mapping['to']) and (
                    not param.bi or len(self.mapping['from'])):
                with open(self.cachefile, 'wb') as fp:
                    pickle.dump(self.mapping, fp)
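# Construction sketch: the positional arguments follow the signature above;
# `param` is assumed to be an object with a `bi` attribute plus whatever
# fields the selected reader expects, and `my_param` here is hypothetical:
#
#     table = MappingTable(
#         'genesymbol',  # one: the foreign ID type
#         'uniprot',     # two: the target ID type
#         'protein',     # typ
#         'file',        # source: selects read_mapping_file()
#         my_param,      # hypothetical input definition
#         9606,          # ncbi_tax_id
#         cache=True,
#     )
#     uniprots = table.mapping['to'].get('EGFR', [])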