def __init__(self, UniProtAC, XML = None, cache_dir = None, silent = True):
    '''
    Load the XML record for a UniProt accession and parse it into this object.

    :param UniProtAC: The UniProt accession; used to build the download URL and the cache filename ('<UniProtAC>.xml').
    :param XML: Optional XML text for the record. When given, neither the cache nor the network is consulted.
    :param cache_dir: Optional existing directory used to read/write the cached XML file.
    :param silent: When False, a "Retrieving ..." message is written via colortext before downloading.
    :raises EmptyUniProtACXMLException: if the downloaded document is blank.
    :raises Exception: if cache_dir does not exist or the XML cannot be parsed.
    :raises AssertionError: if the document does not hold exactly one <uniprot> tag containing exactly one <entry> tag.
    '''
    if cache_dir and not os.path.exists(cache_dir):
        raise Exception("The cache directory %s does not exist." % cache_dir)

    self.UniProtAC = UniProtAC
    self.silent = silent

    # Bound up front so the parse-error handler below can always refer to it,
    # including when the caller supplied the XML directly.
    cached_filepath = None

    # Get XML
    if XML is None:
        if cache_dir:
            cached_filepath = os.path.join(cache_dir, '%s.xml' % UniProtAC)
        if cached_filepath and os.path.exists(cached_filepath):
            protein_xml = read_file(cached_filepath)
        else:
            if not silent:
                colortext.write("Retrieving %s\n" % UniProtAC, "cyan")
            url = 'http://www.uniprot.org/uniprot/%s.xml' % UniProtAC
            protein_xml = http_get(url)
            if not protein_xml.strip():
                raise EmptyUniProtACXMLException('The file %s is empty.' % UniProtAC)
            # Only freshly downloaded, non-blank documents are written to the cache.
            if cached_filepath:
                write_file(cached_filepath, protein_xml)
        self.XML = protein_xml
    else:
        self.XML = XML

    self.recommended_name = None
    self.submitted_names = []
    self.alternative_names = []

    # Get DOM. Parse self.XML (not the local download variable) so that
    # caller-supplied XML is handled as well.
    try:
        self._dom = parseString(self.XML)
    except Exception:
        if cached_filepath:
            raise Exception("The UniProtAC XML for '%s' was invalid. The cached file is located at %s. Check this file - if it is not valid XML then delete the file and rerun the script." % (UniProtAC, cached_filepath))
        else:
            raise Exception("The UniProtAC XML for '%s' was invalid." % UniProtAC)

    main_tags = self._dom.getElementsByTagName("uniprot")
    assert(len(main_tags) == 1)
    entry_tags = main_tags[0].getElementsByTagName("entry")
    assert(len(entry_tags) == 1)
    self.entry_tag = entry_tags[0]

    # Populate the remaining attributes from the entry tag.
    self._parse_evidence_tag()
    self._parse_sequence_tag()
    self._parse_protein_tag()
    self._parse_organism_tag()
    self._parse_subsections()
    self._parse_PDB_mapping()
def get_obsolete_acc_to_uniparc(acc):
    '''
    Tries to find the UniParc ID of an obsolete UniProt accession by scanning the
    UniParc query page for quoted "UPI..." identifiers.

    :param acc: The UniProt accession number.
    :return: The single UniParc ID found on the page, or None if no identifier was found.
    :raises Exception: if more than one distinct UniParc ID is found.

    Warning: This is a fragile function as the underlying website generation or URL could change.
    '''
    page = http_get('www.uniprot.org/uniparc/?query={0}'.format(acc))
    quoted_ids = re.findall(r'"UPI[A-Z0-9]+?"', page, re.DOTALL)

    # Drop the surrounding double quotes and collapse repeated hits.
    candidates = {quoted[1:-1] for quoted in quoted_ids}

    if len(candidates) > 1:
        raise Exception('Multiple UPI identifiers found.')
    if candidates:
        return candidates.pop()
    return None
def _get_XML(self):
    '''
    Load the UniParc XML for self.UniParcID, store it in self.XML, parse it into
    self._dom and keep the single <entry> tag in self.entry_tag.

    The document is read with read_file from '<cache_dir>/<UniParcID>.xml' when
    self.cache_dir is set and that file exists; otherwise it is fetched with
    http_get and, when a cache directory is set, written there with write_file.

    :raises AssertionError: if the document does not hold exactly one <uniparc> tag containing exactly one <entry> tag.
    '''
    cache_path = os.path.join(self.cache_dir, '%s.xml' % self.UniParcID) if self.cache_dir else None

    if cache_path and os.path.exists(cache_path):
        xml_text = read_file(cache_path)
    else:
        if not self.silent:
            colortext.write("Retrieving %s\n" % self.UniParcID, "cyan")
        xml_text = http_get('http://www.uniprot.org/uniparc/%s.xml' % self.UniParcID)
        if cache_path:
            write_file(cache_path, xml_text)

    self.XML = xml_text

    # Build the DOM and locate the one expected entry.
    self._dom = parseString(xml_text)
    root_tags = self._dom.getElementsByTagName("uniparc")
    assert len(root_tags) == 1
    entries = root_tags[0].getElementsByTagName("entry")
    assert len(entries) == 1
    self.entry_tag = entries[0]
def __init__(self, UniParcID, UniProtACs = None, UniProtIDs = None, cache_dir = None, silent = False):
    '''
    Build a UniParc entry by merging the UniProt AC entries mapped to it.

    :param UniParcID: The UniParc identifier; used for the mapping queries, the FASTA URL and the cache filename ('<UniParcID>.fasta').
    :param UniProtACs: Optional UniProt accessions for this entry. When empty/None they are looked up with uniprot_map.
    :param UniProtIDs: Optional UniProt IDs for this entry. When empty/None they are looked up with uniprot_map.
    :param cache_dir: Optional existing directory used to cache the FASTA file; also passed on to uniprot_map and UniProtACEntry.
    :param silent: When False, progress messages are printed.
    :raises Exception: if cache_dir does not exist.
    :raises UniParcEntryStandardizationException: if the AC entries yield no names at all, or more than one distinct recommended name.
    :raises AssertionError: if the FASTA header is not in the expected form, or the AC entries disagree on atomic mass, sequence or CRC64 digest.
    '''

    def tally_name(counted_names, name):
        # counted_names is a list of [name, count] pairs. Names are indexed like
        # mappings further down (presumably dicts, hence unhashable), so a
        # linear scan is used instead of a Counter.
        for pair in counted_names:
            if pair[0] == name:
                pair[1] += 1
                return
        counted_names.append([name, 1])

    def describe_names(counted_names):
        # Format [name, count] pairs, most frequent first, as message fragments.
        fragments = []
        for tpl in sorted(counted_names, key=lambda x: -x[1]):
            fragments.append("\n count=%s: %s" % (str(tpl[1]).ljust(5), tpl[0]['Name']))
            if tpl[0]['Short names']:
                fragments.append(" (short names: %s)" % ",".join(tpl[0]['Short names']))
            if tpl[0]['EC numbers']:
                fragments.append(" (EC numbers: %s)" % ",".join(tpl[0]['EC numbers']))
        return fragments

    if cache_dir and not os.path.exists(os.path.abspath(cache_dir)):
        raise Exception("The cache directory %s does not exist." % os.path.abspath(cache_dir))

    self.UniParcID = UniParcID
    self.cache_dir = cache_dir
    self.recommended_name = None
    self.silent = silent

    # Get AC mapping
    if not UniProtACs or UniParcID == 'UPI0000047CA3':
        # todo: is this UPI0000047CA3 special handling necessary?
        self.UniProtACs = uniprot_map('UPARC', 'ACC', [UniParcID], cache_dir = cache_dir, silent = silent)[UniParcID]
    else:
        self.UniProtACs = UniProtACs

    # Get ID mapping
    if not UniProtIDs:
        self.UniProtIDs = uniprot_map('UPARC', 'ID', [UniParcID], cache_dir = cache_dir, silent = silent)[UniParcID]
    else:
        self.UniProtIDs = UniProtIDs

    # Get FASTA
    cached_filepath = None
    if cache_dir:
        cached_filepath = os.path.join(cache_dir, '%s.fasta' % UniParcID)
    if cached_filepath and os.path.exists(cached_filepath):
        fasta = read_file(cached_filepath)
    else:
        if not silent:
            print("Getting FASTA file")
        url = 'http://www.uniprot.org/uniparc/%s.fasta' % UniParcID
        fasta = http_get(url)
        if cached_filepath:
            write_file(cached_filepath, fasta)

    # Get sequence
    fasta_lines = fasta.split("\n")
    header = fasta_lines[0].split()
    assert(len(header) == 2)
    assert(header[0] == ">%s" % UniParcID)
    assert(header[1].startswith("status="))
    # str.strip() as a method works on both Python 2 and 3; the module-level
    # string.strip function does not exist on Python 3.
    sequence = "".join(line.strip() for line in fasta_lines[1:])
    self.sequence = sequence

    # Get atomic mass (and sequence again)
    self.atomic_mass = None
    self.CRC64Digest = None
    recommended_names = []
    alternative_names = []
    submitted_names = []
    self.AC_entries = {}
    subsections = ProteinSubsectionHolder(len(sequence))

    for UniProtAC in self.UniProtACs:
        try:
            AC_entry = UniProtACEntry(UniProtAC, cache_dir = self.cache_dir, silent = silent)
        except EmptyUniProtACXMLException:
            # Accessions whose XML document is empty are skipped.
            continue
        self.AC_entries[UniProtAC] = AC_entry

        # Mass sanity check
        if self.atomic_mass is not None:
            assert(self.atomic_mass == AC_entry.atomic_mass)
        self.atomic_mass = AC_entry.atomic_mass

        # Sequence sanity check
        assert(self.sequence == AC_entry.sequence)

        # CRC 64 sanity check
        if self.CRC64Digest is not None:
            assert(self.CRC64Digest == AC_entry.CRC64Digest)
        self.CRC64Digest = AC_entry.CRC64Digest
        assert(CRC64.CRC64digest(self.sequence) == self.CRC64Digest)

        # Count how many AC entries use each distinct name.
        if AC_entry.recommended_name:
            tally_name(recommended_names, AC_entry.recommended_name)
        for alternative_name in AC_entry.alternative_names:
            tally_name(alternative_names, alternative_name)
        for submitted_name in AC_entry.submitted_names:
            tally_name(submitted_names, submitted_name)

        subsections += AC_entry.subsections

    self.subsections = subsections

    # An ID may be remapped by at most one of the two tables; a remap replaces
    # whatever recommended names were collected above.
    assert(len(set(UniParcMergedRecommendedNamesRemap.keys()).intersection(set(UniParcMergedSubmittedNamesRemap.keys()))) == 0)
    if UniParcID in UniParcMergedRecommendedNamesRemap:
        recommended_names = [[UniParcMergedRecommendedNamesRemap[UniParcID], 1]]
    elif UniParcID in UniParcMergedSubmittedNamesRemap:
        recommended_names = [[UniParcMergedSubmittedNamesRemap[UniParcID], 1]]

    if not silent:
        colortext.write('Subsections\n', 'orange')

    if not recommended_names and not alternative_names and not submitted_names:
        raise UniParcEntryStandardizationException("UniParcID %s has no recommended names." % UniParcID)
    elif len(recommended_names) > 1:
        s = ["UniParcID %s has multiple recommended names: " % UniParcID]
        s.extend(describe_names(recommended_names))
        raise UniParcEntryStandardizationException("".join(s))

    # A missing recommended name is tolerated when alternative or submitted
    # names exist (todo: a recommended name is not always available), in which
    # case self.recommended_name stays None.
    self.recommended_name = None
    if len(recommended_names) == 1:
        self.recommended_name = recommended_names[0][0]

    self.get_organisms()