def test_retrieve_id_17(self):
    """Check the tags extracted from header line 17 (UniClust90, 76 members)."""
    # NOTE(review): the expected chain 'A0A1J0E5N' lacks the final character
    # of the member 'A0A1J0E5N4' listed in the comments; presumably this
    # mirrors what retrieve_id currently returns - confirm.
    header = _HEADERS_LIST.splitlines()[16]
    member_chains = {
        'A0A023VGG0', 'A0A064D5Y7', 'A0A096Y6F3', 'A0A097J0G2',
        'A0A0A0CK90', 'A0A0A8NVJ2', 'A0A0B6VLZ4', 'A0A0C4XWE1',
        'A0A0D5XR52', 'A0A0D7LBA5', 'A0A0E0SXV4', 'A0A0F0SXM7',
        'A0A0F0T558', 'A0A0F1AU30', 'A0A0H3AEJ2', 'A0A0H3YD74',
        'A0A0J0IP13', 'A0A0K2S451', 'A0A0K3RSM2', 'A0A0M1V7U1',
        'A0A0N9NKB1', 'A0A0R4CY03', 'A0A0R9Q1C9', 'A0A0V9GPN4',
        'A0A0V9J4A8', 'A0A0W1W990', 'A0A126JTC8', 'A0A142BNX4',
        'A0A144XRS0', 'A0A167SUV6', 'A0A1B3B6M7', 'A0A1C0D4B4',
        'A0A1J0E5N', 'A0A1Q8YU31', 'A0A1U7FUF6', 'A0A1W6AR35',
        'A0A1W6ARV1', 'A0A1Y0F4W8', 'A0A220UU09', 'A0A221KL77',
        'A0A2I7QEW9', 'A0A2I8R9F4', 'A0A2I8XNZ0', 'A0A2K9V063',
        'A0A2L0TKR8', 'A0A2N1EL23', 'A0A2R4DIJ7', 'A0A2R4KLJ4',
        'A0A2S1XWH3', 'A0A2S4RFM4', 'A0PB04', 'A4IUB3', 'B6VP02',
        'C4NUX0', 'C4NVF2', 'C6GA28', 'D2Y9Z2', 'D3VGJ5', 'D3VLT8',
        'E1ITU2', 'E7DBG1', 'E7DBH5', 'F2Z7X9', 'F5BPS4', 'H9TK43',
        'K0H5V6', 'K4N9S7', 'K4W3W9', 'K4Y465', 'N1NU19', 'T0P0P2',
        'T0QDZ9', 'U3PHU8', 'U9YS98', 'V9SJY4', 'W1JB02',
    }
    cluster_comments = (
        'Representative=N1NU19 n=76 Descriptions=[Conjugal transfer protein TraV|Type IV conjugative transfer system protein TraV|Sex pilus assembly protein|Conjugative transfer protein TraV|Sex pilus assembly|Type IV conjugative transfer system lipoprotein (TraV)] '
        'Members=A0A126JTC8,A0A0E0SXV4,A0A0F1AU30,A0A167SUV6,A0PB04,U9YS98,A0A144XRS0,D2Y9Z2,A0A142BNX4,C4NVF2,A0A0J0IP13,A0A0K3RSM2,A0A097J0G2,A4IUB3,W1JB02,A0A0H3YD74,A0A1W6ARV1,T0QDZ9,A0A0V9GPN4,A0A0V9J4A8,E7DBH5,A0A2S4RFM4,C4NUX0,A0A2R4DIJ7,A0A023VGG0,A0A0A0CK90,U3PHU8,A0A096Y6F3,A0A0K2S451,B6VP02,A0A2I8R9F4,A0A2R4KLJ4,K0H5V6,K4Y465,D3VLT8,F5BPS4,A0A0B6VLZ4,T0P0P2,A0A0D7LBA5,E1ITU2,A0A0N9NKB1,A0A1C0D4B4,H9TK43,A0A0F0SXM7,A0A0A8NVJ2,A0A220UU09,K4W3W9,F2Z7X9,A0A0M1V7U1,A0A0W1W990,C6GA28,A0A2I8XNZ0,A0A2K9V063,A0A0R4CY03,A0A2S1XWH3,A0A1Q8YU31,V9SJY4,A0A0C4XWE1,A0A2L0TKR8,A0A2N1EL23,A0A0R9Q1C9,K4N9S7,N1NU19,A0A1B3B6M7,A0A0D5XR52,A0A0F0T558,A0A2I7QEW9,A0A1Y0F4W8,A0A221KL77,E7DBG1,A0A064D5Y7,A0A1U7FUF6,A0A0H3AEJ2,D3VGJ5,A0A1W6AR35,A0A1J0E5N4'
    )
    reference = {
        'mainid': 'N1NU19',
        'chains': member_chains,
        'seqid': '14',
        'source': 'UniClust90_2018_08',
        'comments': cluster_comments,
    }
    self.assertDictEqual(cit.retrieve_id(header), reference)
def test_retrieve_id_4(self):
    """Check the tags extracted from header line 4 (UniParc)."""
    header = _HEADERS_LIST.splitlines()[3]
    reference = {
        'mainid': 'UPI0000000005',
        'chains': {'UPI0000000005'},
        'seqid': '1',
        'source': 'UniParc',
        'comments': 'status=active',
    }
    self.assertDictEqual(cit.retrieve_id(header), reference)
def test_retrieve_id_11(self):
    """Check the tags extracted from header line 11 (RCSB PDB, no seqid)."""
    header = _HEADERS_LIST.splitlines()[10]
    reference = {
        'mainid': 'ABCDE',
        'chains': {'j', 'k', 'l'},
        'seqid': None,
        'source': 'RCSB PDB',
        'comments': None,
    }
    self.assertDictEqual(cit.retrieve_id(header), reference)
def test_retrieve_id_7(self):
    """Check the tags extracted from header line 7 (RCSB PDB, single chain)."""
    header = _HEADERS_LIST.splitlines()[6]
    reference = {
        'mainid': '3J4F',
        'chains': {'A'},
        'seqid': None,
        'source': 'RCSB PDB',
        'comments': 'PDBID|CHAIN|SEQUENCE',
    }
    self.assertDictEqual(cit.retrieve_id(header), reference)
def test_retrieve_id_14(self):
    """Check the tags extracted from header line 14 (CROPS)."""
    header = _HEADERS_LIST.splitlines()[13]
    reference = {
        'mainid': 'HJ41',
        'chains': {'A', 'B', 'C'},
        'seqid': '1',
        'source': 'CROPS',
        'comments': 'these are some comments',
    }
    self.assertDictEqual(cit.retrieve_id(header), reference)
def test_retrieve_id_12(self):
    """Check the tags extracted from header line 12 (PDBe)."""
    header = _HEADERS_LIST.splitlines()[11]
    reference = {
        'mainid': '6AVG',
        'chains': {'B', 'C'},
        'seqid': '2',
        'source': 'PDBe',
        'comments': None,
    }
    self.assertDictEqual(cit.retrieve_id(header), reference)
def test_retrieve_id_5(self):
    """Check the tags extracted from header line 5 (archived SwissProt)."""
    header = _HEADERS_LIST.splitlines()[4]
    reference = {
        'mainid': 'P05067',
        'chains': {'P05067'},
        'seqid': '1',
        'source': 'UniProtKB/SwissProt (archived)',
        'comments': 'archived from Release 18.0 01-MAY-1991 SV=3',
    }
    self.assertDictEqual(cit.retrieve_id(header), reference)
def test_retrieve_id_6(self):
    """Check the tags extracted from header line 6 (archived TrEMBL)."""
    header = _HEADERS_LIST.splitlines()[5]
    reference = {
        'mainid': 'Q55167',
        'chains': {'Q55167'},
        'seqid': '1',
        'source': 'UniProtKB/TrEMBL (archived)',
        'comments': 'archived from Release 17.0 01-JUN-2001 SV=1',
    }
    self.assertDictEqual(cit.retrieve_id(header), reference)
def parsemap(instream):
    """Parse cropmap from string.

    A record starts with a header line (first character ``>``) that is
    decoded with ``retrieve_id``. Every following data line holds two
    whitespace-separated integers: the first is mapped to the second in
    ``'cropmap'`` and the second back to the first in ``'cropbackmap'``.
    When the second field is the literal ``0`` the first integer is mapped
    to ``None`` and no back-mapping entry is created. Blank lines and
    comment lines (``#``, optionally preceded by whitespace) are skipped.

    :param instream: Imported-to-string cropmap file content.
    :type instream: str
    :raises ValueError: If a data line appears before any header line.
    :return: Mapping and backmapping coordinates.
    :rtype: dict [str, dict [str, dict [str, dict [int, int]]]]

    """
    mapdict = {}
    newid = None
    forthmap = {}
    backmap = {}

    for raw_line in instream.splitlines():
        line = raw_line.rstrip()
        if not line:
            # Blank lines carry no information. They used to abort the
            # parse through a swallowed NameError on an undefined file
            # handle, silently dropping every later record.
            continue
        if line.startswith(">"):
            # A new header closes the record that was being read.
            if newid is not None:
                _store_cropmaps(mapdict, newid, forthmap, backmap)
            newid = retrieve_id(line)
            # Fresh dicts, so maps already stored are never mutated.
            forthmap = {}
            backmap = {}
        elif line.lstrip().startswith("#"):
            continue
        else:
            if newid is None:
                raise ValueError(
                    "Cropmap data line found before any header line.")
            # Split on any run of whitespace so that one or several
            # separating blanks are both accepted.
            fields = line.split()
            origin = int(fields[0])
            if fields[1] != '0':
                image = int(fields[1])
                forthmap[origin] = image
                backmap[image] = origin
            else:
                forthmap[origin] = None

    # The last record has no following header to close it.
    if newid is not None:
        _store_cropmaps(mapdict, newid, forthmap, backmap)

    return mapdict


def _store_cropmaps(mapdict, newid, forthmap, backmap):
    """File one record's maps under mapdict[mainid][seqid]."""
    per_sequence = mapdict.setdefault(newid['mainid'], {})
    entry = per_sequence.setdefault(newid['seqid'], {})
    entry['cropmap'] = forthmap
    entry['cropbackmap'] = backmap
def test_retrieve_id_3(self):
    """Check the tags extracted from header line 3 (UniRef50)."""
    header = _HEADERS_LIST.splitlines()[2]
    reference = {
        'mainid': 'Q9K794',
        'chains': {'Q9K794'},
        'seqid': '1',
        'source': 'UniRef50',
        'comments': ('Putative AgrB-like protein n=2 Tax=Bacillus '
                     'TaxID=1386 RepID=AGRB_BACHD'),
    }
    self.assertDictEqual(cit.retrieve_id(header), reference)
def test_retrieve_id_2(self):
    """Check the tags extracted from header line 2 (UniProtKB/TrEMBL)."""
    header = _HEADERS_LIST.splitlines()[1]
    reference = {
        'mainid': 'Q3SA23',
        'chains': {'Q3SA23'},
        'seqid': '1',
        'source': 'UniProtKB/TrEMBL',
        'comments': ('Q3SA23_9HIV1 Protein Nef (Fragment) '
                     'OS=Human immunodeficiency virus 1 '
                     'OX=11676 GN=nef PE=3 SV=1'),
    }
    self.assertDictEqual(cit.retrieve_id(header), reference)
def test_retrieve_id_1(self):
    """Check the tags extracted from header line 1 (UniProtKB/SwissProt)."""
    header = _HEADERS_LIST.splitlines()[0]
    reference = {
        'mainid': 'Q6GZX4',
        'chains': {'Q6GZX4'},
        'seqid': '1',
        'source': 'UniProtKB/SwissProt',
        'comments': ('001R_FRG3G Putative transcription factor 001R '
                     'OS=Frog virus 3 (isolate Goorha) '
                     'OX=654924 GN=FV3-001R PE=4 SV=1'),
    }
    self.assertDictEqual(cit.retrieve_id(header), reference)
def test_retrieve_id_15(self):
    """Check the tags extracted from header line 15 (UniClust30, 1 member)."""
    # NOTE(review): the expected chain 'A0A094PJX' lacks the final character
    # of the member 'A0A094PJX3' listed in the comments; presumably this
    # mirrors what retrieve_id currently returns - confirm.
    header = _HEADERS_LIST.splitlines()[14]
    reference = {
        'mainid': 'A0A094PJX3',
        'chains': {'A0A094PJX'},
        'seqid': '30',
        'source': 'UniClust30_2018_08',
        'comments': ('Representative=A0A094PJX3 n=1 '
                     'Descriptions=[Uncharacterized protein] '
                     'Members=A0A094PJX3'),
    }
    self.assertDictEqual(cit.retrieve_id(header), reference)
def test_retrieve_id_10(self):
    """Check the tags extracted from header line 10 (RCSB PDB, two chains)."""
    # NOTE(review): 'H**o sapiens' is kept exactly as found here; it looks
    # like a masked organism name - verify against _HEADERS_LIST.
    header = _HEADERS_LIST.splitlines()[9]
    reference = {
        'mainid': '6AVG',
        'chains': {'B', 'C'},
        'seqid': '2',
        'source': 'RCSB PDB',
        'comments': ('T-cell receptor alpha variable 4,'
                     'TCR alpha chain|H**o sapiens'),
    }
    self.assertDictEqual(cit.retrieve_id(header), reference)
def test_retrieve_id_9(self):
    """Check the tags extracted from header line 9 (MrBUMP, with SEQSE tags)."""
    header = _HEADERS_LIST.splitlines()[8]
    reference = {
        'mainid': '6LN2',
        'chains': {'A'},
        'seqid': None,
        'source': 'MrBUMP',
        'comments': ("resolution: 3.20 experiment: XRAY "
                     "release_date: 2020-03-18 [ 476678 : ALL ] "
                     "['29-1054'] <SEQSE>29,1054<SEQSE> <100>1<100> "
                     "<95>1<95> <90>1<90> <70>0<70> <50>0<50>"),
    }
    self.assertDictEqual(cit.retrieve_id(header), reference)
def test_retrieve_id_8(self):
    """Check the tags extracted from header line 8 (MrBUMP)."""
    header = _HEADERS_LIST.splitlines()[7]
    reference = {
        'mainid': '3J4F',
        'chains': {'D'},
        'seqid': None,
        'source': 'MrBUMP',
        'comments': ('resolution: 8.60 experiment: EMIC '
                     'release_date: 2013-07-24 [ 3 : ALL ]'),
    }
    self.assertDictEqual(cit.retrieve_id(header), reference)
def test_retrieve_id_16(self):
    """Check the tags extracted from header line 16 (UniClust90, 4 members)."""
    # NOTE(review): the expected chain 'A0A2W5CTJ' lacks the final character
    # of the member 'A0A2W5CTJ1' listed in the comments; presumably this
    # mirrors what retrieve_id currently returns - confirm.
    header = _HEADERS_LIST.splitlines()[15]
    reference = {
        'mainid': 'U1GGI9',
        'chains': {'A0A239WXC9', 'A0A2W5CTJ', 'U1GGI9', 'U7JT03'},
        'seqid': '39',
        'source': 'UniClust90_2018_08',
        'comments': ('Representative=U1GGI9 n=4 '
                     'Descriptions=[ATP-dependent Clp protease adapter '
                     'protein ClpS|ATP-dependent Clp protease adapter ClpS] '
                     'Members=U7JT03,A0A239WXC9,U1GGI9,A0A2W5CTJ1'),
    }
    self.assertDictEqual(cit.retrieve_id(header), reference)
def test_retrieve_id_13(self):
    """Check the tags extracted from header line 13 (RCSB PDB, 44 chains)."""
    header = _HEADERS_LIST.splitlines()[12]
    many_chains = {
        'AA', 'AC', 'AE', 'AG', 'AI', 'AK', 'AM', 'AO', 'AQ', 'AS', 'AU',
        'BA', 'BC', 'BE', 'BG', 'BI', 'BK', 'BM', 'BO', 'BQ', 'BS', 'BU',
        'CA', 'CC', 'CE', 'CG', 'CI', 'CK', 'CM', 'CO', 'CQ', 'CS', 'CU',
        'DA', 'DC', 'DE', 'DG', 'DI', 'DK', 'DM', 'DO', 'DQ', 'DS', 'DU',
    }
    reference = {
        'mainid': '6DWU',
        'chains': many_chains,
        'seqid': '1',
        'source': 'RCSB PDB',
        'comments': 'Cationic trypsin|Bos taurus (9913)',
    }
    self.assertDictEqual(cit.retrieve_id(header), reference)
def __init__(self, seqid=None, oligomer=None, seq=None, chains=None,
             source=None, header=None, biotype=None, extrainfo=None):
    """Initialise the object from explicit arguments and/or a header.

    Explicit arguments take precedence. For ``seqid``, ``oligomer``,
    ``chains``, ``source`` and ``extrainfo`` left as None, the value is
    looked up in the dictionary returned by ``retrieve_id(header)``
    (keys 'seqid', 'mainid', 'chains', 'source', 'comments') when a header
    is given and could be parsed.

    :param seqid: Sequence identifier; an int is converted to str.
    :type seqid: str or int, optional
    :param oligomer: Oligomer identifier.
    :type oligomer: str, optional
    :param seq: Sequence string, stored under the 'mainseq' key of
        ``self.seqs``; an empty string is stored when omitted.
    :type seq: str, optional
    :param chains: Chain identifiers.
    :type chains: set [str], optional
    :param source: Source of the sequence.
    :type source: str, optional
    :param header: Original header line; appended to
        ``self.source_headers`` and passed to ``retrieve_id``.
    :type header: str, optional
    :param biotype: Stored as given, except for 'guess' (any letter case),
        which stores the result of ``guess_type(seq)`` instead.
    :type biotype: str, optional
    :param extrainfo: Free text stored in ``self.infostring``.
    :type extrainfo: str, optional
    :raises TypeError: If ``header``, ``seqid``, ``seq``, ``oligomer``,
        ``chains`` (or one of its elements), ``source`` or ``extrainfo``
        has the wrong type.
    """
    # Start every attribute from a neutral value; the sections below fill
    # in only what the arguments or the header provide.
    self.oligomer_id = None
    self.name = None
    self.chains = set()
    self.source = None
    self.source_headers = []
    self.crops_header = None
    self.seqs = {}
    self.biotype = None
    self.infostring = None
    self.cropmap = None
    self.cropbackmap = None
    self.msa = None
    self.cropmsa = None
    self.intervals = None

    # Header: keep the raw text and try to decode it. A header that cannot
    # be decoded is tolerated (warning only) and simply provides no fallbacks.
    if header is not None:
        if isinstance(header, str):
            self.source_headers.append(header)
            try:
                header_info = retrieve_id(header)
            except Exception:
                logging.warning(
                    'Header format not recognised. Information not extracted.'
                )
                header_info = None
        else:
            logging.critical("Argument 'header' should be a string.")
            raise TypeError
    else:
        header_info = None

    # Sequence name: explicit seqid first, then the header value.
    if seqid is not None:
        if isinstance(seqid, str):
            self.name = seqid
        elif isinstance(seqid, int):
            self.name = str(seqid)
        else:
            logging.critical("Sequence ID 'seqid' should be a string.")
            raise TypeError
    else:
        if header_info is not None:
            if 'seqid' in header_info:
                self.name = header_info['seqid']
        else:
            # No decoded header to fall back on: use '1'.
            # NOTE(review): confirm this else pairs with the header_info
            # test and not with the inner 'seqid' test.
            self.name = '1'

    # Main sequence text.
    if seq is not None:
        if isinstance(seq, str):
            self.seqs['mainseq'] = seq
        else:
            logging.critical("Chain sequence 'seq' should be a string.")
            raise TypeError
    else:
        self.seqs['mainseq'] = ''

    # Oligomer ID: explicit argument first, then the header's 'mainid'.
    if oligomer is not None:
        if isinstance(oligomer, str):
            self.oligomer_id = oligomer
        else:
            logging.critical("Oligomer ID 'oligomer' should be a string.")
            raise TypeError
    else:
        if header_info is not None:
            if 'mainid' in header_info:
                self.oligomer_id = header_info['mainid']

    # Chains: an explicit set is copied element by element (each element
    # is type-checked); the header's set is assigned as is.
    if chains is not None:
        if isinstance(chains, set):
            for ch in chains:
                if isinstance(ch, str):
                    self.chains.add(ch)
                else:
                    logging.critical(
                        "Chain IDs in 'chains' set should be strings.")
                    raise TypeError
        else:
            logging.critical(
                "Argument 'chains' should be a set of strings.")
            raise TypeError
    else:
        if header_info is not None:
            if 'chains' in header_info:
                self.chains = header_info['chains']

    # Source: explicit argument first, then the header value.
    if source is not None:
        if isinstance(source, str):
            self.source = source
        else:
            logging.critical("Argument 'source' should be a string.")
            raise TypeError
    else:
        if header_info is not None:
            if 'source' in header_info:
                self.source = header_info['source']

    # Biotype: 'guess' delegates to guess_type on the 'seq' argument.
    # NOTE(review): biotype is not type-checked before .lower() is called,
    # and seq may still be None here - confirm guess_type copes with that.
    if biotype is not None:
        if biotype.lower() == 'guess':
            self.biotype = guess_type(seq)
        else:
            self.biotype = biotype
    else:
        self.biotype = None

    # Extra information: explicit argument first, then the header comments.
    if extrainfo is not None:
        if isinstance(extrainfo, str):
            self.infostring = extrainfo
        else:
            logging.critical("Argument 'extrainfo' should be a string.")
            raise TypeError
    else:
        if header_info is not None:
            if 'comments' in header_info:
                self.infostring = header_info['comments']
        else:
            # NOTE(review): confirm this else pairs with the header_info
            # test and not with the inner 'comments' test.
            self.infostring = ""

    # Build the CROPS-format header from the values gathered above.
    # NOTE(review): the test is on the 'oligomer' argument rather than on
    # self.oligomer_id, so an ID recovered from the header is replaced by
    # 'NOID' in the generated header - confirm this is intended.
    if oligomer is None:
        self.crops_header = makeheader(mainid='NOID',
                                       seqid=self.name,
                                       chains=self.chains,
                                       source=self.source,
                                       extrainfo=self.infostring)
    else:
        self.crops_header = makeheader(mainid=self.oligomer_id,
                                       seqid=self.name,
                                       chains=self.chains,
                                       source=self.source,
                                       extrainfo=self.infostring)
def parseseq(instream, inset=None):
    """Parse sequence(s).

    Header lines (first character ``>``) are decoded with ``retrieve_id``;
    the lines that follow, up to the next header or the end of the input,
    are concatenated into the sequence string. Blank lines and comment
    lines (starting with ``#`` or `` #``) are skipped. Each record is
    handed to ``add_sequence`` exactly once, when the next header or the
    end of the input is reached.

    :param instream: Imported-to-string sequence file content (fasta format).
    :type instream: str
    :param inset: Sequence IDs to return, if None it returns them all,
        defaults to None.
    :type inset: set or dict or str, optional
    :raises TypeError: When inset is not a set [str] (or, alternatively, a
        string or a dictionary with string keys); or instream is not a string.
    :return: Parsed sequences.
    :rtype: dict [str, :class:`crops.elements.sequences.oligoseq`]

    """
    if not isinstance(instream, str):
        logging.critical('Input argument instream should be a string.')
        raise TypeError

    upperset = None
    if inset is not None:
        if not isinstance(inset, (str, dict, set)):
            logging.critical('Input argument inset should be a set or, '
                             'alternatively a string or a dictionary.')
            raise TypeError
        if isinstance(inset, str):
            inset = {inset}
        upperset = set()
        for element in inset:
            if not isinstance(element, str):
                logging.critical('Elements in inseq should be strings.')
                raise TypeError
            upperset.add(element.upper())

    newseqs = {}
    newid = None
    head = ''
    chain_parts = []
    ignore = False

    for raw_line in instream.splitlines():
        line = raw_line.rstrip()
        if not line:
            # A blank line is not a record boundary. Flushing here made the
            # same record reach add_sequence again at the next header or
            # at the end of the input.
            continue
        if line.startswith(">"):
            # A new header closes the record that was being read.
            if newid is not None and not ignore:
                _add_parsed_sequence(newseqs, newid, head,
                                     ''.join(chain_parts))
            newid = retrieve_id(line)
            head = line
            chain_parts = []
            if upperset is not None:
                # NOTE(review): the ID is compared as returned by
                # retrieve_id against upper-cased requested IDs;
                # presumably retrieve_id yields upper-case IDs - confirm.
                ignore = newid['mainid'] not in upperset
        elif line.startswith("#") or line.startswith(' #'):
            continue
        elif not ignore:
            # Text found before the first header is accumulated but never
            # stored, because the first header resets the buffer.
            chain_parts.append(line)

    # The last record has no following header to close it.
    if newid is not None and not ignore:
        _add_parsed_sequence(newseqs, newid, head, ''.join(chain_parts))

    return newseqs


def _add_parsed_sequence(newseqs, newid, head, chain):
    """Wrap one parsed record in a sequence and add it to its oligoseq."""
    if newid['mainid'] not in newseqs:
        newseqs[newid['mainid']] = oligoseq(oligomer_id=newid['mainid'])
    aseq = sequence(seqid=newid['seqid'],
                    oligomer=newid['mainid'],
                    seq=chain,
                    chains=newid['chains'],
                    source=newid['source'],
                    header=head,
                    extrainfo=newid['comments'])
    newseqs[newid['mainid']].add_sequence(aseq)