Esempio n. 1
0
    def test_retrieve_id_17(self):
        """Check the tags retrieve_id extracts from line index 16 of _HEADERS_LIST."""
        # NOTE(review): the set below holds 'A0A1J0E5N' while the Members list
        # in 'comments' ends in 'A0A1J0E5N4' -- presumably this mirrors what
        # retrieve_id currently returns for this header; confirm.
        member_ids = {
            'A0A023VGG0', 'A0A064D5Y7', 'A0A096Y6F3', 'A0A097J0G2',
            'A0A0A0CK90', 'A0A0A8NVJ2', 'A0A0B6VLZ4', 'A0A0C4XWE1',
            'A0A0D5XR52', 'A0A0D7LBA5', 'A0A0E0SXV4', 'A0A0F0SXM7',
            'A0A0F0T558', 'A0A0F1AU30', 'A0A0H3AEJ2', 'A0A0H3YD74',
            'A0A0J0IP13', 'A0A0K2S451', 'A0A0K3RSM2', 'A0A0M1V7U1',
            'A0A0N9NKB1', 'A0A0R4CY03', 'A0A0R9Q1C9', 'A0A0V9GPN4',
            'A0A0V9J4A8', 'A0A0W1W990', 'A0A126JTC8', 'A0A142BNX4',
            'A0A144XRS0', 'A0A167SUV6', 'A0A1B3B6M7', 'A0A1C0D4B4',
            'A0A1J0E5N', 'A0A1Q8YU31', 'A0A1U7FUF6', 'A0A1W6AR35',
            'A0A1W6ARV1', 'A0A1Y0F4W8', 'A0A220UU09', 'A0A221KL77',
            'A0A2I7QEW9', 'A0A2I8R9F4', 'A0A2I8XNZ0', 'A0A2K9V063',
            'A0A2L0TKR8', 'A0A2N1EL23', 'A0A2R4DIJ7', 'A0A2R4KLJ4',
            'A0A2S1XWH3', 'A0A2S4RFM4', 'A0PB04', 'A4IUB3', 'B6VP02',
            'C4NUX0', 'C4NVF2', 'C6GA28', 'D2Y9Z2', 'D3VGJ5', 'D3VLT8',
            'E1ITU2', 'E7DBG1', 'E7DBH5', 'F2Z7X9', 'F5BPS4', 'H9TK43',
            'K0H5V6', 'K4N9S7', 'K4W3W9', 'K4Y465', 'N1NU19', 'T0P0P2',
            'T0QDZ9', 'U3PHU8', 'U9YS98', 'V9SJY4', 'W1JB02'
        }
        expected = {
            'mainid': 'N1NU19',
            'chains': member_ids,
            'seqid': '14',
            'source': 'UniClust90_2018_08',
            'comments': 'Representative=N1NU19 n=76 Descriptions=[Conjugal transfer protein TraV|Type IV conjugative transfer system protein TraV|Sex pilus assembly protein|Conjugative transfer protein TraV|Sex pilus assembly|Type IV conjugative transfer system lipoprotein (TraV)] Members=A0A126JTC8,A0A0E0SXV4,A0A0F1AU30,A0A167SUV6,A0PB04,U9YS98,A0A144XRS0,D2Y9Z2,A0A142BNX4,C4NVF2,A0A0J0IP13,A0A0K3RSM2,A0A097J0G2,A4IUB3,W1JB02,A0A0H3YD74,A0A1W6ARV1,T0QDZ9,A0A0V9GPN4,A0A0V9J4A8,E7DBH5,A0A2S4RFM4,C4NUX0,A0A2R4DIJ7,A0A023VGG0,A0A0A0CK90,U3PHU8,A0A096Y6F3,A0A0K2S451,B6VP02,A0A2I8R9F4,A0A2R4KLJ4,K0H5V6,K4Y465,D3VLT8,F5BPS4,A0A0B6VLZ4,T0P0P2,A0A0D7LBA5,E1ITU2,A0A0N9NKB1,A0A1C0D4B4,H9TK43,A0A0F0SXM7,A0A0A8NVJ2,A0A220UU09,K4W3W9,F2Z7X9,A0A0M1V7U1,A0A0W1W990,C6GA28,A0A2I8XNZ0,A0A2K9V063,A0A0R4CY03,A0A2S1XWH3,A0A1Q8YU31,V9SJY4,A0A0C4XWE1,A0A2L0TKR8,A0A2N1EL23,A0A0R9Q1C9,K4N9S7,N1NU19,A0A1B3B6M7,A0A0D5XR52,A0A0F0T558,A0A2I7QEW9,A0A1Y0F4W8,A0A221KL77,E7DBG1,A0A064D5Y7,A0A1U7FUF6,A0A0H3AEJ2,D3VGJ5,A0A1W6AR35,A0A1J0E5N4',
        }
        header = _HEADERS_LIST.splitlines()[16]

        observed = cit.retrieve_id(header)

        self.assertDictEqual(observed, expected)
Esempio n. 2
0
    def test_retrieve_id_4(self):
        """Check the tags retrieve_id extracts from line index 3 of _HEADERS_LIST."""
        expected = dict(
            mainid='UPI0000000005',
            chains={'UPI0000000005'},
            seqid='1',
            source='UniParc',
            comments='status=active',
        )
        header = _HEADERS_LIST.splitlines()[3]

        observed = cit.retrieve_id(header)

        self.assertDictEqual(observed, expected)
Esempio n. 3
0
    def test_retrieve_id_11(self):
        """Check the tags retrieve_id extracts from line index 10 of _HEADERS_LIST."""
        expected = dict(
            mainid='ABCDE',
            chains={'j', 'k', 'l'},
            seqid=None,
            source='RCSB PDB',
            comments=None,
        )
        header = _HEADERS_LIST.splitlines()[10]

        observed = cit.retrieve_id(header)

        self.assertDictEqual(observed, expected)
Esempio n. 4
0
    def test_retrieve_id_7(self):
        """Check the tags retrieve_id extracts from line index 6 of _HEADERS_LIST."""
        expected = dict(
            mainid='3J4F',
            chains={'A'},
            seqid=None,
            source='RCSB PDB',
            comments='PDBID|CHAIN|SEQUENCE',
        )
        header = _HEADERS_LIST.splitlines()[6]

        observed = cit.retrieve_id(header)

        self.assertDictEqual(observed, expected)
Esempio n. 5
0
    def test_retrieve_id_14(self):
        """Check the tags retrieve_id extracts from line index 13 of _HEADERS_LIST."""
        expected = dict(
            mainid='HJ41',
            chains={'A', 'B', 'C'},
            seqid='1',
            source='CROPS',
            comments='these are some comments',
        )
        header = _HEADERS_LIST.splitlines()[13]

        observed = cit.retrieve_id(header)

        self.assertDictEqual(observed, expected)
Esempio n. 6
0
    def test_retrieve_id_12(self):
        """Check the tags retrieve_id extracts from line index 11 of _HEADERS_LIST."""
        expected = dict(
            mainid='6AVG',
            chains={'B', 'C'},
            seqid='2',
            source='PDBe',
            comments=None,
        )
        header = _HEADERS_LIST.splitlines()[11]

        observed = cit.retrieve_id(header)

        self.assertDictEqual(observed, expected)
Esempio n. 7
0
    def test_retrieve_id_5(self):
        """Check the tags retrieve_id extracts from line index 4 of _HEADERS_LIST."""
        expected = dict(
            mainid='P05067',
            chains={'P05067'},
            seqid='1',
            source='UniProtKB/SwissProt (archived)',
            comments='archived from Release 18.0 01-MAY-1991 SV=3',
        )
        header = _HEADERS_LIST.splitlines()[4]

        observed = cit.retrieve_id(header)

        self.assertDictEqual(observed, expected)
Esempio n. 8
0
    def test_retrieve_id_6(self):
        """Check the tags retrieve_id extracts from line index 5 of _HEADERS_LIST."""
        expected = dict(
            mainid='Q55167',
            chains={'Q55167'},
            seqid='1',
            source='UniProtKB/TrEMBL (archived)',
            comments='archived from Release 17.0 01-JUN-2001 SV=1',
        )
        header = _HEADERS_LIST.splitlines()[5]

        observed = cit.retrieve_id(header)

        self.assertDictEqual(observed, expected)
Esempio n. 9
0
def parsemap(instream):
    """Parse cropmap from string.

    Lines starting with ``>`` are headers, decoded with ``retrieve_id``.
    Lines starting with ``#`` or `` #`` are comments and blank lines are
    skipped. Every other line must hold two whitespace-separated integers:
    the original position and the cropped position, where a cropped
    position of ``0`` means the residue has no counterpart (stored as
    ``None`` in the forward map and left out of the backward map).

    If the same (mainid, seqid) pair appears more than once, only the first
    occurrence is kept.

    :param instream: Imported-to-string cropmap file content.
    :type instream: str

    :raises ValueError: When a data line appears before any header line.

    :return: Mapping and backmapping coordinates.
    :rtype: dict [str, dict [str, dict [str, dict [int, int]]]]

    """
    mapdict = {}
    newid = None
    forthmap = None
    backmap = None

    def _store():
        """Save the maps of the current record unless already present."""
        seqmaps = mapdict.setdefault(newid['mainid'], {})
        if newid['seqid'] not in seqmaps:
            seqmaps[newid['seqid']] = {
                'cropmap': forthmap,
                'cropbackmap': backmap,
            }

    for raw in instream.splitlines():
        line = raw.rstrip()
        # Blank lines used to abort parsing (through a swallowed NameError on
        # an undefined file handle); they are now simply skipped.
        if not line or line.startswith(('#', ' #')):
            continue

        if line.startswith('>'):
            if newid is not None:
                _store()
            newid = retrieve_id(line)
            # Fresh dicts per record, so stored maps are never modified later.
            forthmap = {}
            backmap = {}
            continue

        if forthmap is None:
            raise ValueError('Cropmap data found before any header line.')

        fields = line.split()
        if fields[1] != '0':
            forthmap[int(fields[0])] = int(fields[1])
            backmap[int(fields[1])] = int(fields[0])
        else:
            forthmap[int(fields[0])] = None

    # The last record has no following header to trigger its storage.
    if newid is not None:
        _store()

    return mapdict
Esempio n. 10
0
    def test_retrieve_id_3(self):
        """Check the tags retrieve_id extracts from line index 2 of _HEADERS_LIST."""
        expected = {
            'mainid': 'Q9K794',
            'chains': {'Q9K794'},
            'seqid': '1',
            'source': 'UniRef50',
            'comments': 'Putative AgrB-like protein n=2 Tax=Bacillus TaxID=1386 RepID=AGRB_BACHD',
        }
        header = _HEADERS_LIST.splitlines()[2]

        observed = cit.retrieve_id(header)

        self.assertDictEqual(observed, expected)
Esempio n. 11
0
    def test_retrieve_id_2(self):
        """Check the tags retrieve_id extracts from line index 1 of _HEADERS_LIST."""
        expected = {
            'mainid': 'Q3SA23',
            'chains': {'Q3SA23'},
            'seqid': '1',
            'source': 'UniProtKB/TrEMBL',
            'comments': 'Q3SA23_9HIV1 Protein Nef (Fragment) OS=Human immunodeficiency virus 1  OX=11676 GN=nef PE=3 SV=1',
        }
        header = _HEADERS_LIST.splitlines()[1]

        observed = cit.retrieve_id(header)

        self.assertDictEqual(observed, expected)
Esempio n. 12
0
    def test_retrieve_id_1(self):
        """Check the tags retrieve_id extracts from line index 0 of _HEADERS_LIST."""
        expected = {
            'mainid': 'Q6GZX4',
            'chains': {'Q6GZX4'},
            'seqid': '1',
            'source': 'UniProtKB/SwissProt',
            'comments': '001R_FRG3G Putative transcription factor 001R OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-001R PE=4 SV=1',
        }
        header = _HEADERS_LIST.splitlines()[0]

        observed = cit.retrieve_id(header)

        self.assertDictEqual(observed, expected)
Esempio n. 13
0
    def test_retrieve_id_15(self):
        """Check the tags retrieve_id extracts from line index 14 of _HEADERS_LIST."""
        # NOTE(review): 'chains' holds 'A0A094PJX' while 'mainid' and the
        # Members list read 'A0A094PJX3' -- presumably this mirrors what
        # retrieve_id currently returns for this header; confirm.
        expected = {
            'mainid': 'A0A094PJX3',
            'chains': {'A0A094PJX'},
            'seqid': '30',
            'source': 'UniClust30_2018_08',
            'comments': 'Representative=A0A094PJX3 n=1 Descriptions=[Uncharacterized protein] Members=A0A094PJX3',
        }
        header = _HEADERS_LIST.splitlines()[14]

        observed = cit.retrieve_id(header)

        self.assertDictEqual(observed, expected)
Esempio n. 14
0
    def test_retrieve_id_10(self):
        """Check the tags retrieve_id extracts from line index 9 of _HEADERS_LIST."""
        # NOTE(review): the organism in 'comments' reads 'H**o sapiens' --
        # verify that this matches the text of the header in _HEADERS_LIST.
        expected = {
            'mainid': '6AVG',
            'chains': {'B', 'C'},
            'seqid': '2',
            'source': 'RCSB PDB',
            'comments': 'T-cell receptor alpha variable 4,TCR alpha chain|H**o sapiens',
        }
        header = _HEADERS_LIST.splitlines()[9]

        observed = cit.retrieve_id(header)

        self.assertDictEqual(observed, expected)
Esempio n. 15
0
    def test_retrieve_id_9(self):
        """Check the tags retrieve_id extracts from line index 8 of _HEADERS_LIST."""
        expected = {
            'mainid': '6LN2',
            'chains': {'A'},
            'seqid': None,
            'source': 'MrBUMP',
            'comments': "resolution: 3.20 experiment: XRAY release_date: 2020-03-18 [ 476678 : ALL ] ['29-1054'] <SEQSE>29,1054<SEQSE> <100>1<100> <95>1<95> <90>1<90> <70>0<70> <50>0<50>",
        }
        header = _HEADERS_LIST.splitlines()[8]

        observed = cit.retrieve_id(header)

        self.assertDictEqual(observed, expected)
Esempio n. 16
0
    def test_retrieve_id_8(self):
        """Check the tags retrieve_id extracts from line index 7 of _HEADERS_LIST."""
        expected = {
            'mainid': '3J4F',
            'chains': {'D'},
            'seqid': None,
            'source': 'MrBUMP',
            'comments': 'resolution: 8.60 experiment: EMIC release_date: 2013-07-24 [ 3 : ALL ]',
        }
        header = _HEADERS_LIST.splitlines()[7]

        observed = cit.retrieve_id(header)

        self.assertDictEqual(observed, expected)
Esempio n. 17
0
    def test_retrieve_id_16(self):
        """Check the tags retrieve_id extracts from line index 15 of _HEADERS_LIST."""
        # NOTE(review): 'chains' holds 'A0A2W5CTJ' while the Members list in
        # 'comments' ends in 'A0A2W5CTJ1' -- presumably this mirrors what
        # retrieve_id currently returns for this header; confirm.
        expected = {
            'mainid': 'U1GGI9',
            'chains': {'A0A239WXC9', 'A0A2W5CTJ', 'U1GGI9', 'U7JT03'},
            'seqid': '39',
            'source': 'UniClust90_2018_08',
            'comments': 'Representative=U1GGI9 n=4 Descriptions=[ATP-dependent Clp protease adapter protein ClpS|ATP-dependent Clp protease adapter ClpS] Members=U7JT03,A0A239WXC9,U1GGI9,A0A2W5CTJ1',
        }
        header = _HEADERS_LIST.splitlines()[15]

        observed = cit.retrieve_id(header)

        self.assertDictEqual(observed, expected)
Esempio n. 18
0
    def test_retrieve_id_13(self):
        """Check the tags retrieve_id extracts from line index 12 of _HEADERS_LIST."""
        chain_ids = {
            'AA', 'AC', 'AE', 'AG', 'AI', 'AK', 'AM', 'AO', 'AQ', 'AS',
            'AU', 'BA', 'BC', 'BE', 'BG', 'BI', 'BK', 'BM', 'BO', 'BQ',
            'BS', 'BU', 'CA', 'CC', 'CE', 'CG', 'CI', 'CK', 'CM', 'CO',
            'CQ', 'CS', 'CU', 'DA', 'DC', 'DE', 'DG', 'DI', 'DK', 'DM',
            'DO', 'DQ', 'DS', 'DU'
        }
        expected = dict(
            mainid='6DWU',
            chains=chain_ids,
            seqid='1',
            source='RCSB PDB',
            comments='Cationic trypsin|Bos taurus (9913)',
        )
        header = _HEADERS_LIST.splitlines()[12]

        observed = cit.retrieve_id(header)

        self.assertDictEqual(observed, expected)
Esempio n. 19
0
    def __init__(self,
                 seqid=None,
                 oligomer=None,
                 seq=None,
                 chains=None,
                 source=None,
                 header=None,
                 biotype=None,
                 extrainfo=None):
        """Initialise the record from explicit arguments and/or a header.

        When ``header`` is given it is stored in ``source_headers`` and
        decoded with ``retrieve_id``; any argument left as ``None`` is then
        filled from the decoded header when the matching key is present.
        Explicit arguments always take precedence over header values.

        :param seqid: Sequence identifier; an int is converted to str.
            Without ``seqid`` and without decoded header info, '1' is used.
        :type seqid: str or int, optional
        :param oligomer: Oligomer identifier.
        :type oligomer: str, optional
        :param seq: Main sequence; an empty string is stored if omitted.
        :type seq: str, optional
        :param chains: Chain identifiers.
        :type chains: set [str], optional
        :param source: Source of the sequence.
        :type source: str, optional
        :param header: Original header line.
        :type header: str, optional
        :param biotype: Biological type; 'guess' (any case) delegates to
            ``guess_type(seq)``.
        :type biotype: str, optional
        :param extrainfo: Free-text information stored as ``infostring``.
        :type extrainfo: str, optional

        :raises TypeError: When an argument has an unsupported type.

        """
        # Attribute defaults.
        self.oligomer_id = None
        self.name = None
        self.chains = set()
        self.source = None
        self.source_headers = []
        self.crops_header = None
        self.seqs = {}
        self.biotype = None
        self.infostring = None
        self.cropmap = None
        self.cropbackmap = None
        self.msa = None
        self.cropmsa = None
        self.intervals = None

        # Decode the header, if any; a decoding failure is only a warning.
        parsed = None
        if header is not None:
            if not isinstance(header, str):
                logging.critical("Argument 'header' should be a string.")
                raise TypeError
            self.source_headers.append(header)
            try:
                parsed = retrieve_id(header)
            except Exception:
                logging.warning(
                    'Header format not recognised. Information not extracted.'
                )
                parsed = None

        # Sequence name.
        if seqid is None:
            if parsed is None:
                self.name = '1'
            elif 'seqid' in parsed:
                self.name = parsed['seqid']
        elif isinstance(seqid, str):
            self.name = seqid
        elif isinstance(seqid, int):
            self.name = str(seqid)
        else:
            logging.critical("Sequence ID 'seqid' should be a string.")
            raise TypeError

        # Main sequence.
        if seq is None:
            self.seqs['mainseq'] = ''
        elif isinstance(seq, str):
            self.seqs['mainseq'] = seq
        else:
            logging.critical("Chain sequence 'seq' should be a string.")
            raise TypeError

        # Oligomer identifier.
        if oligomer is None:
            if parsed is not None and 'mainid' in parsed:
                self.oligomer_id = parsed['mainid']
        elif isinstance(oligomer, str):
            self.oligomer_id = oligomer
        else:
            logging.critical("Oligomer ID 'oligomer' should be a string.")
            raise TypeError

        # Chain identifiers.
        if chains is None:
            if parsed is not None and 'chains' in parsed:
                # The set from the decoded header is used as is (not copied).
                self.chains = parsed['chains']
        elif isinstance(chains, set):
            for chain_id in chains:
                if not isinstance(chain_id, str):
                    logging.critical(
                        "Chain IDs in 'chains' set should be strings.")
                    raise TypeError
                self.chains.add(chain_id)
        else:
            logging.critical(
                "Argument 'chains' should be a set of strings.")
            raise TypeError

        # Source.
        if source is None:
            if parsed is not None and 'source' in parsed:
                self.source = parsed['source']
        elif isinstance(source, str):
            self.source = source
        else:
            logging.critical("Argument 'source' should be a string.")
            raise TypeError

        # Biological type; guess_type receives the raw 'seq' argument.
        if biotype is None:
            self.biotype = None
        elif biotype.lower() == 'guess':
            self.biotype = guess_type(seq)
        else:
            self.biotype = biotype

        # Extra information.
        if extrainfo is None:
            if parsed is None:
                self.infostring = ""
            elif 'comments' in parsed:
                self.infostring = parsed['comments']
        elif isinstance(extrainfo, str):
            self.infostring = extrainfo
        else:
            logging.critical("Argument 'extrainfo' should be a string.")
            raise TypeError

        # 'NOID' is used whenever the 'oligomer' argument was omitted, even
        # if an oligomer ID was taken from the header.
        header_id = 'NOID' if oligomer is None else self.oligomer_id
        self.crops_header = makeheader(mainid=header_id,
                                       seqid=self.name,
                                       chains=self.chains,
                                       source=self.source,
                                       extrainfo=self.infostring)
Esempio n. 20
0
def parseseq(instream, inset=None):
    """Parse sequence(s).

    Lines starting with ``>`` are headers, decoded with ``retrieve_id``.
    Lines starting with ``#`` or `` #`` are comments and blank lines are
    skipped. All other lines following a header are concatenated into that
    record's sequence. Each record is stored exactly once, when the next
    header or the end of the input is reached.

    :param instream: Imported-to-string sequence file content (fasta format).
    :type instream: str
    :param inset: Sequence IDs to return, if None it returns them all, defaults to None.
        The given IDs are upper-cased before being compared with the
        'mainid' decoded from each header.
    :type inset: set or dict or str, optional

    :raises TypeError: When instream is not a string; when inset is not a
        string, dictionary or set; or when an element of inset is not a string.

    :return: Parsed sequences.
    :rtype: dict [str, :class:`crops.elements.sequences.oligoseq`]

    """
    if not isinstance(instream, str):
        logging.critical('Input argument instream should be a string.')
        raise TypeError

    upperset = None
    if inset is not None:
        if isinstance(inset, str):
            inset = {inset}
        elif not isinstance(inset, (dict, set)):
            logging.critical('Input argument inset should be a set or, '
                             'alternatively a string or a dictionary.')
            raise TypeError
        upperset = set()
        for element in inset:
            if not isinstance(element, str):
                logging.critical('Elements in inset should be strings.')
                raise TypeError
            upperset.add(element.upper())

    newseqs = {}
    newid = None
    head = ''
    pieces = []
    ignore = False

    def _store():
        """Add the current record to its oligoseq, creating it if needed."""
        if newid['mainid'] not in newseqs:
            newseqs[newid['mainid']] = oligoseq(oligomer_id=newid['mainid'])
        aseq = sequence(seqid=newid['seqid'],
                        oligomer=newid['mainid'],
                        seq=''.join(pieces),
                        chains=newid['chains'],
                        source=newid['source'],
                        header=head,
                        extrainfo=newid['comments'])
        newseqs[newid['mainid']].add_sequence(aseq)

    for raw in instream.splitlines():
        line = raw.rstrip()
        # Blank lines no longer trigger storage: doing so added the same
        # record again at the next header or at the end of the input.
        if not line or line.startswith(('#', ' #')):
            continue

        if line.startswith('>'):
            if newid is not None and not ignore:
                _store()
            newid = retrieve_id(line)
            head = line
            pieces = []
            if upperset is not None:
                ignore = newid['mainid'] not in upperset
        elif not ignore:
            # Text found before the first header is dropped when that
            # header resets 'pieces'.
            pieces.append(line)

    # The last record has no following header to trigger its storage.
    if newid is not None and not ignore:
        _store()

    return newseqs