def _getPolymers(lines): """Returns list of polymers (macromolecules).""" pdbid = lines['pdbid'] polymers = dict() for i, line in lines['SEQRES']: ch = line[11] poly = polymers.get(ch, Polymer(ch)) polymers[ch] = poly poly.sequence += ''.join(getSequence(line[19:].split())) for i, line in lines['DBREF ']: i += 1 ch = line[12] if ch == ' ': if not len(polymers) == 1: LOGGER.warn('DBREF chain identifier is not specified ' '({0}:{1})'.format(pdbid, i)) continue else: ch = list(polymers)[0] dbabbr = line[26:32].strip() dbref = DBRef() dbref.dbabbr = dbabbr dbref.database = _PDB_DBREF.get(dbabbr, 'Unknown') dbref.accession = line[33:41].strip() dbref.idcode = line[42:54].strip() try: first = int(line[14:18]) except: LOGGER.warn('DBREF for chain {2}: failed to parse ' 'initial sequence number of the PDB sequence ' '({0}:{1})'.format(pdbid, i, ch)) try: last = int(line[20:24]) except: LOGGER.warn('DBREF for chain {2}: failed to parse ' 'ending sequence number of the PDB sequence ' '({0}:{1})'.format(pdbid, i, ch)) try: dbref.first = (first, line[18], int(line[56:60])) except: LOGGER.warn('DBREF for chain {2}: failed to parse ' 'initial sequence number of the database sequence ' '({0}:{1})'.format(pdbid, i, ch)) try: dbref.last = (last, line[24].strip(), int(line[62:67])) except: LOGGER.warn('DBREF for chain {2}: failed to parse ' 'ending sequence number of the database sequence ' '({0}:{1})'.format(pdbid, i, ch)) poly = polymers.get(ch, Polymer(ch)) polymers[ch] = poly poly.dbrefs.append(dbref) dbref1 = lines['DBREF1'] dbref2 = lines['DBREF2'] if len(dbref1) != len(dbref2): LOGGER.warn('DBREF1 and DBREF1 records are not complete') dbref12 = [] else: dbref12 = zip(dbref1, dbref2) # PY3K: OK for dbref1, dbref2 in dbref12: i, line = dbref1 i += 1 ch = line[12] dbabbr = line[26:32].strip() dbref = DBRef() dbref.dbabbr = dbabbr dbref.database = _PDB_DBREF.get(dbabbr, 'Unknown') dbref.idcode = line[47:67].strip() try: first = int(line[14:18]) except: LOGGER.warn('DBREF1 for chain {2}: failed to parse ' 'initial sequence number of the PDB sequence ' '({0}:{1})'.format(pdbid, i, ch)) try: last = int(line[20:24]) except: LOGGER.warn('DBREF1 for chain {2}: failed to parse ' 'ending sequence number of the PDB sequence ' '({0}:{1})'.format(pdbid, i, ch)) i, line = dbref2 i += 1 if line[12] == ' ': LOGGER.warn('DBREF2 chain identifier is not specified ' '({0}:{1})'.format(pdbid, i, ch)) elif line[12] != ch: LOGGER.warn('DBREF1 and DBREF2 chain id mismatch' '({0}:{1})'.format(pdbid, i, ch)) dbref.accession = line[18:40].strip() try: dbref.first = (first, line[18].strip(), int(line[45:55])) except: LOGGER.warn('DBREF2 for chain {2}: failed to parse ' 'initial sequence number of the database sequence ' '({0}:{1})'.format(pdbid, i, ch)) try: dbref.last = (last, line[24].strip(), int(line[57:67])) except: LOGGER.warn('DBREF2 for chain {2}: failed to parse ' 'ending sequence number of the database sequence ' '({0}:{1})'.format(pdbid, i, ch)) poly = polymers.get(ch, Polymer(ch)) polymers[ch] = poly poly.dbrefs.append(dbref) for poly in polymers.values(): # PY3K: OK resnum = [] for dbref in poly.dbrefs: dbabbr = dbref.dbabbr if dbabbr == 'PDB': if not (pdbid == dbref.accession == dbref.idcode): LOGGER.warn('DBREF for chain {2} refers to PDB ' 'entry {3} ({0}:{1})'.format( pdbid, i, ch, dbref.accession)) else: if pdbid == dbref.accession or pdbid == dbref.idcode: LOGGER.warn('DBREF for chain {2} is {3}, ' 'expected PDB ({0}:{1})'.format( pdbid, i, ch, dbabbr)) dbref.database = 'PDB' resnum.append((dbref.first[0], dbref.last[0])) resnum.sort() last = -10000 for first, temp in resnum: if first <= last: LOGGER.warn('DBREF records overlap for chain {0} ({1})'.format( poly.chid, pdbid)) last = temp for i, line in lines['MODRES']: ch = line[16] if ch == ' ': if not len(polymers) == 1: LOGGER.warn('MODRES chain identifier is not specified ' '({0}:{1})'.format(pdbid, i)) continue else: ch = list(polymers)[0] poly = polymers.get(ch, Polymer(ch)) polymers[ch] = poly if poly.modified is None: poly.modified = [] poly.modified.append( (line[12:15].strip(), line[18:22].strip() + line[22].strip(), line[24:27].strip(), line[29:70].strip())) for i, line in lines['SEQADV']: i += 1 ch = line[16] if ch == ' ': if not len(polymers) == 1: LOGGER.warn('MODRES chain identifier is not specified ' '({0}:{1})'.format(pdbid, i)) continue else: ch = list(polymers)[0] poly = polymers.get(ch, Polymer(ch)) polymers[ch] = poly dbabbr = line[24:28].strip() resname = line[12:15].strip() try: resnum = int(line[18:22].strip()) except: continue LOGGER.warn('SEQADV for chain {2}: failed to parse PDB sequence ' 'number ({0}:{1})'.format(pdbid, i, ch)) icode = line[22].strip() try: dbnum = int(line[43:48].strip()) except: continue LOGGER.warn('SEQADV for chain {2}: failed to parse database ' 'sequence number ({0}:{1})'.format(pdbid, i, ch)) comment = line[49:70].strip() match = False for dbref in poly.dbrefs: if not dbref.first[0] <= resnum <= dbref.last[0]: continue match = True if dbref.dbabbr != dbabbr: LOGGER.warn('SEQADV for chain {2}: reference database ' 'mismatch, expected {3} parsed {4} ' '({0}:{1})'.format(pdbid, i, ch, repr(dbref.dbabbr), repr(dbabbr))) continue dbacc = line[29:38].strip() if dbref.accession[:9] != dbacc[:9]: LOGGER.warn('SEQADV for chain {2}: accession code ' 'mismatch, expected {3} parsed {4} ' '({0}:{1})'.format(pdbid, i, ch, repr(dbref.accession), repr(dbacc))) continue dbref.diff.append((resname, resnum, icode, dbnum, dbnum, comment)) if not match: continue LOGGER.warn('SEQADV for chain {2}: database sequence reference ' 'not found ({0}:{1})'.format(pdbid, i, ch)) string = ' '.join([line[10:].strip() for i, line in lines['COMPND']]) if string.startswith('MOL_ID'): dict_ = {} for molecule in string[6:].split('MOL_ID'): dict_.clear() for token in molecule.split(';'): token = token.strip() if not token: continue items = token.split(':', 1) if len(items) == 2: key, value = items dict_[key.strip()] = value.strip() chains = dict_.pop('CHAIN', '').strip() if not chains: continue for ch in chains.split(','): ch = ch.strip() poly = polymers.get(ch, Polymer(ch)) polymers[ch] = poly poly.name = dict_.get('MOLECULE', '') poly.fragment = dict_.get('FRAGMENT', '') poly.comments = dict_.get('OTHER_DETAILS', '') val = dict_.get('SYNONYM', '') poly.synonyms = [s.strip() for s in val.split(',')] if val else [] val = dict_.get('EC', '') poly.ec = [s.strip() for s in val.split(',')] if val else [] poly.engineered = dict_.get('ENGINEERED', '') == 'YES' poly.mutation = dict_.get('MUTATION', '') == 'YES' return list(polymers.values())
def _getPolymers(lines): """Returns list of polymers (macromolecules).""" pdbid = lines['pdbid'] polymers = dict() for i, line in lines['SEQRES']: ch = line[11] poly = polymers.get(ch, Polymer(ch)) polymers[ch] = poly poly.sequence += ''.join(getSequence(line[19:].split())) for i, line in lines['DBREF ']: i += 1 ch = line[12] if ch == ' ': if not len(polymers) == 1: LOGGER.warn('DBREF chain identifier is not specified ' '({0}:{1})'.format(pdbid, i)) continue else: ch = list(polymers)[0] dbabbr = line[26:32].strip() dbref = DBRef() dbref.dbabbr = dbabbr dbref.database = _PDB_DBREF.get(dbabbr, 'Unknown') dbref.accession = line[33:41].strip() dbref.idcode = line[42:54].strip() try: first = int(line[14:18]) except: LOGGER.warn('DBREF for chain {2}: failed to parse ' 'initial sequence number of the PDB sequence ' '({0}:{1})'.format(pdbid, i, ch)) try: last = int(line[20:24]) except: LOGGER.warn('DBREF for chain {2}: failed to parse ' 'ending sequence number of the PDB sequence ' '({0}:{1})'.format(pdbid, i, ch)) try: dbref.first = (first, line[18], int(line[56:60])) except: LOGGER.warn('DBREF for chain {2}: failed to parse ' 'initial sequence number of the database sequence ' '({0}:{1})'.format(pdbid, i, ch)) try: dbref.last = (last, line[24].strip(), int(line[62:67])) except: LOGGER.warn('DBREF for chain {2}: failed to parse ' 'ending sequence number of the database sequence ' '({0}:{1})'.format(pdbid, i, ch)) poly = polymers.get(ch, Polymer(ch)) polymers[ch] = poly poly.dbrefs.append(dbref) dbref1 = lines['DBREF1'] dbref2 = lines['DBREF2'] if len(dbref1) != len(dbref2): LOGGER.warn('DBREF1 and DBREF1 records are not complete') dbref12 = [] else: dbref12 = zip(dbref1, dbref2) # PY3K: OK for dbref1, dbref2 in dbref12: i, line = dbref1 i += 1 ch = line[12] dbabbr = line[26:32].strip() dbref = DBRef() dbref.dbabbr = dbabbr dbref.database = _PDB_DBREF.get(dbabbr, 'Unknown') dbref.idcode = line[47:67].strip() try: first = int(line[14:18]) except: LOGGER.warn('DBREF1 for chain {2}: failed to parse ' 'initial sequence number of the PDB sequence ' '({0}:{1})'.format(pdbid, i, ch)) try: last = int(line[20:24]) except: LOGGER.warn('DBREF1 for chain {2}: failed to parse ' 'ending sequence number of the PDB sequence ' '({0}:{1})'.format(pdbid, i, ch)) i, line = dbref2 i += 1 if line[12] == ' ': LOGGER.warn('DBREF2 chain identifier is not specified ' '({0}:{1})'.format(pdbid, ch)) elif line[12] != ch: LOGGER.warn('DBREF1 and DBREF2 chain id mismatch' '({0}:{1})'.format(pdbid, ch)) dbref.accession = line[18:40].strip() try: dbref.first = (first, line[18].strip(), int(line[45:55])) except: LOGGER.warn('DBREF2 for chain {2}: failed to parse ' 'initial sequence number of the database sequence ' '({0}:{1})'.format(pdbid, i, ch)) try: dbref.last = (last, line[24].strip(), int(line[57:67])) except: LOGGER.warn('DBREF2 for chain {2}: failed to parse ' 'ending sequence number of the database sequence ' '({0}:{1})'.format(pdbid, i, ch)) poly = polymers.get(ch, Polymer(ch)) polymers[ch] = poly poly.dbrefs.append(dbref) for poly in polymers.values(): # PY3K: OK resnum = [] for dbref in poly.dbrefs: dbabbr = dbref.dbabbr if dbabbr == 'PDB': if not (pdbid == dbref.accession == dbref.idcode): LOGGER.warn('DBREF for chain {2} refers to PDB ' 'entry {3} ({0}:{1})' .format(pdbid, i, ch, dbref.accession)) else: if pdbid == dbref.accession or pdbid == dbref.idcode: LOGGER.warn('DBREF for chain {2} is {3}, ' 'expected PDB ({0}:{1})' .format(pdbid, i, ch, dbabbr)) dbref.database = 'PDB' resnum.append((dbref.first[0], dbref.last[0])) resnum.sort() last = -10000 for first, temp in resnum: if first <= last: LOGGER.warn('DBREF records overlap for chain {0} ({1})' .format(poly.chid, pdbid)) last = temp for i, line in lines['MODRES']: ch = line[16] if ch == ' ': if not len(polymers) == 1: LOGGER.warn('MODRES chain identifier is not specified ' '({0}:{1})'.format(pdbid, i)) continue else: ch = list(polymers)[0] poly = polymers.get(ch, Polymer(ch)) polymers[ch] = poly if poly.modified is None: poly.modified = [] poly.modified.append((line[12:15].strip(), line[18:22].strip() + line[22].strip(), line[24:27].strip(), line[29:70].strip())) for i, line in lines['SEQADV']: i += 1 ch = line[16] if ch == ' ': if not len(polymers) == 1: LOGGER.warn('MODRES chain identifier is not specified ' '({0}:{1})'.format(pdbid, i)) continue else: ch = list(polymers)[0] poly = polymers.get(ch, Polymer(ch)) polymers[ch] = poly dbabbr = line[24:28].strip() resname = line[12:15].strip() try: resnum = int(line[18:22].strip()) except: #LOGGER.warn('SEQADV for chain {2}: failed to parse PDB sequence ' # 'number ({0}:{1})'.format(pdbid, i, ch)) continue icode = line[22].strip() try: dbnum = int(line[43:48].strip()) except: #LOGGER.warn('SEQADV for chain {2}: failed to parse database ' # 'sequence number ({0}:{1})'.format(pdbid, i, ch)) continue comment = line[49:70].strip() match = False for dbref in poly.dbrefs: if not dbref.first[0] <= resnum <= dbref.last[0]: continue match = True if dbref.dbabbr != dbabbr: LOGGER.warn('SEQADV for chain {2}: reference database ' 'mismatch, expected {3} parsed {4} ' '({0}:{1})'.format(pdbid, i, ch, repr(dbref.dbabbr), repr(dbabbr))) continue dbacc = line[29:38].strip() if dbref.accession[:9] != dbacc[:9]: LOGGER.warn('SEQADV for chain {2}: accession code ' 'mismatch, expected {3} parsed {4} ' '({0}:{1})'.format(pdbid, i, ch, repr(dbref.accession), repr(dbacc))) continue dbref.diff.append((resname, resnum, icode, dbnum, dbnum, comment)) if not match: LOGGER.warn('SEQADV for chain {2}: database sequence reference ' 'not found ({0}:{1})'.format(pdbid, i, ch)) continue string = ' '.join([line[10:].strip() for i, line in lines['COMPND']]) if string.startswith('MOL_ID'): dict_ = {} for molecule in string[6:].split('MOL_ID'): dict_.clear() for token in molecule.split(';'): token = token.strip() if not token: continue items = token.split(':', 1) if len(items) == 2: key, value = items dict_[key.strip()] = value.strip() chains = dict_.pop('CHAIN', '').strip() if not chains: continue for ch in chains.split(','): ch = ch.strip() poly = polymers.get(ch, Polymer(ch)) polymers[ch] = poly poly.name = dict_.get('MOLECULE', '') poly.fragment = dict_.get('FRAGMENT', '') poly.comments = dict_.get('OTHER_DETAILS', '') val = dict_.get('SYNONYM', '') poly.synonyms = [s.strip() for s in val.split(',') ] if val else [] val = dict_.get('EC', '') poly.ec = [s.strip() for s in val.split(',')] if val else [] poly.engineered = dict_.get('ENGINEERED', '') == 'YES' poly.mutation = dict_.get('MUTATION', '') == 'YES' return list(polymers.values())