experimental_sequence_aln = experimental_sequence_aln[ 0:len(uniprot_sequence)] experimental_sequence_aln_conflicts = experimental_sequence_aln_conflicts[ 0:len(uniprot_sequence)] #print ''.join(experimental_sequence_aln_conflicts) # Now add the various sequence data to kinDB experimental_sequence_aln = ''.join(experimental_sequence_aln) experimental_sequence_aln_conflicts = ''.join( experimental_sequence_aln_conflicts) #print k, pdbid, chainid, len(experimental_sequence), len(observed_sequence) #print k, pdbid, chainid, experimental_sequence_aln exp = etree.SubElement(chain_node, 'experimental') etree.SubElement( exp, 'sequence').text = '\n' + seqwrap(experimental_sequence) exp.set('length', str(len(experimental_sequence))) etree.SubElement(exp, 'sequence_aln' ).text = '\n' + seqwrap(experimental_sequence_aln) etree.SubElement( exp, 'sequence_aln_conflicts' ).text = '\n' + seqwrap(experimental_sequence_aln_conflicts) obs = etree.SubElement(chain_node, 'observed') etree.SubElement( obs, 'sequence').text = '\n' + seqwrap(observed_sequence) #if pdbid == '2W1C': #if pdbid == '3LAU': #if pdbid == '1O6L': if pdbid == '3O50': #sys.exit()
pk_description = x.get('description') pk_begin = int( x.find('./location/begin').attrib['position'] ) pk_end = int( x.find('./location/end').attrib['position'] ) pk_length = pk_end - pk_begin + 1 #PK_domain = deepcopy(x) PK_domain = etree.Element('pk_domain') PK_domain.set('description', pk_description) PK_domain.set('begin', str(pk_begin)) PK_domain.set('end', str(pk_end)) PK_domain.set('length', str(pk_length)) PK_domain.set('id', str(x_iter)) PK_domain.set('kinDB_id', (entry_name + '_' + AC + '_PK' + str(x_iter))) #location = PK_domain.find('./location') #etree.SubElement(location, 'length').text = str(pk_length) domain_sequence = seqwrap(sequence[pk_begin-1:pk_end]) etree.SubElement(PK_domain, 'sequence').text = '\n' + domain_sequence kinase_uniprot.append(PK_domain) # = References to other DBs = # NCBI Gene GeneIDs = [x.get('id') for x in uniprot_kinases[k].findall('./dbReference[@type="GeneID"]')] # XXX: exceptions for kinases which have no GeneIDs annotated; LMTK3 RefSeq status is PROVISIONAL; RIPK4 presumably RefSeq sequence is not an exact match; SIK3 RefSeq status is VALIDATED # Will add these manually, since we are mainly using GeneID to collect publications currently if entry_name == 'LMTK3_HUMAN': GeneIDs = ['114783'] if entry_name == 'RIPK4_HUMAN': GeneIDs = ['54101'] if entry_name == 'SIK3_HUMAN': GeneIDs = ['23387'] if len(GeneIDs) > 0:
# =================================
# Output templates.fa and templates-resnums.txt
# =================================

# Keep every <template> child except those whose DELETE_ME attribute is the
# empty string (the predicate rejects only that exact attribute value).
templates_filtered = templates.xpath('template[not(@DELETE_ME="")]')

# Each template contributes one record to each file; both records share the
# same '>template_id' header line.
with open(templates_fa_filename, 'w') as templates_fa_file, \
        open(templates_resnums_filename, 'w') as templates_resnums_file:
    # NOTE(review): the loop bound is ntemplates_filtered, which is set
    # outside this section, rather than len(templates_filtered); presumably
    # the two agree - confirm against the code that counts the templates.
    for t in range(ntemplates_filtered):
        template = templates_filtered[t]
        template_id = template.get('template_id')
        sequence = template.get('pk_domain_observed_resnames')
        resnums = template.get('pk_domain_observed_uniprot_resnums')

        # Build both records before writing either one, so a missing
        # attribute fails before anything is written for this template.
        template_header = '>' + template_id + '\n'
        template_fa_string = template_header + seqwrap(sequence)
        template_resnums_string = template_header + resnums + '\n'

        templates_fa_file.write(template_fa_string)
        templates_resnums_file.write(template_resnums_string)

# =================================
# Some stats
# =================================

# Distinct uniprotAC values among the kept templates give the number of
# kinases that ended up with at least one template.
template_ACs = [x.get('uniprotAC') for x in templates_filtered]
nkinases_with_pk_pdb = len(set(template_ACs))

# A single parenthesised argument prints the same text whether `print` is the
# Python 2 statement or the Python 3 function.
print('Total number of pdb chains: ' + str(npdb_chains))
print('(Number of templates created before filtering: ' + str(ntemplates) + ')')
print('Total number of templates created: ' + str(ntemplates_filtered))
print('Number of kinases with at least one template: ' + str(nkinases_with_pk_pdb))
for pk_domain in pk_domains: target_id = pk_domain.get('kinDB_id') pk_domain_sequence = sequnwrap(pk_domain.findtext('sequence')) len_pk_domain = int(pk_domain.get('length')) # XXX XXX XXX IMPORTANT: overriding Abl1 sequence so that it includes all of helix I (up to residue 513). Eventually will come up with a better automated method for determining domain boundaries if target_id == 'ABL1_HUMAN_P00519_PK0': pk_domain_sequence = sequnwrap(pk_domain.getparent().findtext('sequence'))[241:513] len_pk_domain = len(pk_domain_sequence) # Set target name. target_ids.append(target_id) # Write alignment file entry. contents += ">%s\n" % target_id contents += seqwrap(pk_domain_sequence) if verbose: print "%24s : %s" % (target_id, pk_domain_sequence) # Mutants mutants = kinDB[k].findall('mutants/mutant') for mutant in mutants: # XXX Skipping these for now - don't have a stable system for assigning IDs yet continue mut_pk_domain_id = mutant.get('pk_domain_id') mutated_full_sequence = list( sequnwrap( kuniprot.find('sequence').text ) ) pk_domain_begin = int( kuniprot.find('pk_domain[@id="%s"]' % mut_pk_domain_id).get('begin') ) pk_domain_end = int( kuniprot.find('pk_domain[@id="%s"]' % mut_pk_domain_id).get('end') ) # XXX IMPORTANT: override Abl1 sequence if target_id == 'ABL1_HUMAN_P00519_PK0':
if __name__ == '__main__':
    krange = range(nkinases)

    # Run gather_pdb once per kinase index across worker processes.
    # map() only returns after every result has been collected, so the
    # workers can be terminated and reaped immediately afterwards; doing it
    # in `finally` also cleans them up when map() raises.
    pool = Pool()
    try:
        results = pool.map(gather_pdb, krange)
    finally:
        pool.terminate()
        pool.join()
    #results = map(gather_pdb, krange) # serial version, for debugging

    for k in krange:
        pdb_nodes = kinDB[k].findall('pk_pdb')
        for p, pdb_node in enumerate(pdb_nodes):
            chain_nodes = pdb_node.findall('chain')
            # results[k][p] is indexed per chain below, and its last element
            # is read as the expression-data mapping.
            pdb_result = results[k][p]

            for c, chain_node in enumerate(chain_nodes):
                # Per-chain entry layout, as consumed here:
                #   [0] experimental sequence
                #   [1] experimental sequence_aln (no longer stored)
                #   [2] experimental sequence_aln_conflicts
                #   [3] observed sequence_aln_exp
                #   [4] observed sequence_aln
                #   [5] observed ss_aln
                #   [6] DELETE_ME flag
                chain_result = pdb_result[c]

                # Flag the chain with an empty DELETE_ME attribute; the
                # sequence data is still attached either way.
                if chain_result[6]:
                    chain_node.set('DELETE_ME', '')

                exp = etree.SubElement(chain_node, 'experimental')
                etree.SubElement(exp, 'sequence').text = '\n' + seqwrap(chain_result[0])
                exp.set('length', str(len(chain_result[0])))
                # NOTE: sequence_aln (chain_result[1]) is no longer added to the database
                etree.SubElement(exp, 'sequence_aln_conflicts').text = '\n' + seqwrap(chain_result[2])

                obs = etree.SubElement(chain_node, 'observed')
                etree.SubElement(obs, 'sequence_aln_exp').text = '\n' + seqwrap(chain_result[3])
                etree.SubElement(obs, 'sequence_aln').text = '\n' + seqwrap(chain_result[4])
                etree.SubElement(obs, 'ss_aln').text = '\n' + seqwrap(chain_result[5])

            # Expression data
            expression_data = pdb_result[-1]
            if verbose:
                # Single-argument form prints identically on Python 2 and 3.
                print(expression_data)
            expression_data_node = etree.Element('expression_data')
            for key, value in expression_data.items():
                expression_data_node.set(key, value)
            # Insert as the first child of the pk_pdb node.
            pdb_node.insert(0, expression_data_node)
target_id = pk_domain.get('kinDB_id') pk_domain_sequence = sequnwrap(pk_domain.findtext('sequence')) len_pk_domain = int(pk_domain.get('length')) # XXX XXX XXX IMPORTANT: overriding Abl1 sequence so that it includes all of helix I (up to residue 513). Eventually will come up with a better automated method for determining domain boundaries if target_id == 'ABL1_HUMAN_P00519_PK0': pk_domain_sequence = sequnwrap( pk_domain.getparent().findtext('sequence'))[241:513] len_pk_domain = len(pk_domain_sequence) # Set target name. target_ids.append(target_id) # Write alignment file entry. contents += ">%s\n" % target_id contents += seqwrap(pk_domain_sequence) if verbose: print "%24s : %s" % (target_id, pk_domain_sequence) # Mutants mutants = kinDB[k].findall('mutants/mutant') for mutant in mutants: # XXX Skipping these for now - don't have a stable system for assigning IDs yet continue mut_pk_domain_id = mutant.get('pk_domain_id') mutated_full_sequence = list( sequnwrap(kuniprot.find('sequence').text)) pk_domain_begin = int( kuniprot.find('pk_domain[@id="%s"]' % mut_pk_domain_id).get('begin'))
i += 1 # In cases such as 3LAU and 1O6L, additional sequence at end makes experimental_sequence_aln longer than uniprot_sequence by 1 if len(experimental_sequence_aln) != len(uniprot_sequence): experimental_sequence_aln = experimental_sequence_aln[0 : len(uniprot_sequence)] experimental_sequence_aln_conflicts = experimental_sequence_aln_conflicts[0 : len(uniprot_sequence)] # print ''.join(experimental_sequence_aln_conflicts) # Now add the various sequence data to kinDB experimental_sequence_aln = "".join(experimental_sequence_aln) experimental_sequence_aln_conflicts = "".join(experimental_sequence_aln_conflicts) # print k, pdbid, chainid, len(experimental_sequence), len(observed_sequence) # print k, pdbid, chainid, experimental_sequence_aln exp = etree.SubElement(chain_node, "experimental") etree.SubElement(exp, "sequence").text = "\n" + seqwrap(experimental_sequence) exp.set("length", str(len(experimental_sequence))) etree.SubElement(exp, "sequence_aln").text = "\n" + seqwrap(experimental_sequence_aln) etree.SubElement(exp, "sequence_aln_conflicts").text = "\n" + seqwrap(experimental_sequence_aln_conflicts) obs = etree.SubElement(chain_node, "observed") etree.SubElement(obs, "sequence").text = "\n" + seqwrap(observed_sequence) # if pdbid == '2W1C': # if pdbid == '3LAU': # if pdbid == '1O6L': if pdbid == "3O50": # sys.exit() pass # Only add if the chain matches that in the kinDB
# =================================
# Output templates.fa and templates-resnums.txt
# =================================

# Keep every <template> child except those whose DELETE_ME attribute is the
# empty string (the predicate rejects only that exact attribute value).
templates_filtered = templates.xpath('template[not(@DELETE_ME="")]')

# Each template contributes one record to each file; both records share the
# same '>template_id' header line.
with open(templates_fa_filename, 'w') as templates_fa_file, \
        open(templates_resnums_filename, 'w') as templates_resnums_file:
    # NOTE(review): the loop bound is ntemplates_filtered, which is set
    # outside this section, rather than len(templates_filtered); presumably
    # the two agree - confirm against the code that counts the templates.
    for t in range(ntemplates_filtered):
        template = templates_filtered[t]
        template_id = template.get('template_id')
        sequence = template.get('pk_domain_observed_resnames')
        resnums = template.get('pk_domain_observed_uniprot_resnums')

        # Build both records before writing either one, so a missing
        # attribute fails before anything is written for this template.
        template_header = '>' + template_id + '\n'
        template_fa_string = template_header + seqwrap(sequence)
        template_resnums_string = template_header + resnums + '\n'

        templates_fa_file.write(template_fa_string)
        templates_resnums_file.write(template_resnums_string)

# =================================
# Some stats
# =================================

# Distinct uniprotAC values among the kept templates give the number of
# kinases that ended up with at least one template.
template_ACs = [x.get('uniprotAC') for x in templates_filtered]
nkinases_with_pk_pdb = len(set(template_ACs))

# A single parenthesised argument prints the same text whether `print` is the
# Python 2 statement or the Python 3 function.
print('Total number of pdb chains: ' + str(npdb_chains))
print('(Number of templates created before filtering: ' + str(ntemplates) + ')')
print('Total number of templates created: ' + str(ntemplates_filtered))
print('Number of kinases with at least one template: ' + str(nkinases_with_pk_pdb))