예제 #1
0
                experimental_sequence_aln = experimental_sequence_aln[
                    0:len(uniprot_sequence)]
                experimental_sequence_aln_conflicts = experimental_sequence_aln_conflicts[
                    0:len(uniprot_sequence)]

            #print ''.join(experimental_sequence_aln_conflicts)

            # Now add the various sequence data to kinDB.
            # Collapse both aligned sequences into plain strings
            # (presumably they are lists of per-residue characters at this
            # point, since they are sliced above and joined here - confirm).
            experimental_sequence_aln = ''.join(experimental_sequence_aln)
            experimental_sequence_aln_conflicts = ''.join(
                experimental_sequence_aln_conflicts)
            #print k, pdbid, chainid, len(experimental_sequence), len(observed_sequence)
            #print k, pdbid, chainid, experimental_sequence_aln
            # <experimental> child of the chain node. It carries the raw
            # experimental sequence, that sequence's length as an attribute, and
            # the two aligned variants. Every text payload is a newline followed
            # by the seqwrap() output.
            exp = etree.SubElement(chain_node, 'experimental')
            etree.SubElement(
                exp, 'sequence').text = '\n' + seqwrap(experimental_sequence)
            # 'length' is the length of the unaligned experimental sequence
            exp.set('length', str(len(experimental_sequence)))
            etree.SubElement(exp, 'sequence_aln'
                             ).text = '\n' + seqwrap(experimental_sequence_aln)
            etree.SubElement(
                exp, 'sequence_aln_conflicts'
            ).text = '\n' + seqwrap(experimental_sequence_aln_conflicts)
            # <observed> child of the chain node, holding the observed sequence
            obs = etree.SubElement(chain_node, 'observed')
            etree.SubElement(
                obs, 'sequence').text = '\n' + seqwrap(observed_sequence)

            #if pdbid == '2W1C':
            #if pdbid == '3LAU':
            #if pdbid == '1O6L':
            if pdbid == '3O50':
                #sys.exit()
예제 #2
0
        # Build a <pk_domain> element from the node x.
        # NOTE(review): x and x_iter are bound outside this excerpt, presumably
        # by an enclosing loop over domain features - confirm.
        pk_description = x.get('description')
        # Domain boundaries come from the 'position' attribute of the
        # location/begin and location/end children of x.
        pk_begin = int( x.find('./location/begin').attrib['position'] )
        pk_end = int( x.find('./location/end').attrib['position'] )
        # Both boundaries are treated as inclusive, hence the +1
        pk_length = pk_end - pk_begin + 1
        #PK_domain = deepcopy(x)
        PK_domain = etree.Element('pk_domain')
        PK_domain.set('description', pk_description)
        PK_domain.set('begin', str(pk_begin))
        PK_domain.set('end', str(pk_end))
        PK_domain.set('length', str(pk_length))
        PK_domain.set('id', str(x_iter))
        # kinDB_id has the form <entry_name>_<AC>_PK<x_iter>
        PK_domain.set('kinDB_id', (entry_name + '_' + AC + '_PK' + str(x_iter)))

        #location = PK_domain.find('./location')
        #etree.SubElement(location, 'length').text = str(pk_length)
        # Convert the 1-based inclusive [pk_begin, pk_end] range to a 0-based
        # half-open Python slice of the full sequence, then wrap it.
        domain_sequence = seqwrap(sequence[pk_begin-1:pk_end])
        etree.SubElement(PK_domain, 'sequence').text = '\n' + domain_sequence
        # Attach the finished domain element to kinase_uniprot
        kinase_uniprot.append(PK_domain)

    # = References to other DBs =
    # NCBI Gene: collect the 'id' attribute of every dbReference child of
    # uniprot_kinases[k] whose type is "GeneID".
    GeneIDs = [x.get('id') for x in uniprot_kinases[k].findall('./dbReference[@type="GeneID"]')]
    # XXX: exceptions for kinases which have no GeneIDs annotated; LMTK3 RefSeq status is PROVISIONAL; RIPK4 presumably RefSeq sequence is not an exact match; SIK3 RefSeq status is VALIDATED
    # Will add these manually, since we are mainly using GeneID to collect publications currently
    # Each override below replaces the whole list with a single hard-coded
    # GeneID for that entry name.
    if entry_name == 'LMTK3_HUMAN':
        GeneIDs = ['114783']
    if entry_name == 'RIPK4_HUMAN':
        GeneIDs = ['54101']
    if entry_name == 'SIK3_HUMAN':
        GeneIDs = ['23387']
    if len(GeneIDs) > 0:
예제 #3
0
# =================================
# Output templates.fa and templates-resnums.txt
# =================================
templates_filtered = templates.xpath('template[not(@DELETE_ME="")]')

with open(templates_fa_filename, 'w') as templates_fa_file:
    with open(templates_resnums_filename, 'w') as templates_resnums_file:
        for t in range(ntemplates_filtered):
            template = templates_filtered[t]
            template_id = template.get('template_id')
            chainid = template.get('pk_chainid_pdb')
            sequence = template.get('pk_domain_observed_resnames')
            resnums = template.get('pk_domain_observed_uniprot_resnums')

            template_header = '>' + template_id + '\n'
            template_fa_string = template_header + seqwrap(sequence)
            template_resnums_string = template_header + resnums + '\n'

            templates_fa_file.write(template_fa_string)
            templates_resnums_file.write(template_resnums_string)

# =================================
# Some stats
# =================================
template_ACs = [x.get('uniprotAC') for x in templates_filtered]
nkinases_with_pk_pdb = len(set(template_ACs))

print 'Total number of pdb chains:', npdb_chains
print '(Number of templates created before filtering: ' + str(ntemplates) + ')'
print 'Total number of templates created:', ntemplates_filtered
print 'Number of kinases with at least one template:', nkinases_with_pk_pdb
        # For every pk_domain element: record its kinDB_id as a target name and
        # append a '>id' header plus the wrapped domain sequence to `contents`.
        for pk_domain in pk_domains:
            target_id = pk_domain.get('kinDB_id')
            # Stored sequence text is passed through sequnwrap() before use
            pk_domain_sequence = sequnwrap(pk_domain.findtext('sequence'))
            len_pk_domain = int(pk_domain.get('length'))

            # XXX XXX XXX IMPORTANT: overriding Abl1 sequence so that it includes all of helix I (up to residue 513). Eventually will come up with a better automated method for determining domain boundaries
            # The override slices the parent element's full sequence; [241:513]
            # is 0-based half-open, i.e. residues 242..513 in 1-based numbering.
            if target_id == 'ABL1_HUMAN_P00519_PK0':
                pk_domain_sequence = sequnwrap(pk_domain.getparent().findtext('sequence'))[241:513]
                len_pk_domain = len(pk_domain_sequence)
        
            # Set target name.
            target_ids.append(target_id)

            # Write alignment file entry.
            contents +=  ">%s\n" % target_id
            contents += seqwrap(pk_domain_sequence)

            # Optional progress output: id right-aligned in 24 columns
            if verbose:
                print "%24s : %s" % (target_id, pk_domain_sequence)

        # Mutants
        mutants = kinDB[k].findall('mutants/mutant')
        for mutant in mutants:
            # XXX Skipping these for now - don't have a stable system for assigning IDs yet
            continue
            mut_pk_domain_id = mutant.get('pk_domain_id')
            mutated_full_sequence = list( sequnwrap( kuniprot.find('sequence').text ) )
            pk_domain_begin = int( kuniprot.find('pk_domain[@id="%s"]' % mut_pk_domain_id).get('begin') )
            pk_domain_end = int( kuniprot.find('pk_domain[@id="%s"]' % mut_pk_domain_id).get('end') )
            # XXX IMPORTANT: override Abl1 sequence
            if target_id == 'ABL1_HUMAN_P00519_PK0':
예제 #5
0
if __name__ == '__main__':
    krange = range(nkinases)
    pool = Pool()
    results = pool.map(gather_pdb, krange)
    #results = map(gather_pdb, krange)   # serial version, for debugging
    for k in krange:
        pdb_nodes = kinDB[k].findall('pk_pdb')
        for p in range(len(pdb_nodes)):
            chain_nodes = pdb_nodes[p].findall('chain')
            for c in range(len(chain_nodes)):
                DELETE_ME = results[k][p][c][6]
                if DELETE_ME:
                    chain_nodes[c].set('DELETE_ME','')
                exp = etree.SubElement(chain_nodes[c], 'experimental')
                etree.SubElement(exp, 'sequence').text = '\n' + seqwrap(results[k][p][c][0])
                exp.set('length', str(len(results[k][p][c][0])))
                #etree.SubElement(exp, 'sequence_aln').text = '\n' + seqwrap(results[k][p][c][1]) # NOTE: this is no longer added to the database
                etree.SubElement(exp, 'sequence_aln_conflicts').text = '\n' + seqwrap(results[k][p][c][2])
                obs = etree.SubElement(chain_nodes[c], 'observed')
                etree.SubElement(obs, 'sequence_aln_exp').text = '\n' + seqwrap(results[k][p][c][3])
                etree.SubElement(obs, 'sequence_aln').text = '\n' + seqwrap(results[k][p][c][4])
                etree.SubElement(obs, 'ss_aln').text = '\n' + seqwrap(results[k][p][c][5])
            # Expression data
            expression_data = results[k][p][-1]
            if verbose: print expression_data
            expression_data_node = etree.Element('expression_data')
            for e in expression_data.keys():
                expression_data_node.set(e, expression_data[e])
            pdb_nodes[p].insert(0, expression_data_node)
                
            # Record one pk_domain as a target: remember its kinDB_id and append
            # a '>id' header plus the wrapped domain sequence to `contents`.
            # NOTE(review): pk_domain is not bound anywhere in the lines directly
            # above; presumably it comes from an enclosing loop outside this
            # excerpt - confirm.
            target_id = pk_domain.get('kinDB_id')
            # Stored sequence text is passed through sequnwrap() before use
            pk_domain_sequence = sequnwrap(pk_domain.findtext('sequence'))
            len_pk_domain = int(pk_domain.get('length'))

            # XXX XXX XXX IMPORTANT: overriding Abl1 sequence so that it includes all of helix I (up to residue 513). Eventually will come up with a better automated method for determining domain boundaries
            # The override slices the parent element's full sequence; [241:513]
            # is 0-based half-open, i.e. residues 242..513 in 1-based numbering.
            if target_id == 'ABL1_HUMAN_P00519_PK0':
                pk_domain_sequence = sequnwrap(
                    pk_domain.getparent().findtext('sequence'))[241:513]
                len_pk_domain = len(pk_domain_sequence)

            # Set target name.
            target_ids.append(target_id)

            # Write alignment file entry.
            contents += ">%s\n" % target_id
            contents += seqwrap(pk_domain_sequence)

            # Optional progress output: id right-aligned in 24 columns
            if verbose:
                print "%24s : %s" % (target_id, pk_domain_sequence)

        # Mutants
        mutants = kinDB[k].findall('mutants/mutant')
        for mutant in mutants:
            # XXX Skipping these for now - don't have a stable system for assigning IDs yet
            continue
            mut_pk_domain_id = mutant.get('pk_domain_id')
            mutated_full_sequence = list(
                sequnwrap(kuniprot.find('sequence').text))
            pk_domain_begin = int(
                kuniprot.find('pk_domain[@id="%s"]' %
                              mut_pk_domain_id).get('begin'))
예제 #7
0
                i += 1

            # In cases such as 3LAU and 1O6L, additional sequence at end makes experimental_sequence_aln longer than uniprot_sequence by 1
            # The slices below can only shorten: if the alignment is shorter
            # than uniprot_sequence, both sequences are left unchanged.
            if len(experimental_sequence_aln) != len(uniprot_sequence):
                experimental_sequence_aln = experimental_sequence_aln[0 : len(uniprot_sequence)]
                experimental_sequence_aln_conflicts = experimental_sequence_aln_conflicts[0 : len(uniprot_sequence)]

            # print ''.join(experimental_sequence_aln_conflicts)

            # Now add the various sequence data to kinDB.
            # Collapse both aligned sequences into plain strings
            # (presumably they are lists of per-residue characters at this
            # point, since they are sliced above and joined here - confirm).
            experimental_sequence_aln = "".join(experimental_sequence_aln)
            experimental_sequence_aln_conflicts = "".join(experimental_sequence_aln_conflicts)
            # print k, pdbid, chainid, len(experimental_sequence), len(observed_sequence)
            # print k, pdbid, chainid, experimental_sequence_aln
            # <experimental> child of the chain node. It carries the raw
            # experimental sequence, that sequence's length as an attribute, and
            # the two aligned variants. Every text payload is a newline followed
            # by the seqwrap() output.
            exp = etree.SubElement(chain_node, "experimental")
            etree.SubElement(exp, "sequence").text = "\n" + seqwrap(experimental_sequence)
            exp.set("length", str(len(experimental_sequence)))
            etree.SubElement(exp, "sequence_aln").text = "\n" + seqwrap(experimental_sequence_aln)
            etree.SubElement(exp, "sequence_aln_conflicts").text = "\n" + seqwrap(experimental_sequence_aln_conflicts)
            # <observed> child of the chain node, holding the observed sequence
            obs = etree.SubElement(chain_node, "observed")
            etree.SubElement(obs, "sequence").text = "\n" + seqwrap(observed_sequence)

            # NOTE(review): the branch below does nothing at runtime (its body
            # is `pass`); it looks like a leftover debugging breakpoint for a
            # specific PDB entry.
            # if pdbid == '2W1C':
            # if pdbid == '3LAU':
            # if pdbid == '1O6L':
            if pdbid == "3O50":
                # sys.exit()
                pass

        # Only add if the chain matches that in the kinDB
# =================================
# Output templates.fa and templates-resnums.txt
# =================================
templates_filtered = templates.xpath('template[not(@DELETE_ME="")]')

with open(templates_fa_filename, 'w') as templates_fa_file:
    with open(templates_resnums_filename, 'w') as templates_resnums_file:
        for t in range(ntemplates_filtered):
            template = templates_filtered[t]
            template_id = template.get('template_id')
            chainid = template.get('pk_chainid_pdb')
            sequence = template.get('pk_domain_observed_resnames')
            resnums = template.get('pk_domain_observed_uniprot_resnums')

            template_header = '>' + template_id + '\n'
            template_fa_string = template_header + seqwrap(sequence)
            template_resnums_string = template_header + resnums + '\n'

            templates_fa_file.write(template_fa_string)
            templates_resnums_file.write(template_resnums_string)

# =================================
# Some stats
# =================================
template_ACs = [ x.get('uniprotAC') for x in templates_filtered ]
nkinases_with_pk_pdb = len(set(template_ACs ))

print 'Total number of pdb chains:', npdb_chains
print '(Number of templates created before filtering: ' + str(ntemplates) + ')'
print 'Total number of templates created:', ntemplates_filtered
print 'Number of kinases with at least one template:', nkinases_with_pk_pdb