def get_bindingdb_data_file(self):
     """
     Make sure a BindingDB data file is in place.

     An existing file at bindingdb_all_data_filepath is kept only when
     self.use_existing_bindingdb_data is set; in every other case a new
     file is retrieved (with decompress=False).
     """
     reuse_local_file = os.path.exists(bindingdb_all_data_filepath) and self.use_existing_bindingdb_data
     if not reuse_local_file:
         logger.info('Retrieving new BindingDB data file from BindingDB server...')
         retrieve_all_BindingDB_data(bindingdb_all_data_filepath, decompress=False)
         return
     logger.info('BindingDB data file found at: {0}'.format(bindingdb_all_data_filepath))
Exemple #2
0
 def setup(self):
     """
     Prepare the UniProt data directory, output paths, the current UTC
     timestamp and the current crawl number on this instance.
     """
     uniprot_dir = os.path.join(external_data_dirpath, 'UniProt')
     self.uniprot_data_dir = uniprot_dir
     # Create the UniProt directory on first use
     if not os.path.exists(uniprot_dir):
         os.mkdir(uniprot_dir)
     self.uniprot_xml_out_filepath = os.path.join(uniprot_dir, 'uniprot-search.xml')
     self.domain_names_filename = 'selected_domain_names.txt'
     self.now = datetime.datetime.utcnow()
     # The current crawl number is read from the first CrawlData row
     self.current_crawl_number = models.CrawlData.query.first().current_crawl_number
     logger.info('Current crawl number: {0}'.format(self.current_crawl_number))
Exemple #3
0
 def get_bindingdb_data_file(self):
     """
     Reuse the local BindingDB data file when it exists and
     use_existing_bindingdb_data is set; otherwise retrieve a new one.
     """
     local_file_present = os.path.exists(bindingdb_all_data_filepath)
     if local_file_present and self.use_existing_bindingdb_data:
         found_message = 'BindingDB data file found at: {0}'.format(bindingdb_all_data_filepath)
         logger.info(found_message)
     else:
         logger.info('Retrieving new BindingDB data file from BindingDB server...')
         # The file is retrieved without decompression
         retrieve_all_BindingDB_data(bindingdb_all_data_filepath, decompress=False)
Exemple #4
0
 def setup(self):
     """
     Set up paths for UniProt data (creating the directory if needed),
     record the current UTC time and look up the current crawl number.
     """
     data_dir = os.path.join(external_data_dirpath, 'UniProt')
     self.uniprot_data_dir = data_dir
     if not os.path.exists(data_dir):
         os.mkdir(data_dir)

     xml_filename = 'uniprot-search.xml'
     self.uniprot_xml_out_filepath = os.path.join(data_dir, xml_filename)
     self.domain_names_filename = 'selected_domain_names.txt'
     self.now = datetime.datetime.utcnow()

     # get current crawl number from the first CrawlData row
     first_crawldata_row = models.CrawlData.query.first()
     self.current_crawl_number = first_crawldata_row.current_crawl_number
     crawl_message = 'Current crawl number: {0}'.format(self.current_crawl_number)
     logger.info(crawl_message)
Exemple #5
0
    def get_uniprot_data(self):
        """
        Load the UniProt XML document into self.uniprot_xml.

        An existing file at self.uniprot_xml_out_filepath is reused only when
        self.use_existing_data is set; otherwise the document is retrieved
        with self.uniprot_query and saved to that path first. Raises
        Exception if the retrieved text is empty.
        """
        xml_path = self.uniprot_xml_out_filepath
        if not (os.path.exists(xml_path) and self.use_existing_data):
            logger.info('Retrieving new XML document from UniProt website.')
            xml_text = retrieve_uniprot(self.uniprot_query)
            if len(xml_text) == 0:
                raise Exception('UniProt search returned no entries.')
            logger.info('Saving new XML document as: {0}'.format(xml_path))
            with open(xml_path, 'w') as uniprot_xml_file:
                uniprot_xml_file.write(xml_text + '\n')
        else:
            logger.info('UniProt XML document found at: {0}'.format(xml_path))

        # Whichever branch ran, the document is parsed from the file on disk
        logger.info('Reading UniProt XML document: {0}'.format(xml_path))
        self.uniprot_xml = etree.parse(xml_path, xml_parser).getroot()
Exemple #6
0
    def get_uniprot_data(self):
        """
        Populate self.uniprot_xml with the root of the UniProt XML document,
        retrieving and saving a new document unless an existing file may be
        reused (file present and self.use_existing_data set).
        """
        out_filepath = self.uniprot_xml_out_filepath
        reuse_existing = os.path.exists(out_filepath) and self.use_existing_data

        if reuse_existing:
            logger.info('UniProt XML document found at: {0}'.format(out_filepath))
        else:
            logger.info('Retrieving new XML document from UniProt website.')
            xml_text = retrieve_uniprot(self.uniprot_query)
            if len(xml_text) == 0:
                raise Exception('UniProt search returned no entries.')
            logger.info('Saving new XML document as: {0}'.format(out_filepath))
            with open(out_filepath, 'w') as uniprot_xml_file:
                uniprot_xml_file.write(xml_text + '\n')

        logger.info('Reading UniProt XML document: {0}'.format(out_filepath))
        parsed_document = etree.parse(out_filepath, xml_parser)
        self.uniprot_xml = parsed_document.getroot()
Exemple #7
0
    def check_all_gather_scripts_have_been_run(self):
        """
        Test whether each of the gather scripts have been run,
        and whether they have been updated in the correct order.

        For every data type, the '<data_type>_datestamp' attribute of
        self.current_crawl_datestamps_row is compared with
        self.safe_crawl_datestamp. A missing datestamp, or one that is not
        strictly newer than the safe-crawl datestamp, counts as a failure.

        Raises DatabaseException if any data type fails. All data types are
        checked (and logged) before the exception is raised.
        """
        data_problem = False
        for data_type in ['uniprot', 'ncbi_gene', 'bindingdb', 'pdb', 'cbioportal', 'chembl']:
            current_crawl_datatype_datestamp = getattr(
                self.current_crawl_datestamps_row, data_type + '_datestamp'
            )
            if current_crawl_datatype_datestamp is None:
                logger.info('data_type "%s" FAIL: no data found in db' % data_type)
                data_problem = True
                continue

            # Compare first, then format: both log messages share the same arguments
            is_stale = current_crawl_datatype_datestamp <= self.safe_crawl_datestamp
            datestamp_format = targetexplorer.core.datestamp_format_string
            message_args = (
                data_type,
                current_crawl_datatype_datestamp.strftime(datestamp_format),
                self.safe_crawl_datestamp.strftime(datestamp_format),
            )
            if is_stale:
                logger.info('data_type "%s" FAIL: current data (%s) is older than or as old as safe-crawl data (%s)' % message_args)
                data_problem = True
            else:
                logger.info('data_type "%s" PASS: current data (%s) is newer than safe-crawl data (%s)' % message_args)

        if data_problem:
            raise DatabaseException('Commit aborted.')
        logger.info('Proceeding to commit to master db...')
Exemple #8
0
    def check_all_gather_scripts_have_been_run(self):
        """
        Test whether each of the gather scripts have been run,
        and whether they have been updated in the correct order.

        Each '<data_type>_datestamp' attribute of
        self.current_crawl_datestamps_row must be present and strictly newer
        than self.safe_crawl_datestamp. Every data type is checked and logged;
        DatabaseException is raised afterwards if any of them failed.
        """
        data_problem = False
        for data_type in ['uniprot', 'ncbi_gene', 'bindingdb', 'pdb', 'cbioportal']:
            datestamp_type = data_type + '_datestamp'
            current_crawl_datatype_datestamp = getattr(self.current_crawl_datestamps_row, datestamp_type)
            if current_crawl_datatype_datestamp is None:
                logger.info('data_type "%s" FAIL: no data found in db' % data_type)
                data_problem = True
                continue

            # The comparison is done before formatting; both messages take the same arguments
            not_newer = current_crawl_datatype_datestamp <= self.safe_crawl_datestamp
            datestamp_format = targetexplorer.core.datestamp_format_string
            message_args = (
                data_type,
                current_crawl_datatype_datestamp.strftime(datestamp_format),
                self.safe_crawl_datestamp.strftime(datestamp_format),
            )
            if not_newer:
                logger.info('data_type "%s" FAIL: current data (%s) is older than or as old as safe-crawl data (%s)' % message_args)
                data_problem = True
            else:
                logger.info('data_type "%s" PASS: current data (%s) is newer than safe-crawl data (%s)' % message_args)

        if data_problem:
            raise DatabaseException('Commit aborted.')
        logger.info('Proceeding to commit to master db...')
Exemple #9
0
 def delete_old_crawls(self):
     """
     Delete the rows of the oldest crawls from every table except CrawlData
     when more than project_config['ncrawls_to_save'] crawls are recorded
     in DateStamps. The highest crawl numbers are the ones kept.
     """
     crawl_numbers = [row.crawl_number for row in models.DateStamps.query.all()]
     max_crawls = self.project_config['ncrawls_to_save']
     if len(crawl_numbers) <= max_crawls:
         return

     logger.info('More than %d crawls found.' % max_crawls)
     # Newest first; everything after the first max_crawls entries is deleted
     stale_crawls = sorted(crawl_numbers, reverse=True)[max_crawls:]
     for stale_crawl in stale_crawls:
         logger.info('Deleting crawl %d...' % stale_crawl)
         for table_name in models.table_class_names:
             if table_name != 'CrawlData':
                 stale_rows = getattr(models, table_name).query.filter_by(crawl_number=stale_crawl)
                 logger.info('  - %s - %d rows' % (table_name, stale_rows.count()))
                 stale_rows.delete()
Exemple #10
0
 def delete_old_crawls(self):
     """
     Keep only the newest project_config['ncrawls_to_save'] crawls: rows
     belonging to older crawl numbers are deleted from every table listed in
     models.table_class_names, apart from CrawlData.
     """
     all_crawl_numbers = [row.crawl_number for row in models.DateStamps.query.all()]
     ncrawls_to_save = self.project_config['ncrawls_to_save']
     if len(all_crawl_numbers) > ncrawls_to_save:
         logger.info('More than %d crawls found.' % ncrawls_to_save)
         newest_first = list(all_crawl_numbers)
         newest_first.sort(reverse=True)
         # iterate through crawls to delete
         for old_crawl in newest_first[ncrawls_to_save:]:
             logger.info('Deleting crawl %d...' % old_crawl)
             # iterate through tables, leaving CrawlData untouched
             deletable_names = (name for name in models.table_class_names if name != 'CrawlData')
             for name in deletable_names:
                 old_rows = getattr(models, name).query.filter_by(crawl_number=old_crawl)
                 row_count = old_rows.count()
                 logger.info('  - %s - %d rows' % (name, row_count))
                 old_rows.delete()
Exemple #11
0
def extract_sifts_seq(sifts_filepath, uniprot_ac, uniprot_entry_name, pdb_id, chain_id, uniprot_sequence):
    """
    Extract sequence data for one PDB chain from a gzipped SIFTS XML file and
    align it against the UniProt sequence using the SIFTS residue crossrefs.

    Parameters
    ----------
    sifts_filepath : path of the gzip-compressed SIFTS XML file
    uniprot_ac : UniProt accession the chain is expected to map to
    uniprot_entry_name : UniProt entry name (used only in the log message)
    pdb_id : PDB ID (used in messages and for the hard-coded residue-numbering
        corrections)
    chain_id : PDB chain ID to extract
    uniprot_sequence : UniProt sequence; only its length is used here, to size
        the alignments

    Returns
    -------
    dict with the keys 'chain_id', 'experimental_seq',
    'experimental_seq_aln_conflicts', 'observed_seq_aln_exp',
    'observed_seq_aln', 'observed_ss_aln' and 'exception_message'.
    'exception_message' is None, or 'DELETE_ME' when the chain's UniProt AC in
    SIFTS does not match uniprot_ac, when more than 90% of the experimental
    sequence is unobserved, or when no residue has a crossref to uniprot_ac.

    Raises
    ------
    KeyError : a residue name cannot be converted to a single-letter code
        (or a residue has no dbResName attribute)
    Exception : a PDB residue number cannot be converted to an int
    """
    exception_message = None

    # Close the gzip handle as soon as the document has been read
    with gzip.open(sifts_filepath, 'r') as sifts_file:
        sifts = etree.fromstring(sifts_file.read())

    # First check whether the first residue with matching chainID and a UniProt crossref has the same UniProt AC as was picked up from UniProt (by gather-uniprot.py).
    # 3O50 and 3O51 are picked up by gather-uniprot.py from uniprot AC O14965. But these have uniprot AC B4DX16 in the sifts .xml files, which is a TrEMBL entry. Sequences are almost identical except for deletion of ~70 residues prior to PK domain of B4DX16. This means that experimental_sequence_aln and related sequences are not added by gather-pdb.py. Need to sort out a special case for these pdbs. Should check for similar cases in other kinases.
    # 3O50 and 3O51 can be ignored. (Plenty of other PDBs for that protein)
    # 3OG7 is picked up from uniprot AC P15056, but the PDB entry links to Q5IBP5 - this is the AKAP9-BRAF fusion protein.
    # XXX TODO XXX 3OG7 will be ignored for now, but at some point should make separate entries for fusion proteins, and add the PDB files accordingly.

    first_matching_uniprot_resi = sifts.find('entity[@type="protein"]/segment/listResidue/residue/crossRefDb[@dbSource="PDB"][@dbChainId="%s"]/../crossRefDb[@dbSource="UniProt"]' % chain_id)
    sifts_uniprot_ac = first_matching_uniprot_resi.get('dbAccessionId')
    if uniprot_ac != sifts_uniprot_ac:
        logger.info('PDB %s chain %s picked up from UniProt entry %s %s. Non-matching UniProtAC in sifts: %s. This chain will be deleted.' % (pdb_id, chain_id, uniprot_entry_name, uniprot_ac, sifts_uniprot_ac))
        exception_message = 'DELETE_ME'

    #
    #
    # TODO check if there are any PDBs where two proteins share the same chainID (I seem to remember that there are - check previous scripts)
    #
    #

    # ======
    # Extract sequence data from the SIFTS XML
    # ======

    # Residue names missing from the BioPython 3-to-1 table, mapped by hand
    nonstandard_resnames = {
        'CAS': 'C',  # S-(dimethylarsenic)cysteine
        'MHO': 'M',  # S-oxymethionine
        'LGY': 'K',  # 3NX8. (E)-N-(4-oxobutylidene)lysine
        'AME': 'M',  # N-acetylmethionine
        'NMM': 'R',  # 3KB7
        'OCY': 'C',  # 2R9S
        'CY0': 'C',  # 2J5E
        'CY7': 'C',  # 2JIV
    }

    # These are the sifts residues which include a PDB crossref with matching chainID
    chain_residues = sifts.findall('entity[@type="protein"]/segment/listResidue/residue/crossRefDb[@dbSource="PDB"][@dbChainId="%s"]/..' % chain_id)
    experimental_sequence = ''
    experimental_sequence_pdb_resids = []
    experimental_sequence_uniprot_res_indices = []
    observed_sequence_aln_exp = ''
    experimental_sequence_aln = ['-'] * len(uniprot_sequence) # This will contain the alignment of the experimental sequence against the full UniProt sequence. Conflicting residues will be added if they are contiguous with non-conflicting segments. NOTE: this is no longer added to the database.
    experimental_sequence_aln_conflicts = ['-'] * len(uniprot_sequence) # Same, but conflicting residues are added as lower case
    observed_sequence_aln = ['-'] * len(uniprot_sequence) # This will contain the alignment of the observed sequence against the full UniProt sequence. Conflicting residues will be ignored.
    ss_aln = ['-'] * len(uniprot_sequence) # This will contain the alignment of the secondary structure codes against the full UniProt sequence. Conflicting residues will be ignored.

    for r in chain_residues:
        residue_details = r.findall('residueDetail')
        residue_detail_texts = [ detail.text.strip() for detail in residue_details ] # list of strings
        ss = r.findtext('residueDetail[@property="codeSecondaryStructure"]')
        # A residue without a dbResName attribute raises KeyError here
        resname = r.attrib['dbResName']
        try:
            # Note that this BioPython dict converts a modified aa to the single-letter code of its unmodified parent (e.g. "TPO":"T")
            single_letter = Bio.Data.SCOPData.protein_letters_3to1[ resname ]
        except KeyError:
            if resname == 'ACE': # Just ignore N-terminal ACE
                continue
            if resname not in nonstandard_resnames:
                raise KeyError('Problem converting resname %s to single letter code. %s %s' % (resname, chain_id, r.attrib))
            single_letter = nonstandard_resnames[resname]
        # Add residue to experimental_sequence
        experimental_sequence += single_letter

        # Also save the pdb resids, which we will use later
        pdb_resid = r.find('crossRefDb[@dbSource="PDB"]').attrib['dbResNum']
        # TODO need to generalize this. Shift to manual_overrides.yaml or do something else? In the short-term, perhaps just skip these PDBs?
        # Some pdb resids are e.g. '464A'
        if not pdb_resid.isdigit():
            if pdb_id in ['1O6L','2JDO','2JDR','2UW9','2X39','2XH5']: # These pdbs include three residues with pdb resids 464A, 464B, 464C, (all with UniProt crossrefs) then continues from 465. We will change this so that the pdb resids continue to iterate
                corrected_pdb_resids = {'464A':465, '464B':466, '464C':467}
                if pdb_resid in corrected_pdb_resids:
                    pdb_resid = corrected_pdb_resids[pdb_resid]
                # NOTE(review): this branch is only reached for non-numeric resids, so purely
                # numeric resids above 464 are never shifted by 3 - this looks unintended given
                # the comment above; left unchanged, confirm against the affected PDB entries.
                elif int(pdb_resid[0:3]) > 464:
                    pdb_resid = int(pdb_resid) + 3
            # Otherwise just extract the number (this will also detect negative numbers)
            else:
                pdb_resid = ''.join([char for char in pdb_resid if (char.isdigit() or char == '-')])
        try:
            experimental_sequence_pdb_resids.append( int(pdb_resid) )
        except (TypeError, ValueError):
            raise Exception('Problem converting pdb_resid into int. %s %s %s %s' % (uniprot_ac, pdb_id, chain_id, pdb_resid))

        # Also add residue to experimental_sequence_aln. Residues which do not match the uniprot sequence (and thus do not have a uniprot crossref) will be added later
        crossref_uniprot = r.find('crossRefDb[@dbSource="UniProt"][@dbAccessionId="%s"]' % uniprot_ac)
        if crossref_uniprot is not None:
            index = int(crossref_uniprot.attrib['dbResNum']) - 1
            experimental_sequence_aln[index] = single_letter
            if 'Conflict' in residue_detail_texts or 'Engineered mutation' in residue_detail_texts:
                experimental_sequence_aln_conflicts[index] = single_letter.lower()
            else:
                experimental_sequence_aln_conflicts[index] = single_letter
            experimental_sequence_uniprot_res_indices.append(index)
            # Add residue to observed_sequence_aln if it is observed and is not a conflict
            if 'Not_Observed' not in residue_detail_texts and ('Conflict' not in residue_detail_texts or 'Engineered mutation' in residue_detail_texts):
                observed_sequence_aln[index] = single_letter
                if ss is not None:
                    ss_aln[index] = ss
        else:
            experimental_sequence_uniprot_res_indices.append(None)
        # Add residue to observed_sequence_aln_exp if it is observed, otherwise '-'
        if 'Not_Observed' in residue_detail_texts:
            observed_sequence_aln_exp += '-'
        else:
            observed_sequence_aln_exp += single_letter

    # Now check whether the number of non-observed residues is more than 90% of the experimental sequence length
    n_unobserved_residues = observed_sequence_aln_exp.count('-')
    if ( float(n_unobserved_residues) / float(len(experimental_sequence)) ) > 0.9:
        exception_message = 'DELETE_ME'

    # ======
    # Now we add the residues which do not have a UniProt crossref
    # ======

    # But first we have to deal with cases where residues have been added at the N-terminus which extend before the start of the uniprot sequence. The excess residues will be ignored.
    # Get the uniprot residue index of the first residue with a uniprot crossref, and the corresponding pdb resid
    first_exp_seq_uniprot_res_index = None
    corresponding_pdb_resid = None
    for s, UP_res_index in enumerate(experimental_sequence_uniprot_res_indices):
        if UP_res_index is not None:
            first_exp_seq_uniprot_res_index = UP_res_index
            corresponding_pdb_resid = experimental_sequence_pdb_resids[s]
            break
    # And get the pdb resid of the first residue in the experimental sequence
    # (the list cannot be empty here: an empty experimental sequence fails the division above)
    first_exp_seq_pdb_resid = experimental_sequence_pdb_resids[0]

    ignore_excess_Nterm_residues_flag = False
    if first_exp_seq_uniprot_res_index is None:
        # No residue has a crossref to uniprot_ac
        # XXX should do something better than this
        # occurs with P27791 (KAPCA_RAT)
        exception_message = 'DELETE_ME'
    # If the experimental sequence includes the first residue of the full uniprot sequence,
    # and the first pdb resid is lower than the pdb resid corresponding to the first uniprot residue,
    # then we will ignore the excess residues
    elif first_exp_seq_uniprot_res_index == 0 and first_exp_seq_pdb_resid < corresponding_pdb_resid:
        ignore_excess_Nterm_residues_flag = True

    # Now iterate through the residues in the experimental sequence and add residues which do not have a uniprot crossref, but are contiguous in terms of PDB numbering

    i = 0
    while i < len(experimental_sequence):
        resname_i = experimental_sequence[i]
        uniprot_res_index_i = experimental_sequence_uniprot_res_indices[i]
        pdb_resid_i = experimental_sequence_pdb_resids[i]

        if ignore_excess_Nterm_residues_flag and (pdb_resid_i < corresponding_pdb_resid):
            pass # we ignore these residues

        # If this residue does not have a uniprot crossref
        elif uniprot_res_index_i is None:
            # Start a list of residues with no uniprot crossref
            contiguous_noUP_residues = [ resname_i ]
            # Then check the next residue
            j = i + 1
            while j < len(experimental_sequence):
                resname_j = experimental_sequence[j]
                uniprot_res_index_j = experimental_sequence_uniprot_res_indices[j]
                pdb_resid_j = experimental_sequence_pdb_resids[j]

                # If this residue also has no uniprot crossref, and is contiguous in terms of pdb resnum, then add it to the list, and move on to the next one
                if (uniprot_res_index_j is None) and ((pdb_resid_j - pdb_resid_i) == (j-i)):
                    contiguous_noUP_residues.append( resname_j )

                # If this residue does have a uniprot crossref, and if it is contiguous in terms of pdb resnum, then we add the list of residues without uniprot crossrefs at this position
                elif (uniprot_res_index_j is not None) and ((pdb_resid_j - pdb_resid_i) == (j-i)):
                    experimental_sequence_aln[ (uniprot_res_index_j - j) : uniprot_res_index_j ] = contiguous_noUP_residues
                    experimental_sequence_aln_conflicts[ (uniprot_res_index_j - j) : uniprot_res_index_j ] = list(''.join(contiguous_noUP_residues).lower())
                    i = j
                    break

                # If this residue is not contiguous in terms of pdb resnum, go back and check if the first of contiguous_noUP_residues is pdb-contiguous with the previous residue - if so, add contiguous_noUP_residues
                elif (pdb_resid_j - pdb_resid_i) != (j-i):
                    if (pdb_resid_i - experimental_sequence_pdb_resids[i-1]) == 1:
                        last_uniprot_res_index = experimental_sequence_uniprot_res_indices[i-1]
                        experimental_sequence_aln[ last_uniprot_res_index + 1 : last_uniprot_res_index + 1 + (j-i)] = contiguous_noUP_residues
                        experimental_sequence_aln_conflicts[ last_uniprot_res_index + 1 : last_uniprot_res_index + 1 + (j-i)] = list(''.join(contiguous_noUP_residues).lower())
                    i = j - 1
                    break

                # If we have reached the end of experimental_sequence, go back and check if the first of contiguous_noUP_residues is pdb-contiguous with the previous residue - if so, add contiguous_noUP_residues
                if j == len(experimental_sequence) - 1:
                    if (pdb_resid_i - experimental_sequence_pdb_resids[i-1]) == 1:
                        last_uniprot_res_index = experimental_sequence_uniprot_res_indices[i-1]
                        experimental_sequence_aln[ last_uniprot_res_index + 1 : last_uniprot_res_index + 2 + (j-i)] = contiguous_noUP_residues
                        experimental_sequence_aln_conflicts[ last_uniprot_res_index + 1 : last_uniprot_res_index + 2 + (j-i)] = list(''.join(contiguous_noUP_residues).lower())
                    i = j
                    break
                j += 1

        i += 1

    # ======
    # Some final processing
    # ======
    # This runs once, after every residue has been visited. It used to sit inside
    # the loop above, which made the function return after the first residue.

    # In cases such as 3LAU and 1O6L, additional sequence at end makes experimental_sequence_aln longer than uniprot_sequence by 1
    # Handle this by removing the extraneous sequence
    if len(experimental_sequence_aln) != len(uniprot_sequence):
        experimental_sequence_aln = experimental_sequence_aln[0:len(uniprot_sequence)]
        experimental_sequence_aln_conflicts = experimental_sequence_aln_conflicts[0:len(uniprot_sequence)]

    experimental_sequence_aln = ''.join(experimental_sequence_aln)
    experimental_sequence_aln_conflicts = ''.join(experimental_sequence_aln_conflicts)
    observed_sequence_aln = ''.join(observed_sequence_aln)
    ss_aln = ''.join(ss_aln)

    chain_results_dict = {
        'chain_id': chain_id,
        'experimental_seq': experimental_sequence,
        'experimental_seq_aln_conflicts': experimental_sequence_aln_conflicts,
        'observed_seq_aln_exp': observed_sequence_aln_exp,
        'observed_seq_aln': observed_sequence_aln,
        'observed_ss_aln': ss_aln,
        'exception_message': exception_message,
    }
    return chain_results_dict
Exemple #12
0
    def extract_detailed_uniprot_data(self, uniprot_entry_node):
        # = IDs and names =
        ac = uniprot_entry_node.findtext('./accession')
        entry_name = uniprot_entry_node.findtext('./name')
        if self.skip_uniprot_entries and entry_name in self.skip_uniprot_entries:
            skip_message = self.skip_uniprot_entries[entry_name]
            logger.info(
                'OVERRIDE: Skipping UniProt entry {0} - reason: {1}'.format(
                    entry_name, skip_message
                )
            )
            return
        recommended_name = uniprot_entry_node.findtext('./protein/recommendedName/fullName')
        gene_name_nodes = uniprot_entry_node.findall('./gene/name')
        gene_name_data = []
        for gene_name_node in gene_name_nodes:
            gene_name = gene_name_node.text
            gene_name_type = gene_name_node.get('type')
            gene_name_obj = models.UniProtGeneName(
                crawl_number=self.current_crawl_number,
                gene_name=gene_name,
                gene_name_type=gene_name_type
            )
            gene_name_data.append(gene_name_obj)

        # = Date entry was last modified in UniProt =
        last_uniprot_update = uniprot_entry_node.get('modified')

        # = Taxonomy =
        uniprot_organism_node = uniprot_entry_node.find('organism')
        ncbi_taxon_id = uniprot_organism_node.find('dbReference[@type="NCBI Taxonomy"]').get('id')
        taxon_name_scientific = uniprot_organism_node.findtext('name[@type="scientific"]')
        taxon_name_common = uniprot_organism_node.findtext('name[@type="common"]')
        lineage = uniprot_organism_node.find('lineage')
        lineage_csv = ','.join([taxon.text for taxon in lineage.getchildren()])

        # = Functions, disease associations, subcellular locations =
        functions = []
        disease_associations = []
        subcellular_locations = []
        for domain in uniprot_entry_node.findall('./comment[@type="function"]'):
            functions.append(
                models.UniProtFunction(
                    crawl_number=self.current_crawl_number,
                    function=domain.findtext('./text')
                )
            )
        for domain in uniprot_entry_node.findall('./comment[@type="disease"]'):
            disease_associations.append(
                models.UniProtDiseaseAssociation(
                    crawl_number=self.current_crawl_number,
                    disease_association=domain.findtext('./text')
                )
            )
        for domain in uniprot_entry_node.findall('./comment[@type="subcellular location"]'):
            subcellular_locations.append(
                models.UniProtSubcellularLocation(
                    crawl_number=self.current_crawl_number,
                    subcellular_location=domain.findtext('./subcellularLocation/location')
                )
            )

        # = Canonical isoform =

        isoforms = []

        # Returned UniProt XML contains sequence data only for the canonical isoform
        uniprot_canonical_sequence_node = uniprot_entry_node.find(
            './sequence[@length][@mass]'
        )
        canonical_sequence = ''.join(uniprot_canonical_sequence_node.text.split())
        canseq_length = uniprot_canonical_sequence_node.get('length')
        canseq_mass = uniprot_canonical_sequence_node.get('mass')
        canseq_date_modified = uniprot_canonical_sequence_node.get('modified')
        canseq_version = uniprot_canonical_sequence_node.get('version')
        uniprot_isoform = models.UniProtIsoform(
            crawl_number=self.current_crawl_number,
            ac=ac+'-1',
            is_canonical=True,
            length=canseq_length,
            mass=canseq_mass,
            date_modified=canseq_date_modified,
            version=canseq_version,
            sequence=canonical_sequence
        )
        # empty list for notes (which do not exist for the canonical sequence)
        isoforms.append((uniprot_isoform, []))

        # = Alternative isoforms =
        # Canonical isoform is given the attrib type="displayed", meaning that the sequence is displayed in the HTML version of the entry
        # Example alt isoform:
        #     <comment>
        #         <isoform>
        #             <id>P00519-2</id>
        #             <name>IB</name>
        #             <sequence type="described" ref="VSP_004957"/>
        #             <note>Contains a N-myristoyl glycine at position 2.</note>
        #         </isoform>
        #     </comment>

        for uniprot_isoform_node in uniprot_entry_node.findall('comment/isoform'):
            isoform_ac = uniprot_isoform_node.findtext('id')
            seq_node = uniprot_isoform_node.find('sequence')
            notes = [
                models.UniProtIsoformNote(
                    crawl_number=self.current_crawl_number, note=node.text
                ) for node in uniprot_isoform_node.findall('note')
                ]
            if seq_node.get('type') != 'displayed':
                uniprot_isoform = models.UniProtIsoform(
                    crawl_number=self.current_crawl_number,
                    ac=isoform_ac,
                    is_canonical=False
                )

            isoforms.append((uniprot_isoform, notes))

        # = UniProt "Protein kinase" domain annotations =
        # XXX TODO Generalize

        # if self.uniprot_domain_regex != None:
        #     selected_domains = uniprot_entry_node.xpath(
        #         'feature[@type="domain"][match_regex(@description, "{0}")]'.format(
        #             self.uniprot_domain_regex
        #         ),
        #         extensions={(None, 'match_regex'): xpath_match_regex_case_sensitive}
        #     )
        # else:
        domains = uniprot_entry_node.findall('feature[@type="domain"]')

        # Skip if no matching domains found
        if len(domains) < 1:
            return

        # Finally, add the domains to the new database
        domain_objs = []
        target_iter = 0
        for domain_id, domain in enumerate(domains):
            # First calculate the PK domain length and sequence
            domain_description = domain.get('description')
            if self.uniprot_domain_regex and re.match(self.uniprot_domain_regex, domain_description):
                is_target_domain = True
                target_id = entry_name + '_D' + str(target_iter)
                target_iter += 1
            else:
                is_target_domain = False
            begin = int(domain.find('./location/begin').get('position'))
            end = int(domain.find('./location/end').get('position'))
            length = end - begin + 1
            domain_seq = canonical_sequence[begin-1:end]

            if (self.pseudodomain_manual_annotations
                and entry_name in self.pseudodomain_manual_annotations
                and domain_description == self.pseudodomain_manual_annotations[entry_name].get('description')
                ):
                pseudodomain_notes = self.pseudodomain_manual_annotations[entry_name].get('message')
                logger.info(
                    'OVERRIDE: Labeling domain "{0}" as a pseudodomain - reason: {1}'.format(
                        target_id,
                        pseudodomain_notes
                    )
                )
                is_pseudodomain = True
            else:
                is_pseudodomain = False

            domain_obj = models.UniProtDomain(
                crawl_number=self.current_crawl_number,
                domain_id=domain_id,
                target_id=target_id if is_target_domain else None,
                is_target_domain=is_target_domain,
                description=domain_description,
                is_pseudodomain=is_pseudodomain,
                pseudodomain_notes=pseudodomain_notes if is_pseudodomain else None,
                begin=begin,
                end=end,
                length=length,
                sequence=domain_seq
            )
            domain_objs.append(domain_obj)

        # = References to other DBs =
        # NCBI Gene
        ncbi_gene_entries = []
        gene_ids = [
            int(domain.get('id')) for domain in uniprot_entry_node.findall('./dbReference[@type="GeneID"]')
        ]

        # manual annotations
        if self.ncbi_gene_id_manual_annotations and entry_name in self.ncbi_gene_id_manual_annotations:
            gene_ids = self.ncbi_gene_id_manual_annotations[entry_name].get('gene_ids')
            gene_ids_message = self.ncbi_gene_id_manual_annotations[entry_name].get('message')
            logger.info(
                'OVERRIDE: Manually annotating Gene IDs for entry {0} - reason: {1}'.format(
                    entry_name, gene_ids_message
                )
            )

        for gene_id in gene_ids:
            # manual override skips
            if self.skip_ncbi_gene_entries and gene_id in self.skip_ncbi_gene_entries:
                skip_gene_id_message = self.skip_ncbi_gene_entries[gene_id]
                logger.info(
                    'OVERRIDE: Skipping Gene ID {0} for entry {1} - reason: {2}'.format(
                        gene_id,
                        entry_name,
                        skip_gene_id_message
                    )
                )
                continue

            ncbi_gene_entries.append(
                models.NCBIGeneEntry(
                    crawl_number=self.current_crawl_number,
                    gene_id=gene_id
                )
            )

        # Ensembl

        # transcript_data = {
        #     'ENSMUST00000003710':
        #         {
        #             'gene':
        #                 'ENSG000...',
        #             'protein':
        #                 'ENSP000...',
        #         }
        # }

        ensembl_transcript_nodes = uniprot_entry_node.findall(
            './dbReference[@type="Ensembl"]'
        )

        ensembl_data = {}
        ensembl_transcript_matched_to_uniprot_isoform = False
        for transcript_node in ensembl_transcript_nodes:
            ensembl_transcript_id = transcript_node.get('id')

            ensembl_gene_nodes = transcript_node.findall('property[@type="gene ID"]')
            if len(ensembl_gene_nodes) > 1:
                logger.info(
                    'WARNING: Ensembl transcript {0} linked with > 1 gene ID'.format(
                        ensembl_transcript_id
                    )
                )
            ensembl_gene_id = ensembl_gene_nodes[0].get('value')

            ensembl_protein_nodes = transcript_node.findall('property[@type="protein sequence ID"]')
            if len(ensembl_protein_nodes) > 1:
                logger.info(
                    'WARNING: Ensembl transcript {0} linked with > 1 protein ID'.format(
                        ensembl_transcript_id
                    )
                )
            ensembl_protein_id = ensembl_protein_nodes[0].get('value')

            uniprot_isoform_molecule_node = transcript_node.find('molecule')
            if uniprot_isoform_molecule_node is not None:
                uniprot_isoform_ac = uniprot_isoform_molecule_node.get('id')
                if uniprot_isoform_ac == isoforms[0][0].ac:
                    ensembl_transcript_matched_to_uniprot_isoform = True
            elif uniprot_isoform_molecule_node is None and ensembl_transcript_matched_to_uniprot_isoform is False:
                uniprot_isoform_ac = isoforms[0][0].ac
                ensembl_transcript_matched_to_uniprot_isoform = True
            else:
                uniprot_isoform_ac = None

            ensembl_data[ensembl_transcript_id] = {
                'gene': ensembl_gene_id,
                'protein': ensembl_protein_id,
                'uniprot_isoform_ac': uniprot_isoform_ac
            }


        # HGNC
        hgnc_entries = []
        hgnc_dbrefs = uniprot_entry_node.findall('./dbReference[@type="HGNC"]')
        for hgnc_dbref in hgnc_dbrefs:
            hgnc_gene_id = hgnc_dbref.get('id')
            approved_symbol = hgnc_dbref.find('property[@type="gene designation"]').get('value')
            hgnc_entries.append(
                models.HGNCEntry(
                    crawl_number=self.current_crawl_number,
                    gene_id=hgnc_gene_id,
                    approved_symbol=approved_symbol
                )
            )

        # = Family information =
        similarity_comments = uniprot_entry_node.xpath('./comment[@type="similarity"]')
        family = False
        for s in similarity_comments:
            for f in kinase_family_uniprot_similarity_text.keys():
                if f in s.findtext('text'):
                    family = kinase_family_uniprot_similarity_text[f]

        # = PDB entries (from UniProt XML) =
        # keep X-ray and NMR structures (not "Model")
        pdbs = uniprot_entry_node.xpath(
            './dbReference[@type="PDB"]/property[@type="method"][@value="X-ray" or @value="NMR"]/..'
        )
        pdb_data = []
        for p in pdbs:
            pdb_id = p.get('id')
            if self.skip_pdbs and pdb_id in self.skip_pdbs:
                skip_pdb_message = self.skip_pdbs[pdb_id]
                logger.info(
                    'OVERRIDE: Skipping PDB {0} for entry {1} - reason: {2}'.format(
                        pdb_id, entry_name, skip_pdb_message
                    )
                )
                continue

            pdb_method = p.find('property[@type="method"]').get('value')
            resolution_node = p.find('property[@type="resolution"]')
            resolution = resolution_node.get('value') if resolution_node != None else None
            chains_span_str = p.find('property[@type="chains"]').get('value')
            chains_span = parse_uniprot_pdbref_chains(chains_span_str)
            chain_data_dicts = []
            for c in chains_span.keys():
                chain_id = c
                pdb_begin = chains_span[c][0]
                pdb_end = chains_span[c][1]
                # Use the begin and end info to decide if this pdb chain includes the pk_domain. But we will get other sequence info from sifts XML files, using gather-pdb.py
                # Have to check against each PK domain
                for domain in domain_objs:
                    pk_begin = domain.begin
                    pk_end = domain.end
                    if (pdb_begin < pk_begin+30) & (pdb_end > pk_end-30):
                        chain_data_dict = models.PDBChain(
                            crawl_number=self.current_crawl_number,
                            chain_id=chain_id,
                            begin=pdb_begin,
                            end=pdb_end
                        )
                        chain_data_dicts.append({
                            'chain_obj': chain_data_dict,
                            'domain_obj': domain
                        })
                    else:
                        continue

            if len(chain_data_dicts) > 0:
                pdb_obj = models.PDBEntry(
                    crawl_number=self.current_crawl_number,
                    pdb_id=pdb_id,
                    method=pdb_method,
                    resolution=resolution
                )
                pdb_data.append({'pdb_obj': pdb_obj, 'chain_data_dicts': chain_data_dicts})

        # ========
        # Construct data objects and add to db
        # ========

        db_entry = models.DBEntry(
            crawl_number=self.current_crawl_number,
            npdbs=len(pdb_data),
            ndomains=len(domain_objs),
            nisoforms=len(isoforms),
            nfunctions=len(functions),
            ndisease_associations=len(disease_associations),
        )
        db.session.add(db_entry)
        uniprot_entry = models.UniProtEntry(
            crawl_number=self.current_crawl_number,
            ac=ac,
            entry_name=entry_name,
            last_uniprot_update=last_uniprot_update,
            ncbi_taxon_id=ncbi_taxon_id,
            db_entry=db_entry,
            recommended_name=recommended_name,
            taxon_name_scientific=taxon_name_scientific,
            taxon_name_common=taxon_name_common,
            lineage=lineage_csv,
        )
        if family:
            uniprot_entry.family = family
        db.session.add(uniprot_entry)
        for function_obj in functions:
            function_obj.db_entry = db_entry
            function_obj.uniprot_entry = uniprot_entry
            db.session.add(function_obj)
        for disease_association_obj in disease_associations:
            disease_association_obj.db_entry = db_entry
            disease_association_obj.uniprot_entry = uniprot_entry
            db.session.add(disease_association_obj)
        for subcellular_location_obj in subcellular_locations:
            subcellular_location_obj.db_entry = db_entry
            subcellular_location_obj.uniprot_entry = uniprot_entry
            db.session.add(subcellular_location_obj)
        for isoform_data in isoforms:
            isoform_obj = isoform_data[0]
            notes = isoform_data[1]
            isoform_obj.db_entry = db_entry
            isoform_obj.uniprot_entry = uniprot_entry
            db.session.add(isoform_obj)
            for note_obj in notes:
                note_obj.uniprotisoform = isoform_obj
                db.session.add(note_obj)
        for domain_obj in domain_objs:
            domain_obj.db_entry = db_entry
            domain_obj.uniprot_entry = uniprot_entry
            db.session.add(domain_obj)
        for pdb_data_dict in pdb_data:
            pdb_obj = pdb_data_dict['pdb_obj']
            chain_data_dicts = pdb_data_dict['chain_data_dicts']
            pdb_obj.db_entry = db_entry
            db.session.add(pdb_obj)
            for chain_data_dict in chain_data_dicts:
                chain_obj = chain_data_dict['chain_obj']
                domain_obj = chain_data_dict['domain_obj']
                chain_obj.pdb_entry = pdb_obj
                chain_obj.uniprot_domain = domain_obj
                db.session.add(chain_obj)
        for gene_name_obj in gene_name_data:
            gene_name_obj.db_entry = db_entry
            db.session.add(gene_name_obj)
        for NCBIGeneEntry in ncbi_gene_entries:
            NCBIGeneEntry.db_entry = db_entry
            db.session.add(NCBIGeneEntry)
        for HGNCEntry in hgnc_entries:
            HGNCEntry.db_entry = db_entry
            db.session.add(HGNCEntry)
        for ensembl_transcript_id in ensembl_data:
            ensembl_gene_id = ensembl_data[ensembl_transcript_id]['gene']
            ensembl_gene_row = models.EnsemblGene(
                crawl_number=self.current_crawl_number,
                gene_id=ensembl_gene_id,
                db_entry=db_entry,
            )
            db.session.add(ensembl_gene_row)

            ensembl_transcript_row = models.EnsemblTranscript(
                crawl_number=self.current_crawl_number,
                transcript_id=ensembl_transcript_id,
                ensembl_gene=ensembl_gene_row,
            )
            ensembl_transcript_uniprot_isoform_ac = ensembl_data[ensembl_transcript_id]['uniprot_isoform_ac']
            if ensembl_transcript_uniprot_isoform_ac is not None:
                matching_uniprot_isoform_obj = [
                    isoform[0] for isoform in isoforms
                    if isoform[0].ac == ensembl_transcript_uniprot_isoform_ac
                ]
                if len(matching_uniprot_isoform_obj) != 0:
                    ensembl_transcript_row.uniprot_isoform = matching_uniprot_isoform_obj[0]
            db.session.add(ensembl_transcript_row)

            ensembl_protein_id = ensembl_data[ensembl_transcript_id]['protein']
            ensembl_protein_row = models.EnsemblProtein(
                crawl_number=self.current_crawl_number,
                protein_id=ensembl_protein_id,
                ensembl_gene=ensembl_gene_row,
                ensembl_transcript=ensembl_transcript_row,
            )
            db.session.add(ensembl_protein_row)
 def finish(self):
     """Log the closing 'Done.' message at INFO level."""
     completion_message = 'Done.'
     logger.info(completion_message)
Exemple #14
0
    def analyze_domain_selections(self):
        """
        Prints useful info on the domains selected by uniprot_domain_regex

        Logs a report listing every unique description found in
        self.selected_domains together with the number of domain features in
        self.uniprot_xml carrying that description. The same report, followed
        by the unique descriptions that were not selected, is then written to
        self.domain_names_filename. Counts for the non-selected names are
        included only when self.count_nonselected_domain_names is true.

        Names are listed in sorted order so the report is reproducible from
        run to run.
        """
        from collections import Counter

        def format_name_rows(names, counts=None):
            # Render one centred row per name; append the right-aligned count
            # column when a count mapping is supplied.
            if not names:
                return []
            # Column widths depend on the whole table, so compute them once
            # rather than once per row.
            name_width = max(len(name) for name in names) + 4
            if counts is None:
                return [
                    '{:^{name_width}s}\n'.format(name, name_width=name_width)
                    for name in names
                ]
            pop_width = max(len(str(counts[name])) for name in names) + 1
            return [
                '{:^{name_width}s} : {:>{pop_width}d}\n'.format(
                    name,
                    counts[name],
                    name_width=name_width,
                    pop_width=pop_width)
                for name in names
            ]

        # Tally every domain description in a single pass. Looking counts up
        # here avoids building one XPath query per name, which re-scanned the
        # tree each time and broke on descriptions containing a double quote.
        all_domains = self.uniprot_xml.findall(
            './entry/feature[@type="domain"]')
        description_counts = Counter(
            d.get('description') for d in all_domains)

        selected_name_set = set(
            d.get('description') for d in self.selected_domains)
        selected_domain_names = sorted(selected_name_set)
        nonselected_domain_names = sorted(
            name for name in description_counts
            if name not in selected_name_set)

        selected_report = ''.join(
            [
                'Regex: %s\n' % self.uniprot_domain_regex,
                'Number of domains matching regex: %d\n\n' % len(
                    self.selected_domains),
                '= Unique domain names which match regex =\n',
            ]
            + format_name_rows(selected_domain_names, description_counts)
            + ['\n'])
        logger.info(selected_report)

        logger.info(
            '(Unique domain names which do not match regex will be output to {0})'
            .format(self.domain_names_filename))

        # The file receives the selected-names report followed by the
        # non-selected section, matching what was accumulated previously.
        if self.count_nonselected_domain_names:
            nonselected_counts = description_counts
        else:
            nonselected_counts = None
        full_report = ''.join(
            [
                selected_report,
                '= Unique domain names which do not match regex =\n',
            ]
            + format_name_rows(nonselected_domain_names, nonselected_counts)
            + ['\n'])

        with open(self.domain_names_filename, 'w') as domain_names_file:
            domain_names_file.write(full_report)
Exemple #15
0
    def extract_detailed_uniprot_data(self, uniprot_entry_node):
        # = IDs and names =
        ac = uniprot_entry_node.findtext('./accession')
        entry_name = uniprot_entry_node.findtext('./name')
        if self.skip_uniprot_entries and entry_name in self.skip_uniprot_entries:
            skip_message = self.skip_uniprot_entries[entry_name]
            logger.info(
                'OVERRIDE: Skipping UniProt entry {0} - reason: {1}'.format(
                    entry_name, skip_message))
            return
        recommended_name = uniprot_entry_node.findtext(
            './protein/recommendedName/fullName')
        gene_name_nodes = uniprot_entry_node.findall('./gene/name')
        gene_name_data = []
        for gene_name_node in gene_name_nodes:
            gene_name = gene_name_node.text
            gene_name_type = gene_name_node.get('type')
            gene_name_obj = models.UniProtGeneName(
                crawl_number=self.current_crawl_number,
                gene_name=gene_name,
                gene_name_type=gene_name_type)
            gene_name_data.append(gene_name_obj)

        # = Date entry was last modified in UniProt =
        last_uniprot_update = uniprot_entry_node.get('modified')

        # = Taxonomy =
        uniprot_organism_node = uniprot_entry_node.find('organism')
        ncbi_taxon_id = uniprot_organism_node.find(
            'dbReference[@type="NCBI Taxonomy"]').get('id')
        taxon_name_scientific = uniprot_organism_node.findtext(
            'name[@type="scientific"]')
        taxon_name_common = uniprot_organism_node.findtext(
            'name[@type="common"]')
        lineage = uniprot_organism_node.find('lineage')
        lineage_csv = ','.join([taxon.text for taxon in lineage.getchildren()])

        # = Functions, disease associations, subcellular locations =
        functions = []
        disease_associations = []
        subcellular_locations = []
        for domain in uniprot_entry_node.findall(
                './comment[@type="function"]'):
            functions.append(
                models.UniProtFunction(crawl_number=self.current_crawl_number,
                                       function=domain.findtext('./text')))
        for domain in uniprot_entry_node.findall('./comment[@type="disease"]'):
            disease_associations.append(
                models.UniProtDiseaseAssociation(
                    crawl_number=self.current_crawl_number,
                    disease_association=domain.findtext('./text')))
        for domain in uniprot_entry_node.findall(
                './comment[@type="subcellular location"]'):
            subcellular_locations.append(
                models.UniProtSubcellularLocation(
                    crawl_number=self.current_crawl_number,
                    subcellular_location=domain.findtext(
                        './subcellularLocation/location')))

        # = Canonical isoform =

        isoforms = []

        # Returned UniProt XML contains sequence data only for the canonical isoform
        uniprot_canonical_sequence_node = uniprot_entry_node.find(
            './sequence[@length][@mass]')
        canonical_sequence = ''.join(
            uniprot_canonical_sequence_node.text.split())
        canseq_length = uniprot_canonical_sequence_node.get('length')
        canseq_mass = uniprot_canonical_sequence_node.get('mass')
        canseq_date_modified = uniprot_canonical_sequence_node.get('modified')
        canseq_version = uniprot_canonical_sequence_node.get('version')
        uniprot_isoform = models.UniProtIsoform(
            crawl_number=self.current_crawl_number,
            ac=ac + '-1',
            is_canonical=True,
            length=canseq_length,
            mass=canseq_mass,
            date_modified=canseq_date_modified,
            version=canseq_version,
            sequence=canonical_sequence)
        # empty list for notes (which do not exist for the canonical sequence)
        isoforms.append((uniprot_isoform, []))

        # = Alternative isoforms =
        # Canonical isoform is given the attrib type="displayed", meaning that the sequence is displayed in the HTML version of the entry
        # Example alt isoform:
        #     <comment>
        #         <isoform>
        #             <id>P00519-2</id>
        #             <name>IB</name>
        #             <sequence type="described" ref="VSP_004957"/>
        #             <note>Contains a N-myristoyl glycine at position 2.</note>
        #         </isoform>
        #     </comment>

        for uniprot_isoform_node in uniprot_entry_node.findall(
                'comment/isoform'):
            isoform_ac = uniprot_isoform_node.findtext('id')
            seq_node = uniprot_isoform_node.find('sequence')
            notes = [
                models.UniProtIsoformNote(
                    crawl_number=self.current_crawl_number, note=node.text)
                for node in uniprot_isoform_node.findall('note')
            ]
            if seq_node.get('type') != 'displayed':
                uniprot_isoform = models.UniProtIsoform(
                    crawl_number=self.current_crawl_number,
                    ac=isoform_ac,
                    is_canonical=False)

            isoforms.append((uniprot_isoform, notes))

        # = UniProt "Protein kinase" domain annotations =
        # XXX TODO Generalize

        # if self.uniprot_domain_regex != None:
        #     selected_domains = uniprot_entry_node.xpath(
        #         'feature[@type="domain"][match_regex(@description, "{0}")]'.format(
        #             self.uniprot_domain_regex
        #         ),
        #         extensions={(None, 'match_regex'): xpath_match_regex_case_sensitive}
        #     )
        # else:
        domains = uniprot_entry_node.findall('feature[@type="domain"]')

        # Skip if no matching domains found
        if len(domains) < 1:
            return

        # Finally, add the domains to the new database
        domain_objs = []
        target_iter = 0
        for domain_id, domain in enumerate(domains):
            # First calculate the PK domain length and sequence
            domain_description = domain.get('description')
            if self.uniprot_domain_regex and re.match(
                    self.uniprot_domain_regex, domain_description):
                is_target_domain = True
                target_id = entry_name + '_D' + str(target_iter)
                target_iter += 1
            else:
                is_target_domain = False
            begin = int(domain.find('./location/begin').get('position'))
            end = int(domain.find('./location/end').get('position'))
            length = end - begin + 1
            domain_seq = canonical_sequence[begin - 1:end]

            if (self.pseudodomain_manual_annotations
                    and entry_name in self.pseudodomain_manual_annotations
                    and domain_description
                    == self.pseudodomain_manual_annotations[entry_name].get(
                        'description')):
                pseudodomain_notes = self.pseudodomain_manual_annotations[
                    entry_name].get('message')
                logger.info(
                    'OVERRIDE: Labeling domain "{0}" as a pseudodomain - reason: {1}'
                    .format(target_id, pseudodomain_notes))
                is_pseudodomain = True
            else:
                is_pseudodomain = False

            domain_obj = models.UniProtDomain(
                crawl_number=self.current_crawl_number,
                domain_id=domain_id,
                target_id=target_id if is_target_domain else None,
                is_target_domain=is_target_domain,
                description=domain_description,
                is_pseudodomain=is_pseudodomain,
                pseudodomain_notes=pseudodomain_notes
                if is_pseudodomain else None,
                begin=begin,
                end=end,
                length=length,
                sequence=domain_seq)
            domain_objs.append(domain_obj)

        # = References to other DBs =
        # NCBI Gene
        ncbi_gene_entries = []
        gene_ids = [
            int(domain.get('id')) for domain in uniprot_entry_node.findall(
                './dbReference[@type="GeneID"]')
        ]

        # manual annotations
        if self.ncbi_gene_id_manual_annotations and entry_name in self.ncbi_gene_id_manual_annotations:
            gene_ids = self.ncbi_gene_id_manual_annotations[entry_name].get(
                'gene_ids')
            gene_ids_message = self.ncbi_gene_id_manual_annotations[
                entry_name].get('message')
            logger.info(
                'OVERRIDE: Manually annotating Gene IDs for entry {0} - reason: {1}'
                .format(entry_name, gene_ids_message))

        for gene_id in gene_ids:
            # manual override skips
            if self.skip_ncbi_gene_entries and gene_id in self.skip_ncbi_gene_entries:
                skip_gene_id_message = self.skip_ncbi_gene_entries[gene_id]
                logger.info(
                    'OVERRIDE: Skipping Gene ID {0} for entry {1} - reason: {2}'
                    .format(gene_id, entry_name, skip_gene_id_message))
                continue

            ncbi_gene_entries.append(
                models.NCBIGeneEntry(crawl_number=self.current_crawl_number,
                                     gene_id=gene_id))

        # Ensembl

        # transcript_data = {
        #     'ENSMUST00000003710':
        #         {
        #             'gene':
        #                 'ENSG000...',
        #             'protein':
        #                 'ENSP000...',
        #         }
        # }

        ensembl_transcript_nodes = uniprot_entry_node.findall(
            './dbReference[@type="Ensembl"]')

        ensembl_data = {}
        ensembl_transcript_matched_to_uniprot_isoform = False
        for transcript_node in ensembl_transcript_nodes:
            ensembl_transcript_id = transcript_node.get('id')

            ensembl_gene_nodes = transcript_node.findall(
                'property[@type="gene ID"]')
            if len(ensembl_gene_nodes) > 1:
                logger.info(
                    'WARNING: Ensembl transcript {0} linked with > 1 gene ID'.
                    format(ensembl_transcript_id))
            ensembl_gene_id = ensembl_gene_nodes[0].get('value')

            ensembl_protein_nodes = transcript_node.findall(
                'property[@type="protein sequence ID"]')
            if len(ensembl_protein_nodes) > 1:
                logger.info(
                    'WARNING: Ensembl transcript {0} linked with > 1 protein ID'
                    .format(ensembl_transcript_id))
            ensembl_protein_id = ensembl_protein_nodes[0].get('value')

            uniprot_isoform_molecule_node = transcript_node.find('molecule')
            if uniprot_isoform_molecule_node is not None:
                uniprot_isoform_ac = uniprot_isoform_molecule_node.get('id')
                if uniprot_isoform_ac == isoforms[0][0].ac:
                    ensembl_transcript_matched_to_uniprot_isoform = True
            elif uniprot_isoform_molecule_node is None and ensembl_transcript_matched_to_uniprot_isoform is False:
                uniprot_isoform_ac = isoforms[0][0].ac
                ensembl_transcript_matched_to_uniprot_isoform = True
            else:
                uniprot_isoform_ac = None

            ensembl_data[ensembl_transcript_id] = {
                'gene': ensembl_gene_id,
                'protein': ensembl_protein_id,
                'uniprot_isoform_ac': uniprot_isoform_ac
            }

        # HGNC
        hgnc_entries = []
        hgnc_dbrefs = uniprot_entry_node.findall('./dbReference[@type="HGNC"]')
        for hgnc_dbref in hgnc_dbrefs:
            hgnc_gene_id = hgnc_dbref.get('id')
            approved_symbol = hgnc_dbref.find(
                'property[@type="gene designation"]').get('value')
            hgnc_entries.append(
                models.HGNCEntry(crawl_number=self.current_crawl_number,
                                 gene_id=hgnc_gene_id,
                                 approved_symbol=approved_symbol))

        # = Family information =
        similarity_comments = uniprot_entry_node.xpath(
            './comment[@type="similarity"]')
        family = False
        for s in similarity_comments:
            for f in kinase_family_uniprot_similarity_text.keys():
                if f in s.findtext('text'):
                    family = kinase_family_uniprot_similarity_text[f]

        # = PDB entries (from UniProt XML) =
        # keep X-ray and NMR structures (not "Model")
        pdbs = uniprot_entry_node.xpath(
            './dbReference[@type="PDB"]/property[@type="method"][@value="X-ray" or @value="NMR"]/..'
        )
        pdb_data = []
        for p in pdbs:
            pdb_id = p.get('id')
            if self.skip_pdbs and pdb_id in self.skip_pdbs:
                skip_pdb_message = self.skip_pdbs[pdb_id]
                logger.info(
                    'OVERRIDE: Skipping PDB {0} for entry {1} - reason: {2}'.
                    format(pdb_id, entry_name, skip_pdb_message))
                continue

            pdb_method = p.find('property[@type="method"]').get('value')
            resolution_node = p.find('property[@type="resolution"]')
            resolution = resolution_node.get(
                'value') if resolution_node != None else None
            chains_span_str = p.find('property[@type="chains"]').get('value')
            chains_span = parse_uniprot_pdbref_chains(chains_span_str)
            chain_data_dicts = []
            for c in chains_span.keys():
                chain_id = c
                pdb_begin = chains_span[c][0]
                pdb_end = chains_span[c][1]
                # Use the begin and end info to decide if this pdb chain includes the pk_domain. But we will get other sequence info from sifts XML files, using gather-pdb.py
                # Have to check against each PK domain
                for domain in domain_objs:
                    pk_begin = domain.begin
                    pk_end = domain.end
                    if (pdb_begin < pk_begin + 30) & (pdb_end > pk_end - 30):
                        chain_data_dict = models.PDBChain(
                            crawl_number=self.current_crawl_number,
                            chain_id=chain_id,
                            begin=pdb_begin,
                            end=pdb_end)
                        chain_data_dicts.append({
                            'chain_obj': chain_data_dict,
                            'domain_obj': domain
                        })
                    else:
                        continue

            if len(chain_data_dicts) > 0:
                pdb_obj = models.PDBEntry(
                    crawl_number=self.current_crawl_number,
                    pdb_id=pdb_id,
                    method=pdb_method,
                    resolution=resolution)
                pdb_data.append({
                    'pdb_obj': pdb_obj,
                    'chain_data_dicts': chain_data_dicts
                })

        # ========
        # Construct data objects and add to db
        # ========

        db_entry = models.DBEntry(
            crawl_number=self.current_crawl_number,
            npdbs=len(pdb_data),
            ndomains=len(domain_objs),
            nisoforms=len(isoforms),
            nfunctions=len(functions),
            ndisease_associations=len(disease_associations),
        )
        db.session.add(db_entry)
        uniprot_entry = models.UniProtEntry(
            crawl_number=self.current_crawl_number,
            ac=ac,
            entry_name=entry_name,
            last_uniprot_update=last_uniprot_update,
            ncbi_taxon_id=ncbi_taxon_id,
            db_entry=db_entry,
            recommended_name=recommended_name,
            taxon_name_scientific=taxon_name_scientific,
            taxon_name_common=taxon_name_common,
            lineage=lineage_csv,
        )
        if family:
            uniprot_entry.family = family
        db.session.add(uniprot_entry)
        for function_obj in functions:
            function_obj.db_entry = db_entry
            function_obj.uniprot_entry = uniprot_entry
            db.session.add(function_obj)
        for disease_association_obj in disease_associations:
            disease_association_obj.db_entry = db_entry
            disease_association_obj.uniprot_entry = uniprot_entry
            db.session.add(disease_association_obj)
        for subcellular_location_obj in subcellular_locations:
            subcellular_location_obj.db_entry = db_entry
            subcellular_location_obj.uniprot_entry = uniprot_entry
            db.session.add(subcellular_location_obj)
        for isoform_data in isoforms:
            isoform_obj = isoform_data[0]
            notes = isoform_data[1]
            isoform_obj.db_entry = db_entry
            isoform_obj.uniprot_entry = uniprot_entry
            db.session.add(isoform_obj)
            for note_obj in notes:
                note_obj.uniprotisoform = isoform_obj
                db.session.add(note_obj)
        for domain_obj in domain_objs:
            domain_obj.db_entry = db_entry
            domain_obj.uniprot_entry = uniprot_entry
            db.session.add(domain_obj)
        for pdb_data_dict in pdb_data:
            pdb_obj = pdb_data_dict['pdb_obj']
            chain_data_dicts = pdb_data_dict['chain_data_dicts']
            pdb_obj.db_entry = db_entry
            db.session.add(pdb_obj)
            for chain_data_dict in chain_data_dicts:
                chain_obj = chain_data_dict['chain_obj']
                domain_obj = chain_data_dict['domain_obj']
                chain_obj.pdb_entry = pdb_obj
                chain_obj.uniprot_domain = domain_obj
                db.session.add(chain_obj)
        for gene_name_obj in gene_name_data:
            gene_name_obj.db_entry = db_entry
            db.session.add(gene_name_obj)
        for NCBIGeneEntry in ncbi_gene_entries:
            NCBIGeneEntry.db_entry = db_entry
            db.session.add(NCBIGeneEntry)
        for HGNCEntry in hgnc_entries:
            HGNCEntry.db_entry = db_entry
            db.session.add(HGNCEntry)
        for ensembl_transcript_id in ensembl_data:
            ensembl_gene_id = ensembl_data[ensembl_transcript_id]['gene']
            ensembl_gene_row = models.EnsemblGene(
                crawl_number=self.current_crawl_number,
                gene_id=ensembl_gene_id,
                db_entry=db_entry,
            )
            db.session.add(ensembl_gene_row)

            ensembl_transcript_row = models.EnsemblTranscript(
                crawl_number=self.current_crawl_number,
                transcript_id=ensembl_transcript_id,
                ensembl_gene=ensembl_gene_row,
            )
            ensembl_transcript_uniprot_isoform_ac = ensembl_data[
                ensembl_transcript_id]['uniprot_isoform_ac']
            if ensembl_transcript_uniprot_isoform_ac is not None:
                matching_uniprot_isoform_obj = [
                    isoform[0] for isoform in isoforms
                    if isoform[0].ac == ensembl_transcript_uniprot_isoform_ac
                ]
                if len(matching_uniprot_isoform_obj) != 0:
                    ensembl_transcript_row.uniprot_isoform = matching_uniprot_isoform_obj[
                        0]
            db.session.add(ensembl_transcript_row)

            ensembl_protein_id = ensembl_data[ensembl_transcript_id]['protein']
            ensembl_protein_row = models.EnsemblProtein(
                crawl_number=self.current_crawl_number,
                protein_id=ensembl_protein_id,
                ensembl_gene=ensembl_gene_row,
                ensembl_transcript=ensembl_transcript_row,
            )
            db.session.add(ensembl_protein_row)
Exemple #16
0
    def extract_mutation_data(self):
        """Create case and mutation rows from the MAF table in ``self.maf_df``.

        Every MAF row whose ``Transcript_ID`` matches an ``EnsemblTranscript``
        row in the database yields one ``CbioportalMutation``; rows without a
        matching transcript are skipped. One ``CbioportalCase`` is created per
        distinct ``Tumor_Sample_Barcode``. New rows are added to
        ``db.session``; nothing is committed here.

        For missense mutations whose amino-acid change string can be parsed,
        the mutation is additionally linked to the UniProt domain that
        contains the mutated position.
        """
        case_rows = {}
        n_mutations_added = 0
        for _, maf_row in self.maf_df.iterrows():
            oncotator_ensembl_transcript_id = maf_row.Transcript_ID
            matching_db_ensembl_transcript_row = models.EnsemblTranscript.query.filter_by(
                transcript_id=oncotator_ensembl_transcript_id).first()
            if matching_db_ensembl_transcript_row is None:
                continue

            case_id = maf_row.Tumor_Sample_Barcode
            if case_id not in case_rows:
                case_rows[case_id] = models.CbioportalCase(
                    crawl_number=self.current_crawl_number,
                    case_id=case_id,
                    study='internal')
                db.session.add(case_rows[case_id])

            # Named mutation_type (not ``type``) so the builtin is not shadowed.
            mutation_type = maf_row.Variant_Classification
            chromosome_index = maf_row.Chromosome
            chromosome_startpos = maf_row.Start_Position
            chromosome_endpos = maf_row.End_Position
            reference_dna_allele = maf_row.Reference_Allele
            # Prefer whichever tumour allele differs from the reference;
            # fall back to allele 1 when both equal the reference.
            if maf_row.Tumor_Seq_Allele1 != reference_dna_allele:
                variant_dna_allele = maf_row.Tumor_Seq_Allele1
            elif maf_row.Tumor_Seq_Allele2 != reference_dna_allele:
                variant_dna_allele = maf_row.Tumor_Seq_Allele2
            else:
                variant_dna_allele = maf_row.Tumor_Seq_Allele1

            cbioportal_aa_change_string = None
            oncotator_reference_aa = None
            oncotator_aa_pos = None
            oncotator_variant_aa = None
            aa_change = maf_row.Amino_Acid_Change
            # A missing value is expected to be a float NaN, which is not
            # guaranteed to be the ``np.nan`` singleton, so test by value
            # rather than by identity.
            aa_change_missing = aa_change is None or (
                isinstance(aa_change, float) and np.isnan(aa_change))
            if not aa_change_missing:
                aa_change_regex_match = re.match(self.aa_change_regex,
                                                 aa_change)
                if aa_change_regex_match:
                    cbioportal_aa_change_string = aa_change_regex_match.groups()[0]
                    if mutation_type == 'Missense_Mutation':
                        aa_change_split_regex_match = re.match(
                            self.aa_change_split_regex,
                            cbioportal_aa_change_string)
                        if aa_change_split_regex_match:
                            split_groups = aa_change_split_regex_match.groups()
                            oncotator_reference_aa = split_groups[0]
                            oncotator_aa_pos = int(split_groups[1])
                            oncotator_variant_aa = split_groups[2]
            validation_status = maf_row.Validation_Status
            functional_impact_score = maf_row['MA:FImpact']
            # Per-row diagnostics go through the logger instead of stdout.
            logger.debug('%s %s %s %s %s', mutation_type,
                         cbioportal_aa_change_string, oncotator_reference_aa,
                         oncotator_aa_pos, oncotator_variant_aa)

            db_entry = matching_db_ensembl_transcript_row.ensembl_gene.db_entry
            mutation_row = models.CbioportalMutation(
                crawl_number=self.current_crawl_number,
                type=mutation_type,
                cbioportal_aa_change_string=cbioportal_aa_change_string,
                mutation_origin=None,
                validation_status=validation_status,
                functional_impact_score=functional_impact_score,
                chromosome_index=chromosome_index,
                chromosome_startpos=chromosome_startpos,
                chromosome_endpos=chromosome_endpos,
                reference_dna_allele=reference_dna_allele,
                variant_dna_allele=variant_dna_allele,
                oncotator_aa_pos=oncotator_aa_pos,
                oncotator_reference_aa=oncotator_reference_aa,
                oncotator_variant_aa=oncotator_variant_aa,
                oncotator_ensembl_transcript_id=oncotator_ensembl_transcript_id,
                db_entry=db_entry,
                cbioportal_case=case_rows[case_id],
                in_uniprot_domain=False,
            )

            # Is the mutation within a UniProt domain? Only answerable when a
            # position was parsed; comparing None against the domain bounds
            # relied on Python 2 ordering and raises TypeError on Python 3.
            if oncotator_aa_pos is not None:
                for domain in db_entry.uniprot_domains.all():
                    if domain.begin <= oncotator_aa_pos <= domain.end:
                        if oncotator_reference_aa != cbioportal_aa_change_string[0]:
                            continue
                        mutation_row.in_uniprot_domain = True
                        mutation_row.uniprot_domain = domain

            db.session.add(mutation_row)
            n_mutations_added += 1

        logger.info(
            'From {} mutation annotations, added {} mutations and {} cases.'.
            format(len(self.maf_df), n_mutations_added, len(case_rows)))
Exemple #17
0
 def commit(self):
     """Commit the database session and log the crawl numbers.

     The crawl number is only reported here (current and current + 1);
     this method does not modify ``self.current_crawl_number``.
     """
     db.session.commit()
     logger.info('Database committed.')
     safe_crawl_number = self.current_crawl_number
     logger.info('New safe crawl number: {0}'.format(safe_crawl_number))
     next_crawl_number = self.current_crawl_number + 1
     logger.info('New current crawl number: {0}'.format(next_crawl_number))
     logger.info('Done.')
Exemple #18
0
def extract_sifts_seq(sifts_filepath, uniprot_ac, uniprot_entry_name, pdb_id,
                      chain_id, uniprot_sequence):
    """Extract sequence alignments for one PDB chain from a gzipped SIFTS XML file.

    The residues of chain ``chain_id`` are read from the SIFTS residue
    listing and placed onto ``uniprot_sequence`` using their UniProt
    cross-references. Residues without a cross-reference are then added
    where they are contiguous (in PDB numbering) with mapped residues.

    Args:
        sifts_filepath: path to the gzip-compressed SIFTS XML file.
        uniprot_ac: UniProt accession the chain was picked up from.
        uniprot_entry_name: UniProt entry name (used only in the log message).
        pdb_id: PDB identifier (used in messages and for the 464A-C
            renumbering special case).
        chain_id: PDB chain identifier to extract.
        uniprot_sequence: full UniProt sequence; its length fixes the
            length of the returned alignments.

    Returns:
        dict with keys ``chain_id``, ``experimental_seq``,
        ``experimental_seq_aln_conflicts``, ``observed_seq_aln_exp``,
        ``observed_seq_aln``, ``observed_ss_aln`` and ``exception_message``
        (``None``, or ``'DELETE_ME'`` when the chain should be discarded).

    Raises:
        KeyError: a residue name cannot be converted to a one-letter code.
        Exception: a PDB residue number cannot be converted to an integer.
    """
    exception_message = None

    # Read through a context manager so the gzip handle is closed instead of
    # being left for the garbage collector.
    with gzip.open(sifts_filepath, 'r') as sifts_file:
        sifts = etree.fromstring(sifts_file.read())

    # First check whether the first residue with matching chainID and a UniProt crossref has the same UniProt AC as was picked up from UniProt (by gather-uniprot.py).
    # 3O50 and 3O51 are picked up by gather-uniprot.py from uniprot AC O14965. But these have uniprot AC B4DX16 in the sifts .xml files, which is a TrEMBL entry. Sequences are almost identical except for deletion of ~70 residues prior to PK domain of B4DX16. This means that experimental_sequence_aln and related sequences are not added by gather-pdb.py. Need to sort out a special case for these pdbs. Should check for similar cases in other kinases.
    # 3O50 and 3O51 can be ignored. (Plenty of other PDBs for that protein)
    # 3OG7 is picked up from uniprot AC P15056, but the PDB entry links to Q5IBP5 - this is the AKAP9-BRAF fusion protein.
    # XXX TODO XXX 3OG7 will be ignored for now, but at some point should make separate entries for fusion proteins, and add the PDB files accordingly.

    # NOTE(review): find() returns None when no residue of this chain has a
    # UniProt crossref, in which case the .get() below raises AttributeError.
    first_matching_uniprot_resi = sifts.find(
        'entity[@type="protein"]/segment/listResidue/residue/crossRefDb[@dbSource="PDB"][@dbChainId="%s"]/../crossRefDb[@dbSource="UniProt"]'
        % chain_id)
    sifts_uniprot_ac = first_matching_uniprot_resi.get('dbAccessionId')
    if uniprot_ac != sifts_uniprot_ac:
        logger.info(
            'PDB %s chain %s picked up from UniProt entry %s %s. Non-matching UniProtAC in sifts: %s. This chain will be deleted.'
            % (pdb_id, chain_id, uniprot_entry_name, uniprot_ac,
               sifts_uniprot_ac))
        exception_message = 'DELETE_ME'

    #
    #
    # TODO check if there are any PDBs where two proteins share the same chainID (I seem to remember that there are - check previous scripts)
    #
    #

    # ======
    # Extract sequence data from the SIFTS XML
    # ======

    # These are the sifts residues which include a PDB crossref with matching chainID
    chain_residues = sifts.findall(
        'entity[@type="protein"]/segment/listResidue/residue/crossRefDb[@dbSource="PDB"][@dbChainId="%s"]/..'
        % chain_id)

    n_uniprot_residues = len(uniprot_sequence)
    experimental_sequence = ''
    experimental_sequence_pdb_resids = []
    experimental_sequence_uniprot_res_indices = []
    observed_sequence_aln_exp = ''
    # Alignment of the experimental sequence against the full UniProt sequence. Conflicting residues will be added if they are contiguous with non-conflicting segments. NOTE: this is no longer added to the database.
    experimental_sequence_aln = ['-'] * n_uniprot_residues
    # Same, but conflicting residues are added as lower case
    experimental_sequence_aln_conflicts = ['-'] * n_uniprot_residues
    # Alignment of the observed sequence against the full UniProt sequence. Conflicting residues will be ignored.
    observed_sequence_aln = ['-'] * n_uniprot_residues
    # Alignment of the secondary structure codes against the full UniProt sequence. Conflicting residues will be ignored.
    ss_aln = ['-'] * n_uniprot_residues

    # Fallback one-letter codes, used when the BioPython lookup raises KeyError.
    extra_single_letter_codes = {
        'CAS': 'C',  # S-(dimethylarsenic)cysteine
        'MHO': 'M',  # S-oxymethionine
        'LGY': 'K',  # 3NX8. (E)-N-(4-oxobutylidene)lysine
        'AME': 'M',  # N-acetylmethionine
        'NMM': 'R',  # 3KB7
        'OCY': 'C',  # 2R9S
        'CY0': 'C',  # 2J5E
        'CY7': 'C',  # 2JIV
    }
    # These pdbs include three residues with pdb resids 464A, 464B, 464C, (all with UniProt crossrefs) then continues from 465. We will change this so that the pdb resids continue to iterate
    renumbered_pdb_ids = ['1O6L', '2JDO', '2JDR', '2UW9', '2X39', '2XH5']
    corrected_pdb_resids = {'464A': 465, '464B': 466, '464C': 467}

    for r in chain_residues:
        residue_detail_texts = [
            detail.text.strip() for detail in r.findall('residueDetail')
        ]  # list of strings
        ss = r.findtext('residueDetail[@property="codeSecondaryStructure"]')
        resname = r.attrib['dbResName']
        if resname is None:
            raise Exception(
                'ERROR: UniProt crossref not found for conflicting residue! %s %s %s %s'
                % (uniprot_ac, pdb_id, chain_id, r.attrib))
        try:
            # Note that this BioPython dict converts a modified aa to the single-letter code of its unmodified parent (e.g. "TPO":"T")
            single_letter = Bio.Data.SCOPData.protein_letters_3to1[resname]
        except KeyError:
            if resname == 'ACE':  # Just ignore N-terminal ACE
                continue
            if resname not in extra_single_letter_codes:
                raise KeyError(
                    'Problem converting resname %s to single letter code. %s %s'
                    % (resname, chain_id, r.attrib))
            single_letter = extra_single_letter_codes[resname]
        # Add residue to experimental_sequence
        experimental_sequence += single_letter

        # Also save the pdb resids, which we will use later
        pdb_resid = r.find('crossRefDb[@dbSource="PDB"]').attrib['dbResNum']
        # TODO need to generalize this. Shift to manual_overrides.yaml or do something else? In the short-term, perhaps just skip these PDBs?
        # Some pdb resids are e.g. '464A'
        if not pdb_resid.isdigit():
            if pdb_id in renumbered_pdb_ids:
                # NOTE(review): the elif below is only reached for
                # non-numeric resids, where int(pdb_resid) raises ValueError,
                # and purely numeric resids above 464 are never shifted by 3.
                # The renumbering described above therefore looks incomplete;
                # behaviour is kept as-is pending confirmation.
                if pdb_resid in corrected_pdb_resids:
                    pdb_resid = corrected_pdb_resids[pdb_resid]
                elif int(pdb_resid[0:3]) > 464:
                    pdb_resid = int(pdb_resid) + 3
            # Otherwise just extract the number (this will also detect negative numbers)
            else:
                pdb_resid = ''.join([
                    char for char in pdb_resid
                    if (char.isdigit() or char == '-')
                ])
        try:
            experimental_sequence_pdb_resids.append(int(pdb_resid))
        except (TypeError, ValueError):
            raise Exception(
                'Problem converting pdb_resid into int. %s %s %s %s' %
                (uniprot_ac, pdb_id, chain_id, pdb_resid))

        # Also add residue to experimental_sequence_aln. Residues which do not match the uniprot sequence (and thus do not have a uniprot crossref) will be added later
        crossref_uniprot = r.find(
            'crossRefDb[@dbSource="UniProt"][@dbAccessionId="%s"]' %
            uniprot_ac)
        if crossref_uniprot is not None:
            index = int(crossref_uniprot.attrib['dbResNum']) - 1
            experimental_sequence_aln[index] = single_letter
            if 'Conflict' in residue_detail_texts or 'Engineered mutation' in residue_detail_texts:
                experimental_sequence_aln_conflicts[
                    index] = single_letter.lower()
            else:
                experimental_sequence_aln_conflicts[index] = single_letter
            experimental_sequence_uniprot_res_indices.append(index)
            # Add residue to observed_sequence_aln if it is observed and is
            # either not a conflict or is flagged as an engineered mutation
            if 'Not_Observed' not in residue_detail_texts and (
                    'Conflict' not in residue_detail_texts
                    or 'Engineered mutation' in residue_detail_texts):
                observed_sequence_aln[index] = single_letter
                if ss is not None:
                    ss_aln[index] = ss
        else:
            experimental_sequence_uniprot_res_indices.append(None)
        # Add residue to observed_sequence_aln_exp if it is observed, otherwise '-'
        if 'Not_Observed' in residue_detail_texts:
            observed_sequence_aln_exp += '-'
        else:
            observed_sequence_aln_exp += single_letter

    # Now check whether the number of non-observed residues is more than 90% of the experimental sequence length.
    # An empty experimental sequence is flagged too, rather than dividing by zero.
    n_unobserved_residues = observed_sequence_aln_exp.count('-')
    if not experimental_sequence or (
            float(n_unobserved_residues) /
            float(len(experimental_sequence))) > 0.9:
        exception_message = 'DELETE_ME'

    # ======
    # Now we add the residues which do not have a UniProt crossref
    # ======

    # But first we have to deal with cases where residues have been added at the N-terminus which extend before the start of the uniprot sequence. The excess residues will be ignored.
    # Get the uniprot residue index of the first residue with a uniprot crossref, and the corresponding pdb resid
    first_exp_seq_uniprot_res_index = None
    corresponding_pdb_resid = None
    for s, up_res_index in enumerate(
            experimental_sequence_uniprot_res_indices):
        if up_res_index is not None:
            first_exp_seq_uniprot_res_index = up_res_index
            corresponding_pdb_resid = experimental_sequence_pdb_resids[s]
            break

    ignore_excess_Nterm_residues_flag = False
    if first_exp_seq_uniprot_res_index is None:
        # No residue of this chain has a crossref to uniprot_ac. The original
        # code reached this state through a bare ``except`` (noted as
        # occurring with P27791, KAPCA_RAT).
        # XXX should do something better than this
        exception_message = 'DELETE_ME'
    elif (first_exp_seq_uniprot_res_index == 0
          and experimental_sequence_pdb_resids[0] < corresponding_pdb_resid):
        # The experimental sequence includes the first residue of the full
        # uniprot sequence, and starts at a lower pdb resid than the one
        # corresponding to that first uniprot residue: ignore the excess.
        ignore_excess_Nterm_residues_flag = True

    def write_unmapped_run(start, stop, residues):
        # Slice-assign into both alignments so they stay in step; the
        # conflicts alignment records these residues in lower case.
        experimental_sequence_aln[start:stop] = residues
        experimental_sequence_aln_conflicts[start:stop] = list(
            ''.join(residues).lower())

    def write_run_after_previous_residue(run_start, residues):
        # Add the run directly after the UniProt position of the residue
        # preceding it, but only if the run is pdb-contiguous with that
        # residue. A run starting at index 0 has no preceding residue
        # (index -1 would silently wrap around to the last residue).
        if run_start == 0:
            return
        if (experimental_sequence_pdb_resids[run_start] -
                experimental_sequence_pdb_resids[run_start - 1]) != 1:
            return
        start = experimental_sequence_uniprot_res_indices[run_start - 1] + 1
        write_unmapped_run(start, start + len(residues), residues)

    # Now iterate through the residues in the experimental sequence and add residues which do not have a uniprot crossref, but are contiguous in terms of PDB numbering
    n_exp_residues = len(experimental_sequence)
    i = 0
    while i < n_exp_residues:
        resname_i = experimental_sequence[i]
        uniprot_res_index_i = experimental_sequence_uniprot_res_indices[i]
        pdb_resid_i = experimental_sequence_pdb_resids[i]

        if ignore_excess_Nterm_residues_flag and (
                pdb_resid_i < corresponding_pdb_resid):
            pass  # we ignore these residues

        # If this residue does not have a uniprot crossref
        elif uniprot_res_index_i is None:
            # Start a list of residues with no uniprot crossref
            contiguous_noUP_residues = [resname_i]
            # Then check the next residue
            j = i + 1
            while j < n_exp_residues:
                resname_j = experimental_sequence[j]
                uniprot_res_index_j = experimental_sequence_uniprot_res_indices[j]
                pdb_resid_j = experimental_sequence_pdb_resids[j]
                pdb_contiguous = (pdb_resid_j - pdb_resid_i) == (j - i)

                # If this residue also has no uniprot crossref, and is contiguous in terms of pdb resnum, then add it to the list, and move on to the next one
                if uniprot_res_index_j is None and pdb_contiguous:
                    contiguous_noUP_residues.append(resname_j)

                # If this residue does have a uniprot crossref, and if it is contiguous in terms of pdb resnum, then we add the list of residues without uniprot crossrefs at this position
                elif uniprot_res_index_j is not None and pdb_contiguous:
                    # The run holds j - i residues and ends just before
                    # residue j's UniProt position. (The start used to be
                    # computed as index - j, which is only right for i == 0.)
                    run = contiguous_noUP_residues
                    start = uniprot_res_index_j - len(run)
                    if start < 0:
                        # Residues that would fall before the start of the
                        # UniProt sequence are dropped; a negative slice
                        # start would wrap around to the end of the list.
                        run = run[-start:]
                        start = 0
                    write_unmapped_run(start, uniprot_res_index_j, run)
                    i = j
                    break

                # If this residue is not contiguous in terms of pdb resnum, go back and check if the first of contiguous_noUP_residues is pdb-contiguous with the previous residue - if so, add contiguous_noUP_residues
                else:
                    write_run_after_previous_residue(
                        i, contiguous_noUP_residues)
                    i = j - 1
                    break

                # If we have reached the end of experimental_sequence, go back and check if the first of contiguous_noUP_residues is pdb-contiguous with the previous residue - if so, add contiguous_noUP_residues
                if j == n_exp_residues - 1:
                    write_run_after_previous_residue(
                        i, contiguous_noUP_residues)
                    i = j
                    break
                j += 1

        i += 1

    # ======
    # Some final processing
    # ======
    # This section runs once, after every residue has been visited. It was
    # previously nested inside the loop above, which made the function return
    # after handling only the first residue.

    # In cases such as 3LAU and 1O6L, additional sequence at end makes experimental_sequence_aln longer than uniprot_sequence by 1
    # Handle this by removing the extraneous sequence
    experimental_sequence_aln_conflicts_str = ''.join(
        experimental_sequence_aln_conflicts[0:n_uniprot_residues])

    chain_results_dict = {
        'chain_id': chain_id,
        'experimental_seq': experimental_sequence,
        'experimental_seq_aln_conflicts':
        experimental_sequence_aln_conflicts_str,
        'observed_seq_aln_exp': observed_sequence_aln_exp,
        'observed_seq_aln': ''.join(observed_sequence_aln),
        'observed_ss_aln': ''.join(ss_aln),
        'exception_message': exception_message,
    }
    return chain_results_dict
Exemple #19
0
 def commit(self):
     """Commit the pending database changes, then log the crawl numbers.

     Logs the current crawl number as the new safe one and the following
     number as the new current one; ``self.current_crawl_number`` itself
     is left unchanged by this method.
     """
     db.session.commit()
     logger.info('Database committed.')
     committed_crawl = self.current_crawl_number
     logger.info('New safe crawl number: {0}'.format(committed_crawl))
     upcoming_crawl = self.current_crawl_number + 1
     logger.info('New current crawl number: {0}'.format(upcoming_crawl))
     logger.info('Done.')
Exemple #20
0
    def analyze_domain_selections(self):
        """
        Prints useful info on the domains selected by uniprot_domain_regex

        The unique names of the selected domains, with their occurrence
        counts in ``self.uniprot_xml``, are logged. The same text, followed
        by the unique domain names which were not selected (with counts only
        when ``self.count_nonselected_domain_names`` is set), is written to
        ``self.domain_names_filename``.
        """
        selected_domain_names = list(set(
            [d.get('description') for d in self.selected_domains]
        ))

        selected_domain_name_counts = [
            len(self.uniprot_xml.findall('entry/feature[@type="domain"][@description="%s"]' % name))
            for name in selected_domain_names
        ]

        # The report is collected as fragments and joined on demand instead
        # of growing one string with repeated concatenation.
        report_parts = [
            'Regex: %s\n' % self.uniprot_domain_regex,
            'Number of domains matching regex: %d\n\n' % len(self.selected_domains),
            '= Unique domain names which match regex =\n',
        ]

        if selected_domain_names:
            # Column widths are the same for every row: compute them once.
            name_width = max(len(n) + 4 for n in selected_domain_names)
            pop_width = max(len(str(p)) + 1 for p in selected_domain_name_counts)
            for name, count in zip(selected_domain_names, selected_domain_name_counts):
                report_parts.append(
                    '{:^{name_width}s} : {:>{pop_width}d}\n'.format(
                        name, count, name_width=name_width, pop_width=pop_width
                    )
                )
        report_parts.append('\n')
        # Only the selected-domain part of the report is logged.
        logger.info(''.join(report_parts))

        logger.info(
            '(Unique domain names which do not match regex will be output to {0})'.format(
                self.domain_names_filename
            )
        )

        all_domains = self.uniprot_xml.findall('./entry/feature[@type="domain"]')
        report_parts.append('= Unique domain names which do not match regex =\n')
        selected_name_lookup = set(selected_domain_names)  # O(1) membership tests
        nonselected_domain_names = list(set([
            d.get('description') for d in all_domains
            if d.get('description') not in selected_name_lookup
        ]))

        if self.count_nonselected_domain_names:
            nonselected_domain_name_counts = [
                int(
                    self.uniprot_xml.xpath(
                        'count(entry/feature[@type="domain"][@description="{0}"])'.format(name)
                    )
                ) for name in nonselected_domain_names
            ]
            if nonselected_domain_names:
                name_width = max(len(n) + 4 for n in nonselected_domain_names)
                pop_width = max(len(str(p)) + 1 for p in nonselected_domain_name_counts)
                for name, count in zip(nonselected_domain_names, nonselected_domain_name_counts):
                    report_parts.append(
                        '{:^{name_width}s} : {:>{pop_width}d}\n'.format(
                            name, count, name_width=name_width, pop_width=pop_width
                        )
                    )
        elif nonselected_domain_names:
            name_width = max(len(n) + 4 for n in nonselected_domain_names)
            for name in nonselected_domain_names:
                report_parts.append(
                    '{:^{name_width}s}\n'.format(name, name_width=name_width)
                )
        report_parts.append('\n')

        with open(self.domain_names_filename, 'w') as domain_names_file:
            domain_names_file.write(''.join(report_parts))
Exemple #21
0
 def finish(self):
     """Log that this step has completed."""
     completion_message = 'Done.'
     logger.info(completion_message)
    def extract_mutation_data(self):
        """Create case and mutation rows from the MAF table in ``self.maf_df``.

        Every MAF row whose ``Transcript_ID`` matches an ``EnsemblTranscript``
        row in the database yields one ``CbioportalMutation``; rows without a
        matching transcript are skipped. One ``CbioportalCase`` is created per
        distinct ``Tumor_Sample_Barcode``. New rows are added to
        ``db.session``; nothing is committed here.

        For missense mutations whose amino-acid change string can be parsed,
        the mutation is additionally linked to the UniProt domain that
        contains the mutated position.
        """
        case_rows = {}
        n_mutations_added = 0
        for _, maf_row in self.maf_df.iterrows():
            oncotator_ensembl_transcript_id = maf_row.Transcript_ID
            matching_db_ensembl_transcript_row = models.EnsemblTranscript.query.filter_by(
                transcript_id=oncotator_ensembl_transcript_id
            ).first()
            if matching_db_ensembl_transcript_row is None:
                continue

            case_id = maf_row.Tumor_Sample_Barcode
            if case_id not in case_rows:
                case_rows[case_id] = models.CbioportalCase(
                    crawl_number=self.current_crawl_number, case_id=case_id, study='internal'
                )
                db.session.add(case_rows[case_id])

            # Named mutation_type (not ``type``) so the builtin is not shadowed.
            mutation_type = maf_row.Variant_Classification
            chromosome_index = maf_row.Chromosome
            chromosome_startpos = maf_row.Start_Position
            chromosome_endpos = maf_row.End_Position
            reference_dna_allele = maf_row.Reference_Allele
            # Prefer whichever tumour allele differs from the reference;
            # fall back to allele 1 when both equal the reference.
            if maf_row.Tumor_Seq_Allele1 != reference_dna_allele:
                variant_dna_allele = maf_row.Tumor_Seq_Allele1
            elif maf_row.Tumor_Seq_Allele2 != reference_dna_allele:
                variant_dna_allele = maf_row.Tumor_Seq_Allele2
            else:
                variant_dna_allele = maf_row.Tumor_Seq_Allele1

            cbioportal_aa_change_string = None
            oncotator_reference_aa = None
            oncotator_aa_pos = None
            oncotator_variant_aa = None
            aa_change = maf_row.Amino_Acid_Change
            # A missing value is expected to be a float NaN, which is not
            # guaranteed to be the ``np.nan`` singleton, so test by value
            # rather than by identity.
            aa_change_missing = aa_change is None or (
                isinstance(aa_change, float) and np.isnan(aa_change)
            )
            if not aa_change_missing:
                aa_change_regex_match = re.match(self.aa_change_regex, aa_change)
                if aa_change_regex_match:
                    cbioportal_aa_change_string = aa_change_regex_match.groups()[0]
                    if mutation_type == 'Missense_Mutation':
                        aa_change_split_regex_match = re.match(
                            self.aa_change_split_regex, cbioportal_aa_change_string
                        )
                        if aa_change_split_regex_match:
                            split_groups = aa_change_split_regex_match.groups()
                            oncotator_reference_aa = split_groups[0]
                            oncotator_aa_pos = int(split_groups[1])
                            oncotator_variant_aa = split_groups[2]
            validation_status = maf_row.Validation_Status
            functional_impact_score = maf_row['MA:FImpact']
            # Per-row diagnostics go through the logger instead of stdout.
            logger.debug(
                '%s %s %s %s %s', mutation_type, cbioportal_aa_change_string,
                oncotator_reference_aa, oncotator_aa_pos, oncotator_variant_aa
            )

            db_entry = matching_db_ensembl_transcript_row.ensembl_gene.db_entry
            mutation_row = models.CbioportalMutation(
                crawl_number=self.current_crawl_number,
                type=mutation_type,
                cbioportal_aa_change_string=cbioportal_aa_change_string,
                mutation_origin=None,
                validation_status=validation_status,
                functional_impact_score=functional_impact_score,
                chromosome_index=chromosome_index,
                chromosome_startpos=chromosome_startpos,
                chromosome_endpos=chromosome_endpos,
                reference_dna_allele=reference_dna_allele,
                variant_dna_allele=variant_dna_allele,
                oncotator_aa_pos=oncotator_aa_pos,
                oncotator_reference_aa=oncotator_reference_aa,
                oncotator_variant_aa=oncotator_variant_aa,
                oncotator_ensembl_transcript_id=oncotator_ensembl_transcript_id,
                db_entry=db_entry,
                cbioportal_case=case_rows[case_id],
                in_uniprot_domain=False,
            )

            # Is the mutation within a UniProt domain? Only answerable when a
            # position was parsed; comparing None against the domain bounds
            # relied on Python 2 ordering and raises TypeError on Python 3.
            if oncotator_aa_pos is not None:
                for domain in db_entry.uniprot_domains.all():
                    if domain.begin <= oncotator_aa_pos <= domain.end:
                        if oncotator_reference_aa != cbioportal_aa_change_string[0]:
                            continue
                        mutation_row.in_uniprot_domain = True
                        mutation_row.uniprot_domain = domain

            db.session.add(mutation_row)
            n_mutations_added += 1

        logger.info('From {} mutation annotations, added {} mutations and {} cases.'.format(
            len(self.maf_df), n_mutations_added, len(case_rows))
        )