def retrieve_mutation_datatxt( case_set_id, genetic_profile_id, gene_ids, portal_version='public-portal', verbose=False, ):
    """
    Queries cBioPortal for "Mutation" format data, given a list of cBioPortal
    cancer studies and a list of HGNC Approved gene Symbols.

    Parameters
    ----------
    case_set_id : str
        cBioPortal case-set identifier.
    genetic_profile_id : str
        cBioPortal genetic-profile identifier.
    gene_ids : list of str
        HGNC Approved gene Symbols; joined with '+' per the webservice API.
    portal_version : str
        URL path component selecting the portal deployment.
    verbose : bool
        If True, switch the module logger to debug level before logging the URL.

    Returns
    -------
    list of str
        The data file split into text lines.
    """
    gene_ids_string = '+'.join(gene_ids)
    mutation_url = 'http://www.cbioportal.org/{0}/' \
                   'webservice.do' \
                   '?cmd=getProfileData' \
                   '&case_set_id={1}' \
                   '&genetic_profile_id={2}' \
                   '&gene_list={3}'.format(
                       portal_version, case_set_id, genetic_profile_id, gene_ids_string
                   )
    if verbose:
        set_loglevel('debug')
    logger.debug(mutation_url)
    # BUGFIX: close the HTTP connection deterministically. urllib2 responses
    # are not context managers in Python 2, so use try/finally.
    response = urllib2.urlopen(mutation_url)
    try:
        # BUGFIX: read() with no argument retrieves the whole body; the old
        # magic 1000000000-byte cap silently truncated anything larger.
        page = response.read()
    finally:
        response.close()
    return page.splitlines()
def retrieve_mutation_datatxt(case_set_id, genetic_profile_id, gene_ids, portal_version='public-portal', verbose=False, ):
    """
    Queries cBioPortal for "Mutation" format data, given a list of cBioPortal
    cancer studies and a list of HGNC Approved gene Symbols.

    Returns the data file as a list of text lines.
    """
    # The webservice expects the gene list joined with '+'.
    joined_genes = '+'.join(gene_ids)
    url_template = 'http://www.cbioportal.org/{0}/' \
                   'webservice.do' \
                   '?cmd=getProfileData' \
                   '&case_set_id={1}' \
                   '&genetic_profile_id={2}' \
                   '&gene_list={3}'
    query_url = url_template.format(portal_version, case_set_id,
                                    genetic_profile_id, joined_genes)
    if verbose:
        set_loglevel('debug')
    logger.debug(query_url)
    # Fetch the profile data and split the payload into individual lines.
    reply = urllib2.urlopen(query_url)
    payload = reply.read(1000000000)
    return payload.splitlines()
def _is_nonempty_file(filepath):
    """Return True if filepath exists and has non-zero size."""
    return os.path.exists(filepath) and os.path.getsize(filepath) > 0


def _symlink_from_structure_dirs(structure_dirs, filename, local_filepath):
    """Search user-defined structure_dirs for a non-empty copy of filename.

    If found, (re)create local_filepath as a symlink to it, replacing any
    stale (e.g. empty) local file or link first.  Returns True if a symlink
    was created, False otherwise.  Shared by the PDB and SIFTS lookups,
    which were previously copy-pasted.
    """
    if not structure_dirs:
        return False
    for structure_dir in structure_dirs:
        candidate_filepath = os.path.join(structure_dir, filename)
        if _is_nonempty_file(candidate_filepath):
            if os.path.exists(local_filepath):
                os.remove(local_filepath)
            os.symlink(candidate_filepath, local_filepath)
            return True
    return False


def extract_pdb_data(pdb_dict):
    '''Extract data for a single PDB structure.

    Parameters
    ----------
    pdb_dict : dict
        Must contain 'pdb_row_id', 'pdb_id', 'ac', 'entry_name', 'seq',
        'chain_data' (list of dicts with 'chain_row_id' and 'chain_id') and
        'structure_dirs' (list of directories holding local PDB/SIFTS
        copies, or None).

    Returns
    -------
    dict
        On success: {'pdb_row_id': ..., 'expression_data': {...},
        'chain_dicts': {chain_row_id: ...}}.
        If the SIFTS file cannot be downloaded but the PDB file exists,
        returns {'pdb_row_id': ..., 'exception_message': ...} so the caller
        can delete this PDB entry from the DB.
    '''
    pdb_row_id = pdb_dict['pdb_row_id']
    pdb_id = pdb_dict['pdb_id']
    ac = pdb_dict['ac']
    entry_name = pdb_dict['entry_name']
    seq = pdb_dict['seq']
    chain_data = pdb_dict['chain_data']
    structure_dirs = pdb_dict['structure_dirs']

    # ========
    # Get PDB and SIFTS files
    # PDB files are used to extract expression system metadata
    # SIFTS files are used to extract sequence data
    # ========
    # TODO define this via project metadata .yaml file.
    local_pdb_filepath = os.path.join('external-data', 'PDB', pdb_id + '.pdb.gz')
    local_sifts_filepath = os.path.join('external-data', 'SIFTS', pdb_id + '.xml.gz')

    # If no non-empty local PDB file/symlink exists, try the user-defined
    # structure dirs, then fall back to downloading.
    if not _is_nonempty_file(local_pdb_filepath):
        _symlink_from_structure_dirs(structure_dirs, pdb_id + '.pdb.gz', local_pdb_filepath)
        if not os.path.exists(local_pdb_filepath):
            logger.info('Downloading PDB file and saving as: %s', local_pdb_filepath)
            # retrieve_pdb returns the already-compressed file contents.
            page = retrieve_pdb(pdb_id, compressed='yes')
            with open(local_pdb_filepath, 'wb') as local_pdb_file:
                local_pdb_file.write(page)

    # Same dance for the SIFTS XML file.
    if not _is_nonempty_file(local_sifts_filepath):
        _symlink_from_structure_dirs(structure_dirs, pdb_id + '.xml.gz', local_sifts_filepath)
        if not os.path.exists(local_sifts_filepath):
            logger.info('Downloading SIFTS file (compressed) and saving as: %s', local_sifts_filepath)
            try:
                page = retrieve_sifts(pdb_id)
            except urllib2.URLError as urlerror:
                # An FTP 550 is assumed to mean the SIFTS people have not yet
                # created/published the file for this PDB entry.
                if urlerror.reason == 'ftp error: [Errno ftp error] 550 Failed to change directory.':
                    if os.path.exists(local_pdb_filepath):
                        # The PDB file exists, so the SIFTS entry is genuinely
                        # missing: tell the caller to delete this PDB row.
                        logger.info('%s SIFTS file could not be downloaded - this PDB entry will be deleted from the DB', pdb_id)
                        return {'pdb_row_id': pdb_row_id, 'exception_message': 'SIFTS file could not be downloaded'}
                # BUGFIX: bare raise preserves the original traceback.
                raise
            with gzip.open(local_sifts_filepath, 'wb') as local_sifts_file:
                local_sifts_file.write(page)

    # ======
    # From PDB file, get EXPRESSION_SYSTEM and related fields, using Bio.PDB.PDBParser
    # ======
    db_chain_ids_lower = [chain_dict['chain_id'].lower() for chain_dict in chain_data]
    pdbparser = Bio.PDB.PDBParser(QUIET=True)
    with gzip.open(local_pdb_filepath) as local_pdb_file:
        # get_structure must run so get_header is populated; the structure
        # object itself is not needed here.
        pdbparser.get_structure(pdb_id, local_pdb_file)
    pdbheader = pdbparser.get_header()
    # Bio PDB compound structure: {'compound': {'1': {'chain': 'a, b'}}}
    pdb_compounds = pdbheader['compound']
    matching_pdb_compound_id = None
    try:
        for pdb_compound_id in pdb_compounds:
            compound_chain_ids = pdb_compounds[pdb_compound_id]['chain'].split(', ')
            if any(cid in db_chain_ids_lower for cid in compound_chain_ids):
                matching_pdb_compound_id = pdb_compound_id
                # BUGFIX: previously only the inner loop was exited, so a
                # later compound that also matched overwrote the id.
                break
        assert matching_pdb_compound_id is not None
    except Exception:
        logger.error('ERROR for entry %s PDB %s. PDB header dict as parsed by BioPython follows:', entry_name, pdb_id)
        logger.error('%s', pdbheader)
        logger.error(traceback.format_exc())
        # BUGFIX: bare raise preserves the original traceback (was `raise e`).
        raise

    expression_data = {}
    # Bio PDB source structure: {'source': {'1': {'expression_system': 'escherichia coli'}}}
    pdbexpression_data = pdbheader['source'][matching_pdb_compound_id]
    for key in pdbexpression_data:
        if key.startswith('expression'):
            # Make expression data upper-case again. I think it looks better for single-case text.
            expression_data[key.upper()] = pdbexpression_data[key].upper()

    # ======
    # Iterate through chains in PDBRow and extract sequence data from SIFTS file, and add to database
    # ======
    results = {'pdb_row_id': pdb_row_id, 'expression_data': expression_data, 'chain_dicts': {}}
    for chain_dict in chain_data:
        chain_row_id = chain_dict['chain_row_id']
        chain_id = chain_dict['chain_id']
        # BUGFIX: pass a %-format string; logger.debug(entry_name, ac, ...)
        # treated entry_name as the format string, producing formatting
        # errors at emit time.
        logger.debug('%s %s %s %s', entry_name, ac, pdb_id, chain_id)
        results['chain_dicts'][chain_row_id] = extract_sifts_seq(local_sifts_filepath, ac, entry_name, pdb_id, chain_id, seq)
    return results
def extract_pdb_data(pdb_dict):
    '''Extract data for a single PDB structure

    Expects `pdb_dict` to carry: 'pdb_row_id', 'pdb_id', 'ac', 'entry_name',
    'seq', 'chain_data' (list of dicts with 'chain_row_id'/'chain_id') and
    'structure_dirs' (list of directories holding local PDB/SIFTS copies,
    or None).

    Returns a results dict with 'pdb_row_id', 'expression_data' and
    'chain_dicts', or an early dict carrying an 'exception_message' when the
    SIFTS file cannot be downloaded (so the caller can delete the DB entry).
    '''
    # Unpack the work-item dict into locals.
    pdb_row_id = pdb_dict['pdb_row_id']
    pdb_id = pdb_dict['pdb_id']
    ac = pdb_dict['ac']
    entry_name = pdb_dict['entry_name']
    seq = pdb_dict['seq']
    chain_data = pdb_dict['chain_data']
    structure_dirs = pdb_dict['structure_dirs']

    # if entry_name != 'MLKL_HUMAN':
    #     return None
    #
    # if pdb_id != '2ITN':
    #     return None

    # ========
    # Get PDB and SIFTS files
    # PDB files are used to extract expression system metadata
    # SIFTS files are used to extract sequence data
    # ========
    # TODO define this via project metadata .yaml file.
    # structure_dirs = ['/Users/partond/tmp/kinome-MSMSeeder/structures/pdb', '/Users/partond/tmp/kinome-MSMSeeder/structures/sifts']
    local_pdb_filepath = os.path.join('external-data', 'PDB', pdb_id + '.pdb.gz')
    local_sifts_filepath = os.path.join('external-data', 'SIFTS', pdb_id + '.xml.gz')

    # Check if PDB file/symlink already exists and is not empty
    search_for_pdb = True
    if os.path.exists(local_pdb_filepath):
        if os.path.getsize(local_pdb_filepath) > 0:
            search_for_pdb = False

    # If not, search any user-defined paths and create a symlink if found
    if search_for_pdb:
        if structure_dirs:
            for structure_dir in structure_dirs:
                pdb_filepath = os.path.join(structure_dir, pdb_id + '.pdb.gz')
                if os.path.exists(pdb_filepath):
                    if os.path.getsize(pdb_filepath) > 0:
                        # Replace any stale (empty/broken) local file or link first.
                        if os.path.exists(local_pdb_filepath):
                            os.remove(local_pdb_filepath)
                        os.symlink(pdb_filepath, local_pdb_filepath)
                        break

        # If still not found, download the PDB file
        if not os.path.exists(local_pdb_filepath):
            print 'Downloading PDB file and saving as:', local_pdb_filepath
            page = retrieve_pdb(pdb_id, compressed='yes')
            # download and write compressed file
            with open(local_pdb_filepath, 'wb') as local_pdb_file:
                local_pdb_file.write(page)

    # Check if SIFTS file already exists and is not empty
    search_for_sifts = True
    if os.path.exists(local_sifts_filepath):
        if os.path.getsize(local_sifts_filepath) > 0:
            search_for_sifts = False

    # If not, search any user-defined paths and create a symlink if found
    if search_for_sifts:
        if structure_dirs:
            for structure_dir in structure_dirs:
                sifts_filepath = os.path.join(structure_dir, pdb_id + '.xml.gz')
                if os.path.exists(sifts_filepath):
                    if os.path.getsize(sifts_filepath) > 0:
                        if os.path.exists(local_sifts_filepath):
                            os.remove(local_sifts_filepath)
                        os.symlink(sifts_filepath, local_sifts_filepath)
                        break

        # If still not found, download the SIFTS XML file
        if not os.path.exists(local_sifts_filepath):
            print 'Downloading SIFTS file (compressed) and saving as:', local_sifts_filepath
            try:
                page = retrieve_sifts(pdb_id)
            except urllib2.URLError as urlerror:
                # NOTE(review): this exact reason string matches an FTP 550;
                # it is assumed to mean the SIFTS entry does not exist yet —
                # confirm against the retrieve_sifts transport.
                if urlerror.reason == 'ftp error: [Errno ftp error] 550 Failed to change directory.':
                    # Check the PDB file has definitely been downloaded. If so, then the problem is probably that the SIFTS people have not yet created the file for this PDB entry, or they have not added it to their server yet.
                    if os.path.exists(local_pdb_filepath):
                        # In this case, just add a message telling the script to delete this PDB structure from the DB. The continue clause skips to the end of the function.
                        print '%s SIFTS file could not be downloaded - this PDB entry will be deleted from the DB' % pdb_id
                        return {
                            'pdb_row_id': pdb_row_id,
                            'exception_message': 'SIFTS file could not be downloaded'
                        }
                    else:
                        raise urlerror
                else:
                    raise urlerror

            with gzip.open(local_sifts_filepath, 'wb') as local_sifts_file:
                local_sifts_file.write(page)

    # ======
    # From PDB file, get EXPRESSION_SYSTEM and related fields, using Bio.PDB.PDBParser
    # ======
    db_chain_ids_lower = [
        chain_dict['chain_id'].lower() for chain_dict in chain_data
    ]
    pdbparser = Bio.PDB.PDBParser(QUIET=True)
    with gzip.open(local_pdb_filepath) as local_pdb_file:
        # Parsing the structure populates the header used below.
        pdbdata = pdbparser.get_structure(pdb_id, local_pdb_file)
    pdbheader = pdbparser.get_header()
    # Bio PDB compound structure: {'compound': {'1': {'chain': 'a, b'}}}
    pdb_compounds = pdbheader['compound']
    matching_pdb_compound_id = None
    try:
        for pdb_compound_id in pdb_compounds.keys():
            for pdb_chain_id in pdb_compounds[pdb_compound_id]['chain'].split(
                    ', '):
                if pdb_chain_id in db_chain_ids_lower:
                    matching_pdb_compound_id = pdb_compound_id
                    # NOTE(review): this break exits only the inner loop, so a
                    # later compound that also matches would overwrite the id —
                    # confirm whether first match should win.
                    break
        assert matching_pdb_compound_id is not None
    except Exception as e:
        # Dump the parsed header to aid debugging before re-raising.
        print 'ERROR for entry %s PDB %s. PDB header dict as parsed by BioPython follows:' % (
            entry_name, pdb_id)
        print pdbheader
        print traceback.format_exc()
        raise e
    expression_data = {}
    # Bio PDB source structure: {'source': {'1': {'expression_system': 'escherichia coli'}}}
    pdbexpression_data = pdbheader['source'][matching_pdb_compound_id]
    for key in pdbexpression_data.keys():
        if key[0:10] == 'expression':
            # Make expression data upper-case again. I think it looks better for single-case text.
            expression_data[key.upper()] = pdbexpression_data[key].upper()
            # expression_data_obj = models.PDBExpressionData(expression_data_type=key.upper(), expression_data_value=pdbexpression_data[key].upper(), pdb=pdb_row)
            # db.session.add(expression_data_obj)

    # ======
    # Iterate through chains in PDBRow and extract sequence data from SIFTS file, and add to database
    # ======
    results = {
        'pdb_row_id': pdb_row_id,
        'expression_data': expression_data,
        'chain_dicts': {}
    }
    for chain_dict in chain_data:
        chain_row_id = chain_dict['chain_row_id']
        chain_id = chain_dict['chain_id']
        # NOTE(review): logging treats the first argument as a %-format
        # string; passing four positional data args here will trigger a
        # formatting error at emit time.
        logger.debug(entry_name, ac, pdb_id, chain_id)
        pdb_chain_dict = extract_sifts_seq(local_sifts_filepath, ac,
                                           entry_name, pdb_id, chain_id, seq)
        results['chain_dicts'][chain_row_id] = pdb_chain_dict
    return results