Example 1
def retrieve_mutation_datatxt(
    case_set_id,
    genetic_profile_id,
    gene_ids,
    portal_version='public-portal',
    verbose=False,
):
    """
    Queries cBioPortal for "Mutation" format data, given a list of cBioPortal cancer studies and a list of HGNC Approved gene Symbols.
    Returns the data file as a list of text lines.
    """
    gene_ids_string = '+'.join(gene_ids)
    mutation_url = 'http://www.cbioportal.org/{0}/' \
                   'webservice.do' \
                   '?cmd=getProfileData' \
                   '&case_set_id={1}' \
                   '&genetic_profile_id={2}' \
                   '&gene_list={3}'.format(
                       portal_version,
                       case_set_id,
                       genetic_profile_id,
                       gene_ids_string
                   )
    if verbose:
        set_loglevel('debug')
        logger.debug(mutation_url)
    response = urllib2.urlopen(mutation_url)
    page = response.read(1000000000)  # read the full response (large value is just an upper byte cap)
    lines = page.splitlines()
    return lines
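
A call to this function might look like the following. This is a minimal usage sketch: the case set, genetic profile, and gene identifiers are hypothetical placeholders rather than values from the original project, and it assumes the module-level logger and set_loglevel helpers referenced above are already configured.

# Minimal usage sketch (hypothetical cBioPortal identifiers).
lines = retrieve_mutation_datatxt(
    case_set_id='gbm_tcga_all',               # hypothetical case set ID
    genetic_profile_id='gbm_tcga_mutations',  # hypothetical genetic profile ID
    gene_ids=['EGFR', 'KRAS', 'BRAF'],
    verbose=True,
)
for line in lines[:5]:
    print line  # Python 2 print statement, matching the example above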
Example 2
def retrieve_mutation_datatxt(case_set_id,
                              genetic_profile_id,
                              gene_ids,
                              portal_version='public-portal',
                              verbose=False,
                              ):
    """
    Queries cBioPortal for "Mutation" format data, given a list of cBioPortal cancer studies and a list of HGNC Approved gene Symbols.
    Returns the data file as a list of text lines.
    """
    gene_ids_string = '+'.join(gene_ids)
    mutation_url = 'http://www.cbioportal.org/{0}/' \
                   'webservice.do' \
                   '?cmd=getProfileData' \
                   '&case_set_id={1}' \
                   '&genetic_profile_id={2}' \
                   '&gene_list={3}'.format(
                       portal_version,
                       case_set_id,
                       genetic_profile_id,
                       gene_ids_string
                   )
    if verbose:
        set_loglevel('debug')
        logger.debug(mutation_url)
    response = urllib2.urlopen(mutation_url)
    page = response.read(1000000000)  # read the full response (large value is just an upper byte cap)
    lines = page.splitlines()
    return lines
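
Both examples above are Python 2 code (urllib2 and print statements). For Python 3, a rough sketch of the same getProfileData request using the standard-library urllib.request could look like the following; it is an illustrative port, not code from the original project.

import logging
import urllib.request

logger = logging.getLogger(__name__)

def retrieve_mutation_datatxt_py3(case_set_id, genetic_profile_id, gene_ids,
                                  portal_version='public-portal', verbose=False):
    """Python 3 sketch of the same cBioPortal getProfileData query."""
    mutation_url = (
        'http://www.cbioportal.org/{0}/webservice.do'
        '?cmd=getProfileData'
        '&case_set_id={1}'
        '&genetic_profile_id={2}'
        '&gene_list={3}'.format(
            portal_version, case_set_id, genetic_profile_id, '+'.join(gene_ids)
        )
    )
    if verbose:
        logger.setLevel(logging.DEBUG)
        logger.debug(mutation_url)
    # urlopen returns an HTTPResponse usable as a context manager in Python 3
    with urllib.request.urlopen(mutation_url) as response:
        page = response.read().decode('utf-8')
    return page.splitlines()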
Example 3
def extract_pdb_data(pdb_dict):
    '''Extract data for a single PDB structure
    '''
    pdb_row_id = pdb_dict['pdb_row_id']
    pdb_id = pdb_dict['pdb_id']
    ac = pdb_dict['ac']
    entry_name = pdb_dict['entry_name']
    seq = pdb_dict['seq']
    chain_data = pdb_dict['chain_data']
    structure_dirs = pdb_dict['structure_dirs']

    # if entry_name != 'MLKL_HUMAN':
    #     return None
    #
    # if pdb_id != '2ITN':
    #     return None

    # ========
    # Get PDB and SIFTS files
    # PDB files are used to extract expression system metadata
    # SIFTS files are used to extract sequence data
    # ========

    # TODO define this via project metadata .yaml file.
    # structure_dirs = ['/Users/partond/tmp/kinome-MSMSeeder/structures/pdb', '/Users/partond/tmp/kinome-MSMSeeder/structures/sifts']

    local_pdb_filepath = os.path.join('external-data', 'PDB', pdb_id + '.pdb.gz')
    local_sifts_filepath = os.path.join('external-data', 'SIFTS', pdb_id + '.xml.gz')

    # Check if PDB file/symlink already exists and is not empty
    search_for_pdb = True
    if os.path.exists(local_pdb_filepath):
        if os.path.getsize(local_pdb_filepath) > 0:
            search_for_pdb = False

    # If not, search any user-defined paths and create a symlink if found
    if search_for_pdb:
        if structure_dirs:
            for structure_dir in structure_dirs:
                pdb_filepath = os.path.join(structure_dir, pdb_id + '.pdb.gz')
                if os.path.exists(pdb_filepath):
                    if os.path.getsize(pdb_filepath) > 0:
                        if os.path.exists(local_pdb_filepath):
                            os.remove(local_pdb_filepath)
                        os.symlink(pdb_filepath, local_pdb_filepath)
                        break

        # If still not found, download the PDB file
        if not os.path.exists(local_pdb_filepath):
            print 'Downloading PDB file and saving as:', local_pdb_filepath
            page = retrieve_pdb(pdb_id, compressed='yes')
            # download and write compressed file
            with open(local_pdb_filepath, 'wb') as local_pdb_file:
                local_pdb_file.write(page)

    # Check if SIFTS file already exists and is not empty
    search_for_sifts = True
    if os.path.exists(local_sifts_filepath):
        if os.path.getsize(local_sifts_filepath) > 0:
            search_for_sifts = False

    # If not, search any user-defined paths and create a symlink if found
    if search_for_sifts:
        if structure_dirs:
            for structure_dir in structure_dirs:
                sifts_filepath = os.path.join(structure_dir, pdb_id + '.xml.gz')
                if os.path.exists(sifts_filepath):
                    if os.path.getsize(sifts_filepath) > 0:
                        if os.path.exists(local_sifts_filepath):
                            os.remove(local_sifts_filepath)
                        os.symlink(sifts_filepath, local_sifts_filepath)
                        break

        # If still not found, download the SIFTS XML file
        if not os.path.exists(local_sifts_filepath):
            print 'Downloading SIFTS file (compressed) and saving as:', local_sifts_filepath
            try:
                page = retrieve_sifts(pdb_id)
            except urllib2.URLError as urlerror:
                if urlerror.reason == 'ftp error: [Errno ftp error] 550 Failed to change directory.':
                    # Check that the PDB file has definitely been downloaded. If so, the
                    # problem is probably that SIFTS has not yet generated the file for
                    # this PDB entry, or has not yet added it to the server.
                    if os.path.exists(local_pdb_filepath):
                        # In that case, return a message telling the caller to delete
                        # this PDB structure from the DB.
                        print '%s SIFTS file could not be downloaded - this PDB entry will be deleted from the DB' % pdb_id
                        return {'pdb_row_id': pdb_row_id, 'exception_message': 'SIFTS file could not be downloaded'}
                    else:
                        raise urlerror
                else:
                    raise urlerror

            with gzip.open(local_sifts_filepath, 'wb') as local_sifts_file:
                local_sifts_file.write(page)

    # ======
    # From PDB file, get EXPRESSION_SYSTEM and related fields, using Bio.PDB.PDBParser
    # ======

    db_chain_ids_lower = [chain_dict['chain_id'].lower() for chain_dict in chain_data]

    pdbparser = Bio.PDB.PDBParser(QUIET=True)
    with gzip.open(local_pdb_filepath) as local_pdb_file:
        pdbdata = pdbparser.get_structure(pdb_id, local_pdb_file)
    pdbheader = pdbparser.get_header()
    # Bio PDB compound structure: {'compound': {'1': {'chain': 'a, b'}}}
    pdb_compounds = pdbheader['compound']
    matching_pdb_compound_id = None
    try:
        for pdb_compound_id in pdb_compounds.keys():
            for pdb_chain_id in pdb_compounds[pdb_compound_id]['chain'].split(', '):
                if pdb_chain_id in db_chain_ids_lower:
                    matching_pdb_compound_id = pdb_compound_id
                    break
        assert matching_pdb_compound_id is not None
    except Exception as e:
        print 'ERROR for entry %s PDB %s. PDB header dict as parsed by BioPython follows:' % (entry_name, pdb_id)
        print pdbheader
        print traceback.format_exc()
        raise e

    expression_data = {}
    # Bio PDB source structure: {'source': {'1': {'expression_system': 'escherichia coli'}}}
    pdbexpression_data = pdbheader['source'][matching_pdb_compound_id]
    for key in pdbexpression_data.keys():
        if key[0:10] == 'expression':
            # Make expression data upper-case again. I think it looks better for single-case text.
            expression_data[key.upper()] = pdbexpression_data[key].upper()
            # expression_data_obj = models.PDBExpressionData(expression_data_type=key.upper(), expression_data_value=pdbexpression_data[key].upper(), pdb=pdb_row)
            # db.session.add(expression_data_obj)

    # ======
    # Iterate through chains in PDBRow and extract sequence data from SIFTS file, and add to database
    # ======

    results = {'pdb_row_id': pdb_row_id, 'expression_data': expression_data, 'chain_dicts': {}}

    for chain_dict in chain_data:
        chain_row_id = chain_dict['chain_row_id']
        chain_id = chain_dict['chain_id']
        logger.debug('%s %s %s %s', entry_name, ac, pdb_id, chain_id)
        pdb_chain_dict = extract_sifts_seq(local_sifts_filepath, ac, entry_name, pdb_id, chain_id, seq)
        results['chain_dicts'][chain_row_id] = pdb_chain_dict

    return results
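
The shape of the pdb_dict argument can be read off from the unpacking at the top of the function. The sketch below shows an illustrative input and the two possible return shapes; all values are hypothetical.

# Illustrative input for extract_pdb_data (keys mirror the unpacking above;
# all values are hypothetical).
pdb_dict = {
    'pdb_row_id': 1,                  # database row ID for this PDB entry
    'pdb_id': '2ITN',                 # four-character PDB ID
    'ac': 'P00533',                   # UniProt accession (illustrative)
    'entry_name': 'EGFR_HUMAN',       # UniProt entry name (illustrative)
    'seq': 'MRPSG',                   # placeholder for the full UniProt sequence
    'chain_data': [
        {'chain_row_id': 10, 'chain_id': 'A'},
    ],
    'structure_dirs': ['/data/structures/pdb', '/data/structures/sifts'],
}

results = extract_pdb_data(pdb_dict)
# On success: {'pdb_row_id': ..., 'expression_data': {...}, 'chain_dicts': {...}}
# On a SIFTS download failure:
#     {'pdb_row_id': ..., 'exception_message': 'SIFTS file could not be downloaded'}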
Example 4
def extract_pdb_data(pdb_dict):
    '''Extract data for a single PDB structure
    '''
    pdb_row_id = pdb_dict['pdb_row_id']
    pdb_id = pdb_dict['pdb_id']
    ac = pdb_dict['ac']
    entry_name = pdb_dict['entry_name']
    seq = pdb_dict['seq']
    chain_data = pdb_dict['chain_data']
    structure_dirs = pdb_dict['structure_dirs']

    # if entry_name != 'MLKL_HUMAN':
    #     return None
    #
    # if pdb_id != '2ITN':
    #     return None

    # ========
    # Get PDB and SIFTS files
    # PDB files are used to extract expression system metadata
    # SIFTS files are used to extract sequence data
    # ========

    # TODO define this via project metadata .yaml file.
    # structure_dirs = ['/Users/partond/tmp/kinome-MSMSeeder/structures/pdb', '/Users/partond/tmp/kinome-MSMSeeder/structures/sifts']

    local_pdb_filepath = os.path.join('external-data', 'PDB',
                                      pdb_id + '.pdb.gz')
    local_sifts_filepath = os.path.join('external-data', 'SIFTS',
                                        pdb_id + '.xml.gz')

    # Check if PDB file/symlink already exists and is not empty
    search_for_pdb = True
    if os.path.exists(local_pdb_filepath):
        if os.path.getsize(local_pdb_filepath) > 0:
            search_for_pdb = False

    # If not, search any user-defined paths and create a symlink if found
    if search_for_pdb:
        if structure_dirs:
            for structure_dir in structure_dirs:
                pdb_filepath = os.path.join(structure_dir, pdb_id + '.pdb.gz')
                if os.path.exists(pdb_filepath):
                    if os.path.getsize(pdb_filepath) > 0:
                        if os.path.exists(local_pdb_filepath):
                            os.remove(local_pdb_filepath)
                        os.symlink(pdb_filepath, local_pdb_filepath)
                        break

        # If still not found, download the PDB file
        if not os.path.exists(local_pdb_filepath):
            print 'Downloading PDB file and saving as:', local_pdb_filepath
            page = retrieve_pdb(pdb_id, compressed='yes')
            # download and write compressed file
            with open(local_pdb_filepath, 'wb') as local_pdb_file:
                local_pdb_file.write(page)

    # Check if SIFTS file already exists and is not empty
    search_for_sifts = True
    if os.path.exists(local_sifts_filepath):
        if os.path.getsize(local_sifts_filepath) > 0:
            search_for_sifts = False

    # If not, search any user-defined paths and create a symlink if found
    if search_for_sifts:
        if structure_dirs:
            for structure_dir in structure_dirs:
                sifts_filepath = os.path.join(structure_dir,
                                              pdb_id + '.xml.gz')
                if os.path.exists(sifts_filepath):
                    if os.path.getsize(sifts_filepath) > 0:
                        if os.path.exists(local_sifts_filepath):
                            os.remove(local_sifts_filepath)
                        os.symlink(sifts_filepath, local_sifts_filepath)
                        break

        # If still not found, download the SIFTS XML file
        if not os.path.exists(local_sifts_filepath):
            print 'Downloading SIFTS file (compressed) and saving as:', local_sifts_filepath
            try:
                page = retrieve_sifts(pdb_id)
            except urllib2.URLError as urlerror:
                if urlerror.reason == 'ftp error: [Errno ftp error] 550 Failed to change directory.':
                    # Check that the PDB file has definitely been downloaded. If so, the
                    # problem is probably that SIFTS has not yet generated the file for
                    # this PDB entry, or has not yet added it to the server.
                    if os.path.exists(local_pdb_filepath):
                        # In that case, return a message telling the caller to delete
                        # this PDB structure from the DB.
                        print '%s SIFTS file could not be downloaded - this PDB entry will be deleted from the DB' % pdb_id
                        return {
                            'pdb_row_id':
                            pdb_row_id,
                            'exception_message':
                            'SIFTS file could not be downloaded'
                        }
                    else:
                        raise urlerror
                else:
                    raise urlerror

            with gzip.open(local_sifts_filepath, 'wb') as local_sifts_file:
                local_sifts_file.write(page)

    # ======
    # From PDB file, get EXPRESSION_SYSTEM and related fields, using Bio.PDB.PDBParser
    # ======

    db_chain_ids_lower = [
        chain_dict['chain_id'].lower() for chain_dict in chain_data
    ]

    pdbparser = Bio.PDB.PDBParser(QUIET=True)
    with gzip.open(local_pdb_filepath) as local_pdb_file:
        pdbdata = pdbparser.get_structure(pdb_id, local_pdb_file)
    pdbheader = pdbparser.get_header()
    # Bio PDB compound structure: {'compound': {'1': {'chain': 'a, b'}}}
    pdb_compounds = pdbheader['compound']
    matching_pdb_compound_id = None
    try:
        for pdb_compound_id in pdb_compounds.keys():
            for pdb_chain_id in pdb_compounds[pdb_compound_id]['chain'].split(
                    ', '):
                if pdb_chain_id in db_chain_ids_lower:
                    matching_pdb_compound_id = pdb_compound_id
                    break
        assert matching_pdb_compound_id is not None
    except Exception as e:
        print 'ERROR for entry %s PDB %s. PDB header dict as parsed by BioPython follows:' % (
            entry_name, pdb_id)
        print pdbheader
        print traceback.format_exc()
        raise e

    expression_data = {}
    # Bio PDB source structure: {'source': {'1': {'expression_system': 'escherichia coli'}}}
    pdbexpression_data = pdbheader['source'][matching_pdb_compound_id]
    for key in pdbexpression_data.keys():
        if key[0:10] == 'expression':
            # Make expression data upper-case again. I think it looks better for single-case text.
            expression_data[key.upper()] = pdbexpression_data[key].upper()
            # expression_data_obj = models.PDBExpressionData(expression_data_type=key.upper(), expression_data_value=pdbexpression_data[key].upper(), pdb=pdb_row)
            # db.session.add(expression_data_obj)

    # ======
    # Iterate through chains in PDBRow and extract sequence data from SIFTS file, and add to database
    # ======

    results = {
        'pdb_row_id': pdb_row_id,
        'expression_data': expression_data,
        'chain_dicts': {}
    }

    for chain_dict in chain_data:
        chain_row_id = chain_dict['chain_row_id']
        chain_id = chain_dict['chain_id']
        logger.debug('%s %s %s %s', entry_name, ac, pdb_id, chain_id)
        pdb_chain_dict = extract_sifts_seq(local_sifts_filepath, ac,
                                           entry_name, pdb_id, chain_id, seq)
        results['chain_dicts'][chain_row_id] = pdb_chain_dict

    return results
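
The PDB and SIFTS branches in both examples follow the same pattern: keep an existing non-empty local file, otherwise symlink a non-empty copy found in any user-defined structure directory, otherwise download. The helper below is a standalone sketch of that pattern; find_or_fetch and its fetch callback are hypothetical and not part of the original code, and the sketch leaves any gzip re-compression of downloaded content to the callback.

import os

def find_or_fetch(filename, local_filepath, structure_dirs, fetch):
    """Hypothetical helper: reuse, symlink, or download a structure file.

    `fetch` is a callback returning the raw file contents as bytes.
    """
    # Keep an existing, non-empty local file or symlink as-is.
    if os.path.exists(local_filepath) and os.path.getsize(local_filepath) > 0:
        return local_filepath

    # Otherwise, symlink the first non-empty copy found in the user-defined dirs.
    for structure_dir in structure_dirs or []:
        candidate = os.path.join(structure_dir, filename)
        if os.path.exists(candidate) and os.path.getsize(candidate) > 0:
            if os.path.exists(local_filepath):
                os.remove(local_filepath)
            os.symlink(candidate, local_filepath)
            return local_filepath

    # Otherwise, download and write the file locally.
    with open(local_filepath, 'wb') as local_file:
        local_file.write(fetch())
    return local_filepath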