Example #1
    def retrieve(pdb_id, cache_dir = None):
        '''Creates a PDBML object by using a cached copy of the files if they exist or by retrieving the files from the RCSB.'''

        pdb_contents = None
        xml_contents = None
        pdb_id = pdb_id.upper()

        if cache_dir:
            # Check to see whether we have a cached copy of the PDB file
            filename = os.path.join(cache_dir, "%s.pdb" % pdb_id)
            if os.path.exists(filename):
                pdb_contents = read_file(filename)

            # Check to see whether we have a cached copy of the XML file
            filename = os.path.join(cache_dir, "%s.xml" % pdb_id)
            if os.path.exists(filename):
                xml_contents = read_file(filename)

        # Get any missing files from the RCSB and create cached copies if appropriate
        if not pdb_contents:
            pdb_contents = rcsb.retrieve_pdb(pdb_id)
            if cache_dir:
                write_file(os.path.join(cache_dir, "%s.pdb" % pdb_id), pdb_contents)

        if not xml_contents:
            xml_contents = rcsb.retrieve_xml(pdb_id)
            if cache_dir:
                write_file(os.path.join(cache_dir, "%s.xml" % pdb_id), xml_contents)

        # Return the object
        return PDBML_slow(xml_contents, pdb_contents)
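All of the retrieve methods in these examples follow the same check-cache-then-download pattern. Below is a minimal standalone sketch of that pattern using only the standard library (Python 2, to match the snippets above); the function name and download URL are illustrative and this is not the klab rcsb API.

import os
import urllib2

def cached_fetch_pdb(pdb_id, cache_dir=None):
    '''Return the PDB file contents, preferring a cached copy if one exists.'''
    pdb_id = pdb_id.upper()
    filename = '%s.pdb' % pdb_id

    # Check to see whether we have a cached copy
    if cache_dir:
        filepath = os.path.join(cache_dir, filename)
        if os.path.exists(filepath):
            return open(filepath).read()

    # Otherwise download the file and create a cached copy if appropriate
    contents = urllib2.urlopen('https://files.rcsb.org/download/' + filename).read()
    if cache_dir:
        open(os.path.join(cache_dir, filename), 'w').write(contents)
    return contents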
Example #2
def create_template(settings_file):
    write_file(
        settings_file,
        json.dumps(
            {
                "database": {
                    "username": "******",
                    "hostname": "myserver.mydomain.com",
                    "port": 3306,
                    "database": "database_name",
                    "password": "******",
                    "socket":
                    "path_to_socket_file e.g. /var/lib/mysql/mysql.sock",
                    "host_config_name": "if_my.cnf_is_used"
                },
                "cache": {
                    "cache_dir": "/path/to/cache/files"
                },
                "PDBTM": {
                    "xml": "/path/to/pdbtmall.xml"
                },
                "monomer_api": {
                    "prediction_data_path": "/path/to/monomeric_job_archives"
                },
                "ppi_api": {
                    "prediction_data_path": "/path/to/ppi_job_archives"
                }
            },
            sort_keys=True,
            indent=4))
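The template written by create_template is plain JSON, so it can be read back with the standard json module. A minimal sketch of the reading side (the required-section check is illustrative and not part of the original code):

import json

def load_settings(settings_file):
    '''Load the JSON settings template written by create_template.'''
    with open(settings_file) as f:
        settings = json.load(f)

    # Illustrative sanity check: make sure the expected top-level sections exist
    for section in ('database', 'cache', 'PDBTM', 'monomer_api', 'ppi_api'):
        if section not in settings:
            raise KeyError('The settings file is missing the "%s" section.' % section)
    return settings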
Example #3
    def retrieve(pdb_id, cache_dir = None, bio_cache = None):
        '''Creates a FASTA object by using a cached copy of the file if it exists or by retrieving the file from the RCSB.'''

        pdb_id = pdb_id.upper()

        if bio_cache:
            return FASTA(bio_cache.get_fasta_contents(pdb_id))

        # Check to see whether we have a cached copy
        if cache_dir:
            filename = os.path.join(cache_dir, "%s.fasta" % pdb_id)
            if os.path.exists(filename):
                return FASTA(read_file(filename))
            else:
                filename += ".txt"
                if os.path.exists(filename):
                    return FASTA(read_file(filename))

        # Get a copy from the RCSB
        contents = rcsb.retrieve_fasta(pdb_id)

        # Create a cached copy if appropriate
        if cache_dir:
            write_file(os.path.join(cache_dir, "%s.fasta" % pdb_id), contents)

        # Return the object
        return FASTA(contents)
Example #4
def create_template(settings_file):
    write_file(settings_file, json.dumps({
        "database" :
        {
            "username" : "database_username",
            "hostname" : "myserver.mydomain.com",
            "port" : 3306,
            "database" : "database_name",
            "password" : "password_for_database_username",
            "socket" : "path_to_socket_file e.g. /var/lib/mysql/mysql.sock",
            "host_config_name" : "if_my.cnf_is_used"
        },
        "cache" :
        {
            "cache_dir" : "/path/to/cache/files"
        },
        "PDBTM" :
        {
            "xml" : "/path/to/pdbtmall.xml"
        },
        "monomer_api" :
        {
            "prediction_data_path" : "/path/to/monomeric_job_archives"
        },
        "ppi_api" :
        {
            "prediction_data_path" : "/path/to/ppi_job_archives"
        }
    }, sort_keys = True, indent = 4))
Example #5
def download_xml(pdb_id, dest_dir, silent = True, filename = None, unzip = False):
    assert(os.path.exists(dest_dir))
    lower_case_gz_filename = os.path.join(dest_dir, '{0}.sifts.xml.gz'.format(pdb_id.lower()))
    upper_case_gz_filename = os.path.join(dest_dir, '{0}.sifts.xml.gz'.format(pdb_id.upper()))
    lower_case_filename = os.path.join(dest_dir, '{0}.sifts.xml'.format(pdb_id.lower()))
    upper_case_filename = os.path.join(dest_dir, '{0}.sifts.xml'.format(pdb_id.upper()))

    if filename:
        requested_filename = os.path.join(dest_dir, filename)
        if os.path.exists(requested_filename):
            return read_file(requested_filename)

    if unzip == True:
        if os.path.exists(lower_case_filename):
            contents = read_file(lower_case_filename)
        elif os.path.exists(upper_case_filename):
            contents = read_file(upper_case_filename)
        elif os.path.exists(lower_case_gz_filename):
            contents = read_gzip_in_memory(read_file(lower_case_gz_filename))
        elif os.path.exists(upper_case_gz_filename):
            contents = read_gzip_in_memory(read_file(upper_case_gz_filename))
        else:
            contents = retrieve_xml(pdb_id, silent = silent, unzip = True)
            write_file(os.path.join(dest_dir, filename or '{0}.sifts.xml'.format(pdb_id)), contents)
        return contents
    else:
        if os.path.exists(lower_case_gz_filename):
            contents = read_file(lower_case_gz_filename) # Note: read_file already unzips .gz files
        elif os.path.exists(upper_case_gz_filename):
            contents = read_file(upper_case_gz_filename) # Note: read_file already unzips .gz files
        else:
            gzip_contents = retrieve_xml(pdb_id, silent = silent, unzip = False)
            write_file(os.path.join(dest_dir, filename or '{0}.sifts.xml.gz'.format(pdb_id)), gzip_contents)
            contents = read_gzip_in_memory(gzip_contents)
        return contents
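download_xml relies on a read_gzip_in_memory helper to decompress data that is already in memory. A minimal sketch of how such a helper can be written with the standard library; this is an assumption about its behaviour, not the klab implementation:

import gzip
import io

def read_gzip_in_memory(gzip_contents):
    '''Decompress a gzipped byte string without writing it to disk.'''
    with gzip.GzipFile(fileobj=io.BytesIO(gzip_contents)) as f:
        return f.read()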
Example #6
 def send_email(self, email_address, cut_off = None):
     s = ['Cumulative file counts for directories under %s.\n' % self.root]
     for k, v in sorted(self.cumulative_counts.iteritems(), key = lambda x:-x[1]):
         if v:
             if not(cut_off) or v >= cut_off:
                 s.append('%s: %d' % (k, v))
     msg = '\n'.join(s)
     write_file('/tmp/filecount_output.txt', msg)
     ms = MailServer()
     ms.sendgmail('Directory file count statistics', [email_address], msg, pw_filepath = '/admin/pw/google')
Example #7
 def generate_schema_diagram(self, output_filepath = None, show_fk_only = False):
     if self.num_tables == 0:
         raise EmptyDiagramException('No tables in schema.')
     tempfiles = self._generate_schema_diagram(show_fk_only)
     self.schema_diagram = read_file(tempfiles[1])
     for fname in tempfiles:
         if os.path.exists(fname):
             os.remove(fname)
     if output_filepath:
         write_file(output_filepath, self.schema_diagram)
Example #8
def dump_pdbs():
    pdbs = json.loads(read_file('../rawdata/pdbs.json'))

    # Sanity check
    for pdb_id, v in sorted(pdbs.iteritems()):
        records = ddGdb.execute_select('SELECT ID FROM PDBFile WHERE ID=%s', parameters=(pdb_id,))
        assert(len(records) == 1)

    # Dump
    for pdb_id, v in sorted(pdbs.iteritems()):
        content = ddGdb.execute_select('SELECT Content FROM PDBFile WHERE ID=%s', parameters=(pdb_id,))[0]['Content']
        write_file('../rawdata/%s.pdb' % pdb_id, content)
Example #9
 def save_sequence_blast(self, sequence, cut_off, matrix, sequence_identity_cut_off, data):
     assert(data['sequence'] == sequence)
     sequence_data = [data] # put the new hit at the start of the file
     if self.cache_dir:
         filepath = self._get_blast_sequence_filepath(sequence, cut_off, matrix, sequence_identity_cut_off)
         if os.path.exists(filepath):
             for sequence_hits in json.loads(read_file(filepath)):
                 if sequence_hits['sequence'] != sequence:
                     sequence_data.append(sequence_hits)
         write_file(filepath, json.dumps(sequence_data))
         return True
     return False
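save_sequence_blast keeps one JSON file per query sequence, putting the newest hit first while preserving cached hits for other sequences. A standalone sketch of just that merge step (the function name and file handling are illustrative, not the original API):

import json
import os

def merge_hit_into_cache(filepath, sequence, new_hit):
    '''Put the new hit first, keeping any previously cached hits for other sequences.'''
    merged = [new_hit]
    if os.path.exists(filepath):
        for cached_hit in json.loads(open(filepath).read()):
            if cached_hit['sequence'] != sequence:
                merged.append(cached_hit)
    with open(filepath, 'w') as f:
        f.write(json.dumps(merged))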
Example #10
    def __init__(self, UniProtAC, XML = None, cache_dir = None, silent = True):
        if cache_dir and not(os.path.exists(cache_dir)):
            raise Exception("The cache directory %s does not exist." % cache_dir)

        self.UniProtAC = UniProtAC
        self.silent = silent

        # Get XML
        if XML == None:
            protein_xml = None
            cached_filepath = None
            if cache_dir:
                cached_filepath = os.path.join(cache_dir, '%s.xml' % UniProtAC)
            if cached_filepath and os.path.exists(cached_filepath):
                protein_xml = read_file(cached_filepath)
            else:
                if not silent:
                    colortext.write("Retrieving %s\n" % UniProtAC, "cyan")
                url = 'http://www.uniprot.org/uniprot/%s.xml' % UniProtAC
                protein_xml = http_get(url)
                if not(protein_xml.strip()):
                    raise EmptyUniProtACXMLException('The file %s is empty.' % UniProtAC)
                if cached_filepath:
                    write_file(cached_filepath, protein_xml)
            self.XML = protein_xml
        else:
            self.XML = XML

        self.recommended_name = None
        self.submitted_names = []
        self.alternative_names = []

        # Get DOM
        try:
            self._dom = parseString(self.XML)
        except:
            if cached_filepath:
                raise Exception("The UniProtAC XML for '%s' was invalid. The cached file is located at %s. Check this file - if it is not valid XML then delete the file and rerun the script." % (UniProtAC, cached_filepath))
            else:
                raise Exception("The UniProtAC XML for '%s' was invalid." % UniProtAC)
        main_tags = self._dom.getElementsByTagName("uniprot")
        assert(len(main_tags) == 1)
        entry_tags = main_tags[0].getElementsByTagName("entry")
        assert(len(entry_tags) == 1)
        self.entry_tag = entry_tags[0]

        self._parse_evidence_tag()
        self._parse_sequence_tag()
        self._parse_protein_tag()
        self._parse_organism_tag()
        self._parse_subsections()
        self._parse_PDB_mapping()
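The DOM handling above uses xml.dom.minidom. A tiny self-contained sketch of the same parseString/getElementsByTagName pattern on a made-up document (the XML text below is illustrative, not real UniProt output):

from xml.dom.minidom import parseString

xml_text = '<uniprot><entry dataset="Swiss-Prot"><name>EXAMPLE</name></entry></uniprot>'
dom = parseString(xml_text)

main_tags = dom.getElementsByTagName('uniprot')
assert(len(main_tags) == 1)
entry_tags = main_tags[0].getElementsByTagName('entry')
assert(len(entry_tags) == 1)
print(entry_tags[0].getAttribute('dataset'))                           # Swiss-Prot
print(entry_tags[0].getElementsByTagName('name')[0].firstChild.data)   # EXAMPLE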
Example #11
    def retrieve(pdb_id, cache_dir = None, acceptable_sequence_percentage_match = 70.0, require_uniprot_residue_mapping = True, bio_cache = None):
        '''Creates a PDBML object by using a cached copy of the files if they exist or by retrieving the files from the RCSB.
           bio_cache should be a klab.bio.cache.py::BioCache object and is used to avoid reading/downloading cached files repeatedly.
        '''

        pdb_contents = None
        xml_contents = None
        pdb_id = pdb_id.upper()

        l_pdb_id = pdb_id.lower()

        if len(pdb_id) != 4 or not pdb_id.isalnum():
            raise Exception("Bad PDB identifier '%s'." % pdb_id)

        if bio_cache:
            pdb_contents = bio_cache.get_pdb_contents(pdb_id)
            xml_contents = bio_cache.get_sifts_xml_contents(pdb_id)

        if cache_dir:
            if not pdb_contents:
                # Check to see whether we have a cached copy of the PDB file
                filename = os.path.join(cache_dir, "%s.pdb" % pdb_id)
                if os.path.exists(filename):
                    pdb_contents = read_file(filename)

            if not xml_contents:
                # Check to see whether we have a cached copy of the XML file
                filename = os.path.join(cache_dir, "%s.sifts.xml.gz" % l_pdb_id)
                if os.path.exists(filename):
                    xml_contents = read_file(filename)

        # Get any missing files from the RCSB and create cached copies if appropriate
        if not pdb_contents:
            pdb_contents = rcsb.retrieve_pdb(pdb_id)
            if cache_dir:
                write_file(os.path.join(cache_dir, "%s.pdb" % pdb_id), pdb_contents)

        if not xml_contents:
            try:
                xml_contents = retrieve_xml(pdb_id, silent = False)
                if cache_dir:
                    write_file(os.path.join(cache_dir, "%s.sifts.xml.gz" % l_pdb_id), xml_contents)
            except FTPException550:
                raise MissingSIFTSRecord('The file "%s.sifts.xml.gz" could not be found on the EBI FTP server.' % l_pdb_id)


        # Return the object
        handler = SIFTS(xml_contents, pdb_contents, acceptable_sequence_percentage_match = acceptable_sequence_percentage_match, cache_dir = cache_dir, require_uniprot_residue_mapping = require_uniprot_residue_mapping, bio_cache = bio_cache, pdb_id = pdb_id)
        xml.sax.parseString(xml_contents, handler)
        return handler
Example #12
def download_fasta(pdb_id, dest_dir, silent = True, filename = None):
    assert(os.path.exists(dest_dir))
    lower_case_filename = os.path.join(dest_dir, '{0}.fasta'.format(pdb_id.lower()))
    upper_case_filename = os.path.join(dest_dir, '{0}.fasta'.format(pdb_id.upper()))
    if filename:
        requested_filename = os.path.join(dest_dir, filename)
        if os.path.exists(requested_filename):
            return read_file(requested_filename)
    if os.path.exists(lower_case_filename):
        return read_file(lower_case_filename)
    elif os.path.exists(upper_case_filename):
        return read_file(upper_case_filename)
    else:
        contents = retrieve_fasta(pdb_id, silent = silent)
        write_file(os.path.join(dest_dir, filename or '{0}.fasta'.format(pdb_id)), contents)
        return contents
Example #13
def update_public_datasets():
    dsets = ['alascan-gpk.json', 'curatedprotherm.json', 'guerois.json', 'kellogg.json', 'potapov.json']
    source_path = '../rawdata/'
    dest_path = '/home/oconchus/t14benchmarking/ddg/input/json/'
    for dset in dsets:
        assert(os.path.exists(os.path.join(source_path, dset)))
        assert(os.path.exists(os.path.join(dest_path, dset)))
    for dset in dsets:
        print(dset)
        source_set = json.loads(read_file(os.path.join(source_path, dset)))
        dest_set = json.loads(read_file(os.path.join(dest_path, dset)))
        assert(len(source_set['data']) == len(dest_set['data']))
        for x in range(len(source_set['data'])):
            assert(dest_set['data'][x]['RecordID'] == source_set['data'][x]['RecordID'])
            dest_set['data'][x]['DerivedMutation'] = source_set['data'][x]['DerivedMutation']
        write_file(os.path.join(dest_path, dset) + '.new', json.dumps(dest_set, indent=4, sort_keys=True))
Example #14
    def static_get_pdb_object(pdb_id, bio_cache = None, cache_dir = None):
        '''This method does not necessarily use a BioCache but it seems to fit here.'''
        pdb_id = pdb_id.upper()

        if bio_cache:
            return bio_cache.get_pdb_object(pdb_id)

        if cache_dir:
            # Check to see whether we have a cached copy of the PDB file
            filepath = os.path.join(cache_dir, '{0}.pdb'.format(pdb_id))
            if os.path.exists(filepath):
                return PDB.from_filepath(filepath)

        # Get any missing files from the RCSB and create cached copies if appropriate
        pdb_contents = retrieve_pdb(pdb_id)
        if cache_dir:
            write_file(os.path.join(cache_dir, "%s.pdb" % pdb_id), pdb_contents)
        return PDB(pdb_contents)
Example #15
def dump_data(prediction_set, outfile):
    ddG_connection = db_api.ddG()
    ddGdb = ddgdbapi.ddGDatabase()

    userdata_set = 'AllValidPGPK'

    cached_pdb_details = json.loads(read_file('cached_pdb_details.json'))
    analysis_breakdown = ddG_connection.get_predictionset_data(prediction_set, userdata_set, cached_pdb_details = cached_pdb_details)

    test_data = dict(
        amino_acids = analysis_breakdown.amino_acids,
        pdb_details = analysis_breakdown.pdb_details,
        predictions = analysis_breakdown.predictions,
        #single_mutation_GP_predictions = analysis_breakdown.single_mutation_GP_predictions,
        #single_mutation_no_GP_predictions = analysis_breakdown.single_mutation_no_GP_predictions,
        #multiple_mutation_predictions = analysis_breakdown.multiple_mutation_predictions,
        analysis_datasets = analysis_breakdown.analysis_datasets,
    )
    write_file(outfile, json.dumps(test_data))
Example #16
    def retrieve(cls, pdb_id, cache_dir = None):
        '''Creates a PDB object by using a cached copy of the file if it exists or by retrieving the file from the RCSB.'''

        # Check to see whether we have a cached copy
        pdb_id = pdb_id.upper()
        if cache_dir:
            filename = os.path.join(cache_dir, "%s.pdb" % pdb_id)
            if os.path.exists(filename):
                return cls(read_file(filename))

        # Get a copy from the RCSB
        contents = rcsb.retrieve_pdb(pdb_id)

        # Create a cached copy if appropriate
        if cache_dir:
            write_file(os.path.join(cache_dir, "%s.pdb" % pdb_id), contents)

        # Return the object
        return cls(contents)
Example #17
    def retrieve_data_from_rcsb(cls, ligand_code, pdb_id = None, silent = True, cached_dir = None):
        '''Retrieve a file from the RCSB.'''
        if not silent:
            colortext.printf("Retrieving data from RCSB")
        if cached_dir:
            assert(os.path.exists(cached_dir))

        ligand_info_path, ligand_info, pdb_ligand_info, pdb_ligand_info_path = None, None, None, None
        if cached_dir:
            ligand_info_path = os.path.join(cached_dir, '{0}.cif'.format(ligand_code))
            if os.path.exists(ligand_info_path):
                ligand_info = read_file(ligand_info_path)
        if not ligand_info:
            ligand_info = retrieve_ligand_cif(ligand_code)
            if cached_dir:
                write_file(ligand_info_path, ligand_info)

        # Parse .cif
        l = cls(ligand_code)
        l.parse_cif(ligand_info)
        l.pdb_id = pdb_id or l.pdb_id
        has_pdb_id = l.pdb_id and (len(l.pdb_id) == 4) and (l.pdb_id != '?')  # the last case is unnecessary and will be short-cut but I included it to show possible values

        # Parse PDB XML
        if has_pdb_id:
            if cached_dir:
                pdb_ligand_info_path = os.path.join(cached_dir, '{0}.pdb.ligandinfo'.format(l.pdb_id.lower()))
                if os.path.exists(pdb_ligand_info_path):
                    pdb_ligand_info = read_file(pdb_ligand_info_path)
                else:
                    pdb_ligand_info = retrieve_pdb_ligand_info(l.pdb_id)
                    write_file(pdb_ligand_info_path, pdb_ligand_info)
            else:
                pdb_ligand_info = retrieve_pdb_ligand_info(l.pdb_id)
        if pdb_ligand_info:
            l.parse_pdb_ligand_info(pdb_ligand_info)

        # Retrieve the diagram image
        l.get_diagram()

        return l
Example #18
 def _create_input_files(self):
     #colortext.message('self.outdir: ' + self.outdir)
     if self.Scaffold:
         write_file(self._filepath('scaffold.pdb'), self.Scaffold.pdb_contents)
     write_file(self._filepath('model.pdb'), self.Model.pdb_contents)
     if self.ExpStructure:
         write_file(self._filepath('design.pdb'), self.ExpStructure.pdb_contents)
Example #19
def test_abacus_graph():
    '''This function can be deleted. It was added to test the abacus graph with different numbers of datapoints.'''
    import os
    import json
    if not(os.path.exists('results_cache.txt')):
        results = ddG_connection.get_flattened_prediction_results('FPP biosensor: protocol 16')
        for r in results:
            r['TimeTaken'] = r['TimeTaken'].total_seconds() # timedelta objects are not JSON serializable
        write_file('results_cache.txt', json.dumps(results), 'w')
    results = json.loads(read_file('results_cache.txt'))

    try:
        ddG_connection.create_abacus_graph_for_a_single_structure('FPP biosensor: protocol 16', 'kellogg', 'total', graph_filename = 'test_3.png', cached_results = results, num_datapoints = 3)
    except:
        pass
    ddG_connection.create_abacus_graph_for_a_single_structure('FPP biosensor: protocol 16', 'kellogg', 'total', graph_filename = 'test_5.png', cached_results = results, num_datapoints = 5)
    ddG_connection.create_abacus_graph_for_a_single_structure('FPP biosensor: protocol 16', 'kellogg', 'total', graph_filename = 'test_8.png', cached_results = results, num_datapoints = 8)
    ddG_connection.create_abacus_graph_for_a_single_structure('FPP biosensor: protocol 16', 'kellogg', 'total', graph_filename = 'test_10.png', cached_results = results, num_datapoints = 10)
    ddG_connection.create_abacus_graph_for_a_single_structure('FPP biosensor: protocol 16', 'kellogg', 'total', graph_filename = 'test_12.png', cached_results = results, num_datapoints = 12)
    ddG_connection.create_abacus_graph_for_a_single_structure('FPP biosensor: protocol 16', 'kellogg', 'total', graph_filename = 'test_20.png', cached_results = results, num_datapoints = 20)
    ddG_connection.create_abacus_graph_for_a_single_structure('FPP biosensor: protocol 16', 'kellogg', 'total', graph_filename = 'test_50.png', cached_results = results, num_datapoints = 50)
    ddG_connection.create_abacus_graph_for_a_single_structure('FPP biosensor: protocol 16', 'kellogg', 'total', graph_filename = 'test_127.png', cached_results = results, num_datapoints = 127)
    ddG_connection.create_abacus_graph_for_a_single_structure('FPP biosensor: protocol 16', 'kellogg', 'total', graph_filename = 'test_255.png', cached_results = results, num_datapoints = 255)
Example #20
    def _get_XML(self):
        uparc_xml = None
        cached_filepath = None
        if self.cache_dir:
            cached_filepath = os.path.join(self.cache_dir, '%s.xml' % self.UniParcID)
        if cached_filepath and os.path.exists(cached_filepath):
            uparc_xml = read_file(cached_filepath)
        else:
            if not self.silent:
                colortext.write("Retrieving %s\n" % self.UniParcID, "cyan")
            url = 'http://www.uniprot.org/uniparc/%s.xml' % self.UniParcID
            uparc_xml = http_get(url)
            if cached_filepath:
                write_file(cached_filepath, uparc_xml)
        self.XML = uparc_xml

        # Get DOM
        self._dom = parseString(uparc_xml)
        main_tags = self._dom.getElementsByTagName("uniparc")
        assert(len(main_tags) == 1)
        entry_tags = main_tags[0].getElementsByTagName("entry")
        assert(len(entry_tags) == 1)
        self.entry_tag = entry_tags[0]
Example #21
    def run(self):

        # Create input files
        self._create_temp_directory()
        self._create_input_files()
        self._create_script()
        write_file(self._filepath('script.pml'), self.script)

        # Run PyMOL
        #colortext.message(self.visualization_pymol +' -c ' + self._filepath('script.pml'))
        po = tprocess.Popen(self.outdir, [self.visualization_pymol, '-c', self._filepath('script.pml')])
        #colortext.message(po.stdout)
        #colortext.warning(po.errorcode)
        #colortext.error(po.stderr)
        self.stdout = po.stdout
        self.stderr = po.stderr
        self.return_code = po.errorcode

        if self.return_code != 0:
            raise Exception('Error: %s' % str(self.stderr))

        pse_path = self._filepath('session.pse')
        if os.path.exists(pse_path):
            self.PSE = read_file(pse_path, binary = True)
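The run() method above writes a PyMOL script, executes it, and keeps stdout, stderr, and the return code. A rough standard-library equivalent of that run-and-collect step, assuming a plain subprocess call rather than the klab tprocess wrapper (the executable path in the usage note is illustrative):

import subprocess

def run_pymol_script(pymol_binary, script_path, outdir):
    '''Run a PyMOL script in batch mode and collect its output.'''
    process = subprocess.Popen([pymol_binary, '-c', script_path],
                               stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=outdir)
    stdout, stderr = process.communicate()
    if process.returncode != 0:
        raise Exception('Error: %s' % stderr)
    return stdout

# Illustrative usage:
# run_pymol_script('/usr/bin/pymol', '/tmp/session/script.pml', '/tmp/session')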
Example #22
def create_dataset_JSON_files():

    todays_date = datetime.date.today().strftime('%Y-%m-%d')
    read_publications()
    pdb_data = {}
    pub_data = {}

    # Add the publications for the datasets
    for k, v in JSON_datasets.iteritems():
        for _, ref in v['references'].iteritems():
            pub_data[ref] = cached_publications[ref]

    #del JSON_datasets["CuratedProTherm_2014/12/04"]
    #del JSON_datasets["Guerois_10.1016/S0022-2836(02)00442-4_2002/07/05"]
    #del JSON_datasets["Potapov_10.1093/protein/gzp030_2009/09/01"]
    #del JSON_datasets["AlaScan-GPK_2014/09/25"]
    #del JSON_datasets["Kellogg_10.1002/prot.22921_2010/12/03"]

    for dataset_ID in JSON_datasets.keys():
        generate_JSON_dataset(dataset_ID, pdb_data, pub_data)
        check_JSON_dataset(dataset_ID)

    max_res = 0
    min_res = 10
    techniques = set()
    for p, v in pdb_data.iteritems():
        techniques.add(v['MethodOfDetermination'])
        if v['Resolution'] != 'N/A':
            max_res = max(max_res, v['Resolution'])
            min_res = min(min_res, v['Resolution'])
    print('Resolutions', min_res, max_res)
    print('Techniques', techniques)

    print(JSON_datasets.keys())
    for k, v in JSON_datasets.iteritems():
        v['version'] = 'This dataset was last updated on %s.' % todays_date
        filename = k.split('_')[0].lower() + '.json'
        x = json.dumps(v, indent=4, sort_keys=True)
        write_file('../rawdata/%s' % filename, x)
    write_file('../rawdata/pdbs.json', json.dumps(pdb_data, indent=4, sort_keys=True))
    write_file('../rawdata/references.json', json.dumps(pub_data, indent=4, sort_keys=True))
Example #23
 def _create_input_files(self):
     write_file(self._filepath('main.pdb'), self.MainStructure.pdb_contents)
     write_file(self._filepath('loop.pdb'), self.Loop.pdb_contents)
Example #24
def error_by_error_scatterplot(output_directory, file_prefix, df,
                             reference_series_index, x_series_index, y_series_index,
                             x_color, y_color,
                             x_series_name = None, y_series_name = None,
                             plot_title = '', x_axis_label = '', y_axis_label = '', similarity_range = 0.25,
                             add_similarity_range_annotation = True,
                             shape_by_category = False, shape_category_series_index = None, shape_category_title = 'Case',
                             label_series_index = None, label_outliers = True,
                             use_geom_text_repel = True,
                             ):

    """ Creates a scatterplot of error versus error intended to show which computational method (X or Y) has the least amount of error relative to a reference series.

        The difference vectors (reference_series - x_series, reference_series - y_series) are created and these differences (errors)
        are plotted against each other.

        :param output_directory: The output directory.
        :param file_prefix: A prefix for the generated files. A CSV file with the plot points, the R script, and the R output is saved along with the plot itself.
        :param df: A pandas dataframe. Note: The dataframe is zero-indexed.
        :param reference_series_index: The numerical index of the reference series e.g. experimental data.
        :param x_series_index: The numerical index of the X-axis series e.g. predictions from a computational method.
        :param y_series_index: The numerical index of the Y-axis series e.g. predictions from a second computational method.
        :param x_color: The color of the "method X is better" points.
        :param y_color: The color of the "method Y is better" points.
        :param x_series_name: A name for the X-series which is used in the classification legend.
        :param y_series_name: A name for the Y-series which is used in the classification legend.
        :param plot_title: Plot title.
        :param x_axis_label: X-axis label.
        :param y_axis_label: Y-axis label.
        :param similarity_range: A point (x, y) is considered as similar if |x - y| <= similarity_range.
        :param add_similarity_range_annotation: If true then the similarity range is included in the plot.
        :param shape_by_category: Boolean. If set then points are shaped by the column identified with shape_category_series_index. Otherwise, points are shaped by classification ("X is better", "Y is better", or "Similar")
        :param shape_category_series_index: The numerical index of the series used to choose point shapes.
        :param shape_category_title: The title of the shape legend.
        :param label_series_index: The numerical index of the series used to label points (see label_outliers).
        :param label_outliers: Boolean. If set then label outliers using the column identified with label_series_index.
        :param use_geom_text_repel: Boolean. If set then the ggrepel package is used to avoid overlapping labels.

        This function was adapted from the Kortemme Lab covariation benchmark (https://github.com/Kortemme-Lab/covariation).
        todo: I need to check that ggplot2 is respecting the color choices. It may be doing its own thing.
    """
    try:
        os.mkdir(output_directory)
    except:
        pass
    assert (os.path.exists(output_directory))

    if not isinstance(shape_category_series_index, int):
        shape_by_category = False
    if not isinstance(label_series_index, int):
        label_outliers = False
    assert(x_series_name != None and y_series_name != None)

    df = df.copy()
    headers = df.columns.values

    num_categories = len(set(df.ix[:, shape_category_series_index].values))
    legal_shapes = range(15,25+1) + range(0,14+1)
    if num_categories > len(legal_shapes):
        colortext.warning('Too many categories ({0}) to plot using meaningful shapes.'.format(num_categories))
        shape_by_category = False
    else:
        legal_shapes = legal_shapes[:num_categories]

    df['X_error'] = abs(df[headers[reference_series_index]] - df[headers[x_series_index]])
    x_error_index = len(df.columns.values) - 1
    df['Y_error'] = abs(df[headers[reference_series_index]] - df[headers[y_series_index]])
    y_error_index = len(df.columns.values) - 1

    # Classify each point according to which series has the smaller error (or mark it as Similar)
    df['Classification'] = df.apply(lambda r: _classify_smallest_error(r['X_error'], r['Y_error'], similarity_range, x_series_name, y_series_name), axis = 1)
    error_classification_index = len(df.columns.values) - 1

    # Create the R script
    boxplot_r_script = '''
library(ggplot2)
library(gridExtra)
library(scales)
library(qualV)
library(grid)'''
    if use_geom_text_repel:
        boxplot_r_script +='''
library(ggrepel) # install with 'install.packages("ggrepel")' inside the R interactive shell.
'''
    boxplot_r_script += '''

# PNG generation
png('%(file_prefix)s.png', width=2560, height=2048, bg="white", res=600)
txtalpha <- 0.8
redtxtalpha <- 0.8

%(png_plot_commands)s
        '''

    xy_table_filename = '{0}.txt'.format(file_prefix)
    xy_table_filepath = os.path.join(output_directory, xy_table_filename)

    data_table = df.to_csv(header = True, index = False)
    write_file(xy_table_filepath, data_table)

    main_plot_script = '''
# Set the margins
par(mar=c(5, 5, 1, 1))

xy_data <- read.csv('%(xy_table_filename)s', header=T)

names(xy_data)[%(x_error_index)d + 1] <- "xerrors"
names(xy_data)[%(y_error_index)d + 1] <- "yerrors"
'''

    if label_outliers:
        main_plot_script +='''names(xy_data)[%(label_series_index)d + 1] <- "outlier_labels"'''
    main_plot_script +='''
names(xy_data)[%(shape_category_series_index)d + 1] <- "categories"

xy_data[%(x_error_index)d + 1]
xy_data[%(y_error_index)d + 1]

# coefs contains two values: (Intercept) and yerrors
coefs <- coef(lm(xerrors~yerrors, data = xy_data))
fitcoefs = coef(lm(xerrors~0 + yerrors, data = xy_data))
fitlmv_yerrors <- as.numeric(fitcoefs[1])
lmv_intercept <- as.numeric(coefs[1])
lmv_yerrors <- as.numeric(coefs[2])
lm(xy_data$yerrors~xy_data$xerrors)

xlabel <- "%(x_axis_label)s"
ylabel <- "%(y_axis_label)s"
plot_title <- "%(plot_title)s"
rvalue <- cor(xy_data$yerrors, xy_data$xerrors)

# Alphabetically, "Similar" < "X" < "Y" so the logic below works
countsim <- paste("Similar =", dim(subset(xy_data, Classification=="Similar"))[1])
countX <- paste("%(x_series_name)s =", dim(subset(xy_data, Classification=="%(x_series_name)s"))[1])
countY <- paste("%(y_series_name)s =", dim(subset(xy_data, Classification=="%(y_series_name)s"))[1])

countX
countY
countsim

# Set graph limits and the position for the correlation value

minx <- min(0.0, min(xy_data$xerrors) - 0.1)
miny <- min(0.0, min(xy_data$yerrors) - 0.1)
maxx <- max(1.0, max(xy_data$xerrors) + 0.1)
maxy <- max(1.0, max(xy_data$yerrors) + 0.1)

# Create a square plot (x-range = y-range)
minx <- min(minx, miny)
miny <- minx
maxx <- max(maxx, maxy)
maxy <- maxx

xpos <- maxx / 25.0
ypos <- maxy - (maxy / 25.0)
ypos_2 <- maxy - (2 * maxy / 25.0)


plot_scale <- scale_color_manual(
    "Counts",
    values = c( "Similar" = '#444444', "%(x_series_name)s" = '%(x_color)s', "%(y_series_name)s" ='%(y_color)s'),
    labels = c( "Similar" = countsim,  "%(x_series_name)s" = countX,        "%(y_series_name)s" = countY) )'''

    if add_similarity_range_annotation:
        main_plot_script += '''
# Polygon denoting the similarity range. We turn off plot clipping below (gt$layout$clip) so we need to be more exact than using 4 points when defining the region
boxy_mc_boxface <- data.frame(
  X = c(minx - 0,                        maxx - %(similarity_range)f, maxx + 0, maxx + 0,                       0 + %(similarity_range)f, 0),
  Y = c(minx - 0 + %(similarity_range)f, maxx + 0,                    maxx + 0, maxx + 0 -%(similarity_range)f, 0, 0 )
)'''
    else:
        main_plot_script += '''
# Polygon denoting the similarity range. We turn off plot clipping below (gt$layout$clip) so we need to be more exact than using 4 points when defining the region
boxy_mc_boxface <- data.frame(
  X = c(minx - 1, maxx + 1, maxx + 1, minx - 1),
  Y = c(minx - 1 + %(similarity_range)f, maxx + 1 + %(similarity_range)f, maxx + 1 - %(similarity_range)f, minx - 1 - %(similarity_range)f)
)'''

    if shape_by_category:
        main_plot_script += '''
# Plot
p <- qplot(main="", xerrors, yerrors, data=xy_data, xlab=xlabel, ylab=ylabel, alpha = I(txtalpha), shape=factor(categories), col=factor(Classification)) +'''
    else:
        main_plot_script += '''
# Plot
p <- qplot(main="", xerrors, yerrors, data=xy_data, xlab=xlabel, ylab=ylabel, alpha = I(txtalpha), shape=factor(Classification), col=factor(Classification)) +'''

    main_plot_script += '''
geom_polygon(data=boxy_mc_boxface, aes(X, Y), fill = "#bbbbbb", alpha = 0.4, color = "darkseagreen", linetype="blank", inherit.aes = FALSE, show.legend = FALSE) +
plot_scale +
geom_point() +
guides(col = guide_legend()) +
labs(title = "%(plot_title)s") +
theme(plot.title = element_text(color = "#555555", size=rel(0.75))) +
theme(axis.title = element_text(color = "#555555", size=rel(0.6))) +
theme(legend.title = element_text(color = "#555555", size=rel(0.45)), legend.text = element_text(color = "#555555", size=rel(0.4))) +
coord_cartesian(xlim = c(minx, maxx), ylim = c(miny, maxy)) + # set the graph limits
annotate("text", hjust=0, size = 2, colour="#222222", x = xpos, y = ypos, label = sprintf("R = %%0.2f", round(rvalue, digits = 4))) + # add correlation text; hjust=0 sets left-alignment. Using annotate instead of geom_text avoids blocky text caused by geom_text being run multiple times over the series'''

    if label_outliers:
        if use_geom_text_repel:
            main_plot_script += '''

# Label outliers
geom_text_repel(size=1.5, segment.size = 0.15, color="#000000", alpha=0.6, data=subset(xy_data, abs(yerrors - xerrors) > maxx/3 & xerrors <= maxx / 2 & yerrors >=maxy/2), aes(xerrors, yerrors-maxy/100, label=outlier_labels)) +
geom_text_repel(size=1.5, segment.size = 0.15, color="#000000", alpha=0.6, data=subset(xy_data, abs(yerrors - xerrors) > maxx/3 & xerrors <= maxx / 2 & yerrors < maxy/2), aes(xerrors, yerrors+2*maxy/100, label=outlier_labels)) +
geom_text_repel(size=1.5, segment.size = 0.15, color="#000000", alpha=0.6, data=subset(xy_data, abs(yerrors - xerrors) > maxx/3 & xerrors > maxx / 2 & yerrors >=maxy/2), aes(xerrors, yerrors-maxy/100, label=outlier_labels)) +
geom_text_repel(size=1.5, segment.size = 0.15, color="#000000", alpha=0.6, data=subset(xy_data, abs(yerrors - xerrors) > maxx/3 & xerrors > maxx / 2 & yerrors < maxy/2), aes(xerrors, yerrors+2*maxy/100, label=outlier_labels)) +'''
        else:
            main_plot_script += '''

# Label outliers
geom_text(hjust = 0, size=1.5, color="#000000", alpha=0.6, data=subset(xy_data, abs(yerrors - xerrors) > maxx/3 & xerrors <= maxx / 2 & yerrors >=maxy/2), aes(xerrors, yerrors-maxy/100, label=outlier_labels)) +
geom_text(hjust = 0, size=1.5, color="#000000", alpha=0.6, data=subset(xy_data, abs(yerrors - xerrors) > maxx/3 & xerrors <= maxx / 2 & yerrors < maxy/2), aes(xerrors, yerrors+2*maxy/100, label=outlier_labels)) +
geom_text(hjust = 1, size=1.5, color="#000000", alpha=0.6, data=subset(xy_data, abs(yerrors - xerrors) > maxx/3 & xerrors > maxx / 2 & yerrors >=maxy/2), aes(xerrors, yerrors-maxy/100, label=outlier_labels)) +
geom_text(hjust = 1, size=1.5, color="#000000", alpha=0.6, data=subset(xy_data, abs(yerrors - xerrors) > maxx/3 & xerrors > maxx / 2 & yerrors < maxy/2), aes(xerrors, yerrors+2*maxy/100, label=outlier_labels)) +'''

        counts_title = 'Counts'
        if add_similarity_range_annotation:
            counts_title += '*'

        main_plot_script += '''


#geom_text(hjust = 0, size=1.5, color="#000000", alpha=0.6, data=subset(xy_data, abs(yvalues - xvalues) > 2 & xvalues <= 0), aes(xvalues, yvalues+0.35, label=Origin_of_peptide), check_overlap = TRUE) + # label outliers
#geom_text(hjust = 1, size=1.5, color="#000000", alpha=0.6, data=subset(xy_data, abs(yvalues - xvalues) > 2 & xvalues > 0), aes(xvalues, yvalues+0.35, label=Origin_of_peptide), check_overlap = TRUE) + # label outliers




scale_colour_manual('%(counts_title)s', values = c('#444444', '%(x_color)s', '%(y_color)s'),
                    labels = c( "Similar" = countsim,  "%(x_series_name)s" = countX,        "%(y_series_name)s" = countY)) +'''

    if shape_by_category:
        legal_shapes_str = ', '.join(map(str, legal_shapes))
        main_plot_script += '''
scale_shape_manual('%(shape_category_title)s', values = c(%(legal_shapes_str)s),
                    labels = c( "Similar" = countsim,  "%(x_series_name)s" = countX,        "%(y_series_name)s" = countY))'''

    else:
        main_plot_script += '''
scale_shape_manual('%(counts_title)s', values = c(18, 16, 15),
                    labels = c( "Similar" = countsim,  "%(x_series_name)s" = countX,        "%(y_series_name)s" = countY))'''

    if add_similarity_range_annotation:
        main_plot_script += '''+
    # Add a caption
    annotation_custom(grob = textGrob(gp = gpar(fontsize = 5), hjust = 0, sprintf("* Similar \\u225d \\u00b1 %%0.2f", round(%(similarity_range)f, digits = 2))), xmin = maxx + (2 * maxx / 10), ymin = -1, ymax = -1)'''

    main_plot_script += '''

# Plot graph
p
    '''
    if add_similarity_range_annotation:
        main_plot_script += '''
# Code to override clipping
gt <- ggplot_gtable(ggplot_build(p))
gt$layout$clip[gt$layout$name=="panel"] <- "off"
grid.draw(gt)'''

    main_plot_script +='''
dev.off()
'''

    # Create the R script
    plot_type = 'png'
    png_plot_commands = main_plot_script % locals()
    boxplot_r_script = boxplot_r_script % locals()
    r_script_filename = '{0}.R'.format(file_prefix)
    r_script_filepath = os.path.join(output_directory, r_script_filename)
    write_file(r_script_filepath, boxplot_r_script)

    # Run the R script
    run_r_script(r_script_filename, cwd = output_directory)
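Before the R script is generated, error_by_error_scatterplot reduces each row to two error values and a classification. A small pandas sketch of that step; the classify helper below is an assumed reimplementation of _classify_smallest_error based on the docstring, not the original code, and the series names are made up:

import pandas as pd

def classify(x_error, y_error, similarity_range, x_name, y_name):
    # Assumed behaviour: errors within similarity_range of each other count as "Similar"
    if abs(x_error - y_error) <= similarity_range:
        return 'Similar'
    return x_name if x_error < y_error else y_name

df = pd.DataFrame({
    'Experimental': [1.0, 2.0, 0.5],
    'MethodX':      [1.1, 3.0, 0.4],
    'MethodY':      [1.6, 2.1, 0.45],
})
df['X_error'] = abs(df['Experimental'] - df['MethodX'])
df['Y_error'] = abs(df['Experimental'] - df['MethodY'])
df['Classification'] = df.apply(
    lambda r: classify(r['X_error'], r['Y_error'], 0.25, 'MethodX', 'MethodY'), axis=1)
print(df)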
Example #25
def prepare_structures(file_filter,
                       output_directory,
                       loop_definitions,
                       require_filter=True,
                       create_partial_structures=False,
                       expected_min_loop_length=None,
                       expected_max_loop_length=None,
                       remove_hetatm=False):
    search_radius = 10.0

    if not (os.path.exists(output_directory)):
        os.mkdir(output_directory)

    # Iterate through the dataset cases
    for pdb_file in sorted(glob.glob(file_filter)):

        pdb_prefix = os.path.splitext(os.path.split(pdb_file)[1])[0].lower()

        # Read the benchmark loop definition
        if not loop_definitions.get(pdb_prefix):
            raise Exception(
                'The loop definition for {0} is missing.'.format(pdb_prefix))
        loop_definition = loop_definitions[pdb_prefix]
        loops = [
            PDBSection(loop_definition['chainID'],
                       loop_definition['StartResidueID'],
                       loop_definition['EndResidueID'],
                       Sequence=loop_definition['Sequence'])
        ]

        # Only process files that passed the benchmark criteria
        if require_filter and not loop_definition['PassedFilter']:
            continue

        # Read in the PDB content, removing HETATM lines if requested
        pdb_content = read_file(pdb_file)
        if remove_hetatm:
            new_content = []
            for l in pdb_content.split('\n'):
                if not l.startswith('HETATM'):
                    new_content.append(l)
            pdb_content = '\n'.join(new_content)

        # Remove the loops and surrounding sidechain atoms from the structure
        b = Bonsai(pdb_content)
        bonsai, cutting, PSE_file, PSE_script, FASTA_file = b.prune_loop_for_kic(
            loops,
            search_radius,
            expected_min_loop_length=expected_min_loop_length,
            expected_max_loop_length=expected_max_loop_length,
            generate_pymol_session=True)

        # Create a PyMOL session file for visual inspection
        write_file(
            os.path.join(output_directory, '{0}.pse'.format(pdb_prefix)),
            PSE_file)

        # Create the new PDB file with the loop and surrounding sidechains removed
        write_file(
            os.path.join(output_directory, '{0}.pdb'.format(pdb_prefix)),
            bonsai)
        if create_partial_structures:
            write_file(
                os.path.join(
                    output_directory,
                    '{0}_missing_loop_and_surrounding_sidechains.pdb'.format(
                        pdb_prefix)), bonsai)
            write_file(
                os.path.join(
                    output_directory,
                    '{0}_loop_and_surrounding_sidechains.pdb'.format(
                        pdb_prefix)), cutting)

        # Create the FASTA file containing the loop sequence. This will be used along with the loop_definitions.json file
        # to add the residues back into the Rosetta structure
        write_file(
            os.path.join(output_directory, '{0}.fasta'.format(pdb_prefix)),
            FASTA_file)

        sys.stdout.write('.')
        sys.stdout.flush()

    print('')
Example #26
            headers = [
                l for l in fasta_contents.split('\n') if l.startswith('>')
            ]
            assert (len(headers) == 1)
            header = headers[0]
            pdb_residue_ids = [
                PDB.ChainResidueID2String(l[0], l[1:])
                for l in header[header.find('Residues ') + 9:].split(';')
            ]

            # Add the missing atoms atoms back into the PDB file
            spackler = Spackler.from_filepath(pdb_file)
            new_pdb_content = spackler.add_backbone_atoms_linearly_from_loop_filepaths(
                loop_file, fasta_file, pdb_residue_ids)
            write_file(
                os.path.join(output_directory, '{0}.pdb'.format(pdb_prefix)),
                new_pdb_content)

            # Create a Rosetta .loop file
            loop_set = json.loads(read_file(loop_file)).get('LoopSet')
            assert (len(loop_set) == 1)
            start_res = '{chainID}{resSeq:>4d}{iCode}'.format(
                **loop_set[0]['start'])
            end_res = '{chainID}{resSeq:>4d}{iCode}'.format(
                **loop_set[0]['stop'])

            success, result = get_pdb_contents_to_pose_residue_map(
                new_pdb_content,
                rosetta_scripts_binary,
                None,
                pdb_id=None,
Example #27
from klab.fs.fsio import read_file, write_temp_file, open_temp_file, write_file
from klab.bio.pfam import Pfam
from klab.bio.dssp import MonomerDSSP, ComplexDSSP, MissingAtomException
from klab.bio.ligand import Ligand, PDBLigand
from klab.bio.pdbtm import PDBTM
from klab.db.sqlalchemy_interface import get_single_record_from_query, get_or_create_in_transaction

from kddg.api.schema import test_schema_against_database_instance
from kddg.api.schema import PDBFile, PDBChain, PDBMolecule, PDBMoleculeChain, PDBResidue, LigandDescriptor, LigandIdentifier, LigandSynonym, PDBLigand
from kddg.api.schema import Ligand as DBLigand
#from kddg.api.schema import Publication, PublicationAuthor, PublicationIdentifier
from kddg.api.layers import *
from kddg.api.db import ddG, PartialDataException, SanityCheckException
import kddg.api.dbi as dbi

rosetta_scripts_path =  '/home/oconchus/t14benchmarking/r57934/main/source/bin/rosetta_scripts.linuxgccrelease'
rosetta_database_path = '/home/oconchus/t14benchmarking/r57934/main/database'
p = PDB(read_file('/kortemmelab/data/kyleb/ddg_numbering_for_shane/24548-data/1CBW_FGHI.pdb'))
#p.construct_pdb_to_rosetta_residue_map(rosetta_scripts_path, rosetta_database_path)
p.construct_pdb_to_rosetta_residue_map(rosetta_scripts_path, rosetta_database_path, extra_command_flags = '-ignore_zero_occupancy false -ignore_unrecognized_res')
pprint.pprint(p.get_atom_sequence_to_rosetta_map())
pprint.pprint(p.rosetta_sequences)

from kddg.api.ppi import get_interface as get_ppi_interface
ppi_api = get_ppi_interface(read_file('../misc/ddgdb.pw'),
                                rosetta_scripts_path =  '/home/oconchus/t14benchmarking/r57934/main/source/bin/rosetta_scripts.linuxgccrelease',
                                rosetta_database_path = '/home/oconchus/t14benchmarking/r57934/main/database')
content = ppi_api.DDG_db.execute_select('SELECT Content FROM PDBFile WHERE ID="1CBW"')[0]['Content']
print(content)
write_file('/tmp/ddginterface/1CBW_FGHI_db.pdb', content)
Example #28
def extract_analysis_data(dataset_list_file, output_directory, data_extraction_method, expectn, top_x, prefix, test_mode = False):
    '''This is the main function in this script and is where the basic analysis is compiled.

       output_directory should contain the results of the prediction run.
       data_extraction_method should be a function pointer to the method-specific function used to retrieve the prediction results e.g. get_kic_run_details
       expectn specifies how many predictions we expect to find (useful in case some jobs failed).
       top_x specifies how many of the best-scoring predictions should be used to generate the TopX metric results e.g.
       the Top5 RMSD metric value measures the lowest RMSD amongst the five best-scoring structures.
       prefix is used to name the output files.
    '''

    # Sanity check
    assert(top_x <= expectn)

    # Set up reference structures
    structures_folder = os.path.join('..', 'input', 'structures', '12_res')
    rcsb_references = os.path.join(structures_folder, 'rcsb', 'reference')
    rosetta_references = os.path.join(structures_folder, 'rosetta', 'reference')

    # Set up the per-case statistics dicts
    best_scoring_structures = {}
    median_scoring_structures = {}
    worst_scoring_structures = {}
    total_percent_subanstrom = {}
    top_x_percent_subanstrom = {}
    top_x_loop_prediction_sets = {}

    # Set up the input file used to generate the graph plotting the "percentage of subangstrom models" metric over
    # varying values of X used to select the TopX structures
    percentage_subangstrom_over_top_X_plot_input = ['PDB\tX\tPercentage of subangstrom cases for TopX']
    percent_subangrom_by_top_x = {}

    # Set up the summary analysis file
    csv_file = ['\t'.join(['PDB ID', 'Models', '%<1.0A', 'Top{0} %<1.0A'.format(top_x), 'Best score', 'Top{0} score'.format(top_x), 'Median score', 'Worst score', 'Closest score', 'Top1 RMSD', 'Top{0} RMSD'.format(top_x), 'Closest RMSD'])]

    # Read in the benchmark input
    pdb_ids = [os.path.splitext(os.path.split(s.strip())[1])[0] for s in get_file_lines(dataset_list_file) if s.strip()]

    # Truncate the benchmark input for test mode
    if test_mode:
        pdb_ids = pdb_ids[:10]

    # Analyze the performance for each case in the benchmark
    for pdb_id in pdb_ids:

        rcsb_reference_pdb = os.path.join(rcsb_references, pdb_id + '.pdb')
        assert(os.path.exists(rcsb_reference_pdb))
        rosetta_reference_pdb = os.path.join(rosetta_references, pdb_id + '.pdb')
        assert(os.path.exists(rosetta_reference_pdb))
        assert(len(pdb_id) == 4)
        loops_file = os.path.join(structures_folder, 'rosetta', 'pruned', '{0}.loop.json'.format(pdb_id))
        loop_sets = json.loads(read_file(loops_file))
        assert(len(loop_sets['LoopSet']) == 1)

        # Create a container for loop predictions
        loop_prediction_set = LoopPredictionSet()

        # Read the coordinates from the reference PDB file
        rcsb_reference_matrix = PDB.extract_xyz_matrix_from_loop_json(PDB.from_filepath(rcsb_reference_pdb).structure_lines, loop_sets, atoms_of_interest = backbone_atoms, expected_num_residues = 12, expected_num_residue_atoms = 4)
        rosetta_reference_matrix = PDB.extract_xyz_matrix_from_loop_json(PDB.from_filepath(rosetta_reference_pdb).structure_lines, loop_sets, atoms_of_interest = backbone_atoms, expected_num_residues = 12, expected_num_residue_atoms = 4)

        colortext.wgreen('\n\nReading in the run details for {0}:'.format(pdb_id))
        details = data_extraction_method(output_directory, pdb_id, loop_sets, test_mode = test_mode)
        for d in details:
            loop_prediction = loop_prediction_set.add(d['id'], d['score'], pdb_id = pdb_id, rmsd = None, pdb_path = d['predicted_structure'], pdb_loop_residue_matrix = d['pdb_loop_residue_matrix'])
        print(' Done')

        # Compute the RMSD for this case for the structure using the pandas dataframe
        # It is more efficient to do this after truncation if truncating by score but in the general case users will
        # probably want to consider all predictions. If not (e.g. for testing) then arbitrary subsets can be chosen
        # in the loop above
        colortext.wgreen('Computing RMSDs for {0}:'.format(pdb_id))
        loop_prediction_set.compute_rmsds(rcsb_reference_matrix)
        loop_prediction_set.check_rmsds(rosetta_reference_matrix)
        print(' Done\n')

        # Truncate the structures to the top expectn-scoring files
        loop_prediction_set.sort_by_score()
        loop_prediction_set.truncate(expectn)
        if len(loop_prediction_set) != expectn:
            print('Error: Expected {0} structures but only found {1}.'.format(expectn, len(loop_prediction_set)))
            sys.exit(1)

        # Create a new set containing the top-X-scoring structures and identify the median-scoring structure
        top_x_loop_prediction_sets[pdb_id] = loop_prediction_set[:top_x]
        median_scoring_structures[pdb_id] = loop_prediction_set[int(expectn / 2)]

        # Determine the lowest-/best-scoring structure
        best_scoring_structures[pdb_id] = loop_prediction_set[0]
        best_score = best_scoring_structures[pdb_id].score
        worst_scoring_structures[pdb_id] = loop_prediction_set[-1]
        worst_score = worst_scoring_structures[pdb_id].score
        assert(top_x_loop_prediction_sets[pdb_id][0] == best_scoring_structures[pdb_id])

        # Print structures
        colortext.warning('Top{0} structures'.format(top_x))
        print(top_x_loop_prediction_sets[pdb_id])
        colortext.warning('Top1 structure')
        print(best_scoring_structures[pdb_id])
        colortext.warning('Median (by score) structure')
        print(median_scoring_structures[pdb_id])
        colortext.warning('Worst-scoring structure')
        print(worst_scoring_structures[pdb_id])

        # Create values for TopX variable plot
        loop_prediction_set.sort_by_score()
        for top_x_var in range(1, len(loop_prediction_set) + 1):
            new_subset = loop_prediction_set[:top_x_var]
            percent_subangstrom = 100 * new_subset.fraction_with_rmsd_lt(1.0)
            percentage_subangstrom_over_top_X_plot_input.append('{0}\t{1}\t{2}'.format(pdb_id, top_x_var, percent_subangstrom))
            percent_subangrom_by_top_x[top_x_var] = percent_subangrom_by_top_x.get(top_x_var, {})
            percent_subangrom_by_top_x[top_x_var][pdb_id] = percent_subangstrom

        total_percent_subanstrom[pdb_id] = 100 * loop_prediction_set.fraction_with_rmsd_lt(1.0)
        top_x_percent_subanstrom[pdb_id] = 100 * top_x_loop_prediction_sets[pdb_id].fraction_with_rmsd_lt(1.0)
        colortext.warning('Percentage of sub-angstrom cases in the full set of {0}: {1}'.format(expectn, total_percent_subanstrom[pdb_id]))
        colortext.warning('Percentage of sub-angstrom cases in the Top{0} structures: {1}'.format(top_x, top_x_percent_subanstrom[pdb_id]))

        loop_prediction_set.sort_by_rmsd()
        closest_rmsd = loop_prediction_set[0].rmsd
        closest_score = loop_prediction_set[0].score
        colortext.warning('RMSD of closest model: {0}'.format(closest_rmsd))
        colortext.warning('Score of closest model: {0}'.format(closest_score))

        top_1_rmsd = best_scoring_structures[pdb_id].rmsd

        top_x_rmsd = best_scoring_structures[pdb_id].rmsd
        top_x_score = best_scoring_structures[pdb_id].score
        for s in top_x_loop_prediction_sets[pdb_id]:
            if (s.rmsd < top_x_rmsd) or (s.rmsd == top_x_rmsd and s.score < top_x_score):
                top_x_rmsd = s.rmsd
                top_x_score = s.score
        assert(top_x_score <= worst_score)
        assert(top_x_rmsd <= top_1_rmsd)

        print('Top 1 RMSD (predicted vs Rosetta/RCSB reference structure): {0}'.format(top_1_rmsd))
        print('Top {0} RMSD (predicted vs Rosetta/RCSB reference structure): {1}'.format(top_x, top_x_rmsd))

        csv_file.append('\t'.join(map(str, [pdb_id, expectn, total_percent_subanstrom[pdb_id], top_x_percent_subanstrom[pdb_id], best_score, top_x_score, median_scoring_structures[pdb_id].score, worst_score, closest_score, top_1_rmsd, top_x_rmsd, closest_rmsd])))

    # Add a column of median percent subangstrom values
    for top_x_var, values_by_pdb in sorted(percent_subangrom_by_top_x.iteritems()):
        assert(sorted(values_by_pdb.keys()) == sorted(pdb_ids))
        median_value = sorted(values_by_pdb.values())[len(pdb_ids) / 2]
        percentage_subangstrom_over_top_X_plot_input.append('Median\t{0}\t{1}'.format(top_x_var, median_value))

    write_file('{0}analysis.csv'.format(prefix), '\n'.join(csv_file))
    write_file('{0}analysis.tsv'.format(prefix), '\n'.join(csv_file))
    write_file('{0}percentage_subangstrom_over_top_X.tsv'.format(prefix), '\n'.join(percentage_subangstrom_over_top_X_plot_input))
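The Top-X metric computed above picks, from the X best-scoring predictions, the one with the lowest RMSD, breaking ties by score. A compact sketch of that selection on plain (score, rmsd) pairs (the example values are made up; lower scores are better):

def top_x_rmsd(predictions, top_x):
    '''predictions is a list of (score, rmsd) pairs; lower scores are better.'''
    best_scoring = sorted(predictions, key=lambda p: p[0])[:top_x]
    # Among the top-X-scoring predictions, report the smallest RMSD (score breaks ties)
    return min(best_scoring, key=lambda p: (p[1], p[0]))[1]

predictions = [(-310.2, 1.8), (-309.5, 0.7), (-305.0, 0.4), (-299.1, 2.5)]
print(top_x_rmsd(predictions, 1))  # 1.8 (Top1 RMSD: the best-scoring model's RMSD)
print(top_x_rmsd(predictions, 3))  # 0.4 (Top3 RMSD: best RMSD among the three best scores)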
Example #29
 def _create_input_files(self):
     colortext.message('self.outdir: ' + self.outdir)
     write_file(self._filepath('scaffold.pdb'), self.Scaffold.pdb_contents)
     write_file(self._filepath('model.pdb'), self.Model.pdb_contents)
     if self.Crystal:
         write_file(self._filepath('crystal.pdb'), self.Crystal.pdb_contents)
Example #30
    def __init__(self, UniParcID, UniProtACs = None, UniProtIDs = None, cache_dir = None, silent = False):
        if cache_dir and not(os.path.exists(os.path.abspath(cache_dir))):
            raise Exception("The cache directory %s does not exist." % os.path.abspath(cache_dir))
        self.UniParcID = UniParcID
        self.cache_dir = cache_dir
        self.recommended_name = None
        self.silent = silent

        # Get AC mapping
        if not UniProtACs or UniParcID=='UPI0000047CA3': # todo: is this UPI0000047CA3 special handling necessary?
            mapping = uniprot_map('UPARC', 'ACC', [UniParcID], cache_dir = cache_dir, silent = silent)[UniParcID]
            self.UniProtACs = mapping
        else:
            self.UniProtACs = UniProtACs

        # Get ID mapping
        if not UniProtIDs:
            mapping = uniprot_map('UPARC', 'ID', [UniParcID], cache_dir = cache_dir, silent = silent)[UniParcID]
            self.UniProtIDs = mapping
        else:
            self.UniProtIDs = UniProtIDs

        # Get FASTA
        cached_filepath = None
        if cache_dir:
            cached_filepath = os.path.join(cache_dir, '%s.fasta' % UniParcID)
        if cached_filepath and os.path.exists(cached_filepath):
            fasta = read_file(cached_filepath)
        else:
            if not silent:
                print("Getting FASTA file")
            url = 'http://www.uniprot.org/uniparc/%s.fasta' % UniParcID
            fasta = http_get(url)
            if cached_filepath:
                write_file(cached_filepath, fasta)

        # Get sequence
        header = fasta.split("\n")[0].split()
        assert(len(header) == 2)
        assert(header[0] == ">%s" % UniParcID)
        assert(header[1].startswith("status="))
        sequence = "".join(map(string.strip, fasta.split("\n")[1:]))
        self.sequence = sequence

        # Get atomic mass (and sequence again)
        self.atomic_mass = None
        self.CRC64Digest = None
        recommended_names = []
        alternative_names = []
        submitted_names = []

        self.AC_entries = {}
        subsections = ProteinSubsectionHolder(len(sequence))

        for UniProtAC in self.UniProtACs:
            #colortext.write("%s\n" % UniProtAC, 'cyan')
            try:
                AC_entry = UniProtACEntry(UniProtAC, cache_dir = self.cache_dir, silent = silent)
            except EmptyUniProtACXMLException:
                continue
            self.AC_entries[UniProtAC] = AC_entry

            # Mass sanity check
            if self.atomic_mass != None:
                assert(self.atomic_mass == AC_entry.atomic_mass)
            self.atomic_mass = AC_entry.atomic_mass

            # Sequence sanity check
            assert(self.sequence == AC_entry.sequence)
            # CRC 64 sanity check
            if self.CRC64Digest != None:
                assert(self.CRC64Digest == AC_entry.CRC64Digest)
            self.CRC64Digest = AC_entry.CRC64Digest
            assert(CRC64.CRC64digest(self.sequence) == self.CRC64Digest)

            if AC_entry.recommended_name:
                found = False
                for n in recommended_names:
                    if n[0] == AC_entry.recommended_name:
                        n[1] += 1
                        found = True
                        break
                if not found:
                    recommended_names.append([AC_entry.recommended_name, 1])

            for alternative_name in AC_entry.alternative_names:
                found = False
                for n in alternative_names:
                    if n[0] == alternative_name:
                        n[1] += 1
                        found = True
                        break
                if not found:
                    alternative_names.append([alternative_name, 1])

            for submitted_name in AC_entry.submitted_names:
                found = False
                for n in submitted_names:
                    if n[0] == submitted_name:
                        n[1] += 1
                        found = True
                        break
                if not found:
                    submitted_names.append([submitted_name, 1])

            subsections += AC_entry.subsections
        self.subsections = subsections

        assert(len(set(UniParcMergedRecommendedNamesRemap.keys()).intersection(set(UniParcMergedSubmittedNamesRemap.keys()))) == 0)
        if UniParcID in UniParcMergedRecommendedNamesRemap:
            recommended_names = [[UniParcMergedRecommendedNamesRemap[UniParcID], 1]]
        elif UniParcID in UniParcMergedSubmittedNamesRemap:
            recommended_names = [[UniParcMergedSubmittedNamesRemap[UniParcID], 1]]

        if not silent:
            colortext.write('Subsections\n', 'orange')
        #print(subsections)

        if len(recommended_names) == 0 and len(alternative_names) == 0 and len(submitted_names) == 0:
            raise UniParcEntryStandardizationException("UniParcID %s has no recommended names." % UniParcID)
        elif len(recommended_names) == 0:
            s = ["UniParcID %s has no recommended names.\n" % UniParcID]
            if alternative_names:
                s.append("It has the following alternative names:")
                for tpl in sorted(alternative_names, key=lambda x:-x[1]):
                    s.append("\n  count=%s: %s" % (str(tpl[1]).ljust(5), tpl[0]['Name']))
                    if tpl[0]['Short names']:
                        s.append(" (short names: %s)" % ",".join(tpl[0]['Short names']))
                    if tpl[0]['EC numbers']:
                        s.append(" (EC numbers: %s)" % ",".join(tpl[0]['EC numbers']))
            if submitted_names:
                s.append("It has the following submitted names:")
                for tpl in sorted(submitted_names, key=lambda x:-x[1]):
                    s.append("\n  count=%s: %s" % (str(tpl[1]).ljust(5), tpl[0]['Name']))
                    if tpl[0]['Short names']:
                        s.append(" (short names: %s)" % ",".join(tpl[0]['Short names']))
                    if tpl[0]['EC numbers']:
                        s.append(" (EC numbers: %s)" % ",".join(tpl[0]['EC numbers']))
            #raise UniParcEntryStandardizationException("".join(s))
        elif len(recommended_names) > 1:
            s = ["UniParcID %s has multiple recommended names: " % UniParcID]
            for tpl in sorted(recommended_names, key=lambda x:-x[1]):
                s.append("\n  count=%s: %s" % (str(tpl[1]).ljust(5), tpl[0]['Name']))
                if tpl[0]['Short names']:
                    s.append(" (short names: %s)" % ",".join(tpl[0]['Short names']))
                if tpl[0]['EC numbers']:
                    s.append(" (EC numbers: %s)" % ",".join(tpl[0]['EC numbers']))
            raise UniParcEntryStandardizationException("".join(s))

        #assert(len(recommended_names) == 1) # todo: this is not always available
        #print(recommended_names)
        self.recommended_name = None
        if len(recommended_names) == 1:
            self.recommended_name = recommended_names[0][0]
        self.get_organisms()
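
# --- Illustrative sketch (not part of the example above) ---
# The three name-tallying loops above (recommended, alternative and submitted
# names) all implement the same "count occurrences of possibly unhashable name
# records" pattern. A compact standalone equivalent; the helper name tally_names
# is hypothetical. A list of [name, count] pairs is used rather than a dict
# because the name records can be unhashable dictionaries.
def tally_names(names, tally):
    '''Increment counts in tally (a list of [name, count] pairs) for each name.'''
    for name in names:
        for entry in tally:
            if entry[0] == name:
                entry[1] += 1
                break
        else:
            tally.append([name, 1])
    return tally

# Example usage, with plain strings standing in for the name records:
counts = []
tally_names(['kinase', 'kinase', 'hydrolase'], counts)
assert counts == [['kinase', 2], ['hydrolase', 1]]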
            file_tuples.append( (file_info['Filename'], file_info['Content']) )
        substitution_parameters = json.loads(job_details['JSONParameters'])

        # Scrub the folder
        if not all_files_exist:
            if os.path.isdir(job_data_dir):
                shutil.rmtree(job_data_dir)
            os.makedirs(job_data_dir)

        files_dict = {} # Maps name to filepath position
        for file_name, file_contents in file_tuples:
            new_file_location = os.path.join(job_data_dir, file_name)
            if not all_files_exist:
                if '.pdb' in file_name:
                    if keep_hetatm_lines or keep_all_lines:
                        write_file(new_file_location, file_contents)
                    else:
                        write_file(new_file_location, '\n'.join([l for l in file_contents.split('\n') if l.startswith('ATOM')]))
                else:
                    with open(new_file_location, 'w') as f:
                        f.write(file_contents)
            files_dict[file_name] = os.path.relpath(new_file_location, settings['output_dir'])
        if not all_files_exist:
            write_file(os.path.join(job_data_dir, '.ready'), '')

        argdict = {
            'input_file_list' : [files_dict[substitution_parameters['%%input_pdb%%']]],
            '%%chainstomove%%' : substitution_parameters['%%chainstomove%%'],
        }
        for file_name, file_location in files_dict.iteritems():
            if 'params' in file_name:
                argdict['-extra_res_fa'] = file_location
def create_dataset_CSV_files():

    csv_lines = []
    csv_lines.append('#' + ','.join(['PDB ID', 'Resolution', 'Techniques']))
    pdbs = json.loads(read_file('../rawdata/pdbs.json'))
    for pdb_id, v in sorted(pdbs.iteritems()):
        csv_lines.append(','.join([pdb_id, str(v['Resolution']), v['MethodOfDetermination']]))
    write_file('../rawdata/pdbs.csv', '\n'.join(csv_lines))

    csv_lines = []
    csv_lines.append('#' + ','.join(['ID', 'Authors', 'Title', 'Publication', 'Volume', 'Issue', 'Date', 'URL']))
    pdbs = json.loads(read_file('../rawdata/references.json'))
    for ref_id, v in sorted(pdbs.iteritems()):
        author_surnames = [a['Surname'] for a in v['Authors']]
        url = ''
        if v['DOI']:
            url = 'https://dx.doi.org/%s' % v['DOI']
        else:
            url = v['URL']
        line = [ref_id, '_'.join(author_surnames), v['Title'], v.get('Publication') or '', v.get('Volume') or '', v.get('Issue') or '', str(v.get('PublicationDate') or v.get('PublicationYear') or ''), url]
        csv_lines.append(','.join(line))
    references_text = '\n'.join(csv_lines)
    import codecs
    with codecs.open('../rawdata/references.csv', mode="w", encoding="utf-8") as f:
        f.write(references_text)

    for k, v in JSON_datasets.iteritems():
        filename = k.split('_')[0].lower() + '.json'
        json_s = read_file('../rawdata/%s' % filename)
        d = json.loads(json_s)
        csv_lines = []
        print(filename,d.keys())
        for line in d['information'].split('\n'):
            csv_lines.append('# %s' % line)
        for k, v in sorted(d['references'].iteritems()):
            csv_lines.append('# [%s] %s' % (k, v))
        todays_date = datetime.date.today().strftime('%Y-%m-%d')
        csv_lines.append('\n# This dataset was last updated on %s.' % todays_date)
        csv_lines.append('')
        csv_lines.append('# The RecordID below refers to the record ID in the original dataset. When no ID was specified, we added an ID based on the published order of the records.')
        csv_lines.append('# Mutations is an underscore-separated list of mutations. Each mutation takes the form "Chain Wildtype ResidueID Mutant".')
        csv_lines.append('# DDG is the aggregated (mean) DDG value used for analysis.')
        csv_lines.append('# ResidueExposures is an underscore-separated list of exposure values, each one corresponding to its respective mutated position. Each exposure value is based on the solvent accessibility reported by DSSP divided by a maximum solvent accessibility for that residue type and represents whether the residue is buried (0.0) or exposed (1.0).')
        csv_lines.append("# DSSPTypes is an underscore-separated list of DSSP secondary structure assignments, each one corresponding to its respective mutated position.")
        csv_lines.append("# DSSPSimpleTypes is an underscore-separated list of DSSPSimpleType secondary structure assignments, each one corresponding to its respective mutated position.")
        csv_lines.append('# IndividualDDGs lists the individual DDG values which can be used to filter out records with high variance.')
        csv_lines.append('# DerivedMutation is 0 if the record represents an actual set of experiments and 1 if it was derived e.g. if the mutant structure is taken as wildtype and the DDG value is negated. Typically the original records also exist in the dataset so the derived records can introduce a bias.')
        csv_lines.append('# Note: The .json file accompanying this CSV file contains more information, including a list of publications from where the DDG values were taken.')
        csv_lines.append('')

        csv_lines.append('#' + ','.join(['RecordID', 'PDBFileID', 'Mutations', 'DDG', 'ResidueExposures', 'DSSPTypes', 'DSSPSimpleTypes', 'IndividualDDGs', 'DerivedMutation']))
        for record in d['data']:
            mutations = []
            exposures = []
            dssp = []
            ddgs = []
            dssp_simple = []
            for m in record['Mutations']:
                mutations.append('%(Chain)s %(WildTypeAA)s %(ResidueID)s %(MutantAA)s' % m)
                exposures.append(m['DSSPExposure'])
                dssp.append('%(DSSPType)s' % m)
                dssp_simple.append('%(DSSPSimpleSSType)s' % m)
            for eddg in record['ExperimentalDDGs']:
                ddgs.append(eddg['DDG'])

            csv_lines.append(','.join([
                str(record['RecordID']),
                record['PDBFileID'],
                '_'.join(mutations),
                str(record['DDG']),
                '_'.join(map(str, exposures)),
                '_'.join(dssp),
                '_'.join(dssp_simple),
                '_'.join(map(str, ddgs)),
                str(int(record['DerivedMutation'])),
                ]))
        write_file('../rawdata/%s' % filename.replace('.json', '.csv'), '\n'.join(csv_lines))
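
# --- Illustrative sketch (not part of the example above) ---
# One way to read back a data row of the CSV written by create_dataset_CSV_files().
# The column layout and the underscore/space conventions are taken from the header
# comments emitted above; parse_dataset_row is a hypothetical helper and the example
# row below uses made-up values.
def parse_dataset_row(line):
    '''Split one non-comment CSV data row into a dictionary.'''
    (record_id, pdb_id, mutations, ddg, exposures,
     dssp, dssp_simple, individual_ddgs, derived) = line.split(',')
    return {
        'RecordID': int(record_id),
        'PDBFileID': pdb_id,
        'Mutations': [m.split() for m in mutations.split('_')],  # "Chain WildTypeAA ResidueID MutantAA"
        'DDG': float(ddg),
        'ResidueExposures': [float(x) for x in exposures.split('_')],
        'DSSPTypes': dssp.split('_'),
        'DSSPSimpleTypes': dssp_simple.split('_'),
        'IndividualDDGs': [float(x) for x in individual_ddgs.split('_')],
        'DerivedMutation': bool(int(derived)),
    }

row = parse_dataset_row('1,1ABC,A L 121 G,-0.5,0.12,H,H,-0.4_-0.6,0')
assert row['Mutations'] == [['A', 'L', '121', 'G']] and row['DDG'] == -0.5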
def correctness_abacus_plot(output_directory, file_prefix, df,
                                   x_series_index, y_series_index, facet_index, peptide_index, series_color, plot_title = '', x_axis_label = '', y_axis_label = '',
                                   fcorrect_x_cutoff = 1.0, fcorrect_y_cutoff = 1.0,
                                   min_experimental_ddg = None,
                                   max_experimental_ddg = None):
    try:
        os.mkdir(output_directory)
    except OSError:
        # The output directory may already exist; that is fine.
        pass
    assert (os.path.exists(output_directory))

    #first_peptide = df.ix[:, peptide_index].min()
    #last_peptide = df.ix[:, peptide_index].max()

    df['Categorization'] = df.apply(lambda r: _determine_fraction_correct_class(r[x_series_index], r[y_series_index])[0], axis = 1)
    categorization_index = len(df.columns.values) - 1
    df['CategorizationShape'] = df.apply(lambda r: _determine_fraction_correct_class(r[x_series_index], r[y_series_index])[1], axis = 1)
    categorization_shape_index = len(df.columns.values) - 1
    df['CategorizationColor'] = df.apply(lambda r: _determine_fraction_correct_class(r[x_series_index], r[y_series_index])[2], axis = 1)
    categorization_color_index = len(df.columns.values) - 1

    # Create the R script
    boxplot_r_script = '''
library(ggplot2)
library(gridExtra)
library(scales)
library(qualV)

# PNG generation
png('%(file_prefix)s.png', width=2560, height=2048, bg="white", res=600)
txtalpha <- 0.6
redtxtalpha <- 0.6

%(png_plot_commands)s
        '''

    xy_table_filename = '{0}.txt'.format(file_prefix)
    xy_table_filepath = os.path.join(output_directory, xy_table_filename)

    header_names = df.columns.values
    #x_series = header_names[x_series_index]
    #y_series = header_names[y_series_index]
    facet_series = header_names[facet_index]
    peptide_series = header_names[peptide_index]
    #categorization_series = header_names[categorization_index]
    #print(x_series,y_series, facet_series, peptide_series, categorization_series)

    data_table = df.to_csv(header = True, index = False)
    print(data_table)

    df = df.sort_values([facet_series, peptide_series])
    data_table = df.to_csv(header = True, index = False)
    print(data_table)

    write_file(xy_table_filepath, data_table)

    main_plot_script = '''
# Set the margins
par(mar=c(5, 5, 1, 1))

xy_data <- read.csv('%(xy_table_filename)s', header=T)

names(xy_data)[%(x_series_index)d + 1] <- "xvalues"
names(xy_data)[%(y_series_index)d + 1] <- "yvalues"
names(xy_data)[%(facet_index)d + 1] <- "facets"
names(xy_data)[%(peptide_index)d + 1] <- "peptides"
names(xy_data)[%(categorization_index)d + 1] <- "categorization"
names(xy_data)[%(categorization_shape_index)d + 1] <- "categorization_shape"
names(xy_data)[%(categorization_color_index)d + 1] <- "categorization_color"


xy_data[%(peptide_index)d + 1]

peptide_names <- sort(xy_data[[%(peptide_index)d + 1]])

peptide_names
class(peptide_names)

first_peptide = peptide_names[1]
last_peptide = peptide_names[length(peptide_names)]

xlabel <- "%(x_axis_label)s"
ylabel <- "%(y_axis_label)s"
plot_title <- "%(plot_title)s"

xy_data

# Set graph limits and the position for the correlation value

miny <- min(0.0, min(xy_data$xvalues) - 0.1) # "X-axis" values are plotted on to Y-axis
maxy <- max(1.0, max(xy_data$xvalues) + 0.1)
'''

    if min_experimental_ddg != None:
        main_plot_script += '''
miny <- min(miny  - 0.2, %(min_experimental_ddg)f  - 0.2)
'''
    if max_experimental_ddg != None:
        main_plot_script += '''
maxy <- max(maxy + 0.5, %(max_experimental_ddg)f  + 0.5)

first_peptide
last_peptide
'''

    main_plot_script += '''

#aes(color = categorization_color, shape = categorization_shape)

p <- ggplot(data=xy_data, aes(x=peptides, y = xvalues, color = categorization_color, shape = categorization_color, group = facets)) +
       theme(legend.position = "none") + # hide the legend
       annotate("rect", xmin = first_peptide, xmax = last_peptide, ymin = -1, ymax = +1, alpha = .2) +
       xlab(xlabel) +
       labs(title = "%(plot_title)s") +
       theme(plot.title = element_text(color = "#555555", size=rel(0.55))) +
       labs(x = xlabel, y = ylabel) +
       theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 3)) +
       geom_point() +
       scale_colour_manual(values = c("black", "blue", "green", "red")) +
       scale_shape_manual(values = c(16, 18, 25, 17)) +
       facet_wrap(~facets)

# Plot graph
p
dev.off()
        '''

    # Create the R script
    plot_type = 'png'
    png_plot_commands = main_plot_script % locals()
    boxplot_r_script = boxplot_r_script % locals()
    r_script_filename = '{0}.R'.format(file_prefix)
    r_script_filepath = os.path.join(output_directory, r_script_filename)
    write_file(r_script_filepath, boxplot_r_script)

    # Run the R script
    run_r_script(r_script_filename, cwd = output_directory)
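
# --- Illustrative usage (not part of the example above) ---
# correctness_abacus_plot() takes a pandas DataFrame plus the positional indices
# of the x-value, y-value, facet and peptide columns. Only the call signature is
# taken from the code above; the DataFrame contents are made up, and actually
# running this requires the module's _determine_fraction_correct_class and
# run_r_script helpers plus a local R installation with ggplot2, gridExtra,
# scales and qualV.
import pandas
example_df = pandas.DataFrame(
    [[ 0.2,  0.4, 'A', 'pep1'],
     [ 1.4,  1.1, 'A', 'pep2'],
     [-0.8, -1.2, 'B', 'pep3']],
    columns = ['Experimental', 'Predicted', 'Interface', 'Peptide'])
correctness_abacus_plot(
    'plots', 'abacus_example', example_df,
    x_series_index = 0, y_series_index = 1, facet_index = 2, peptide_index = 3,
    series_color = '#0000ff',
    plot_title = 'Example abacus plot',
    x_axis_label = 'Experimental DDG', y_axis_label = 'Predicted DDG')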
def main(prediction_ids = None, memory_free='3.0G', cfg = None):
    # This uses the version of Rosetta from your cluster template settings file
    settings = parse_settings.get_dict()
    rosetta_scripts_path = settings['local_rosetta_installation_path'] + '/source/bin/' + 'rosetta_scripts' + settings['local_rosetta_binary_type']
    ppi_api = get_interface_with_config_file(rosetta_scripts_path = rosetta_scripts_path, rosetta_database_path = '/home/kyleb/rosetta/working_branches/alascan/database')

    t1, t2 = None, None

    # Read the keep_hetatm_lines optional setting
    keep_hetatm_lines = False
    keep_all_lines = False
    try: keep_hetatm_lines = cfg.keep_hetatm_lines
    except AttributeError: colortext.warning('Note: keep_hetatm_lines is not specified in {0}. Defaulting to {1}.'.format(sys.argv[1], keep_hetatm_lines))
    try: keep_all_lines = cfg.keep_all_lines
    except AttributeError: colortext.warning('Note: keep_all_lines is not specified in {0}. Defaulting to {1}.'.format(sys.argv[1], keep_all_lines))

    prediction_set_id = cfg.prediction_set_id

    if prediction_ids == None:
        assert( len(sys.argv) > 1 )
        cfg = importlib.import_module(sys.argv[1], package=None)

        protocol_name = cfg.protocol_name

        suppress_warnings = True

        if not ppi_api.prediction_set_exists(prediction_set_id):
            print 'Creating new prediction set:', prediction_set_id
            t1 = time.time()
            ppi_api.add_prediction_set(prediction_set_id, halted = True, priority = 7, allow_existing_prediction_set = False, description = cfg.prediction_set_description)

            # Populate the prediction set with jobs from a (tagged subset of a) user dataset
            print 'Created PredictionSet:', prediction_set_id
            ppi_api.add_prediction_run(prediction_set_id, cfg.user_dataset_name, keep_all_lines = keep_all_lines, keep_hetatm_lines = keep_hetatm_lines, tagged_subset = cfg.tagged_subset, extra_rosetta_command_flags = '-ignore_zero_occupancy false -ignore_unrecognized_res', show_full_errors = True, suppress_warnings = suppress_warnings)
            t2 = time.time()

        existing_job = False
        end_job_name  = '%s_%s' % (getpass.getuser(), prediction_set_id)
        if not os.path.exists(job_output_directory):
            os.makedirs(job_output_directory)

        for d in os.listdir(job_output_directory):
            if os.path.isdir(os.path.join(job_output_directory, d)) and end_job_name in d:
                print 'Found existing job:', d
                job_name = d
                existing_job = True
        if not existing_job:
            job_name = '%s-%s' % (time.strftime("%y%m%d"), end_job_name)

            ppi_api.add_development_protocol_command_lines(
                prediction_set_id, protocol_name, 'minimize_with_cst', ''
            )
            # The call is made twice as a workaround for a known bug.
            ppi_api.add_development_protocol_command_lines(
                prediction_set_id, protocol_name, 'minimize_with_cst', ''
            )

        prediction_ids = sorted(ppi_api.get_prediction_ids(prediction_set_id))
        output_dir = os.path.join(job_output_directory, job_name )
    else:
        # Prediction_ids passed in
        job_name = '%s-%s_%s-rerun' % (time.strftime("%y%m%d"), getpass.getuser(), prediction_set_id)

        output_dir = os.path.join(job_output_directory, job_name )
        if os.path.isdir(output_dir):
            shutil.rmtree(output_dir)
        existing_job = False

    settings['scriptname'] = prediction_set_id + '_run'
    settings['tasks_per_process'] = 5
    settings['mem_free'] = memory_free
    settings['output_dir'] = output_dir
    settings['rosetta_args_list'] = [
        '-in:file:fullatom',
        '-ignore_zero_occupancy false',
        '-ignore_unrecognized_res',
        '-fa_max_dis 9.0',
        '-ddg::harmonic_ca_tether 0.5',
        '-ddg::constraint_weight 1.0',
        '-ddg::out_pdb_prefix min_cst_0.5',
        '-ddg::sc_min_only false',
    ]
    settings['rosetta_args_list'].extend(cfg.extra_flags)
    print settings['rosetta_args_list']

    # Now get run settings from database and save to pickle file
    job_dict = {}
    output_data_dir = os.path.join(settings['output_dir'], 'data')

    if not os.path.isdir(output_data_dir):
        os.makedirs(output_data_dir)

    if t1 != None and t2 != None and len(prediction_ids) != 0:
        print('Time taken for {0} predictions: {1}s ({2}s per prediction).'.format(len(prediction_ids), t2-t1, (t2-t1)/len(prediction_ids)))
    print('File cache statistics:')
    pprint.pprint(ppi_api.get_file_content_cache_stats())
    settings['numjobs'] = len(prediction_ids)
    app_name = 'minimize_with_cst'
    settings['appname'] = app_name

    print('')

    t1 = time.time()

    # Progress counter setup
    colortext.message('Creating input data for %d predictions.' % (len(prediction_ids)))
    count, records_per_dot = 0, 50
    print("|" + ("*" * (int(len(prediction_ids)/records_per_dot)-2)) + "|")
    for prediction_id in prediction_ids:
        # Progress counter
        count += 1
        if count % records_per_dot == 0: colortext.write(".", "cyan", flush = True)

        # Check if job already ran
        prediction_id_dir = os.path.join(output_dir, str(prediction_id))
        if existing_job:
            if os.path.isdir( prediction_id_dir ):
                pdb_output_files = [x for x in os.listdir( prediction_id_dir ) if '.pdb' in x]
            else:
                pdb_output_files = []
            if len(pdb_output_files) >= 1:
                print 'Skipping', prediction_id
                settings['numjobs'] = settings['numjobs'] - 1
                continue
            if os.path.isdir(prediction_id_dir):
                print 'Job directory %s already exists, deleting' % prediction_id_dir
                shutil.rmtree(prediction_id_dir)
            # else:
            #     print 'Creating new job directory %s' % prediction_id_dir

        job_data_dir = os.path.join(output_data_dir, str(prediction_id))

        # Allow us to resume from an interrupted setup
        truncate_content = None
        all_files_exist = os.path.exists(job_data_dir) and os.path.exists(os.path.join(job_data_dir, '.ready'))
        if all_files_exist:
            truncate_content = 0

        job_details = ppi_api.get_job_details(prediction_id, truncate_content = truncate_content)
        file_tuples = [] # List of names, contents
        for file_info in job_details['Files']['Input']:
            file_tuples.append( (file_info['Filename'], file_info['Content']) )
        substitution_parameters = json.loads(job_details['JSONParameters'])

        # Scrub the folder
        if not all_files_exist:
            if os.path.isdir(job_data_dir):
                shutil.rmtree(job_data_dir)
            os.makedirs(job_data_dir)

        files_dict = {} # Maps name to filepath position
        for file_name, file_contents in file_tuples:
            new_file_location = os.path.join(job_data_dir, file_name)
            if not all_files_exist:
                if '.pdb' in file_name:
                    if keep_hetatm_lines or keep_all_lines:
                        write_file(new_file_location, file_contents)
                    else:
                        write_file(new_file_location, '\n'.join([l for l in file_contents.split('\n') if l.startswith('ATOM')]))
                else:
                    with open(new_file_location, 'w') as f:
                        f.write(file_contents)
            files_dict[file_name] = os.path.relpath(new_file_location, settings['output_dir'])
        if not all_files_exist:
            write_file(os.path.join(job_data_dir, '.ready'), '')

        argdict = {
            'input_file_list' : [files_dict[substitution_parameters['%%input_pdb%%']]],
        }
        for file_name, file_location in files_dict.iteritems():
            if 'params' in file_name:
                argdict['-extra_res_fa'] = file_location
        job_dict[prediction_id] = argdict


    t2 = time.time()

    print('')
    if count != 0:
        print('Time taken for {0} predictions: {1}s ({2}s per prediction).'.format(count, t2-t1, (t2-t1)/count))
    print('File cache statistics:')
    pprint.pprint(ppi_api.get_file_content_cache_stats())

    print('')
    if len(job_dict) > 0:
        write_run_file(settings, database_run = False, job_dict = job_dict)
        print 'Job files written to directory:', os.path.abspath(output_dir)
    else:
        print 'No tasks to process, not writing job files'
    def _create_input_files(self):
        for s in self.structures:
            write_file(self._filepath('{0}.pdb'.format(s.structure_name)), str(s.pdb_object))