Example #1
def setup():
    global pdb_file_paths  # RCSB PDB_ID -> PDB file
    global rcsb_pdb_objects # RCSB PDB_ID -> PDB object
    global tina_pdb_objects # Tina's PDB_ID -> PDB object
    global tina_pdb_id_to_rcsb_pdb_id # Tina's PDB_ID -> RCSB PDB_ID
    global mutations_dataframe

    if mutations_dataframe is None:  # 'not <DataFrame>' raises ValueError, so test for None explicitly
        setup_mutations_dataframe()

    # old_mutations_csv is missing some cases but has the mapping from pdb -> partner 1 name, partner 2 name
    old_mutations_csv = os.path.join('temp', 'mutations_Gsp1_old.txt')
    assert(os.path.exists('temp'))
    assert(os.path.exists(old_mutations_csv))

    df = pandas.read_csv(old_mutations_csv, sep = '\t')

    tina_pdb_ids = sorted(set(df['pdb'].values))
    rcsb_pdb_ids = set()
    for pdb_id in tina_pdb_ids:
        rcsb_pdb_ids.add(pdb_id[:4])
        tina_pdb_id_to_rcsb_pdb_id[pdb_id] = pdb_id[:4]
    rcsb_pdb_ids = sorted(rcsb_pdb_ids)

    assert(rcsb_pdb_ids == sorted({p[:4] for p in mutations_dataframe['pdb'].values}))
    rcsb_file_dir = '../../rawdata'

    for pdb_id in tina_pdb_ids:
        tina_pdb_objects[pdb_id] = PDB.from_filepath(os.path.join('temp', 'pdbs', '{0}.pdb'.format(pdb_id)), parse_ligands = True)

    for pdb_id in rcsb_pdb_ids:
        filename = '{0}.pdb'.format(pdb_id.upper())
        pdb_file_paths[pdb_id.upper()] = os.path.join(rcsb_file_dir, filename)
        pdb_contents = download_pdb(pdb_id, rcsb_file_dir, silent = True, filename = filename)
        p = PDB(pdb_contents, parse_ligands = True)
        rcsb_pdb_objects[pdb_id] = p

    print('\nRosetta files  ({0}) : {1}'.format(str(len(tina_pdb_ids)).rjust(2), ', '.join([s.rjust(5) for s in tina_pdb_ids])))
    print('Original files ({0}) : {1}\n'.format(str(len(rcsb_pdb_ids)).rjust(2), ', '.join([s.rjust(5) for s in rcsb_pdb_ids])))

    ppi_api = get_ppi_api()
    for pdb_id, pdb_file_path in pdb_file_paths.items():
        existing_records = ppi_api.DDG_db.execute_select('SELECT * FROM PDBFile WHERE ID=%s', parameters=(pdb_id,))
        if existing_records:
            colortext.warning('The PDB file {0} exists in the database.'.format(pdb_id))
        complex_ids = ppi_api.search_complexes_by_pdb_id(pdb_id)

        if complex_ids:
            colortext.warning('The PDB file {0} has associated complexes: {1}'.format(pdb_id, ', '.join(map(str, complex_ids))))
    print('')
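
A minimal sketch of the ID convention assumed above: the first four characters of one of Tina's PDB IDs are taken to be the RCSB PDB ID (the IDs below are hypothetical):

tina_pdb_ids = ['1A2K_1', '1A2K_2', '1K5D_1']
tina_pdb_id_to_rcsb_pdb_id = {t: t[:4] for t in tina_pdb_ids}
rcsb_pdb_ids = sorted(set(tina_pdb_id_to_rcsb_pdb_id.values()))
assert rcsb_pdb_ids == ['1A2K', '1K5D']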
Example #2
def get_kic_run_details(output_directory, pdb_id, loop_sets, test_mode = False):
    '''This function returns the details required to set up the analysis for the Rosetta KIC and NGK methods.'''
    details = []
    c = 0
    for sc_file in glob.glob(os.path.join(output_directory, '{0}*.sc'.format(pdb_id))):

        # Determine the id
        sc_filename = os.path.split(sc_file)[1]
        assert(sc_filename.startswith('{0}_score'.format(pdb_id)))
        run_id = int(sc_filename[len(pdb_id) + 6:-3])  # strip the '<pdb_id>_score' prefix and the '.sc' suffix

        # Determine the score
        sc_lines = [l.strip() for l in get_file_lines(sc_file) if l.strip()]
        assert(sc_lines[0] == 'SEQUENCE:')
        assert(sc_lines[1].split()[:2] == ['SCORE:', 'total_score'])
        assert(sc_lines[2].split()[0] == 'SCORE:')
        total_score = float(sc_lines[2].split()[1])

        # Determine the filepath of the predicted structure
        associated_pdb_file = os.path.join(output_directory, '{0}_{0}{1}_0001.pdb'.format(pdb_id, run_id))

        # Extract the PDB coordinates into a pandas dataframe (HDF5 format)
        assert(os.path.exists(associated_pdb_file))
        hdf5_file = os.path.splitext(associated_pdb_file)[0] + '.hdf5'
        if os.path.exists(hdf5_file):
            store = pandas.HDFStore(hdf5_file)
            pdb_loop_residue_matrix = store['dataframe']
            store.close()
        else:
            pdb_loop_residue_matrix = PDB.extract_xyz_matrix_from_loop_json(PDB.from_filepath(associated_pdb_file).structure_lines, loop_sets, atoms_of_interest = backbone_atoms, expected_num_residues = 12, expected_num_residue_atoms = 4)
            store = pandas.HDFStore(hdf5_file)
            store['dataframe'] = pdb_loop_residue_matrix
            store.close()

        details.append(dict(
            id = run_id,
            score = total_score,
            predicted_structure = associated_pdb_file,
            pdb_loop_residue_matrix = pdb_loop_residue_matrix,
        ))
        if test_mode:
            c += 1
            if c >= 10:
                break

    return details
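
A hedged usage sketch for get_kic_run_details (the directory, PDB ID, and loop-definition file are hypothetical; json and read_file are the same import/helper used in the other examples):

loop_sets = json.loads(read_file('input/structures/rosetta/pruned/1a2k.loop.json'))
details = get_kic_run_details('output/kic_runs', '1a2k', loop_sets, test_mode = True)
best = min(details, key = lambda d: d['score'])
print(best['id'], best['score'], best['predicted_structure'])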
Example #3
    def static_get_pdb_object(pdb_id, bio_cache = None, cache_dir = None):
        '''This method does not necessarily use a BioCache but it seems to fit here.'''
        pdb_id = pdb_id.upper()

        if bio_cache:
            return bio_cache.get_pdb_object(pdb_id)

        if cache_dir:
            # Check to see whether we have a cached copy of the PDB file
            filepath = os.path.join(cache_dir, '{0}.pdb'.format(pdb_id))
            if os.path.exists(filepath):
                return PDB.from_filepath(filepath)

        # Get any missing files from the RCSB and create cached copies if appropriate
        pdb_contents = retrieve_pdb(pdb_id)
        if cache_dir:
            write_file(os.path.join(cache_dir, '{0}.pdb'.format(pdb_id)), pdb_contents)
        return PDB(pdb_contents)
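
A usage sketch, assuming the method is a @staticmethod on the PDB class (its indentation suggests it is defined inside one) and that a pdb_cache directory exists:

p1 = PDB.static_get_pdb_object('1a2c', cache_dir = 'pdb_cache')  # fetched from the RCSB, cached as pdb_cache/1A2C.pdb
p2 = PDB.static_get_pdb_object('1a2c', cache_dir = 'pdb_cache')  # served from the cached copy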
Example #4
def main(FixedIDs = [], radii = [6.0, 7.0, 8.0, 9.0]):
    max_processors = get_number_of_processors()

    rescore_process_file = "/tmp/klab_rescore.txt"
    parser = OptionParser()
    parser.add_option("-n", "--numprocesses", default=1, type='int', dest="num_processes", help="The number of processes used for the rescoring. The cases are split according to this number.", metavar="NUM_PROCESSES")
    parser.add_option("-p", "--process", default=1, type='int', dest="process", help="The ID of this process. This should be an integer between 1 and the number of processes used for the rescoring.", metavar="PROCESS_ID")
    parser.add_option("-d", "--delete",  action="store_true", dest="delete", help="Delete the process tracking file %s." % rescore_process_file)
    parser.add_option("-s", "--set",  type='string', dest="prediction_set", help="The prediction set to rescore.")
    (options, args) = parser.parse_args()

    if options.delete and os.path.exists(rescore_process_file):
        print("Removing %s." % rescore_process_file)
        os.remove(rescore_process_file)

    num_processes = options.num_processes
    prediction_set = options.prediction_set
    process_id = options.process

    for i in FixedIDs:
        assert(isinstance(i, int))

    # SELECT * FROM `Prediction` WHERE `PredictionSet`= 'RosCon2013_P16_score12prime'  AND Status='done' LIMIT 1
    # Check prediction set
    if not prediction_set:
        raise colortext.Exception("A prediction set must be specified.")
    else:
        if FixedIDs:
            results = ddGdb.execute("SELECT DISTINCT PredictionSet FROM Prediction WHERE ID IN (%s)" % ",".join(map(str, FixedIDs)))
            if len(results) != 1:
                raise colortext.Exception("Error: The fixed IDs cover %d different prediction sets." % len(results))
        else:
            results = ddGdb.execute("SELECT ID FROM PredictionSet WHERE ID=%s", parameters=(prediction_set,))
        if not results:
            raise colortext.Exception("The prediction set '%s' does not exist in the database." % prediction_set)

    if num_processes < 1:
        raise colortext.Exception("At least 1 processor must be used.")
    if num_processes > max_processors:
        raise colortext.Exception("Only %d processors/cores were detected. Cannot run with %d processes." % (max_processors, num_processes))
    if num_processes > (max_processors * 0.75):
        colortext.warning("Warning: Using %d processors/cores out of %d which is %0.2f%% of the total available." % (num_processes, max_processors, (100.0*float(num_processes)/float(max_processors))))
    if not(1 <= process_id <= min(max_processors, num_processes)):
        raise colortext.Exception("The process ID %d must be between 1 and the number of processes, %d." % (process_id, num_processes))

    if os.path.exists(rescore_process_file):
        lines = readFileLines(rescore_process_file)
        idx = lines[0].find("numprocesses")
        if idx == -1:
            raise Exception("Badly formatted %s." % rescore_process_file)
        existing_num_processes = int(lines[0][idx+len("numprocesses"):])
        if existing_num_processes != num_processes:
            raise colortext.Exception("You specified the number of processes to be %d but %s already specifies it as %d." % (num_processes, rescore_process_file, existing_num_processes))
        for line in [line for line in lines[1:] if line.strip()]:
            idx = line.find("process")
            if idx == -1:
                raise colortext.Exception("Badly formatted %s. Line is '%s'." % (rescore_process_file, line))
            existing_process = int(line[idx+len('process'):])
            if process_id == existing_process:
                raise colortext.Exception("Process %d is already logged as running. Check if this is so and edit %s." % (process_id, rescore_process_file))
        F = open(rescore_process_file, 'a')
        F.write("process %d\n" % process_id)
        F.close()
    else:
        F = open(rescore_process_file, 'w')
        F.write("numprocesses %d\n" % num_processes)
        F.write("process %d\n" % process_id)
        F.close()

    output_dir = os.path.join('rescoring', str(process_id))
    if not(os.path.exists(output_dir)):
        os.makedirs(output_dir)
    abs_output_dir = os.path.abspath(os.path.join(os.getcwd(), output_dir))
    print("Running process in %s.\n" % abs_output_dir)

    ReallyFixedIDs = False  # debug switch; enabling it requires a precomputed remaining_unscored list (undefined here)

    results = ddGdb.execute("SELECT ID, ExperimentID, Scores FROM Prediction WHERE PredictionSet=%s AND Status='done' AND ScoreVersion <> %s", parameters=(prediction_set, float(current_score_revision),))
    if not(FixedIDs) and results:
        raise WrongScoreRevisionException("Score versions found which are not %s. Need to update table structure." % current_score_revision)
    else:
        # Hacky way to run multiple processes
        if ReallyFixedIDs:
            num_to_score = len(remaining_unscored)
            num_for_this_to_score = num_to_score // num_processes  # integer division so the slice bounds stay ints
            IDs_to_score = remaining_unscored[(process_id-1) * num_for_this_to_score : (process_id) * num_for_this_to_score]
            results = ddGdb.execute("SELECT ID, ExperimentID, Scores, UserDataSetExperimentID FROM Prediction WHERE ID IN (%s)" % (",".join(map(str, IDs_to_score))))
        elif FixedIDs:
            results = ddGdb.execute("SELECT ID, ExperimentID, Scores, UserDataSetExperimentID FROM Prediction WHERE ID IN (%s) AND MOD(ID,%s)=%s" % (",".join(map(str, FixedIDs)), num_processes,process_id-1))
        else:
            results = ddGdb.execute("SELECT ID, ExperimentID, Scores, UserDataSetExperimentID FROM Prediction WHERE PredictionSet=%s AND Status='done' AND ScoreVersion=%s AND MOD(ID,%s)=%s", parameters=(prediction_set, float(current_score_revision),num_processes,process_id-1))

    count = 0
    cases_computed = 0
    total_time_in_secs = 0

    number_of_cases_left = len(results) * len(radii)

    failed_cases = []
    colortext.printf("Rescoring %d predictions over %d radii...\n" % (len(results), len(radii)), 'lightgreen')
    for r in results:
        t = Timer()
        t.add('Preamble')
        inner_count = 0

        mutations = ddGdb.execute('SELECT * FROM ExperimentMutation WHERE ExperimentID=%s', parameters=(r['ExperimentID'],))
        mutation_str = ', '.join(['%s %s%s%s' % (m['Chain'], m['WildTypeAA'], m['ResidueID'], m['MutantAA']) for m in mutations])
        extracted_data = False

        details = ddGdb.execute_select('SELECT Prediction.ID, PDBFileID, Chain FROM Prediction INNER JOIN Experiment ON Prediction.ExperimentID=Experiment.ID INNER JOIN ExperimentChain ON Prediction.ExperimentID=ExperimentChain.ExperimentID WHERE Prediction.ID=%s', parameters=(r['ID'],))
        colortext.message("Prediction: %d, %s chain %s. Mutations: %s. Experiment ID #%d. UserDataSetExperimentID #%d." % (details[0]['ID'], details[0]['PDBFileID'], details[0]['Chain'], mutation_str, r['ExperimentID'], r['UserDataSetExperimentID']))

        experiment_pdbID = ddGdb.execute('SELECT PDBFileID FROM Experiment WHERE ID=%s', parameters=(r['ExperimentID'],))[0]['PDBFileID']
        print('Experiment PDB file ID = %s' % experiment_pdbID)
        pdbID = ddGdb.execute('SELECT UserDataSetExperiment.PDBFileID FROM Prediction INNER JOIN UserDataSetExperiment ON UserDataSetExperimentID=UserDataSetExperiment.ID WHERE Prediction.ID=%s', parameters=(r['ID'],))[0]['PDBFileID']
        print('UserDataSetExperiment PDB file ID = %s' % pdbID)

        count += 1
        if True:  # previously: len(mutations) == 1
            timestart = time.time()

            #mutation = mutations[0]
            dbchains = sorted(set([mutation['Chain'] for mutation in mutations]))
            # todo: note: assuming monomeric structures here
            assert(len(dbchains) == 1)
            dbchain = dbchains[0]
            #mutantaa = mutation['MutantAA']

            ddG_dict = json.loads(r['Scores'])
            kellogg_ddG = ddG_dict['data']['kellogg']['total']['ddG']

            #assert(ddG_dict['version'] == current_score_revision)

            all_done = True
            for radius in radii:
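                # e.g. radius 6.0 -> 'noah_6,0A'; the '.' appears to be swapped for ',' to keep dots out of the score-name key (the same key is rebuilt in the scoring loop below)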
                score_name = ('noah_%0.1fA' % radius).replace(".", ",")
                if not(ddG_dict['data'].get(score_name)):
                    all_done = False
                else:
                    cases_computed += 1
                    number_of_cases_left -= 1
            if all_done:
                print('Prediction %d: done.' % r["ID"])
                continue

            # Extract data
            t.add('Grab data')
            #archivefile = None
            #prediction_data_path = ddGdb.execute('SELECT Value FROM _DBCONSTANTS WHERE VariableName="PredictionDataPath"')[0]['Value']
            #job_data_path = os.path.join(prediction_data_path, '%d.zip' % r['ID'])
            #print(job_data_path)
            #assert(os.path.exists(job_data_path))
            #archivefile = readBinaryFile(job_data_path)
            archivefile = DDG_interface.getData(r['ID'])
            zipfilename = os.path.join(output_dir, "%d.zip" % r['ID'])
            F = open(zipfilename, "wb")
            F.write(archivefile)
            F.close()

            t.add('Extract data')
            zipped_content = zipfile.ZipFile(zipfilename, 'r', zipfile.ZIP_DEFLATED)
            tmpdir = None
            repacked_files = []
            mutant_files = []

            rosetta_resids = []
            try:
                tmpdir = makeTemp755Directory(output_dir)
                highestIndex = -1
                foundResfile = False
                foundMutfile = False

                presumed_mutation = None
                for fname in sorted(zipped_content.namelist()):
                    if fname.endswith(".pdb"):
                        if fname.startswith("%s/mut_" % r['ID']) or fname.startswith("%s/repacked_" % r['ID']):
                            structnum = int(fname[fname.rindex('_')+1:-4])
                            if fname.startswith("%s/mut_" % r['ID']):
                                if presumed_mutation:
                                    assert(presumed_mutation == os.path.split(fname)[1].split('_')[1])
                                else:
                                    presumed_mutation = os.path.split(fname)[1].split('_')[1]
                                newfname = 'mutant_%02d' % structnum
                            if fname.startswith("%s/repacked_" % r['ID']):
                                newfname = 'repacked_%02d' % structnum
                            highestIndex = max(highestIndex, structnum)

                            newfilepath = os.path.join(tmpdir, newfname)
                            writeFile(newfilepath, zipped_content.read(fname))

                            if fname.startswith("%s/mut_" % r['ID']):
                                mutant_files.append(newfilepath)
                            if fname.startswith("%s/repacked_" % r['ID']):
                                repacked_files.append(newfilepath)
                        #elif fname.startswith("%s/%s-%s" % (r['ID'],r['ExperimentID'],pdbID)) or fname.startswith("%s/repacked_" % r['ID']):
                        #    writeFile(os.path.join(tmpdir, '%s.pdb' % pdbID), zipped_content.read(fname))
                    if fname.startswith("%s/%s-%s.resfile" % (r['ID'],r['ExperimentID'],experiment_pdbID)):
                        raise Exception('This case needs to be updated (see the mutfile section below). We mainly use mutfiles now so I did not update this section.')
                        foundResfile = True
                        lines = zipped_content.read(fname).decode('utf8').split("\n")
                        assert(len(lines) == 3)
                        assert(lines[0] == "NATAA")
                        assert(lines[1] == "start")
                        resfile_mutation = lines[2].split(" ")
                        assert(len(resfile_mutation) == 4)
                        rosetta_resid = resfile_mutation[0]
                        rosetta_chain = resfile_mutation[1]
                        rosetta_mutaa = resfile_mutation[3]
                        assert(mutantaa == rosetta_mutaa)
                        assert(dbchain == rosetta_chain)
                        assert(resfile_mutation[2] == 'PIKAA')
                        assert(len(rosetta_mutaa) == 1)
                    if fname.startswith("%s/%s-%s.mutfile" % (r['ID'],r['ExperimentID'],experiment_pdbID)):
                        foundMutfile = True
                        lines = zipped_content.read(fname).decode('utf8').split("\n")
                        assert(lines[0].startswith('total '))
                        num_mutations = int(lines[0][6:])
                        assert(lines[1] == str(num_mutations))
                        # todo: note: assuming monomeric structures here
                        rosetta_chain = ddGdb.execute("SELECT Chain FROM ExperimentChain WHERE ExperimentID=%s", parameters=(r['ExperimentID'],))
                        assert(len(rosetta_chain) == 1)
                        rosetta_chain = rosetta_chain[0]['Chain']

                        resfile_mutations = lines[2:]
                        for resfile_mutation in resfile_mutations:
                            resfile_mutation = resfile_mutation.split(" ")
                            assert(len(resfile_mutation) == 3)
                            rosetta_resids.append(resfile_mutation[1])
                            rosetta_mutaa = resfile_mutation[2]
                            assert(dbchain == rosetta_chain)
                            assert(len(rosetta_mutaa) == 1)

                # Make sure the wtaa->mutantaa types match the structures
                assert(not(foundResfile))
                if not foundMutfile:
                    raise Exception('This case needs to be updated (see the mutfile section below). This was added as a hack for cases where I did not store the mutfile so I did not update this section.')
                    input_files = ddGdb.execute_select('SELECT InputFiles FROM Prediction WHERE ID=%s', parameters=(r['ID'],))
                    assert(len(input_files) == 1)
                    lines = pickle.loads(input_files[0]['InputFiles'])['MUTFILE'].split("\n")

                    #lines = regenerate_mutfile(r['ID']).split("\n")
                    assert(len(lines) == 3)
                    assert(lines[0] == "total 1")
                    assert(lines[1] == "1")
                    resfile_mutation = lines[2].split(" ")
                    assert(len(resfile_mutation) == 3)
                    rosetta_resid = resfile_mutation[1]
                    rosetta_chain = ddGdb.execute("SELECT Chain FROM ExperimentChain WHERE ExperimentID=%s", parameters=(r['ExperimentID'],))
                    assert(len(rosetta_chain) == 1)
                    rosetta_chain = rosetta_chain[0]['Chain']
                    rosetta_mutaa = resfile_mutation[2]
                    assert(dbchain == rosetta_chain)
                    assert(len(rosetta_mutaa) == 1)
                    assert("%s%s%s" % (resfile_mutation[0], resfile_mutation[1], resfile_mutation[2]) == presumed_mutation)

                fullresids = []

                for rosetta_resid in rosetta_resids:
                    fullresid = None
                    if rosetta_resid.isdigit():
                        fullresid = '%s%s%s ' % (rosetta_chain, (4-len(rosetta_resid)) * ' ', rosetta_resid)
                    else:
                        assert(False)
                        fullresid = '%s%s%s' % (rosetta_chain, (5-len(rosetta_resid)) * ' ', rosetta_resid)
                    fullresids.append(fullresid)


                resultst1 = ddGdb.execute_select("SELECT ExperimentID, UserDataSetExperimentID FROM Prediction WHERE ID=%s", parameters = (r['ID'],))
                assert(len(resultst1) == 1)
                ExperimentIDt1 = resultst1[0]['ExperimentID']
                UserDataSetExperimentIDt1 = resultst1[0]['UserDataSetExperimentID']

                if UserDataSetExperimentIDt1:
                    resultst2 = ddGdb.execute_select("SELECT PDBFileID FROM UserDataSetExperiment WHERE ID=%s", parameters = (UserDataSetExperimentIDt1,))
                else:
                    resultst2 = ddGdb.execute_select("SELECT PDBFileID FROM Experiment WHERE ID=%s", parameters = (ExperimentIDt1,))
                assert(len(resultst2) == 1)

                prediction_PDB_ID = resultst2[0]['PDBFileID']

                if False and prediction_PDB_ID not in ['1TEN', '1AYE', '1H7M'] + ['1A2P', '1BNI', '1STN']:  # validation disabled; flip to True to re-enable the wildtype checks below
                    for fullresid in fullresids:
                        wtaa = None
                        for m in mutations:
                            # Hack for ub_RPN13
                            if prediction_PDB_ID == 'ub_RPN13' and m['Chain'] == fullresid[0] and m['ResidueID'] == str(int(fullresid[1:].strip()) - 109):
                                wtaa = m['WildTypeAA']
                            # Hack for ub_RPN13_yeast
                            elif prediction_PDB_ID == 'uby_RPN13' and m['Chain'] == fullresid[0] and m['ResidueID'] == str(int(fullresid[1:].strip()) - 109):
                                wtaa = m['WildTypeAA']
                            # Hack for ub_OTU
                            elif prediction_PDB_ID == 'ub_OTU' and m['Chain'] == fullresid[0] and m['ResidueID'] == str(int(fullresid[1:].strip()) - 172):
                                wtaa = m['WildTypeAA']
                            # Hack for ub_OTU_yeast
                            elif prediction_PDB_ID == 'uby_OTU' and m['Chain'] == fullresid[0] and m['ResidueID'] == str(int(fullresid[1:].strip()) - 172):
                                wtaa = m['WildTypeAA']
                            # Hack for ub_UQcon
                            elif prediction_PDB_ID == 'ub_UQcon' and m['Chain'] == fullresid[0] and m['ResidueID'] == str(int(fullresid[1:].strip()) + 213): # starts at 501
                                wtaa = m['WildTypeAA']
                            # Hack for uby_UQcon
                            elif prediction_PDB_ID == 'uby_UQcon' and m['Chain'] == fullresid[0] and m['ResidueID'] == str(int(fullresid[1:].strip()) - 287):
                                wtaa = m['WildTypeAA']
                            elif m['Chain'] == fullresid[0] and m['ResidueID'] == fullresid[1:].strip():
                                wtaa = m['WildTypeAA']
                        if wtaa is None:
                            colortext.error(prediction_PDB_ID)
                            colortext.error('wtaa == None')
                            colortext.error('fullresid = %s' % str(fullresid))
                            colortext.error(str(mutations))
                            colortext.warning([rosetta_resid.strip() for rosetta_resid in rosetta_resids])
                            #sys.exit(0)
                        assert(wtaa is not None)
                        assert(PDB.from_filepath(repacked_files[0]).get_residue_id_to_type_map()[fullresid] == wtaa)
                    #assert(PDB(mutant_files[0]).get_residue_id_to_type_map()[fullresid] == mutantaa)

                for radius in radii:
                    score_name = ('noah_%0.1fA' % radius).replace(".", ",")

                    if ddG_dict['data'].get(score_name):
                        print('Radius %0.1f: done.' % radius)
                        continue
                    cases_computed += 1
                    number_of_cases_left -= 1

                    t.add('Radius %0.3f: repacked' % radius)
                    colortext.printf("Prediction ID: %d. Calculating radius %0.1f. Calculation #%d of %d." % (r['ID'], radius, cases_computed, len(results) * len(radii)), 'orange')

                    repacked_score = NoahScore()
                    repacked_score.calculate(repacked_files, rosetta_chain, sorted([rosetta_resid.strip() for rosetta_resid in rosetta_resids]), radius = radius)
                    colortext.message("Repacked")
                    print(repacked_score)

                    t.add('Radius %0.3f: mutant' % radius)
                    mutant_score = NoahScore()
                    mutant_score.calculate(mutant_files, rosetta_chain, sorted([rosetta_resid.strip() for rosetta_resid in rosetta_resids]), radius = radius)
                    colortext.printf("Mutant", color = 'cyan')
                    print(mutant_score)

                    t.add('Radius %0.3f: postamble' % radius)
                    colortext.printf("ddG", color = 'lightpurple')
                    ddg_score = repacked_score.ddg(mutant_score)
                    print(ddg_score)

                    colortext.printf("Liz's ddG", color = 'yellow')
                    print("Total score: %0.3f" % kellogg_ddG)

                    ddG_dict['version'] = '0.23'  # note: this override forces the '0.23' branch below, making the earlier upgrade paths dead
                    if ddG_dict['version'] == '0.1':
                        ddG_dict['version'] = '0.21'
                        ddG_dict['data'] = {
                            'kellogg' : {
                                'total' : ddG_dict['data'],
                            },
                            'noah': {
                                'total' : {'ddG' : ddg_score.total},
                                'positional' : {'ddG' : ddg_score.positional},
                                'positional_twoscore' : {'ddG' : ddg_score.positional_twoscore},
                            },
                        }
                    elif ddG_dict['version'] == '0.2':
                        ddG_dict['version'] = '0.21'
                        ddG_dict['data']['noah']['total']['ddG'] = ddg_score.total
                        ddG_dict['data']['noah']['positional']['ddG'] = ddg_score.positional
                        ddG_dict['data']['noah']['positional_twoscore']['ddG'] = ddg_score.positional_twoscore
                    elif ddG_dict['version'] in ('0.22', '0.23'):
                        ddG_dict['data'][score_name] = {'total' : {}, 'positional' : {}, 'positional_twoscore' : {}}
                        ddG_dict['data'][score_name]['total']['ddG'] = ddg_score.total
                        ddG_dict['data'][score_name]['positional']['ddG'] = ddg_score.positional
                        ddG_dict['data'][score_name]['positional_twoscore']['ddG'] = ddg_score.positional_twoscore

                    jsonified_ddG = json.dumps(ddG_dict)
                    ddGdb.execute('UPDATE Prediction SET Scores=%s WHERE ID=%s', parameters=(jsonified_ddG, r['ID'],))
                t.add('Cleanup')
                shutil.rmtree(tmpdir)
                os.remove(zipfilename)

            except Exception as e:
                print("Exception! In prediction %d" % r['ID'], str(e))
                failed_cases.append(r['ID'])
                import traceback
                print(traceback.format_exc())
                if tmpdir:
                    shutil.rmtree(tmpdir)

            total_time_in_secs += t.sum()
            average_time_taken = float(total_time_in_secs)/float(cases_computed or 1)
            estimate_remaining_time = number_of_cases_left * average_time_taken

            t.stop()
            colortext.printf("**Profile**", 'orange')
            print(t)
            colortext.message("Time taken for this case: %0.2fs." % t.sum())
            colortext.message("Average time taken per case: %0.2fs." % average_time_taken)
            colortext.message("Estimated time remaining: %dh%dm%ds." % (int(estimate_remaining_time/3600), int((estimate_remaining_time/60) % 60), estimate_remaining_time % 60))
            print("\n")
Example #5
def extract_analysis_data(dataset_list_file, output_directory, data_extraction_method, expectn, top_x, prefix, test_mode = False):
    '''This is the main function in this script and is where the basic analysis is compiled.

       output_directory should contain the results of the prediction run.
       data_extraction_method should be a function pointer to the method-specific function used to retrieve the prediction results e.g. get_kic_run_details
       expectn specifies how many predictions we expect to find (useful in case some jobs failed).
       top_x specifies how many of the best-scoring predictions should be used to generate the TopX metric results e.g.
       the Top5 RMSD metric value measures the lowest RMSD amongst the five best-scoring structures.
       prefix is used to name the output files.
    '''
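    # Worked TopX example (hypothetical numbers): if the five best-scoring models
    # have RMSDs [1.4, 0.8, 2.1, 1.1, 0.9], the Top5 RMSD is 0.8 - the lowest
    # RMSD among the five best-scoring structures.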

    # Sanity check
    assert(top_x <= expectn)

    # Set up reference structures
    structures_folder = os.path.join('..', 'input', 'structures', '12_res')
    rcsb_references = os.path.join(structures_folder, 'rcsb', 'reference')
    rosetta_references = os.path.join(structures_folder, 'rosetta', 'reference')

    # Set up the per-case statistics dicts
    best_scoring_structures = {}
    median_scoring_structures = {}
    worst_scoring_structures = {}
    total_percent_subangstrom = {}
    top_x_percent_subangstrom = {}
    top_x_loop_prediction_sets = {}

    # Set up the input file used to generate the graph plotting the "percentage of subangstrom models" metric over
    # varying values of X used to select the TopX structures
    percentage_subangstrom_over_top_X_plot_input = ['PDB\tX\tPercentage of subangstrom cases for TopX']
    percent_subangstrom_by_top_x = {}

    # Set up the summary analysis file
    csv_file = ['\t'.join(['PDB ID', 'Models', '%<1.0A', 'Top{0} %<1.0A'.format(top_x), 'Best score', 'Top{0} score'.format(top_x), 'Median score', 'Worst score', 'Closest score', 'Top1 RMSD', 'Top{0} RMSD'.format(top_x), 'Closest RMSD'])]

    # Read in the benchmark input
    pdb_ids = [os.path.splitext(os.path.split(s.strip())[1])[0] for s in get_file_lines(dataset_list_file) if s.strip()]

    # Truncate the benchmark input for test mode
    if test_mode:
        pdb_ids = pdb_ids[:10]

    # Analyze the performance for each case in the benchmark
    for pdb_id in pdb_ids:

        rcsb_reference_pdb = os.path.join(rcsb_references, pdb_id + '.pdb')
        assert(os.path.exists(rcsb_reference_pdb))
        rosetta_reference_pdb = os.path.join(rosetta_references, pdb_id + '.pdb')
        assert(os.path.exists(rosetta_reference_pdb))
        assert(len(pdb_id) == 4)
        loops_file = os.path.join(structures_folder, 'rosetta', 'pruned', '{0}.loop.json'.format(pdb_id))
        loop_sets = json.loads(read_file(loops_file))
        assert(len(loop_sets['LoopSet']) == 1)

        # Create a container for loop predictions
        loop_prediction_set = LoopPredictionSet()

        # Read the coordinates from the reference PDB file
        rcsb_reference_matrix = PDB.extract_xyz_matrix_from_loop_json(PDB.from_filepath(rcsb_reference_pdb).structure_lines, loop_sets, atoms_of_interest = backbone_atoms, expected_num_residues = 12, expected_num_residue_atoms = 4)
        rosetta_reference_matrix = PDB.extract_xyz_matrix_from_loop_json(PDB.from_filepath(rosetta_reference_pdb).structure_lines, loop_sets, atoms_of_interest = backbone_atoms, expected_num_residues = 12, expected_num_residue_atoms = 4)

        colortext.wgreen('\n\nReading in the run details for {0}:'.format(pdb_id))
        details = data_extraction_method(output_directory, pdb_id, loop_sets, test_mode = test_mode)
        for d in details:
            loop_prediction = loop_prediction_set.add(d['id'], d['score'], pdb_id = pdb_id, rmsd = None, pdb_path = d['predicted_structure'], pdb_loop_residue_matrix = d['pdb_loop_residue_matrix'])
        print(' Done')

        # Compute the RMSD for this case for the structure using the pandas dataframe
        # It is more efficient to do this after truncation if truncating by score but in the general case users will
        # probably want to consider all predictions. If not (e.g. for testing) then arbitrary subsets can be chosen
        # in the loop above
        colortext.wgreen('Computing RMSDs for {0}:'.format(pdb_id))
        loop_prediction_set.compute_rmsds(rcsb_reference_matrix)
        loop_prediction_set.check_rmsds(rosetta_reference_matrix)
        print(' Done\n')

        # Truncate the structures to the top expectn-scoring files
        loop_prediction_set.sort_by_score()
        loop_prediction_set.truncate(expectn)
        if len(loop_prediction_set) != expectn:
            print('Error: Expected {0} structures but only found {1}.'.format(expectn, len(loop_prediction_set)))
            sys.exit(1)

        # Create a new set containing the top-X-scoring structures and identify the median-scoring structure
        top_x_loop_prediction_sets[pdb_id] = loop_prediction_set[:top_x]
        median_scoring_structures[pdb_id] = loop_prediction_set[int(expectn / 2)]

        # Determine the lowest-/best-scoring structure
        best_scoring_structures[pdb_id] = loop_prediction_set[0]
        best_score = best_scoring_structures[pdb_id].score
        worst_scoring_structures[pdb_id] = loop_prediction_set[-1]
        worst_score = worst_scoring_structures[pdb_id].score
        assert(top_x_loop_prediction_sets[pdb_id][0] == best_scoring_structures[pdb_id])

        # Print structures
        colortext.warning('Top{0} structures'.format(top_x))
        print(top_x_loop_prediction_sets[pdb_id])
        colortext.warning('Top1 structure')
        print(best_scoring_structures[pdb_id])
        colortext.warning('Median (by score) structure')
        print(median_scoring_structures[pdb_id])
        colortext.warning('Worst-scoring structure')
        print(worst_scoring_structures[pdb_id])

        # Create values for TopX variable plot
        loop_prediction_set.sort_by_score()
        for top_x_var in range(1, len(loop_prediction_set) + 1):
            new_subset = loop_prediction_set[:top_x_var]
            percent_subangstrom = 100 * new_subset.fraction_with_rmsd_lt(1.0)
            percentage_subangstrom_over_top_X_plot_input.append('{0}\t{1}\t{2}'.format(pdb_id, top_x_var, percent_subangstrom))
            percent_subangstrom_by_top_x.setdefault(top_x_var, {})[pdb_id] = percent_subangstrom

        total_percent_subangstrom[pdb_id] = 100 * loop_prediction_set.fraction_with_rmsd_lt(1.0)
        top_x_percent_subangstrom[pdb_id] = 100 * top_x_loop_prediction_sets[pdb_id].fraction_with_rmsd_lt(1.0)
        colortext.warning('Percentage of sub-angstrom cases in the full set of {0}: {1}'.format(expectn, total_percent_subangstrom[pdb_id]))
        colortext.warning('Percentage of sub-angstrom cases in the Top{0} structures: {1}'.format(top_x, top_x_percent_subangstrom[pdb_id]))

        loop_prediction_set.sort_by_rmsd()
        closest_rmsd = loop_prediction_set[0].rmsd
        closest_score = loop_prediction_set[0].score
        colortext.warning('RMSD of closest model: {0}'.format(closest_rmsd))
        colortext.warning('Score of closest model: {0}'.format(closest_score))

        top_1_rmsd = best_scoring_structures[pdb_id].rmsd

        top_x_rmsd = best_scoring_structures[pdb_id].rmsd
        top_x_score = best_scoring_structures[pdb_id].score
        for s in top_x_loop_prediction_sets[pdb_id]:
            if (s.rmsd < top_x_rmsd) or (s.rmsd == top_x_rmsd and s.score < top_x_score):
                top_x_rmsd = s.rmsd
                top_x_score = s.score
        assert(top_x_score <= worst_score)
        assert(top_x_rmsd <= top_1_rmsd)

        print('Top 1 RMSD (predicted vs Rosetta/RCSB reference structure): {0}'.format(top_1_rmsd))
        print('Top {0} RMSD (predicted vs Rosetta/RCSB reference structure): {1}'.format(top_x, top_x_rmsd))

        csv_file.append('\t'.join(map(str, [pdb_id, expectn, total_percent_subangstrom[pdb_id], top_x_percent_subangstrom[pdb_id], best_score, top_x_score, median_scoring_structures[pdb_id].score, worst_score, closest_score, top_1_rmsd, top_x_rmsd, closest_rmsd])))

    # Add a column of median percent subangstrom values
    for top_x_var, values_by_pdb in sorted(percent_subangstrom_by_top_x.items()):
        assert(sorted(values_by_pdb.keys()) == sorted(pdb_ids))
        median_value = sorted(values_by_pdb.values())[len(pdb_ids) // 2]
        percentage_subangstrom_over_top_X_plot_input.append('Median\t{0}\t{1}'.format(top_x_var, median_value))

    write_file('{0}analysis.csv'.format(prefix), '\n'.join(csv_file))  # the same tab-separated table is written under both extensions
    write_file('{0}analysis.tsv'.format(prefix), '\n'.join(csv_file))
    write_file('{0}percentage_subangstrom_over_top_X.tsv'.format(prefix), '\n'.join(percentage_subangstrom_over_top_X_plot_input))
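
A hedged invocation sketch for extract_analysis_data (paths and counts are hypothetical; get_kic_run_details is the extraction method from Example #2):

extract_analysis_data('input/benchmark_12_res.lst', 'output/kic_runs', get_kic_run_details,
                      expectn = 500, top_x = 5, prefix = 'kic_', test_mode = True)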
Example #6
def setup_jobs(outpath, options, input_files):
    ''' This function sets up the jobs by creating the necessary input files as expected.
          - outpath is where the output is to be stored.
          - options is the optparse options object.
          - input_files is a list of paths to input files.
    '''

    job_inputs = None
    reverse_mapping = None
    fasta_file_contents = {}

    # Generate FASTA files for PDB inputs
    # fasta_file_contents is a mapping from a file path to a pair (FASTA contents, file type). We remember the file type
    # since we offset residue IDs depending on file type i.e. for FASTA files, we treat each sequence separately and do
    # not renumber the fragments in postprocessing. For PDB files, however, we need to respect the order and length of
    # sequences so that we renumber the fragments appropriately in postprocessing - we assume that if a PDB file is passed in
    # then all chains (protein, RNA, or DNA) will be used in a Rosetta run.
    for input_file in input_files:
        assert(not(fasta_file_contents.get(input_file)))
        if any(fnmatch(input_file, x) for x in pdb_file_wildcards):
            pdb = PDB.from_filepath(input_file, strict=True)
            pdb.pdb_id = os.path.basename(input_file).split('.')[0]            
            if pdb.pdb_id.startswith('pdb') and len(pdb.pdb_id) >= 7:
                # Hack to rename FASTA identifiers for pdb*.ent files which are present in mirrors of the PDB
                pdb.pdb_id = pdb.pdb_id[3:]  # strip only the leading 'pdb'; replace() would also mangle 'pdb' elsewhere in the ID
            fasta_file_contents[input_file] = (pdb.create_fasta(prefer_seqres_order = False), 'PDB')
        else:
            fasta_file_contents[input_file] = (read_file(input_file), 'FASTA')

    # Extract sequences from the input FASTA files.
    found_sequences, reverse_mapping, errors = get_sequences(options, fasta_file_contents)
    if found_sequences:
        reformat(found_sequences)
    if errors:
        return None, False, errors

    # Discard sequences that are the wrong chain.
    desired_sequences = {}
    for key, sequence in found_sequences.items():
        pdb_id, chain, file_name = key
        if options.chain is None or chain == options.chain:
            desired_sequences[key] = sequence

    # Create the input FASTA and script files.
    job_inputs, errors = create_inputs(options, outpath, desired_sequences)

    # Create the reverse mapping file
    if reverse_mapping:
        segment_mapping_file = os.path.join(outpath, "segment_map.json")
        colorprinter.message("Creating a reverse mapping file %s." % segment_mapping_file)
        write_file(segment_mapping_file, json.dumps(reverse_mapping))

    # Create the post-processing script file
    post_processing_script = read_file(os.path.join(os.path.split(os.path.realpath(__file__))[0], 'post_processing.py'))
    write_file(os.path.join(outpath, 'post_processing.py'), post_processing_script, 'w')

    # Create the secondary structure filter file
    if options.secondary_structure_file:
        write_file(os.path.join(outpath, 'ss_filter.json'), json.dumps({'secondary_structure_filter' : SecondaryStructureDefinition.from_filepath(options.secondary_structure_file).data}), 'w')

    return job_inputs, reverse_mapping is not None, errors
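
A minimal sketch of the PDB-vs-FASTA classification step used above (the wildcard list and file names are hypothetical; in the real module pdb_file_wildcards is defined elsewhere):

from fnmatch import fnmatch
pdb_file_wildcards = ['*.pdb', 'pdb*.ent']
for f in ['1a2k.pdb', 'pdb1abc.ent', 'query.fasta']:
    kind = 'PDB' if any(fnmatch(f, w) for w in pdb_file_wildcards) else 'FASTA'
    print(f, kind)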
Example #7
    #    align_two_simple_sequences(fasta_sequence, uniparc_sequence, sequence1name = '%s:%s|PDBID|CHAIN|SEQUENCE' % (pdb_id, c), sequence2name = uniparc_id)


# Sanity check: verify that the UniProt AC recorded in the PDB file is among the matched UniProt IDs.

print(chains)


sys.exit(0)  # scratch script: everything below only runs if this early exit is removed

px = PDBML.retrieve('1A2C', cache_dir='/home/oconchus/temp')
for k, v in sorted(px.atom_to_seqres_sequence_maps.items()):
    print(k,v)


p = PDB.from_filepath('../.testdata/1H38.pdb') # has protein, DNA, RNA
p = PDB.from_filepath('../.testdata/1ZC8.pdb')
p = PDB.from_filepath('../.testdata/4IHY.pdb')
#p = PDB('../.testdata/2GRB.pdb')
p = PDB.from_filepath('../.testdata/1J1M.pdb')
p = PDB.from_filepath('../.testdata/1H38.pdb')
p = PDB.from_filepath('../.testdata/1A2C.pdb')

#print(p.structure_lines)

colortext.message("Resolution")
print(p.get_resolution())

colortext.message("Techniques")
print(p.get_techniques())