Example #1
def get_kic_run_details(output_directory, pdb_id, loop_sets, test_mode = False):
    '''This function returns the details required to set up the analysis for the Rosetta KIC and NGK methods.'''
    details = []
    c = 0
    for sc_file in glob.glob(os.path.join(output_directory, '{0}*.sc'.format(pdb_id))):

        # Determine the id
        sc_filename = os.path.split(sc_file)[1]
        assert(sc_filename.startswith('{0}_score'.format(pdb_id)))
        run_id = int(sc_filename[10:-3])

        # Determine the score
        sc_lines = [l.strip() for l in get_file_lines(sc_file) if l.strip()]
        assert(sc_lines[0] == 'SEQUENCE:')
        assert(sc_lines[1].split()[:2] == ['SCORE:', 'total_score'])
        assert(sc_lines[2].split()[0] == 'SCORE:')
        total_score = float(sc_lines[2].split()[1])

        # Determine the filepath of the predicted structure
        associated_pdb_file = os.path.join(output_directory, '{0}_{0}{1}_0001.pdb'.format(pdb_id, run_id))

        # Extract the PDB coordinates into a pandas dataframe (HDF5 format)
        assert(os.path.exists(associated_pdb_file))
        hdf5_file = os.path.splitext(associated_pdb_file)[0] + '.hdf5'
        if os.path.exists(hdf5_file):
            store = pandas.HDFStore(hdf5_file)
            pdb_loop_residue_matrix = store['dataframe']
            store.close()
        else:
            pdb_loop_residue_matrix = PDB.extract_xyz_matrix_from_loop_json(PDB.from_filepath(associated_pdb_file).structure_lines, loop_sets, atoms_of_interest = backbone_atoms, expected_num_residues = 12, expected_num_residue_atoms = 4)
            store = pandas.HDFStore(hdf5_file)
            store['dataframe'] = pdb_loop_residue_matrix
            store.close()

        details.append(dict(
            id = run_id,
            score = total_score,
            predicted_structure = associated_pdb_file,
            pdb_loop_residue_matrix = pdb_loop_residue_matrix,
        ))
        if test_mode:
            c += 1
            if c >= 10:
                break

    return details
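
A minimal usage sketch (the paths and PDB ID are hypothetical; read_file and json are assumed to be available as in the other examples below):

loop_sets = json.loads(read_file('1abc.loop.json'))  # same loop JSON format consumed by PDB.extract_xyz_matrix_from_loop_json
runs = get_kic_run_details('/path/to/kic_output', '1abc', loop_sets, test_mode = True)
best_run = min(runs, key = lambda d: d['score'])  # lower total_score is better
print(best_run['id'], best_run['score'], best_run['predicted_structure'])
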
Example #2
def setup():
    global pdb_file_paths  # RCSB PDB_ID -> PDB file
    global rcsb_pdb_objects # RCSB PDB_ID -> PDB object
    global tina_pdb_objects # Tina's PDB_ID -> PDB object
    global tina_pdb_id_to_rcsb_pdb_id # Tina's PDB_ID -> RCSB PDB_ID
    global mutations_dataframe

    if mutations_dataframe is None:
        setup_mutations_dataframe()

    # old_mutations_csv is missing some cases but has the mapping from pdb -> partner 1 name, partner 2 name
    old_mutations_csv = os.path.join('temp', 'mutations_Gsp1_old.txt')
    assert(os.path.exists('temp'))
    assert(os.path.exists(old_mutations_csv))

    df = pandas.read_csv(old_mutations_csv, sep = '\t')

    tina_pdb_ids = sorted(set([p for p in df['pdb'].values]))
    rcsb_pdb_ids = set()
    for pdb_id in tina_pdb_ids:
        rcsb_pdb_ids.add(pdb_id[:4])
        tina_pdb_id_to_rcsb_pdb_id[pdb_id] = pdb_id[:4]
    rcsb_pdb_ids = sorted(rcsb_pdb_ids)

    assert(rcsb_pdb_ids == sorted(set([p[:4] for p in mutations_dataframe['pdb'].values])))
    rcsb_file_dir = '../../rawdata'

    for pdb_id in tina_pdb_ids:
        tina_pdb_objects[pdb_id] = PDB.from_filepath(os.path.join('temp', 'pdbs', '{0}.pdb'.format(pdb_id)), parse_ligands = True)

    for pdb_id in rcsb_pdb_ids:
        filename = '{0}.pdb'.format(pdb_id.upper())
        pdb_file_paths[pdb_id.upper()] = os.path.join(rcsb_file_dir, filename)
        pdb_contents = download_pdb(pdb_id, rcsb_file_dir, silent = True, filename = filename)
        p = PDB(pdb_contents, parse_ligands = True)
        rcsb_pdb_objects[pdb_id] = p

    print('\nRosetta files  ({0}) : {1}'.format(str(len(tina_pdb_ids)).rjust(2), ', '.join([s.rjust(5) for s in tina_pdb_ids])))
    print('Original files ({0}) : {1}\n'.format(str(len(rcsb_pdb_ids)).rjust(2), ', '.join([s.rjust(5) for s in rcsb_pdb_ids])))

    ppi_api = get_ppi_api()
    for pdb_id, pdb_file_path in pdb_file_paths.iteritems():
        existing_records = ppi_api.DDG_db.execute_select('SELECT * FROM PDBFile WHERE ID=%s', parameters=(pdb_id,))
        if existing_records:
            colortext.warning('The PDB file {0} exists in the database.'.format(pdb_id))
        complex_ids = ppi_api.search_complexes_by_pdb_id(pdb_id)

        if complex_ids:
            colortext.warning('The PDB file {0} has associated complexes: {1}'.format(pdb_id, ', '.join(map(str, complex_ids))))
    print('')
Example #3
def regenerate_mutfile(PredictionID):
    '''I needed to write this function as I forgot to add a *.mutfile mask to the ProtocolCleaner at first so mutfiles were not kept.'''
    raise Exception("We should never need to call this")

    KeepHETATMLines = False

    results = ddGdb.execute_select("SELECT ExperimentID, UserDataSetExperimentID FROM Prediction WHERE ID=%s", parameters = (PredictionID,))
    assert(len(results) == 1)
    ExperimentID = results[0]['ExperimentID']
    UserDataSetExperimentID = results[0]['UserDataSetExperimentID']

    results = ddGdb.execute_select("SELECT PDBFileID FROM UserDataSetExperiment WHERE ID=%s", parameters = (UserDataSetExperimentID,))
    assert(len(results) == 1)
    PDB_ID = results[0]['PDBFileID']

    results = ddGdb.execute_select("SELECT PDBFileID, Content FROM Experiment INNER JOIN PDBFile WHERE Experiment.PDBFileID=PDBFile.ID AND Experiment.ID=%s", parameters = (ExperimentID,))
    assert(len(results) == 1)
    experimentPDB_ID = results[0]["PDBFileID"]

    results = ddGdb.execute_select("SELECT ID, Content FROM PDBFile WHERE ID=%s", parameters=(PDB_ID))
    if len(results) != 1:
        raise colortext.Exception("The SQL query '%s' returned %d results where 1 result was expected." % (sql, len(results)))
    predictionPDB_ID = results[0]["ID"]

    # Get the related PDB ID and file
    assert(len(results) == 1)
    result = results[0]
    pdbID = result["ID"]
    contents = result["Content"]

    pdb = PDB(contents.split("\n"))

    # Check that the mutated positions exist and that the wild-type matches the PDB
    mutations = ddGdb.call_select_proc("GetMutations", parameters = (ExperimentID,))

    # todo: Hack. This should be removed when PDB homologs are dealt with properly.
    for mutation in mutations:
        if experimentPDB_ID == "1AJ3" and predictionPDB_ID == "1U5P":
            assert(int(mutation['ResidueID']) < 1000)
            mutation['ResidueID'] = str(int(mutation['ResidueID']) + 1762)

    pdb.validate_mutations(mutations)

    # Strip the PDB to the list of chains. This also renumbers residues in the PDB for Rosetta.
    chains = [result['Chain'] for result in ddGdb.call_select_proc("GetChains", parameters = (ExperimentID,))]
    pdb.stripForDDG(chains, KeepHETATMLines, numberOfModels = 1)

    # - Post stripping checks -
    # Get the 'Chain ResidueID' PDB-formatted identifier for each mutation mapped to Rosetta numbering
    # then check again that the mutated positions exist and that the wild-type matches the PDB
    remappedMutations = pdb.remapMutations(mutations, pdbID)
    remappedMutations = [[m[0], PDB.ResidueID2String(m[1]), m[2], m[3]] for m in remappedMutations]

    #resfile = self._createResfile(pdb, remappedMutations)
    return _createMutfile(pdb, remappedMutations)
Example #4
    def static_get_pdb_object(pdb_id, bio_cache = None, cache_dir = None):
        '''This method does not necessarily use a BioCache but it seems to fit here.'''
        pdb_id = pdb_id.upper()

        if bio_cache:
            return bio_cache.get_pdb_object(pdb_id)

        if cache_dir:
            # Check to see whether we have a cached copy of the PDB file
            filepath = os.path.join(cache_dir, '{0}.pdb'.format(pdb_id))
            if os.path.exists(filepath):
                return PDB.from_filepath(filepath)

        # Get any missing files from the RCSB and create cached copies if appropriate
        pdb_contents = retrieve_pdb(pdb_id)
        if cache_dir:
            write_file(os.path.join(cache_dir, "%s.pdb" % pdb_id), pdb_contents)
        return PDB(pdb_contents)
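
A hedged usage sketch (assuming this is exposed as a static method on the PDB class, as its name and signature suggest; the cache directory is hypothetical). Without a bio_cache, a cached copy is reused if present, otherwise the structure is fetched from the RCSB and written to the cache:

pdb = PDB.static_get_pdb_object('1a2p', cache_dir = '/tmp/pdb_cache')   # the ID is upper-cased internally
pdb_again = PDB.static_get_pdb_object('1A2P', cache_dir = '/tmp/pdb_cache')  # now reads the cached /tmp/pdb_cache/1A2P.pdb
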
Example #5
    def __init__(self, xml_contents, pdb_contents, acceptable_sequence_percentage_match = 70.0, cache_dir = None, domain_overlap_cutoff = 0.88, require_uniprot_residue_mapping = True, bio_cache = None, pdb_id = None):
        ''' The PDB contents should be passed so that we can deal with HETATM records as the XML does not contain the necessary information.
            If require_uniprot_residue_mapping is set and there is no PDB residue -> UniProt sequence index mapping (e.g. 2IMM at the time of writing) then we raise an exception.
            Otherwise, we store the information we can which can still be useful e.g. SCOP domain data.
            bio_cache should be a klab.bio.cache.py::BioCache object and is used to avoid reading/downloading cached files repeatedly.
        '''

        self.atom_to_uniparc_sequence_maps = {} # PDB Chain -> PDBUniParcSequenceMap(PDB ResidueID -> (UniParc ID, UniParc sequence index)) where the UniParc sequence index is 1-based (first element has index 1)

        # Note: These maps map from PDB residue IDs to PDBe residue IDs
        self.atom_to_seqres_sequence_maps = {} # PDB Chain -> SequenceMap(PDB ResidueID -> SEQRES sequence index) where the SEQRES sequence index is 1-based (first element has index 1)

        self.seqres_to_uniparc_sequence_maps = {} # PDB Chain -> PDBUniParcSequenceMap(SEQRES index -> (UniParc ID, UniParc sequence index)) where the SEQRES index and UniParc sequence index is 1-based (first element has index 1)
        self.counters = {}
        self.pdb_id = pdb_id
        self.bio_cache = bio_cache
        self.acceptable_sequence_percentage_match = acceptable_sequence_percentage_match
        self.tag_data = []
        self.cache_dir = cache_dir
        self.uniparc_sequences = {}
        self.uniparc_objects = {}
        self.pdb_chain_to_uniparc_id_map = {}
        self.region_mapping = {}
        self.region_map_coordinate_systems = {}
        self.domain_overlap_cutoff = domain_overlap_cutoff # the percentage (measured in the range [0, 1.0]) at which we consider two domains to be the same e.g. if a Pfam domain of length 60 overlaps with a SCOP domain on 54 residues then the overlap would be 54/60 = 0.9
        self.require_uniprot_residue_mapping = require_uniprot_residue_mapping
        self.xml_contents = xml_contents

        if bio_cache and pdb_id:
            self.modified_residues = bio_cache.get_pdb_object(pdb_id).modified_residues
        else:
            self.modified_residues = PDB(pdb_contents).modified_residues

        self._STACK = []                        # This is used to create a simple FSA for the parsing
        self.current_residue = None
        self.residues = []
        self.reading_unobserved_property = False
        self.uniparc_ids = set()

        assert(0 <= acceptable_sequence_percentage_match <= 100)
        assert(xml_contents.find("encoding='UTF-8'") != -1)
Example #6
def main(FixedIDs = [], radii = [6.0, 7.0, 8.0, 9.0]):
    max_processors = get_number_of_processors()

    rescore_process_file = "/tmp/klab_rescore.txt"
    parser = OptionParser()
    parser.add_option("-n", "--numprocesses", default=1, type='int', dest="num_processes", help="The number of processes used for the rescoring. The cases are split according to this number.", metavar="NUM_PROCESSES")
    parser.add_option("-p", "--process", default=1, type='int', dest="process", help="The ID of this process. This should be an integer between 1 and the number of processes used for the rescoring.", metavar="PROCESS_ID")
    parser.add_option("-d", "--delete",  action="store_true", dest="delete", help="Delete the process tracking file %s." % rescore_process_file)
    parser.add_option("-s", "--set",  type='string', dest="prediction_set", help="The prediction set to rescore.")
    (options, args) = parser.parse_args()

    if options.delete and os.path.exists(rescore_process_file):
        print("Removing %s." % rescore_process_file)
        os.remove(rescore_process_file)

    num_processes = options.num_processes
    prediction_set = options.prediction_set
    process_id = options.process

    for i in FixedIDs:
        assert(type(i) == type(1))

    # SELECT * FROM `Prediction` WHERE `PredictionSet`= 'RosCon2013_P16_score12prime'  AND Status='done' LIMIT 1
    # Check prediction set
    if not prediction_set:
        raise colortext.Exception("A prediction set must be specified.")
    else:
        if FixedIDs:
            results = ddGdb.execute("SELECT DISTINCT PredictionSet FROM Prediction WHERE ID IN (%s)" % ",".join(map(str, FixedIDs)))
            if len(results) != 1:
                raise colortext.Exception("Error: The fixed IDs cover %d different prediction sets." % len(results))
        else:
            results = ddGdb.execute("SELECT ID FROM PredictionSet WHERE ID=%s", parameters=(prediction_set,))
        if not results:
            raise colortext.Exception("The prediction set '%s' does not exist in the database." % prediction_set)

    if num_processes < 1:
        raise colortext.Exception("At least 1 processor must be used.")
    if num_processes > max_processors:
        raise colortext.Exception("Only %d processors/cores were detected. Cannot run with %d processes." % (max_processors, num_processes))
    if num_processes > (max_processors * 0.75):
        colortext.warning("Warning: Using %d processors/cores out of %d which is %0.2f%% of the total available." % (num_processes, max_processors, (100.0*float(num_processes)/float(max_processors))))
    if not(1 <= process_id <= min(max_processors, num_processes)):
        raise colortext.Exception("The process ID %d must be between 1 and the number of processes, %d." % (process_id, num_processes))

    if os.path.exists(rescore_process_file):
        lines = readFileLines(rescore_process_file)
        idx = lines[0].find("numprocesses")
        if idx == -1:
            raise Exception("Badly formatted %s." % rescore_process_file)
        existing_num_processes = int(lines[0][idx+len("numprocesses"):])
        if existing_num_processes != num_processes:
            raise colortext.Exception("You specified the number of processes to be %d but %s already specifies it as %d." % (num_processes, rescore_process_file, existing_num_processes))
        for line in [line for line in lines[1:] if line.strip()]:
            idx = line.find("process")
            if idx == -1:
                raise colortext.Exception("Badly formatted %s. Line is '%s'." % (rescore_process_file, line))
            existing_process = int(line[idx+len('process'):])
            if process_id == existing_process:
                raise colortext.Exception("Process %d is already logged as running. Check if this is so and edit %s." % (process_id, rescore_process_file))
        F = open(rescore_process_file, 'a')
        F.write("process %d\n" % process_id)
        F.close()
    else:
        F = open(rescore_process_file, 'w')
        F.write("numprocesses %d\n" % num_processes)
        F.write("process %d\n" % process_id)
        F.close()
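        # The tracking file therefore looks like, e.g.:
        #   numprocesses 4
        #   process 1
        #   process 2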

    output_dir = os.path.join('rescoring', str(process_id))
    if not(os.path.exists(output_dir)):
        os.makedirs(output_dir)
    abs_output_dir = os.path.abspath(os.path.join(os.getcwd(), output_dir))
    print("Running process in %s.\n" % abs_output_dir)

    ReallyFixedIDs = False

    results = ddGdb.execute("SELECT ID, ExperimentID, Scores FROM Prediction WHERE PredictionSet=%s AND Status='done' AND ScoreVersion <> %s", parameters=(prediction_set, float(current_score_revision),))
    if not(FixedIDs) and results:
        raise WrongScoreRevisionException("Score versions found which are not %s. Need to update table structure." % current_score_revision)
    else:
        # Hacky way to run multiple processes
        if ReallyFixedIDs:
            num_to_score = len(remaining_unscored)
            num_for_this_to_score = num_to_score / num_processes
            IDs_to_score = remaining_unscored[(process_id-1) * num_for_this_to_score : (process_id) * num_for_this_to_score]
            results = ddGdb.execute("SELECT ID, ExperimentID, Scores, UserDataSetExperimentID FROM Prediction WHERE ID IN (%s)" % (",".join(map(str, IDs_to_score))))
        elif FixedIDs:
            results = ddGdb.execute("SELECT ID, ExperimentID, Scores, UserDataSetExperimentID FROM Prediction WHERE ID IN (%s) AND MOD(ID,%s)=%s" % (",".join(map(str, FixedIDs)), num_processes,process_id-1))
        else:
            results = ddGdb.execute("SELECT ID, ExperimentID, Scores, UserDataSetExperimentID FROM Prediction WHERE PredictionSet=%s AND Status='done' AND ScoreVersion=%s AND MOD(ID,%s)=%s", parameters=(prediction_set, float(current_score_revision),num_processes,process_id-1))

    count = 0
    cases_computed = 0
    total_time_in_secs = 0

    number_of_cases_left = len(results) * len(radii)

    failed_cases = []
    colortext.printf("Rescoring %d predictions over %d radii...\n" % (len(results), len(radii)), 'lightgreen')
    for r in results:
        t = Timer()
        t.add('Preamble')
        inner_count = 0

        mutations = ddGdb.execute('SELECT * FROM ExperimentMutation WHERE ExperimentID=%s', parameters=(r['ExperimentID'],))
        mutation_str = ', '.join(['%s %s%s%s' % (m['Chain'], m['WildTypeAA'], m['ResidueID'], m['MutantAA']) for m in mutations])
        extracted_data = False

        details = ddGdb.execute_select('SELECT Prediction.ID, PDBFileID, Chain FROM Prediction INNER JOIN Experiment ON Prediction.ExperimentID=Experiment.ID INNER JOIN ExperimentChain ON Prediction.ExperimentID=ExperimentChain.ExperimentID WHERE Prediction.ID=%s', parameters=(r['ID'],))
        colortext.message("Prediction: %d, %s chain %s. Mutations: %s. Experiment ID #%d. UserDataSetExperimentID #%d." % (details[0]['ID'], details[0]['PDBFileID'], details[0]['Chain'], mutation_str, r['ExperimentID'], r['UserDataSetExperimentID']))

        experiment_pdbID = ddGdb.execute('SELECT PDBFileID FROM Experiment WHERE ID=%s', parameters=(r['ExperimentID'],))[0]['PDBFileID']
        print('Experiment PDB file ID = %s' % experiment_pdbID)
        pdbID = ddGdb.execute('SELECT UserDataSetExperiment.PDBFileID FROM Prediction INNER JOIN UserDataSetExperiment ON UserDataSetExperimentID=UserDataSetExperiment.ID WHERE Prediction.ID=%s', parameters=(r['ID'],))[0]['PDBFileID']
        print('UserDataSetExperiment PDB file ID = %s' % pdbID)

        count += 1
        if True:#len(mutations) == 1:
            timestart = time.time()

            #mutation = mutations[0]
            dbchains = sorted(set([mutation['Chain'] for mutation in mutations]))
            # todo: note: assuming monomeric structures here
            assert(len(dbchains) == 1)
            dbchain = dbchains[0]
            #mutantaa = mutation['MutantAA']

            ddG_dict = json.loads(r['Scores'])
            kellogg_ddG = ddG_dict['data']['kellogg']['total']['ddG']

            #assert(ddG_dict['version'] == current_score_revision)

            all_done = True
            for radius in radii:
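                # e.g. radius 6.0 yields the score key 'noah_6,0A'; the '.' is replaced with ',', presumably to avoid dots in stored keys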
                score_name = ('noah_%0.1fA' % radius).replace(".", ",")
                if not(ddG_dict['data'].get(score_name)):
                    all_done = False
                else:
                    cases_computed += 1
                    number_of_cases_left -= 1
            if all_done:
                print('Prediction %d: done.' % r["ID"])
                continue

            # Extract data
            t.add('Grab data')
            #archivefile = None
            #prediction_data_path = ddGdb.execute('SELECT Value FROM _DBCONSTANTS WHERE VariableName="PredictionDataPath"')[0]['Value']
            #job_data_path = os.path.join(prediction_data_path, '%d.zip' % r['ID'])
            #print(job_data_path)
            #assert(os.path.exists(job_data_path))
            #archivefile = readBinaryFile(job_data_path)
            archivefile = DDG_interface.getData(r['ID'])
            zipfilename = os.path.join(output_dir, "%d.zip" % r['ID'])
            F = open(zipfilename, "wb")
            F.write(archivefile)
            F.close()

            t.add('Extract data')
            zipped_content = zipfile.ZipFile(zipfilename, 'r', zipfile.ZIP_DEFLATED)
            tmpdir = None
            repacked_files = []
            mutant_files = []

            rosetta_resids = []
            try:
                tmpdir = makeTemp755Directory(output_dir)
                highestIndex = -1
                foundResfile = False
                foundMutfile = False

                presumed_mutation = None
                for fname in sorted(zipped_content.namelist()):
                    if fname.endswith(".pdb"):
                        if fname.startswith("%s/mut_" % r['ID']) or fname.startswith("%s/repacked_" % r['ID']):
                            structnum = int(fname[fname.rindex('_')+1:-4])
                            if fname.startswith("%s/mut_" % r['ID']):
                                if presumed_mutation:
                                    assert(presumed_mutation == os.path.split(fname)[1].split('_')[1])
                                else:
                                    presumed_mutation = os.path.split(fname)[1].split('_')[1]
                                newfname = 'mutant_%02d' % structnum
                            if fname.startswith("%s/repacked_" % r['ID']):
                                newfname = 'repacked_%02d' % structnum
                            highestIndex = max(highestIndex, structnum)

                            newfilepath = os.path.join(tmpdir, newfname)
                            writeFile(newfilepath, zipped_content.read(fname))

                            if fname.startswith("%s/mut_" % r['ID']):
                                mutant_files.append(newfilepath)
                            if fname.startswith("%s/repacked_" % r['ID']):
                                repacked_files.append(newfilepath)
                        #elif fname.startswith("%s/%s-%s" % (r['ID'],r['ExperimentID'],pdbID)) or fname.startswith("%s/repacked_" % r['ID']):
                        #    writeFile(os.path.join(tmpdir, '%s.pdb' % pdbID), zipped_content.read(fname))
                    if fname.startswith("%s/%s-%s.resfile" % (r['ID'],r['ExperimentID'],experiment_pdbID)):
                        raise Exception('This case needs to be updated (see the mutfile section below). We mainly use mutfiles now so I did not update this section.')
                        foundResfile = True
                        lines = zipped_content.read(fname).split("\n")
                        assert(len(lines) == 3)
                        assert(lines[0] == "NATAA")
                        assert(lines[1] == "start")
                        resfile_mutation = lines[2].split(" ")
                        assert(len(resfile_mutation) == 4)
                        rosetta_resid = resfile_mutation[0]
                        rosetta_chain = resfile_mutation[1]
                        rosetta_mutaa = resfile_mutation[3]
                        assert(mutantaa == rosetta_mutaa)
                        assert(dbchain == rosetta_chain)
                        assert(resfile_mutation[2] == 'PIKAA')
                        assert(len(rosetta_mutaa) == 1)
                    if fname.startswith("%s/%s-%s.mutfile" % (r['ID'],r['ExperimentID'],experiment_pdbID)):
                        foundMutfile = True
                        lines = zipped_content.read(fname).split("\n")
                        assert(lines[0].startswith('total '))
                        num_mutations = int(lines[0][6:])
                        assert(lines[1] == str(num_mutations))
                        # todo: note: assuming monomeric structures here
                        rosetta_chain = ddGdb.execute("SELECT Chain FROM ExperimentChain WHERE ExperimentID=%s", parameters=(r['ExperimentID'],))
                        assert(len(rosetta_chain) == 1)
                        rosetta_chain = rosetta_chain[0]['Chain']

                        resfile_mutations = lines[2:]
                        for resfile_mutation in resfile_mutations:
                            resfile_mutation = resfile_mutation.split(" ")
                            assert(len(resfile_mutation) == 3)
                            rosetta_resids.append(resfile_mutation[1])
                            rosetta_mutaa = resfile_mutation[2]
                            assert(dbchain == rosetta_chain)
                            assert(len(rosetta_mutaa) == 1)

                # Make sure the wtaa->mutantaa types match the structures
                assert(not(foundResfile))
                if not foundMutfile:
                    raise Exception('This case needs to be updated (see the mutfile section below). This was added as a hack for cases where I did not store the mutfile so I did not update this section.')
                    input_files = ddGdb.execute_select('SELECT InputFiles FROM Prediction WHERE ID=%s', parameters=(r['ID'],))
                    assert(len(input_files) == 1)
                    lines = pickle.loads(input_files[0]['InputFiles'])['MUTFILE'].split("\n")

                    #lines = regenerate_mutfile(r['ID']).split("\n")
                    assert(len(lines) == 3)
                    assert(lines[0] == "total 1")
                    assert(lines[1] == "1")
                    resfile_mutation = lines[2].split(" ")
                    assert(len(resfile_mutation) == 3)
                    rosetta_resid = resfile_mutation[1]
                    rosetta_chain = ddGdb.execute("SELECT Chain FROM ExperimentChain WHERE ExperimentID=%s", parameters=(r['ExperimentID'],))
                    assert(len(rosetta_chain) == 1)
                    rosetta_chain = rosetta_chain[0]['Chain']
                    rosetta_mutaa = resfile_mutation[2]
                    assert(dbchain == rosetta_chain)
                    assert(len(rosetta_mutaa) == 1)
                    assert("%s%s%s" % (resfile_mutation[0], resfile_mutation[1], resfile_mutation[2]) == presumed_mutation)

                fullresids = []

                for rosetta_resid in rosetta_resids:
                    fullresid = None
                    if rosetta_resid.isdigit():
                        fullresid = '%s%s%s ' % (rosetta_chain, (4-len(rosetta_resid)) * ' ', rosetta_resid)
                    else:
                        assert(False)
                        fullresid = '%s%s%s' % (rosetta_chain, (5-len(rosetta_resid)) * ' ', rosetta_resid)
                    fullresids.append(fullresid)


                resultst1 = ddGdb.execute_select("SELECT ExperimentID, UserDataSetExperimentID FROM Prediction WHERE ID=%s", parameters = (r['ID'],))
                assert(len(resultst1) == 1)
                ExperimentIDt1 = resultst1[0]['ExperimentID']
                UserDataSetExperimentIDt1 = resultst1[0]['UserDataSetExperimentID']

                if UserDataSetExperimentIDt1:
                    resultst2 = ddGdb.execute_select("SELECT PDBFileID FROM UserDataSetExperiment WHERE ID=%s", parameters = (UserDataSetExperimentIDt1,))
                else:
                    resultst2 = ddGdb.execute_select("SELECT PDBFileID FROM Experiment WHERE ID=%s", parameters = (ExperimentIDt1,))
                assert(len(resultst2) == 1)

                prediction_PDB_ID = resultst2[0]['PDBFileID']

                if False and prediction_PDB_ID not in ['1TEN', '1AYE', '1H7M'] + ['1A2P', '1BNI', '1STN']:
                    for fullresid in fullresids:
                        wtaa = None
                        for m in mutations:
                            # Hack for ub_RPN13
                            if prediction_PDB_ID == 'ub_RPN13' and m['Chain'] == fullresid[0] and m['ResidueID'] == str(int(fullresid[1:].strip()) - 109):
                                wtaa = m['WildTypeAA']
                            # Hack for ub_RPN13_yeast
                            elif prediction_PDB_ID == 'uby_RPN13' and m['Chain'] == fullresid[0] and m['ResidueID'] == str(int(fullresid[1:].strip()) - 109):
                                wtaa = m['WildTypeAA']
                            # Hack for ub_OTU
                            elif prediction_PDB_ID == 'ub_OTU' and m['Chain'] == fullresid[0] and m['ResidueID'] == str(int(fullresid[1:].strip()) - 172):
                                wtaa = m['WildTypeAA']
                            # Hack for ub_OTU_yeast
                            elif prediction_PDB_ID == 'uby_OTU' and m['Chain'] == fullresid[0] and m['ResidueID'] == str(int(fullresid[1:].strip()) - 172):
                                wtaa = m['WildTypeAA']
                            # Hack for ub_UQcon
                            elif prediction_PDB_ID == 'ub_UQcon' and m['Chain'] == fullresid[0] and m['ResidueID'] == str(int(fullresid[1:].strip()) + 213): # starts at 501
                                wtaa = m['WildTypeAA']
                            # Hack for uby_UQcon
                            elif prediction_PDB_ID == 'uby_UQcon' and m['Chain'] == fullresid[0] and m['ResidueID'] == str(int(fullresid[1:].strip()) - 287):
                                wtaa = m['WildTypeAA']
                            elif m['Chain'] == fullresid[0] and m['ResidueID'] == fullresid[1:].strip():
                                wtaa = m['WildTypeAA']
                        if (wtaa == None):
                            colortext.error(prediction_PDB_ID)
                            colortext.error('wtaa == None')
                            colortext.error('fullresid = %s' % str(fullresid))
                            colortext.error(str(mutations))
                            colortext.warning([rosetta_resid.strip() for rosetta_resid in rosetta_resids])
                            #sys.exit(0)
                        assert(wtaa != None)
                        assert(PDB.from_filepath(repacked_files[0]).get_residue_id_to_type_map()[fullresid] == wtaa)
                    #assert(PDB(mutant_files[0]).get_residue_id_to_type_map()[fullresid] == mutantaa)

                for radius in radii:
                    score_name = ('noah_%0.1fA' % radius).replace(".", ",")

                    if ddG_dict['data'].get(score_name):
                        print('Radius %0.1f: done.' % radius)
                        continue
                    cases_computed += 1
                    number_of_cases_left -= 1

                    t.add('Radius %0.3f: repacked' % radius)
                    colortext.printf("Prediction ID: %d. Calculating radius %0.1f. Calculation #%d of %d." % (r['ID'], radius, cases_computed, len(results) * len(radii)), 'orange')

                    repacked_score = NoahScore()
                    repacked_score.calculate(repacked_files, rosetta_chain, sorted([rosetta_resid.strip() for rosetta_resid in rosetta_resids]), radius = radius)
                    colortext.message("Repacked")
                    print(repacked_score)

                    t.add('Radius %0.3f: mutant' % radius)
                    mutant_score = NoahScore()
                    mutant_score.calculate(mutant_files, rosetta_chain, sorted([rosetta_resid.strip() for rosetta_resid in rosetta_resids]), radius = radius)
                    colortext.printf("Mutant", color = 'cyan')
                    print(mutant_score)

                    t.add('Radius %0.3f: postamble' % radius)
                    colortext.printf("ddG", color = 'lightpurple')
                    ddg_score = repacked_score.ddg(mutant_score)
                    print(ddg_score)

                    colortext.printf("Liz's ddG", color = 'yellow')
                    print("Total score: %0.3f" % kellogg_ddG)

                    ddG_dict['version'] = '0.23'
                    if ddG_dict['version'] == '0.1':
                        ddG_dict['version'] = '0.21'
                        ddG_dict['data'] = {
                            'kellogg' : {
                                'total' : ddG_dict['data'],
                            },
                            'noah': {
                                'total' : {'ddG' : ddg_score.total},
                                'positional' : {'ddG' : ddg_score.positional},
                                'positional_twoscore' : {'ddG' : ddg_score.positional_twoscore},
                            },
                        }
                    elif ddG_dict['version'] == '0.2':
                        ddG_dict['version'] = '0.21'
                        ddG_dict['data']['noah']['total']['ddG'] = ddg_score.total
                        ddG_dict['data']['noah']['positional']['ddG'] = ddg_score.positional
                        ddG_dict['data']['noah']['positional_twoscore']['ddG'] = ddg_score.positional_twoscore
                    elif ddG_dict['version'] == '0.22':
                        ddG_dict['data'][score_name] = {'total' : {}, 'positional' : {}, 'positional_twoscore' : {}}
                        ddG_dict['data'][score_name]['total']['ddG'] = ddg_score.total
                        ddG_dict['data'][score_name]['positional']['ddG'] = ddg_score.positional
                        ddG_dict['data'][score_name]['positional_twoscore']['ddG'] = ddg_score.positional_twoscore
                    elif ddG_dict['version'] == '0.23':
                        ddG_dict['data'][score_name] = {'total' : {}, 'positional' : {}, 'positional_twoscore' : {}}
                        ddG_dict['data'][score_name]['total']['ddG'] = ddg_score.total
                        ddG_dict['data'][score_name]['positional']['ddG'] = ddg_score.positional
                        ddG_dict['data'][score_name]['positional_twoscore']['ddG'] = ddg_score.positional_twoscore

                    jsonified_ddG = json.dumps(ddG_dict)
                    ddGdb.execute('UPDATE Prediction SET Scores=%s WHERE ID=%s', parameters=(jsonified_ddG, r['ID'],))
                t.add('Cleanup')
                shutil.rmtree(tmpdir)
                os.remove(zipfilename)

            except Exception, e:
                print("Exception! In prediction %d" % r['ID'], str(e))
                failed_cases.append(r['ID'])
                import traceback
                print(traceback.format_exc())
                if tmpdir:
                    shutil.rmtree(tmpdir)

            total_time_in_secs += t.sum()
            average_time_taken = float(total_time_in_secs)/float(cases_computed or 1)
            estimate_remaining_time = number_of_cases_left * average_time_taken

            t.stop()
            colortext.printf("**Profile**", 'orange')
            print(t)
            colortext.message("Time taken for this case: %0.2fs." % t.sum())
            colortext.message("Average time taken per case: %0.2fs." % average_time_taken)
            colortext.message("Estimated time remaining: %dh%dm%ds." % (int(estimate_remaining_time/3600), int((estimate_remaining_time/60) % 60), estimate_remaining_time % 60))
            print("\n")
Example #7
            pdb_prefix = os.path.splitext(os.path.split(pdb_file)[1])[0].lower()
            file_prefix = os.path.splitext(pdb_file)[0]
            fasta_file = file_prefix + '.fasta'
            loop_file = file_prefix + '.loop.json'
            assert (os.path.exists(fasta_file))
            assert (os.path.exists(loop_file))

            # Convert the FASTA headers back into PDB residue IDs
            fasta_contents = read_file(fasta_file)
            headers = [
                l for l in fasta_contents.split('\n') if l.startswith('>')
            ]
            assert (len(headers) == 1)
            header = headers[0]
            pdb_residue_ids = [
                PDB.ChainResidueID2String(l[0], l[1:])
                for l in header[header.find('Residues ') + 9:].split(';')
            ]

            # Add the missing atoms back into the PDB file
            spackler = Spackler.from_filepath(pdb_file)
            new_pdb_content = spackler.add_backbone_atoms_linearly_from_loop_filepaths(
                loop_file, fasta_file, pdb_residue_ids)
            write_file(
                os.path.join(output_directory, '{0}.pdb'.format(pdb_prefix)),
                new_pdb_content)

            # Create a Rosetta .loop file
            loop_set = json.loads(read_file(loop_file)).get('LoopSet')
            assert (len(loop_set) == 1)
            start_res = '{chainID}{resSeq:>4d}{iCode}'.format(
Example #8
class SIFTS(xml.sax.handler.ContentHandler):


    def __init__(self, xml_contents, pdb_contents, acceptable_sequence_percentage_match = 70.0, cache_dir = None, domain_overlap_cutoff = 0.88, require_uniprot_residue_mapping = True, bio_cache = None, pdb_id = None):
        ''' The PDB contents should be passed so that we can deal with HETATM records as the XML does not contain the necessary information.
            If require_uniprot_residue_mapping is set and there is no PDB residue -> UniProt sequence index mapping (e.g. 2IMM at the time of writing) then we raise an exception.
            Otherwise, we store the information we can which can still be useful e.g. SCOP domain data.
            bio_cache should be a klab.bio.cache.py::BioCache object and is used to avoid reading/downloading cached files repeatedly.
        '''

        self.atom_to_uniparc_sequence_maps = {} # PDB Chain -> PDBUniParcSequenceMap(PDB ResidueID -> (UniParc ID, UniParc sequence index)) where the UniParc sequence index is 1-based (first element has index 1)

        # Note: These maps map from PDB residue IDs to PDBe residue IDs
        self.atom_to_seqres_sequence_maps = {} # PDB Chain -> SequenceMap(PDB ResidueID -> SEQRES sequence index) where the SEQRES sequence index is 1-based (first element has index 1)

        self.seqres_to_uniparc_sequence_maps = {} # PDB Chain -> PDBUniParcSequenceMap(SEQRES index -> (UniParc ID, UniParc sequence index)) where the SEQRES index and UniParc sequence index is 1-based (first element has index 1)
        self.counters = {}
        self.pdb_id = pdb_id
        self.bio_cache = bio_cache
        self.acceptable_sequence_percentage_match = acceptable_sequence_percentage_match
        self.tag_data = []
        self.cache_dir = cache_dir
        self.uniparc_sequences = {}
        self.uniparc_objects = {}
        self.pdb_chain_to_uniparc_id_map = {}
        self.region_mapping = {}
        self.region_map_coordinate_systems = {}
        self.domain_overlap_cutoff = domain_overlap_cutoff # the percentage (measured in the range [0, 1.0]) at which we consider two domains to be the same e.g. if a Pfam domain of length 60 overlaps with a SCOP domain on 54 residues then the overlap would be 54/60 = 0.9
        self.require_uniprot_residue_mapping = require_uniprot_residue_mapping
        self.xml_contents = xml_contents

        if bio_cache and pdb_id:
            self.modified_residues = bio_cache.get_pdb_object(pdb_id).modified_residues
        else:
            self.modified_residues = PDB(pdb_contents).modified_residues

        self._STACK = []                        # This is used to create a simple FSA for the parsing
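        # Stack levels pushed during parsing (see start_element): 0 = entity, 1 = segment,
        # 2 = listResidue / listMapRegion, 3 = residue / mapRegion, 4 = residueDetail.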
        self.current_residue = None
        self.residues = []
        self.reading_unobserved_property = False
        self.uniparc_ids = set()

        assert(0 <= acceptable_sequence_percentage_match <= 100)
        assert(xml_contents.find("encoding='UTF-8'") != -1)


    def get_pdb_chain_to_uniparc_id_map(self):
        if self.pdb_chain_to_uniparc_id_map:
            return self.pdb_chain_to_uniparc_id_map
        else:
            self.pdb_chain_to_uniparc_id_map = {}

            for c, mp in self.atom_to_uniparc_sequence_maps.iteritems():
                self.pdb_chain_to_uniparc_id_map[c] = self.pdb_chain_to_uniparc_id_map.get(c, set())
                for _, v, _ in mp:
                    self.pdb_chain_to_uniparc_id_map[c].add(v[0])

            for c, mp in self.seqres_to_uniparc_sequence_maps.iteritems():
                self.pdb_chain_to_uniparc_id_map[c] = self.pdb_chain_to_uniparc_id_map.get(c, set())
                for _, v, _ in mp:
                    self.pdb_chain_to_uniparc_id_map[c].add(v[0])

            for c, s in self.pdb_chain_to_uniparc_id_map.iteritems():
                self.pdb_chain_to_uniparc_id_map[c] = sorted(s)

            return self.pdb_chain_to_uniparc_id_map

    def get_uniparc_sequences(self):
        if self.uniparc_sequences:
            return self.uniparc_sequences
        else:
            self.uniparc_sequences = {}
            self.uniparc_objects = {}
            for UniParcID in self.uniparc_ids:
                entry = UniParcEntry(UniParcID, cache_dir = self.cache_dir)
                self.uniparc_sequences[entry.UniParcID] = Sequence.from_sequence(entry.UniParcID, entry.sequence)
                self.uniparc_objects[entry.UniParcID] = entry
            return self.uniparc_sequences


    @staticmethod
    def retrieve(pdb_id, cache_dir = None, acceptable_sequence_percentage_match = 70.0, require_uniprot_residue_mapping = True, bio_cache = None):
        '''Creates a SIFTS object by using a cached copy of the files if they exist or by retrieving the files from the RCSB.
           bio_cache should be a klab.bio.cache.py::BioCache object and is used to avoid reading/downloading cached files repeatedly.
        '''

        pdb_contents = None
        xml_contents = None
        pdb_id = pdb_id.upper()

        l_pdb_id = pdb_id.lower()

        if len(pdb_id) != 4 or not pdb_id.isalnum():
            raise Exception("Bad PDB identifier '%s'." % pdb_id)

        if bio_cache:
            pdb_contents = bio_cache.get_pdb_contents(pdb_id)
            xml_contents = bio_cache.get_sifts_xml_contents(pdb_id)

        if cache_dir:
            if not pdb_contents:
                # Check to see whether we have a cached copy of the PDB file
                filename = os.path.join(cache_dir, "%s.pdb" % pdb_id)
                if os.path.exists(filename):
                    pdb_contents = read_file(filename)

            if not xml_contents:
                # Check to see whether we have a cached copy of the XML file
                filename = os.path.join(cache_dir, "%s.sifts.xml.gz" % l_pdb_id)
                if os.path.exists(filename):
                    xml_contents = read_file(filename)

        # Get any missing files from the RCSB and create cached copies if appropriate
        if not pdb_contents:
            pdb_contents = rcsb.retrieve_pdb(pdb_id)
            if cache_dir:
                write_file(os.path.join(cache_dir, "%s.pdb" % pdb_id), pdb_contents)

        if not xml_contents:
            try:
                xml_contents = retrieve_xml(pdb_id, silent = False)
                if cache_dir:
                    write_file(os.path.join(cache_dir, "%s.sifts.xml.gz" % l_pdb_id), xml_contents)
            except FTPException550:
                raise MissingSIFTSRecord('The file "%s.sifts.xml.gz" could not be found on the EBI FTP server.' % l_pdb_id)

        # Return the object
        handler = SIFTS(xml_contents, pdb_contents, acceptable_sequence_percentage_match = acceptable_sequence_percentage_match, cache_dir = cache_dir, require_uniprot_residue_mapping = require_uniprot_residue_mapping, bio_cache = bio_cache, pdb_id = pdb_id)
        xml.sax.parseString(xml_contents, handler)
        return handler
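
    # A hedged usage sketch (the cache directory path is hypothetical):
    #     sifts = SIFTS.retrieve('1A2P', cache_dir = '/tmp/sifts_cache')
    #     chain_map = sifts.get_pdb_chain_to_uniparc_id_map()     # PDB chain ID -> sorted list of UniParc IDs
    #     atom_map = sifts.atom_to_seqres_sequence_maps.get('A')  # PDB residue ID -> SEQRES index for chain 'A', if that chain exists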


    def stack_push(self, lvl, data):
        if lvl == 0:
            assert(not(self._STACK))
        else:
            assert(self._STACK and (len(self._STACK) == lvl))
            for x in range(lvl):
                assert(self._STACK[x][0] == x)

        self._STACK.append((lvl, data))


    def stack_pop(self, lvl):
        num_levels = lvl + 1
        assert(self._STACK and (len(self._STACK) == num_levels))
        for x in range(num_levels):
            assert(self._STACK[x][0] == x)
        self._STACK.pop()
        if lvl == 0:
            assert(not(self._STACK))


    def check_stack(self, lvl):
        assert(self._STACK and (len(self._STACK) == lvl))
        for x in range(lvl):
            assert(self._STACK[x][0] == x)


    def start_document(self):
        '''"The SAX parser will invoke this method only once, before any other methods in this interface or in DTDHandler (except for setDocumentLocator())."'''
        pass

    a='''
<entity type="protein" entityId="A">
    <segment segId="1aqt_A_1_2" start="1" end="2">
      <listResidue>

<listMapRegion>
        <mapRegion start="3" end="138">
          <db dbSource="PDB" dbCoordSys="PDBresnum" dbAccessionId="1aqt" dbChainId="A" start="3" end="138"/>
        </mapRegion>'''



    def add_region_mapping(self, attributes):
        chain_id = (self._get_current_PDBe_chain())
        mapRegion_attributes = self._STACK[3][1]
        segment_range = (int(mapRegion_attributes['start']), int(mapRegion_attributes['end']))
        dbSource = attributes['dbSource']
        dbAccessionId = attributes['dbAccessionId']
        self.region_mapping[chain_id] = self.region_mapping.get(chain_id, {})
        self.region_mapping[chain_id][dbSource] = self.region_mapping[chain_id].get(dbSource, {})
        self.region_mapping[chain_id][dbSource][dbAccessionId] = self.region_mapping[chain_id][dbSource].get(dbAccessionId, [])
        self.region_mapping[chain_id][dbSource][dbAccessionId].append(segment_range)
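        # Resulting shape: self.region_mapping[chain_id][dbSource][dbAccessionId] is a list of (start, end) residue-range tuples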

        # Note: I do not currently store the coordinate system type on a range level since I am assuming that each mapping uses one coordinate system
        if attributes.get('dbCoordSys'):
            self.region_map_coordinate_systems[dbSource] = self.region_map_coordinate_systems.get(dbSource, set())
            self.region_map_coordinate_systems[dbSource].add(attributes['dbCoordSys'])


    def start_element(self, name, attributes):
        self.tag_data = ''

        # Residue details and mappings

        if name == 'crossRefDb':
            self.start_crossRefDb(attributes)

        elif name == 'residueDetail':
            self.stack_push(4, None)
            self.start_residueDetail(attributes)

        elif name == 'residue':
            self.stack_push(3, None)
            assert(attributes.get('dbSource'))
            assert(attributes.get('dbCoordSys'))
            assert(attributes.get('dbResNum'))
            assert(attributes.get('dbResName'))
            assert(attributes['dbSource'] == 'PDBe')
            assert(attributes['dbCoordSys'] == 'PDBe')
            self.current_residue = SIFTSResidue(self._get_current_PDBe_chain(), attributes['dbResNum'], attributes['dbResName'])

        elif name == 'listResidue':
            self.stack_push(2, None)

        # Region mappings

        elif name == 'db':
            if len(self._STACK) == 4 and self._STACK[3][1].get('nodeType') == 'mapRegion':
                assert(attributes.get('dbSource'))
                assert(attributes.get('dbAccessionId'))
                self.add_region_mapping(attributes)

        elif name == 'mapRegion':
            assert(attributes.get('start'))
            assert(attributes.get('end'))
            self.stack_push(3, dict(start=attributes['start'], end=attributes['end'], nodeType = 'mapRegion'))

        elif name == 'listMapRegion':
            self.stack_push(2, None)

        # Entities and segments

        elif name == 'segment':
            assert(attributes.get('segId'))
            assert(attributes.get('start'))
            assert(attributes.get('end'))
            self.stack_push(1, dict(segId=attributes['segId'], start=attributes['start'], end=attributes['end']))

        elif name == 'entity':
            assert(attributes.get('type'))
            entityId = None
            if attributes['type'] == 'protein':
                entityId = attributes.get('entityId')
            self.stack_push(0, entityId)

        elif name == 'entry':
            self.counters['entry'] = self.counters.get('entry', 0) + 1
            self.parse_header(attributes)


    def parse_header(self, attributes):
        if attributes.get('dbAccessionId'):
            pdb_id = attributes.get('dbAccessionId').upper()
            if self.pdb_id:
                assert(self.pdb_id.upper() == pdb_id)
            self.pdb_id = pdb_id
        else:
            raise Exception('Could not verify the PDB ID from the <entry> tag.')


    def start_residueDetail(self, attributes):
        self.check_stack(5)
        self.reading_unobserved_property = False
        dbSource = attributes.get('dbSource')
        assert(dbSource)
        if dbSource == 'PDBe':
            residue_detail_property = attributes.get('property')
            if residue_detail_property and residue_detail_property == 'Annotation':
                self.reading_unobserved_property = True


    def start_crossRefDb(self, attributes):
        self.check_stack(4)
        dbSource = attributes.get('dbSource')
        assert(dbSource)

        if dbSource == 'PDB' or dbSource == 'UniProt':
            current_residue = self.current_residue

            dbCoordSys = attributes.get('dbCoordSys')
            dbAccessionId = attributes.get('dbAccessionId')
            dbResNum = attributes.get('dbResNum')
            dbResName = attributes.get('dbResName')

            if dbSource == 'PDB':
                dbChainId = attributes.get('dbChainId')
                assert(dbCoordSys == "PDBresnum")
                assert(dbAccessionId.upper() == self.pdb_id.upper())
                #assert(dbChainId == self._STACK[0][1]) # this is not always true e.g. 1lmb has entityId="C" but dbChainId="3"
                if not dbChainId == self._STACK[0][1]: # use the dbChainId chain ID since that is what is used in the logic later on. Note: this may introduce bugs if the dbChainIds differ amongst themselves
                    self._STACK[0] = (0, dbChainId)

                assert(dbCoordSys and dbAccessionId and dbResNum and dbResName and dbChainId )
                current_residue.add_pdb_residue(dbChainId, dbResNum, dbResName)

            elif dbSource == 'UniProt':
                assert(dbCoordSys and dbAccessionId and dbResNum and dbResName)
                assert(dbCoordSys == "UniProt")
                assert(dbCoordSys and dbAccessionId and dbResNum and dbResName)
                current_residue.add_uniprot_residue(dbAccessionId, dbResNum, dbResName)


    def _get_current_PDBe_chain(self):
        return self._STACK[0][1]


    def _get_current_segment_range(self):
        return (self._STACK[1][1]['start'], self._STACK[1][1]['end'])


    def end_element(self, name):
        tag_content = self.tag_data

        # Residue details and mappings

        if name == 'residueDetail':
            self.stack_pop(4)
            if self.reading_unobserved_property and (tag_content == 'Not_Observed'):
                self.current_residue.WasNotObserved = True
            self.reading_unobserved_property = False

        elif name == 'residue':
            self.stack_pop(3)
            current_residue = self.current_residue
            #assert(self._get_current_PDBe_chain() == current_residue.PDBChainID) # this is not always true e.g. 1lmb has entityId="C" but dbChainId="3"
            self.residues.append(current_residue)
            self.current_residue = None

        elif name == 'listResidue':
            self.stack_pop(2)

        # Region mappings

        elif name == 'mapRegion':
            self.stack_pop(3)

        elif name == 'listMapRegion':
            self.stack_pop(2)

        # Entities and segments

        elif name == 'segment':
            self.stack_pop(1)

        elif name == 'entity':
            self.stack_pop(0)


    def end_document(self):
        assert(self.counters['entry'] == 1)

        residue_count = 0
        residues_matched = {}
        residues_encountered = set()
        atom_to_uniparc_residue_map = {}
        atom_to_seqres_residue_map = {}
        seqres_to_uniparc_residue_map = {}

        UniProtACs = set()
        for r in self.residues:
            if r.UniProtAC:
                UniProtACs.add(r.UniProtAC)

        ACC_to_UPARC_mapping = uniprot_map('ACC', 'UPARC', list(UniProtACs), cache_dir = self.cache_dir)
        assert(sorted(ACC_to_UPARC_mapping.keys()) == sorted(list(UniProtACs)))
        for k, v in ACC_to_UPARC_mapping.iteritems():
            assert(len(v) == 1)
            ACC_to_UPARC_mapping[k] = v[0]

        map_chains = set()
        for r in self.residues:
            if not(r.PDBResidueID.isalnum() and int(r.PDBResidueID.isalnum()) < 0):
                # These are not valid PDB residue IDs - the SIFTS XML convention sometimes assigns negative residue IDs to unobserved residues before the first ATOM record
                # (only if the first residue ID is 1?)
                pass

            # Store the PDB->UniProt mapping
            if r.has_pdb_to_uniprot_mapping():
                UniProtAC = r.UniProtAC
                UniParcID = ACC_to_UPARC_mapping[UniProtAC]
                self.uniparc_ids.add(UniParcID)

            full_pdb_residue_ID = r.get_pdb_residue_id()
            PDBChainID = r.PDBChainID
            map_chains.add(PDBChainID)
            residues_matched[PDBChainID] = residues_matched.get(PDBChainID, 0)

            if not r.WasNotObserved:
                # Do not add ATOM mappings when the ATOM data does not exist
                if r.has_pdb_to_uniprot_mapping():
                    atom_to_uniparc_residue_map[PDBChainID] = atom_to_uniparc_residue_map.get(PDBChainID, {})
                    atom_to_uniparc_residue_map[PDBChainID][full_pdb_residue_ID] = (UniParcID, r.UniProtResidueIndex)

                atom_to_seqres_residue_map[PDBChainID] = atom_to_seqres_residue_map.get(PDBChainID, {})
                atom_to_seqres_residue_map[PDBChainID][full_pdb_residue_ID] = r.PDBeResidueID

            if r.has_pdb_to_uniprot_mapping():
                seqres_to_uniparc_residue_map[PDBChainID] = seqres_to_uniparc_residue_map.get(PDBChainID, {})
                seqres_to_uniparc_residue_map[PDBChainID][r.PDBeResidueID] = (UniParcID, r.UniProtResidueIndex)

            # Make sure we only have at most one match per PDB residue
            assert(full_pdb_residue_ID not in residues_encountered)
            residues_encountered.add(full_pdb_residue_ID)

            # Count the number of exact sequence matches
            PDBResidue3AA = r.PDBResidue3AA
            pdb_residue_type = residue_type_3to1_map.get(PDBResidue3AA) or self.modified_residues.get(PDBResidue3AA) or protonated_residue_type_3to1_map.get(PDBResidue3AA) or non_canonical_amino_acids.get(PDBResidue3AA)
            if r.has_pdb_to_uniprot_mapping():
                if pdb_residue_type == r.UniProtResidue1AA:

                    residues_matched[PDBChainID] += 1
            residue_count += 1

        # Create the SequenceMaps
        for c in map_chains:
            if residues_matched[c] > 0:
                # 1IR3 has chains A and B:
                # Chain A has mappings from atom and seqres (PDBe) residues to UniParc as usual
                # Chain B (18 residues long) has mappings from atom to seqres residues but not to UniParc residues
                self.atom_to_uniparc_sequence_maps[c] = PDBUniParcSequenceMap.from_dict(atom_to_uniparc_residue_map[c])
                self.seqres_to_uniparc_sequence_maps[c] = PDBUniParcSequenceMap.from_dict(seqres_to_uniparc_residue_map[c])
            self.atom_to_seqres_sequence_maps[c] = SequenceMap.from_dict(atom_to_seqres_residue_map[c])

        # Check the match percentage
        total_residues_matched = sum([residues_matched[c] for c in residues_matched.keys()])
        if total_residues_matched == 0:
            if self.pdb_id and self.pdb_id in NoSIFTSPDBUniParcMappingCases:
                if self.require_uniprot_residue_mapping:
                    raise NoSIFTSPDBUniParcMapping('The PDB file %s has a bad or missing SIFTS mapping at the time of writing.' % self.pdb_id)
                else:
                    colortext.error('Warning: The PDB file %s has a bad or missing SIFTS mapping at the time of writing so there is no PDB -> UniProt residue mapping.' % self.pdb_id)
            else:
                if self.require_uniprot_residue_mapping:
                    raise Exception('No residue information matching PDB residues to UniProt residues was found.')
                else:
                    colortext.error('Warning: No residue information matching PDB residues to UniProt residues was found.')
        else:
            percentage_matched = float(total_residues_matched)*100.0/float(residue_count)
            if percentage_matched < self.acceptable_sequence_percentage_match:
                if self.pdb_id and self.pdb_id in BadSIFTSMappingCases:
                    raise BadSIFTSMapping('The PDB file %s has a known bad SIFTS mapping at the time of writing.' % self.pdb_id)
                else:
                    raise Exception('Expected %.2f%% sequence match on matched residues but the SIFTS results only gave us %.2f%%.' % (self.acceptable_sequence_percentage_match, percentage_matched))

        # Merge the ranges for the region mappings i.e. so [1-3],[3-86] becomes [1-86]
        region_mapping = self.region_mapping
        for chain_id, chain_details in region_mapping.iteritems():
            for dbSource, source_details in chain_details.iteritems():
                for dbAccessionId, range_list in source_details.iteritems():
                    source_details[dbAccessionId] = merge_range_pairs(range_list)
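                    # Illustration of the assumed merge_range_pairs behaviour (hypothetical inputs, not from the original code):
                    #   merge_range_pairs([[1, 3], [3, 86]])  -> [[1, 86]]
                    #   merge_range_pairs([[1, 5], [10, 20]]) -> [[1, 5], [10, 20]]  (disjoint ranges are left unchanged)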

        # Check to see if the expected numbering schemes hold
        for k, v in expected_residue_numbering_schemes.iteritems():
            if self.region_map_coordinate_systems.get(k):
                assert(self.region_map_coordinate_systems[k] == set([v]))

        pfam_scop_mapping = {}
        scop_pfam_mapping = {}
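        # Cross-map Pfam and SCOP domains that cover largely the same residues. Worked example with
        # hypothetical ranges: a Pfam domain spanning residues 10-95 (86 residues) and a SCOP domain
        # spanning residues 12-100 (89 residues) share 84 residues, giving match qualities of
        # 84/86 ~ 0.98 (Pfam) and 84/89 ~ 0.94 (SCOP); if either value reaches domain_overlap_cutoff,
        # the pair is recorded in both pfam_scop_mapping and scop_pfam_mapping below.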
        for chain_id, chain_details in region_mapping.iteritems():
            if chain_details.get('Pfam') and chain_details.get('SCOP'):
                for pfamAccessionId, pfam_range_lists in chain_details['Pfam'].iteritems():
                    pfam_residues = parse_range(','.join(['%d-%d' % (r[0], r[1]) for r in pfam_range_lists]))
                    for scopAccessionId, scop_range_lists in chain_details['SCOP'].iteritems():
                        scop_residues = parse_range(','.join(['%d-%d' % (r[0], r[1]) for r in scop_range_lists]))
                        num_same_residues = len(set(pfam_residues).intersection(set(scop_residues)))
                        if num_same_residues > 10:
                            Pfam_match_quality = float(num_same_residues) / float(len(pfam_residues))
                            SCOP_match_quality = float(num_same_residues) / float(len(scop_residues))
                            if (Pfam_match_quality >= self.domain_overlap_cutoff) or (SCOP_match_quality >= self.domain_overlap_cutoff):
                                pfam_scop_mapping[pfamAccessionId] = pfam_scop_mapping.get(pfamAccessionId, DomainMatch(pfamAccessionId, 'Pfam'))
                                pfam_scop_mapping[pfamAccessionId].add(scopAccessionId, 'SCOP', SCOP_match_quality)
                                scop_pfam_mapping[scopAccessionId] = scop_pfam_mapping.get(scopAccessionId, DomainMatch(scopAccessionId, 'SCOP'))
                                scop_pfam_mapping[scopAccessionId].add(pfamAccessionId, 'Pfam', Pfam_match_quality)

        self.pfam_scop_mapping = pfam_scop_mapping
        self.scop_pfam_mapping = scop_pfam_mapping

        self._validate()


    def _validate(self):
        '''Tests that the maps agree through composition.'''

        # I used to assert that self.atom_to_uniparc_sequence_maps.keys() == self.atom_to_seqres_sequence_maps.keys() == self.seqres_to_uniparc_sequence_maps.keys(),
        # but that fails for 2IMM (where self.atom_to_uniparc_sequence_maps.keys() == self.seqres_to_uniparc_sequence_maps.keys() == []), and a weakened version of the
        # check fails for 1IR3, so the assertions were removed entirely; instead, the composition is only checked where all three maps are defined.
        for c, m in self.atom_to_seqres_sequence_maps.iteritems():
            if self.seqres_to_uniparc_sequence_maps.keys():
                atom_uniparc_keys = set(self.atom_to_uniparc_sequence_maps.get(c, {}).keys())
                atom_seqres_keys = set(self.atom_to_seqres_sequence_maps.get(c, {}).keys())
                assert(atom_uniparc_keys.intersection(atom_seqres_keys) == atom_uniparc_keys)
                for k, v in m.map.iteritems():
                    uparc_id_1, uparc_id_2 = None, None
                    try:
                        uparc_id_1 = self.seqres_to_uniparc_sequence_maps[c].map[v]
                        uparc_id_2 = self.atom_to_uniparc_sequence_maps[c].map[k]
                    except KeyError:
                        # Skip residues that are not present in one of the maps
                        continue
                    assert(uparc_id_1 == uparc_id_2)


    def characters(self, chrs):
        self.tag_data += chrs


    startDocument = start_document
    endDocument = end_document
    startElement = start_element
    endElement = end_element
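The _validate method above checks that the sequence maps agree under composition: following the ATOM -> SEQRES map and then the SEQRES -> UniParc map should give the same result as the direct ATOM -> UniParc map wherever all three are defined. A minimal sketch of that property using plain dictionaries (the residue identifiers and UniParc accession are made up for illustration, standing in for the SequenceMap/PDBUniParcSequenceMap objects):

atom_to_seqres = {('A', ' 10 '): 1, ('A', ' 11 '): 2}
seqres_to_uniparc = {1: ('UPI0000000001', 5), 2: ('UPI0000000001', 6)}
atom_to_uniparc = {('A', ' 10 '): ('UPI0000000001', 5), ('A', ' 11 '): ('UPI0000000001', 6)}

for atom_id, seqres_id in atom_to_seqres.items():
    if seqres_id in seqres_to_uniparc and atom_id in atom_to_uniparc:
        # Composition check, mirroring the uparc_id_1 == uparc_id_2 assertion in _validate()
        assert seqres_to_uniparc[seqres_id] == atom_to_uniparc[atom_id]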
Ejemplo n.º 9
0
def extract_analysis_data(dataset_list_file, output_directory, data_extraction_method, expectn, top_x, prefix, test_mode = False):
    '''This is the main function in this script and is where the basic analysis is compiled.

       output_directory should contain the results of the prediction run.
       data_extraction_method should be a function pointer to the method-specific function used to retrieve the prediction results e.g. get_kic_run_details
       expectn specifies how many predictions we expect to find (useful in case some jobs failed).
       top_x specifies how many of the best-scoring predictions should be used to generate the TopX metric results e.g.
       the Top5 RMSD metric value measures the lowest RMSD amongst the five best-scoring structures.
       prefix is used to name the output files.
    '''

    # Sanity check
    assert(top_x <= expectn)

    # Set up reference structures
    structures_folder = os.path.join('..', 'input', 'structures', '12_res')
    rcsb_references = os.path.join(structures_folder, 'rcsb', 'reference')
    rosetta_references = os.path.join(structures_folder, 'rosetta', 'reference')

    # Set up the per-case statistics dicts
    best_scoring_structures = {}
    median_scoring_structures = {}
    worst_scoring_structures = {}
    total_percent_subanstrom = {}
    top_x_percent_subanstrom = {}
    top_x_loop_prediction_sets = {}

    # Set up the input file used to generate the graph plotting the "percentage of subangstrom models" metric over
    # varying values of X used to select the TopX structures
    percentage_subangstrom_over_top_X_plot_input = ['PDB\tX\tPercentage of subangstrom cases for TopX']
    percent_subangrom_by_top_x = {}

    # Set up the summary analysis file
    csv_file = ['\t'.join(['PDB ID', 'Models', '%<1.0A', 'Top{0} %<1.0A'.format(top_x), 'Best score', 'Top{0} score'.format(top_x), 'Median score', 'Worst score', 'Closest score', 'Top1 RMSD', 'Top{0} RMSD'.format(top_x), 'Closest RMSD'])]

    # Read in the benchmark input
    pdb_ids = [os.path.splitext(os.path.split(s.strip())[1])[0] for s in get_file_lines(dataset_list_file) if s.strip()]

    # Truncate the benchmark input for test mode
    if test_mode:
        pdb_ids = pdb_ids[:10]

    # Analyze the performance for each case in the benchmark
    for pdb_id in pdb_ids:

        rcsb_reference_pdb = os.path.join(rcsb_references, pdb_id + '.pdb')
        assert(os.path.exists(rcsb_reference_pdb))
        rosetta_reference_pdb = os.path.join(rosetta_references, pdb_id + '.pdb')
        assert(os.path.exists(rosetta_reference_pdb))
        assert(len(pdb_id) == 4)
        loops_file = os.path.join(structures_folder, 'rosetta', 'pruned', '{0}.loop.json'.format(pdb_id))
        loop_sets = json.loads(read_file(loops_file))
        assert(len(loop_sets['LoopSet']) == 1)

        # Create a container for loop predictions
        loop_prediction_set = LoopPredictionSet()

        # Read the coordinates from the reference PDB file
        rcsb_reference_matrix = PDB.extract_xyz_matrix_from_loop_json(PDB.from_filepath(rcsb_reference_pdb).structure_lines, loop_sets, atoms_of_interest = backbone_atoms, expected_num_residues = 12, expected_num_residue_atoms = 4)
        rosetta_reference_matrix = PDB.extract_xyz_matrix_from_loop_json(PDB.from_filepath(rosetta_reference_pdb).structure_lines, loop_sets, atoms_of_interest = backbone_atoms, expected_num_residues = 12, expected_num_residue_atoms = 4)

        colortext.wgreen('\n\nReading in the run details for {0}:'.format(pdb_id))
        details = data_extraction_method(output_directory, pdb_id, loop_sets, test_mode = test_mode)
        for d in details:
            loop_prediction = loop_prediction_set.add(d['id'], d['score'], pdb_id = pdb_id, rmsd = None, pdb_path = d['predicted_structure'], pdb_loop_residue_matrix = d['pdb_loop_residue_matrix'])
        print(' Done')

        # Compute the RMSD for this case for the structure using the pandas dataframe
        # It would be more efficient to do this after truncating by score, but in the general case users will
        # probably want to consider all predictions. If not (e.g. for testing), arbitrary subsets can be chosen
        # in the loop above.
        colortext.wgreen('Computing RMSDs for {0}:'.format(pdb_id))
        loop_prediction_set.compute_rmsds(rcsb_reference_matrix)
        loop_prediction_set.check_rmsds(rosetta_reference_matrix)
        print(' Done\n')

        # Truncate the structures to the top expectn-scoring files
        loop_prediction_set.sort_by_score()
        loop_prediction_set.truncate(expectn)
        if len(loop_prediction_set) != expectn:
            print('Error: Expected {0} structures but only found {1}.'.format(expectn, len(loop_prediction_set)))
            sys.exit(1)

        # Create a new set containing the top-X-scoring structures and identify the median-scoring structure
        top_x_loop_prediction_sets[pdb_id] = loop_prediction_set[:top_x]
        median_scoring_structures[pdb_id] = loop_prediction_set[int(expectn / 2)]

        # Determine the lowest-/best-scoring structure
        best_scoring_structures[pdb_id] = loop_prediction_set[0]
        best_score = best_scoring_structures[pdb_id].score
        worst_scoring_structures[pdb_id] = loop_prediction_set[-1]
        worst_score = worst_scoring_structures[pdb_id].score
        assert(top_x_loop_prediction_sets[pdb_id][0] == best_scoring_structures[pdb_id])

        # Print structures
        colortext.warning('Top{0} structures'.format(top_x))
        print(top_x_loop_prediction_sets[pdb_id])
        colortext.warning('Top1 structure')
        print(best_scoring_structures[pdb_id])
        colortext.warning('Median (by score) structure')
        print(median_scoring_structures[pdb_id])
        colortext.warning('Worst-scoring structure')
        print(worst_scoring_structures[pdb_id])

        # Create values for TopX variable plot
        loop_prediction_set.sort_by_score()
        for top_x_var in range(1, len(loop_prediction_set) + 1):
            new_subset = loop_prediction_set[:top_x_var]
            percent_subangstrom = 100 * new_subset.fraction_with_rmsd_lt(1.0)
            percentage_subangstrom_over_top_X_plot_input.append('{0}\t{1}\t{2}'.format(pdb_id, top_x_var, percent_subangstrom))
            percent_subangrom_by_top_x[top_x_var] = percent_subangrom_by_top_x.get(top_x_var, {})
            percent_subangrom_by_top_x[top_x_var][pdb_id] = percent_subangstrom

        total_percent_subanstrom[pdb_id] = 100 * loop_prediction_set.fraction_with_rmsd_lt(1.0)
        top_x_percent_subanstrom[pdb_id] = 100 * top_x_loop_prediction_sets[pdb_id].fraction_with_rmsd_lt(1.0)
        colortext.warning('Percentage of sub-angstrom cases in the full set of {0}: {1}'.format(expectn, total_percent_subanstrom[pdb_id]))
        colortext.warning('Percentage of sub-angstrom cases in the Top{0} structures: {1}'.format(top_x, top_x_percent_subanstrom[pdb_id]))

        loop_prediction_set.sort_by_rmsd()
        closest_rmsd = loop_prediction_set[0].rmsd
        closest_score = loop_prediction_set[0].score
        colortext.warning('RMSD of closest model: {0}'.format(closest_rmsd))
        colortext.warning('Score of closest model: {0}'.format(closest_score))

        top_1_rmsd = best_scoring_structures[pdb_id].rmsd

        top_x_rmsd = best_scoring_structures[pdb_id].rmsd
        top_x_score = best_scoring_structures[pdb_id].score
        for s in top_x_loop_prediction_sets[pdb_id]:
            if (s.rmsd < top_x_rmsd) or (s.rmsd == top_x_rmsd and s.score < top_x_score):
                top_x_rmsd = s.rmsd
                top_x_score = s.score
        assert(top_x_score <= worst_score)
        assert(top_x_rmsd <= top_1_rmsd)

        print('Top 1 RMSD (predicted vs Rosetta/RCSB reference structure): {0}'.format(top_1_rmsd))
        print('Top {0} RMSD (predicted vs Rosetta/RCSB reference structure): {1}'.format(top_x, top_x_rmsd))

        csv_file.append('\t'.join(map(str, [pdb_id, expectn, total_percent_subanstrom[pdb_id], top_x_percent_subanstrom[pdb_id], best_score, top_x_score, median_scoring_structures[pdb_id].score, worst_score, closest_score, top_1_rmsd, top_x_rmsd, closest_rmsd])))

    # Add a column of median percent subangstrom values
    for top_x_var, values_by_pdb in sorted(percent_subangrom_by_top_x.iteritems()):
        assert(sorted(values_by_pdb.keys()) == sorted(pdb_ids))
        median_value = sorted(values_by_pdb.values())[len(pdb_ids) // 2]
        percentage_subangstrom_over_top_X_plot_input.append('Median\t{0}\t{1}'.format(top_x_var, median_value))

    write_file('{0}analysis.csv'.format(prefix), '\n'.join(csv_file))
    write_file('{0}analysis.tsv'.format(prefix), '\n'.join(csv_file))
    write_file('{0}percentage_subangstrom_over_top_X.tsv'.format(prefix), '\n'.join(percentage_subangstrom_over_top_X_plot_input))
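The TopX bookkeeping above can be summarised with a small self-contained sketch (not part of the original script; the (score, RMSD) pairs are invented): the Top1 RMSD is the RMSD of the best-scoring prediction, the TopX RMSD is the lowest RMSD among the X best-scoring predictions, and the sub-angstrom percentage counts predictions with RMSD below 1.0 A.

predictions = [(-20.5, 0.8), (-19.7, 1.4), (-19.2, 0.6), (-18.0, 2.3)]  # hypothetical (score, rmsd) pairs
top_x = 2

by_score = sorted(predictions)                                  # lower score is better
top_1_rmsd = by_score[0][1]
top_x_rmsd = min(rmsd for score, rmsd in by_score[:top_x])
percent_subangstrom = 100.0 * len([r for s, r in predictions if r < 1.0]) / len(predictions)

print('Top1 RMSD: {0}, Top{1} RMSD: {2}, %<1.0A: {3}'.format(top_1_rmsd, top_x, top_x_rmsd, percent_subangstrom))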
Ejemplo n.º 10
0
def setup_jobs(outpath, options, input_files):
    ''' This function sets up the jobs by creating the necessary input files as expected.
          - outpath is where the output is to be stored.
          - options is the optparse options object.
          - input_files is a list of paths to input files.
    '''

    job_inputs = None
    reverse_mapping = None
    fasta_file_contents = {}

    # Generate FASTA files for PDB inputs
    # fasta_file_contents is a mapping from a file path to a pair (FASTA contents, file type). We remember the file type
    # since we offset residue IDs depending on file type i.e. for FASTA files, we treat each sequence separately and do
    # not renumber the fragments in postprocessing. For PDB files, however, we need to respect the order and length of
    # sequences so that we renumber the fragments appropriately in postprocessing - we assume that if a PDB file is passed in
    # then all chains (protein, RNA, or DNA) will be used in a Rosetta run.
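    # For illustration only (hypothetical paths and contents):
    #   fasta_file_contents['input/1abc.pdb']    == ('>1abc_A\n<sequence derived from the PDB file>', 'PDB')
    #   fasta_file_contents['input/query.fasta'] == ('<verbatim FASTA file contents>', 'FASTA')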
    for input_file in input_files:
        assert(not(fasta_file_contents.get(input_file)))
        if any(fnmatch(input_file, x) for x in pdb_file_wildcards):
            pdb = PDB.from_filepath(input_file, strict=True)
            pdb.pdb_id = os.path.basename(input_file).split('.')[0]            
            if pdb.pdb_id.startswith('pdb') and len(pdb.pdb_id) >= 7:
                # Hack to rename FASTA identifiers for pdb*.ent files which are present in mirrors of the PDB
                pdb.pdb_id = pdb.pdb_id.replace('pdb', '')    
            fasta_file_contents[input_file] = (pdb.create_fasta(prefer_seqres_order = False), 'PDB')
        else:
            fasta_file_contents[input_file] = (read_file(input_file), 'FASTA')

    # Extract sequences from the input FASTA files.
    found_sequences, reverse_mapping, errors = get_sequences(options, fasta_file_contents)
    if found_sequences:
        reformat(found_sequences)
    if errors:
        return None, False, errors

    # Discard sequences that are the wrong chain.
    desired_sequences = {}
    for key, sequence in found_sequences.iteritems():
        pdb_id, chain, file_name = key
        if options.chain is None or chain == options.chain:
            desired_sequences[key] = sequence

    # Create the input FASTA and script files.
    job_inputs, errors = create_inputs(options, outpath, desired_sequences)

    # Create the reverse mapping file
    if reverse_mapping:
        segment_mapping_file = os.path.join(outpath, "segment_map.json")
        colorprinter.message("Creating a reverse mapping file %s." % segment_mapping_file)
        write_file(segment_mapping_file, json.dumps(reverse_mapping))

    # Create the post-processing script file
    post_processing_script = read_file(os.path.join(os.path.split(os.path.realpath(__file__))[0], 'post_processing.py'))
    write_file(os.path.join(outpath, 'post_processing.py'), post_processing_script, 'w')

    # Create the secondary structure filter file
    if options.secondary_structure_file:
        write_file(os.path.join(outpath, 'ss_filter.json'), json.dumps({'secondary_structure_filter' : SecondaryStructureDefinition.from_filepath(options.secondary_structure_file).data}), 'w')

    return job_inputs, reverse_mapping is not None, errors
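A minimal sketch (not from the original module) of the PDB-versus-FASTA dispatch at the top of setup_jobs; the pdb_file_wildcards patterns below are assumed for illustration, and the actual PDB/FASTA handling is reduced to returning a label:

from fnmatch import fnmatch

pdb_file_wildcards = ['*.pdb', '*.pdb.gz', 'pdb*.ent']  # assumed glob patterns, not taken from the original

def classify_input(input_file):
    '''Return the file type label that decides how sequences are extracted and later renumbered.'''
    if any(fnmatch(input_file, pattern) for pattern in pdb_file_wildcards):
        return 'PDB'    # sequences come from the structure; fragments are renumbered in postprocessing
    return 'FASTA'      # sequences are used as given; no renumbering in postprocessing

assert classify_input('structures/1abc.pdb') == 'PDB'
assert classify_input('pdb1abc.ent') == 'PDB'
assert classify_input('sequences/query.fasta') == 'FASTA'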
Ejemplo n.º 11
0
import pprint  # needed for the pprint.pprint() calls below

from klab.bio.basics import ChainMutation
from klab.bio.pdb import PDB  # assumed module path for the PDB class used below
from klab.fs.fsio import read_file, write_temp_file, open_temp_file, write_file
from klab.bio.pfam import Pfam
from klab.bio.dssp import MonomerDSSP, ComplexDSSP, MissingAtomException
from klab.bio.ligand import Ligand, PDBLigand
from klab.bio.pdbtm import PDBTM
from klab.db.sqlalchemy_interface import get_single_record_from_query, get_or_create_in_transaction

from kddg.api.schema import test_schema_against_database_instance
from kddg.api.schema import PDBFile, PDBChain, PDBMolecule, PDBMoleculeChain, PDBResidue, LigandDescriptor, LigandIdentifier, LigandSynonym, PDBLigand  # note: this PDBLigand shadows the klab.bio.ligand.PDBLigand imported above
from kddg.api.schema import Ligand as DBLigand
#from kddg.api.schema import Publication, PublicationAuthor, PublicationIdentifier
from kddg.api.layers import *
from kddg.api.db import ddG, PartialDataException, SanityCheckException
import kddg.api.dbi as dbi

rosetta_scripts_path =  '/home/oconchus/t14benchmarking/r57934/main/source/bin/rosetta_scripts.linuxgccrelease'
rosetta_database_path = '/home/oconchus/t14benchmarking/r57934/main/database'
p = PDB(read_file('/kortemmelab/data/kyleb/ddg_numbering_for_shane/24548-data/1CBW_FGHI.pdb'))
#p.construct_pdb_to_rosetta_residue_map(rosetta_scripts_path, rosetta_database_path)
p.construct_pdb_to_rosetta_residue_map(rosetta_scripts_path, rosetta_database_path, extra_command_flags = '-ignore_zero_occupancy false -ignore_unrecognized_res')
pprint.pprint(p.get_atom_sequence_to_rosetta_map())
pprint.pprint(p.rosetta_sequences)

from kddg.api.ppi import get_interface as get_ppi_interface
ppi_api = get_ppi_interface(read_file('../misc/ddgdb.pw'),
                                rosetta_scripts_path =  '/home/oconchus/t14benchmarking/r57934/main/source/bin/rosetta_scripts.linuxgccrelease',
                                rosetta_database_path = '/home/oconchus/t14benchmarking/r57934/main/database')
content = ppi_api.DDG_db.execute_select('SELECT Content FROM PDBFile WHERE ID="1CBW"')[0]['Content']
print(content)
write_file('/tmp/ddginterface/1CBW_FGHI_db.pdb', content)
Ejemplo n.º 12
0
    #    align_two_simple_sequences(fasta_sequence, uniparc_sequence, sequence1name = '%s:%s|PDBID|CHAIN|SEQUENCE' % (pdb_id, c), sequence2name = uniparc_id)


# Sanity check: confirm that the UniProt AC given in the PDB file is among the matched UniProt IDs

print(chains)


sys.exit(0)

px = PDBML.retrieve('1A2C', cache_dir='/home/oconchus/temp')
for k, v in sorted(px.atom_to_seqres_sequence_maps.iteritems(), key=lambda x:(x[0], x[1])):
    print(k,v)


p = PDB.from_filepath('../.testdata/1H38.pdb') # has protein, DNA, RNA
p = PDB.from_filepath('../.testdata/1ZC8.pdb')
p = PDB.from_filepath('../.testdata/4IHY.pdb')
#p = PDB('../.testdata/2GRB.pdb')
p = PDB.from_filepath('../.testdata/1J1M.pdb')
p = PDB.from_filepath('../.testdata/1H38.pdb')
p = PDB.from_filepath('../.testdata/1A2C.pdb')

#print(p.structure_lines)

colortext.message("Resolution")
print(p.get_resolution())

colortext.message("Techniques")
print(p.get_techniques())