Example #1
0
    def _runRScript(RScript):
        """Run an R script with ``R CMD BATCH`` and return its ``.Rout`` output.

        The script text is written to a temporary file via
        rosettahelper.writeTempFile(".", ...). That file is deleted as soon
        as R exits, and the ``<script>.Rout`` file is deleted before returning
        or raising.

        :param RScript: the R source code to execute.
        :return: the contents of the ``.Rout`` file, or None if no such file exists.
        :raises colortext.Exception: if R exits with a non-zero return code. The
            ``.Rout`` contents, when present, are emitted as a warning first.
        """
        rscriptname = rosettahelper.writeTempFile(".", RScript)
        # wait() blocks until R exits and returns its exit code, which replaces
        # the previous sleep-and-poll loop.
        errcode = subprocess.Popen(["R", "CMD", "BATCH", rscriptname]).wait()
        rout = "%s.Rout" % rscriptname
        delete_file(rscriptname)

        rout_exists = os.path.exists(rout)
        rout_contents = rosettahelper.readFile(rout) if rout_exists else None

        if errcode != 0:
            if rout_exists:
                colortext.warning(rout_contents)
                delete_file(rout)
            raise colortext.Exception(
                "The R script failed with error code %d." % errcode)
        delete_file(rout)
        return rout_contents
Example #2
0
    def get_organisms(self):
        """Fill self.organisms with the single organism of each active accession.

        Accessions already present in self.AC_entries reuse that entry; any
        other accession is retrieved by constructing a UniProtACEntry.
        Accessions on the known-bad list, and those whose retrieval raises
        EmptyUniProtACXMLException, are skipped. Every processed entry is
        asserted to have exactly one organism.
        """
        # Accessions whose XML documents were bad at the time of writing.
        known_bad_accessions = ['N2XE95', 'N1E9H6', 'N2JUB3', 'N2Z3Z2']

        self.organisms = {}
        self._get_XML()
        # Tally of scientific names; it is not read again within this method.
        name_count = {}
        for accession in self._get_active_ACCs():
            if accession not in self.AC_entries:
                if accession in known_bad_accessions:
                    continue
                if not self.silent:
                    colortext.warning("Retrieving %s" % accession)
                try:
                    entry = UniProtACEntry(accession, cache_dir = self.cache_dir, silent = self.silent)
                except EmptyUniProtACXMLException:
                    continue
            else:
                entry = self.AC_entries[accession]

            for organism in entry.organisms:
                scientific_name = organism['scientific']
                name_count[scientific_name] = name_count.get(scientific_name, 0) + 1
            assert len(entry.organisms) == 1
            self.organisms[accession] = entry.organisms[0]
    def get_sqlalchemy_schema(self, restrict_to_tables = None):
        """Print the MySQL table definitions and matching SQLAlchemy class stubs.

        For each table in self.tables (optionally filtered), the output of
        ``SHOW CREATE TABLE`` is printed, and a DeclarativeBase class is built
        from the fields in self.intermediate_schema. The generated classes are
        printed after the import lines they need.

        :param restrict_to_tables: optional collection of table names. When
            empty or None, every table in self.tables is processed.
        :return: None. All output goes to stdout and colortext.
        """
        colortext.warning(' *** MySQL schema ***')
        schema = []

        # Module name -> set of type names.
        # NOTE(review): presumably filled in by field.to_sql_alchemy - confirm.
        typedefs = {'sqlalchemy.types' : set(), 'sqlalchemy.dialects.mysql' : set()}

        for tbl in self.tables:
            if restrict_to_tables and tbl not in restrict_to_tables:
                continue
            colortext.message(tbl)

            # The subscript must sit inside the print call. Written as
            # print(...)[0][...], it only works as a Python 2 print statement
            # and fails when print is a function.
            print(self.db_interface.execute("SHOW CREATE TABLE %s" % tbl)[0]['Create Table'])
            print('')
            code = []
            code.append("class %s(DeclarativeBase):" % tbl)
            code.append("    __tablename__ = '%s'\n" % tbl)

            for field in self.intermediate_schema[tbl]:
                code.append('    {0}'.format(field.to_sql_alchemy(typedefs)))
            code.append('\n')
            schema.extend(code)

        imports = []
        for module, types in sorted(typedefs.items()):
            imports.append('from %s import %s' % (module, ', '.join(sorted(types))))
        schema = imports + [''] + schema

        colortext.warning('*** SQLAlchemy class definitions ***')
        print('\n'.join(schema))
 def __init__(self, user, host, db, passwd, port = 3306, socket = '/var/lib/mysql/mysql.sock'):
     """Open the database connection, exiting the process if it fails.

     :param user: database user name.
     :param host: database host.
     :param db: database name.
     :param passwd: database password.
     :param port: TCP port passed to DatabaseInterface.
     :param socket: path passed to DatabaseInterface as unix_socket.

     On any exception the error and traceback are printed and the process
     exits with status 1.
     """
     try:
         # Pass the caller's port through; it used to be hard-coded to 3306,
         # which silently ignored the argument.
         self.db_interface = DatabaseInterface({}, isInnoDB=True, numTries=1, host=host, db=db, user=user, passwd=passwd, port=port,
                  unix_socket=socket, passwdfile=None, use_utf=False, use_locking=True)
     except Exception as e:
         colortext.error('An exception was thrown trying to connect to the database.')
         colortext.warning(str(e))
         print(traceback.format_exc())
         sys.exit(1)
Example #5
0
    def blast_by_pdb_chain(self, pdb_id, chain_id, take_top_percentile = 30.0, cut_off = None, matrix = None, sequence_identity_cut_off = None, silent = None):
        """BLAST one chain of a PDB structure and return the PDB IDs that hit.

        When self.bio_cache is set, a cached result is returned if
        self.force_lookup is false and the result is younger than
        self.stale_period_in_hours. Fresh results are written back to the cache.

        :param pdb_id: four-character PDB ID (surrounding whitespace is stripped).
        :param chain_id: chain identifier of 1-4 characters (surrounding whitespace is stripped).
        :param take_top_percentile: accepted but not used by this method.
        :param cut_off: forwarded to self._construct_query.
        :param matrix: forwarded to self._construct_query.
        :param sequence_identity_cut_off: forwarded to self._construct_query.
        :param silent: forwarded to self.log.
        :return: a list of hit PDB IDs. When the query returns no hits, the
            result is None if the chain is not a protein chain or is shorter
            than self.min_sequence_length, and an empty list otherwise.
        :raises Exception: if pdb_id or chain_id has an invalid length, or if
            the query returns hits that do not include pdb_id itself.
        :raises colortext.Exception: if the chain type cannot be determined
            after an empty result.
        """

        # Checks
        pdb_id, chain_id = pdb_id.strip(), chain_id.strip()
        if len(pdb_id) != 4:
            raise Exception('A PDB ID of four characters was expected. "{0}" was passed.'.format(pdb_id))
        # The previous condition (5 <= len <= 0) could never be true, so bad
        # chain IDs were never rejected.
        if not (1 <= len(chain_id) <= 4):
            raise Exception('A chain ID of between 1-4 characters was expected. "{0}" was passed.'.format(chain_id))

        self.log('BLASTing {0}:{1}'.format(pdb_id, chain_id), silent)

        # Construct query
        # NOTE(review): the 'eCutOff', 'matrix' and 'sequenceIdentityCutoff'
        # keys read below are presumably added to query_data by
        # _construct_query - confirm.
        query_data = dict(
            structureId = pdb_id,
            chainId = chain_id,
        )
        xml_query = self._construct_query(query_data, cut_off = cut_off, matrix = matrix, sequence_identity_cut_off = sequence_identity_cut_off)

        # Read cached results
        if self.bio_cache:
            data = self.bio_cache.load_pdb_chain_blast(pdb_id, chain_id, query_data['eCutOff'], query_data['matrix'], query_data['sequenceIdentityCutoff'])
            if data:
                assert('query_date' in data)
                query_date = datetime.datetime.strptime(data['query_date'], BLAST.date_format)
                age_in_hours = ((datetime.datetime.now() - query_date).total_seconds()) / (3600.0)
                assert(age_in_hours > -24.01)
                if not self.force_lookup and age_in_hours < self.stale_period_in_hours:
                    return data['hits']

        # POST the request and parse the PDB hits
        result = self._post(xml_query)
        hits = [l.strip().split(':')[0] for l in result.split('\n') if l.strip()]
        if pdb_id not in hits:
            if hits:
                raise Exception('A BLAST of {0} chain {1} failed to find any hits for {0}. Is the chain a polypeptide chain?'.format(pdb_id, chain_id))
            try:
                p = self.bio_cache.get_pdb_object(pdb_id)
                chain_type = p.chain_types[chain_id]
                sequence_length = len(p.seqres_sequences[chain_id])
                if not(chain_type == 'Protein' or chain_type == 'Protein skeleton'):
                    colortext.warning('Chain {1} of {0} is a {2} chain.'.format(pdb_id, chain_id, chain_type))
                    hits = None # None suggests that the chain was not a protein chain whereas an empty list suggest a protein chain with no hits
                elif sequence_length < self.min_sequence_length:
                    colortext.warning('Chain {1} of {0} only contains {2} residues. The minimum sequence length is set to {3} residues so we will ignore this chain in matching.'.format(pdb_id, chain_id, sequence_length, self.min_sequence_length))
                    hits = None # None suggests that the chain was not a protein chain whereas an empty list suggest a protein chain with no hits
            except Exception:
                # Narrowed from a bare except so that KeyboardInterrupt and
                # SystemExit still propagate.
                raise colortext.Exception('Failed to determine the chain type for chain {1} of {0}.'.format(pdb_id, chain_id))

        query_data['hits'] = hits

        # Cache the results
        if self.bio_cache:
            self.bio_cache.save_pdb_chain_blast(pdb_id, chain_id, query_data['eCutOff'], query_data['matrix'], query_data['sequenceIdentityCutoff'], query_data)

        return query_data['hits']
Example #6
0
def load():
    """Return the module-level settings, reading them from disk on first use.

    The settings file has the same path as this module with the extension
    replaced by '.json'. If it does not exist, a template is created with
    create_template, a warning is shown, and the process exits with status 1.
    """
    global sys_settings
    if sys_settings:
        return sys_settings

    module_stem = os.path.splitext(os.path.abspath(__file__))[0]
    settings_file = module_stem + '.json'
    if not os.path.exists(settings_file):
        create_template(settings_file)
        colortext.warning('\nThe settings file {0} needs to be configured. Exiting.\n'.format(settings_file))
        sys.exit(1)

    sys_settings = NestedBunch(json.loads(read_file(settings_file)))
    return sys_settings
Example #7
0
def test_prediction_set():
    """Print the number of queued jobs per PDB file ID, and the total.

    Jobs are read from the module-level prediction_set through the PPI API.
    """
    ppi_api = get_ppi_api()
    queued_jobs = ppi_api.get_queued_jobs(prediction_set, order_by = 'Cost', order_order_asc = False, include_files = False, truncate_content = None)

    counts = {}
    total = 0
    for job in queued_jobs:
        pdb_file_id = job['Structure']['PDBFileID']
        counts[pdb_file_id] = counts.get(pdb_file_id, 0) + 1
        total += 1

    colortext.warning('Counts by PDB ID:')
    pprint.pprint(counts)
    colortext.warning('Total count: {0}'.format(total))
Example #8
0
def setup():
    """Populate the module-level PDB lookup tables and report existing database records.

    Reads temp/mutations_Gsp1_old.txt, loads each listed PDB from temp/pdbs,
    obtains the corresponding RCSB files with download_pdb, and then warns
    about any of those PDB files that already exist in the database or have
    associated complexes.
    """
    global pdb_file_paths  # RCSB PDB_ID -> PDB file
    global rcsb_pdb_objects # RCSB PDB_ID -> PDB object
    global tina_pdb_objects # Tina's PDB_ID -> PDB object
    global tina_pdb_id_to_rcsb_pdb_id # Tina's PDB_ID -> RCSB PDB_ID
    global mutations_dataframe

    # `not mutations_dataframe` raises ValueError once the global holds a
    # pandas DataFrame, so a second call to setup() would crash.
    # NOTE(review): assumes the unset value is None or an empty container - confirm.
    if mutations_dataframe is None or len(mutations_dataframe) == 0:
        setup_mutations_dataframe()

    # old_mutations_csv is missing some cases but has the mapping from pdb -> partner 1 name, partner 2 name
    old_mutations_csv = os.path.join('temp', 'mutations_Gsp1_old.txt')
    assert(os.path.exists('temp'))
    assert(os.path.exists(old_mutations_csv))

    df = pandas.read_csv(old_mutations_csv, sep = '\t')

    tina_pdb_ids = sorted(set(df['pdb'].values))
    rcsb_pdb_ids = set()
    for pdb_id in tina_pdb_ids:
        # The RCSB ID is the first four characters of Tina's ID.
        rcsb_pdb_ids.add(pdb_id[:4])
        tina_pdb_id_to_rcsb_pdb_id[pdb_id] = pdb_id[:4]
    rcsb_pdb_ids = sorted(rcsb_pdb_ids)

    # Both mutation files must cover the same set of RCSB structures.
    assert(rcsb_pdb_ids == sorted(set([p[:4] for p in mutations_dataframe['pdb'].values])))
    rcsb_file_dir = '../../rawdata'

    for pdb_id in tina_pdb_ids:
        tina_pdb_objects[pdb_id] = PDB.from_filepath(os.path.join('temp', 'pdbs', '{0}.pdb'.format(pdb_id)), parse_ligands = True)

    for pdb_id in rcsb_pdb_ids:
        filename = '{0}.pdb'.format(pdb_id.upper())
        pdb_file_paths[pdb_id.upper()] = os.path.join(rcsb_file_dir, filename)
        pdb_contents = download_pdb(pdb_id, rcsb_file_dir, silent = True, filename = filename)
        rcsb_pdb_objects[pdb_id] = PDB(pdb_contents, parse_ligands = True)

    print('\nRosetta files  ({0}) : {1}'.format(str(len(tina_pdb_ids)).rjust(2), ', '.join([s.rjust(5) for s in tina_pdb_ids])))
    print('Original files ({0}) : {1}\n'.format(str(len(rcsb_pdb_ids)).rjust(2), ', '.join([s.rjust(5) for s in rcsb_pdb_ids])))

    ppi_api = get_ppi_api()
    for pdb_id in pdb_file_paths:
        existing_records = ppi_api.DDG_db.execute_select('SELECT * FROM PDBFile WHERE ID=%s', parameters=(pdb_id,))
        if existing_records:
            colortext.warning('The PDB file {0} exists in the database.'.format(pdb_id))
        complex_ids = ppi_api.search_complexes_by_pdb_id(pdb_id)

        if complex_ids:
            colortext.warning('The PDB file {0} has associated complexes: {1}'.format(pdb_id, ', '.join(map(str, complex_ids))))
    print('')
def fix_1AYE_InputFiles(prediction_set):
    '''This is a once-off function which should only be run once per prediction set as each run changes the mutfile and this change should only occur once.

    For every 1AYE prediction in prediction_set, the pickled InputFiles are loaded and
    the second token of each mutation line in the MUTFILE is decremented by one.
    The function refuses to proceed (prints the offending jobs and returns) while
    any of those predictions is 'active' or 'queued'.

    NOTE(review): the visible code pickles the fixed mutfile into `p` but does not
    write it back to the database - confirm whether the rest of the function is missing.
    '''
    import pickle
    ddGdb = ddgdbapi.ddGDatabase()

    # Unique (PredictionID, Status) pairs for the 1AYE predictions in this set.
    BadPredictions = sorted(set([(r['PredictionID'], r['Status']) for r in ddGdb.execute_select('''
    SELECT Prediction.ID AS PredictionID, Status FROM Prediction INNER JOIN UserDataSetExperiment ON UserDataSetExperiment.ID=Prediction.UserDataSetExperimentID WHERE PredictionSet=%s AND PDBFileID='1AYE'
    ''', parameters=(prediction_set,))]))
    BadPredictionIDs = sorted(set([r[0] for r in BadPredictions]))
    print(BadPredictions)
    num_active = len([r for r in BadPredictions if r[1] == 'active'])
    num_queued = len([r for r in BadPredictions if r[1] == 'queued'])
    statuses = sorted(set([r[1] for r in BadPredictions]))
    # Bail out while any affected job is active or queued.
    if ('active' in statuses) or ('queued' in statuses):
        colortext.error("Cannot proceed - there are %d active jobs and %d queued in the list that need to be fixed up. Stop the DDG scheduler, remove the queued constraint, and rerun this function. " % (num_active, num_queued))
        if num_active:
            print("%d active jobs: %s" % (num_active, ", ".join([str(r[0]) for r in BadPredictions if r[1] == 'active'])))
        if num_queued:
            print("%d queued jobs: %s" % (num_queued, ", ".join([str(r[0]) for r in BadPredictions if r[1] == 'queued'])))
        return

    for PredictionID in BadPredictionIDs:
        r = ddGdb.execute_select("SELECT InputFiles FROM Prediction WHERE ID=%s", parameters=(PredictionID,))
        assert(len(r) == 1)
        r = r[0]

        # NOTE(review): pickle.loads executes arbitrary code on malicious input; this
        # is only safe if the database contents are trusted.
        InputFiles = pickle.loads(r['InputFiles'])
        # Comparing keys() to a list only holds on Python 2, where keys() returns a list.
        assert(InputFiles.keys() == ['MUTFILE'])
        mutfile = InputFiles['MUTFILE']

        colortext.message("\n%d" % PredictionID)

        colortext.warning('original')
        print(mutfile)

        # The first line must start with 'total' followed by the mutation count, and
        # the second line must repeat that count.
        lines = mutfile.split("\n")
        assert(lines[0].startswith('total'))
        num_muts = int(lines[0][5:])
        assert(lines[1] == str(num_muts))
        # Mutation lines follow the two header lines; shift the second token down by one.
        for x in range(2, num_muts + 2):
            mutline = lines[x]
            tokens = mutline.split()
            tokens[1] = str(int(tokens[1]) - 1)
            lines[x] = " ".join(tokens)

        new_mutfile = "\n".join(lines)
        colortext.warning('fixed')
        print(new_mutfile)

        # Re-pickle the corrected input files.
        p = pickle.dumps({'MUTFILE' : new_mutfile})
Example #10
0
def retrieve_ligand_diagram(pdb_ligand_code):
    """Download the RCSB diagram for a ligand and return the raw GIF bytes.

    The 600-pixel image is tried first, then the 270-pixel one. An image
    narrower than 100 pixels is treated as the site's failure picture. If both
    attempts look like failures, a warning is shown and None is returned.
    """
    from PIL import Image

    url_templates = (
        'http://www.rcsb.org/pdb/images/{0}_600.gif',
        'http://www.rcsb.org/pdb/images/{0}_270.gif',
    )
    for url_template in url_templates:
        stream = BytesIO(urllib.urlopen(url_template.format(pdb_ligand_code)).read())
        width, _ = Image.open(stream).size
        # Not a foolproof method - they may change the failure picture in future.
        if not (width < 100):
            stream.seek(0)
            return stream.read()

    colortext.warning('Could not find a diagram for ligand {0}. It is possible that the URLs have changed.'.format(pdb_ligand_code))
    return None
Example #11
0
def load():
    """Return the cached system settings, loading them from JSON if needed.

    The settings file shares this module's path, with the extension swapped
    for '.json'. When the file is missing, create_template writes a template,
    a warning is printed, and the process exits with status 1.
    """
    global sys_settings
    if sys_settings:
        return sys_settings

    base_path, _ = os.path.splitext(os.path.abspath(__file__))
    settings_file = base_path + '.json'
    if not os.path.exists(settings_file):
        create_template(settings_file)
        colortext.warning(
            '\nThe settings file {0} needs to be configured. Exiting.\n'.
            format(settings_file))
        sys.exit(1)

    raw_settings = json.loads(read_file(settings_file))
    sys_settings = NestedBunch(raw_settings)
    return sys_settings
Example #12
0
def test_sifts_module():
    """Retrieve the SIFTS mapping for a list of PDB IDs and report problem cases.

    Known problem categories (missing XML, bad mapping, no UniParc mapping)
    are reported as warnings. Any other exception is printed with its
    traceback and the PDB ID is appended to the local `failures` list.
    """
    failures = []
    ddG_pdb_ids = ['107L','108L','109L','110L','111L','112L','113L','114L','115L','118L','119L','120L','122L','123L','125L','126L','127L','128L','129L','130L','131L','137L','149L','150L','151L','160L','161L','162L','163L','164L','165L','168L','169L','171L','172L','173L','190L','191L','192L','195L','196L','1A23','1A2I','1A2P','1A3Y','1A43','1A4Y','1A53','1A5E','1A70','1A7A','1A7H','1A7V','1AAL','1AAR','1AAZ','1ABE','1ACB','1ADO','1ADW','1AG2','1AG4','1AG6','1AIE','1AIN','1AJ3','1AJQ','1AKK','1AKM','1AM7','1AMQ','1ANF','1ANK','1ANT','1AO6','1AON','1AOZ','1APC','1APL','1APS','1AQH','1AR1','1ARR','1ATJ','1ATN','1AU1','1AUT','1AV1','1AVR','1AX1','1AXB','1AYE','1AYF','1AZP','1B0O','1B26','1B5M','1B8J','1BAH','1BAN','1BAO','1BCX','1BD8','1BET','1BF4','1BFM','1BGD','1BGL','1BJP','1BKE','1BKS','1BLC','1BMC','1BNI','1BNL','1BNS','1BNZ','1BOY','1BP2','1BPI','1BPL','1BPR','1BPT','1BRF','1BRG','1BRH','1BRI','1BRJ','1BRK','1BSA','1BSB','1BSC','1BSD','1BSE','1BSR','1BTA','1BTI','1BTM','1BUJ','1BVC','1BVU','1BZO','1C0L','1C17','1C2R','1C52','1C53','1C5G','1C6P','1C9O','1CAH','1CBW','1CDC','1CEA','1CEY','1CHK','1CHO','1CHP','1CLW','1CM7','1CMB','1CMS','1COA','1COK','1COL','1CPM','1CSP','1CTS','1CUN','1CUS','1CVW','1CX1','1CX8','1CYC','1CYO','1D0X','1D1G','1DAQ','1DDN','1DE3','1DEC','1DEQ','1DFO','1DFX','1DHN','1DIL','1DIV','1DJU','1DKG','1DKT','1DLC','1DM0','1DO9','1DPM','1DTD','1DTO','1DVC','1DVF','1DVV','1DXX','1DYA','1DYB','1DYC','1DYD','1DYE','1DYF','1DYG','1DYJ','1E21','1E6K','1E6L','1E6M','1E6N','1EDH','1EFC','1EG1','1EHK','1EKG','1EL1','1ELV','1EMV','1EQ1','1ERU','1ESF','1ETE','1EVQ','1EW4','1EXG','1EZA','1F88','1FAJ','1FAN','1FC1','1FEP','1FGA','1FKB','1FKJ','1FLV','1FMK','1FMM','1FNF','1FR2','1FRD','1FTG','1FTT','1FXA','1G6N','1G6V','1G6W','1GA0','1GAD','1GAL','1GAY','1GAZ','1GB0','1GB2','1GB3','1GB7','1GBX','1GD1','1GF8','1GF9','1GFA','1GFE','1GFG','1GFH','1GFJ','1GFK','1GFL','1GFR','1GFT','1GFU','1GFV','1GKG','1GLH','1GLM','1GOB','1GPC','1GQ2','1GRL','1GRX','1GSD','1GTM',
'1GTX','1GUY','1GXE','1H09','1H0C','1H2I','1H7M','1H8V','1HA4','1HCD','1HEM','1HEN','1HEO','1HEP','1HEQ','1HER','1HEV','1HFY','1HFZ','1HGH','1HGU','1HIB','1HIC','1HIO','1HIX','1HK0','1HME','1HML','1HNG','1HNL','1HOR','1HQK','1HTI','1HUE','1HXN','1HYN','1HYW','1HZ6','1I4N','1I5T','1IAR','1IC2','1IDS','1IFB','1IFC','1IGS','1IGV','1IHB','1IMQ','1INQ','1INU','1IO2','1IOB','1IOF','1IOJ','1IR3','1IRL','1IRO','1ISK','1IX0','1J0X','1J4S','1J7N','1JAE','1JBK','1JHN','1JIW','1JJI','1JKB','1JNK','1JTD','1JTG','1JTK','1K23','1K3B','1K40','1K9Q','1KA6','1KBP','1KDN','1KDU','1KDX','1KEV','1KFD','1KFW','1KJ1','1KKJ','1KTQ','1KUM','1KVA','1KVB','1KVC','1L00','1L02','1L03','1L04','1L05','1L06','1L07','1L08','1L09','1L10','1L11','1L12','1L13','1L14','1L15','1L16','1L17','1L18','1L19','1L20','1L21','1L22','1L23','1L24','1L33','1L34','1L36','1L37','1L38','1L40','1L41','1L42','1L43','1L44','1L45','1L46','1L47','1L48','1L49','1L50','1L51','1L52','1L53','1L54','1L55','1L56','1L57','1L59','1L60','1L61','1L62','1L63','1L65','1L66','1L67','1L68','1L69','1L70','1L71','1L72','1L73','1L74','1L75','1L76','1L77','1L85','1L86','1L87','1L88','1L89','1L90','1L91','1L92','1L93','1L94','1L95','1L96','1L97','1L98','1L99','1LAV','1LAW','1LBI','1LFO','1LHH','1LHI','1LHJ','1LHK','1LHL','1LHM','1LHP','1LLI','1LMB','1LOZ','1LPS','1LRA','1LRE','1LRP','1LS4','1LSN','1LUC','1LVE','1LYE','1LYF','1LYG','1LYH','1LYI','1LYJ','1LZ1','1M7T','1MAX','1MBD','1MBG','1MCP','1MGR','1MJC','1MLD','1MSI','1MUL','1MX2','1MX4','1MX6','1MYK','1MYL','1N02','1N0J','1NAG','1NM1','1NZI','1OA2','1OA3','1OCC','1OH0','1OIA','1OKI','1OLR','1OMU','1ONC','1OPD','1ORC','1OSA','1OSI','1OTR','1OUA','1OUB','1OUC','1OUD','1OUE','1OUF','1OUG','1OUH','1OUI','1OUJ','1OVA','1P2M','1P2N','1P2O','1P2P','1P2Q','1P3J','1PAH','1PBA','1PCA','1PDO','1PGA','1PHP','1PII','1PIN','1PK2','1PMC','1POH','1PPI','1PPN','1PPP','1PQN','1PRE','1PRR','1Q5Y','1QEZ','1QGV','1QHE','1QJP','1QK1','1QLP','1QLX','1QM4','1QND','1QQR','1QQV','1QT6','1QT7','1QU0','1QU7','1QUW',
'1R2R','1RBN','1RBP','1RBR','1RBT','1RBU','1RBV','1RCB','1RDA','1RDB','1RDC','1REX','1RGC','1RGG','1RH1','1RHD','1RHG','1RIL','1RIS','1RN1','1ROP','1RRO','1RTB','1RTP','1RX4','1S0W','1SAK','1SAP','1SCE','1SEE','1SFP','1SHF','1SHG','1SHK','1SMD','1SPD','1SPH','1SSO','1STF','1STN','1SUP','1SYC','1SYD','1SYE','1SYG','1T3A','1T7C','1T8L','1T8M','1T8N','1T8O','1TBR','1TCA','1TCY','1TEN','1TFE','1TGN','1THQ','1TI5','1TIN','1TIT','1TLA','1TML','1TMY','1TOF','1TPE','1TPK','1TTG','1TUP','1TUR','1U5P','1UBQ','1UCU','1UOX','1URK','1UW3','1UWO','1UZC','1V6S','1VAR','1VFB','1VIE','1VQA','1VQB','1VQC','1VQD','1VQE','1VQF','1VQG','1VQH','1VQI','1VQJ','1W3D','1W4E','1W4H','1W99','1WIT','1WLG','1WPW','1WQ5','1WQM','1WQN','1WQO','1WQP','1WQQ','1WQR','1WRP','1WSY','1XAS','1XY1','1Y4Y','1Y51','1YAL','1YAM','1YAN','1YAO','1YAP','1YAQ','1YCC','1YEA','1YGV','1YHB','1YMB','1YNR','1YPA','1YPB','1YPC','1YPI','1Z1I','1ZNJ','200L','206L','216L','217L','219L','221L','224L','227L','230L','232L','233L','235L','236L','237L','238L','239L','240L','241L','242L','243L','244L','246L','247L','253L','254L','255L','2A01','2A36','2ABD','2AC0','2ACE','2ACY','2ADA','2AFG','2AIT','2AKY','2ASI','2ATC','2B4Z','2BBM','2BQA','2BQB','2BQC','2BQD','2BQE','2BQF','2BQG','2BQH','2BQI','2BQJ','2BQK','2BQM','2BQN','2BQO','2BRD','2CBR','2CHF','2CI2','2CPP','2CRK','2CRO','2DQJ','2DRI','2EQL','2FAL','2FHA','2FX5','2G3P','2GA5','2GSR','2GZI','2HEA','2HEB','2HEC','2HED','2HEE','2HEF','2HIP','2HMB','2HPR','2IFB','2IMM','2L3Y','2L78','2LZM','2MBP','2MLT','2NUL','2OCJ','2PDD','2PEC','2PEL','2PRD','2Q98','2RBI','2RN2','2RN4','2SNM','2SOD','2TMA','2TRT','2TRX','2TS1','2WSY','2ZAJ','2ZTA','3BCI','3BCK','3BD2','3BLS','3CHY','3D2A','3ECA','3FIS','3HHR','3MBP','3PGK','3PRO','3PSG','3SSI','3TIM','3VUB','451C','487D','4BLM','4CPA','4GCR','4LYZ','4SGB','4TLN','4TMS','5AZU','5CPV','5CRO','5MDH','5PEP','6TAA','7AHL','7PTI','8PTI','8TIM','9INS','9PCY',]
    # Drop the cases known to fail, grouped by failure category.
    for no_xml_case in ['1GTX', '1SEE', '1UOX', '1WSY', '1YGV', '2MBP']:
        ddG_pdb_ids.remove(no_xml_case)
    for bad_sifts_mapping_case in ['1N02', '487D']:
        ddG_pdb_ids.remove(bad_sifts_mapping_case)
    for no_pdb_uniprot_mapping_case in ['2IMM']:
        ddG_pdb_ids.remove(no_pdb_uniprot_mapping_case)

    # NOTE(review): these two assignments replace the filtered list above, so
    # only the last one ('1N02', '487D', '2IMM') is actually tested. This
    # looks like a debugging override - confirm before removing.
    ddG_pdb_ids = ['1GTX', '1SEE', '1UOX', '1WSY', '1YGV', '2MBP']
    ddG_pdb_ids = ['1N02', '487D'] + ['2IMM']

    count = 1
    num_cases = len(ddG_pdb_ids)
    for pdb_id in ddG_pdb_ids:
        try:
            print('Case %d/%d: %s' % (count, num_cases, pdb_id))
            sifts_map = SIFTS.retrieve(pdb_id, cache_dir = cache_dir, acceptable_sequence_percentage_match = 80.0)
        except MissingSIFTSRecord:
            colortext.warning('No SIFTS XML exists for %s.' % pdb_id)
        except BadSIFTSMapping:
            colortext.warning('The SIFTS mapping for %s was considered a bad mapping at the time of writing.' % pdb_id)
        except NoSIFTSPDBUniParcMapping:
            colortext.warning('The SIFTS file for %s does not map to UniParc sequences at the time of writing.' % pdb_id)
        except Exception as e:
            colortext.warning(str(e))
            colortext.error(traceback.format_exc())
            failures.append(pdb_id)
        count += 1
Example #13
0
def test_sequences(b, sequences):
    """Run b.by_sequence on each sequence and print the hits.

    A sequence whose lookup raises is reported as 'FAILED' and recorded in the
    local failed_cases list as (sequence, message, traceback).
    """
    failed_cases = []
    total = len(sequences)
    for index, sequence in enumerate(sequences, 1):
        try:
            colortext.message('\n{0}/{1}: {2}'.format(index, total, sequence))
            hits = b.by_sequence(sequence)
            if not hits:
                colortext.warning('No hits')
            else:
                colortext.warning('{0} hits: {1}'.format(len(hits), ','.join(hits)))
        except Exception as e:
            colortext.error('FAILED')
            failed_cases.append((sequence, str(e), traceback.format_exc()))
Example #14
0
    def updateEvents(self, calendar_id, newEvents):
        """Bring the calendar in line with newEvents.

        Events are keyed by a start-date/title key. A calendar event is
        removed when it has no matching entry in newEvents, except that events
        whose title contains "birthday" are never removed. An entry of
        newEvents is added when it has no matching calendar event. Two events
        match when their end dates and titles are equal and the calendar
        event's location starts with the new event's location.
        """
        currentEvents = self.getEventsTable(calendar_id)

        def is_same_event(on_calendar, incoming):
            # The calendar location only needs to start with the incoming one.
            return (incoming["enddate"] == on_calendar["enddate"]
                    and on_calendar["location"].startswith(incoming["location"])
                    and str(incoming["title"]) == str(on_calendar["title"]))

        # Events which are on the calendar but not in newEvents
        toRemove = []
        for key in sorted(currentEvents):
            on_calendar = currentEvents[key]
            if on_calendar["title"].find("birthday") != -1:
                # Don't remove birthdays
                continue
            incoming = newEvents.get(key)
            if incoming and is_same_event(on_calendar, incoming):
                continue
            toRemove.append(key)

        # Events which are in newEvents but not on the calendar
        toAdd = []
        for key in sorted(newEvents):
            on_calendar = currentEvents.get(key)
            if on_calendar and is_same_event(on_calendar, newEvents[key]):
                continue
            toAdd.append(key)

        if toRemove:
            colortext.error("Removing these %d events:" % len(toRemove))
            for key in toRemove:
                colortext.warning(key)
                self.removeEvent(calendar_id, currentEvents[key]["event"].id)

        if toAdd:
            colortext.message("Adding these %d events:" % len(toAdd))
            for key in toAdd:
                incoming = newEvents[key]
                self.addNewEvent(calendar_id, key[0], incoming["enddate"], incoming["location"], incoming["title"])
Example #15
0
    def AddPublishedDDGsToAnalysisTables(self):
        """Attach the published dataset DDG to every analysis point that has one.

        For each analysis set in self.analysis_tables, the published scores
        are looked up by (section, recordnumber) and stored on the point as
        DatasetPublishedDDG. Points without a section or record number are
        skipped. When no published score is found and self.quiet_level >= 1,
        a warning is printed.
        """
        ddGdb = self.ddGdb
        for AnalysisSet, analysis_table in self.analysis_tables.items():
            published_dataset_scores = PublishedDatasetScores(ddGdb, AnalysisSet).scores

            for analysis_point in analysis_table.points:
                section = analysis_point.section
                recordnumber = analysis_point.recordnumber
                if not (section and recordnumber):
                    continue
                if published_dataset_scores.get(section) and published_dataset_scores[section].get(recordnumber):
                    analysis_point.DatasetPublishedDDG = published_dataset_scores[section][recordnumber]["PublishedDatasetDDG"]
                elif self.quiet_level >= 1:
                    # The message previously referenced the undefined names
                    # Section and RecordNumber, which raised NameError.
                    colortext.warning("No published dataset score found for %s-%s-%s." % (AnalysisSet, section, recordnumber))
Example #16
0
def test_pdb_files(b, pdb_ids):
    """Run b.by_pdb on each PDB ID and print the hits.

    A PDB ID whose lookup raises is reported as 'FAILED' and recorded in the
    local failed_cases list as (pdb_id, message, traceback).
    """
    failed_cases = []
    total = len(pdb_ids)
    for index, pdb_id in enumerate(pdb_ids, 1):
        try:
            colortext.message('\n{0}/{1}: {2}'.format(index, total, pdb_id))
            hits = b.by_pdb(pdb_id)
            if not hits:
                colortext.warning('No hits')
            else:
                colortext.warning('{0} hits: {1}'.format(len(hits), ','.join(hits)))
        except Exception as e:
            colortext.error('FAILED')
            failed_cases.append((pdb_id, str(e), traceback.format_exc()))
Example #17
0
    def add_company_quarter(self, company_name, quarter_name, dt, calendar_id = 'notices'):
        '''Adds a "<company_name> <quarter_name> Quarter begins" event on date dt.

        dt should be a date object. Existing events between the day before dt
        and two days after it are searched first; if one already contains the
        summary, nothing is added. Returns True if the event was added and
        False otherwise.'''

        assert calendar_id in self.configured_calendar_ids.keys()
        target_calendar = self.configured_calendar_ids[calendar_id]

        quarter_name = quarter_name.title()
        quarter_numbers = {'Spring' : 1, 'Summer' : 2, 'Fall' : 3, 'Winter' : 4}
        assert quarter_name in quarter_numbers.keys()

        # Search window: from midnight on the day before dt, for three days less one second.
        midnight = datetime(year=dt.year, month=dt.month, day=dt.day, hour=0, minute=0, second=0, tzinfo=self.timezone)
        start_time = midnight + timedelta(days = -1)
        end_time = start_time + timedelta(days = 3, seconds = -1)
        summary = '%s %s Quarter begins' % (company_name, quarter_name)

        # Do not add the quarter multiple times
        events = self.get_events(start_time.isoformat(), end_time.isoformat(), ignore_cancelled = True)
        if any(event.summary.find(summary) != -1 for event in events):
            return False

        all_day = {'date' : dt.isoformat(), 'timeZone' : self.timezone_string}
        event_body = {
            'summary' : summary,
            'description' : summary,
            'start' : dict(all_day),
            'end' : dict(all_day),
            'status' : 'confirmed',
            'gadget' : {
                'display' : 'icon',
                'iconLink' : 'https://guybrush.ucsf.edu/images/Q%d_32.png' % quarter_numbers[quarter_name],
                'title' : summary,
            },
            'extendedProperties' : {
                'shared' : {
                    'event_type' : '%s quarter' % company_name,
                    'quarter_name' : quarter_name
                }
            }
        }
        colortext.warning('\n%s\n' % pprint.pformat(event_body))
        self.service.events().insert(calendarId = target_calendar, body = event_body).execute()
        return True
Example #18
0
    def CreateAnalysisTables(self):
        """Build one AnalysisTable per analysis set and store them on self.analysis_tables.

        Each experimental record that has a predicted score for the same
        experiment ID and PDB ID becomes an AnalysisPoint. Records without a
        prediction are counted, and reported when self.quiet_level >= 1. A
        summary line is appended to self.description.
        """
        ddGdb = self.ddGdb
        predictions = PredictionScores(ddGdb, self.PredictionSet, self.ddG_score_type, score_cap = self.score_cap)
        predicted_scores = predictions.Predictions

        summary = "Analyzing %d predictions in PredictionSet '%s' for UserDataSet '%s'. " % (predictions.NumberOfPredictions, predictions.PredictionSet.replace("_", "\_"), predictions.UserDataSetName)
        if not self.score_cap:
            summary += "Running analysis over the following analysis sets: '%s'." % (join(predictions.AnalysisSets, "', '"))
        else:
            summary += "Running analysis over the following analysis sets: '%s' with predicted scores capped at +-%0.2f." % (join(predictions.AnalysisSets, "', '"), self.score_cap)
        self.description.append(("black", summary))
        if self.quiet_level >= 1:
            colortext.message("Analyzing %d predictions in PredictionSet '%s' for UserDataSet '%s'." % (predictions.NumberOfPredictions, predictions.PredictionSet, predictions.UserDataSetName))
            colortext.message("Running analysis over the following analysis sets: '%s'." % (join(predictions.AnalysisSets, "', '")))

        tables = {}
        for AnalysisSet in predictions.AnalysisSets:
            table = AnalysisTable()
            experiments = UserDataSetExperimentalScores(ddGdb, predictions.UserDataSetID, AnalysisSet)

            num_records = 0
            num_missing = 0
            for section, sectiondata in sorted(experiments.iteritems()):
                for recordnumber, record in sorted(sectiondata.iteritems()):
                    num_records += 1
                    PDB_ID = record["PDB_ID"]
                    ExperimentID = record["ExperimentID"]
                    ExperimentalDDG = record["ExperimentalDDG"]
                    if not (predicted_scores.get(ExperimentID) and predicted_scores[ExperimentID].get(PDB_ID)):
                        num_missing += 1
                        continue
                    PredictedDDG = predicted_scores[ExperimentID][PDB_ID]["PredictedDDG"]
                    table.add(AnalysisPoint(ExperimentalDDG, PredictedDDG, ExperimentID = ExperimentID, PDB_ID = PDB_ID, section = section, recordnumber = recordnumber))

            if num_missing > 0 and self.quiet_level >= 1:
                missing_message = "Missing %d predictions out of %d records for analysis set %s." % (num_missing, num_records, AnalysisSet)
                self.description.append(("Bittersweet", missing_message))
                colortext.warning(missing_message)
            tables[AnalysisSet] = table

        self.analysis_tables = tables
Example #19
0
    def remove_all_cancelled_events(self, calendar_ids = []):
        """Print the cancelled events found in the given calendars.

        When calendar_ids is empty, self.calendar_ids is used. For a cancelled
        recurring event, its occurrences between 2010 and 2015 are fetched
        with get_recurring_events (maxResults = 10) and printed. As written,
        this method only reports events and does not delete anything.
        """
        for calendar_id in calendar_ids or self.calendar_ids:
            colortext.message('Removing cancelled events in %s' % calendar_id)
            listing = self.service.events().list(calendarId = self.configured_calendar_ids[calendar_id]).execute()
            print(len(listing['items']))

            for raw_event in listing['items']:
                nb = DeepNonStrictNestedBunch(raw_event)
                if not (nb.status == 'cancelled'):
                    continue
                if not nb.recurringEventId:
                    colortext.warning(nb)
                    continue

                colortext.warning(nb.recurringEventId)
                # Retrieve all occurrences of the recurring event within the timeframe
                window_start = datetime(year=2010, month=1, day=1, tzinfo=self.timezone).isoformat()
                window_end = datetime(year=2015, month=1, day=1, tzinfo=self.timezone).isoformat()
                for occurrence in self.get_recurring_events(calendar_id, nb.id, window_start, window_end, maxResults = 10):
                    print(occurrence)
Example #20
0
def print_existing_experimental_data():
    """Print any complex and mutation data already known for three PDB IDs.

    These PDB files existed in the database before the import, so this shows
    whether any of the experimental data matches the requested predictions.

    For each of '1A2K', '1K5D' and '1I2M' the matching complex (at most one is
    expected) is pretty-printed, followed by the rows of the module-level
    mutations_dataframe whose 'pdb' column contains the PDB ID.

    Finally, asserts that the database holds no PPMutagenesisPDBMutation
    records for complexes 202, 119 and 176.
    """
    print('')
    ppi_api = get_ppi_api()
    for pdb_id in ['1A2K', '1K5D', '1I2M']:
        colortext.message(pdb_id)
        complex_ids = ppi_api.search_complexes_by_pdb_id(pdb_id)
        if complex_ids:
            assert(len(complex_ids) == 1)
            complex_id = complex_ids[0]
            colortext.warning('Complex #{0}'.format(complex_id))
            pprint.pprint(ppi_api.get_complex_details(complex_id))

        # Substring match on the 'pdb' column
        mutation_records = mutations_dataframe[mutations_dataframe['pdb'].str.contains(pdb_id)]
        # Lift the display limits so that the whole frame is printed
        with pandas.option_context('display.max_rows', None, 'display.max_columns', None):
            # print() with a single argument behaves identically under Python 2 and 3
            print(mutation_records)

    # There is no experimental binding affinity data at present
    assert(not(ppi_api.DDG_db.execute_select('SELECT * FROM PPMutagenesisPDBMutation WHERE PPComplexID IN (202, 119, 176) ORDER BY PPComplexID, Chain, ResidueID, MutantAA')))
Example #21
0
def import_structures():
    """Add every complex/structure pair defined in tinas_complexes.json to the database.

    The definitions are processed in sorted key order. For each one, the
    structure's 'file_path' entry is replaced by the matching object from the
    module-level tina_pdb_objects before the pair is handed to
    ppi_api.add_complex_structure_pair. When the call reports failure, the
    error and any 'possible_matches' it returned are printed.

    create_project_pdb_records() is called once all definitions are processed.
    """
    setup()
    ppi_api = get_ppi_api()
    complex_definitions = json.loads(read_file('tinas_complexes.json'))
    name_fields = ('Name', 'ShortName', 'HTMLName')

    for tina_pdb_id in sorted(complex_definitions):
        definition_pair = complex_definitions[tina_pdb_id]
        colortext.warning(tina_pdb_id)

        # Swap the file path for the in-memory PDB object
        structure_definition = definition_pair['Structure']
        del structure_definition['file_path']
        structure_definition['pdb_object'] = tina_pdb_objects[tina_pdb_id]

        pdb_set = ppi_api.add_complex_structure_pair(
            definition_pair,
            keywords = ['GSP1'],
            force = True,
            trust_database_content = False,
            allow_missing_params_files = False,
            debug = False,
        )

        import_failed = pdb_set['success'] == False
        if import_failed:
            print(pdb_set['error'])
            if 'possible_matches' in pdb_set:
                for candidate in pdb_set['possible_matches']:
                    colortext.warning(candidate['ID'])
                    # One line for the left ('L') partner names, one for the right ('R')
                    for side in ('L', 'R'):
                        names = [candidate[side + field].encode('utf-8').strip() for field in name_fields]
                        print('{0}, {1}, {2}'.format(*names))

    create_project_pdb_records()
Example #22
0
def main():
    """Import the primers.tsv file of every registered user found under the import path.

    The import path is read from the DBConstants table ('import_path'). Each
    sub-folder whose name matches a user's lab_username is checked for a
    'primers.tsv' file, which is handed to parse(). Problems reported by
    parse() through case_errors, and any exception it raises, are printed as
    warnings and appended to a local `errors` list.

    NOTE(review): `errors` is collected but not used further in the visible code.
    """

    # Set up the database session
    dbi = DatabaseInterface(can_email = True)
    tsession = dbi.get_session()

    # Create a map from usernames to the database IDs (typically initials)
    user_map = {}
    for u in tsession.query(Users):
        user_map[u.lab_username] = u.ID

    # Print the banner: admin contacts and registered users (sorted by database ID)
    colortext.message('\nPrimers import script')
    colortext.pcyan('Database admin contacts: {0}'.format(', '.join(dbi.get_admin_contacts())))
    registered_users = ['{0} ({1})'.format(v, k) for k, v in sorted(user_map.iteritems(), key = lambda x: x[1])]
    colortext.warning('Registered users: {0}\n'.format(', '.join(registered_users)))

    # Read the import path from the database
    errors = []
    import_path = tsession.query(DBConstants).filter(DBConstants.Parameter == u'import_path').one().Value
    import_path_folders = sorted([d for d in os.listdir(import_path) if os.path.isdir(os.path.join(import_path,d))])
    for ipf in import_path_folders:
        # Only folders named after a registered user are considered
        if ipf not in user_map:
            continue

        user_folder = os.path.join(import_path, ipf)
        user_id = user_map[ipf]
        primers_file = os.path.join(user_folder, 'primers.tsv')
        if not os.path.exists(primers_file):
            continue

        case_errors = []
        try:
            parse(dbi, primers_file, user_id, case_errors)
            if case_errors:
                errors.append("Errors occurred processing '{0}':\n\t{1}".format(primers_file, '\n\t'.join(case_errors)))
                colortext.warning(errors[-1])
        except Exception as e:
            # Deliberately broad: one user's bad file must not stop the other imports
            errors.append("Errors occurred processing '{0}': {1}\n\t{2}\n{3}".format(primers_file, str(e), '\n\t'.join(case_errors), traceback.format_exc()))
            colortext.warning('Error: {0}\n{1}'.format(str(e), traceback.format_exc()))
Example #23
0
    def AddPublishedDDGsToAnalysisTables(self):
        """Attach the published dataset DDG value to each analysis point.

        For every analysis set in self.analysis_tables, the scores are loaded
        via PublishedDatasetScores(self.ddGdb, AnalysisSet).scores and looked up
        by (section, recordnumber). When a score is found it is stored on the
        point as DatasetPublishedDDG; otherwise a warning is printed when
        self.quiet_level >= 1. Points without a section or record number are
        skipped.
        """
        ddGdb = self.ddGdb
        analysis_tables = self.analysis_tables
        for AnalysisSet, analysis_table in analysis_tables.iteritems():
            published_dataset_scores = PublishedDatasetScores(ddGdb, AnalysisSet).scores

            for analysis_point in analysis_table.points:
                section = analysis_point.section
                recordnumber = analysis_point.recordnumber
                if not (section and recordnumber):
                    continue

                section_scores = published_dataset_scores.get(section)
                record_scores = section_scores.get(recordnumber) if section_scores else None
                if record_scores:
                    analysis_point.DatasetPublishedDDG = record_scores["PublishedDatasetDDG"]
                elif self.quiet_level >= 1:
                    # Uses the local section/recordnumber; the previous capitalised
                    # names were undefined and raised NameError here.
                    colortext.warning(
                        "No published dataset score found for %s-%s-%s."
                        % (AnalysisSet, section, recordnumber))
Example #24
0
    def _runRScript(RScript):
        """Run an R script with `R CMD BATCH` and return the contents of its .Rout file.

        :param RScript: The R script, passed to rosettahelper.writeTempFile to
            create a temporary script file in the current directory.
        :return: The contents of '<script>.Rout', or None if that file was not created.
        :raises colortext.Exception: If R exits with a non-zero code. The .Rout
            contents (when present) are printed as a warning first.

        The temporary script and the .Rout file are deleted before returning.
        """
        rscriptname = rosettahelper.writeTempFile(".", RScript)
        p = subprocess.Popen(["R", "CMD", "BATCH", rscriptname])
        # Block until R exits; no pipes are attached so wait() cannot deadlock
        errcode = p.wait()
        rout = "%s.Rout" % rscriptname
        delete_file(rscriptname)

        rout_contents = None
        if os.path.exists(rout):
            rout_contents = rosettahelper.readFile(rout)

        if errcode != 0:
            if os.path.exists(rout):
                colortext.warning(rout_contents)
                delete_file(rout)
            raise colortext.Exception("The R script failed with error code %d." % errcode)
        delete_file(rout)
        return rout_contents
def error_by_error_scatterplot(output_directory, file_prefix, df,
                             reference_series_index, x_series_index, y_series_index,
                             x_color, y_color,
                             x_series_name = None, y_series_name = None,
                             plot_title = '', x_axis_label = '', y_axis_label = '', similarity_range = 0.25,
                             add_similarity_range_annotation = True,
                             shape_by_category = False, shape_category_series_index = None, shape_category_title = 'Case',
                             label_series_index = None, label_outliers = True,
                             use_geom_text_repel = True,
                             ):

    """ Creates a scatterplot of error versus error intended to show which computational method (X or Y) has the least amount of error relative to a reference series.

        The absolute differences |reference_series - x_series| and |reference_series - y_series| are computed and these
        differences (errors) are plotted against each other.

        :param output_directory: The output directory. It is created if it does not exist.
        :param file_prefix: A prefix for the generated files. A CSV file with the plot points, the R script, and the R output is saved along with the plot itself.
        :param df: A pandas dataframe. Note: The dataframe is zero-indexed. A copy is taken; the caller's frame is not modified.
        :param reference_series_index: The numerical index of the reference series e.g. experimental data.
        :param x_series_index: The numerical index of the X-axis series e.g. predictions from a computational method.
        :param y_series_index: The numerical index of the Y-axis series e.g. predictions from a second computational method.
        :param x_color: The color of the "method X is better" points.
        :param y_color: The color of the "method Y is better" points.
        :param x_series_name: A name for the X-series which is used in the classification legend. Required (must not be None).
        :param y_series_name: A name for the Y-series which is used in the classification legend. Required (must not be None).
        :param plot_title: Plot title.
        :param x_axis_label: X-axis label.
        :param y_axis_label: Y-axis label.
        :param similarity_range: A point (x, y) is considered as similar if |x - y| <= similarity_range.
        :param add_similarity_range_annotation: If true then the similarity range is included in the plot.
        :param shape_by_category: Boolean. If set then points are shaped by the column identified with shape_category_series_index. Otherwise, points are shaped by classification ("X is better", "Y is better", or "Similar"). Ignored unless shape_category_series_index is an integer.
        :param shape_category_series_index: The numerical index of the series used to choose point shapes.
        :param shape_category_title: The title of the shape legend.
        :param label_series_index: The numerical index of the series used to label outliers.
        :param label_outliers: Boolean. If set then label outliers using the column identified with label_series_index. Ignored unless label_series_index is an integer.
        :param use_geom_text_repel: Boolean. If set then the ggrepel package is used to avoid overlapping labels.

        This function was adapted from the Kortemme Lab covariation benchmark (https://github.com/Kortemme-Lab/covariation).
        todo: I need to check that ggplot2 is respecting the color choices. It may be doing its own thing.
    """
    try:
        os.mkdir(output_directory)
    except OSError:
        # Typically "already exists"; the assertion below catches genuine failures
        pass
    assert (os.path.exists(output_directory))

    has_shape_series = isinstance(shape_category_series_index, int)
    if not has_shape_series:
        shape_by_category = False
    if not isinstance(label_series_index, int):
        label_outliers = False
    assert(x_series_name is not None and y_series_name is not None)

    df = df.copy()
    headers = df.columns.values

    # R point shapes, filled shapes first
    legal_shapes = list(range(15, 25 + 1)) + list(range(0, 14 + 1))
    if has_shape_series:
        # Positional column lookup (the series indices are documented as numerical indices)
        num_categories = len(set(df.iloc[:, shape_category_series_index].values))
        if num_categories > len(legal_shapes):
            colortext.warning('Too many categories ({0}) to plot using meaningful shapes.'.format(num_categories))
            shape_by_category = False
        else:
            legal_shapes = legal_shapes[:num_categories]

    # Append the error columns; each index is that of the column just added
    df['X_error'] = abs(df[headers[reference_series_index]] - df[headers[x_series_index]])
    x_error_index = len(df.columns.values) - 1
    df['Y_error'] = abs(df[headers[reference_series_index]] - df[headers[y_series_index]])
    y_error_index = len(df.columns.values) - 1

    # Classify each row by which series has the smaller error (or as similar)
    df['Classification'] = df.apply(lambda r: _classify_smallest_error(r['X_error'], r['Y_error'], similarity_range, x_series_name, y_series_name), axis = 1)
    error_classification_index = len(df.columns.values) - 1

    # Create the R script
    boxplot_r_script = '''
library(ggplot2)
library(gridExtra)
library(scales)
library(qualV)
library(grid)'''
    if use_geom_text_repel:
        boxplot_r_script +='''
library(ggrepel) # install with 'install.packages("ggrepel")' inside the R interactive shell.
'''
    boxplot_r_script += '''

# PNG generation
png('%(file_prefix)s.png', width=2560, height=2048, bg="white", res=600)
txtalpha <- 0.8
redtxtalpha <- 0.8

%(png_plot_commands)s
        '''

    # Save the plot points next to the script; the R script reads them back by filename
    xy_table_filename = '{0}.txt'.format(file_prefix)
    xy_table_filepath = os.path.join(output_directory, xy_table_filename)

    data_table = df.to_csv(header = True, index = False)
    write_file(xy_table_filepath, data_table)

    main_plot_script = '''
# Set the margins
par(mar=c(5, 5, 1, 1))

xy_data <- read.csv('%(xy_table_filename)s', header=T)

names(xy_data)[%(x_error_index)d + 1] <- "xerrors"
names(xy_data)[%(y_error_index)d + 1] <- "yerrors"
'''

    if label_outliers:
        main_plot_script +='''names(xy_data)[%(label_series_index)d + 1] <- "outlier_labels"'''
    if has_shape_series:
        # Only emitted when an index was supplied: %d cannot format the default None
        main_plot_script +='''
names(xy_data)[%(shape_category_series_index)d + 1] <- "categories"'''
    main_plot_script +='''

xy_data[%(x_error_index)d + 1]
xy_data[%(y_error_index)d + 1]

# coefs contains two values: (Intercept) and yerrors
coefs <- coef(lm(xerrors~yerrors, data = xy_data))
fitcoefs = coef(lm(xerrors~0 + yerrors, data = xy_data))
fitlmv_yerrors <- as.numeric(fitcoefs[1])
lmv_intercept <- as.numeric(coefs[1])
lmv_yerrors <- as.numeric(coefs[2])
lm(xy_data$yerrors~xy_data$xerrors)

xlabel <- "%(x_axis_label)s"
ylabel <- "%(y_axis_label)s"
plot_title <- "%(plot_title)s"
rvalue <- cor(xy_data$yerrors, xy_data$xerrors)

# Alphabetically, "Similar" < "X" < "Y" so the logic below works
countsim <- paste("Similar =", dim(subset(xy_data, Classification=="Similar"))[1])
countX <- paste("%(x_series_name)s =", dim(subset(xy_data, Classification=="%(x_series_name)s"))[1])
countY <- paste("%(y_series_name)s =", dim(subset(xy_data, Classification=="%(y_series_name)s"))[1])

countX
countY
countsim

# Set graph limits and the position for the correlation value

minx <- min(0.0, min(xy_data$xerrors) - 0.1)
miny <- min(0.0, min(xy_data$yerrors) - 0.1)
maxx <- max(1.0, max(xy_data$xerrors) + 0.1)
maxy <- max(1.0, max(xy_data$yerrors) + 0.1)

# Create a square plot (x-range = y-range)
minx <- min(minx, miny)
miny <- minx
maxx <- max(maxx, maxy)
maxy <- maxx

xpos <- maxx / 25.0
ypos <- maxy - (maxy / 25.0)
ypos_2 <- maxy - (2 * maxy / 25.0)


plot_scale <- scale_color_manual(
    "Counts",
    values = c( "Similar" = '#444444', "%(x_series_name)s" = '%(x_color)s', "%(y_series_name)s" ='%(y_color)s'),
    labels = c( "Similar" = countsim,  "%(x_series_name)s" = countX,        "%(y_series_name)s" = countY) )'''

    if add_similarity_range_annotation:
        main_plot_script += '''
# Polygon denoting the similarity range. We turn off plot clipping below (gt$layout$clip) so we need to be more exact than using 4 points when defining the region
boxy_mc_boxface <- data.frame(
  X = c(minx - 0,                        maxx - %(similarity_range)f, maxx + 0, maxx + 0,                       0 + %(similarity_range)f, 0),
  Y = c(minx - 0 + %(similarity_range)f, maxx + 0,                    maxx + 0, maxx + 0 -%(similarity_range)f, 0, 0 )
)'''
    else:
        main_plot_script += '''
# Polygon denoting the similarity range. We turn off plot clipping below (gt$layout$clip) so we need to be more exact than using 4 points when defining the region
boxy_mc_boxface <- data.frame(
  X = c(minx - 1, maxx + 1, maxx + 1, minx - 1),
  Y = c(minx - 1 + %(similarity_range)f, maxx + 1 + %(similarity_range)f, maxx + 1 - %(similarity_range)f, minx - 1 - %(similarity_range)f)
)'''

    if shape_by_category:
        main_plot_script += '''
# Plot
p <- qplot(main="", xerrors, yerrors, data=xy_data, xlab=xlabel, ylab=ylabel, alpha = I(txtalpha), shape=factor(categories), col=factor(Classification)) +'''
    else:
        main_plot_script += '''
# Plot
p <- qplot(main="", xerrors, yerrors, data=xy_data, xlab=xlabel, ylab=ylabel, alpha = I(txtalpha), shape=factor(Classification), col=factor(Classification)) +'''

    main_plot_script += '''
geom_polygon(data=boxy_mc_boxface, aes(X, Y), fill = "#bbbbbb", alpha = 0.4, color = "darkseagreen", linetype="blank", inherit.aes = FALSE, show.legend = FALSE) +
plot_scale +
geom_point() +
guides(col = guide_legend()) +
labs(title = "%(plot_title)s") +
theme(plot.title = element_text(color = "#555555", size=rel(0.75))) +
theme(axis.title = element_text(color = "#555555", size=rel(0.6))) +
theme(legend.title = element_text(color = "#555555", size=rel(0.45)), legend.text = element_text(color = "#555555", size=rel(0.4))) +
coord_cartesian(xlim = c(minx, maxx), ylim = c(miny, maxy)) + # set the graph limits
annotate("text", hjust=0, size = 2, colour="#222222", x = xpos, y = ypos, label = sprintf("R = %%0.2f", round(rvalue, digits = 4))) + # add correlation text; hjust=0 sets left-alignment. Using annotate instead of geom_text avoids blocky text caused by geom_text being run multiple times over the series'''

    # Legend title. Defined unconditionally because the scale_shape_manual template
    # below references it even when outliers are not labelled.
    counts_title = 'Counts'
    if add_similarity_range_annotation:
        counts_title += '*'

    if label_outliers:
        if use_geom_text_repel:
            main_plot_script += '''

# Label outliers
geom_text_repel(size=1.5, segment.size = 0.15, color="#000000", alpha=0.6, data=subset(xy_data, abs(yerrors - xerrors) > maxx/3 & xerrors <= maxx / 2 & yerrors >=maxy/2), aes(xerrors, yerrors-maxy/100, label=outlier_labels)) +
geom_text_repel(size=1.5, segment.size = 0.15, color="#000000", alpha=0.6, data=subset(xy_data, abs(yerrors - xerrors) > maxx/3 & xerrors <= maxx / 2 & yerrors < maxy/2), aes(xerrors, yerrors+2*maxy/100, label=outlier_labels)) +
geom_text_repel(size=1.5, segment.size = 0.15, color="#000000", alpha=0.6, data=subset(xy_data, abs(yerrors - xerrors) > maxx/3 & xerrors > maxx / 2 & yerrors >=maxy/2), aes(xerrors, yerrors-maxy/100, label=outlier_labels)) +
geom_text_repel(size=1.5, segment.size = 0.15, color="#000000", alpha=0.6, data=subset(xy_data, abs(yerrors - xerrors) > maxx/3 & xerrors > maxx / 2 & yerrors < maxy/2), aes(xerrors, yerrors+2*maxy/100, label=outlier_labels)) +'''
        else:
            main_plot_script += '''

# Label outliers
geom_text(hjust = 0, size=1.5, color="#000000", alpha=0.6, data=subset(xy_data, abs(yerrors - xerrors) > maxx/3 & xerrors <= maxx / 2 & yerrors >=maxy/2), aes(xerrors, yerrors-maxy/100, label=outlier_labels)) +
geom_text(hjust = 0, size=1.5, color="#000000", alpha=0.6, data=subset(xy_data, abs(yerrors - xerrors) > maxx/3 & xerrors <= maxx / 2 & yerrors < maxy/2), aes(xerrors, yerrors+2*maxy/100, label=outlier_labels)) +
geom_text(hjust = 1, size=1.5, color="#000000", alpha=0.6, data=subset(xy_data, abs(yerrors - xerrors) > maxx/3 & xerrors > maxx / 2 & yerrors >=maxy/2), aes(xerrors, yerrors-maxy/100, label=outlier_labels)) +
geom_text(hjust = 1, size=1.5, color="#000000", alpha=0.6, data=subset(xy_data, abs(yerrors - xerrors) > maxx/3 & xerrors > maxx / 2 & yerrors < maxy/2), aes(xerrors, yerrors+2*maxy/100, label=outlier_labels)) +'''

        main_plot_script += '''


#geom_text(hjust = 0, size=1.5, color="#000000", alpha=0.6, data=subset(xy_data, abs(yvalues - xvalues) > 2 & xvalues <= 0), aes(xvalues, yvalues+0.35, label=Origin_of_peptide), check_overlap = TRUE) + # label outliers
#geom_text(hjust = 1, size=1.5, color="#000000", alpha=0.6, data=subset(xy_data, abs(yvalues - xvalues) > 2 & xvalues > 0), aes(xvalues, yvalues+0.35, label=Origin_of_peptide), check_overlap = TRUE) + # label outliers




scale_colour_manual('%(counts_title)s', values = c('#444444', '%(x_color)s', '%(y_color)s'),
                    labels = c( "Similar" = countsim,  "%(x_series_name)s" = countX,        "%(y_series_name)s" = countY)) +'''

    if shape_by_category:
        legal_shapes_str = ', '.join(map(str, legal_shapes))
        main_plot_script += '''
scale_shape_manual('%(shape_category_title)s', values = c(%(legal_shapes_str)s),
                    labels = c( "Similar" = countsim,  "%(x_series_name)s" = countX,        "%(y_series_name)s" = countY))'''

    else:
        main_plot_script += '''
scale_shape_manual('%(counts_title)s', values = c(18, 16, 15),
                    labels = c( "Similar" = countsim,  "%(x_series_name)s" = countX,        "%(y_series_name)s" = countY))'''

    if add_similarity_range_annotation:
        main_plot_script += '''+
    # Add a caption
    annotation_custom(grob = textGrob(gp = gpar(fontsize = 5), hjust = 0, sprintf("* Similar \\u225d \\u00b1 %%0.2f", round(%(similarity_range)f, digits = 2))), xmin = maxx + (2 * maxx / 10), ymin = -1, ymax = -1)'''

    main_plot_script += '''

# Plot graph
p
    '''
    if add_similarity_range_annotation:
        main_plot_script += '''
# Code to override clipping
gt <- ggplot_gtable(ggplot_build(p))
gt$layout$clip[gt$layout$name=="panel"] <- "off"
grid.draw(gt)'''

    main_plot_script +='''
dev.off()
'''

    # Fill in the templates. locals() supplies every %(name)s placeholder, so the
    # local variable names above are part of the template contract.
    plot_type = 'png'
    png_plot_commands = main_plot_script % locals()
    boxplot_r_script = boxplot_r_script % locals()
    r_script_filename = '{0}.R'.format(file_prefix)
    r_script_filepath = os.path.join(output_directory, r_script_filename)
    write_file(r_script_filepath, boxplot_r_script)

    # Run the R script from the output directory so its relative filenames resolve
    run_r_script(r_script_filename, cwd = output_directory)
def determine_structure_scores(DDG_api, skip_if_we_have_pairs = 50):
    """Populate PredictionStructureScore for every completed prediction set.

    :param DDG_api: The API object. Its ddGDB attribute is used for queries and
        inserts, and its get_ddg_monomer_scores_per_structure and
        determine_best_pair methods are called per prediction.
    :param skip_if_we_have_pairs: If not None, a prediction's per-structure
        scores are skipped when it already has exactly this many 'WildType' and
        this many 'Mutant' records (avoids the slower per-structure extraction).

    For each prediction set this runs two passes:
      1. A sanity check of the stored score data.
      2. For every prediction that is not failed/queued/postponed: insert the
         prediction-level DDG scores (StructureID = -1), then the per-structure
         wildtype and mutant scores, and finally assert that a best pair of
         structures can be determined.

    :raises Exception: If a checked prediction has not been scored with Noah's method.
    :raises colortext.Exception: If the Scores field of a prediction cannot be read.
    """
    ddGdb = DDG_api.ddGDB
    ddGdb_utf = ddgdbapi.ddGDatabase(use_utf = True)
    # Get the list of completed prediction sets
    completed_prediction_sets = get_completed_prediction_sets(DDG_api)
    print(completed_prediction_sets)

    # NOTE(review): this literal looks like mojibake for u'8\xc5 radius' ("8 Angstrom radius").
    # It is kept as-is because it must match the stored Parameters value - confirm against the database.
    radius_parameters = u'8Ã… radius'

    # Create the mapping from the old score types to the ScoreMethod record IDs
    ScoreMethodMap = {}
    results = ddGdb_utf.execute('SELECT * FROM ScoreMethod')
    for r in results:
        if r['MethodName'] == 'Global' and r['MethodType'] == 'Protocol 16':
            ScoreMethodMap[("kellogg", "total")] = r['ID']
        if r['Authors'] == 'Noah Ollikainen':
            if r['MethodName'] == 'Local' and r['MethodType'] == 'Position' and r['Parameters'] == radius_parameters:
                ScoreMethodMap[("noah_8,0A", "positional")] = r['ID']
            if r['MethodName'] == 'Local' and r['MethodType'] == 'Position (2-body)' and r['Parameters'] == radius_parameters:
                ScoreMethodMap[("noah_8,0A", "positional_twoscore")] = r['ID']
            if r['MethodName'] == 'Global' and r['MethodType'] == 'By residue' and r['Parameters'] == radius_parameters:
                ScoreMethodMap[("noah_8,0A", "total")] = r['ID']

    # The (score type, inner score type) pairs that are stored; anything else is ignored
    handled_score_keys = set([
        ("kellogg", "total"),
        ("noah_8,0A", "positional"),
        ("noah_8,0A", "positional_twoscore"),
        ("noah_8,0A", "total"),
    ])
    record_key_fields = ['PredictionID', 'ScoreMethodID', 'ScoreType', 'StructureID']

    # For each completed prediction set, determine the structure scores
    for prediction_set in completed_prediction_sets:
        predictions = ddGdb.execute('SELECT ID, ddG, Scores, status, ScoreVersion FROM Prediction WHERE PredictionSet=%s ORDER BY ID', parameters=(prediction_set,))
        num_predictions = len(predictions)

        # Pass #1: Iterate over all Predictions and make sure that they have completed and contain all the scores we expect
        colortext.message('Prediction set: %s' % prediction_set)
        colortext.warning('Checking that all data exists...')
        for prediction in predictions:
            PredictionID = prediction['ID']
            # NOTE(review): this hard-coded filter restricts the check to a single prediction
            # and looks like a debugging leftover. Kept to preserve behaviour - confirm and remove.
            if PredictionID != 72856:
                continue
            # NOTE(review): pickle.loads on database content; only safe if the database is trusted.
            global_scores = pickle.loads(prediction['ddG'])
            assert(global_scores)
            assert(prediction['ScoreVersion'] == 0.23)
            if not prediction['Scores']:
                raise Exception("This prediction needs to be scored with Noah's method.")

            gs2 = json.loads(prediction['Scores'])
            if not any(k.find('noah') != -1 for k in gs2['data'].keys()):
                raise Exception("This prediction needs to be scored with Noah's method.")
            assert (gs2['data']['kellogg'] == global_scores['data']['kellogg'])

        # Pass #2: Iterate over all Predictions. For each one that is not
        # failed/queued/postponed, determine and store the structure scores
        count = 0
        for prediction in predictions:

            count += 1
            PredictionID = prediction['ID']
            colortext.message('%s: %d of %d (Prediction #%d)' % (prediction_set, count, num_predictions, PredictionID))

            if prediction['status'] == 'failed':
                colortext.error('Skipping failed prediction %d.' % PredictionID)
                continue
            if prediction['status'] == 'queued':
                colortext.warning('Skipping queued prediction %d.' % PredictionID)
                continue
            if prediction['status'] == 'postponed':
                colortext.printf('Skipping postponed prediction %d.' % PredictionID, 'cyan')
                continue

            # Store the ensemble scores
            try:
                global_scores = json.loads(prediction['Scores'])['data']
            except Exception:
                raise colortext.Exception("Failed reading the Scores field's JSON object. The Prediction Status is %(status)s. The Scores field is: '%(Scores)s'." % prediction)
            for score_type, inner_data in global_scores.iteritems():
                for inner_score_type, data in inner_data.iteritems():
                    score_key = (score_type, inner_score_type)
                    if score_key not in handled_score_keys:
                        # Unrecognised score types are silently skipped
                        continue

                    # Only the kellogg total score carries per-term components
                    components = {}
                    if score_key == ("kellogg", "total"):
                        components = data['components']
                    ddG = data['ddG']

                    ScoreMethodID = ScoreMethodMap[score_key]
                    new_record = dict(
                        PredictionID = PredictionID,
                        ScoreMethodID = ScoreMethodID,
                        ScoreType = 'DDG',
                        StructureID = -1, # This score is for the Prediction rather than a structure
                        DDG = ddG,
                    )
                    # The component names must not clash with the record's own fields
                    assert(not(set(components.keys()).intersection(set(new_record.keys()))))
                    new_record.update(components)
                    ddGdb.insertDictIfNew('PredictionStructureScore', new_record, record_key_fields)

            if skip_if_we_have_pairs is not None:
                # Skip this case if we have a certain number of existing records (much quicker since we do not have to extract the binary)
                num_wt = ddGdb.execute_select("SELECT COUNT(ID) AS NumRecords FROM PredictionStructureScore WHERE PredictionID=%s AND ScoreType='WildType'", parameters=(PredictionID,))[0]['NumRecords']
                num_mut = ddGdb.execute_select("SELECT COUNT(ID) AS NumRecords FROM PredictionStructureScore WHERE PredictionID=%s AND ScoreType='Mutant'", parameters=(PredictionID,))[0]['NumRecords']
                print(num_wt, num_mut)
                if num_wt == num_mut and num_mut == skip_if_we_have_pairs:
                    continue

            # Store the ddg_monomer scores for each structure (wildtype records first, then mutant)
            grouped_scores = DDG_api.get_ddg_monomer_scores_per_structure(PredictionID)
            for structure_type in ('WildType', 'Mutant'):
                for structure_id, structure_scores in sorted(grouped_scores[structure_type].iteritems()):
                    new_record = dict(
                        PredictionID = PredictionID,
                        ScoreMethodID = ScoreMethodMap[("kellogg", "total")],
                        ScoreType = structure_type,
                        StructureID = structure_id,
                        DDG = None,
                    )
                    new_record.update(structure_scores)
                    ddGdb.insertDictIfNew('PredictionStructureScore', new_record, record_key_fields)

            # Test to make sure that we can pick a best pair of structures (for generating a PyMOL session)
            assert(DDG_api.determine_best_pair(PredictionID) != None)
Example #27
0
def extract_analysis_data(dataset_list_file, output_directory, data_extraction_method, expectn, top_x, prefix, test_mode = False):
    '''This is the main function in this script and is where the basic analysis is compiled.

       dataset_list_file lists the benchmark cases, one path per line. The case ID is the file name without its
       extension and must be four characters long. Blank lines are ignored.
       output_directory should contain the results of the prediction run.
       data_extraction_method should be a function pointer to the method-specific function used to retrieve the prediction results e.g. get_kic_run_details
       expectn specifies how many predictions we expect to find (useful in case some jobs failed).
       top_x specifies how many of the best-scoring predictions should be used to generate the TopX metric results e.g.
       the Top5 RMSD metric value measures the lowest RMSD amongst the five best-scoring structures.
       prefix is used to name the output files.
       test_mode restricts the analysis to the first ten cases and is also passed on to data_extraction_method.

       Three files are written: <prefix>analysis.csv and <prefix>analysis.tsv (both receive the same tab-separated
       summary table) and <prefix>percentage_subangstrom_over_top_X.tsv (the input for the TopX plot).

       The reference structures and loop definitions are read from ../input/structures/12_res, relative to the
       current working directory.

       The process exits with status 1 if a case does not have exactly expectn predictions after truncation.
    '''

    # Sanity check
    assert(top_x <= expectn)

    # Set up reference structures
    structures_folder = os.path.join('..', 'input', 'structures', '12_res')
    rcsb_references = os.path.join(structures_folder, 'rcsb', 'reference')
    rosetta_references = os.path.join(structures_folder, 'rosetta', 'reference')

    # Set up the per-case statistics dicts
    best_scoring_structures = {}
    median_scoring_structures = {}
    worst_scoring_structures = {}
    total_percent_subanstrom = {}
    top_x_percent_subanstrom = {}
    top_x_loop_prediction_sets = {}

    # Set up the input file used to generate the graph plotting the "percentage of subangstrom models" metric over
    # varying values of X used to select the TopX structures
    percentage_subangstrom_over_top_X_plot_input = ['PDB\tX\tPercentage of subangstrom cases for TopX']
    percent_subangrom_by_top_x = {}

    # Set up the summary analysis file
    csv_file = ['\t'.join(['PDB ID', 'Models', '%<1.0A', 'Top{0} %<1.0A'.format(top_x), 'Best score', 'Top{0} score'.format(top_x), 'Median score', 'Worst score', 'Closest score', 'Top1 RMSD', 'Top{0} RMSD'.format(top_x), 'Closest RMSD'])]

    # Read in the benchmark input
    pdb_ids = [os.path.splitext(os.path.split(s.strip())[1])[0] for s in get_file_lines(dataset_list_file) if s.strip()]

    # Truncate the benchmark input for test mode
    if test_mode:
        pdb_ids = pdb_ids[:10]

    # Analyze the performance for each case in the benchmark
    for pdb_id in pdb_ids:

        rcsb_reference_pdb = os.path.join(rcsb_references, pdb_id + '.pdb')
        assert(os.path.exists(rcsb_reference_pdb))
        rosetta_reference_pdb = os.path.join(rosetta_references, pdb_id + '.pdb')
        assert(os.path.exists(rosetta_reference_pdb))
        assert(len(pdb_id) == 4)
        loops_file = os.path.join(structures_folder, 'rosetta', 'pruned', '{0}.loop.json'.format(pdb_id))
        loop_sets = json.loads(read_file(loops_file))
        assert(len(loop_sets['LoopSet']) == 1)

        # Create a container for loop predictions
        loop_prediction_set = LoopPredictionSet()

        # Read the coordinates from the reference PDB file
        rcsb_reference_matrix = PDB.extract_xyz_matrix_from_loop_json(PDB.from_filepath(rcsb_reference_pdb).structure_lines, loop_sets, atoms_of_interest = backbone_atoms, expected_num_residues = 12, expected_num_residue_atoms = 4)
        rosetta_reference_matrix = PDB.extract_xyz_matrix_from_loop_json(PDB.from_filepath(rosetta_reference_pdb).structure_lines, loop_sets, atoms_of_interest = backbone_atoms, expected_num_residues = 12, expected_num_residue_atoms = 4)

        colortext.wgreen('\n\nReading in the run details for {0}:'.format(pdb_id))
        details = data_extraction_method(output_directory, pdb_id, loop_sets, test_mode = test_mode)
        for d in details:
            # The RMSD is left unset here; it is filled in by compute_rmsds below
            loop_prediction_set.add(d['id'], d['score'], pdb_id = pdb_id, rmsd = None, pdb_path = d['predicted_structure'], pdb_loop_residue_matrix = d['pdb_loop_residue_matrix'])
        print(' Done')

        # Compute the RMSD for this case for the structure using the pandas dataframe
        # It is more efficient to do this after truncation if truncating by score but in the general case users will
        # probably want to consider all predictions. If not (e.g. for testing) then arbitrary subsets can be chosen
        # in the loop above
        colortext.wgreen('Computing RMSDs for {0}:'.format(pdb_id))
        loop_prediction_set.compute_rmsds(rcsb_reference_matrix)
        loop_prediction_set.check_rmsds(rosetta_reference_matrix)
        print(' Done\n')

        # Truncate the structures to the top expectn-scoring files
        loop_prediction_set.sort_by_score()
        loop_prediction_set.truncate(expectn)
        if len(loop_prediction_set) != expectn:
            print('Error: Expected {0} structures but only found {1}.'.format(expectn, len(loop_prediction_set)))
            sys.exit(1)

        # Create a new set containing the top-X-scoring structures and identify the median-scoring structure
        top_x_loop_prediction_sets[pdb_id] = loop_prediction_set[:top_x]
        median_scoring_structures[pdb_id] = loop_prediction_set[int(expectn / 2)]

        # Determine the lowest-/best-scoring structure
        best_scoring_structures[pdb_id] = loop_prediction_set[0]
        best_score = best_scoring_structures[pdb_id].score
        worst_scoring_structures[pdb_id] = loop_prediction_set[-1]
        worst_score = worst_scoring_structures[pdb_id].score
        assert(top_x_loop_prediction_sets[pdb_id][0] == best_scoring_structures[pdb_id])

        # Print structures
        colortext.warning('Top{0} structures'.format(top_x))
        print(top_x_loop_prediction_sets[pdb_id])
        colortext.warning('Top1 structure')
        print(best_scoring_structures[pdb_id])
        colortext.warning('Median (by score) structure')
        print(median_scoring_structures[pdb_id])
        colortext.warning('Lowest-scoring structures')
        print(worst_scoring_structures[pdb_id])

        # Create values for TopX variable plot
        loop_prediction_set.sort_by_score()
        for top_x_var in range(1, len(loop_prediction_set) + 1):
            new_subset = loop_prediction_set[:top_x_var]
            percent_subangstrom = 100 * new_subset.fraction_with_rmsd_lt(1.0)
            percentage_subangstrom_over_top_X_plot_input.append('{0}\t{1}\t{2}'.format(pdb_id, top_x_var, percent_subangstrom))
            percent_subangrom_by_top_x[top_x_var] = percent_subangrom_by_top_x.get(top_x_var, {})
            percent_subangrom_by_top_x[top_x_var][pdb_id] = percent_subangstrom

        # Note: the values stored and reported below are percentages (100 * fraction), not counts
        total_percent_subanstrom[pdb_id] = 100 * loop_prediction_set.fraction_with_rmsd_lt(1.0)
        top_x_percent_subanstrom[pdb_id] = 100 * top_x_loop_prediction_sets[pdb_id].fraction_with_rmsd_lt(1.0)
        colortext.warning('Number of sub-angstrom cases in the full set of {0}: {1}'.format(expectn, total_percent_subanstrom[pdb_id]))
        colortext.warning('Number of sub-angstrom cases in the TopX structures: {1}'.format(expectn, top_x_percent_subanstrom[pdb_id]))

        # Re-sort by RMSD so that the first entry is the model closest to the reference
        loop_prediction_set.sort_by_rmsd()
        closest_rmsd = loop_prediction_set[0].rmsd
        closest_score = loop_prediction_set[0].score
        colortext.warning('RMSD of closest model: {0}'.format(closest_rmsd))
        colortext.warning('Score of closest model: {0}'.format(closest_score))

        top_1_rmsd = best_scoring_structures[pdb_id].rmsd

        # Find the lowest-RMSD structure amongst the TopX structures, breaking RMSD ties by the lower score
        top_x_rmsd = best_scoring_structures[pdb_id].rmsd
        top_x_score = best_scoring_structures[pdb_id].score
        for s in top_x_loop_prediction_sets[pdb_id]:
            if (s.rmsd < top_x_rmsd) or (s.rmsd == top_x_rmsd and s.score < top_x_score):
                top_x_rmsd = s.rmsd
                top_x_score = s.score
        assert(top_x_score <= worst_score)
        assert(top_x_rmsd <= top_1_rmsd)

        print('Top 1 RMSD (predicted vs Rosetta/RCSB reference structure): {0}'.format(top_1_rmsd))
        print('Top {0} RMSD (predicted vs Rosetta/RCSB reference structure): {1}'.format(top_x, top_x_rmsd))

        csv_file.append('\t'.join(map(str, [pdb_id, expectn, total_percent_subanstrom[pdb_id], top_x_percent_subanstrom[pdb_id], best_score, top_x_score, median_scoring_structures[pdb_id].score, worst_score, closest_score, top_1_rmsd, top_x_rmsd, closest_rmsd])))

    # Add a column of median percent subangstrom values
    # Floor division keeps the index an integer on Python 3 as well (true division would yield a float there);
    # for an even number of cases this selects the upper of the two middle values, as before.
    median_index = len(pdb_ids) // 2
    for top_x_var, values_by_pdb in sorted(percent_subangrom_by_top_x.items()):
        assert(sorted(values_by_pdb.keys()) == sorted(pdb_ids))
        median_value = sorted(values_by_pdb.values())[median_index]
        percentage_subangstrom_over_top_X_plot_input.append('Median\t{0}\t{1}'.format(top_x_var, median_value))

    # The .csv and .tsv files intentionally receive identical (tab-separated) content
    write_file('{0}analysis.csv'.format(prefix), '\n'.join(csv_file))
    write_file('{0}analysis.tsv'.format(prefix), '\n'.join(csv_file))
    write_file('{0}percentage_subangstrom_over_top_X.tsv'.format(prefix), '\n'.join(percentage_subangstrom_over_top_X_plot_input))
Example #28
0
    def CreateAnalysisTables(self):
        '''Pairs experimental DDG values with predicted DDG values for each analysis set.

        The predictions are loaded through PredictionScores for self.PredictionSet. For every analysis set it
        reports, an AnalysisTable is filled with one AnalysisPoint per experimental record that has a predicted
        score for the same ExperimentID and PDB_ID. The resulting dict (analysis set -> AnalysisTable) is stored in
        self.analysis_tables.

        A summary sentence is always appended to self.description. The console messages and the report of
        records without a prediction are only emitted when self.quiet_level >= 1.
        '''
        ddGdb = self.ddGdb
        predictions = PredictionScores(ddGdb, self.PredictionSet, self.ddG_score_type, score_cap=self.score_cap)
        predicted_scores = predictions.Predictions

        # Build the summary sentence. Underscores in the prediction set name are backslash-escaped in this
        # sentence only; the console message below uses the raw name.
        summary = "Analyzing %d predictions in PredictionSet '%s' for UserDataSet '%s'. " % (
            predictions.NumberOfPredictions,
            predictions.PredictionSet.replace("_", "\_"),
            predictions.UserDataSetName)
        if self.score_cap:
            summary += "Running analysis over the following analysis sets: '%s' with predicted scores capped at +-%0.2f." % (
                join(predictions.AnalysisSets, "', '"), self.score_cap)
        else:
            summary += "Running analysis over the following analysis sets: '%s'." % join(predictions.AnalysisSets, "', '")
        self.description.append(("black", summary))

        if self.quiet_level >= 1:
            colortext.message(
                "Analyzing %d predictions in PredictionSet '%s' for UserDataSet '%s'."
                % (predictions.NumberOfPredictions, predictions.PredictionSet, predictions.UserDataSetName))
            colortext.message(
                "Running analysis over the following analysis sets: '%s'." % join(predictions.AnalysisSets, "', '"))

        # One table per analysis set
        tables_by_analysis_set = {}
        for AnalysisSet in predictions.AnalysisSets:
            table = AnalysisTable()
            experiments = UserDataSetExperimentalScores(ddGdb, predictions.UserDataSetID, AnalysisSet)

            num_records = 0
            num_without_prediction = 0
            for section, sectiondata in sorted(experiments.iteritems()):
                for recordnumber, record_data in sorted(sectiondata.iteritems()):
                    num_records += 1
                    PDB_ID = record_data["PDB_ID"]
                    ExperimentID = record_data["ExperimentID"]
                    ExperimentalDDG = record_data["ExperimentalDDG"]

                    # A record only contributes a point when a prediction exists for both its experiment and PDB ID
                    scores_for_experiment = predicted_scores.get(ExperimentID)
                    if not (scores_for_experiment and scores_for_experiment.get(PDB_ID)):
                        num_without_prediction += 1
                        continue

                    PredictedDDG = scores_for_experiment[PDB_ID]["PredictedDDG"]
                    point = AnalysisPoint(ExperimentalDDG,
                                          PredictedDDG,
                                          ExperimentID=ExperimentID,
                                          PDB_ID=PDB_ID,
                                          section=section,
                                          recordnumber=recordnumber)
                    table.add(point)

            if num_without_prediction > 0 and self.quiet_level >= 1:
                missing_message = "Missing %d predictions out of %d records for analysis set %s." % (
                    num_without_prediction, num_records, AnalysisSet)
                self.description.append(("Bittersweet", missing_message))
                colortext.warning(missing_message)
            tables_by_analysis_set[AnalysisSet] = table

        self.analysis_tables = tables_by_analysis_set
Example #29
0
    c = 0
    for pdb_id in pdb_ids:
        try:
            c += 1
            colortext.message('\n{0}/{1}: {2}'.format(c, len(pdb_ids), pdb_id))
            hits = b.by_pdb(pdb_id)
            if hits:
                colortext.warning('{0} hits: {1}'.format(len(hits), ','.join(hits)))
            else:
                colortext.warning('No hits')
        except Exception, e:
            colortext.error('FAILED')
            failed_cases.append((pdb_id, str(e), traceback.format_exc()))

    if failed_cases:
        colortext.warning('*** These cases failed ***')
        for p in failed_cases:
            print('')
            colortext.pcyan(p[0])
            colortext.error(p[1])
            print(p[2])
        print('')


def test_sequences(b, sequences):
    failed_cases = []
    c = 0
    for sequence in sequences:
        try:
            c += 1
            colortext.message('\n{0}/{1}: {2}'.format(c, len(sequences), sequence))
Example #30
0
                pdb_chain_to_pfam_mapping[pdb_id][chain_id].add(pfam_acc)

                pfam_to_pdb_chain_mapping[pfam_acc] = pfam_to_pdb_chain_mapping.get(pfam_acc, set())
                pfam_to_pdb_chain_mapping[pfam_acc].add(pdb_key)

        self.pdb_chain_to_pfam_mapping = pdb_chain_to_pfam_mapping
        self.pfam_to_pdb_chain_mapping = pfam_to_pdb_chain_mapping


    def get_pfam_accession_numbers_from_pdb_id(self, pdb_id):
        '''Returns a shallow copy of the chain -> Pfam accession mapping stored for pdb_id, or None.

           The lookup is case-insensitive (pdb_id is lowercased first). None is returned when the PDB ID is
           unknown or its stored mapping is empty.
           Note: an alternative is to use the RCSB API e.g. http://www.rcsb.org/pdb/rest/hmmer?structureId=1cdg.'''
        chain_mapping = self.pdb_chain_to_pfam_mapping.get(pdb_id.lower())
        if not chain_mapping:
            return None
        return chain_mapping.copy()

    def get_pfam_accession_numbers_from_pdb_chain(self, pdb_id, chain):
        '''Returns the Pfam accession numbers stored for one chain of pdb_id, or None if there is no entry.

           The PDB ID is matched case-insensitively (it is lowercased first); the chain is matched as given.
           The stored object itself is returned, not a copy.
           Note: an alternative is to use the RCSB API e.g. http://www.rcsb.org/pdb/rest/hmmer?structureId=1cdg.'''
        chains_for_pdb = self.pdb_chain_to_pfam_mapping.get(pdb_id.lower(), {})
        return chains_for_pdb.get(chain)

    def get_pdb_chains_from_pfam_accession_number(self, pfam_acc):
        '''Returns the PDB chain keys stored for the Pfam accession number pfam_acc, or None if it is unknown.

           The accession number is matched exactly as given and the stored object itself is returned, not a copy.'''
        pdb_chains = self.pfam_to_pdb_chain_mapping.get(pfam_acc)
        return pdb_chains


if __name__ == '__main__':
    # Small manual demonstration of the three Pfam lookups; the results are only printed.
    pfam_api = Pfam()

    # Chain-level lookups
    for demo_pdb_id, demo_chain in (('1TVA', 'A'), ('1CDG', 'A')):
        colortext.warning(pfam_api.get_pfam_accession_numbers_from_pdb_chain(demo_pdb_id, demo_chain))

    # Structure-level lookup (mixed-case ID on purpose: the lookup lowercases the PDB ID)
    colortext.warning(pfam_api.get_pfam_accession_numbers_from_pdb_id('1A2c'))

    # Reverse lookup from a Pfam accession number
    colortext.message(pfam_api.get_pdb_chains_from_pfam_accession_number('PF14716'))
                # requires this at the time of writing)
                write_file(
                    os.path.join(output_directory,
                                 '{0}.loop'.format(pdb_prefix)),
                    loop_file_content)

            sys.stdout.write('.')
            sys.stdout.flush()
        print('')


if __name__ == '__main__':
    # Script entry point: make sure the output directory exists, then add the missing residues.
    from libraries import docopt
    arguments = docopt.docopt(__doc__)
    output_directory = arguments['<output_directory>']

    # Try to create the output directory. A failure is only reported if the directory is still missing
    # afterwards, so an already existing directory is accepted silently.
    mkdir_error, trc = None, ''
    try:
        os.mkdir(output_directory)
    except Exception as e:
        # Keep our own reference: Python 3 unbinds the 'as' target when the except clause ends, so the
        # error could not be reported below through 'e' itself.
        mkdir_error = e
        trc = traceback.format_exc()
    if not os.path.exists(output_directory):
        colortext.error('Error: Could not create the output directory.')
        if mkdir_error: colortext.error(str(mkdir_error))
        colortext.warning(trc)

    #create_pruned_structures(output_directory)
    add_missing_residues(output_directory)
Example #32
0
 def add_kortemme_degrado_joint_meeting(self, calendar_id, start_dt, end_dt, location, presenters, summary = None, description = None, visibility = 'default', username_map = {}, email_map = {}):
     '''Builds a locked 'Kortemme/DeGrado joint meeting' lab meeting event and prints it.

        The event is created through BasicEvent.create_lab_meeting and its pretty-printed form is written with
        colortext.warning. Nothing is returned, and calendar_id is accepted but not used by this method.
     '''
     event_options = dict(location = location, summary = summary, description = description, visibility = visibility, username_map = username_map, email_map = email_map)
     meeting_builder = BasicEvent(self, start_dt, end_dt, **event_options)
     meeting = meeting_builder.create_lab_meeting('Kortemme/DeGrado joint meeting', presenters, locked = True)
     colortext.warning(pprint.pformat(meeting))
Example #33
0
def main(FixedIDs = [], radii = [6.0, 7.0, 8.0, 9.0]):
    max_processors = get_number_of_processors()

    rescore_process_file = "/tmp/klab_rescore.txt"
    parser = OptionParser()
    parser.add_option("-n", "--numprocesses", default=1, type='int', dest="num_processes", help="The number of processes used for the rescoring. The cases are split according to this number.", metavar="NUM_PROCESSES")
    parser.add_option("-p", "--process", default=1, type='int', dest="process", help="The ID of this process. This should be an integer between 1 and the number of processes used for the rescoring.", metavar="PROCESS_ID")
    parser.add_option("-d", "--delete",  action="store_true", dest="delete", help="Delete the process tracking file %s." % rescore_process_file)
    parser.add_option("-s", "--set",  type='string', dest="prediction_set", help="The prediction set to rescore.")
    (options, args) = parser.parse_args()

    if options.delete and os.path.exists(rescore_process_file):
        print("Removing %s." % rescore_process_file)
        os.remove(rescore_process_file)

    num_processes = options.num_processes
    prediction_set = options.prediction_set
    process_id = options.process

    for i in FixedIDs:
        assert(type(i) == type(1))

    # SELECT * FROM `Prediction` WHERE `PredictionSet`= 'RosCon2013_P16_score12prime'  AND Status='done' LIMIT 1
    # Check prediction set
    if not prediction_set:
        raise colortext.Exception("A prediction set must be specified.")
    else:
        if FixedIDs:
            results = ddGdb.execute("SELECT DISTINCT PredictionSet FROM Prediction WHERE ID IN (%s)" % ",".join(map(str, FixedIDs)))
            if len(results) != 1:
                raise colortext.Exception("Error: The fixed IDs cover %d different prediction sets." % len(results))
        else:
            results = ddGdb.execute("SELECT ID FROM PredictionSet WHERE ID=%s", parameters=(prediction_set,))
        if not results:
            raise colortext.Exception("The prediction set '%s' does not exist in the database." % prediction_set)

    if num_processes < 1:
        raise colortext.Exception("At least 1 processor must be used.")
    if num_processes > max_processors:
        raise colortext.Exception("Only %d processors/cores were detected. Cannot run with %d processes." % (max_processors, num_processes))
    if num_processes > (max_processors * 0.75):
        colortext.warning("Warning: Using %d processors/cores out of %d which is %0.2f%% of the total available." % (num_processes, max_processors, (100.0*float(num_processes)/float(max_processors))))
    if not(1 <= process_id <= min(max_processors, num_processes)):
        raise colortext.Exception("The process ID %d must be between 1 and the number of processes, %d." % (process_id, num_processes))

    if os.path.exists(rescore_process_file):
        lines = readFileLines(rescore_process_file)
        idx = lines[0].find("numprocesses")
        if idx == -1:
            raise Exception("Badly formatted %s." % rescore_process_file)
        existing_num_processes = int(lines[0][idx+len("numprocesses"):])
        if existing_num_processes != num_processes:
            raise colortext.Exception("You specified the number of processes to be %d but %s already specifies it as %d." % (num_processes, rescore_process_file, existing_num_processes))
        for line in [line for line in lines[1:] if line.strip()]:
            idx = line.find("process")
            if idx == -1:
                raise colortext.Exception("Badly formatted %s. Line is '%s'." % (rescore_process_file, line))
            existing_process = int(line[idx+len('process'):])
            if process_id == existing_process:
                raise colortext.Exception("Process %d is already logged as running. Check if this is so and edit %s." % (process_id, rescore_process_file))
        F = open(rescore_process_file, 'a')
        F.write("process %d\n" % process_id)
        F.close()
    else:
        F = open(rescore_process_file, 'w')
        F.write("numprocesses %d\n" % num_processes)
        F.write("process %d\n" % process_id)
        F.close()

    output_dir = os.path.join('rescoring', str(process_id))
    if not(os.path.exists(output_dir)):
        os.makedirs(output_dir)
    abs_output_dir = os.path.abspath(os.path.join(os.getcwd(), output_dir))
    print("Running process in %s.\n" % abs_output_dir)

    ReallyFixedIDs = False

    results = ddGdb.execute("SELECT ID, ExperimentID, Scores FROM Prediction WHERE PredictionSet=%s AND Status='done' AND ScoreVersion <> %s", parameters=(prediction_set, float(current_score_revision),))
    if not(FixedIDs) and results:
        raise WrongScoreRevisionException("Score versions found which are not %s. Need to update table structure." % current_score_revision)
    else:
        # Hacky way to run multiple processes
        if ReallyFixedIDs:
            num_to_score = len(remaining_unscored)
            num_for_this_to_score = num_to_score / num_processes
            IDs_to_score = remaining_unscored[(process_id-1) * num_for_this_to_score : (process_id) * num_for_this_to_score]
            results = ddGdb.execute("SELECT ID, ExperimentID, Scores, UserDataSetExperimentID FROM Prediction WHERE ID IN (%s)" % (",".join(map(str, IDs_to_score))))
        elif FixedIDs:
            results = ddGdb.execute("SELECT ID, ExperimentID, Scores, UserDataSetExperimentID FROM Prediction WHERE ID IN (%s) AND MOD(ID,%s)=%s" % (",".join(map(str, FixedIDs)), num_processes,process_id-1))
        else:
            results = ddGdb.execute("SELECT ID, ExperimentID, Scores, UserDataSetExperimentID FROM Prediction WHERE PredictionSet=%s AND Status='done' AND ScoreVersion=%s AND MOD(ID,%s)=%s", parameters=(prediction_set, float(current_score_revision),num_processes,process_id-1))

    count = 0
    cases_computed = 0
    total_time_in_secs = 0

    number_of_cases_left = len(results) * len(radii)

    failed_cases = []
    colortext.printf("Rescoring %d predictions over %d radii...\n" % (len(results), len(radii)), 'lightgreen')
    for r in results:
        t = Timer()
        t.add('Preamble')
        inner_count = 0

        mutations = ddGdb.execute('SELECT * FROM ExperimentMutation WHERE ExperimentID=%s', parameters=(r['ExperimentID'],))
        mutation_str = ', '.join(['%s %s%s%s' % (m['Chain'], m['WildTypeAA'], m['ResidueID'], m['MutantAA']) for m in mutations])
        extracted_data = False

        details = ddGdb.execute_select('SELECT Prediction.ID, PDBFileID, Chain FROM Prediction INNER JOIN Experiment ON Prediction.ExperimentID=Experiment.ID INNER JOIN ExperimentChain ON Prediction.ExperimentID=ExperimentChain.ExperimentID WHERE Prediction.ID=%s', parameters=(r['ID'],))
        details = ddGdb.execute_select('SELECT Prediction.ID, PDBFileID, Chain FROM Prediction INNER JOIN Experiment ON Prediction.ExperimentID=Experiment.ID INNER JOIN ExperimentChain ON Prediction.ExperimentID=ExperimentChain.ExperimentID WHERE Prediction.ID=%s', parameters=(r['ID'],))
        colortext.message("Prediction: %d, %s chain %s. Mutations: %s. Experiment ID #%d. UserDataSetExperimentID #%d." % (details[0]['ID'], details[0]['PDBFileID'], details[0]['Chain'], mutation_str, r['ExperimentID'], r['UserDataSetExperimentID']))

        experiment_pdbID = ddGdb.execute('SELECT PDBFileID FROM Experiment WHERE ID=%s', parameters=(r['ExperimentID'],))[0]['PDBFileID']
        print('Experiment PDB file ID = %s' % experiment_pdbID)
        pdbID = ddGdb.execute('SELECT UserDataSetExperiment.PDBFileID FROM Prediction INNER JOIN UserDataSetExperiment ON UserDataSetExperimentID=UserDataSetExperiment.ID WHERE Prediction.ID=%s', parameters=(r['ID'],))[0]['PDBFileID']
        print('UserDataSetExperiment PDB file ID = %s' % pdbID)

        count += 1
        if True:#len(mutations) == 1:
            timestart = time.time()

            #mutation = mutations[0]
            dbchains = sorted(set([mutation['Chain'] for mutation in mutations]))
            # todo: note: assuming monomeric structures here
            assert(len(dbchains) == 1)
            dbchain = dbchains[0]
            #mutantaa = mutation['MutantAA']

            ddG_dict = json.loads(r['Scores'])
            kellogg_ddG = ddG_dict['data']['kellogg']['total']['ddG']

            #assert(ddG_dict['version'] == current_score_revision)

            all_done = True
            for radius in radii:
                score_name = ('noah_%0.1fA' % radius).replace(".", ",")
                if not(ddG_dict['data'].get(score_name)):
                    all_done = False
                else:
                    cases_computed += 1
                    number_of_cases_left -= 1
            if all_done:
                print('Prediction %d: done.' % r["ID"])
                continue

            # Extract data
            t.add('Grab data')
            #archivefile = None
            #prediction_data_path = ddGdb.execute('SELECT Value FROM _DBCONSTANTS WHERE VariableName="PredictionDataPath"')[0]['Value']
            #job_data_path = os.path.join(prediction_data_path, '%d.zip' % r['ID'])
            #print(job_data_path)
            #assert(os.path.exists(job_data_path))
            #archivefile = readBinaryFile(job_data_path)
            archivefile = DDG_interface.getData(r['ID'])
            zipfilename = os.path.join(output_dir, "%d.zip" % r['ID'])
            F = open(zipfilename, "wb")
            F.write(archivefile)
            F.close()

            t.add('Extract data')
            zipped_content = zipfile.ZipFile(zipfilename, 'r', zipfile.ZIP_DEFLATED)
            tmpdir = None
            repacked_files = []
            mutant_files = []

            rosetta_resids = []
            try:
                tmpdir = makeTemp755Directory(output_dir)
                highestIndex = -1
                foundResfile = False
                foundMutfile = False

                presumed_mutation = None
                for fname in sorted(zipped_content.namelist()):
                    if fname.endswith(".pdb"):
                        if fname.startswith("%s/mut_" % r['ID']) or fname.startswith("%s/repacked_" % r['ID']):
                            structnum = int(fname[fname.rindex('_')+1:-4])
                            if fname.startswith("%s/mut_" % r['ID']):
                                if presumed_mutation:
                                    assert(presumed_mutation == os.path.split(fname)[1].split('_')[1])
                                else:
                                    presumed_mutation = os.path.split(fname)[1].split('_')[1]
                                newfname = 'mutant_%02d' % structnum
                            if fname.startswith("%s/repacked_" % r['ID']):
                                newfname = 'repacked_%02d' % structnum
                            highestIndex = max(highestIndex, structnum)

                            newfilepath = os.path.join(tmpdir, newfname)
                            writeFile(newfilepath, zipped_content.read(fname))

                            if fname.startswith("%s/mut_" % r['ID']):
                                mutant_files.append(newfilepath)
                            if fname.startswith("%s/repacked_" % r['ID']):
                                repacked_files.append(newfilepath)
                        #elif fname.startswith("%s/%s-%s" % (r['ID'],r['ExperimentID'],pdbID)) or fname.startswith("%s/repacked_" % r['ID']):
                        #    writeFile(os.path.join(tmpdir, '%s.pdb' % pdbID), zipped_content.read(fname))
                    if fname.startswith("%s/%s-%s.resfile" % (r['ID'],r['ExperimentID'],experiment_pdbID)):
                        raise Exception('This case needs to be updated (see the mutfile section below). We mainly use mutfiles now so I did not update this section.')
                        foundResfile = True
                        lines = zipped_content.read(fname).split("\n")
                        assert(len(lines) == 3)
                        assert(lines[0] == "NATAA")
                        assert(lines[1] == "start")
                        resfile_mutation = lines[2].split(" ")
                        assert(len(resfile_mutation) == 4)
                        rosetta_resid = resfile_mutation[0]
                        rosetta_chain = resfile_mutation[1]
                        rosetta_mutaa = resfile_mutation[3]
                        assert(mutantaa == rosetta_mutaa)
                        assert(dbchain == rosetta_chain)
                        assert(resfile_mutation[2] == 'PIKAA')
                        assert(len(rosetta_mutaa) == 1)
                    if fname.startswith("%s/%s-%s.mutfile" % (r['ID'],r['ExperimentID'],experiment_pdbID)):
                        foundMutfile = True
                        lines = zipped_content.read(fname).split("\n")
                        assert(lines[0].startswith('total '))
                        num_mutations = int(lines[0][6:])
                        assert(lines[1] == str(num_mutations))
                        # todo: note: assuming monomeric structures here
                        rosetta_chain = ddGdb.execute("SELECT Chain FROM ExperimentChain WHERE ExperimentID=%s", parameters=(r['ExperimentID'],))
                        assert(len(rosetta_chain) == 1)
                        rosetta_chain = rosetta_chain[0]['Chain']

                        resfile_mutations = lines[2:]
                        for resfile_mutation in resfile_mutations:
                            resfile_mutation = resfile_mutation.split(" ")
                            assert(len(resfile_mutation) == 3)
                            rosetta_resids.append(resfile_mutation[1])
                            rosetta_mutaa = resfile_mutation[2]
                            assert(dbchain == rosetta_chain)
                            assert(len(rosetta_mutaa) == 1)

                # Make sure the wtaa->mutantaa types match the structures
                assert(not(foundResfile))
                if not foundMutfile:
                    raise Exception('This case needs to be updated (see the mutfile section below). This was added as a hack for cases where I did not store the mutfile so I did not update this section.')
                    input_files = ddGdb.execute_select('SELECT InputFiles FROM Prediction WHERE ID=%s', parameters=(r['ID'],))
                    assert(len(input_files) == 1)
                    lines = pickle.loads(input_files[0]['InputFiles'])['MUTFILE'].split("\n")

                    #lines = regenerate_mutfile(r['ID']).split("\n")
                    assert(len(lines) == 3)
                    assert(lines[0] == "total 1")
                    assert(lines[1] == "1")
                    resfile_mutation = lines[2].split(" ")
                    assert(len(resfile_mutation) == 3)
                    rosetta_resid = resfile_mutation[1]
                    rosetta_chain = ddGdb.execute("SELECT Chain FROM ExperimentChain WHERE ExperimentID=%s", parameters=(r['ExperimentID'],))
                    assert(len(rosetta_chain) == 1)
                    rosetta_chain = rosetta_chain[0]['Chain']
                    rosetta_mutaa = resfile_mutation[2]
                    assert(dbchain == rosetta_chain)
                    assert(len(rosetta_mutaa) == 1)
                    assert("%s%s%s" % (resfile_mutation[0], resfile_mutation[1], resfile_mutation[2]) == presumed_mutation)

                fullresids = []

                for rosetta_resid in rosetta_resids:
                    fullresid = None
                    if rosetta_resid.isdigit():
                        fullresid = '%s%s%s ' % (rosetta_chain, (4-len(rosetta_resid)) * ' ', rosetta_resid)
                    else:
                        assert(False)
                        fullresid = '%s%s%s' % (rosetta_chain, (5-len(rosetta_resid)) * ' ', rosetta_resid)
                    fullresids.append(fullresid)


                resultst1 = ddGdb.execute_select("SELECT ExperimentID, UserDataSetExperimentID FROM Prediction WHERE ID=%s", parameters = (r['ID'],))
                assert(len(resultst1) == 1)
                ExperimentIDt1 = resultst1[0]['ExperimentID']
                UserDataSetExperimentIDt1 = resultst1[0]['UserDataSetExperimentID']

                if UserDataSetExperimentIDt1:
                    resultst2 = ddGdb.execute_select("SELECT PDBFileID FROM UserDataSetExperiment WHERE ID=%s", parameters = (UserDataSetExperimentIDt1,))
                else:
                    resultst2 = ddGdb.execute_select("SELECT PDBFileID FROM Experiment WHERE ID=%s", parameters = (ExperimentIDt1,))
                assert(len(resultst2) == 1)

                prediction_PDB_ID = resultst2[0]['PDBFileID']

                if False and prediction_PDB_ID not in ['1TEN', '1AYE', '1H7M'] + ['1A2P', '1BNI', '1STN']:
                    for fullresid in fullresids:
                        wtaa = None
                        for m in mutations:
                            # Hack for ub_RPN13
                            if prediction_PDB_ID == 'ub_RPN13' and m['Chain'] == fullresid[0] and m['ResidueID'] == str(int(fullresid[1:].strip()) - 109):
                                wtaa = m['WildTypeAA']
                            # Hack for ub_RPN13_yeast
                            elif prediction_PDB_ID == 'uby_RPN13' and m['Chain'] == fullresid[0] and m['ResidueID'] == str(int(fullresid[1:].strip()) - 109):
                                wtaa = m['WildTypeAA']
                            # Hack for ub_OTU
                            elif prediction_PDB_ID == 'ub_OTU' and m['Chain'] == fullresid[0] and m['ResidueID'] == str(int(fullresid[1:].strip()) - 172):
                                wtaa = m['WildTypeAA']
                            # Hack for ub_OTU_yeast
                            elif prediction_PDB_ID == 'uby_OTU' and m['Chain'] == fullresid[0] and m['ResidueID'] == str(int(fullresid[1:].strip()) - 172):
                                wtaa = m['WildTypeAA']
                            # Hack for ub_UQcon
                            elif prediction_PDB_ID == 'ub_UQcon' and m['Chain'] == fullresid[0] and m['ResidueID'] == str(int(fullresid[1:].strip()) + 213): # starts at 501
                                wtaa = m['WildTypeAA']
                            # Hack for uby_UQcon
                            elif prediction_PDB_ID == 'uby_UQcon' and m['Chain'] == fullresid[0] and m['ResidueID'] == str(int(fullresid[1:].strip()) - 287):
                                wtaa = m['WildTypeAA']
                            elif m['Chain'] == fullresid[0] and m['ResidueID'] == fullresid[1:].strip():
                                wtaa = m['WildTypeAA']
                        if (wtaa == None):
                            colortext.error(prediction_PDB_ID)
                            colortext.error('wtaa == None')
                            colortext.error('fullresid = %s' % str(fullresid))
                            colortext.error(str(mutations))
                            colortext.warning([rosetta_resid.strip() for rosetta_resid in rosetta_resids])
                            #sys.exit(0)
                        assert(wtaa != None)
                        assert(PDB.from_filepath(repacked_files[0]).get_residue_id_to_type_map()[fullresid] == wtaa)
                    #assert(PDB(mutant_files[0]).get_residue_id_to_type_map()[fullresid] == mutantaa)

                for radius in radii:
                    score_name = ('noah_%0.1fA' % radius).replace(".", ",")

                    if ddG_dict['data'].get(score_name):
                        print('Radius %0.1f: done.' % radius)
                        continue
                    cases_computed += 1
                    number_of_cases_left -= 1

                    t.add('Radius %0.3f: repacked' % radius)
                    colortext.printf("Prediction ID: %d. Calculating radius %0.1f. Calculation #%d of %d." % (r['ID'], radius, cases_computed, len(results) * len(radii)), 'orange')

                    repacked_score = NoahScore()
                    repacked_score.calculate(repacked_files, rosetta_chain, sorted([rosetta_resid.strip() for rosetta_resid in rosetta_resids]), radius = radius)
                    colortext.message("Repacked")
                    print(repacked_score)

                    t.add('Radius %0.3f: mutant' % radius)
                    mutant_score = NoahScore()
                    mutant_score.calculate(mutant_files, rosetta_chain, sorted([rosetta_resid.strip() for rosetta_resid in rosetta_resids]), radius = radius)
                    colortext.printf("Mutant", color = 'cyan')
                    print(mutant_score)

                    t.add('Radius %0.3f: postamble' % radius)
                    colortext.printf("ddG", color = 'lightpurple')
                    ddg_score = repacked_score.ddg(mutant_score)
                    print(ddg_score)

                    colortext.printf("Liz's ddG", color = 'yellow')
                    print("Total score: %0.3f" % kellogg_ddG)

                    ddG_dict['version'] = '0.23'
                    if ddG_dict['version'] == '0.1':
                        ddG_dict['version'] = '0.21'
                        ddG_dict['data'] = {
                            'kellogg' : {
                                'total' : ddG_dict['data'],
                            },
                            'noah': {
                                'total' : {'ddG' : ddg_score.total},
                                'positional' : {'ddG' : ddg_score.positional},
                                'positional_twoscore' : {'ddG' : ddg_score.positional_twoscore},
                            },
                        }
                    elif ddG_dict['version'] == '0.2':
                        ddG_dict['version'] = '0.21'
                        ddG_dict['data']['noah']['total']['ddG'] = ddg_score.total
                        ddG_dict['data']['noah']['positional']['ddG'] = ddg_score.positional
                        ddG_dict['data']['noah']['positional_twoscore']['ddG'] = ddg_score.positional_twoscore
                    elif ddG_dict['version'] == '0.22':
                        ddG_dict['data'][score_name] = {'total' : {}, 'positional' : {}, 'positional_twoscore' : {}}
                        ddG_dict['data'][score_name]['total']['ddG'] = ddg_score.total
                        ddG_dict['data'][score_name]['positional']['ddG'] = ddg_score.positional
                        ddG_dict['data'][score_name]['positional_twoscore']['ddG'] = ddg_score.positional_twoscore
                    elif ddG_dict['version'] == '0.23':
                        ddG_dict['data'][score_name] = {'total' : {}, 'positional' : {}, 'positional_twoscore' : {}}
                        ddG_dict['data'][score_name]['total']['ddG'] = ddg_score.total
                        ddG_dict['data'][score_name]['positional']['ddG'] = ddg_score.positional
                        ddG_dict['data'][score_name]['positional_twoscore']['ddG'] = ddg_score.positional_twoscore

                    jsonified_ddG = json.dumps(ddG_dict)
                    ddGdb.execute('UPDATE Prediction SET Scores=%s WHERE ID=%s', parameters=(jsonified_ddG, r['ID'],))
                t.add('Cleanup')
                shutil.rmtree(tmpdir)
                os.remove(zipfilename)

            except Exception, e:
                print("Exception! In prediction %d" % r['ID'], str(e))
                failed_cases.append(r['ID'])
                import traceback
                print(traceback.format_exc())
                if tmpdir:
                    shutil.rmtree(tmpdir)

            total_time_in_secs += t.sum()
            average_time_taken = float(total_time_in_secs)/float(cases_computed or 1)
            estimate_remaining_time = number_of_cases_left * average_time_taken

            t.stop()
            colortext.printf("**Profile**", 'orange')
            print(t)
            colortext.message("Time taken for this case: %0.2fs." % t.sum())
            colortext.message("Average time taken per case: %0.2fs." % average_time_taken)
            colortext.message("Estimated time remaining: %dh%dm%ds." % (int(estimate_remaining_time/3600), int((estimate_remaining_time/60) % 60), estimate_remaining_time % 60))
            print("\n")
def generate_JSON_dataset(dataset_ID, pdb_data, pub_data):
    '''Build one JSON-ready dict per record of a dataset and store the list in JSON_datasets.

    For each DataSetDDG row with DataSetID=dataset_ID this gathers the published DDG value, the
    experimental DDG values joined from DataSetDDGSource/ExperimentAssayDDG/ExperimentAssay, and
    the mutations of the associated experiment annotated with DSSP data read from PDBResidue.

    The function relies on the module-level names ddGdb, cached_pdbs, cached_publications,
    dssp_elision, JSON_datasets, PDB, ResidueID2String, and colortext.

    :param dataset_ID: Value matched against DataSetDDG.DataSetID; also used as the key into JSON_datasets.
    :param pdb_data: Dict updated in place: PDBFileID -> dict(Resolution, MethodOfDetermination).
    :param pub_data: Dict updated in place: publication ID -> the matching cached_publications entry.
    :return: None. The records, ordered by their key, are stored in JSON_datasets[dataset_ID]['data'].
    :raises AssertionError: if a record cannot be resolved to exactly one experiment, if a mutated
        residue matches more than one PDBResidue row, or if the DSSP data for a residue is missing.

    Note: pdb_data and pub_data are updated before the residue check, so they may contain entries
    for records which are subsequently skipped.
    '''

    # Groups of experiment IDs which are known to be attached to a single dataset record. For these,
    # the lowest experiment ID is used. The table is loop-invariant so it is built once here.
    elidable_experiment_groups = (
        # Cases where 1FLV and 1FTG need to be elided
        [113699, 113830], [113704, 113832], [113705, 113836],
        # ExperimentID is used below for mutation details but these agree in this case. 1LZ1, 2BQA
        [112149, 112591],
        # ExperimentID is used below for mutation details but these agree in this case. 1REX, 2BQA
        [112141, 112583], [112136, 112578], [112137, 112579], [112142, 112584], [112139, 112581],
        [112140, 112582], [112146, 112588], [112147, 112589], [112148, 112590],
        # ExperimentID is used below for mutation details but these agree in this case. 2LZM, 1L63
        [112227, 112323], [112288, 113039], [111587, 112379],
    )

    record_data = {}

    #1LRP
    #1LMB

    # 1 JSON object per dataset record
    failure_count = 0 # counts both records skipped due to missing residues and duplicated record keys
    records = ddGdb.execute_select('SELECT * FROM DataSetDDG WHERE DataSetID=%s', parameters=(dataset_ID,))
    colortext.warning('Starting with %d records.' % (len(records)))
    # Only 1-5 mutations per record are anticipated; any other count raises a KeyError below.
    mutation_count = {1:0, 2:0, 3:0, 4:0, 5:0}
    for r in records:

        mutation_is_reversed = r['MutationIsReversed'] == 1
        d = dict(
            _DataSetDDGID = r['ID'],
            RecordID = r['RecordNumber'],
            AggregateType = r['AggregateType'],
            DDG = r['PublishedValue'],
            PDBFileID = r['PDBFileID'],
            DerivedMutation = mutation_is_reversed,
        )

        # Parse PDB
        if not(cached_pdbs.get(r['PDBFileID'])):
            cached_pdbs[r['PDBFileID']] = PDB(ddGdb.execute_select('SELECT Content FROM PDBFile WHERE ID=%s', parameters=(r['PDBFileID'],))[0]['Content'])

        # Store PDB data. Both lookups are best-effort: the value stays None if the PDB object cannot provide it.
        # (These defaults previously had trailing commas which made them the tuple (None,) rather than None.)
        PDBResolution = None
        PDBMethodOfDetermination = None
        try:
            PDBResolution = cached_pdbs[r['PDBFileID']].get_resolution()
        except Exception:
            pass
        try:
            PDBMethodOfDetermination = cached_pdbs[r['PDBFileID']].get_techniques()
        except Exception:
            pass
        pdb_data[r['PDBFileID']] = dict(
            Resolution = PDBResolution,
            MethodOfDetermination = PDBMethodOfDetermination,
        )

        assay_DDGs = ddGdb.execute_select('''
            SELECT *
            FROM DataSetDDGSource
            INNER JOIN ExperimentAssayDDG ON DataSetDDGSource.ExperimentAssayID = ExperimentAssayDDG.ExperimentAssayID AND DataSetDDGSource.Type = ExperimentAssayDDG.Type
            INNER JOIN ExperimentAssay ON ExperimentAssayDDG.ExperimentAssayID = ExperimentAssay.ID
            WHERE DataSetDDGID=%s''', parameters=(r['ID'],))

        # Each record should resolve to exactly one experiment
        ExperimentID = set([a['ExperimentID'] for a in assay_DDGs])
        if len(ExperimentID) != 1:
            colortext.message('%d records passed' % len(record_data))
            sorted_experiment_IDs = sorted(ExperimentID)
            if sorted_experiment_IDs in elidable_experiment_groups:
                ExperimentID = [sorted_experiment_IDs[0]]
            else:
                # Unknown case: print diagnostics. The assertion below then fails.
                colortext.warning(
                    '\n'.join(['%(PDBFileID)s %(Chain)s %(WildTypeAA)s %(ResidueID)s %(MutantAA)s' % rii for rii in ddGdb.execute_select('''
                    SELECT * FROM `ExperimentMutation` INNER JOIN Experiment ON Experiment.ID=ExperimentID WHERE `ExperimentID` IN (%s)''' % ','.join(map(str, ExperimentID)))]))
                pprint.pprint(r)
                colortext.error([int(experiment_id) for experiment_id in ExperimentID])
                print(sorted_experiment_IDs)
        assert(len(ExperimentID) == 1)
        ExperimentID = ExperimentID.pop()
        d['_ExperimentID'] = ExperimentID

        experimental_DDGs = []
        for a in assay_DDGs:
            experimental_DDGs.append(dict(
                DDG = a['Value'],
                DDGType = a['Type'],
                Publication = a['Publication'],
                LocationOfValueInPublication = a['LocationOfValueInPublication'],
                Temperature = a['Temperature'],
                pH= a['pH'],
            ))
            # Store Publication data
            pub_data[a['Publication']] = cached_publications[a['Publication']]
        d['ExperimentalDDGs'] = experimental_DDGs

        # Retrieve mutations
        mutation_records = ddGdb.execute_select('SELECT * FROM ExperimentMutation WHERE ExperimentID=%s ORDER BY ResidueID', parameters=(ExperimentID,))
        if dataset_ID == "AlaScan-GPK_2014/09/25":
            assert(len(mutation_records) == 1)

        mutations = []
        failed_check = False
        mutation_count[len(mutation_records)] += 1
        for mutation in mutation_records:
            mutation_d = {}
            #if ExperimentID == 109911:
            #    d['PDBFileID'] = '1WQ5' # Hack for one 1BKS case

            mutation_d['Chain'] = mutation['Chain']
            mutation_d['ResidueID'] = mutation['ResidueID']
            if mutation_is_reversed:
                # The dataset record describes the reverse of the stored mutation so swap the residue types
                mutation_d['MutantAA'] = mutation['WildTypeAA']
                mutation_d['WildTypeAA'] = mutation['MutantAA']
            else:
                mutation_d['WildTypeAA'] = mutation['WildTypeAA']
                mutation_d['MutantAA'] = mutation['MutantAA']

            if dataset_ID == "AlaScan-GPK_2014/09/25" and d['PDBFileID'] == '1LMB':
                mutation_d['Chain'] = '3' # Hack for the PDB replacement 1LRP (3.2A) -> 1LMB (1.8A)
            if dataset_ID in ("AlaScan-GPK_2014/09/25", "Kellogg_10.1002/prot.22921_2010/12/03"):
                if d['PDBFileID'] == '1U5P' and int(mutation_d['ResidueID']) < 1600:
                    mutation_d['ResidueID'] = str(int(mutation_d['ResidueID']) + 1762) # Hack for the PDB replacement 1AJ3, NMR -> 1U5P (2A)

            mutated_residue = ddGdb.execute_select('SELECT * FROM PDBResidue WHERE PDBFileID=%s AND Chain=%s AND ResidueID=%s', parameters=(d['PDBFileID'], mutation_d['Chain'], ResidueID2String(mutation_d['ResidueID'])))
            if len(mutated_residue) == 0:
                # The residue is not in PDBResidue: skip the whole record
                colortext.warning('Skipping Experiment #%d (%s) in %s due to missing residue %s.' % (ExperimentID, d['PDBFileID'], dataset_ID, mutation_d['ResidueID']))
                failure_count += 1
                failed_check = True
                break
            assert(len(mutated_residue) == 1)

            mutated_residue = mutated_residue[0]
            mutation_d['DSSPExposure'] = mutated_residue['MonomericExposure']
            mutation_d['DSSPType'] = mutated_residue['MonomericDSSP']
            mutation_d['DSSPSimpleSSType'] = dssp_elision.get(mutation_d['DSSPType'])
            assert(mutation_d['DSSPType'] is not None)
            assert(mutation_d['DSSPSimpleSSType'] is not None)
            mutations.append(mutation_d)

        if failed_check:
            print('FAILED CHECK')
            continue
        d['Mutations'] = mutations

        # The record key is the PDB ID plus the mutations. The Potapov dataset additionally uses the record ID.
        mutations_key = '+'.join(['%s:%s:%s' % (m['Chain'], m['ResidueID'].strip(), m['MutantAA']) for m in mutations])
        if dataset_ID == "Potapov_10.1093/protein/gzp030_2009/09/01":
            key = '%s_%s_%s' % (d['PDBFileID'], mutations_key, d['RecordID'])
        else:
            key = '%s_%s' % (d['PDBFileID'], mutations_key)

        if record_data.get(key):
            # Duplicated keys are reported and counted as failures; the newer record replaces the existing one
            colortext.warning('KEY EXISTS: %s' % key)
            print('Existing record: %s' % pprint.pformat(record_data[key]))
            print('New record: %s' % pprint.pformat(d))
            failure_count += 1
        record_data[key] = d

    colortext.message('Mutation count')
    colortext.warning(pprint.pformat(mutation_count))

    if failure_count > 0:
        colortext.error('Total length of dataset: %d. Failed on %d records.' % (len(record_data), failure_count))
    else:
        colortext.message('Total length of dataset: %d. ' % (len(record_data)))

    # Order the records by key
    record_list = [record_data[k] for k in sorted(record_data)]

    colortext.message('Adding dataset %s with %d records, %d PDB files, and %d references.' % (dataset_ID, len(record_list), len(pdb_data), len(pub_data)))
    JSON_datasets[dataset_ID]['data'] = record_list
Example #35
0
#!/usr/bin/python
# encoding: utf-8
"""
ligand.py test code.

Created by Shane O'Connor 2016
"""

import sys
import os

sys.path.insert(0, os.path.join('..', '..'))

from klab.bio.ligand import Ligand, PDBLigand
from klab import colortext

# Retrieve generic ligand records, with and without an accompanying PDB ID, and print them.
nag_ligand = Ligand.retrieve_data_from_rcsb('NAG', pdb_id='1WCO', silent=True, cached_dir='/tmp')
colortext.warning(nag_ligand)
gdp_ligand = Ligand.retrieve_data_from_rcsb('GDP', silent=True, cached_dir='/tmp')
colortext.pcyan(gdp_ligand)

# Create a PDBLigand from the generic GDP record.
# NOTE(review): 'A' and ' 124B' are presumably the chain and residue ID - confirm against PDBLigand.
gdp_pdb_ligand = PDBLigand.instantiate_from_ligand(gdp_ligand, 'A', ' 124B')
colortext.porange(gdp_pdb_ligand)

# Retrieve a PDBLigand directly, passing a separate pdb_ligand_code.
gol_pdb_ligand = PDBLigand.retrieve_data_from_rcsb(
    'GOL', '1BXO', 'A', '  12B',
    pdb_ligand_code='TST', silent=True, cached_dir='/tmp',
)
colortext.ppurple(gol_pdb_ligand)
Example #36
0
 def print_schema(self):
     '''Emit each line of self.sanitize_schema() via colortext.warning, prefixed with a zero-padded, 1-based line number.'''
     schema_lines = self.sanitize_schema().split('\n')
     for line_number, schema_line in enumerate(schema_lines, 1):
         colortext.warning('%04d: %s' % (line_number, schema_line))