def fetch_fastas_for_DUDE():
    """Write FASTA records for the DUD-E targets, unless already cached.

    The output file is ``fastas_from_dude.txt`` inside ``BLAST_MAIN_FOLDER``.
    If it already exists nothing is fetched and a message is logged.

    For every UniProt ID returned by ``DUDE_uniID_from_folder(DUDE_PATH)`` the
    PDB ids from ``get_pdbs_from_unicode`` are walked; only entries accepted
    by ``check_if_pdb_xray`` are fetched with ``get_pdb_fasta``.  Surrounding
    double quotes are stripped from each sequence.  Within one UniProt ID an
    identical sequence is written only once, under the first PDB id that
    produced it.  Each record has the form ``>{uniprot_id}-{pdb}`` followed by
    the sequence on the next line.

    An ``HTTPError`` for a single PDB entry is logged and that entry is
    skipped; any other exception propagates.
    """
    output_path = os.path.join(BLAST_MAIN_FOLDER, 'fastas_from_dude.txt')
    if os.path.exists(output_path):
        log('DUD-E fastas already exists.')
        return

    log("Starting fetching FASTA's for DUDE")
    records = []
    for uniprot_id in DUDE_uniID_from_folder(DUDE_PATH):
        # Sequences already recorded for this UniProt ID; a set keeps the
        # duplicate check O(1) instead of rescanning every stored value.
        seen_sequences = set()
        for pdb in get_pdbs_from_unicode(uniprot_id):
            try:
                if not check_if_pdb_xray(pdb):
                    continue
                sequence = get_pdb_fasta(pdb).strip('"')
            except HTTPError:
                log(f'Failed fetching FASTA sequence for: {pdb}')
                continue
            if sequence in seen_sequences:
                continue
            seen_sequences.add(sequence)
            header = uniprot_id + '-' + pdb
            records.append(f'>{header}\n{sequence}\n')

    # The file doubles as the "already done" marker, so it is created only
    # after every fetch has finished: an unexpected failure above must not
    # leave a truncated file that later runs would treat as complete.
    with open(output_path, 'w') as handle:
        handle.writelines(records)
def fetch_fastas_for_DEKOIS():
    """Write FASTA records for the DEKOIS targets, unless already cached.

    The output file is ``fastas_from_dekois.txt`` inside
    ``BLAST_MAIN_FOLDER``.  If it already exists nothing is fetched and a
    message is logged.

    UniProt IDs are collected from the first value returned by
    ``DEKOIS_uniID_from_folder(DEKOIS_PATH)``: each entry's ``'Uniprot_ID'``
    string is split on whitespace.  Entries without that key are logged and
    skipped.  For every collected ID the PDB ids from
    ``get_pdbs_from_unicode`` are walked; only entries accepted by
    ``check_if_pdb_xray`` are fetched with ``get_pdb_fasta``.  Surrounding
    double quotes are stripped from each sequence.  Within one UniProt ID an
    identical sequence is written only once, under the first PDB id that
    produced it.  Each record has the form ``>{uniprot_id}-{pdb}`` followed by
    the sequence on the next line.

    An ``HTTPError`` for a single PDB entry is logged and that entry is
    skipped; any other exception propagates.
    """
    output_path = os.path.join(BLAST_MAIN_FOLDER, 'fastas_from_dekois.txt')
    if os.path.exists(output_path):
        log('DEKOIS fastas already exists.')
        return

    log("Starting fetching FASTA' s for DEKOIS")
    dekois_ligands, _ = DEKOIS_uniID_from_folder(DEKOIS_PATH)

    uniprot_ids = set()
    for ligand_name in dekois_ligands:
        try:
            raw_ids = dekois_ligands[ligand_name]['Uniprot_ID']
        except KeyError:
            log(f'No Uniprot ID for {ligand_name}')
            continue
        uniprot_ids.update(raw_ids.split())

    records = []
    # Sorted so the file content is reproducible: iteration order of a set of
    # strings can differ from one interpreter run to the next.
    for uniprot_id in sorted(uniprot_ids):
        # Sequences already recorded for this UniProt ID; a set keeps the
        # duplicate check O(1) instead of rescanning every stored value.
        seen_sequences = set()
        for pdb in get_pdbs_from_unicode(uniprot_id):
            try:
                if not check_if_pdb_xray(pdb):
                    continue
                sequence = get_pdb_fasta(pdb).strip('"')
            except HTTPError:
                log(f'Failed fetching FASTA sequence for: {pdb}')
                continue
            if sequence in seen_sequences:
                continue
            seen_sequences.add(sequence)
            header = uniprot_id + '-' + pdb
            records.append(f'>{header}\n{sequence}\n')

    # The file doubles as the "already done" marker, so it is created only
    # after every fetch has finished: an unexpected failure above must not
    # leave a truncated file that later runs would treat as complete.
    with open(output_path, 'w') as handle:
        handle.writelines(records)
def fetch_fastas_for_chembl(csv_path, output):
    """Download one FASTA record per row of a ChEMBL target table.

    Args:
        csv_path: CSV file read with its first column as the index.  It must
            provide the columns ``'ChEMBL ID'`` and ``'main_PDB_structure'``.
        output: Path of the FASTA file to create.  If it already exists,
            nothing is downloaded and a message is logged.

    Each record is ``>{ChEMBL ID}-{main_PDB_structure}`` followed by the value
    returned by ``get_pdb_fasta`` for that structure.  Exceptions raised by
    ``get_pdb_fasta`` are not caught here.
    """
    if os.path.exists(output):
        log(f'{output} already exists!')
        return

    main_table = pd.read_csv(csv_path, index_col=0)
    log(f'Downloading fastas for {len(main_table)} targets')

    records = []
    for chembl_name, pdb_name in zip(main_table['ChEMBL ID'],
                                     main_table['main_PDB_structure']):
        fasta = get_pdb_fasta(pdb_name)
        records.append(f'>{chembl_name}-{pdb_name}\n{fasta}\n')

    # ``output`` doubles as the "already done" marker, so it is created only
    # after every download succeeded: a failing download must not leave a
    # truncated file that later runs would treat as complete.
    with open(output, 'w') as handle:
        handle.writelines(records)
def choose_primary_pdb_for_chembl(csv_path):
    """Add a ``main_PDB_structure`` column to the CSV at ``csv_path``.

    The CSV is read with its first column as the index.  For every row the
    whitespace-separated PDB ids in ``'PDB_entry'`` are fetched with
    ``get_pdb_fasta`` and the id whose sequence is longest is stored in
    ``'main_PDB_structure'`` (the first one wins on ties).  Rows without any
    PDB id, or whose sequences are all empty, keep an empty string.  The
    table is then written back to the same path.

    NOTE(review): another function with this exact name, taking a DataFrame,
    is defined later in this file; if both live in the same module scope the
    later definition replaces this one.

    Args:
        csv_path: Path of the CSV file to read and overwrite.
    """
    main_table = pd.read_csv(csv_path, index_col=0)
    main_table['main_PDB_structure'] = ''

    for index, pdb_entry in main_table['PDB_entry'].items():
        # read_csv yields NaN (a float) for an empty cell; treat that the
        # same as an empty string instead of failing on ``.split``.
        pdbs = pdb_entry.split() if isinstance(pdb_entry, str) else []
        sequences = [get_pdb_fasta(pdb) for pdb in pdbs]

        # max() returns the first maximal element, matching a strict ">" scan.
        best_pdb, best_seq = max(
            zip(pdbs, sequences),
            key=lambda pair: len(pair[1]),
            default=('', ''),
        )
        if len(best_seq) > 0:
            main_table.at[index, 'main_PDB_structure'] = best_pdb

    main_table.to_csv(csv_path)
def choose_primary_pdb_for_chembl(main_table):
    """Pick one PDB structure per row and return the de-duplicated table.

    A ``'main_PDB_structure'`` column is added to ``main_table`` in place.
    For every row the whitespace-separated PDB ids in ``'PDB_entry'`` are
    fetched with ``get_pdb_fasta`` and the id whose sequence is longest is
    stored (the first one wins on ties).  Rows without any PDB id, or whose
    sequences are all empty, keep an empty string.

    The table is then sorted by ``'Active_compounds'`` in descending order,
    reduced to the first row per ``'main_PDB_structure'`` value, and rows
    with an empty ``'main_PDB_structure'`` are removed.

    NOTE(review): an earlier function with this exact name, taking a CSV
    path, precedes this one in the file; if both live in the same module
    scope this later definition is the one that stays bound.

    Args:
        main_table: DataFrame with ``'PDB_entry'`` and ``'Active_compounds'``
            columns.  It is modified in place (new column).

    Returns:
        A new DataFrame holding the sorted, de-duplicated, non-empty rows.
    """
    main_table['main_PDB_structure'] = ''

    for index, pdb_entry in main_table['PDB_entry'].items():
        # A missing cell is NaN (a float) when the table comes from
        # read_csv; treat it like an empty string instead of failing on
        # ``.split``.
        pdbs = pdb_entry.split() if isinstance(pdb_entry, str) else []
        log(f'Fetching PDBs for {index}, {len(pdbs)} PDBs to fetch.')
        sequences = [get_pdb_fasta(pdb) for pdb in pdbs]

        # max() returns the first maximal element, matching a strict ">" scan.
        best_pdb, best_seq = max(
            zip(pdbs, sequences),
            key=lambda pair: len(pair[1]),
            default=('', ''),
        )
        if len(best_seq) > 0:
            main_table.at[index, 'main_PDB_structure'] = best_pdb

    # Sorting first means drop_duplicates keeps, for each structure, the row
    # with the most active compounds.
    ranked = main_table.sort_values('Active_compounds', ascending=False)
    unique = ranked.drop_duplicates(subset='main_PDB_structure')
    return unique[unique['main_PDB_structure'] != '']
def make_blast_csv(master_path, blasts=None):
    """Combine the master target table with the top BLAST hit per database.

    The master CSV is read with its first column as the index and reduced to
    the columns ``'ChEMBL ID'``, ``'main_PDB_structure'``,
    ``'Active_compounds'`` and ``'Inactive_compounds'``.  For every path in
    ``blasts`` six extra columns, suffixed with a name derived from the file
    name, are added and filled from the first result row of each run of
    consecutive rows that share a query name.  The result is written to
    ``chembl_blast_results.csv`` inside ``BLAST_MAIN_FOLDER``.

    Args:
        master_path: Path of the master CSV file.
        blasts: Iterable of BLAST result file paths.  The base name of each
            path must look like ``<x>-<db_name>_...``; ``db_name`` becomes
            the column suffix.  ``None`` (the default) is treated as "no
            BLAST results", so only the base columns are written.
    """
    if blasts is None:
        blasts = []

    main_table = pd.read_csv(master_path, index_col=0)
    # Explicit copy: new columns are added below and must not be tied to
    # ``main_table``.
    output_df = main_table.loc[:, [
        'ChEMBL ID', 'main_PDB_structure', 'Active_compounds',
        'Inactive_compounds'
    ]].copy()
    output_file = os.path.join(BLAST_MAIN_FOLDER, 'chembl_blast_results.csv')

    for db in blasts:
        # Loaded columns, in order: query id, hit id, identity %,
        # query start, query end, e-value.
        blast_results = load_blast_results(db, [0, 1, 2, 6, 7, 10])
        db_name = os.path.basename(db).split('_')[0].split('-')[1]

        identity_col = f'identity%_{db_name}'
        evalue_col = f'evalue_{db_name}'
        target_col = f'target_name_{db_name}'
        aligned_col = f'query_alignment_length_{db_name}'
        total_col = f'total_query_length_{db_name}'
        ratio_col = f'alignment_to_total_ratio_{db_name}'
        for column in (identity_col, evalue_col, target_col, aligned_col,
                       total_col, ratio_col):
            output_df[column] = ''

        previous_query = None
        for chembl_id, db_target, identity, q_start, q_end, evalue in blast_results:
            # Query ids have the form "<name>-<pdb>".
            query_name = chembl_id.split('-')[0]
            # Only the first row of each consecutive run of a query is used.
            # Names are compared exactly: a substring test would wrongly
            # treat e.g. "CHEMBL123" as a repeat of a preceding "CHEMBL12".
            if query_name == previous_query:
                continue
            previous_query = query_name

            total_query_length = len(get_pdb_fasta(chembl_id.split('-')[-1]))
            # NOTE(review): if q_start/q_end are 1-based inclusive positions
            # (as in BLAST tabular output) this is one short of the aligned
            # length - confirm before changing.
            query_alignment_length = q_end - q_start

            output_df.at[query_name, identity_col] = identity
            output_df.at[query_name, evalue_col] = evalue
            output_df.at[query_name, target_col] = db_target
            output_df.at[query_name, aligned_col] = query_alignment_length
            output_df.at[query_name, total_col] = total_query_length
            output_df.at[query_name, ratio_col] = (
                query_alignment_length / total_query_length)

    output_df.to_csv(output_file)