Exemple #1
0
def __get_mutation_center(struct_df, label_info, center_at_mut=True):
    """
    Return the center used for the subgrid around a mutation.

    When ``center_at_mut`` is true, the center is computed from the rows of
    ``struct_df`` whose chain and residue equal ``label_info.chain`` and
    ``label_info.residue`` and whose atom name is 'CA'. Otherwise it is
    computed from every row of ``struct_df``. In both cases the x/y/z
    columns are cast to float32 and handed to ``util.get_center``.
    """
    if center_at_mut:
        # Restrict to the CA atom(s) of the mutated residue.
        chain_match = struct_df.chain == label_info.chain
        residue_match = struct_df.residue == label_info.residue
        is_alpha_carbon = struct_df.name == 'CA'
        source_df = struct_df[chain_match & residue_match & is_alpha_carbon]
    else:
        source_df = struct_df

    coords = source_df[['x', 'y', 'z']].astype(np.float32)
    return util.get_center(coords)
def get_data_stats(data_filename):
    """
    Summarize structure extents relative to the ligand for a dataset.

    Reads the 'structures' table of the HDF file ``data_filename`` and, for
    every structure, records the result of
    ``util.get_max_distance_from_center`` for all atom positions measured
    against the center of the ligand atoms (rows whose chain is 'LIG'),
    together with the number of atoms (rows) in the structure.

    Prints summary statistics and the percentage of structures whose
    ``max_dist`` is below 20.

    Returns:
        DataFrame with columns ``pdbcode``, ``max_dist`` and ``num_atoms``,
        sorted by ``max_dist`` then ``num_atoms`` in descending order. The
        frame is empty (and no statistics are printed) when the dataset
        contains no structures.
    """
    data_df = pd.read_hdf(data_filename, 'structures')

    data = []
    # Group by a scalar key rather than a one-element list: with a list,
    # newer pandas versions yield 1-tuples as group keys, which would end up
    # in the 'pdbcode' column instead of the plain structure id.
    for pdbcode, struct_df in data_df.groupby('structure'):
        pos = struct_df[['x', 'y', 'z']].astype(np.float32)

        ligand_pos = struct_df[struct_df.chain == 'LIG'][['x', 'y', 'z'
                                                          ]].astype(np.float32)
        ligand_center = util.get_center(ligand_pos)

        max_dist = util.get_max_distance_from_center(pos, ligand_center)
        num_atoms = struct_df.shape[0]
        data.append((pdbcode, max_dist, num_atoms))

    df = pd.DataFrame(data, columns=['pdbcode', 'max_dist', 'num_atoms'])
    if df.empty:
        # Nothing to summarize; the percentage below would divide by zero.
        print('No structures found.')
        return df

    df = df.sort_values(by=['max_dist', 'num_atoms'],
                        ascending=[False, False]).reset_index(drop=True)
    print(df.describe())

    # Percentage of structures that fit within a distance of 20.
    print(df[df.max_dist < 20].shape[0] * 100.0 / df.shape[0])
    return df
Exemple #3
0
def df_to_feature(struct_df, grid_config, random_seed=None):
    """
    Build a grid for ``struct_df`` centered on the whole structure.

    The center is ``util.get_center`` of all x/y/z coordinates (cast to
    float32). A rotation matrix is generated from ``grid_config`` and
    ``random_seed`` via ``subgrid_gen.gen_rot_matrix`` and the grid is
    produced by ``subgrid_gen.get_grid``.
    """
    xyz_columns = ['x', 'y', 'z']
    coords = struct_df[xyz_columns].astype(np.float32)
    grid_center = util.get_center(coords)

    rotation = subgrid_gen.gen_rot_matrix(grid_config,
                                          random_seed=random_seed)
    return subgrid_gen.get_grid(struct_df,
                                grid_center,
                                config=grid_config,
                                rot_mat=rotation)
Exemple #4
0
def get_data_stats(sharded_list):
    """
    Summarize subunit extents, elements and labels of sharded datasets.

    For every ensemble in every shard of every dataset in ``sharded_list``,
    the inactive and active subunits (as named by ``__get_subunit_name``)
    are examined. For each subunit this records the result of
    ``util.get_max_distance_from_center`` for all atom positions measured
    against the center of the ligand atoms (rows whose chain is 'L'),
    together with the number of atoms (rows).

    Prints the unique elements and their counts, the distribution of
    "label == 'A'" per dataset index and shard, summary statistics, and the
    percentage of subunits whose ``max_dist`` is below 90.

    Returns:
        DataFrame with columns ``ensemble``, ``subunit``, ``max_dist`` and
        ``num_atoms``, sorted by ``max_dist`` then ``num_atoms`` in
        descending order. The frame is empty (and no statistics are
        printed) when no ensembles were found.
    """
    data = []
    all_elements = []
    labels = []
    for i, sharded in enumerate(sharded_list):
        for shard_num, shard_df in sharded.iter_shards():
            labels_df = sharded.read_shard(shard_num, key='labels')

            # Group by a scalar key rather than a one-element list: with a
            # list, newer pandas versions yield 1-tuples as group keys, which
            # would no longer match the values in labels_df.ensemble below.
            for ensemble, ensemble_df in shard_df.groupby('ensemble'):
                all_elements.extend(ensemble_df.element.values)

                subunits = ensemble_df.subunit.unique()
                inactive = __get_subunit_name(subunits, mode='inactive')
                active = __get_subunit_name(subunits, mode='active')

                for subunit_name in [inactive, active]:
                    struct_df = ensemble_df[ensemble_df.subunit ==
                                            subunit_name]
                    pos = struct_df[['x', 'y', 'z']].astype(np.float32)
                    ligand_pos = struct_df[struct_df.chain == 'L'][[
                        'x', 'y', 'z'
                    ]].astype(np.float32)
                    ligand_center = util.get_center(ligand_pos)

                    max_dist = util.get_max_distance_from_center(
                        pos, ligand_center)
                    num_atoms = struct_df.shape[0]
                    data.append((ensemble, subunit_name, max_dist, num_atoms))

                # Only the first label row of the ensemble is consulted.
                ensemble_labels = labels_df[labels_df.ensemble == ensemble]
                labels.append((i, shard_num,
                               ensemble_labels.label.values[0] == 'A'))

    columns = ['ensemble', 'subunit', 'max_dist', 'num_atoms']
    if not data:
        # Nothing to summarize; the percentage below would divide by zero.
        print('No structures found.')
        return pd.DataFrame(data, columns=columns)

    all_elements_df = pd.DataFrame(all_elements, columns=['element'])
    unique_elements = all_elements_df.element.unique()
    print('Unique elements ({:}): {:}'.format(len(unique_elements),
                                              unique_elements))
    print('\nElement counts:')
    print(all_elements_df.element.value_counts())
    print('\n')

    all_labels_df = pd.DataFrame(labels,
                                 columns=['sharded', 'shard_num', 'label'])
    print('\nLabel dist by dataset:')
    print(all_labels_df.groupby(['sharded', 'shard_num']).label.value_counts())
    print('\n')

    df = pd.DataFrame(data, columns=columns)
    df = df.sort_values(by=['max_dist', 'num_atoms'],
                        ascending=[False, False]).reset_index(drop=True)
    print(df.describe())

    # Percentage of subunits that fit within a distance of 90.
    print(df[df.max_dist < 90].shape[0] * 100.0 / df.shape[0])
    return df
Exemple #5
0
def df_to_feature(struct_df, grid_config, random_seed=None):
    """
    Build a grid for ``struct_df`` centered on its ligand.

    The center is ``util.get_center`` of the x/y/z coordinates (cast to
    float32) of the rows whose chain is 'LIG'. A rotation matrix is
    generated from ``grid_config`` and ``random_seed`` via
    ``subgrid_gen.gen_rot_matrix`` and the grid is produced by
    ``subgrid_gen.get_grid`` over the full ``struct_df``.
    """
    # The ligand atoms alone determine the subgrid center.
    is_ligand = struct_df.chain == 'LIG'
    ligand_xyz = struct_df[is_ligand][['x', 'y', 'z']].astype(np.float32)
    grid_center = util.get_center(ligand_xyz)

    rotation = subgrid_gen.gen_rot_matrix(grid_config,
                                          random_seed=random_seed)
    return subgrid_gen.get_grid(
        struct_df, grid_center, config=grid_config, rot_mat=rotation)
Exemple #6
0
def get_data_stats(data_filename):
    """
    Summarize molecule extents for a dataset.

    Reads the 'structures' table of the HDF file ``data_filename`` and, for
    every molecule, records the result of
    ``util.get_max_distance_from_center`` for all atom positions measured
    against the molecule's own center (``util.get_center``), together with
    the number of atoms (rows) in the molecule.

    Prints summary statistics and the percentage of molecules whose
    ``max_dist`` is below 7.5.

    Returns:
        DataFrame with columns ``mol_id``, ``max_dist`` and ``num_atoms``,
        sorted by ``max_dist`` then ``num_atoms`` in descending order. The
        frame is empty (and no statistics are printed) when the dataset
        contains no molecules.
    """
    data_df = pd.read_hdf(data_filename, 'structures')

    data = []
    # Group by a scalar key rather than a one-element list: with a list,
    # newer pandas versions yield 1-tuples as group keys, which would end up
    # in the 'mol_id' column instead of the plain structure id.
    for mol_id, mol_df in data_df.groupby('structure'):
        pos = mol_df[['x', 'y', 'z']].astype(np.float32)
        max_dist = util.get_max_distance_from_center(pos, util.get_center(pos))
        num_atoms = mol_df.shape[0]
        data.append((mol_id, max_dist, num_atoms))

    df = pd.DataFrame(data, columns=['mol_id', 'max_dist', 'num_atoms'])
    if df.empty:
        # Nothing to summarize; the percentage below would divide by zero.
        print('No structures found.')
        return df

    df = df.sort_values(by=['max_dist', 'num_atoms'],
                        ascending=[False, False]).reset_index(drop=True)
    print(df.describe())

    # Percentage of molecules that fit within a distance of 7.5.
    print(df[df.max_dist < 7.5].shape[0] * 100.0 / df.shape[0])
    return df
Exemple #7
0
def get_data_stats(sharded_list):
    """
    Summarize structure extents for every (ensemble, subunit) pair.

    For each group of rows sharing ``ensemble`` and ``subunit`` in every
    shard of every dataset in ``sharded_list``, this records the result of
    ``util.get_max_distance_from_center`` for all atom positions measured
    against the structure's own center (``util.get_center``), together with
    the maximum value of the ``residue`` column.

    Prints summary statistics, the percentage of structures whose
    ``max_dist`` is below 50, and the percentage of distinct targets that
    have at least one such structure.

    Returns:
        DataFrame with columns ``target``, ``decoy``, ``max_dist`` and
        ``max_res``, sorted by ``max_dist`` then ``max_res`` in descending
        order. The frame is empty (and no statistics are printed) when no
        structures were found.
    """
    data = []
    for sharded in sharded_list:
        for _, shard_df in sharded.iter_shards():
            for (target,
                 decoy), struct_df in shard_df.groupby(['ensemble',
                                                        'subunit']):
                pos = struct_df[['x', 'y', 'z']].astype(np.float32)
                max_dist = util.get_max_distance_from_center(
                    pos, util.get_center(pos))
                max_res = struct_df.residue.max()
                data.append((target, decoy, max_dist, max_res))

    df = pd.DataFrame(data, columns=['target', 'decoy', 'max_dist', 'max_res'])
    if df.empty:
        # Nothing to summarize; the percentages below would divide by zero.
        print('No structures found.')
        return df

    df = df.sort_values(by=['max_dist', 'max_res'],
                        ascending=[False, False]).reset_index(drop=True)
    print(df.describe())

    within = df[df.max_dist < 50]
    # Percentage of structures, then of distinct targets, within the cutoff.
    print(within.shape[0] * 100.0 / df.shape[0])
    print(within.target.unique().shape[0] * 100.0 /
          float(df.target.unique().shape[0]))
    return df
Exemple #8
0
def get_data_stats(sharded_list):
    """
    Get the furthest distance from the protein's center and max residue ID for
    every protein in the sharded datasets.

    For each group of rows sharing ``ensemble`` and ``subunit`` in every
    shard of every dataset in ``sharded_list``, this records the index of
    the dataset within ``sharded_list``, the result of
    ``util.get_max_distance_from_center`` for all atom positions measured
    against the structure's own center (``util.get_center``), and the
    maximum value of the ``residue`` column.

    Prints summary statistics, the percentage of structures whose
    ``max_dist`` is below 90, and the percentage of distinct targets that
    have at least one such structure.

    Returns:
        DataFrame with columns ``sharded``, ``target``, ``decoy``,
        ``max_dist`` and ``max_res``, sorted by ``sharded`` ascending, then
        ``max_dist`` and ``max_res`` descending. The frame is empty (and no
        statistics are printed) when no structures were found.
    """
    data = []
    for i, sharded in enumerate(sharded_list):
        for _, shard_df in sharded.iter_shards():
            for (target, decoy), struct_df in shard_df.groupby(
                    ['ensemble', 'subunit']):
                pos = struct_df[['x', 'y', 'z']].astype(np.float32)
                max_dist = util.get_max_distance_from_center(
                    pos, util.get_center(pos))
                max_res = struct_df.residue.max()
                data.append((i, target, decoy, max_dist, max_res))

    df = pd.DataFrame(
        data, columns=['sharded', 'target', 'decoy', 'max_dist', 'max_res'])
    if df.empty:
        # Nothing to summarize; the percentages below would divide by zero.
        print('No structures found.')
        return df

    df = df.sort_values(by=['sharded', 'max_dist', 'max_res'],
                        ascending=[True, False, False]).reset_index(drop=True)
    print(df.describe())

    within = df[df.max_dist < 90]
    # Percentage of structures, then of distinct targets, within the cutoff.
    print(within.shape[0] * 100.0 / df.shape[0])
    print(within.target.unique().shape[0] * 100.0 /
          float(df.target.unique().shape[0]))
    return df
Exemple #9
0
def df_to_feature(struct_df, grid_config, center_around_Cs, random_seed=None):
    """
    Build a grid for ``struct_df`` centered on its ligand.

    Only a pruned subset of atoms is considered when computing the center:
    carbon atoms (element 'C') when ``center_around_Cs`` is true, otherwise
    atoms whose element is a key of ``grid_config.element_mapping``. The
    center is ``util.get_center`` of the x/y/z coordinates (cast to float32)
    of the pruned rows whose chain is 'L'.

    A rotation matrix is generated from ``grid_config`` and ``random_seed``
    via ``subgrid_gen.gen_rot_matrix`` and the grid is produced by
    ``subgrid_gen.get_grid`` over the full, unpruned ``struct_df``.
    """
    # Consider only atoms that have mapping for computing center.
    # If <center_around_Cs> is set, consider only carbon atoms.
    if center_around_Cs:
        pruned_struct_df = struct_df[struct_df.element == 'C']
    else:
        pruned_struct_df = struct_df[struct_df.element.isin(
            grid_config.element_mapping.keys())]

    # Use center of ligand for subgrid center
    ligand_pos = pruned_struct_df[pruned_struct_df.chain == 'L'][[
        'x', 'y', 'z'
    ]].astype(np.float32)
    ligand_center = util.get_center(ligand_pos)

    rot_mat = subgrid_gen.gen_rot_matrix(grid_config, random_seed=random_seed)
    grid = subgrid_gen.get_grid(struct_df,
                                ligand_center,
                                config=grid_config,
                                rot_mat=rot_mat)
    return grid