def main():
    args = parse_args()
    os.makedirs(args.output_path, exist_ok=True)
    h5_files = list_files(args.h5_path)
    for h5_file in h5_files:
        pdb_id = parse_pdb_id(h5_file)
        print("{} => {}".format(h5_file, pdb_id))
        try:
            data = generate_masked_data(h5_file)
            save_h5_map(data,
                        os.path.join(args.output_path, "{}.h5".format(pdb_id)))
        except Exception as e:
            print("couldn't process {}: {}".format(pdb_id, e))
Example #2
def main():
    args = parse_args()
    df_master = pd.read_hdf(args.master_df_path, "df")
    os.makedirs(args.output_path, exist_ok=True)
    df_files = list_files(args.dfs_dir_path)
    df_avai_pdbs = pd.DataFrame(df_files,
                                index=[parse_pdb_id(f) for f in df_files],
                                columns=["path"])
    df_master = df_master.merge(df_avai_pdbs,
                                left_on="PDB",
                                right_index=True,
                                how="inner")
    for subset in ["val", "test", "train"]:
        print("==================== {} ====================".format(subset))
        df_subset = df_master.query("subset == '{}'".format(subset))
        n1 = len(df_subset)
        if n1 == 0:
            continue
        if subset == "val" and args.n_val:
            df_subset = df_subset.sample(args.n_val)
        if subset == "test" and args.n_test:
            df_subset = df_subset.sample(args.n_test)
        if subset == "train" and args.n_train:
            df_subset = df_subset.sample(args.n_train)
        n2 = len(df_subset)
        print("picking {} out of {} samples".format(n2, n1))
        df_rotamers = list()
        for i in range(len(df_subset)):
            pdb, resolution = df_subset.iloc[i][["PDB", "resolution"]]
            my_df = pd.read_hdf(df_avai_pdbs.at[pdb, "path"], "df")[[
                "rot_label", "aa_label", "O", "C", "CA", "N"
            ]]
            my_df = my_df.dropna(subset=["O", "C", "CA", "N"])
            my_df["PDB"] = pdb
            my_df["resolution"] = resolution
            my_df["subset"] = subset
            if args.verbose:
                print(pdb, resolution, subset,
                      "{} residues".format(len(my_df)))
            df_rotamers.append(my_df)
        df_rotamers = pd.concat(df_rotamers, axis=0, ignore_index=True)
        if args.verbose:
            print(df_rotamers)
        print("mean O  coor", np.mean(df_rotamers["O"].values))
        print("mean C  coor", np.mean(df_rotamers["C"].values))
        print("mean CA coor", np.mean(df_rotamers["CA"].values))
        dest_path = os.path.join(args.output_path,
                                 "{}{}.h5".format(args.output_prefix, subset))
        df_rotamers.to_hdf(dest_path, "df")
        print("written to {}".format(dest_path))
Example #3
def main():
    args = parse_args()
    df_master = pd.read_hdf(args.master_df_path,
                            "df").drop_duplicates(subset="PDB")
    os.makedirs(args.output_path, exist_ok=True)
    df_files = list_files(args.dfs_dir_path)
    df_avai_pdbs = pd.DataFrame(df_files,
                                index=[parse_pdb_id(f) for f in df_files],
                                columns=["path"])
    df_master = df_master.merge(df_avai_pdbs,
                                left_on="PDB",
                                right_index=True,
                                how="inner")
    ca_ca_dists = []
    for subset in ["val", "test", "train"]:
        print("==================== {} ====================".format(subset))
        dest_path = os.path.join(args.output_path,
                                 "{}{}.h5".format(args.output_prefix, subset))
        df_master_subset = df_master.query("subset == '{}'".format(subset))
        if args.verbose:
            print(df_master_subset)
        f = h5py.File(dest_path, "w")
        for i in range(len(df_master_subset)):
            pdb = df_master_subset.iloc[i]["PDB"]
            my_df = pd.read_hdf(df_avai_pdbs.at[pdb, "path"],
                                "df")[["chain", "no", "O", "C", "CA", "N"]]
            chains = quality_check(my_df)
            if args.verbose:
                print(pdb, chains)
            for chain in chains:
                if not chains[chain]:
                    print("{}/{} skipped".format(pdb, chain))
                    continue
                ca_coors = get_ca_coors(my_df, chain)
                ca_ca_dist = 0.25 * np.mean(
                    np.linalg.norm(ca_coors[0:-1, :] - ca_coors[1:, :],
                                   axis=1))
                ca_ca_dists.append(ca_ca_dist)
                print("{}/{} mean CA-CA distance: {:.2f} A".format(
                    pdb, chain, ca_ca_dist))
                ca_coors_ds = f.create_dataset("CA/{}/{}".format(pdb, chain),
                                               ca_coors.shape,
                                               dtype=np.float64)
                ca_coors_ds[()] = ca_coors
        f.close()
    ca_ca_dists = np.array(ca_ca_dists)
    print("CA-CA distance mean: {:.2f} A std: {:.2f} A".format(
        np.mean(ca_ca_dists), np.std(ca_ca_dists)))
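
quality_check and get_ca_coors are also undefined in the snippet; from their usage, quality_check maps each chain id to a pass/fail flag and get_ca_coors returns the CA coordinates of one chain as an (N, 3) array ordered by residue number. A sketch under those assumptions (the actual pass criteria are guesses):

import numpy as np


def quality_check(df, max_gap=1):
    # Hypothetical criterion: a chain passes if its residue numbering has no gaps
    # larger than max_gap and every residue has a CA coordinate.
    result = {}
    for chain, df_chain in df.groupby("chain"):
        df_chain = df_chain.sort_values("no")
        no_gaps = bool((df_chain["no"].diff().dropna() <= max_gap).all())
        has_ca = bool(df_chain["CA"].notna().all())
        result[chain] = no_gaps and has_ca
    return result


def get_ca_coors(df, chain):
    # CA coordinates of one chain as an (N, 3) float array, ordered by residue number.
    df_chain = df[df["chain"] == chain].sort_values("no")
    return np.stack(df_chain["CA"].values).astype(np.float64)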
def generate_masked_data(h5_file, sigma=10, verbose=True):
    data = load_h5_map(h5_file)
    mask = np.zeros_like(data, dtype=np.float64)
    df = parse_h5_attrs(h5_file, posfix="_xyz")
    pdb_id = parse_pdb_id(h5_file)
    if verbose:
        print("{} shape:".format(pdb_id), data.shape,
              "{} residues".format(len(df)))
    atoms = [col for col in df.columns if col[-4:] == "_xyz"]
    for i in range(len(df)):
        coors = df.iloc[i][atoms].values
        for coor in coors:
            if np.isnan(coor).any():
                continue
            mask[int(coor[2]), int(coor[1]), int(coor[0])] = 1
    mask = gaussian_filter(mask, sigma=sigma)
    mask = mask / np.max(mask)
    print("mask max", np.max(mask))
    data = data * mask
    return data
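
The masking idea itself is independent of the HDF5 plumbing: mark the voxels that contain atoms, blur the binary grid with a Gaussian, rescale it to [0, 1], and multiply the map by it so density far from any modelled atom is suppressed. A self-contained toy run (array size, sigma and atom positions are made up; gaussian_filter is assumed to come from scipy.ndimage, as in generate_masked_data):

import numpy as np
from scipy.ndimage import gaussian_filter

# Toy 32^3 "density map" and three fake atom positions given as (x, y, z) voxel indices.
data = np.random.rand(32, 32, 32)
atom_voxels = [(5, 10, 12), (6, 11, 12), (20, 8, 25)]

mask = np.zeros_like(data)
for x, y, z in atom_voxels:
    mask[z, y, x] = 1            # note the z, y, x indexing, as in generate_masked_data
mask = gaussian_filter(mask, sigma=10)
mask = mask / np.max(mask)       # normalise so the best-covered voxel keeps its full density
masked_data = data * mask        # voxels far from every atom are driven towards zero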
Example #5
def main():
    args = parse_args()
    os.makedirs(args.output_path, exist_ok=True)
    h5_files = list_files(args.h5_path)
    for h5_file in h5_files:
        pdb_id = parse_pdb_id(h5_file)
        dest_path = os.path.join(args.output_path, "{}.h5".format(pdb_id))
        print("({}) {} => {}".format(pdb_id, h5_file, dest_path))
        try:
            df = parse_h5_attrs(h5_file)
            df.to_hdf(dest_path, "df")
        except OSError:
            print("[OSError] cannot process {}".format(h5_file))
            if args.error_log is not None:
                with open(args.error_log, "a") as f:
                    f.write("{}, {}, {}\n".format(pdb_id, h5_file, "OSError"))
        except KeyError:
            print("[KeyError] cannot process {}".format(h5_file))
            if args.error_log is not None:
                with open(args.error_log, "a") as f:
                    f.write("{}, {}, {}\n".format(pdb_id, h5_file, "KeyError"))