def main():
    """Generate a masked density map for every HDF5 map under ``args.h5_path``.

    Each input map is run through ``generate_masked_data`` and the result is
    written to ``<output_path>/<pdb_id>.h5``.  Failures on one file are
    reported and do not abort the batch.
    """
    args = parse_args()
    os.makedirs(args.output_path, exist_ok=True)
    h5_files = list_files(args.h5_path)
    for h5_file in h5_files:
        pdb_id = parse_pdb_id(h5_file)
        print("{} => {}".format(h5_file, pdb_id))
        try:
            data = generate_masked_data(h5_file)
            save_h5_map(data,
                        os.path.join(args.output_path,
                                     "{}.h5".format(pdb_id)))
        except Exception as e:
            # Was a bare `except:`, which also swallowed KeyboardInterrupt /
            # SystemExit and hid the failure reason.  Catch Exception only and
            # report the error type so failures are diagnosable.
            print("couldn't process {} ({}: {})".format(
                pdb_id, type(e).__name__, e))
def main():
    """Build per-residue rotamer tables for the val/test/train splits.

    For each split, optionally subsamples the master PDB list
    (``args.n_val`` / ``n_test`` / ``n_train``), reads the per-PDB residue
    dataframe for every selected entry, tags rows with PDB id / resolution /
    subset, concatenates them, and writes the result to
    ``<output_path>/<output_prefix><subset>.h5``.
    """
    args = parse_args()
    df_master = pd.read_hdf(args.master_df_path, "df")
    os.makedirs(args.output_path, exist_ok=True)
    df_files = list_files(args.dfs_dir_path)
    # Index the available per-PDB dataframe files by PDB id for fast lookup.
    df_avai_pdbs = pd.DataFrame(df_files,
                                index=[parse_pdb_id(f) for f in df_files],
                                columns=["path"])
    # Inner merge keeps only master entries whose dataframe file exists.
    df_master = df_master.merge(df_avai_pdbs, left_on="PDB",
                                right_index=True, how="inner")
    for subset in ["val", "test", "train"]:
        print("==================== {} ====================".format(subset))
        df_subset = df_master.query("subset == '{}'".format(subset))
        n1 = len(df_subset)
        if len(df_subset) <= 0:
            continue
        if subset == "val" and args.n_val:
            df_subset = df_subset.sample(args.n_val)
        if subset == "test" and args.n_test:
            df_subset = df_subset.sample(args.n_test)
        if subset == "train" and args.n_train:
            df_subset = df_subset.sample(args.n_train)
        n2 = len(df_subset)
        print("picking {} out of {} samples".format(n2, n1))
        df_rotamers = list()
        for i in range(len(df_subset)):
            pdb, resolution = df_subset.iloc[i][["PDB", "resolution"]]
            my_df = pd.read_hdf(df_avai_pdbs.at[pdb, "path"], "df")[[
                "rot_label", "aa_label", "O", "C", "CA", "N"
            ]]
            # Drop residues missing any backbone atom coordinate.
            my_df = my_df.dropna(subset=["O", "C", "CA", "N"])
            my_df["PDB"] = pdb
            my_df["resolution"] = resolution
            my_df["subset"] = subset
            if args.verbose:
                print(pdb, resolution, subset,
                      "{} residues".format(len(my_df)))
            df_rotamers.append(my_df)
        df_rotamers = pd.concat(df_rotamers, axis=0, ignore_index=True)
        if args.verbose:
            print(df_rotamers)
            print("mean O coor", np.mean(df_rotamers["O"].values))
            print("mean C coor", np.mean(df_rotamers["C"].values))
            print("mean CA coor", np.mean(df_rotamers["CA"].values))
        dest_path = os.path.join(
            args.output_path,
            "{}{}.h5".format(args.output_prefix, subset))
        # BUG FIX: the original wrote df_subset (the PDB-level sample list)
        # here, silently discarding the per-residue df_rotamers table that the
        # loop above just spent all its I/O building.  Write the rotamer table.
        df_rotamers.to_hdf(dest_path, "df")
        print("written to {}".format(dest_path))
def main():
    """Export per-chain CA coordinates for each split into one HDF5 file.

    For each subset (val/test/train) writes datasets named
    ``CA/<pdb>/<chain>`` into ``<output_path>/<output_prefix><subset>.h5``,
    skipping chains that fail ``quality_check``.  Also reports the scaled mean
    consecutive CA-CA distance per chain and its overall mean/std.
    """
    args = parse_args()
    df_master = pd.read_hdf(args.master_df_path,
                            "df").drop_duplicates(subset="PDB")
    os.makedirs(args.output_path, exist_ok=True)
    df_files = list_files(args.dfs_dir_path)
    # Index the available per-PDB dataframe files by PDB id.
    df_avai_pdbs = pd.DataFrame(df_files,
                                index=[parse_pdb_id(f) for f in df_files],
                                columns=["path"])
    df_master = df_master.merge(df_avai_pdbs, left_on="PDB",
                                right_index=True, how="inner")
    ca_ca_dists = []
    for subset in ["val", "test", "train"]:
        print("==================== {} ====================".format(subset))
        dest_path = os.path.join(
            args.output_path,
            "{}{}.h5".format(args.output_prefix, subset))
        df_master_subset = df_master.query("subset == '{}'".format(subset))
        if args.verbose:
            print(df_master_subset)
        # Context manager guarantees the output file is closed even if a
        # per-PDB read raises (the original leaked the handle on error).
        with h5py.File(dest_path, "w") as f:
            for i in range(len(df_master_subset)):
                pdb = df_master_subset.iloc[i]["PDB"]
                my_df = pd.read_hdf(df_avai_pdbs.at[pdb, "path"],
                                    "df")[["chain", "no", "O", "C", "CA", "N"]]
                chains = quality_check(my_df)
                if args.verbose:
                    print(pdb, chains)
                for chain in chains:
                    # Guard clause replaces the original if/pass/else block.
                    if not chains[chain]:
                        print("{}/{} skipped".format(pdb, chain))
                        continue
                    ca_coors = get_ca_coors(my_df, chain)
                    # NOTE(review): the 0.25 factor on the mean consecutive
                    # CA-CA norm is presumably a unit/voxel conversion —
                    # confirm against the coordinate convention.
                    ca_ca_dist = 0.25 * np.mean(
                        np.linalg.norm(ca_coors[0:-1, :] - ca_coors[1:, :],
                                       axis=1))
                    ca_ca_dists.append(ca_ca_dist)
                    print("{}/{} mean CA-CA distance: {:.2f} A".format(
                        pdb, chain, ca_ca_dist))
                    # np.float was removed in NumPy 1.24; float64 is the
                    # dtype the old alias resolved to.
                    ca_coors_ds = f.create_dataset(
                        "CA/{}/{}".format(pdb, chain),
                        ca_coors.shape, dtype=np.float64)
                    ca_coors_ds[()] = ca_coors
    ca_ca_dists = np.array(ca_ca_dists)
    print("CA-CA distance mean: {:.2f} A std: {:.2f} A".format(
        np.mean(ca_ca_dists), np.std(ca_ca_dists)))
def generate_masked_data(h5_file, sigma=10, verbose=True):
    """Attenuate a density map by a Gaussian-smoothed atom-position mask.

    Builds a binary mask with 1s at every recorded atom coordinate
    (columns ending in ``_xyz`` of the map's attribute dataframe), smooths it
    with a Gaussian of width ``sigma``, normalizes it to a peak of 1, and
    multiplies the map by it.

    Args:
        h5_file: path to the input HDF5 density map.
        sigma: standard deviation of the Gaussian smoothing kernel (voxels).
        verbose: print shape/diagnostic information.

    Returns:
        The masked density map as a numpy array of the same shape.
    """
    data = load_h5_map(h5_file)
    # np.float was removed in NumPy 1.24; float64 is what the alias meant.
    mask = np.zeros_like(data, dtype=np.float64)
    df = parse_h5_attrs(h5_file, posfix="_xyz")
    pdb_id = parse_pdb_id(h5_file)
    if verbose:
        print("{} shape:".format(pdb_id), data.shape,
              "{} residues".format(len(df)))
    atoms = [col for col in df.columns if col[-4:] == "_xyz"]
    for i in range(len(df)):
        coors = df.iloc[i][atoms].values
        for coor in coors:
            # Skip atoms with missing coordinates.
            if np.isnan(coor).any():
                continue
            # Coordinates are stored (x, y, z) while the map is indexed
            # (z, y, x) — hence the reversed index order.
            mask[int(coor[2]), int(coor[1]), int(coor[0])] = 1
    mask = gaussian_filter(mask, sigma=sigma)
    peak = np.max(mask)
    # Guard against an all-zero mask (no valid atoms): the original divided
    # by zero here, producing a map full of NaNs.
    if peak > 0:
        mask = mask / peak
    if verbose:
        # Was printed unconditionally; this is diagnostic output, so honor
        # the verbose flag like the other prints in this function.
        print("mask max", np.max(mask))
    data = data * mask
    return data
def main():
    """Extract the attribute dataframe of every HDF5 map into a per-PDB file.

    For each map under ``args.h5_path``, writes ``parse_h5_attrs`` output to
    ``<output_path>/<pdb_id>.h5``.  OSError/KeyError failures are reported,
    optionally appended to ``args.error_log``, and do not abort the batch.
    """
    args = parse_args()
    os.makedirs(args.output_path, exist_ok=True)
    h5_files = list_files(args.h5_path)
    for h5_file in h5_files:
        pdb_id = parse_pdb_id(h5_file)
        dest_path = os.path.join(args.output_path, "{}.h5".format(pdb_id))
        print("({}) {} => {}".format(pdb_id, h5_file, dest_path))
        try:
            df = parse_h5_attrs(h5_file)
            df.to_hdf(dest_path, "df")
        except (OSError, KeyError) as e:
            # The two original handlers were byte-for-byte duplicates that
            # differed only in the exception name; fold them into one and
            # derive the name from the exception itself.  Output messages
            # are unchanged.
            err = type(e).__name__
            print("[{}] cannot process {}".format(err, h5_file))
            if args.error_log is not None:
                with open(args.error_log, "a") as f:
                    f.write("{}, {}, {}\n".format(pdb_id, h5_file, err))