def read_train_data(n_files=1):
    """Read paired protein/ligand training structures and return them.

    Files are looked up as ``training_data/NNNN_pro_cg.pdb`` and
    ``training_data/NNNN_lig_cg.pdb``, where ``NNNN`` is the 1-based file
    index zero-padded to four digits.

    Args:
        n_files: Number of protein/ligand pairs to read, starting at index 1.
            Defaults to 1, which matches the range that used to be
            hard-coded here.

    Returns:
        A ``(pro_list, lig_list)`` tuple. Each list holds one ``read_pdb``
        result per index, in index order.
    """
    pro_list = []
    lig_list = []
    for index in range(1, n_files + 1):
        digit = format(index, '04d')
        pro = read_pdb('training_data/' + digit + '_pro_cg.pdb')
        lig = read_pdb('training_data/' + digit + '_lig_cg.pdb')
        pro_list.append(pro)
        lig_list.append(lig)
    # The lists used to be built and then dropped, so the function always
    # yielded None; hand them back to the caller.
    return pro_list, lig_list
def read_raw_data(n, file_type):
    """Load the first ``n`` coarse-grained PDB files of one kind.

    Args:
        n: How many files to read; must lie between 1 and 3000 inclusive.
        file_type: Either ``"lig"`` or ``"pro"``; selects which file of each
            numbered pair is read.

    Returns:
        On success, a list with one ``[X_list, Y_list, Z_list, atomtype_list]``
        entry per file, in file-number order. On invalid arguments, a
        human-readable error message string is returned instead (nothing is
        raised), with the count check taking precedence over the type check.
    """
    # Guard clauses: invalid arguments are reported as strings, not exceptions.
    if n < 1 or n > 3000:
        return "Please provide a valid number of files between 1 to 3000"
    if file_type not in ("lig", "pro"):
        return "Please provide a valid file type which should be 'lig' or 'pro'"

    structures = []
    for number in range(1, n + 1):
        # File numbers are zero-padded to four digits, e.g. 0007.
        padded = str(number).zfill(4)
        file_name = "{}_{}_cg.pdb".format(padded, file_type)
        path = "./training_data/{}".format(file_name)
        xs, ys, zs, atomtypes = read_pdb(path)
        structures.append([xs, ys, zs, atomtypes])
    return structures
def plot_distribution_range_xyz():
    """Scatter-plot per-axis coordinate range against atom count.

    Reads ``../data/training_data/NNNN_pro_cg.pdb`` for NNNN = 0001..3000,
    printing each index as it goes. For every file it records the number of
    entries in the X list and the max-minus-min spread of the X, Y and Z
    lists, then shows a figure with three side-by-side scatter panels (one
    per axis) of spread versus count.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    atom_counts = []
    extents = {'X': [], 'Y': [], 'Z': []}

    for idx in range(1, 3001):
        print(idx)
        path = '../data/training_data/%s_pro_cg.pdb' % str(idx).zfill(4)
        xs, ys, zs, _atomtypes = read_pdb(path)
        atom_counts.append(len(xs))
        for axis_name, coords in (('X', xs), ('Y', ys), ('Z', zs)):
            extents[axis_name].append(max(coords) - min(coords))

    fig = plt.figure(dpi=128, figsize=(12, 6))
    fig.autofmt_xdate()
    fig.suptitle('Spatial Range VS Number of Atom within a Protein')

    # One panel per axis, laid out in a single row of three.
    for position, axis_name in ((131, 'X'), (132, 'Y'), (133, 'Z')):
        ax = plt.subplot(position)
        ax.set_title('%s Range VS Number of Atoms' % axis_name)
        ax.set_ylabel(axis_name)
        ax.set_xlabel('The Numbers of Atoms within a Protein')
        ax.scatter(atom_counts, extents[axis_name], label='Protein')
        plt.legend()
        plt.grid(True)

    plt.tight_layout()
    # Leave headroom for the figure-level title after tight_layout.
    plt.subplots_adjust(top=0.88)
    plt.show()
def extract_data(i, type):
    """Load one training PDB file as a 2-D array with one row per entry.

    Args:
        i: File number; zero-padded to four digits to form the file name.
        type: File kind inserted into the name
            (``../data/training_data/NNNN_<type>_cg.pdb``).

    Returns:
        A NumPy array built by stacking the x, y, z and atom-type lists from
        ``read_pdb`` and transposing, so the columns are x, y, z, atom type.
    """
    path = '../data/training_data/%s_%s_cg.pdb' % (str(i).zfill(4), type)
    xs, ys, zs, atomtypes = read_pdb(path)
    # NOTE(review): all four lists share one array, so NumPy picks a single
    # common dtype for coordinates and atom types - confirm that is intended.
    stacked = np.array([xs, ys, zs, atomtypes])
    # Copy the transposed view so the returned array owns its data.
    return np.array(stacked.T)