def test_ContactFeaturizer_describe_features():
    """Check that described contact features line up with computed columns.

    A contact scheme and a trajectory are picked at random. For 25 randomly
    drawn feature indices, the residue pair reported by ``describe_features``
    is passed to ``md.compute_contacts`` and the result must equal the
    matching column of the featurizer output.
    """
    scheme = np.random.choice(['ca', 'closest', 'closest-heavy'])
    feat = ContactFeaturizer(scheme=scheme, ignore_nonprotein=True)

    rnd_traj = np.random.randint(len(trajectories))
    traj = trajectories[rnd_traj]

    # transform() takes a list of trajectories; only the first result is used.
    computed = feat.transform([traj])[0]
    description = pd.DataFrame(feat.describe_features(traj))

    n_checks = 25
    for _ in range(n_checks):
        column = np.random.choice(len(description))
        pair = description.iloc[column].resids
        expected, _pairs = md.compute_contacts(
            traj, contacts=[pair], scheme=scheme
        )
        assert (computed[:, column] == expected.flatten()).all()
def _extract_data(self, traj):
    """Collect contact distances from ``traj`` grouped by residue.

    A ``ContactFeaturizer`` is configured from ``self.contacts``,
    ``self.scheme`` and ``self.ignore_nonprotein``. For every residue id
    that appears in the described residue pairs, the distance columns of
    the pairs containing that residue are gathered into a ``DataFrame``
    whose columns carry two label levels: the partner residue, then the
    residue itself. The per-residue frames are concatenated side by side.
    """
    featurizer = ContactFeaturizer(
        contacts=self.contacts,
        scheme=self.scheme,
        ignore_nonprotein=self.ignore_nonprotein,
    )
    distances = featurizer.partial_transform(traj)
    description = featurizer.describe_features(traj)
    residue_pairs = [entry["resids"] for entry in description]

    frames = []
    for resid in np.unique(residue_pairs):
        partners = []
        involved = []
        # Single pass: record which pairs contain `resid` and, for those,
        # the other member of the pair.
        for pair in residue_pairs:
            hit = resid in pair
            involved.append(hit)
            if hit:
                partners.append(list(set(pair) - {resid})[0])
        frames.append(
            pd.DataFrame(
                distances[:, np.array(involved)],
                columns=[partners, [resid] * len(partners)],
            )
        )
    return pd.concat(frames, axis=1)
def Get_contacts_features_villin(
    traj_dir='/homes/anuginueni/traj_villin',
    topology='filtered.pdb',
    reference_traj='trajectory-331.xtc',
    stride=5,
):
    """Compute CA contact features for every ``*.xtc`` file in ``traj_dir``.

    Called with no arguments this behaves like the original hard-coded
    version; the parameters only make the location and stride configurable.

    Side effects:
        * changes the process working directory to ``traj_dir``;
        * deletes an existing ``contacts`` directory there before writing;
        * prints the residue pairs described for ``reference_traj``.

    Parameters
    ----------
    traj_dir : str
        Directory holding the trajectories and the topology file.
    topology : str
        Topology file name, relative to ``traj_dir``.
    reference_traj : str
        Trajectory file name, relative to ``traj_dir``, loaded only to
        describe (and print) the residue pairs of the features.
    stride : int
        Stride passed both to the dataset and to ``md.load``.

    Returns
    -------
    The object returned by ``fit_transform_with`` for the ``contacts/``
    output (written with ``fmt='dir-npy'``).
    """
    import os
    import shutil

    # Third-party imports are resolved up front so that a missing package
    # fails before the existing output directory is removed.
    import mdtraj as md
    from msmbuilder.dataset import dataset
    from msmbuilder.featurizer import ContactFeaturizer

    # The output path below is relative, so the working directory matters.
    os.chdir(traj_dir)
    if os.path.isdir('./contacts'):
        shutil.rmtree('./contacts')

    topology_path = os.path.join(traj_dir, topology)
    xyz = dataset(
        os.path.join(traj_dir, '*.xtc'),
        topology=topology_path,
        stride=stride,
    )
    reference = md.load(
        os.path.join(traj_dir, reference_traj),
        top=topology_path,
        stride=stride,
    )

    featurizer = ContactFeaturizer(scheme='ca')
    described = featurizer.describe_features(reference)
    res = [entry['resids'] for entry in described]
    print(str(res))

    contacts = xyz.fit_transform_with(featurizer, 'contacts/', fmt='dir-npy')
    return contacts
def _extract_data(self, traj):
    """Return contact distances of ``traj`` arranged per residue.

    Uses a ``ContactFeaturizer`` built from ``self.contacts``,
    ``self.scheme`` and ``self.ignore_nonprotein``. Each unique residue id
    found in the described pairs contributes one block of columns: the
    distances of every pair that contains it, labelled with two levels
    (partner residue, then the residue itself). Blocks are concatenated
    along the column axis.
    """
    featurizer = ContactFeaturizer(
        contacts=self.contacts,
        scheme=self.scheme,
        ignore_nonprotein=self.ignore_nonprotein,
    )
    distances = featurizer.partial_transform(traj)
    pairs = [entry['resids'] for entry in featurizer.describe_features(traj)]

    blocks = []
    for resid in np.unique(pairs):
        # Boolean column selector: which pairs involve this residue.
        selector = np.array([resid in pair for pair in pairs])
        # For the selected pairs, the member that is not `resid`.
        partners = [
            list(set(pair) - {resid})[0]
            for pair, chosen in zip(pairs, selector)
            if chosen
        ]
        labels = [partners, [resid] * len(partners)]
        blocks.append(pd.DataFrame(distances[:, selector], columns=labels))
    return pd.concat(blocks, axis=1)
def test_FeatureSelector_describe_features():
    """Check FeatureSelector descriptions against its component featurizers.

    A random trajectory is described by a contact featurizer, a dihedral
    featurizer, and a ``FeatureSelector`` combining both. The combined
    description must have as many rows as the two parts together, and 40
    randomly drawn rows must match the parts concatenated in the
    selector's ``feat_list`` order, column by column.
    """
    rnd_traj = np.random.randint(len(trajectories))
    traj = trajectories[rnd_traj]

    f_ca = ContactFeaturizer(scheme='CA', ignore_nonprotein=True)
    # Output is not inspected; the transform call itself must succeed.
    f_ca.transform([traj])
    ca_description = pd.DataFrame(f_ca.describe_features(traj))

    f_dih = DihedralFeaturizer()
    f_dih.transform([traj])
    dih_description = pd.DataFrame(f_dih.describe_features(traj))

    by_name = {"ca": ca_description, "dih": dih_description}

    f_comb = FeatureSelector([('ca', f_ca), ('dih', f_dih)])
    f_comb.transform([traj])
    combined = pd.DataFrame(f_comb.describe_features(traj))

    assert len(combined) == len(ca_description) + len(dih_description)

    expected = pd.concat([by_name[name] for name in f_comb.feat_list])

    # Spot-check 40 randomly chosen feature rows.
    for row in np.random.choice(range(len(combined)), 40):
        actual_row = combined.iloc[row]
        expected_row = expected.iloc[row]
        for column in combined.columns:
            assert eq(actual_row[column], expected_row[column])