def process_master_data_no_gap(self, masfile: Path, name1: str, name2: str) -> pd.DataFrame:
    """Collect MASTER matches for a search that has no loop between its pieces.

    If ``masfile`` with a ``.csv`` suffix already exists, it is read back and
    returned (with the ``match`` column parsed into Python lists) and nothing
    is recomputed.

    Otherwise the MASTER output is parsed, merged with ``self.abegos`` and
    ``self.fragments`` on ``pdb``/``chain``, stripped of rows with missing
    values and truncated to the first ``self.top_loops`` rows.  Two files are
    written next to ``masfile``: ``.all.csv`` with every kept match and
    ``.csv`` with the returned selection.

    :param masfile: Path to the MASTER search output.
    :param name1: Unused in this method.
    :param name2: Unused in this method.

    :return: The selected matches, without the ``pds_path`` column.
    """

    def _covered_segment(row):
        # MASTER starts match count at 0!
        first_piece = row['match'][0]
        begin, end = first_piece[0], first_piece[-1]
        # No loop here: a '-' placeholder and a length of 0 are stored instead.
        return row['abego'][begin:end + 1], '-', 0

    cache_file = masfile.with_suffix('.csv')
    if cache_file.is_file():
        cached = pd.read_csv(cache_file)
        # Lists are stored as text in the CSV; turn them back into lists.
        cached['match'] = cached['match'].apply(literal_eval)
        return cached

    hits = (
        parse_master_file(masfile)
        .merge(self.abegos, on=['pdb', 'chain'])
        .merge(self.fragments, on=['pdb', 'chain'])
        .dropna()
    )
    hits[['abego', 'loop', 'loop_length']] = hits.apply(
        _covered_segment, axis=1, result_type='expand')

    hits = hits.iloc[:self.top_loops]
    per_length = hits['loop_length'].value_counts()
    hits['length_count'] = hits['loop_length'].map(per_length)
    hits.drop(columns=['pds_path']).to_csv(masfile.with_suffix('.all.csv'), index=False)

    # Every row carries the same '-' loop, so de-duplicating on it after the
    # RMSD sort leaves only the lowest-RMSD match.
    best = hits.sort_values('rmsd').drop_duplicates(['loop']).drop(columns=['pds_path'])
    best.to_csv(cache_file, index=False)
    return best
def test_master(self):
    # Parse the 'master.search' fixture found in self.dirpath as a two-piece
    # search capped at an RMSD of 1.4 with shift_0 enabled, then check the
    # highest RMSD, the table shape and the match of the last row.
    search_file = os.path.join(self.dirpath, 'master.search')
    df = parse_master_file(search_file, max_rmsd=1.4, piece_count=2, shift_0=True)

    highest_rmsd = df['rmsd'].max()
    assert highest_rmsd == 1.3967

    assert df.shape == (42, 5)

    last_row = df.iloc[-1]
    assert last_row['match'] == [[34, 40], [42, 48]]
def process_master_data(self, masfile: Path, name1: str, name2: str, hairpin: bool) -> pd.DataFrame:
    """Pick the loop length to use from the MASTER matches and return its loops.

    If ``masfile`` with a ``.csv`` suffix already exists, it is read back and
    returned (with the ``match`` column parsed into Python lists) and nothing
    is recomputed.

    Otherwise the MASTER output is parsed, merged with ``self.abegos`` and
    ``self.fragments`` on ``pdb``/``chain``, stripped of rows with missing
    values and truncated to the first ``self.top_loops`` rows.  After sorting
    by RMSD and keeping one row per distinct loop string, a single loop length
    is picked:

    * ``2`` when ``hairpin`` is set and a loop of length 2 is available;
    * otherwise the shortest length among those with the highest
      ``length_count``.

    ``.all.csv`` (every kept match) and ``.csv`` (the returned selection) are
    written next to ``masfile`` and the length distribution is plotted.

    :param masfile: Path to the MASTER search output.
    :param name1: Name of the first structure; only used in the plot title.
    :param name2: Name of the second structure; only used in the plot title.
    :param hairpin: Prefer a loop length of 2 when one is available.

    :return: Matches with the picked loop length, without ``pds_path``.
    """

    def _split_match(row):
        # MASTER starts match count at 0!
        piece_a, piece_b = row['match'][0], row['match'][1]
        abego = row['abego']
        gap = abego[piece_a[1] + 1:piece_b[0]]
        return abego[piece_a[0]:piece_b[1] + 1], gap, len(gap)

    cache_file = masfile.with_suffix('.csv')
    if cache_file.is_file():
        cached = pd.read_csv(cache_file)
        # Lists are stored as text in the CSV; turn them back into lists.
        cached['match'] = cached['match'].apply(literal_eval)
        return cached

    hits = (
        parse_master_file(masfile)
        .merge(self.abegos, on=['pdb', 'chain'])
        .merge(self.fragments, on=['pdb', 'chain'])
        .dropna()
    )
    hits[['abego', 'loop', 'loop_length']] = hits.apply(
        _split_match, axis=1, result_type='expand')

    hits = hits.iloc[:self.top_loops]
    per_length = hits['loop_length'].value_counts()
    hits['length_count'] = hits['loop_length'].map(per_length)
    hits.drop(columns=['pds_path']).to_csv(masfile.with_suffix('.all.csv'), index=False)

    # One row per distinct loop string, the lowest-RMSD occurrence winning.
    unique_loops = hits.sort_values('rmsd').drop_duplicates(['loop'])

    if hairpin and 2 in unique_loops['loop_length'].values:
        pick = 2
    else:
        top_count = unique_loops['length_count'].max()
        most_frequent = unique_loops[unique_loops['length_count'] == top_count]
        pick = most_frequent['loop_length'].min()
    selected = unique_loops[unique_loops['loop_length'] == pick]

    TBPlot.plot_loop_length_distribution(
        self.log, hits, pick, masfile.with_suffix(''), f'loop {name1} <-> {name2}')

    result = selected.drop(columns=['pds_path'])
    result.to_csv(cache_file, index=False)
    return result
def main(options):
    """Compute geometric properties for the MASTER matches of a case.

    The MASTER search in ``options.master`` is parsed (with ``shift_0=True``),
    the case at ``options.case`` is loaded, and the matches are handed to
    ``process_master_geometries`` together with the names and flip states of
    the secondary structures listed in ``options.present``.  The result is
    written to ``<options.out>.csv``.

    :param options: Object exposing ``master``, ``case``, ``present`` (a
        dot-separated list of secondary structure names) and ``out``.

    :raises ValueError: If a name in ``options.present`` is not part of the
        first connectivity string of the case.
    """
    # Load MASTER search data.
    masterdf = parse_master_file(options.master, shift_0=True)

    # Case data
    case = Case(Path(options.case))

    # Get connectivities
    sse = case.connectivities_str[0].split('.')

    # Get flips: they alternate along the connectivity, starting from the
    # configured state of the first structure.
    flip_first = case['configuration.flip_first']
    alternating = cycle([flip_first, not flip_first])
    flip = [next(alternating) for _ in sse]

    # Select only the present ones.  Plain indexing is used rather than
    # ``itemgetter(*present)``: with a single index ``itemgetter`` returns the
    # bare item instead of a tuple, which would split the name into
    # characters and fail on ``list(<bool>)`` for the flips.
    present = [sse.index(name) for name in options.present.split('.')]
    sse = [sse[i] for i in present]
    flip = [flip[i] for i in present]

    # Geometric properties retrieval
    masterdf = process_master_geometries(masterdf, sse, flip)

    # Output data
    masterdf.to_csv(str(options.out) + '.csv', index=False)
def process_master_data(self, masfile: Path, names: List, loop_lengths: str, loop_orders: str) -> pd.DataFrame:
    """Get length data from the MASTER matches, one loop region at a time.

    If ``masfile`` with a ``.csv`` suffix already exists, it is read back and
    returned (with the ``match`` column parsed into Python lists) and nothing
    is recomputed.

    Otherwise the MASTER output is parsed, merged with ``self.abegos`` and
    ``self.fragments`` on ``pdb``/``chain`` and stripped of rows with missing
    values.  Consecutive entries of ``names`` are paired up, and every pair
    whose order is not ``'x'`` is processed on its own:

    * the loop between match pieces ``k`` and ``k + 1`` is cut out, where
      ``k`` is the position of the pair;
    * the first ``self.top_loops`` rows are kept and de-duplicated by loop
      string after sorting by RMSD;
    * one loop length is picked: the shortest one when ``self.pick_by`` is
      ``'minimal'``, otherwise the shortest among the lengths with the
      highest ``length_count``; the pick is forced to 2 when
      ``self.hairpins_2`` is set and the pair is a hairpin;
    * the length distribution is plotted and the selection is written to
      ``loopNN/jumpNN.csv`` next to ``masfile``.

    The per-pair selections are concatenated, sorted by ``order`` and written
    to ``masfile`` with a ``.all.csv`` suffix.

    :param masfile: Path to the MASTER search output.
    :param names: Sequence of structure names; consecutive entries form the
        pairs between which loops are searched.
    :param loop_lengths: ``;``-separated values, one per pair.  Only their
        number matters here (it bounds how many pairs are processed).
    :param loop_orders: ``;``-separated order of each pair, either an integer
        or ``'x'`` to skip the pair.

    :return: The selected loops of all processed pairs, sorted by ``order``.

    :raises ValueError: If an order is neither ``'x'`` nor an integer, or
        (from ``pd.concat``) if no pair was processed at all.
    """

    def cutter(row, num):
        match = row['match']
        abego = row['abego']
        # MASTER starts match count at 0!
        loop = abego[match[num][1] + 1:match[num + 1][0]]
        return (abego[match[num][0]:match[num + 1][1] + 1], loop, len(loop),
                int(match[num][0]), int(match[num + 1][1]))

    # NOTE(review): this method writes ``.all.csv`` and the per-loop
    # ``jumpNN.csv`` files but never the ``.csv`` file checked here; confirm
    # which file is meant to act as the cache.
    cached = masfile.with_suffix('.csv')
    if cached.is_file():
        df = pd.read_csv(cached)
        # Lists are stored as text in the CSV; turn them back into lists.
        df['match'] = df['match'].apply(literal_eval)
        return df

    llens = loop_lengths.split(';')
    pnames = list(zip(names, names[1:]))
    lorder = loop_orders.split(';')

    dfloop = parse_master_file(masfile)
    dfloop = dfloop.merge(self.abegos, on=['pdb', 'chain']).merge(
        self.fragments, on=['pdb', 'chain']).dropna()

    container = []
    for k, (pname, _llen, lord) in enumerate(zip(pnames, llens, lorder)):
        # Compare by value: the identity test against a string literal only
        # worked through CPython's caching of one-character strings and
        # emits a SyntaxWarning on Python 3.8+.
        if lord == 'x':
            # skip regions that are not of interest
            continue
        order = int(lord)

        # set up
        nfolder = masfile.parent.absolute().joinpath(f'loop{order:02d}')
        nfolder.mkdir(parents=True, exist_ok=True)
        masfile2 = str(nfolder.joinpath(f'jump{order:02d}'))

        dfloop_copy = dfloop.copy()
        self.log.info(
            f'Current jump is {pname[0], pname[1]} of order {lord} located at {k}'
        )
        dfloop_copy[['abego', 'loop', 'loop_length', 'start', 'stop']] = dfloop_copy.apply(
            cutter, num=k, axis=1, result_type='expand')
        dfloop_copy = dfloop_copy.iloc[:self.top_loops]
        dfloop_copy['length_count'] = dfloop_copy.loop_length.map(
            dfloop_copy.loop_length.value_counts())
        finaldf = dfloop_copy.sort_values('rmsd').drop_duplicates(['loop'])

        is_hairpin = self.check_hairpin(pname[0], pname[1])
        if self.pick_by == 'minimal':
            pick = finaldf['loop_length'].min()
        else:
            most_frequent = finaldf[finaldf['length_count'] == finaldf['length_count'].max()]
            pick = most_frequent['loop_length'].min()
        # NOTE(review): the explicit ``== True`` comparisons are kept from the
        # original; switch to plain truthiness once both values are confirmed
        # to always be booleans.
        if self.hairpins_2 == True and is_hairpin == True:  # noqa: E712
            pick = 2
        finaldf = finaldf[finaldf['loop_length'] == pick]

        TBPlot.plot_loop_length_distribution(
            self.log, dfloop_copy, pick, Path(masfile2), f'loop {pname[0]} <-> {pname[1]}')

        df = finaldf.drop(columns=['pds_path']).assign(order=order)
        df.to_csv(masfile2 + '.csv', index=False)
        container.append(df)

    df = pd.concat(container).sort_values('order')
    df.to_csv(masfile.with_suffix('.all.csv'), index=False)
    return df