Ejemplo n.º 1
0
    def process_master_data_no_gap(self, masfile: Path, name1: str,
                                   name2: str) -> pd.DataFrame:
        """Get length data from the MASTER matches of a single-segment search.

        If ``masfile`` with a ``.csv`` suffix already exists it is loaded and
        returned as-is (with the ``match`` column parsed back into Python
        lists); nothing else is computed.

        Otherwise the MASTER output is parsed, merged with ``self.abegos`` and
        ``self.fragments`` on ``pdb``/``chain``, rows with missing values are
        dropped and the first ``self.top_loops`` rows are kept. Since there is
        no gap, every row gets the placeholder loop ``'-'`` with length ``0``.

        Two files are written next to ``masfile``: ``.all.csv`` with every
        kept row and ``.csv`` with the rows left after sorting by ``rmsd`` and
        dropping duplicated ``loop`` values.

        :param masfile: Path to the MASTER output file.
        :param name1: Not used by this method.
        :param name2: Not used by this method.
        :return: The contents of the ``.csv`` file as a data frame.
        """
        def cutter(row):
            # Only the first (and only) matched segment is considered.
            segment = row['match'][0]
            # MASTER starts match count at 0!
            return row['abego'][segment[0]:segment[-1] + 1], '-', 0

        cached = masfile.with_suffix('.csv')
        if cached.is_file():
            df = pd.read_csv(cached)
            # Lists are stored as their string representation in the CSV.
            df['match'] = df['match'].apply(literal_eval)
            return df

        dfloop = parse_master_file(masfile)
        dfloop = dfloop.merge(self.abegos, on=['pdb', 'chain'])
        dfloop = dfloop.merge(self.fragments, on=['pdb', 'chain']).dropna()
        dfloop[['abego', 'loop',
                'loop_length']] = dfloop.apply(cutter,
                                               axis=1,
                                               result_type='expand')
        # Take an explicit copy: a new column is assigned right below, and
        # writing into a slice of another frame is chained assignment.
        dfloop = dfloop.iloc[:self.top_loops].copy()
        dfloop['length_count'] = dfloop.loop_length.map(
            dfloop.loop_length.value_counts())
        dfloop.drop(columns=['pds_path']).to_csv(
            masfile.with_suffix('.all.csv'), index=False)
        finaldf = dfloop.sort_values('rmsd').drop_duplicates(['loop'])

        df = finaldf.drop(columns=['pds_path'])
        df.to_csv(cached, index=False)
        return df
 def test_master(self):
     # Parse the bundled two-piece MASTER search, capped at 1.4 RMSD and
     # with shift_0 enabled, then check the resulting table.
     search_file = os.path.join(self.dirpath, 'master.search')
     df = parse_master_file(search_file,
                            max_rmsd=1.4,
                            piece_count=2,
                            shift_0=True)

     worst_rmsd = df['rmsd'].max()
     assert worst_rmsd == 1.3967

     assert df.shape == (42, 5)

     last_row = df.iloc[-1]
     assert last_row['match'] == [[34, 40], [42, 48]]
Ejemplo n.º 3
0
    def process_master_data(self, masfile: Path, name1: str, name2: str,
                            hairpin: bool) -> pd.DataFrame:
        """Get length data from the MASTER matches and pick one loop length.

        If ``masfile`` with a ``.csv`` suffix already exists it is loaded and
        returned as-is (with the ``match`` column parsed back into Python
        lists); nothing is recomputed and nothing is plotted.

        Otherwise the MASTER output is parsed, merged with ``self.abegos`` and
        ``self.fragments`` on ``pdb``/``chain``, rows with missing values are
        dropped, and for each row the region between the two matched segments
        is extracted as the loop. The first ``self.top_loops`` rows are kept
        and written to ``.all.csv``. After sorting by ``rmsd`` and dropping
        duplicated loops, a single loop length is picked:

        * ``2`` when ``hairpin`` is true and a loop of length 2 is available;
        * otherwise the shortest length among those with the highest
          ``length_count``.

        Only rows with the picked length are written to ``.csv`` and returned.

        :param masfile: Path to the MASTER output file.
        :param name1: Label of the first element, used in the plot title.
        :param name2: Label of the second element, used in the plot title.
        :param hairpin: Whether a loop of length 2 should be preferred.
        :return: The rows with the picked loop length, without ``pds_path``.
        """
        def cutter(row):
            first, second = row['match'][0], row['match'][1]
            # MASTER starts match count at 0!
            loop = row['abego'][first[1] + 1:second[0]]
            return row['abego'][first[0]:second[1] + 1], loop, len(loop)

        cached = masfile.with_suffix('.csv')
        if cached.is_file():
            df = pd.read_csv(cached)
            # Lists are stored as their string representation in the CSV.
            df['match'] = df['match'].apply(literal_eval)
            return df

        dfloop = parse_master_file(masfile)
        dfloop = dfloop.merge(self.abegos, on=['pdb', 'chain'])
        dfloop = dfloop.merge(self.fragments, on=['pdb', 'chain']).dropna()
        dfloop[['abego', 'loop',
                'loop_length']] = dfloop.apply(cutter,
                                               axis=1,
                                               result_type='expand')
        # Take an explicit copy: a new column is assigned right below, and
        # writing into a slice of another frame is chained assignment.
        dfloop = dfloop.iloc[:self.top_loops].copy()
        dfloop['length_count'] = dfloop.loop_length.map(
            dfloop.loop_length.value_counts())
        dfloop.drop(columns=['pds_path']).to_csv(
            masfile.with_suffix('.all.csv'), index=False)
        finaldf = dfloop.sort_values('rmsd').drop_duplicates(['loop'])

        if hairpin and 2 in finaldf['loop_length'].values:
            pick = 2
        else:
            # Shortest loop length among the most frequent ones.
            most_common = (
                finaldf['length_count'] == finaldf['length_count'].max())
            pick = finaldf[most_common]['loop_length'].min()
        finaldf = finaldf[finaldf['loop_length'] == pick]

        # The plot receives all kept rows, not only the picked ones.
        TBPlot.plot_loop_length_distribution(self.log, dfloop, pick,
                                             masfile.with_suffix(''),
                                             f'loop {name1} <-> {name2}')

        df = finaldf.drop(columns=['pds_path'])
        df.to_csv(cached, index=False)
        return df
Ejemplo n.º 4
0
def main(options):
    """Compute geometric properties for MASTER matches and save them as CSV.

    :param options: Object exposing the attributes ``master`` (MASTER search
        file), ``case`` (path to the case file), ``present`` (dot-separated
        names of the secondary structures to use) and ``out`` (output prefix;
        ``.csv`` is appended to it).
    :raises ValueError: If a name in ``options.present`` is not part of the
        case's first connectivity.
    """
    # Load MASTER search data.
    masterdf = parse_master_file(options.master, shift_0=True)

    # Case data
    case = Case(Path(options.case))
    # Get connectivities
    sse = case.connectivities_str[0].split('.')
    # Get flips: they alternate along the connectivity, starting from the
    # configured value for the first element.
    flip_first = case['configuration.flip_first']
    alternating = cycle([flip_first, not flip_first])
    flip = [next(alternating) for _ in sse]
    # Select only the present ones. Plain indexing is used instead of
    # ``itemgetter(*present)``, which returns a bare item rather than a tuple
    # when a single index is given (and fails when none is given).
    present = [sse.index(name) for name in options.present.split('.')]
    flip = [flip[i] for i in present]
    sse = [sse[i] for i in present]

    # Geometric properties retrieval
    masterdf = process_master_geometries(masterdf, sse, flip)
    # Output data
    masterdf.to_csv(str(options.out) + '.csv', index=False)
Ejemplo n.º 5
0
    def process_master_data(self, masfile: Path, names: List,
                            loop_lengths: str,
                            loop_orders: str) -> pd.DataFrame:
        """Get length data from the MASTER matches for every requested jump.

        If ``masfile`` with a ``.csv`` suffix already exists it is loaded and
        returned as-is (with the ``match`` column parsed back into Python
        lists); nothing is recomputed.

        Otherwise the MASTER output is parsed and merged with ``self.abegos``
        and ``self.fragments`` on ``pdb``/``chain``. Consecutive entries of
        ``names`` form the jumps; jump ``k`` uses matched segments ``k`` and
        ``k + 1``. For each jump whose order is not ``'x'`` a folder
        ``loopNN`` is created next to ``masfile``, a loop length is picked
        (the minimum when ``self.pick_by == 'minimal'``, otherwise the
        shortest among the most frequent; forced to 2 for hairpins when
        ``self.hairpins_2`` is set), the distribution is plotted and the
        selected rows are written to ``loopNN/jumpNN.csv``.

        :param masfile: Path to the MASTER output file.
        :param names: Ordered element names; neighbours define the jumps.
        :param loop_lengths: ``;``-separated values, one per jump. Only the
            number of entries matters here (it bounds how many jumps are
            processed).
        :param loop_orders: ``;``-separated order of each jump; ``'x'`` marks
            a jump to skip, anything else must be an integer.
        :return: Selected rows of all jumps, sorted by ``order``; also written
            to ``masfile`` with the ``.all.csv`` suffix.
        :raises ValueError: From ``pd.concat`` when every jump is skipped.
        """
        def cutter(row, num):
            first, second = row['match'][num], row['match'][num + 1]
            # MASTER starts match count at 0!
            loop = row['abego'][first[1] + 1:second[0]]
            return (row['abego'][first[0]:second[1] + 1], loop, len(loop),
                    int(first[0]), int(second[1]))

        # NOTE(review): this checks ``<masfile>.csv`` but the method itself
        # only writes ``<masfile>.all.csv`` and ``loopNN/jumpNN.csv`` --
        # confirm which file is meant to act as the cache.
        cached = masfile.with_suffix('.csv')
        if cached.is_file():
            df = pd.read_csv(cached)
            # Lists are stored as their string representation in the CSV.
            df['match'] = df['match'].apply(literal_eval)
            return df

        llens = loop_lengths.split(';')
        pnames = list(zip(names, names[1:]))
        lorder = loop_orders.split(';')

        dfloop = parse_master_file(masfile)
        dfloop = dfloop.merge(self.abegos, on=['pdb', 'chain'])
        dfloop = dfloop.merge(self.fragments, on=['pdb', 'chain']).dropna()

        container = []
        # ``llens`` is zipped in only because it limits the iteration to the
        # shortest of the three sequences; its values are not read.
        for k, (pname, _llen, lord) in enumerate(zip(pnames, llens, lorder)):
            if lord == 'x':  # skip regions that are not of interest
                continue
            order = int(lord)

            # set up
            nfolder = masfile.parent.absolute().joinpath(f'loop{order:02d}')
            nfolder.mkdir(parents=True, exist_ok=True)
            masfile2 = str(nfolder.joinpath(f'jump{order:02d}'))

            dfloop_copy = dfloop.copy()
            self.log.info(
                f'Current jump is {pname[0], pname[1]} of order {lord} located at {k}'
            )
            dfloop_copy[['abego', 'loop', 'loop_length', 'start',
                         'stop']] = dfloop_copy.apply(cutter,
                                                      num=k,
                                                      axis=1,
                                                      result_type='expand')
            # Take an explicit copy: a new column is assigned right below,
            # and writing into a slice of a frame is chained assignment.
            dfloop_copy = dfloop_copy.iloc[:self.top_loops].copy()
            dfloop_copy['length_count'] = dfloop_copy.loop_length.map(
                dfloop_copy.loop_length.value_counts())
            finaldf = dfloop_copy.sort_values('rmsd').drop_duplicates(['loop'])

            is_hairpin = self.check_hairpin(pname[0], pname[1])
            if self.pick_by == 'minimal':
                pick = finaldf['loop_length'].min()
            else:
                # Shortest loop length among the most frequent ones.
                most_common = (
                    finaldf['length_count'] == finaldf['length_count'].max())
                pick = finaldf[most_common]['loop_length'].min()
            # Explicit ``== True`` is kept on purpose: truthy values that do
            # not compare equal to True must not trigger the override.
            if self.hairpins_2 == True and is_hairpin == True:  # noqa: E712
                pick = 2
            finaldf = finaldf[finaldf['loop_length'] == pick]

            # The plot receives all kept rows, not only the picked ones.
            TBPlot.plot_loop_length_distribution(
                self.log, dfloop_copy, pick, Path(masfile2),
                f'loop {pname[0]} <-> {pname[1]}')

            df = finaldf.drop(columns=['pds_path'])
            df = df.assign(order=[order] * len(df))
            df.to_csv(masfile2 + '.csv', index=False)
            container.append(df)

        df = pd.concat(container).sort_values('order')
        df.to_csv(masfile.with_suffix('.all.csv'), index=False)
        return df