コード例 #1
0
def main():
    """Plot distributions of protein sequence properties.

    Command line arguments (read from ``sys.argv``):
        plot_suffix: string to put at the end of plot file names.
        target_EC: file containing target EC numbers, or ``f`` to use the
            file named by the ``EC_file`` entry of the parameter file.
        param_file: parameter file read with ``utilities.read_params``.

    For each property the user is prompted on stdin; answering ``t`` runs the
    corresponding ``pfn.rs_dist_*`` plot. The function always ends by calling
    ``sys.exit()``.

    NOTE(review): ``DB_switch``, ``search_output_dir`` and ``yield_rxn_syst``
    are not defined in this function; they are presumably module-level
    names -- confirm.
    """
    # Three arguments are consumed below (sys.argv[1] .. sys.argv[3]), so the
    # script name plus three arguments must be present. The previous check
    # for a length of 3 made every valid invocation impossible.
    if len(sys.argv) != 4:
        print('Usage: screening.py plot_suffix target_EC param_file\n')
        print("""
        plot_suffix: string to put at the end of plot file names.
        target_EC: file containing a target EC number (f if all EC
            should be used)
        param_file:

        """)
        sys.exit()

    plot_suffix = sys.argv[1]
    target_EC_file = sys.argv[2] if sys.argv[2] != 'f' else None
    pars = utilities.read_params(sys.argv[3])

    settings = {}
    # Without a target file, fall back to the EC file named in the
    # parameter file.
    EC_file = pars['EC_file'] if target_EC_file is None else target_EC_file
    search_ECs = utilities.get_ECs_from_file(EC_file=EC_file)

    print(settings)
    print(search_ECs)

    # plot distributions of protein sequence properties
    if DB_switch != 3:
        # (prompt shown to the user, name of the plotting function in pfn)
        distribution_plots = (
            ('do dist_GRAVY? (t/f)', 'rs_dist_GRAVY'),
            ('do dist_I index? (t/f)', 'rs_dist_I_index'),
            ('do dist_A index? (t/f)', 'rs_dist_A_index'),
            ('do dist_TM index? (t/f)', 'rs_dist_TM_index'),
            ('do dist_pI? (t/f)', 'rs_dist_pI'),
        )
        for prompt, plot_name in distribution_plots:
            if input(prompt) != 't':
                continue
            print('doing....')
            # A fresh generator is created for every plot, as each plot
            # iterates over the reaction systems itself.
            getattr(pfn, plot_name)(
                output_dir=search_output_dir,
                generator=yield_rxn_syst(search_output_dir),
                plot_suffix=plot_suffix,
            )
    sys.exit()
コード例 #2
0
def main():
    """Screen the molecules of a molecule file by their calculated size.

    Command line arguments (read from ``sys.argv``):
        molecule_file: file passed to ``rdkf.read_mol_txt_file``.
        rerun_diameter_calc: ``t`` to recalculate the molecular diameters.
        param_file: parameter file read with ``utilities.read_params``.

    Prints the usage text and exits when the argument count is wrong.
    """
    if len(sys.argv) != 4:
        print("""
    Usage: linB_screening.py

        molecule_file

        rerun_diameter_calc

        param_file

        """)
        sys.exit()

    molecule_file, rerun_flag, param_file = sys.argv[1:4]
    rerun_diameter_calc = rerun_flag == 't'
    pars = utilities.read_params(param_file)

    t_start = time.time()

    # The molecular weight restriction is effectively switched off.
    pars['MW_thresh'] = 2000

    _known_df, molecules, _diameters = rdkf.read_mol_txt_file(molecule_file)

    print('--- draw 2D structures...')
    rdkf.draw_svg_for_all_molecules(molecules)

    # All results of this script are written to / read from one directory.
    results_dir = 'linB_pars'

    # Size of the ellipsoid surrounding each molecule, using the input pars.
    if rerun_diameter_calc:
        print('--- calculating molecular diameters...')
        rdkf.calc_molecule_diameters(molecules, pars=pars, out_dir=results_dir)

    print('--- print results and plot...')
    size_threshold = pars['size_thresh']
    pm.print_results(
        molecules,
        threshold=size_threshold,
        output_dir=results_dir,
    )
    for plot in (pm.categorical, pm.shapes):
        plot(
            molecules,
            threshold=size_threshold,
            output_dir=results_dir,
            plot_suffix='linB',
        )

    elapsed = round(time.time() - t_start, 2)
    print(f'---- total time taken = {elapsed} s')
コード例 #3
0
ファイル: resources.py プロジェクト: imfht/flaskapps
    def post(self):
        """Train a model from the uploaded files and return its stats as JSON.

        Reads the ``params``, ``raw_data`` and ``labels`` uploads from the
        parsed request, builds features, trains a model, registers it with
        ``self.model_factory`` and cross-validates the final classifier.

        Returns:
            A JSON string holding the training statistics; the same
            dictionary is stored under the pipeline's ``'stats'`` entry in
            ``self.model_factory``.
        """
        started = time.time()
        request_args = self.parser.parse_args()

        # Parse the three uploaded streams.
        params = read_params(request_args['params'].stream)
        raw_frame = read_file(request_args['raw_data'].stream.read())
        labels = read_file(request_args['labels'].stream.read())

        # Build the features and align the labels with the feature index.
        X_train = build_features(raw_frame, params)
        labels = labels.set_index('example_id').loc[X_train.index]

        # Train and register the model.
        cl = train_model(X_train, labels.label, params)
        self.model_factory.add_pipeline(cl, params)

        # A TPOT search exposes its best pipeline and the evaluated
        # candidates; any other classifier is used as it is.
        is_tpot = isinstance(cl, tpot.TPOTClassifier)
        final_classifier = cl.fitted_pipeline_ if is_tpot else cl
        evaluated_indivs = cl.evaluated_individuals_ if is_tpot else None
        model_type = str(final_classifier)
        mean_accuracy, mean_roc_auc = cross_validate(
            final_classifier, X_train, labels.label
        )

        # Turn every feature engineering parameter into a string.
        feat_eng_params = params['extract_features'].copy()
        for key, value in list(feat_eng_params.items()):
            if key == 'default_fc_parameters':
                # Only the names of the calculations (min, mean, etc.).
                shown = value.keys()
            elif key == 'impute_function':
                shown = value.__name__
            else:
                shown = value
            feat_eng_params[key] = str(shown)

        pipeline_id = params['pipeline_id']
        result = {
            'trainTime': time.time() - started,
            'trainShape': X_train.shape,
            'modelType': model_type,
            'featureEngParams': feat_eng_params,
            'modelId': pipeline_id,
            'mean_cv_accuracy': mean_accuracy,
            'mean_cv_roc_auc': mean_roc_auc,
            'evaluated_models': evaluated_indivs
        }
        self.model_factory[pipeline_id]['stats'] = result
        return json.dumps(result)
コード例 #4
0
def main():
    """Run ``main_analysis`` with command line options and report the time.

    Command line arguments (read from ``sys.argv``):
        prop_redo: ``t`` sets the ``prop_redo`` flag, anything else clears it.
        param_file: parameter file read with ``utilities.read_params``.

    Prints the usage text and exits when the argument count is wrong.
    """
    if len(sys.argv) != 3:
        print("""
Usage: RS_analysis.py prop_redo param_file
    prop_redo: t for rerun, else to read from prop_done.txt.
    param_file:
""")
        sys.exit()

    redo_flag, param_file = sys.argv[1:3]
    prop_redo = redo_flag == 't'
    pars = utilities.read_params(param_file)

    started = time.time()
    main_analysis(prop_redo=prop_redo, pars=pars)
    elapsed = time.time() - started
    print('--- time taken =', f'{elapsed:.2f}', 's')
コード例 #5
0
def main():
    """Calculate the size of a single molecule, with step plots enabled.

    Command line arguments (read from ``sys.argv``):
        param_file: parameter file read with ``utilities.read_params``.
        molecule: molecule file (``_unopt.mol``) to show the ellipsoid of.

    The diameter is written to ``ellips_plots/<name>_diam.csv`` where
    ``<name>`` is the molecule argument without its ``_unopt.mol`` suffix.
    Prints the usage text and exits when the argument count is wrong.
    """
    if len(sys.argv) != 3:
        # The usage line now lists the two arguments that are actually read;
        # it previously advertised a `redo` and a `mol_file` argument.
        print("""
Usage: molecule_population.py param_file molecule
    param_file:
    molecule :
        molecule file (_unopt.mol) to show the ellipsoid of.
""")
        sys.exit()

    params = utilities.read_params(sys.argv[1])
    molecule = sys.argv[2]

    vdwScale = params['vdwScale']
    boxMargin = params['boxMargin']
    spacing = params['spacing']
    show_vdw = params['show_vdw']
    plot_ellip = params['plot_ellip']
    # Set conformers to 1 to speed up.
    N_conformers = 1
    MW_thresh = params['MW_thresh']
    seed = int(params['seed'])

    name = molecule.replace('_unopt.mol', '')
    diam_file = f'ellips_plots/{name}_diam.csv'
    smiles = rdkf.read_structure_to_smiles(molecule)

    print('>> getting molecular size')
    # Only the files written by the calculation are of interest here, so the
    # return value is deliberately ignored.
    rdkf.calc_molecule_diameter(
        name,
        smiles,
        out_file=diam_file,
        vdwScale=vdwScale,
        boxMargin=boxMargin,
        spacing=spacing,
        MW_thresh=MW_thresh,
        show_vdw=show_vdw,
        plot_ellip=plot_ellip,
        N_conformers=N_conformers,
        rSeed=seed,
        do_step_plot=True,
    )
コード例 #6
0
def main():
    """Collect reaction systems and/or report the skipped reactions.

    Command line arguments (read from ``sys.argv``):
        run: ``t`` to call ``main_run``.
        redo: ``t`` sets the ``redo`` flag passed to ``main_run``.
        param_file: parameter file read with ``utilities.read_params``.
        skipped: ``t`` to call ``percent_skipped`` on the current working
            directory.

    Prints the usage text and exits when the argument count is wrong.
    """
    if len(sys.argv) != 5:
        # The usage line lists all four required arguments in the order in
        # which they are read; `param_file` was previously missing.
        print("""
Usage: RS_collection.py run redo param_file skipped
    run: t to run search for new rxn systems into current dir.
    redo: t to overwrite all rxn systems.
    param_file:
    skipped: t to see the number of skipped rxns in cwd.
""")
        sys.exit()

    run = sys.argv[1] == 't'
    redo = sys.argv[2] == 't'
    pars = utilities.read_params(sys.argv[3])
    skipped = sys.argv[4] == 't'

    if run:
        main_run(redo, pars)
    if skipped:
        search_output_dir = getcwd() + '/'
        percent_skipped(search_output_dir, pars)

    print('----- All done! ------')
コード例 #7
0
def main():
    """Print a stored reaction system and the properties of its components.

    Command line arguments (read from ``sys.argv``):
        param_file: parameter file read with ``utilities.read_params``.
        file: pkl file containing the reaction system, resolved against the
            current working directory.

    Prints the usage text and exits when the argument count is wrong.
    """
    if len(sys.argv) != 3:
        print("""
Usage: visualise_reaction_system.py param_file file

    param_file : (str)

    file : (str)
        pkl file containing reaction system
""")
        sys.exit()

    params = utilities.read_params(sys.argv[1])
    working_dir = os.getcwd()
    rs_path = os.path.join(working_dir, sys.argv[2])
    print(rs_path)

    # Load the reaction system from disk.
    rs = get_RS(
        filename=rs_path,
        output_dir=working_dir,
        pars=params,
        verbose=True,
    )
    print(rs)
    if rs.skip_rxn:
        print(f'>>> {rs.skip_reason}')

    print(f'max d = {rs.max_min_mid_diam}\n')

    # Report every molecular component together with its properties.
    for component in rs.components:
        print(component)
        print(f'SMILEs = {component.SMILES}')
        print(f'logP = {component.logP}')
        print(f'logS = {component.logS}')
        print(f'SA = {component.Synth_score}')
コード例 #8
0
def main():
    """Screen the molecular size of compounds in known reactions.

    Command line arguments (read from ``sys.argv``):
        rerun_diameter_calc: ``t`` to recalculate the molecular diameters.
        param_file: parameter file read with ``utilities.read_params``.

    Prints the usage text and exits when the argument count is wrong.
    """
    if len(sys.argv) != 3:
        print("""
    Usage: biomin_screening.py

        rerun_diameter_calc

        param_file

        """)
        sys.exit()

    rerun_flag, param_file = sys.argv[1:3]
    rerun_diameter_calc = rerun_flag == 't'
    pars = utilities.read_params(param_file)

    t_start = time.time()
    # Only the molecule set is used from the EC definitions.
    _, EC_mol_set, _ = EC_sets()

    # The molecular weight restriction is effectively switched off.
    pars['MW_thresh'] = 2000

    banner = '------------------------------------------------'
    print(banner)
    print('Screen molecular size of compounds in known reactions')
    print(banner)

    # Known reactant and product molecules.
    print('--- get molecule DB + draw 2D structures...')
    molecules, _ = get_molecule_DB(EC_mol_set=EC_mol_set, output_dir='2d_')

    # All size results of this script share one directory.
    results_dir = 'biomin_sizes'

    # Size of the ellipsoid surrounding each molecule.
    if rerun_diameter_calc:
        print('--- calculate molecular diameters...')
        rdkf.calc_molecule_diameters(molecules, pars=pars, out_dir=results_dir)

    print('--- print results and plot...')
    pm.print_results(
        molecules,
        threshold=pars['size_thresh'],
        output_dir=results_dir,
    )

    # Plots of the known molecules.
    known_suffix = 'biomin_known'
    biomin_known(molecules, output_dir=results_dir, plot_suffix=known_suffix)
    pm.categorical(
        molecules,
        threshold=pars['size_thresh'],
        output_dir=results_dir,
        plot_suffix=known_suffix,
    )
    pm.shapes(
        molecules,
        threshold=pars['size_thresh'],
        output_dir=results_dir,
        plot_suffix=known_suffix,
    )

    n_phenyl_assay(output_dir=results_dir)
    cyt_C_perox_assay(output_dir=results_dir)
    HOF_examples(output_dir=results_dir)
    Tash_esters(output_dir=results_dir)

    elapsed = round(time.time() - t_start, 2)
    print(f'---- total time taken = {elapsed} s')
コード例 #9
0
ファイル: resources.py プロジェクト: imfht/flaskapps
 def post(self):
     """Run an uploaded data set through a stored pipeline.

     Reads the ``params`` and ``raw_data`` uploads from the parsed request
     and passes the data to the pipeline identified by
     ``params['pipeline_id']``.

     Returns:
         The pipeline result, with its index reset, serialised by its
         ``to_json`` method.
     """
     request_args = self.parser.parse_args()
     params = read_params(request_args['params'].stream)
     raw_frame = read_file(request_args['raw_data'].stream.read())
     pipeline_id = params['pipeline_id']
     predictions = self.model_factory.use_pipeline(raw_frame, pipeline_id)
     return predictions.reset_index().to_json()
コード例 #10
0
def main():
    """Test the effect of the size calculation parameters on known molecules.

    Command line arguments (read from ``sys.argv``):
        molecule_file: file passed to ``rdkf.read_mol_txt_file``.
        rerun_diameter_calc: ``t`` to recalculate all molecular diameters
            (original parameters, vdW scale tests, seed tests and parameter
            tests).
        param_file: parameter file read with ``utilities.read_params``.
        do_parity: ``t`` to make the scale comparison figures.

    Prints the usage text and exits when the argument count is wrong.
    """
    if len(sys.argv) != 5:
        print("""
    Usage: param_screening.py

        molecule_file :
            enzyme_screen/data/test_molecules.txt

        rerun_diameter_calc :
            t or f

        param_file :
            enzyme_screen/data/param_file.txt

        do_parity :
            t or f to make the figure or not

        """)
        sys.exit()

    molecule_file = sys.argv[1]
    rerun_diameter_calc = sys.argv[2] == 't'
    pars = utilities.read_params(sys.argv[3])
    do_parity = sys.argv[4] == 't'

    start = time.time()

    df, molecules, diameters, min2s = rdkf.read_mol_txt_file(molecule_file)

    print(min2s)

    # Ignore MW restrictions.
    pars['MW_thresh'] = 2000

    # draw 2D structures
    print('--- draw 2D structures...')
    rdkf.draw_svg_for_all_molecules(molecules)

    parameter_sets = {
        'spacing': [0.3, 0.4, 0.5, 0.6],
        'N_conformers': [10, 50, 100, 200, 300, 400, 600, 1000],
        'boxMargin': [4, 6, 8]
    }
    seeds = [
        1, 1000, 500, 50000, 2123, 345555, 542221, 679293, 2755, 99982, 825412,
        342, 54638, 1982, 77654, 8553, 4
    ]
    # Output directory of each vdW scale test mapped to its vdwScale value.
    scale_tests = {
        'scale1_test': 1.0,
        'scale09_test': 0.9,
        'scale08_test': 0.8,
        'scale07_test': 0.7,
    }

    # calculate the size of the ellipsoid surrounding all molecules
    # using input pars
    if rerun_diameter_calc:
        print('--- calculating molecular diameters for all tests...')
        rdkf.calc_molecule_diameters(
            molecules,
            pars=pars,
            out_dir='orig_pars',
        )

        # Scale test: one run per vdwScale value, each with its own copy of
        # the parameters so that `pars` itself is never modified.
        for scale_dir, scale in scale_tests.items():
            new_pars = pars.copy()
            new_pars['vdwScale'] = scale
            rdkf.calc_molecule_diameters(
                molecules,
                pars=new_pars,
                out_dir=scale_dir,
            )

        # Seed test.
        print('--------- seed tests! ----------------')
        seed_molecules = {
            'n-hexane': 'CCCCCC',
            'n-heptane': 'CCCCCCC',
            'n-octane': 'CCCCCCCC',
            'toluene': 'CC1=CC=CC=C1',
            'p-nitrophenol': 'C1=CC(=CC=C1[N+](=O)[O-])O',
            'p-nitrophenyl butyrate':
            ('CCCC(=O)OC1=CC=C(C=C1)[N+](=O)[O-]'),
            'butyric acid': 'CCCC(=O)O',
        }
        for seed in seeds:
            new_pars = pars.copy()
            new_pars['seed'] = seed
            print(f'doing seed {seed}')
            print(new_pars)
            # Every run gets its own copy, exactly as when the dictionary
            # was rebuilt inside the loop.
            new_molecules = dict(seed_molecules)
            print(new_molecules)
            rdkf.calc_molecule_diameters(
                new_molecules,
                pars=new_pars,
                out_dir=f'seeds_{seed}',
            )

        # Parameter tests.
        print('--------- param tests! ----------------')
        test_mol = [
            'n-butane', 'meta-xylene', 'n-hexane', 'n-heptane', 'n-octane',
            'toluene', 'napthalene'
        ]

        new_molecules = {i: molecules[i] for i in molecules if i in test_mol}
        print(new_molecules)

        for t in parameter_sets:
            for v in parameter_sets[t]:
                print(f'doing test {t} with value {v}')
                new_pars = pars.copy()
                new_pars[t] = v
                print(new_pars)

                rdkf.calc_molecule_diameters(
                    new_molecules,
                    pars=new_pars,
                    out_dir=f'ptests_{t}_{v}',
                )

    # Comparisons with literature data. `cf_cuadardocollardos2019` used to
    # be called twice in a row with identical arguments; it is now called
    # once.
    cf_verploegh2015(molecules, output_dir='orig_pars')
    cf_polyukhov2019(molecules, output_dir='orig_pars')
    cf_ueda2019(molecules, output_dir='orig_pars')
    cf_cuadardocollardos2019(molecules, output_dir='orig_pars')

    print_results_cf_known(molecules,
                           known_df=df,
                           threshold=pars['size_thresh'],
                           output_dir='orig_pars')

    parity_with_known(molecules, diameters, output_dir='orig_pars')
    parity_with_known_min2(molecules, min2s, output_dir='orig_pars')

    categorical_with_known(molecules,
                           known_df=df,
                           threshold=pars['size_thresh'],
                           output_dir='orig_pars')

    shapes_with_known(molecules,
                      known_df=df,
                      threshold=pars['size_thresh'],
                      output_dir='orig_pars')
    # '#FA7268', '#F8A72A', '#DAF7A6'
    if do_parity:
        scale_info = {
            # DIR: (scale, C, M, alpha, edgecolor)
            'scale1_test': (1.0, '#FA7268', 'D', 1.0, 'none'),
            'scale09_test': (0.9, '#900C3F', 'o', 1.0, 'k'),
            'scale08_test': (0.8, '#6BADB0', '>', 1.0, 'none'),
            'scale07_test': (0.7, '#F6D973', '<', 1.0, 'none')
        }
        parity_cf_scale_with_known(molecules,
                                   diameters,
                                   known_df=df,
                                   pars=pars,
                                   scale_info=scale_info)
        dist_cf_scale_with_known(molecules,
                                 diameters,
                                 known_df=df,
                                 pars=pars,
                                 scale_info=scale_info)

    seed_test(seeds=seeds)

    parameter_tests(molecules)

    end = time.time()
    print(f'---- total time taken = {round(end-start, 2)} s')
コード例 #11
0
def main():
    """Plot the reaction system properties of one case study.

    Command line arguments (read from ``sys.argv``):
        param_file: parameter file read with ``utilities.read_params``.
        case: case study passed to ``case_studies``.

    Reads ``rs_properties.csv`` from the current working directory, keeps the
    rows whose EC number is in the EC file of the case study and whose
    properties are complete, and writes all plots and the candidate list
    into a directory named after ``pars['file_suffix']``.

    Raises:
        FileNotFoundError: if ``rs_properties.csv`` does not exist.
    """
    if len(sys.argv) != 3:
        print('Usage: screening.py param_file case\n')
        print("""
        param_file (str) :
        case (str) :
            define the case study to use.
        """)
        case_studies(None, None)
        sys.exit()

    pars = utilities.read_params(sys.argv[1])
    case = sys.argv[2]

    pars = case_studies(string=case, pars=pars)

    suffix = pars['file_suffix']
    # exist_ok avoids the check-then-create race of a separate exists test.
    os.makedirs(suffix, exist_ok=True)

    # Get all EC numbers.
    search_ECs = utilities.get_ECs_from_file(EC_file=pars['EC_file'])

    # Properties of all reactions in the current directory.
    search_output_dir = os.getcwd()
    prop_output_file = os.path.join(search_output_dir, 'rs_properties.csv')

    if not os.path.exists(prop_output_file):
        raise FileNotFoundError(f'{prop_output_file} with all data is missing')
    output_data = pd.read_csv(prop_output_file)

    # Select the rows with one boolean mask. Growing a DataFrame row by row
    # with DataFrame.append copies the frame on every call, and the method
    # no longer exists in pandas 2.0.
    keep = [
        row['ec'] in search_ECs and _has_complete_attributes(row)
        for _, row in output_data.iterrows()
    ]
    target_data = output_data[
        pd.Series(keep, index=output_data.index, dtype=bool)
    ]

    pr.no_rxns_vs_size(data=target_data,
                       params=pars,
                       plot_suffix=suffix)
    pr.save_candidates(data=target_data,
                       params=pars,
                       filename=f"{suffix}/candidates_{suffix}.csv")

    pr.rxn_space(data=target_data,
                 filename=f"{suffix}/rxn_space_{suffix}.pdf")

    pr.rxn_value(data=target_data,
                 filename=f"{suffix}/rxn_value_{suffix}.pdf")

    pr.rxn_complexity(data=target_data,
                      filename=f"{suffix}/rxn_complexity_{suffix}.pdf")

    plots_to_do = [
        # Column, type (dist, stacked, pie), xtitle, xlim, width
        ('minlogs', 'dist', 'min. logS', None, 0.5),
        ('minlogs', 'stacked', 'min. logS', None, 0.5),
        ('maxlogp', 'dist', 'max. logP', None, 0.5),
        ('maxlogp', 'stacked', 'max. logP', None, 0.5),
        ('nr', 'dist', 'no reactants', None, 0.5),
        ('np', 'dist', 'no products', None, 0.5),
        ('PC_class', 'pie', 'purchasability class', None, 1),
        ('max_mid_diam', 'dist', r'$d$ of largest component [$\mathrm{\AA}$]',
         None, 0.5),
        ('max_mid_diam', 'stacked',
         r'$d$ of largest component [$\mathrm{\AA}$]', None, 0.5),
        ('deltasa', 'dist', r'$\Delta$ SAscore', (-10, 10), 0.5),
        ('deltasa', 'stacked', r'$\Delta$ SAscore', (-10, 10), 0.5),
        ('deltabct', 'dist', r'$\Delta$ BertzCT', (-1000, 1000), 100),
    ]

    # `plot_type` rather than `type`, which would shadow the builtin.
    for col, plot_type, xtitle, xlim, width in plots_to_do:
        if plot_type == 'stacked':
            # A stacked plot is always accompanied by a violin plot.
            fig, _ = pr.stacked_dist(data=target_data,
                                     col=col,
                                     xtitle=xtitle,
                                     xlim=xlim,
                                     width=width)
            _save_figure(fig, f"{suffix}/stacked_{col}_{suffix}.pdf")
            fig, _ = pr.violinplot(data=target_data,
                                   col=col,
                                   ytitle=xtitle,
                                   ylim=xlim)
            _save_figure(fig, f"{suffix}/violin_{col}_{suffix}.pdf")
        elif plot_type == 'dist':
            fig, _ = pr.dist(X=target_data[col],
                             xtitle=xtitle,
                             xlim=xlim,
                             width=width)
            _save_figure(fig, f"{suffix}/dist_{col}_{suffix}.pdf")
        elif plot_type == 'pie':
            fig, _ = pr.pie(X=target_data[col],
                            xtitle=xtitle,
                            xlim=xlim,
                            width=width)
            _save_figure(fig, f"{suffix}/pie_{col}_{suffix}.pdf")


def _has_complete_attributes(row):
    """Return True if no checked property of ``row`` holds its placeholder."""
    # (column, value that marks the property as incomplete)
    placeholders = (
        ('minlogp', 1E10),
        ('maxlogp', -1E10),
        ('minlogs', 1E10),
        ('maxlogs', -1E10),
        ('rmaxsa', 0),
        ('pmaxsa', 0),
        ('rmaxbct', 0),
        ('pmaxbct', 0),
    )
    return all(float(row[col]) != value for col, value in placeholders)


def _save_figure(fig, filename):
    """Apply a tight layout to ``fig`` and save it to ``filename``."""
    fig.tight_layout()
    fig.savefig(fname=filename, dpi=720, bbox_inches='tight')
コード例 #12
0
def main():
    """Populate the properties of all molecules and optionally plot them.

    Command line arguments (read from ``sys.argv``):
        param_file: parameter file read with ``utilities.read_params``.
        redo_size: ``t`` sets the ``redo_size`` flag of
            ``populate_all_molecules``.
        redo_prop: ``t`` sets the ``redo_prop`` flag of
            ``populate_all_molecules``.
        plot: ``t`` to plot distributions of molecule properties.
        mol_file: file name of a list of molecules, or ``f`` for none.

    Prints the usage text and exits when the argument count is wrong.
    """
    if len(sys.argv) != 6:
        # The usage line names all five required arguments in the order in
        # which they are read; it previously listed only three.
        print("""
Usage: molecule_population.py param_file redo_size redo_prop plot mol_file
    param_file:
    redo_size:
        t to overwrite SIZE of all molecules.
    redo_prop:
        t to overwrite properties of all molecules.
    plot:
        t to plot distributions of molecule properties.
    mol_file :
        file name of list of molecules to allow for trivial
        parallelisation, `f` if not specified, where all `mol`
        files are populated.
""")
        sys.exit()

    params = utilities.read_params(sys.argv[1])
    redo_size = sys.argv[2] == 't'
    redo_prop = sys.argv[3] == 't'
    plot = sys.argv[4] == 't'
    mol_file = None if sys.argv[5] == 'f' else sys.argv[5]

    print('settings:')
    print('    Molecule file:', mol_file)
    print(
        'populate the properties attributes for all '
        'molecules in DB...'
    )

    populate_all_molecules(
        params=params,
        mol_file=mol_file,
        redo_size=redo_size,
        redo_prop=redo_prop,
    )

    if not plot:
        return

    suffix = params['file_suffix']

    # (propx, propy, file, xtitle, ytitle) of every parity plot.
    # NOTE(review): the first plot is the only file name without the
    # suffix -- kept as it was; confirm whether that is intended.
    parity_plots = (
        ('logP', 'logS', 'logPvslogS', 'logP', 'logS'),
        ('NHA', 'Synth_score', f"NHAvsSA_{suffix}",
         'no. heavy atoms', 'SAScore'),
        ('NHA', 'logP', f"NHAvslogP_{suffix}", 'no. heavy atoms', 'logP'),
        ('NHA', 'logS', f"NHAvslogS_{suffix}", 'no. heavy atoms', 'logS'),
    )
    for propx, propy, file, xtitle, ytitle in parity_plots:
        pm.mol_parity(
            propx=propx,
            propy=propy,
            file=file,
            xtitle=xtitle,
            ytitle=ytitle
        )

    # (propx, propy, file, xtitle, ytitle) of every categorical plot.
    categorical_plots = (
        ('purchasability', 'Synth_score', f"purchvsSA_{suffix}",
         'is purchasable', 'SAscore'),
        ('purchasability', 'bertzCT', f"purchvsbCT_{suffix}",
         'is purchasable', 'BertzCT'),
        ('purchasability', 'size', f"purchvssize_{suffix}",
         'is purchasable', r'$d$ [$\mathrm{\AA}$]'),
        ('size', 'bertzCT', f"sizevsbCT_{suffix}", 'can diffuse', 'BertzCT'),
        ('size', 'Synth_score', f"sizevsSA_{suffix}",
         'can diffuse', 'SAscore'),
    )
    for propx, propy, file, xtitle, ytitle in categorical_plots:
        pm.mol_categ(
            propx=propx,
            propy=propy,
            file=file,
            xtitle=xtitle,
            ytitle=ytitle
        )

    pm.mol_all_dist(plot_suffix=suffix)