Ejemplo n.º 1
0
def prepare(args: dict, overwriting: bool):
    """Setup run by creating directories and log files.

    Args:
        args (dict): argparser arguments
        overwriting (bool): overwriting flag

    Returns:
        Tuple(DataFrame, DataFrame): Path to output and mapping_table subdirectories.
    """
    output_dir_lsh = make_dir(args, "results_tmp", "lsh_folding", overwriting)
    mapping_table_dir = make_dir(args, "mapping_table", None, overwriting)
    create_log_files(output_dir_lsh)
    create_log_files(mapping_table_dir)
    load_config(args)
    load_key(args)
    method_params_fp = ConfigDict.get_parameters()["fingerprint"]
    method_params_lsh = ConfigDict.get_parameters()["lsh"]
    method_params = {**method_params_fp, **method_params_lsh}
    key = SecretDict.get_secrets()["key"]
    lshf = LSHFoldingCalculator.from_param_dict(
        secret=key, method_param_dict=method_params, verbosity=0)
    outcols = ["fp_feat", "fp_val", "fold_id", "success", "error_message"]
    out_types = ["object", "object", "object", "bool", "object"]
    dt = DfTransformer(
        lshf,
        input_columns={"canonical_smiles": "smiles"},
        output_columns=outcols,
        output_types=out_types,
        success_column="success",
        nproc=args["number_cpu"],
        verbosity=0,
    )
    return output_dir_lsh, mapping_table_dir, dt
def prepare(args: dict, overwriting: bool):
    """Load config and key file,create output directories and setup log files.

    Args:
        args (dict): argparser dictionary

    Returns:
        Path: output directory path
    """

    output_dir = make_dir(args, "results_tmp", "activity_formatting",
                          overwriting)
    mapping_table_dir = make_dir(args, "mapping_table", None, overwriting)
    create_log_files(output_dir)
    return output_dir, mapping_table_dir
Ejemplo n.º 3
0
def prepare(args: dict, overwriting: bool) -> Path:
    """
    Create output folder for matrices

    Args:
        args (dict): argparser dictionary
        overwriting (bool): overwriting argument

    Returns:
        Path: output path
    """
    output_dir = make_dir(args, "matrices", None, overwriting)
    results_dir = make_dir(args, "results", None, overwriting)

    create_log_files(output_dir)
    return output_dir, results_dir
Ejemplo n.º 4
0
def prepare(args):
    overwriting = True

    load_config(args)
    load_key(args)
    output_dir = make_dir(args, "reference_set", None, overwriting)
    key = SecretDict.get_secrets()["key"]
    method_params_standardizer = ConfigDict.get_parameters()["standardization"]
    st = Standardizer.from_param_dict(
        method_param_dict=method_params_standardizer, verbosity=0)
    outcols_st = ["canonical_smiles", "success", "error_message"]
    out_types_st = ["object", "bool", "object"]
    dt_standarizer = DfTransformer(
        st,
        input_columns={"smiles": "smiles"},
        output_columns=outcols_st,
        output_types=out_types_st,
        success_column="success",
        nproc=1,
        verbosity=0,
    )

    method_params_folding = ConfigDict.get_parameters()["scaffold_folding"]
    sa = ScaffoldFoldAssign.from_param_dict(
        secret=key, method_param_dict=method_params_folding, verbosity=0)
    outcols_sa = [
        "murcko_smiles", "sn_smiles", "fold_id", "success", "error_message"
    ]
    out_types_sa = ["object", "object", "int", "bool", "object"]
    dt_fold = DfTransformer(
        sa,
        input_columns={"canonical_smiles": "smiles"},
        output_columns=outcols_sa,
        output_types=out_types_sa,
        success_column="success",
        nproc=1,
        verbosity=0,
    )

    method_params_descriptor = ConfigDict.get_parameters()["fingerprint"]
    dc = DescriptorCalculator.from_param_dict(
        secret=key, method_param_dict=method_params_descriptor, verbosity=0)
    outcols_dc = ["fp_feat", "fp_val", "success", "error_message"]
    out_types_dc = ["object", "object", "bool", "object"]
    dt_descriptor = DfTransformer(
        dc,
        input_columns={"canonical_smiles": "smiles"},
        output_columns=outcols_dc,
        output_types=out_types_dc,
        success_column="success",
        nproc=1,
        verbosity=0,
    )

    return output_dir, dt_standarizer, dt_fold, dt_descriptor
Ejemplo n.º 5
0
def prepare(args: dict, overwriting: bool) -> Path:
    """Load config and key file,create output directories and setup log files.

    Args:
        args (dict): argparser dictionary

    Returns:
        Path: output directory path
    """

    output_dir = make_dir(args, "results_tmp", "thresholding", overwriting)
    create_log_files(output_dir)
    return output_dir
Ejemplo n.º 6
0
def prepare(args):
    """
    Prepare output directories and instantiate df tansformer object for scaffold based folding

    Args:
        args (dict): argparser arguments

    Returns:
        Tuple(Path, DfTransformer): Path to output directory and instatitaed DfTranfomer for sccaffold folding


    """
    output_dir = make_dir(args, "results_tmp", "folding", args["non_interactive"])
    mapping_table_dir = make_dir(args, "mapping_table", None, args["non_interactive"])

    create_log_files(output_dir)
    create_log_files(mapping_table_dir)

    load_config(args)
    load_key(args)
    key = SecretDict.get_secrets()["key"]
    method_params = ConfigDict.get_parameters()["scaffold_folding"]
    sa = ScaffoldFoldAssign.from_param_dict(
        secret=key, method_param_dict=method_params, verbosity=0
    )
    outcols = ["murcko_smiles", "sn_smiles", "fold_id", "success", "error_message"]
    out_types = ["object", "object", "int", "bool", "object"]
    dt = DfTransformer(
        sa,
        input_columns={"canonical_smiles": "smiles"},
        output_columns=outcols,
        output_types=out_types,
        success_column="success",
        nproc=args["number_cpu"],
        verbosity=0,
    )
    return output_dir, mapping_table_dir, dt
Ejemplo n.º 7
0
def do_prepare_training(args):
    """Wrapper to run the entire pipeline for training.

    Args:
        args (Namespace): Subparser argmuents
    #"""
    start_total = time.time()

    start = time.time()
    _args = vars(args)
    if _args["non_interactive"] is True:
        overwriting = True
    else:
        overwriting = False

    num_cpu = _args["number_cpu"]
    # # load parameters and key
    load_config(_args)
    load_key(_args)
    bit_size = melloddy_tuner.utils.config.parameters.get_parameters(
    )["fingerprint"]["fold_size"]
    #########
    # Consistency check
    print("Consistency checks of config and key files.")
    hash_reference_set.main(_args)
    #########
    start = time.time()
    tag = _args["tag"]

    print("Reading input data.")
    df_T0 = read_input_file(_args["weight_table"])
    df_T1 = read_input_file(_args["activity_file"])
    df_T2 = read_input_file(_args["structure_file"])
    print("Data loaded.")
    print("Start sanity checks of input data.")
    print("Check assay types in T0.")
    sanity_check_assay_type(df_T0)

    print("Check consistency of input_assay_id between T0 and T1.")
    sanity_check_assay_sizes(df_T0, df_T1)

    print("Check consistency of input_compound_id between T1 and T2.")
    sanity_check_compound_sizes(df_T1, df_T2)

    print("Check uniqueness of T0 and T2.")
    sanity_check_uniqueness(df_T0, colname="input_assay_id", filename="T0")
    sanity_check_uniqueness(df_T2, colname="input_compound_id", filename="T2")
    print(f"Sanity checks took {time.time() - start:.08} seconds.")
    print(f"Sanity checks passed.")

    start = time.time()
    print("Start standardizing structures.")

    # Make directories, load input files
    results_dir = make_dir(_args, "results", None, overwriting)
    output_dir_std, dt_std = standardize_smiles.prepare(_args)

    df_smi, sd_smi_failed = standardize_smiles.run(df_T2, dt_std)
    save_df_as_csv(output_dir_std, df_smi, "T2_standardized")
    save_df_as_csv(output_dir_std, sd_smi_failed, "T2_standardized.FAILED")
    del sd_smi_failed, df_T2
    print(f"Standardization took {time.time() - start:.08} seconds.")
    print(f"Standardization done.")
    df_T5 = pd.DataFrame()
    df_T6 = pd.DataFrame()
    if _args["folding_method"] == "scaffold":
        print("Using scaffold-based fold assignment.")

        output_dir_desc, dt_desc = calculate_descriptors.prepare(
            _args, overwriting)

        start = time.time()
        print("Start calculating descriptors.")

        df_desc, df_desc_failed = calculate_descriptors.run(df_smi, dt_desc)

        save_df_as_csv(output_dir_desc, df_desc, "T2_descriptors")
        save_df_as_csv(output_dir_desc, df_desc_failed,
                       "T2_descriptors.FAILED")
        del df_smi, df_desc_failed

        print(
            f"Fingerprint calculation took {time.time() - start:.08} seconds.")
        print(f"Descriptor calculation done.")

        start = time.time()
        print("Start computing folds.")
        output_dir_fold, mapping_table_dir, dt_fold = calculate_scaffold_folds.prepare(
            _args)

        df_fold, df_fold_failed = calculate_scaffold_folds.run(
            df_desc, dt_fold)
        save_df_as_csv(output_dir_fold, df_fold, "T2_folds")
        save_df_as_csv(output_dir_fold, df_fold_failed, "T2_folds.FAILED")
        del df_fold_failed, df_desc
        df_T5, df_T6, df_duplicates = helper.format_dataframe(df_fold)
        save_df_as_csv(mapping_table_dir, df_T5, "T5")
        save_df_as_csv(mapping_table_dir, df_T6, "T6")
        save_df_as_csv(output_dir_desc, df_duplicates,
                       "T2_descriptor_vector_id.DUPLICATES")
        del df_duplicates

        print(f"Fold calculation took {time.time() - start:.08} seconds.")
        print(f"Fold calculation done.")

    elif _args["folding_method"] == "lsh":
        print("Using LSH based fold assignment.")
        output_dir_lsh, mapping_table_dir, dt_lsh = calculate_lsh_folds.prepare(
            _args, overwriting)

        output_file = os.path.join(output_dir_lsh, "T2_descriptors_lsh.csv")
        error_file = os.path.join(output_dir_lsh,
                                  "T2_descriptors_lsh.FAILED.csv")
        dupl_file = os.path.join(output_dir_lsh,
                                 "T2_descriptors_lsh.DUPLICATES.csv")
        mapping_file_T5 = os.path.join(mapping_table_dir, "T5.csv")
        mapping_file_T6 = os.path.join(mapping_table_dir, "T6.csv")

        df_desc_lsh, df_desc_lsh_failed = dt_lsh.process_dataframe(df_smi)
        df_desc_lsh.to_csv(output_file, index=False)
        df_desc_lsh_failed.to_csv(error_file, index=False)
        df_T5, df_T6, df_duplicates = helper.format_dataframe(df_desc_lsh)
        df_duplicates.to_csv(dupl_file, index=False)
        df_T5.to_csv(mapping_file_T5, index=False)
        df_T6.to_csv(mapping_file_T6, index=False)
        del df_duplicates
        end = time.time()
        print(
            f"Fingerprint calculation and LSH folding took {end - start:.08} seconds."
        )
        print(f"Descriptor calculation and LSH folding done.")
    else:
        print("Please use scaffold or lsh as folding method.")
        quit()

    start = time.time()

    print("Start aggregating values.")

    output_dir_agg = aggregate_values.prepare(_args, overwriting)

    (
        df_T4r,
        df_failed_range,
        df_failed_aggr,
        df_failed_std,
        df_dup,
        df_T0_upd,
    ) = aggregate_values.aggregate_replicates(
        df_T0, df_T1, df_T5,
        ConfigDict.get_parameters()["credibility_range"], num_cpu)
    df_T4r = df_T4r[[
        "input_assay_id",
        "descriptor_vector_id",
        "fold_id",
        "standard_qualifier",
        "standard_value",
    ]]
    save_df_as_csv(
        output_dir_agg,
        df_T4r,
        "T4r",
        [
            "input_assay_id",
            "descriptor_vector_id",
            "fold_id",
            "standard_qualifier",
            "standard_value",
        ],
    )
    save_df_as_csv(
        output_dir_agg,
        df_failed_range,
        "failed_range_T1",
        [
            "input_compound_id", "input_assay_id", "standard_qualifier",
            "standard_value"
        ],
    )
    save_df_as_csv(
        output_dir_agg,
        df_failed_aggr,
        "failed_aggr_T1",
        [
            "descriptor_vector_id",
            "input_assay_id",
            "standard_qualifier",
            "standard_value",
            "fold_id",
        ],
    )
    save_df_as_csv(
        output_dir_agg,
        df_failed_std,
        "failed_std_T1",
        [
            "descriptor_vector_id",
            "input_assay_id",
            "standard_qualifier",
            "standard_value",
            "fold_id",
        ],
    )
    save_df_as_csv(
        output_dir_agg,
        df_dup,
        "duplicates_T1",
        [
            "input_assay_id",
            "input_compound_id",
            "descriptor_vector_id",
            "fold_id",
            "standard_qualifier",
            "standard_value",
        ],
    )
    save_df_as_csv(output_dir_agg, df_T0_upd, "T0_upd")
    del df_T5, df_failed_range, df_failed_aggr, df_dup, df_T1
    print(f"Replicate aggregation took {time.time() - start:.08} seconds.")
    print(f"Replicate aggregation done.")

    start = time.time()
    print("Start thresholding.")
    output_dir_thres = apply_thresholding.prepare(_args, overwriting)
    df_T0_upd = df_T0_upd.astype({"input_assay_id": "int"})
    df_T4r = df_T4r.astype({"input_assay_id": "int"})
    df_T4c, df_T3c = apply_thresholding.run(df_T0_upd, df_T4r, num_cpu)

    # Write final dataframes (T4c, T3c)
    columns_T3c = [
        "classification_task_id",
        "input_assay_id",
        "assay_type",
        "variance_quorum_OK",
        "use_in_regression",
        "is_auxiliary",
        "threshold",
        "threshold_method",
        "direction",
    ]
    columns_T4c = [
        "classification_task_id",
        "descriptor_vector_id",
        "fold_id",
        "input_assay_id",
        "standard_qualifier",
        "standard_value",
        "threshold",
        "class_label",
    ]

    df_T4c.sort_values("classification_task_id", inplace=True)
    df_T3c.sort_values("classification_task_id", inplace=True)

    # Filter ambiguous class labels
    df_T4c_failed = df_T4c[df_T4c.class_label.isna()]
    df_T4c = df_T4c[~df_T4c.class_label.isna()]

    df_T4c = df_T4c[columns_T4c]
    df_T3c = df_T3c[columns_T3c]

    save_df_as_csv(output_dir_thres, df_T4c_failed, "T4c.FAILED")
    save_df_as_csv(output_dir_thres, df_T4c, "T4c")
    save_df_as_csv(output_dir_thres, df_T3c, "T3c")

    print(f"Thresholding took {time.time() - start:.08} seconds.")
    print(f"Thresholding done.")

    print("Start filter classification data.")
    start = time.time()

    output_dir_filter_clf = filter_classification.prepare(_args, overwriting)
    T10c, T8c, T4c_filtered_out, T4c_dedup = filter_classification.filter_clf(
        df_T3c,
        df_T4c,
        ConfigDict.get_parameters()["training_quorum"]["classification"],
        ConfigDict.get_parameters()["evaluation_quorum"]["classification"],
        ConfigDict.get_parameters()["initial_task_weights"],
    )

    filter_classification.write_tmp_output(output_dir_filter_clf, T10c, T8c,
                                           T4c_filtered_out, T4c_dedup)

    del df_T4c, df_T3c, T4c_filtered_out, T4c_dedup

    print(f"Classification filtering took {time.time() - start:.08} seconds.")
    print(f"Classification filtering done.")
    print("Start filter regression data.")
    #####
    start = time.time()
    out_dir_filter_reg = filter_regression.prepare(_args, overwriting)

    T10r, T8r, T4r_filtered_out, T4r_dedup = filter_regression.filter_regression_tasks(
        df_T0_upd,
        df_T4r,
        ConfigDict.get_parameters()["training_quorum"]["regression"],
        ConfigDict.get_parameters()["evaluation_quorum"]["regression"],
        ConfigDict.get_parameters()["initial_task_weights"],
        ConfigDict.get_parameters()["censored_downweighting"],
    )
    filter_regression.write_tmp_output(out_dir_filter_reg, T10r, T8r,
                                       T4r_filtered_out, T4r_dedup)
    del df_T0, df_T4r, T4r_filtered_out, T4r_dedup
    print(f"Filtering regression data took {time.time() - start:.08} seconds.")
    print(f"Filtering regression data done.")

    print("Start creating sparse matrices.")

    start = time.time()
    out_dir_matrices, results_dir = csv_2_mtx.prepare(_args, overwriting)

    df_T6_cont, T10c_cont, T10r_cont = csv_2_mtx.get_cont_id(df_T6, T10c, T10r)
    df_T11 = df_T6_cont[["cont_descriptor_vector_id", "fold_id", "fp_feat"]]

    save_df_as_csv(results_dir, T10c_cont, "T10c_cont")
    save_df_as_csv(results_dir, T10r_cont, "T10r_cont")
    save_df_as_csv(results_dir, df_T6_cont, "T6_cont")

    csv_2_mtx.save_csv_output(out_dir_matrices, tag, T8c, T8r)
    del df_T6, df_T6_cont, T10r, T10c

    (
        x_matrix,
        fold_vector,
        y_matrix_clf,
        y_matrix_reg,
        censored_mask,
    ) = csv_2_mtx.make_matrices(df_T11, T10c_cont, T10r_cont, bit_size)
    del df_T11, T10c_cont, T10r_cont
    y_matrix_clf.data = np.nan_to_num(y_matrix_clf.data, copy=False)
    y_matrix_clf.eliminate_zeros()

    csv_2_mtx.save_npy_matrices(
        out_dir_matrices,
        tag,
        x_matrix,
        fold_vector,
        y_matrix_clf,
        y_matrix_reg,
        censored_mask,
    )

    print(f"Formatting to matrices took {time.time() - start:.08} seconds.")
    end = time.time()
    print(f"Overall processing took {end - start_total:.08} seconds.")
    print(f"Files are ready for SparseChem.")