required=True, help="list of .npy filenames for labels to be split randomly e.g. " "'--label_filenames training_y_0.csv validation_y_0.csv " "training_y_-2.csv validation_y_-2.csv'. " "Assumed to be grouped by threshold, in the same order as " "the thresholds parameter.") required_named.add_argument("--thresholds", type=int, nargs='+', required=True, help="list of thresholds to use to reduce dataset by similarity " "score e.g. --thresholds 0 -2") args = parser.parse_args() log_utils.setup_logging(args.verbosity) label_filenames = args.label_filenames data_filenames = args.data_filenames group_proportions = [80, 20] thresholds = args.thresholds logging.info(f"Reading in data frame {args.input}") data_frame = utils.read_bound_pairs(args.input) logging.info(f"Read {len(data_frame)} rows") positives_df = data_frame[data_frame['binding_observed'] == 1].iloc[:args.num_negatives] logging.info(f"Chosen {len(positives_df)} random positive samples") label_filenames_grouped = [label_filenames[2*i:2*i+2] for i in range(len(thresholds))] logging.info(f"{len(label_filenames)} label files grouped into "
"""Snakemake wrapper script for find_unique_bound_pairs.

Sets up logging to the rule's first log file, then forwards the rule's
input and output paths to ``find_unique_bound_pairs.main``. Any error
raised along the way is written to the log with its traceback and then
re-raised.
"""
import logging
import traceback

import peptidebinding.find_unique_bound_pairs as find_unique_bound_pairs
import peptidebinding.helper.log_utils as log_utils

# NOTE(review): `snakemake` is not defined or imported in this file;
# presumably Snakemake injects it when running this as a rule script -
# confirm against the Snakefile.
log_utils.setup_logging(3, logfile=snakemake.log[0])

try:
    # Resolve the callable and its arguments inside the ``try`` so that a
    # failure looking up any of them is logged as well.
    run_script = find_unique_bound_pairs.main
    call_kwargs = {
        "bound_pairs_tables": snakemake.input.bound_pairs,
        "output_file": snakemake.output.bound_pairs,
        "fragment_lengths_out": snakemake.output.fragment_lengths,
    }
    run_script(**call_kwargs)
except BaseException:
    # Catch everything (same set as a bare ``except:``) only to record the
    # traceback in the log file; the exception still propagates.
    failure_report = traceback.format_exc()
    logging.error(f"Unexpected error:\n{failure_report}")
    raise