def preprocessing(project_directory, var_matrix_path, full_matrix_path,
                  flag_file_path, **kwargs):
    """
    Group variant sites into amplicon windows and filter out any amplicons
    that do not have well conserved upstream and downstream primer regions.

    Parameters
    ----------
    project_directory : project directory object
        Used to create output files and to provide the run timestamp.
    var_matrix_path : str
        Path to the variant site matrix file.
    full_matrix_path : str or None
        Path to a full genome matrix from which primer-zone flags are
        derived. May be None if ``flag_file_path`` is given.
    flag_file_path : str or None
        Path to a precomputed primer-zone flag file. May be None if
        ``full_matrix_path`` is given. If both are provided, the flag
        file takes precedence.
    **kwargs
        Preprocessing parameters; resolved by ``_set_parameters``.

    Raises
    ------
    ValueError
        If neither ``full_matrix_path`` nor ``flag_file_path`` is given.
    """
    logger = logging.getLogger(__name__)
    logger.info("BEGIN Preprocessing")
    args = _set_parameters(**kwargs)
    start_time = time.time()
    history = History(
        project_directory.make_new_file("history", "preprocessing_history"),
        "Preprocessing", project_directory.timestamp, param_dict=args)

    # Get strains and sites from matrix
    strains = _get_strains_from_file(var_matrix_path, args["sep"])
    sites = _get_sites_from_file(var_matrix_path, args["sep"])

    # Remove excluded strains
    if args["exclude_strains"] is not None:
        strains = _remove_strains(args["exclude_strains"], strains)

    var_matrix = _parse_var_matrix(var_matrix_path, strains, args["sep"])

    if args["strict"]:
        n_sites_before = len(sites)
        var_matrix, sites = _remove_ambiguous_sites(var_matrix, sites)
        logger.info("Strict Mode: {} sites with ambiguous "
                    "or missing data were removed".format(
                        n_sites_before - len(sites)))

    history.add_path("VARIANT SITE MATRIX FILE", var_matrix_path)
    history.add_parameter("Number of Sites", len(sites))
    history.add_parameter("Number of Strains", len(strains))
    _check_inputs(args["pz_size"], args["pz_filter_length"],
                  args["strain_cutoff"], len(strains))

    # Fail fast with a clear message: the original flow would otherwise
    # hit a NameError on flag_df below when neither source is supplied.
    if full_matrix_path is None and flag_file_path is None:
        raise ValueError(
            "Either a full genome matrix or a flag file must be provided")

    if full_matrix_path is not None:
        flag_df = get_flags_from_matrix(full_matrix_path, strains, history,
                                        project_directory, **args)
    # NOTE: when both sources are given, the flag file overrides the flags
    # derived from the full matrix (matrix-derived side effects on history
    # still occur above).
    if flag_file_path is not None:
        flag_df = get_flags_from_file(flag_file_path, history)
    flag_dic = _get_flags_from_counts(flag_df, args["strain_cutoff"])

    amplicon_filter = AmpliconFilter(sites, var_matrix, flag_dic,
                                     args['window'], args['pz_size'],
                                     args['pz_filter_length'],
                                     args['pz_filter_percent'],
                                     args['strict'])
    patterns = amplicon_filter.filter_amplicons_get_patterns()

    # Write patterns to a json file
    pattern_json_file = project_directory.make_new_file(
        "patterns", "patterns", "json")
    patterns.to_json(pattern_json_file, list(strains))
    history.add_path("PATTERN JSON", pattern_json_file)

    # Write history
    logger.info("FINISHED Preprocessing")
    run_time = time.time() - start_time
    history.add_other("Run Time", run_time)
    history.write()
def pattern_selection(project_directory, **kwargs):
    """
    Select a minimum spanning set of amplicons from the patterns produced
    by the preprocessing step and write haplotype/amplicon/pattern output.

    Parameters
    ----------
    project_directory : project directory object
        Used to locate the preprocessing history and to create output
        files for this run.
    **kwargs
        Pattern-selection parameters; resolved by ``_set_parameters``.
    """
    logger = logging.getLogger(__name__)
    logger.info("BEGIN Pattern Selection")
    args = _set_parameters(**kwargs)
    start_time = time.time()
    _check_inputs(args['max_loci'], args['required_loci'],
                  args['exclude_loci'])
    history = History(
        project_directory.make_new_file("history",
                                        "pattern_selection_history"),
        "Pattern_Selection", project_directory.timestamp, param_dict=args)
    # Reopen the history written by the preprocessing run so its recorded
    # paths and parameters can be reused here.
    preprocessing_history = History(
        project_directory.get_parent_subdirectory_file(
            "history",
            "preprocessing_history_{}.txt".format(
                project_directory.get_parent_directory_timestamp())),
        "Preprocessing", exists=True)

    # Get JSON file path from preprocessing step
    json_file = preprocessing_history.get_path("PATTERN JSON")
    variant_matrix = preprocessing_history.get_path(
        "VARIANT SITE MATRIX FILE")
    # Map the recorded separator name back to the literal character.
    sep = {
        'comma': ",",
        "space": " ",
        "tab": "\t"
    }[preprocessing_history.get_parameter("SEP")]
    # Get flag file path and primer zone size from preprocessing step.
    # PZ_SIZE is looked up once and reused (the original fetched it twice).
    flag_file = preprocessing_history.get_path("PRIMER ZONE FLAGS")
    primer_zone_size = preprocessing_history.get_parameter("PZ_SIZE")
    history.add_path("PATTERN JSON", json_file)
    logger.info("Reading from pattern JSON: %s", json_file)

    # Read in pattern JSON
    patterns = Patterns()
    patterns.load_patterns(json_file)
    if len(args['exclude_loci']):
        patterns.remove_sites(args['exclude_loci'])
    if len(args['required_loci']):
        patterns.add_required_sites(args['required_loci'])
    if len(args['exclude_strains']):
        patterns.remove_strains(args['exclude_strains'])
    patterns.set_resolution(args['res'], args['stop_at_res'])

    best_set = _get_minimum_spanning_set(
        patterns, args['reps'], args['max_loci'], args['max_res'],
        args['n_threads'], int(primer_zone_size))

    # Output files for the minimum spanning set results.
    haplotype_file = project_directory.make_new_file(
        "minimum_spanning_set", ".haplotype", "csv")
    amplicon_json = project_directory.make_new_file(
        "minimum_spanning_set", ".amplicons", "json")
    haplotype_matrix = project_directory.make_new_file(
        "minimum_spanning_set", "haplotypes", "csv")
    amplicon_matrix = project_directory.make_new_file(
        "minimum_spanning_set", "amplicons", "csv")
    pattern_matrix = project_directory.make_new_file(
        "minimum_spanning_set", "patterns", "csv")
    summary_file = project_directory.make_new_file("summary", "summary")

    haplotype = Haplotype(patterns, best_set, flag_file, primer_zone_size,
                          variant_matrix, sep)
    haplotype.write_haplotype(haplotype_file)
    history.add_path("Haplotype File", haplotype_file)
    haplotype.write_json(amplicon_json)
    history.add_path("Amplicon JSON", amplicon_json)
    haplotype.write_summary(summary_file)
    history.add_path("Summary", summary_file)
    haplotype.write_output(haplotype_matrix, pattern_matrix, amplicon_matrix)
    history.add_path("Haplotype Matrix", haplotype_matrix)
    history.add_path("Amplicon Matrix", amplicon_matrix)
    history.add_path("Pattern Matrix", pattern_matrix)

    logger.info("FINISHED Pattern Selection")
    run_time = time.time() - start_time
    history.add_other("Run Time", run_time)
    history.write()