Example #1
def preprocessing(project_directory, var_matrix_path, full_matrix_path,
                  flag_file_path, **kwargs):
    """
    Groups variant sites into amplicon windows and filters
    out any amplicons that do not have well-conserved
    upstream and downstream primer regions.
    """
    logger = logging.getLogger(__name__)
    logger.info("BEGIN Preprocessing")
    args = _set_parameters(**kwargs)
    start_time = time.time()

    history = History(project_directory.make_new_file("history",
                                                      "preprocessing_history"),
                      "Preprocessing",
                      project_directory.timestamp,
                      param_dict=args)

    # Get strains and sites from matrix
    strains = _get_strains_from_file(var_matrix_path, args["sep"])
    sites = _get_sites_from_file(var_matrix_path, args["sep"])

    # Remove excluded strains
    if args["exclude_strains"] is not None:
        strains = _remove_strains(args["exclude_strains"], strains)

    var_matrix = _parse_var_matrix(var_matrix_path, strains, args["sep"])

    if args["strict"]:
        n_sites_before = len(sites)
        var_matrix, sites = _remove_ambiguous_sites(var_matrix, sites)
        logger.info("Strict Mode: {} sites with ambiguous "
                    "or missing data were removed".format(n_sites_before -
                                                          len(sites)))

    history.add_path("VARIANT SITE MATRIX FILE", var_matrix_path)
    history.add_parameter("Number of Sites", len(sites))
    history.add_parameter("Number of Strains", len(strains))

    _check_inputs(args["pz_size"], args["pz_filter_length"],
                  args["strain_cutoff"], len(strains))

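    # Primer zone flags are taken from the full matrix or, if provided, from a
    # precomputed flag file; at least one of the two paths must be given,
    # otherwise flag_df is never assigned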
    if full_matrix_path is not None:
        flag_df = get_flags_from_matrix(full_matrix_path, strains, history,
                                        project_directory, **args)

    if flag_file_path is not None:
        flag_df = get_flags_from_file(flag_file_path, history)

    flag_dic = _get_flags_from_counts(flag_df, args["strain_cutoff"])

    amplicon_filter = AmpliconFilter(sites, var_matrix, flag_dic,
                                     args['window'], args['pz_size'],
                                     args['pz_filter_length'],
                                     args['pz_filter_percent'], args['strict'])

    patterns = amplicon_filter.filter_amplicons_get_patterns()

    # Write patterns to a json file
    pattern_json_file = project_directory.make_new_file(
        "patterns", "patterns", "json")
    patterns.to_json(pattern_json_file, list(strains))
    history.add_path("PATTERN JSON", pattern_json_file)

    # Write history
    logger.info("FINISHED Preprocessing")
    run_time = time.time() - start_time
    history.add_other("Run Time", run_time)
    history.write()
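
For reference, a minimal sketch of how this function might be invoked is shown below. The ProjectDirectory constructor and every keyword value are assumptions inferred from how the arguments are used inside the function; they are not part of the example above.

# Hypothetical invocation of Example #1. ProjectDirectory and all keyword
# values below are assumptions inferred from the args[...] lookups above.
project_dir = ProjectDirectory("typing_run")       # assumed helper object
preprocessing(project_dir,
              var_matrix_path="variant_sites.csv",
              full_matrix_path="full_matrix.csv",  # source of primer zone flags ...
              flag_file_path=None,                 # ... or a precomputed flag file
              sep="comma",                         # delimiter keyword (form assumed)
              strict=True,                         # drop ambiguous/missing sites
              exclude_strains=None,
              window=200,                          # amplicon window (value assumed)
              pz_size=200,                         # primer zone size (value assumed)
              pz_filter_length=25,
              pz_filter_percent=75,
              strain_cutoff=1)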
Example #2
def pattern_selection(project_directory, **kwargs):
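    """
    Selects a minimum spanning set of amplicons from the patterns produced
    by the preprocessing step and writes the resulting haplotype, amplicon,
    and pattern matrices along with a run summary.
    """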
    logger = logging.getLogger(__name__)
    logger.info("BEGIN Pattern Selection")
    args = _set_parameters(**kwargs)
    start_time = time.time()
    _check_inputs(args['max_loci'], args['required_loci'],
                  args['exclude_loci'])
    history = History(
        project_directory.make_new_file("history", "pattern_selection_history"),
        "Pattern_Selection",
        project_directory.timestamp,
        param_dict=args)

    preprocessing_history = History(
        project_directory.get_parent_subdirectory_file(
            "history", "preprocessing_history_{}.txt".format(
                project_directory.get_parent_directory_timestamp())),
        "Preprocessing",
        exists=True)

    # Get JSON file path from preprocessing step
    json_file = preprocessing_history.get_path("PATTERN JSON")
    variant_matrix = preprocessing_history.get_path("VARIANT SITE MATRIX FILE")
    sep = {
        "comma": ",",
        "space": " ",
        "tab": "\t"
    }[preprocessing_history.get_parameter("SEP")]

    # Get flag file path from preprocessing step
    flag_file = preprocessing_history.get_path("PRIMER ZONE FLAGS")
    primer_zone_size = preprocessing_history.get_parameter("PZ_SIZE")

    history.add_path("PATTERN JSON", json_file)
    logger.info("Reading from pattern JSON: %s", json_file)
    # Read in pattern JSON
    patterns = Patterns()
    patterns.load_patterns(json_file)
    if len(args['exclude_loci']):
        patterns.remove_sites(args['exclude_loci'])
    if len(args['required_loci']):
        patterns.add_required_sites(args['required_loci'])
    if len(args['exclude_strains']):
        patterns.remove_strains(args['exclude_strains'])
    patterns.set_resolution(args['res'], args['stop_at_res'])
    best_set = _get_minimum_spanning_set(
        patterns, args['reps'], args['max_loci'], args['max_res'],
        args['n_threads'], int(preprocessing_history.get_parameter("PZ_SIZE")))

    haplotype_file = project_directory.make_new_file("minimum_spanning_set",
                                                     ".haplotype", "csv")
    amplicon_json = project_directory.make_new_file("minimum_spanning_set",
                                                    ".amplicons", "json")
    haplotype_matrix = project_directory.make_new_file("minimum_spanning_set",
                                                       "haplotypes", "csv")
    amplicon_matrix = project_directory.make_new_file("minimum_spanning_set",
                                                      "amplicons", "csv")
    pattern_matrix = project_directory.make_new_file("minimum_spanning_set",
                                                     "patterns", "csv")
    summary_file = project_directory.make_new_file("summary", "summary")

    haplotype = Haplotype(patterns, best_set, flag_file, primer_zone_size,
                          variant_matrix, sep)

    haplotype.write_haplotype(haplotype_file)
    history.add_path("Haplotype File", haplotype_file)

    haplotype.write_json(amplicon_json)
    history.add_path("Amplicon JSON", amplicon_json)

    haplotype.write_summary(summary_file)
    history.add_path("Summary", summary_file)

    haplotype.write_output(haplotype_matrix, pattern_matrix, amplicon_matrix)
    history.add_path("Haplotype Matrix", haplotype_matrix)
    history.add_path("Amplicon Matrix", amplicon_matrix)
    history.add_path("Pattern Matrix", pattern_matrix)

    logger.info("FINISHED Pattern Selection")
    run_time = time.time() - start_time
    history.add_other("Run Time", run_time)
    history.write()
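
As above, the following is only a sketch of a possible call. The keyword names come from the args[...] lookups in the function; the values and the ProjectDirectory helper are assumptions.

# Hypothetical invocation of Example #2. Keyword names come from the
# args[...] lookups above; the values and ProjectDirectory are assumptions.
project_dir = ProjectDirectory("typing_run")   # assumed helper object
pattern_selection(project_dir,
                  reps=100,                    # value assumed
                  max_loci=10,                 # value assumed
                  max_res=False,
                  res="full",                  # target resolution (value assumed)
                  stop_at_res=False,
                  n_threads=4,
                  required_loci=[],
                  exclude_loci=[],
                  exclude_strains=[])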