Ejemplo n.º 1
0
def preprocessing(project_directory, var_matrix_path, full_matrix_path,
                  flag_file_path, **kwargs):
    """
    Groups variant sites into amplicon windows and filters
    out any amplicons that do not have well conserved
    upstream and downstream primer regions

    The resulting patterns are written to a JSON file created through
    ``project_directory`` and the run is recorded in a history file.

    Parameters
    ----------
    project_directory
        Project object used to create output files (``make_new_file``)
        and to supply the run ``timestamp``.
    var_matrix_path
        Path to the variant site matrix file.
    full_matrix_path
        Path to the full matrix; when not None, flags are computed with
        ``get_flags_from_matrix``.
    flag_file_path
        Path to an existing flag file; when not None, flags are read with
        ``get_flags_from_file``. If both paths are given, the flags read
        from this file are the ones used.
    **kwargs
        Passed to ``_set_parameters``; the resulting parameter dict is
        read for the keys ``sep``, ``exclude_strains``, ``strict``,
        ``pz_size``, ``pz_filter_length``, ``pz_filter_percent``,
        ``strain_cutoff`` and ``window``.

    Raises
    ------
    ValueError
        If both ``full_matrix_path`` and ``flag_file_path`` are None,
        since there would be no source for the flag data.
    """
    # Fail fast, before any file is created or parsed: without at least
    # one flag source the flag table would never be assigned.
    if full_matrix_path is None and flag_file_path is None:
        raise ValueError(
            "Either full_matrix_path or flag_file_path must be provided")

    logger = logging.getLogger(__name__)
    logger.info("BEGIN Preprocessing")
    args = _set_parameters(**kwargs)
    start_time = time.time()

    history = History(project_directory.make_new_file("history",
                                                      "preprocessing_history"),
                      "Preprocessing",
                      project_directory.timestamp,
                      param_dict=args)

    # Get strains and sites from matrix
    strains = _get_strains_from_file(var_matrix_path, args["sep"])
    sites = _get_sites_from_file(var_matrix_path, args["sep"])

    # Remove excluded strains
    if args["exclude_strains"] is not None:
        strains = _remove_strains(args["exclude_strains"], strains)

    var_matrix = _parse_var_matrix(var_matrix_path, strains, args["sep"])

    if args["strict"]:
        n_sites_before = len(sites)
        var_matrix, sites = _remove_ambiguous_sites(var_matrix, sites)
        logger.info("Strict Mode: %s sites with ambiguous "
                    "or missing data were removed",
                    n_sites_before - len(sites))

    history.add_path("VARIANT SITE MATRIX FILE", var_matrix_path)
    history.add_parameter("Number of Sites", len(sites))
    history.add_parameter("Number of Strains", len(strains))

    _check_inputs(args["pz_size"], args["pz_filter_length"],
                  args["strain_cutoff"], len(strains))

    # Two independent checks (not if/elif) on purpose: when both paths are
    # supplied the matrix-derived flags are still computed, and the flags
    # read from the flag file are the ones used afterwards.
    flag_df = None
    if full_matrix_path is not None:
        flag_df = get_flags_from_matrix(full_matrix_path, strains, history,
                                        project_directory, **args)

    if flag_file_path is not None:
        flag_df = get_flags_from_file(flag_file_path, history)

    flag_dic = _get_flags_from_counts(flag_df, args["strain_cutoff"])

    amplicon_filter = AmpliconFilter(sites, var_matrix, flag_dic,
                                     args['window'], args['pz_size'],
                                     args['pz_filter_length'],
                                     args['pz_filter_percent'], args['strict'])

    patterns = amplicon_filter.filter_amplicons_get_patterns()

    # Write patterns to a json file
    pattern_json_file = project_directory.make_new_file(
        "patterns", "patterns", "json")
    patterns.to_json(pattern_json_file, list(strains))
    history.add_path("PATTERN JSON", pattern_json_file)

    # Write history
    logger.info("FINISHED Preprocessing")
    run_time = time.time() - start_time
    history.add_other("Run Time", run_time)
    history.write()