Ejemplo n.º 1
0
def pipeline_STEP6(cfg, logger, cpu_number):
    """STEP 6: run the second round of alignment cleaning, one worker per fasta."""
    banner = "Working with STEP 6: SECOND ALIGNMENT CLEANING"
    print(banner)
    logger.info(banner)
    input_folder = cfg["input_folder"]
    output_folder = cfg["output_folder"]
    create_result_folder(output_folder, logger)
    os.chdir(input_folder)
    # Only consider files that do not yet have a finished output counterpart.
    candidates = get_starting_files(input_folder, output_folder,
                                    cfg["input_format"], cfg["output_suffix"])
    fasta_files = [name for name in candidates if cfg["input_suffix"] in name]
    logger.info("There are {} input fasta files".format(len(fasta_files)))
    if fasta_files:
        mgr = Manager()
        queue = mgr.Queue()
        results = mgr.dict()
        process_future_fasta(
            STEP6.trim_clean,
            fasta_files,
            results,
            queue,
            cpu_number,
            logger,
            cfg,
            tqdm_desc="Cleaning",
        )
    logger.info("STEP 6 finished")
Ejemplo n.º 2
0
def runMakeblastdb(cfg, logger, cpu_number):
    """Build a blastn database for every input fasta that does not have one yet."""
    logger.info("Running blastdb")
    fasta_folder = cfg["input_folder"]
    suffix = cfg["input_suffix"]
    db_folder = cfg["blastn_databases_folder"]
    create_result_folder(db_folder, logger)
    os.chdir(fasta_folder)
    # A ".nin" entry in the database folder marks an already-built database.
    already_built = {name.split(".nin")[0] for name in os.listdir(db_folder)}
    pending = sorted(
        name for name in os.listdir(os.getcwd())
        if suffix in name and name.split(suffix)[0] not in already_built
    )
    if cfg["fasta_sanity_check"]:
        for fasta in pending:
            is_fasta(fasta)
    logger.info("There are {} input fasta files".format(len(pending)))
    if pending:
        mgr = Manager()
        queue = mgr.Queue()
        results = mgr.dict()
        process_future_fasta(
            primaryScript.run_makeblastdb,
            pending,
            results,
            queue,
            cpu_number,
            logger,
            cfg,
            tqdm_desc="Running blastdb",
        )

    logger.info("run_makeblastdb finished")
Ejemplo n.º 3
0
def pipeline_STEP5(cfg, logger, cpu_number):
    """STEP 5: separate inparalogs in each qualifying alignment, in parallel."""
    banner = "Working with STEP 5: INPARALOGS SEPARATION"
    print(banner)
    logger.info(banner)
    input_folder = cfg["input_folder"]
    output_folder = cfg["output_folder"]
    suffix = cfg["input_suffix"]
    min_taxa = cfg["min_taxa_in_alignment"]
    create_result_folder(output_folder, logger)
    os.chdir(input_folder)
    candidates = get_starting_files(input_folder, output_folder,
                                    cfg["input_format"], cfg["output_suffix"])
    # Keep alignments that carry the suffix and contain at least min_taxa taxa.
    eligible = [
        name for name in candidates
        if suffix in name and find_n_taxa(name, min_taxa)
    ]
    logger.info("There are {} input fasta files".format(len(eligible)))
    if eligible:
        mgr = Manager()
        queue = mgr.Queue()
        results = mgr.dict()
        process_future_fasta(
            STEP5.inparalog_separation,
            eligible,
            results,
            queue,
            cpu_number,
            logger,
            cfg,
            tqdm_desc="Inparalogs",
        )
    cleaning_dnadist(logger)
    logger.info("STEP 5 finished")
Ejemplo n.º 4
0
def pipeline_STEP3(cfg, logger, cpu_number):
    """STEP 3: clean alignments by locating unspliced segments.

    Processes every fasta in cfg["input_folder"] that does not yet have a
    finished counterpart in cfg["output_folder"]; work is distributed over
    cpu_number workers via process_future_fasta.
    """
    s = "Working with STEP 3: ALIGNMENT CLEANING"
    print(s)
    logger.info(s)
    input_folder = cfg["input_folder"]
    output_folder = cfg["output_folder"]
    in_suffix = cfg["input_suffix"]
    out_suffix = cfg["output_suffix"]
    create_result_folder(output_folder, logger)
    os.chdir(input_folder)
    # NOTE(review): in_suffix is passed where the other pipeline steps pass
    # in_format -- presumably intentional here; verify against
    # get_starting_files' signature.
    starting_files = get_starting_files(input_folder, output_folder, in_suffix,
                                        out_suffix)
    # info (not debug) so the file count is logged at the same level as in
    # every other pipeline step
    logger.info("There are {} input fasta files".format(len(starting_files)))
    if starting_files:
        manager = Manager()
        fastas = manager.Queue()
        result_dict = manager.dict()
        process_future_fasta(
            STEP3.find_unspliced_segments,
            starting_files,
            result_dict,
            fastas,
            cpu_number,
            logger,
            cfg,
            tqdm_desc="Cleaning",
        )
    logger.info("STEP 3 finished")
Ejemplo n.º 5
0
def runBLAST(cfg, logger, cpu_number):
    """Run blastn for every input fasta against the prebuilt databases.

    Expects 'run_makeblastdb' to have populated
    cfg["blastn_databases_folder"] beforehand.

    Raises:
        Exception: when the database folder or the fasta input folder is
            missing or empty.
    """
    logger.info("Running BLAST")
    in_folder = cfg["input_folder"]
    blastn_dbs = cfg["blastn_databases_folder"]
    in_suffix = cfg["input_suffix"]
    out_folder = cfg["blastn_results_folder"]
    create_result_folder(out_folder, logger)
    # raise already aborts the function; the old sys.exit() after it was
    # unreachable dead code and has been removed
    if not os.path.exists(blastn_dbs) or not os.listdir(blastn_dbs):
        exception = "[error]: either blastdb folder {} is empty or \
it has not been created by running 'run_makeblastdb'".format(blastn_dbs)
        raise Exception(exception)

    if not os.path.exists(in_folder) or not os.listdir(in_folder):
        exception = "[error]: either fasta input folder {} is empty or \
it does not exist".format(in_folder)
        raise Exception(exception)
    os.chdir(in_folder)
    starting_files = sorted(
        [x for x in os.listdir(os.getcwd()) if in_suffix in x])

    if cfg["fasta_sanity_check"]:
        for fasta in starting_files:
            primaryScript.is_fasta(fasta)

    logger.info("There are {} input fasta files".format(len(starting_files)))
    # Pair each fasta with the database(s) it must be searched against.
    fasta_to_db = primaryScript.fasta_to_blastdb(starting_files, cfg)

    logger.info("Starting BLAST...")
    if fasta_to_db:
        manager = Manager()
        fastas = manager.Queue()
        result_dict = manager.dict()
        process_future_fasta(
            primaryScript.run_blastn,
            fasta_to_db,
            result_dict,
            fastas,
            cpu_number,
            logger,
            cfg,
            tqdm_desc="Running BLAST",
        )

    logger.info("run_BLAST finished")
Ejemplo n.º 6
0
def parseXMLs(cfg, logger, cpu_number, temporary_folder):
    """Parse blastn XML results into .dat files and merge hits into a shelve.

    Parsed taxon-pair hits are accumulated into the 'common_hits_dict'
    shelve inside temporary_folder.

    Raises:
        Exception: when cfg["blastn_results_folder"] is missing or contains
            no .xml files.
    """
    logger.info("Parsing xmls")

    in_folder = cfg["blastn_results_folder"]
    # raise already aborts the function; the old sys.exit() after it was
    # unreachable dead code and has been removed
    if not os.path.exists(in_folder) or not [
            x for x in os.listdir(in_folder) if ".xml" in x
    ]:
        exception = "[error]: either blastn_results_folder {} is empty or \
it has not been created by running 'run_BLAST'".format(in_folder)
        raise Exception(exception)
    os.chdir(in_folder)
    starting_files = sorted(
        [x for x in os.listdir(os.getcwd()) if ".xml" in x])

    logger.info("There are {} input xml files".format(len(starting_files)))
    logger.debug("Parsing xml files")
    if starting_files:
        manager = Manager()
        fastas = manager.Queue()
        result_dict = manager.dict()
        process_future_fasta(
            primaryScript.parse_pickle,
            starting_files,
            result_dict,
            fastas,
            cpu_number,
            logger,
            temporary_folder,
            min_align_length=cfg["min_align_len"],
            tqdm_desc="Parsing xmls",
        )
    os.chdir(temporary_folder)
    dat_files = [x for x in os.listdir(os.getcwd()) if ".dat" in x]
    taxa_pairs = primaryScript.find_dat_pairs(dat_files)
    common_hits = shelve.open("common_hits_dict")
    try:
        for pair in taxa_pairs:
            primaryScript.populate_shelve(common_hits, pair)
    finally:
        # close even if populate_shelve raises, so the shelve is not leaked
        common_hits.close()
    logger.info("Done parsing xmls")
Ejemplo n.º 7
0
def PhaseAlleles(cfg, logger, cpu_number, **kargs):
    """Phase alleles for every pending input fasta, one worker per file."""
    logger.info("Phasing alleles starting")
    in_folder = cfg["input_folder"]
    suffix = cfg["input_suffix"]
    out_folder = cfg["output_folder"]
    create_result_folder(out_folder, logger)
    logger.debug("Checking the settings")
    temp_dir = kargs["temp"]
    phase.check_settings(cfg, logger)
    candidates = get_starting_files(in_folder, out_folder, suffix,
                                    cfg["output_suffix"])
    fasta_files = [name for name in candidates if suffix in name]
    count_msg = "There are {} input fasta files".format(len(fasta_files))
    print(count_msg)
    logger.info(count_msg)
    samtools_version = phase.find_samtools_version(cfg)
    print("samtools version is: .X", samtools_version)

    if fasta_files:
        mgr = Manager()
        queue = mgr.Queue()
        results = mgr.dict()
        process_future_fasta(
            phase.phasing_contig,
            fasta_files,
            results,
            queue,
            cpu_number,
            logger,
            cfg,
            temp=temp_dir,
            samtools_version=samtools_version,
            tqdm_desc="phasing",
        )

    logger.info("Phasing alleles finished")
Ejemplo n.º 8
0
def MakeSecondaryAssortments(cfg, logger, cpu_number, **kargs):
    """Split input fastas into passed/failed assortments and rerun large ones.

    Files small enough (and/or passing the taxa filter, depending on
    cfg["filtering_mode_size"] / cfg["filtering_mode_taxa"]) are copied
    straight to cfg["output_folder_passed"]; the remainder are handed to
    secondary.mk_secondary for a secondary search when
    cfg["proceed_with_secondary_search"] is enabled.

    Keyword args (kargs): "temp" (working folder) and "remove_temp" (bool),
    both forwarded to the secondary search.
    """
    logger.info("Retrieving the Secondary sequence assortments")
    in_folder = cfg["input_folder"]
    in_suffix = cfg["input_suffix"]
    max_size = cfg["limit_large_file"]
    filter_taxa = cfg["filtered_taxa_list"]
    filter_taxa_occurences = cfg["filtered_taxon_occurences"]
    filter_taxa_bool = cfg["filtered_taxon_boolean"]
    out_passed = cfg["output_folder_passed"]
    create_result_folder(out_passed, logger)
    out_failed = cfg["output_folder_failed"]
    create_result_folder(out_failed, logger)
    temporary_folder = kargs["temp"]
    remove_temp = kargs["remove_temp"]

    # Anything already present in either output folder (stripped of its
    # "L<number>" part suffix) is considered done.
    out_files = [
        re.split("[L][0-9]+", x)[0] for x in os.listdir(out_passed)
    ] + [re.split("[L][0-9]+", x)[0] for x in os.listdir(out_failed)]
    os.chdir(in_folder)
    # Skip intermediates (_core/_addit/_temp_), empty files, and done files.
    starting_files = sorted([
        x for x in os.listdir(os.getcwd())
        if (in_suffix in x and "_core" not in x and "_addit" not in x
            and "_temp_" not in x and os.stat(x).st_size != 0
            and x not in out_files)
    ])

    if cfg["fasta_sanity_check"]:
        logger.debug("Performing sanity check")
        for fasta in starting_files:
            primary.is_fasta(fasta)

    logger.debug("Searching for large fasta files")
    starting_fasta = "fasta files:" + ",".join(starting_files)
    logger.debug(starting_fasta)
    filtering_modes = "filtering_mode_size: {0}, filtering_mode_taxa: {1}".format(
        cfg["filtering_mode_size"], cfg["filtering_mode_taxa"])
    logger.debug(filtering_modes)
    # A file is "large" (needs the secondary search) when it fails the
    # active filter(s); with neither mode enabled, nothing is large.
    if cfg["filtering_mode_size"] and not cfg["filtering_mode_taxa"]:
        large_fastas = [
            x for x in starting_files
            if not secondary.get_large_files(x, max_size + 1)
        ]
    elif cfg["filtering_mode_taxa"] and not cfg["filtering_mode_size"]:
        large_fastas = [
            x for x in starting_files if not secondary.filter_with_taxa(
                x, filter_taxa, filter_taxa_occurences, filter_taxa_bool)
        ]
    elif cfg["filtering_mode_taxa"] and cfg["filtering_mode_size"]:
        large_fastas = [
            x for x in starting_files
            if (not secondary.filter_with_taxa(
                x, filter_taxa, filter_taxa_occurences, filter_taxa_bool)
                and not secondary.get_large_files(x, max_size + 1))
        ]
    else:
        large_fastas = []

    small_fastas = list(set(starting_files).difference(set(large_fastas)))
    for x in small_fastas:
        try:
            shutil.copy(x, out_passed)
        # narrow best-effort handler: only catch filesystem errors
        # (shutil.Error subclasses OSError), not a bare except that would
        # also swallow KeyboardInterrupt/SystemExit
        except OSError:
            print("[Error]: unable to move file {0} to folder {1}".format(
                x, out_passed))

    logger.info(
        "There are {} small fasta files that have been validated".format(
            len(small_fastas)))
    logger.info(
        "There are {} large fasta files that need to be processed".format(
            len(large_fastas)))

    if cfg["proceed_with_secondary_search"] and large_fastas:
        manager = Manager()
        fastas = manager.Queue()
        result_dict = manager.dict()
        process_future_fasta(
            secondary.mk_secondary,
            large_fastas,
            result_dict,
            fastas,
            cpu_number,
            logger,
            cfg,
            temp=temporary_folder,
            tqdm_desc="2nd search",
            remove_temp=remove_temp,
        )

    logger.debug("Done MakeSecondaryAssortments")
Ejemplo n.º 9
0
def pipeline_STEP1(cfg, logger, cpu_number):
    """STEP 1: align every pending fasta with MAFFT.

    Fastas are partitioned by sequence count: files at or below
    cfg["MAFFT_upper_limit_addfragments"] sequences go through
    STEP1.run_MAFFT_small, the rest through STEP1.run_MAFFT_large.
    Leftover "_temp_aligned.fasta" files from an aborted run are removed
    first.
    """
    s = "Working with STEP 1: running MAFFT"
    print(s)
    logger.info(s)
    input_folder = cfg["input_folder"]
    output_folder = cfg["output_folder"]
    out_suffix = cfg["output_suffix"]
    create_result_folder(output_folder, logger)
    os.chdir(input_folder)
    unwanted_files = [
        x for x in os.listdir(os.getcwd()) if "_temp_aligned.fasta" in x
    ]

    # Remove files from a previously aborted run .
    for f in unwanted_files:
        os.remove(f)
    finished_files = [
        x.split(out_suffix)[0] for x in os.listdir(output_folder)
    ]
    starting_files = sorted([
        x for x in os.listdir(os.getcwd()) if cfg["input_suffix"] in x
        and find_n_seqs(x, 1, cfg["upper_sequence_limit"]) and "_core" not in x
        and "_addit" not in x and x.split(out_suffix)[0] not in finished_files
    ])

    for fasta in starting_files:
        is_fasta(fasta)

    # Size threshold for switching from small_fastas alignment to large_fastas
    size_threshold = cfg["MAFFT_upper_limit_addfragments"]
    # Single pass: find_n_seqs inspects the file, so calling it once per
    # fasta (instead of once per list) halves the file scans.
    small_fastas = []
    large_fastas = []
    for fasta in starting_files:
        if find_n_seqs(fasta, 1, size_threshold + 1):
            small_fastas.append(fasta)
        else:
            large_fastas.append(fasta)
    logger.debug("there are {} small_fastas".format(len(small_fastas)))
    logger.debug("there are {} large_fastas".format(len(large_fastas)))

    if small_fastas:
        manager = Manager()
        fastas = manager.Queue()
        result_dict = manager.dict()
        logger.info("Running Mafft on small fasta")
        process_future_fasta(
            STEP1.run_MAFFT_small,
            small_fastas,
            result_dict,
            fastas,
            cpu_number,
            logger,
            cfg,
            tqdm_desc="Mafft on small files",
        )
    if large_fastas:
        manager = Manager()
        fastas = manager.Queue()
        result_dict = manager.dict()
        logger.info("Running Mafft on large fasta")
        process_future_fasta(
            STEP1.run_MAFFT_large,
            large_fastas,
            result_dict,
            fastas,
            cpu_number,
            logger,
            cfg,
            tqdm_desc="Mafft on large files",
        )
    logger.info("STEP 1 finished")