def makePrimaryAssortments(cfg, logger, temporary_folder):
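    """Create the output and discarded-groups folders, then delegate the
    construction of the primary sequence groups to primaryScript.mk_primary."""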
    logger.info("Retrieving the primary sequence assortments")
    out_folder = cfg["output_folder"]
    create_result_folder(out_folder, logger)
    discard_aligns = cfg["discarded_groups_folder"]
    create_result_folder(discard_aligns, logger)
    primaryScript.mk_primary(cfg, logger, temporary_folder)
def runMakeblastdb(cfg, logger, cpu_number):
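    """Build a BLAST nucleotide database for each input fasta, skipping files
    that already have a .nin index in the database folder."""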
    logger.info("Running blastdb")
    in_folder = cfg["input_folder"]
    in_suffix = cfg["input_suffix"]
    blastn_db_folder = cfg["blastn_databases_folder"]
    create_result_folder(blastn_db_folder, logger)
    os.chdir(in_folder)
    finished_files = [x.split(".nin")[0] for x in os.listdir(blastn_db_folder)]
    starting_files = sorted([
        x for x in os.listdir(os.getcwd())
        if in_suffix in x and x.split(in_suffix)[0] not in finished_files
    ])
    if cfg["fasta_sanity_check"]:
        for fasta in starting_files:
            is_fasta(fasta)
    logger.info("There are {} input fasta files".format(len(starting_files)))
    if starting_files:
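        # Shared, manager-backed queue and dict for the worker pool.
        # process_future_fasta (defined elsewhere) is assumed to dispatch one
        # worker call per fasta across cpu_number processes and to collect
        # the per-file results in result_dict.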
        manager = Manager()
        fastas = manager.Queue()
        result_dict = manager.dict()
        process_future_fasta(
            primaryScript.run_makeblastdb,
            starting_files,
            result_dict,
            fastas,
            cpu_number,
            logger,
            cfg,
            tqdm_desc="Running blastdb",
        )

    logger.info("run_makeblastdb finished")
def pipeline_STEP6(cfg, logger, cpu_number):
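    """STEP 6: second alignment cleaning; run STEP6.trim_clean on every input
    alignment that is not yet present in the output folder."""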
    s = "Working with STEP 6: SECOND ALIGNMENT CLEANING"
    print(s)
    logger.info(s)
    input_folder = cfg["input_folder"]
    output_folder = cfg["output_folder"]
    in_suffix = cfg["input_suffix"]
    out_suffix = cfg["output_suffix"]
    in_format = cfg["input_format"]
    create_result_folder(output_folder, logger)
    os.chdir(input_folder)
    starting = get_starting_files(input_folder, output_folder, in_format,
                                  out_suffix)
    starting_files = [x for x in starting if in_suffix in x]
    logger.info("There are {} input fasta files".format(len(starting_files)))
    if starting_files:
        manager = Manager()
        fastas = manager.Queue()
        result_dict = manager.dict()
        process_future_fasta(
            STEP6.trim_clean,
            starting_files,
            result_dict,
            fastas,
            cpu_number,
            logger,
            cfg,
            tqdm_desc="Cleaning",
        )
    logger.info("STEP 6 finished")
def pipeline_STEP5(cfg, logger, cpu_number):
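    """STEP 5: inparalog separation; run STEP5.inparalog_separation on every
    alignment containing at least min_taxa_in_alignment taxa."""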
    s = "Working with STEP 5: INPARALOGS SEPARATION"
    print(s)
    logger.info(s)
    input_folder = cfg["input_folder"]
    output_folder = cfg["output_folder"]
    in_suffix = cfg["input_suffix"]
    out_suffix = cfg["output_suffix"]
    in_format = cfg["input_format"]
    min_taxa = cfg["min_taxa_in_alignment"]
    create_result_folder(output_folder, logger)
    os.chdir(input_folder)
    starting = get_starting_files(input_folder, output_folder, in_format,
                                  out_suffix)
    starting_files = [
        x for x in starting if in_suffix in x and find_n_taxa(x, min_taxa)
    ]
    logger.info("There are {} input fasta files".format(len(starting_files)))
    if starting_files:
        manager = Manager()
        fastas = manager.Queue()
        result_dict = manager.dict()
        process_future_fasta(
            STEP5.inparalog_separation,
            starting_files,
            result_dict,
            fastas,
            cpu_number,
            logger,
            cfg,
            tqdm_desc="Inparalogs",
        )
    cleaning_dnadist(logger)
    logger.info("STEP 5 finished")
def pipeline_STEP3(cfg, logger, cpu_number):
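    """STEP 3: alignment cleaning; run STEP3.find_unspliced_segments on every
    alignment that is not yet present in the output folder."""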
    s = "Working with STEP 3: ALIGNMENT CLEANING"
    print(s)
    logger.info(s)
    input_folder = cfg["input_folder"]
    output_folder = cfg["output_folder"]
    in_suffix = cfg["input_suffix"]
    out_suffix = cfg["output_suffix"]
    create_result_folder(output_folder, logger)
    os.chdir(input_folder)
    starting_files = get_starting_files(input_folder, output_folder, in_suffix,
                                        out_suffix)
    logger.debug("There are {} input fasta files".format(len(starting_files)))
    if starting_files:
        manager = Manager()
        fastas = manager.Queue()
        result_dict = manager.dict()
        process_future_fasta(
            STEP3.find_unspliced_segments,
            starting_files,
            result_dict,
            fastas,
            cpu_number,
            logger,
            cfg,
            tqdm_desc="Cleaning",
        )
    logger.info("STEP 3 finished")
def runBLAST(cfg, logger, cpu_number):
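    """Run blastn for every input fasta against the databases built by
    runMakeblastdb, after checking that both the database and the input
    folders exist and are not empty."""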
    logger.info("Running BLAST")
    in_folder = cfg["input_folder"]
    blastn_dbs = cfg["blastn_databases_folder"]
    in_suffix = cfg["input_suffix"]
    out_folder = cfg["blastn_results_folder"]
    create_result_folder(out_folder, logger)
    if not os.path.exists(blastn_dbs) or not os.listdir(blastn_dbs):
        exception = "[error]: either blastdb folder {} is empty or \
it has not been created by running 'run_makeblastdb'".format(blastn_dbs)
        raise Exception(exception)

    if not os.path.exists(in_folder) or not os.listdir(in_folder):
        exception = "[error]: either fasta input folder {} is empty or \
it does not exist".format(in_folder)
        raise Exception(exception)
    os.chdir(in_folder)
    starting_files = sorted(
        [x for x in os.listdir(os.getcwd()) if in_suffix in x])

    if cfg["fasta_sanity_check"]:
        for fasta in starting_files:
            primaryScript.is_fasta(fasta)

    logger.info("There are {} input fasta files".format(len(starting_files)))
    fasta_to_db = primaryScript.fasta_to_blastdb(starting_files, cfg)

    logger.info("Starting BLAST...")
    if fasta_to_db:
        manager = Manager()
        fastas = manager.Queue()
        result_dict = manager.dict()
        process_future_fasta(
            primaryScript.run_blastn,
            fasta_to_db,
            result_dict,
            fastas,
            cpu_number,
            logger,
            cfg,
            tqdm_desc="Running BLAST",
        )

    logger.info("run_BLAST finished")
def PhaseAlleles(cfg, logger, cpu_number, **kargs):
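    """Phase the alleles of every input fasta via phase.phasing_contig,
    using the temporary folder passed as the 'temp' keyword argument."""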
    logger.info("Phasing alleles starting")
    in_folder = cfg["input_folder"]
    in_suffix = cfg["input_suffix"]
    out_suffix = cfg["output_suffix"]
    out_folder = cfg["output_folder"]
    create_result_folder(out_folder, logger)
    logger.debug("Checking the settings")
    temporary_folder = kargs["temp"]
    phase.check_settings(cfg, logger)
    starting = get_starting_files(in_folder, out_folder, in_suffix, out_suffix)

    starting_files = [x for x in starting if in_suffix in x]
    s = "There are {} input fasta files".format(len(starting_files))
    print(s)
    logger.info(s)
    samtools_v = phase.find_samtools_version(cfg)
    print("samtools version is: .X", samtools_v)

    if starting_files:
        manager = Manager()
        fastas = manager.Queue()
        result_dict = manager.dict()
        process_future_fasta(
            phase.phasing_contig,
            starting_files,
            result_dict,
            fastas,
            cpu_number,
            logger,
            cfg,
            temp=temporary_folder,
            samtools_version=samtools_v,
            tqdm_desc="phasing",
        )

    logger.info("Phasing alleles finished")
def pipeline_STEP7(cfg, logger, cpu_number):
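    """STEP 7: build the final sequences by running STEP7.assemble_alleles
    on every remaining input alignment."""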
    s = "Working with STEP 7: Building final sequences"
    print(s)
    logger.info(s)
    input_folder = cfg["input_folder"]
    output_folder = cfg["output_folder"]
    in_suffix = cfg["input_suffix"]
    out_suffix = cfg["output_suffix"]
    in_format = cfg["input_format"]
    create_result_folder(output_folder, logger)
    os.chdir(input_folder)
    starting = get_starting_files(input_folder, output_folder, in_format,
                                  out_suffix)
    starting_files = [x for x in starting if in_suffix in x]
    logger.info("There are {} input fasta files".format(len(starting_files)))
    """
    if starting_files:
        manager = Manager()
        fastas = manager.Queue()
        result_dict = manager.dict()
        process_future_fasta(
            STEP7.assemble_alleles,
            starting_files,
            result_dict,
            fastas,
            cpu_number,
            logger,
            cfg,
            tqdm_desc="STEP 7",
        )
        
    """
    for fasta in starting_files:
        STEP7.assemble_alleles(fasta, cfg)

    cleaning_dnadist(logger)
    logger.info("STEP 7 finished")
def main():
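    """Parse the command line, configure logging, load the GLOBAL settings,
    and dispatch to the primary, secondary, phasing, or pipeline tool."""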
    # ===================================================================================
    #     set logger and parse arguments
    # ===================================================================================
    start_time = time.time()
    parser = create_parser()
    args = parser.parse_args()

    if args.verbose:
        logging_level = logging.DEBUG
    else:
        logging_level = logging.INFO
    LOG_FILENAME = args.log
    logger = logging.getLogger("MyLogger")

    handler = logging.handlers.RotatingFileHandler(LOG_FILENAME,
                                                   maxBytes=2000000,
                                                   backupCount=5)
    logger.addHandler(handler)
    start = "Pipeline started at date & time {0}".format(time.strftime("%c"))
    print(start)
    logger.setLevel(logging_level)
    stderrhandler = logging.StreamHandler(sys.stderr)
    stderrhandler.setFormatter(logging.Formatter("    %(message)s"))
    stderrhandler.setLevel(logging_level)

    available_cpus = cpu_count()
    if args.cpu is None:
        # Assumption: default to all available CPUs when --cpu is not given,
        # so that cpu_number is always defined below.
        cpu_number = available_cpus
    elif args.cpu == 0:
        sys.exit("[Error] Exiting: '--cpu 0' is not a useful parameter")
    elif args.cpu > available_cpus:
        sys.exit("[Error] Exiting: more CPUs requested than the available "
                 "number of {} CPUs".format(available_cpus))
    else:
        cpu_number = args.cpu
    print("cpu_number =", cpu_number)
    print("verbosity =", logging.getLevelName(logging_level))

    # ===================================================================================
    #      get global parameters
    # ===================================================================================

    cfg = config_parser.main(args.ini, "GLOBAL")
    global aligera_folder
    aligera_folder = cfg["GLOBAL"]["aligera_folder"]
    global temporary_folder
    temporary_folder = cfg["GLOBAL"]["temporary_folder"]
    utilities.create_result_folder(temporary_folder, logger)
    global clean_temp_files
    clean_temp_files = cfg["GLOBAL"]["remove_temporary_files"]
    if clean_temp_files:
        logger.info("Deleting files in temporary folder")
        utilities.cleanTemporaryFolder(temporary_folder)
        os.chdir(aligera_folder)
    if args.verbose:
        for (k, v) in cfg["GLOBAL"].items():
            print(k, ":", v)
    logger.debug(cfg["GLOBAL"])

    # ===================================================================================
    #         run primary
    # ===================================================================================

    if args.tool == "primary":
        import aligera_scripts.AligerA_primary as primary

        s = "\nSTARTING PRIMARY TOOL"
        print(s)
        logger.info(s)
        cfg = config_parser.main(args.ini, "PRIMARY")
        if cfg["PRIMARY"]["run_makeblastdb"]:
            s = "\nBuilding BLAST db"
            print(s)
            logger.info(s)
            primary.runMakeblastdb(cfg["PRIMARY"], logger, cpu_number)

        if cfg["PRIMARY"]["run_BLAST"]:
            s = "\nStarting BLAST"
            print(s)
            logger.info(s)
            primary.runBLAST(cfg["PRIMARY"], logger, cpu_number)

        if cfg["PRIMARY"]["build_primary_groups"]:
            s = "\nParsing BLAST xmls"
            print(s)
            logger.info(s)
            primary.parseXMLs(cfg["PRIMARY"], logger, cpu_number,
                              temporary_folder)
            if not cfg["PRIMARY"]["parallel"]:
                s = "\nBuilding primary groups sequentially"
                print(s)
                logger.info(s)
                primary.componentsSequential(cfg["PRIMARY"], logger,
                                             temporary_folder)
            else:
                s = "\nBuilding primary groups in parallel"
                print(s)
                logger.info(s)
                primary.componentsParallel(cfg["PRIMARY"], logger, cpu_number,
                                           temporary_folder)

            s = "\nCreating groups"
            print(s)
            logger.info(s)
            primary.makePrimaryAssortments(cfg["PRIMARY"], logger,
                                           temporary_folder)

        if cfg["PRIMARY"]["build_primary_groups"] and clean_temp_files:
            logger.info("Deleting files in temporary folder")
            utilities.cleanTemporaryFolder(temporary_folder)

    # ===================================================================================
    #         run secondary
    # ===================================================================================

    if args.tool == "secondary":
        import aligera_scripts.AligerA_secondary as secondary

        s = "\nSTARTING SECONDARY TOOL"
        print(s)
        logger.info(s)
        cfg = config_parser.main(args.ini, "SECONDARY")
        secondary.MakeSecondaryAssortments(
            cfg["SECONDARY"],
            logger,
            cpu_number,
            temp=temporary_folder,
            remove_temp=clean_temp_files,
        )
        if (cfg["SECONDARY"]["proceed_with_secondary_search"]
                and clean_temp_files):
            logger.info("Deleting files in temporary folder")
            utilities.cleanTemporaryFolder(temporary_folder)

    # ===================================================================================
    #         run phasing
    # ===================================================================================

    if args.tool == "phasing":
        import aligera_scripts.AligerA_phase as phase

        s = "\nSTARTING PHASING"
        print(s)
        logger.info(s)
        cfg = config_parser.main(args.ini, "PHASING")
        phase.PhaseAlleles(cfg["PHASING"],
                           logger,
                           cpu_number,
                           temp=temporary_folder)

        if clean_temp_files:
            logger.info("Deleting files in temporary folder")
            utilities.cleanTemporaryFolder(temporary_folder)

    # ===================================================================================
    #         run pipeline
    # ===================================================================================

    if args.tool == "pipeline":
        import aligera_scripts.AligerA_pipeline as pipeline

        s = "\nSTARTING PIPELINE"
        print(s)
        logger.info(s)
        cfg = config_parser.main(args.ini, "ALIGERA PIPELINE")

        if cfg["STEP1"]["run_STEP1"]:
            s = "\nSTARTING STEP1"
            print(s)
            logger.info(s)
            pipeline.pipeline_STEP1(cfg["STEP1"], logger, cpu_number)

        if cfg["STEP2"]["run_STEP2"]:
            s = "\nSTARTING STEP2"
            print(s)
            logger.info(s)
            pipeline.pipeline_STEP2(cfg["STEP2"], logger, cpu_number)

        if cfg["STEP3"]["run_STEP3"]:
            s = "\nSTARTING STEP3"
            print(s)
            logger.info(s)
            pipeline.pipeline_STEP3(cfg["STEP3"], logger, cpu_number)

        if cfg["STEP4"]["run_STEP4"]:
            s = "\nSTARTING STEP4"
            print(s)
            logger.info(s)
            pipeline.pipeline_STEP4(cfg["STEP4"], logger, cpu_number)

        if cfg["STEP5"]["run_STEP5"]:
            s = "\nSTARTING STEP5"
            print(s)
            logger.info(s)
            pipeline.pipeline_STEP5(cfg["STEP5"], logger, cpu_number)

        if cfg["STEP6"]["run_STEP6"]:
            s = "\nSTARTING STEP6"
            print(s)
            logger.info(s)
            pipeline.pipeline_STEP6(cfg["STEP6"], logger, cpu_number)

        if cfg["STEP7"]["run_STEP7"]:
            s = "\nSTARTING STEP7"
            print(s)
            logger.info(s)
            pipeline.pipeline_STEP7(cfg["STEP7"], logger, cpu_number)

    secs = time.time() - start_time
    logger.info("Pipeline completed  in {} hours:minutes:seconds".format(
        datetime.timedelta(seconds=int(secs))))

    logger.removeHandler(handler)
    logging.shutdown()
def MakeSecondaryAssortments(cfg, logger, cpu_number, **kargs):
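    """Sort the input fastas: files passing the size/taxa filters are copied
    straight to the 'passed' folder, while the remaining large files are
    handed to secondary.mk_secondary for a secondary search."""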
    logger.info("Retrieving the Secondary sequence assortments")
    in_folder = cfg["input_folder"]
    in_suffix = cfg["input_suffix"]
    max_size = cfg["limit_large_file"]
    filter_taxa = cfg["filtered_taxa_list"]
    filter_taxa_occurences = cfg["filtered_taxon_occurences"]
    filter_taxa_bool = cfg["filtered_taxon_boolean"]
    out_passed = cfg["output_folder_passed"]
    create_result_folder(out_passed, logger)
    out_failed = cfg["output_folder_failed"]
    create_result_folder(out_failed, logger)
    temporary_folder = kargs["temp"]
    remove_temp = kargs["remove_temp"]

    out_files = [
        re.split(r"L[0-9]+", x)[0] for x in os.listdir(out_passed)
    ] + [re.split(r"L[0-9]+", x)[0] for x in os.listdir(out_failed)]
    os.chdir(in_folder)
    starting_files = sorted([
        x for x in os.listdir(os.getcwd())
        if (in_suffix in x and "_core" not in x and "_addit" not in x
            and "_temp_" not in x and os.stat(x).st_size != 0
            and x not in out_files)
    ])

    if cfg["fasta_sanity_check"]:
        logger.debug("Performing sanity check")
        for fasta in starting_files:
            primary.is_fasta(fasta)

    logger.debug("Searching for large fasta files")
    starting_fasta = "fasta files:" + ",".join(starting_files)
    logger.debug(starting_fasta)
    filtering_modes = "filtering_mode_size: {0}, filtering_mode_taxa: {1}".format(
        cfg["filtering_mode_size"], cfg["filtering_mode_taxa"])
    logger.debug(filtering_modes)
    if cfg["filtering_mode_size"] and not cfg["filtering_mode_taxa"]:
        large_fastas = [
            x for x in starting_files
            if not secondary.get_large_files(x, max_size + 1)
        ]
    elif cfg["filtering_mode_taxa"] and not cfg["filtering_mode_size"]:
        large_fastas = [
            x for x in starting_files if not secondary.filter_with_taxa(
                x, filter_taxa, filter_taxa_occurences, filter_taxa_bool)
        ]
    elif cfg["filtering_mode_taxa"] and cfg["filtering_mode_size"]:
        large_fastas = [
            x for x in starting_files
            if (not secondary.filter_with_taxa(
                x, filter_taxa, filter_taxa_occurences, filter_taxa_bool)
                and not secondary.get_large_files(x, max_size + 1))
        ]
    else:
        large_fastas = []

    small_fastas = list(set(starting_files).difference(set(large_fastas)))
    for x in small_fastas:
        try:
            shutil.copy(x, out_passed)
        except OSError:
            print("[Error]: unable to copy file {0} to folder {1}".format(
                x, out_passed))

    logger.info(
        "There are {} small fasta files that have been validated".format(
            len(small_fastas)))
    logger.info(
        "There are {} large fasta files that need to be processed".format(
            len(large_fastas)))

    if cfg["proceed_with_secondary_search"] and large_fastas:
        manager = Manager()
        fastas = manager.Queue()
        result_dict = manager.dict()
        process_future_fasta(
            secondary.mk_secondary,
            large_fastas,
            result_dict,
            fastas,
            cpu_number,
            logger,
            cfg,
            temp=temporary_folder,
            tqdm_desc="2nd search",
            remove_temp=remove_temp,
        )

    logger.debug("Done MakeSecondaryAssortments")
def pipeline_STEP1(cfg, logger, cpu_number):
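    """STEP 1: align every input fasta with MAFFT, using run_MAFFT_small for
    files with at most MAFFT_upper_limit_addfragments sequences and
    run_MAFFT_large for bigger ones."""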
    s = "Working with STEP 1: running MAFFT"
    print(s)
    logger.info(s)
    input_folder = cfg["input_folder"]
    output_folder = cfg["output_folder"]
    out_suffix = cfg["output_suffix"]
    create_result_folder(output_folder, logger)
    os.chdir(input_folder)
    unwanted_files = [
        x for x in os.listdir(os.getcwd()) if "_temp_aligned.fasta" in x
    ]

    # Remove leftover files from a previously aborted run.
    for f in unwanted_files:
        os.remove(f)
    finished_files = [
        x.split(out_suffix)[0] for x in os.listdir(output_folder)
    ]
    starting_files = sorted([
        x for x in os.listdir(os.getcwd()) if cfg["input_suffix"] in x
        and find_n_seqs(x, 1, cfg["upper_sequence_limit"]) and "_core" not in x
        and "_addit" not in x and x.split(out_suffix)[0] not in finished_files
    ])

    for fasta in starting_files:
        is_fasta(fasta)

    # Sequence-count threshold separating files sent to run_MAFFT_small
    # from those sent to run_MAFFT_large.
    size_threshold = cfg["MAFFT_upper_limit_addfragments"]
    small_fastas = [
        x for x in starting_files if find_n_seqs(x, 1, size_threshold + 1)
    ]
    logger.debug("there are {} small_fastas".format(len(small_fastas)))

    large_fastas = [
        x for x in starting_files if not find_n_seqs(x, 1, size_threshold + 1)
    ]
    logger.debug("there are {} large_fastas".format(len(large_fastas)))

    if small_fastas:
        manager = Manager()
        fastas = manager.Queue()
        result_dict = manager.dict()
        logger.info("Running Mafft on small fasta")
        process_future_fasta(
            STEP1.run_MAFFT_small,
            small_fastas,
            result_dict,
            fastas,
            cpu_number,
            logger,
            cfg,
            tqdm_desc="Mafft on small files",
        )
    if large_fastas:
        manager = Manager()
        fastas = manager.Queue()
        result_dict = manager.dict()
        logger.info("Running Mafft on large fasta")
        process_future_fasta(
            STEP1.run_MAFFT_large,
            large_fastas,
            result_dict,
            fastas,
            cpu_number,
            logger,
            cfg,
            tqdm_desc="Mafft on large files",
        )
    logger.info("STEP 1 finished")