Example #1
0
def main(args):
    """Write an "evaluete" YAML configuration per assembly and submit it.

    For every sample directory found in ``args.assembly_dir`` a directory
    with the same name is created in the current working directory and the
    sample's ``<sample>_assemble.yaml`` is read back.  Then, for every
    assembler sub-directory of the sample, a job directory is created, a
    ``<sample>_evaluete.yaml`` file is written into it and ``submit_job``
    is called from inside that directory.  The working directory is
    restored to its starting value after each sample.

    Attributes read from ``args``: ``assembly_dir``, ``lineage`` and
    ``threads``.  ``args`` is also forwarded as-is to ``submit_job``.
    """
    projectFolder = os.getcwd()
    # The loops below change the working directory, so a relative assembly
    # directory must be anchored before the first chdir.
    assemblies_dir = os.path.abspath(args.assembly_dir)
    lineage = args.lineage
    pipeline = "evaluete"
    busco_lineages = ('eukaryota', 'bacteria', 'vertebrata', 'fungi',
                      'metazoa', 'plant_early_release', 'arthropoda')

    sample_dir_names = [
        name for name in os.listdir(assemblies_dir)
        if os.path.isdir(os.path.join(assemblies_dir, name))]
    for sample_dir_name in sample_dir_names:
        # this folder stores all the assemblies of the sample
        assemblies_folder = os.path.join(assemblies_dir, sample_dir_name)
        # this folder is where the validation is computed
        validation_folder = os.path.join(projectFolder, sample_dir_name)
        if not os.path.exists(validation_folder):
            os.makedirs(validation_folder)
        os.chdir(validation_folder)

        # Restore the information stored in the sample YAML file that lives
        # in the assembly folder.
        sample_config_assembly_file = os.path.join(
            assemblies_folder, "{}_assemble.yaml".format(sample_dir_name))
        with open(sample_config_assembly_file) as sample_config_handle:
            # safe_load: plain yaml.load() without a Loader is rejected by
            # recent PyYAML releases and can build arbitrary objects.
            sample_config_assembly = yaml.safe_load(sample_config_handle)

        # One validation job per assembler folder; the sample sheet is not
        # consulted (the assembly file name is assumed from "output").
        assemblers = [
            name for name in os.listdir(assemblies_folder)
            if os.path.isdir(os.path.join(assemblies_folder, name))]
        for assembler in assemblers:
            # relative to validation_folder, which is the current directory
            if not os.path.exists(assembler):
                os.makedirs(assembler)
            os.chdir(assembler)

            output = sample_config_assembly["output"]
            assembly_name = os.path.join(
                assemblies_folder, assembler, "{}.scf.fasta".format(output))
            sample_YAML_name = "{}_{}.yaml".format(sample_dir_name, pipeline)

            lines = ["pipeline:\n", " {}\n".format(pipeline), "tools:\n"]
            if lineage == 'none':
                lines.append(" [align, qaTools, FRC]\n")
            elif lineage in busco_lineages:
                lines.append(" [align, qaTools, FRC, BUSCO]\n")
                lines.append(
                    "BUSCODataPath: /sw/apps/bioinfo/BUSCO/lineage_sets/{}\n"
                    .format(lineage))
            # NOTE(review): any other lineage value leaves "tools:" without
            # a value -- confirm the argument parser restricts the choices.
            lines.extend([
                "output: {}\n".format(output),
                "projectName: {}_validate\n".format(output),
                "kmer: \n",
                "threads: {}\n".format(args.threads),
                "genomeSize: {}\n".format(
                    sample_config_assembly["genomeSize"]),
                "minCtgLength: 1000\n",
                "reference: {}\n".format(assembly_name),
                "libraries:\n",
            ])
            for library, libraryData in \
                    sample_config_assembly["libraries"].items():
                lines.append(" {}:\n".format(library))
                for key in ("pair1", "pair2", "orientation", "insert", "std"):
                    lines.append("  {}: {}\n".format(key, libraryData[key]))

            # The context manager guarantees the file is flushed and closed
            # before the job that reads it is submitted.
            with open(sample_YAML_name, 'w') as sample_YAML:
                sample_YAML.writelines(lines)

            # Run the job
            extramodules = ["module load samtools\nmodule load bwa\nmodule load BUSCO/1.22\nsource $BUSCO_SETUP\n"]
            jobname = "{}_{}_{}".format(sample_dir_name, pipeline, assembler)
            submit_job(sample_YAML_name, jobname, os.getcwd(), args,
                       extramodules)

            os.chdir(validation_folder)
        os.chdir(projectFolder)
Example #2
0
def main(args):
    """Write a "QCcontrol" YAML configuration per sample and submit it.

    For every sample directory found in ``args.sample_data_dir`` a directory
    with the same name is created in the current working directory.  The
    ``.gz`` files located in the sample's flowcell directories (directly
    under the sample, or one level down inside single-capital-letter
    library-prep directories) are grouped into read pairs, a
    ``<sample>_QCcontrol.yaml`` file describing them is written and
    ``submit_job`` is called from inside the sample directory.

    Attributes read from ``args``: ``sample_data_dir``, ``reference``,
    ``threads``, ``adapter``, ``orientation``, ``insert`` and ``std``.
    ``args`` is also forwarded as-is to ``submit_job``.

    The process exits through ``sys.exit`` when a file does not follow one
    of the supported read-pair naming schemes or when its mate is missing.
    """
    projectFolder = os.getcwd()
    # The loop changes the working directory before the sample content is
    # listed, so a relative data directory must be anchored first.
    samples_data_dir = os.path.abspath(args.sample_data_dir)
    projectName = os.path.basename(os.path.normpath(samples_data_dir))
    pipeline = "QCcontrol"

    # helper variables for collecting flowcells
    fc_pat, prep_pat = (r'^\d{6}_.*_?.*$', r'^[A-Z]$')
    # (read 1 tag, read 2 tag) naming schemes, checked in this order
    read_tags = (("_1.fastq.gz", "_2.fastq.gz"),
                 ("R1_001.fastq.gz", "R2_001.fastq.gz"))

    def _get_expected_dir(path, pat):
        """Return the sub-directories of *path* whose name matches *pat*."""
        return [os.path.join(path, d) for d in os.listdir(path)
                if re.match(pat, d) and os.path.isdir(os.path.join(path, d))]

    def _split_pair(fastq):
        """Return ``(pair1, pair2)`` for *fastq*, or None if unrecognised."""
        for tag1, tag2 in read_tags:
            if tag1 in fastq:
                return fastq, fastq.replace(tag1, tag2)
            if tag2 in fastq:
                return fastq.replace(tag2, tag1), fastq
        return None

    sample_dir_names = [
        name for name in os.listdir(samples_data_dir)
        if os.path.isdir(os.path.join(samples_data_dir, name))]
    for sample_dir_name in sample_dir_names:
        sample_folder = os.path.join(projectFolder, sample_dir_name)
        if not os.path.exists(sample_folder):
            os.makedirs(sample_folder)
        os.chdir(sample_folder)

        # "align" is only run when a reference is available
        tools = ["trimmomatic", "fastqc", "abyss"]
        if args.reference is not None:
            tools.append("align")

        sample_YAML_name = os.path.join(
            sample_folder, "{}_{}.yaml".format(sample_dir_name, pipeline))
        lines = [
            "pipeline:\n",
            " {}\n".format(pipeline),
            "tools:\n",
            " {}\n".format(tools),
            ##TODO: output must become sampleName
            "output: {}\n".format(sample_dir_name),
            "projectName: {}\n".format(projectName),
            "kmer: 35\n",
            "threads: {}\n".format(args.threads),
            "genomeSize: \n",
            "adapters: {}\n".format(args.adapter),
        ]
        if args.reference is not None:
            lines.append("reference: {}\n".format(args.reference))
        lines.append("libraries:\n")

        sample_data_dir = os.path.join(samples_data_dir, sample_dir_name)
        # flowcells directly under the sample ...
        flowcells_dirs = _get_expected_dir(sample_data_dir, fc_pat)
        # ... and flowcells inside library-prep directories (IRMA layout)
        for prep_dir in _get_expected_dir(sample_data_dir, prep_pat):
            flowcells_dirs.extend(_get_expected_dir(prep_dir, fc_pat))

        # all the files sequenced for this sample
        sample_files = []
        for flowcell in flowcells_dirs:
            sample_files.extend(
                os.path.join(flowcell, f) for f in os.listdir(flowcell)
                if os.path.isfile(
                    os.path.realpath(os.path.join(flowcell, f)))
                and re.search(r'\.gz$', f))

        # Consume the list pair by pair; a while loop is used because both
        # mates are removed from the list on every round.
        library = 1
        while sample_files:
            fastq = sample_files[0]
            pair = _split_pair(fastq)
            if pair is None:
                sys.exit("file {} does not respect naming convention. "
                         "Exit!".format(fastq))
            pair1_file, pair2_file = pair
            for mate in pair:
                if mate not in sample_files:
                    sys.exit("file {} is missing its mate {}. "
                             "Exit!".format(fastq, mate))
            lines.extend([
                " lib{}:\n".format(library),
                "  pair1: {}\n".format(pair1_file),
                "  pair2: {}\n".format(pair2_file),
                "  orientation: {}\n".format(args.orientation),
                "  insert: {}\n".format(args.insert),
                "  std: {}\n".format(args.std),
            ])
            sample_files.remove(pair1_file)
            sample_files.remove(pair2_file)
            library += 1

        # The context manager guarantees the file is flushed and closed
        # before the job that reads it is submitted.
        with open(sample_YAML_name, 'w') as sample_YAML:
            sample_YAML.writelines(lines)

        # Run the job
        extramodules = []
        if "abyss" in tools:
            extramodules.append("module load abyss/1.3.5\n")
        if "align" in tools:
            extramodules.append("module load samtools\nmodule load bwa\n")
        jobname = "{}_{}".format(sample_dir_name, pipeline)
        submit_job(sample_YAML_name, jobname, os.getcwd(), args, extramodules)
        os.chdir(projectFolder)
Example #3
0
def main(args):
    """Write an "assemble" YAML configuration per sample and submit it.

    For every sample directory found in ``args.sample_data_dir`` a directory
    with the same name is created in the current working directory, a
    ``<sample>_assemble.yaml`` file listing the sample's read pairs is
    written and ``submit_job`` is called from inside the sample directory.

    When ``args.afterqc`` is set, the project name is taken from the
    sample's ``<sample>_QCcontrol.yaml`` and the reads are taken from
    ``<sample>/results/fastq_trimmed``; otherwise the project name is the
    base name of the data directory and the reads are every ``.gz`` file
    found in the sample's sub-directories (flowcells).

    Attributes read from ``args``: ``sample_data_dir``, ``afterqc``,
    ``assemblers``, ``kmer``, ``threads``, ``genomesize``,
    ``keep_tmp_files``, ``orientation``, ``insert`` and ``std``.  ``args``
    is also forwarded as-is to ``submit_job``.

    The process exits through ``sys.exit`` when the QC YAML file is
    missing, when a file does not follow one of the supported read-pair
    naming schemes, or when its mate is missing.  An assembler without an
    entry in the module table raises ``KeyError``.
    """
    projectFolder = os.getcwd()
    # The loop changes the working directory before the sample content is
    # read, so a relative data directory must be anchored first.
    samples_data_dir = os.path.abspath(args.sample_data_dir)
    #UPPMAX assumption
    projectName = os.path.basename(os.path.normpath(samples_data_dir))
    pipeline = "assemble"
    # Beware whoever inputs unicode characters
    tools = [str(tool) for tool in args.assemblers]
    # NOTE(review): the masurca entry names two modules ("masurca" and
    # "MaSuRCA/2.3.2") -- confirm that both are meant to be loaded.
    all_modules = {
        "abyss": "module load abyss/1.3.5\n",
        "soapdenovo": "module load soapdenovo/2.04-r240\n",
        "spades": "module load spades/3.6.0\n",
        "cabog": "module load cabog/8.1\n",
        "allpaths": "module unload gcc\nmodule load allpathslg/52485\n",
        "masurca": "module load masurca MaSuRCA/2.3.2\n"
    }
    # (read 1 tag, read 2 tag) naming schemes, checked in this order
    read_tags = (("_1.fastq.gz", "_2.fastq.gz"),
                 ("R1_001.fastq.gz", "R2_001.fastq.gz"))

    def _split_pair(fastq):
        """Return ``(pair1, pair2)`` for *fastq*, or None if unrecognised."""
        for tag1, tag2 in read_tags:
            if tag1 in fastq:
                return fastq, fastq.replace(tag1, tag2)
            if tag2 in fastq:
                return fastq.replace(tag2, tag1), fastq
        return None

    sample_dir_names = [
        name for name in os.listdir(samples_data_dir)
        if os.path.isdir(os.path.join(samples_data_dir, name))]
    for sample_dir_name in sample_dir_names:
        sample_folder = os.path.join(projectFolder, sample_dir_name)
        if not os.path.exists(sample_folder):
            os.makedirs(sample_folder)
        os.chdir(sample_folder)
        sample_data_dir = os.path.join(samples_data_dir, sample_dir_name)

        # After QC the project name is retrieved from the QC YAML file.
        if args.afterqc:
            QC_YAML_file = os.path.join(
                sample_data_dir, "{}_QCcontrol.yaml".format(sample_dir_name))
            if not os.path.exists(QC_YAML_file):
                sys.exit("Error file {} must exists!".format(QC_YAML_file))
            with open(QC_YAML_file) as QC_YAML_file_handle:
                # safe_load: plain yaml.load() without a Loader is rejected
                # by recent PyYAML releases and can build arbitrary objects.
                QC_sample_config = yaml.safe_load(QC_YAML_file_handle)
            # TODO: I need to use the sample sheet that must be present in
            # the QC folder to extract the project name
            projectName = QC_sample_config["projectName"]

        sample_YAML_name = os.path.join(
            sample_folder, "{}_{}.yaml".format(sample_dir_name, pipeline))
        lines = [
            "pipeline:\n",
            " {}\n".format(pipeline),
            "tools:\n",
            " {}\n".format(tools),
            "output: {}\n".format(sample_dir_name),
            "projectName: {}\n".format(projectName),
            "kmer: {}\n".format(args.kmer),
            "threads: {}\n".format(args.threads),
            "genomeSize: {}\n".format(args.genomesize),
        ]
        if args.keep_tmp_files:  #TODO: generalize if we add more flags
            lines.append("flags: ['keep_tmp_files']\n")

        # Collect the reads; the location depends on afterqc.
        sample_files = []
        if args.afterqc:
            fastq_files = os.path.join(sample_data_dir, "results",
                                       "fastq_trimmed")
            sample_files = [
                os.path.join(fastq_files, f) for f in os.listdir(fastq_files)
                if os.path.isfile(os.path.join(fastq_files, f))
                and re.search(r'[12]\.fastq\.gz$', f)]
        else:
            # every sub-directory of the sample is treated as a flowcell
            flowcells_dirs = [
                os.path.join(sample_data_dir, flowcell)
                for flowcell in os.listdir(sample_data_dir)
                if os.path.isdir(os.path.join(sample_data_dir, flowcell))]
            for flowcell in flowcells_dirs:
                sample_files.extend(
                    os.path.join(flowcell, f) for f in os.listdir(flowcell)
                    if os.path.isfile(os.path.join(flowcell, f))
                    and re.search(r'\.gz$', f))

        # Consume the list pair by pair.  A while loop is required: both
        # mates are removed on every round, and removing items from a list
        # that a for loop is iterating makes it skip entries.
        lines.append("libraries:\n")
        library = 1
        while sample_files:
            fastq = sample_files[0]
            pair = _split_pair(fastq)
            if pair is None:
                sys.exit("file {} does not respect naming convention. "
                         "Exit!".format(fastq))
            pair1_file, pair2_file = pair
            for mate in pair:
                if mate not in sample_files:
                    sys.exit("file {} is missing its mate {}. "
                             "Exit!".format(fastq, mate))
            lines.extend([
                " lib{}:\n".format(library),
                "  pair1: {}\n".format(pair1_file),
                "  pair2: {}\n".format(pair2_file),
                "  orientation: {}\n".format(args.orientation),
                "  insert: {}\n".format(args.insert),
                "  std: {}\n".format(args.std),
            ])
            sample_files.remove(pair1_file)
            sample_files.remove(pair2_file)
            library += 1

        # The context manager guarantees the file is flushed and closed
        # before the job that reads it is submitted.
        with open(sample_YAML_name, 'w') as sample_YAML:
            sample_YAML.writelines(lines)

        #Run the job
        extramodules = [all_modules[tool] for tool in tools]
        jobname = "{}_{}".format(sample_dir_name, pipeline)
        submit_job(sample_YAML_name, jobname, os.getcwd(), args, extramodules)
        os.chdir(projectFolder)
Example #4
0
def main(args):
    """Write an "assemble" YAML configuration per sample and submit it.

    For every sample directory found in ``args.sample_data_dir`` a directory
    with the same name is created in the current working directory, a
    ``<sample>_assemble.yaml`` file listing the sample's read pairs is
    written and ``submit_job`` is called from inside the sample directory.

    When ``args.afterqc`` is set, the project name is taken from the
    sample's ``<sample>_QCcontrol.yaml`` and the reads are taken from
    ``<sample>/Trimmomatic``; otherwise the project name is the base name
    of the data directory and the reads are every ``.gz`` file found in the
    sample's sub-directories (flowcells).

    Attributes read from ``args``: ``sample_data_dir``, ``afterqc``,
    ``assemblers``, ``kmer``, ``threads``, ``genomesize``,
    ``keep_tmp_files``, ``orientation``, ``insert`` and ``std``.  ``args``
    is also forwarded as-is to ``submit_job``.

    The process exits through ``sys.exit`` when the QC YAML file is
    missing, when a file does not follow one of the supported read-pair
    naming schemes, or when its mate is missing.  An assembler without an
    entry in the module table raises ``KeyError``.
    """
    projectFolder = os.getcwd()
    # The loop changes the working directory before the sample content is
    # read, so a relative data directory must be anchored first.
    samples_data_dir = os.path.abspath(args.sample_data_dir)
    #UPPMAX assumption
    projectName = os.path.basename(os.path.normpath(samples_data_dir))
    pipeline = "assemble"
    # Beware whoever inputs unicode characters
    tools = [str(tool) for tool in args.assemblers]
    all_modules = {
        "abyss": "module load abyss/1.3.5\n",
        "soapdenovo": "module load soapdenovo/2.04-r240\n",
        "spades": "module load spades/3.6.0\n",
        "cabog": "module load cabog/8.1\n",
        "allpaths": "module unload gcc\nmodule load allpathslg/52485\n",
        "masurca": "module load MaSuRCA/2.3.2\n"
    }
    # (read 1 tag, read 2 tag) naming schemes, checked in this order
    read_tags = (("_1.fastq.gz", "_2.fastq.gz"),
                 ("R1_001.fastq.gz", "R2_001.fastq.gz"))

    def _split_pair(fastq):
        """Return ``(pair1, pair2)`` for *fastq*, or None if unrecognised."""
        for tag1, tag2 in read_tags:
            if tag1 in fastq:
                return fastq, fastq.replace(tag1, tag2)
            if tag2 in fastq:
                return fastq.replace(tag2, tag1), fastq
        return None

    sample_dir_names = [
        name for name in os.listdir(samples_data_dir)
        if os.path.isdir(os.path.join(samples_data_dir, name))]
    for sample_dir_name in sample_dir_names:
        sample_folder = os.path.join(projectFolder, sample_dir_name)
        if not os.path.exists(sample_folder):
            os.makedirs(sample_folder)
        os.chdir(sample_folder)
        sample_data_dir = os.path.join(samples_data_dir, sample_dir_name)

        # After QC the project name is retrieved from the QC YAML file.
        if args.afterqc:
            QC_YAML_file = os.path.join(
                sample_data_dir, "{}_QCcontrol.yaml".format(sample_dir_name))
            if not os.path.exists(QC_YAML_file):
                sys.exit("Error file {} must exists!".format(QC_YAML_file))
            with open(QC_YAML_file) as QC_YAML_file_handle:
                # safe_load: plain yaml.load() without a Loader is rejected
                # by recent PyYAML releases and can build arbitrary objects.
                QC_sample_config = yaml.safe_load(QC_YAML_file_handle)
            # TODO: I need to use the sample sheet that must be present in
            # the QC folder to extract the project name
            projectName = QC_sample_config["projectName"]

        sample_YAML_name = os.path.join(
            sample_folder, "{}_{}.yaml".format(sample_dir_name, pipeline))
        lines = [
            "pipeline:\n",
            " {}\n".format(pipeline),
            "tools:\n",
            " {}\n".format(tools),
            "output: {}\n".format(sample_dir_name),
            "projectName: {}\n".format(projectName),
            "kmer: {}\n".format(args.kmer),
            "threads: {}\n".format(args.threads),
            "genomeSize: {}\n".format(args.genomesize),
        ]
        if args.keep_tmp_files:  #TODO: generalize if we add more flags
            lines.append("flags: ['keep_tmp_files']\n")

        # Collect the reads; the location depends on afterqc.
        sample_files = []
        if args.afterqc:
            fastq_files = os.path.join(sample_data_dir, "Trimmomatic")
            sample_files = [
                os.path.join(fastq_files, f) for f in os.listdir(fastq_files)
                if os.path.isfile(os.path.join(fastq_files, f))
                and re.search(r'[12]\.fastq\.gz$', f)]
        else:
            # every sub-directory of the sample is treated as a flowcell
            flowcells_dirs = [
                os.path.join(sample_data_dir, flowcell)
                for flowcell in os.listdir(sample_data_dir)
                if os.path.isdir(os.path.join(sample_data_dir, flowcell))]
            for flowcell in flowcells_dirs:
                sample_files.extend(
                    os.path.join(flowcell, f) for f in os.listdir(flowcell)
                    if os.path.isfile(os.path.join(flowcell, f))
                    and re.search(r'\.gz$', f))

        # Consume the list pair by pair.  A while loop is required: both
        # mates are removed on every round, and removing items from a list
        # that a for loop is iterating makes it skip entries.
        lines.append("libraries:\n")
        library = 1
        while sample_files:
            fastq = sample_files[0]
            pair = _split_pair(fastq)
            if pair is None:
                sys.exit("file {} does not respect naming convention. "
                         "Exit!".format(fastq))
            pair1_file, pair2_file = pair
            for mate in pair:
                if mate not in sample_files:
                    sys.exit("file {} is missing its mate {}. "
                             "Exit!".format(fastq, mate))
            lines.extend([
                " lib{}:\n".format(library),
                "  pair1: {}\n".format(pair1_file),
                "  pair2: {}\n".format(pair2_file),
                "  orientation: {}\n".format(args.orientation),
                "  insert: {}\n".format(args.insert),
                "  std: {}\n".format(args.std),
            ])
            sample_files.remove(pair1_file)
            sample_files.remove(pair2_file)
            library += 1

        # The context manager guarantees the file is flushed and closed
        # before the job that reads it is submitted.
        with open(sample_YAML_name, 'w') as sample_YAML:
            sample_YAML.writelines(lines)

        #Run the job
        extramodules = [all_modules[tool] for tool in tools]
        jobname = "{}_{}".format(sample_dir_name, pipeline)
        submit_job(sample_YAML_name, jobname, os.getcwd(), args, extramodules)
        os.chdir(projectFolder)
Example #5
0
def main(args):
    """Write a "QCcontrol" YAML configuration per sample and submit it.

    For every sample directory found in ``args.sample_data_dir`` a directory
    with the same name is created in the current working directory.  The
    ``.gz`` files located in the sample's flowcell directories (directly
    under the sample, or one level down inside single-capital-letter
    library-prep directories) are grouped into read pairs, a
    ``<sample>_QCcontrol.yaml`` file describing them is written and
    ``submit_job`` is called from inside the sample directory.

    Attributes read from ``args``: ``sample_data_dir``, ``reference``,
    ``threads``, ``adapter``, ``orientation``, ``insert`` and ``std``.
    ``args`` is also forwarded as-is to ``submit_job``.

    The process exits through ``sys.exit`` when a file does not follow one
    of the supported read-pair naming schemes or when its mate is missing.
    """
    projectFolder = os.getcwd()
    # The loop changes the working directory before the sample content is
    # listed, so a relative data directory must be anchored first.
    samples_data_dir = os.path.abspath(args.sample_data_dir)
    projectName = os.path.basename(os.path.normpath(samples_data_dir))
    pipeline = "QCcontrol"

    # helper variables for collecting flowcells
    fc_pat, prep_pat = (r'^\d{6}_.*_?.*$', r'^[A-Z]$')
    # (read 1 tag, read 2 tag) naming schemes, checked in this order
    read_tags = (("_1.fastq.gz", "_2.fastq.gz"),
                 ("R1_001.fastq.gz", "R2_001.fastq.gz"))

    def _get_expected_dir(path, pat):
        """Return the sub-directories of *path* whose name matches *pat*."""
        return [os.path.join(path, d) for d in os.listdir(path)
                if re.match(pat, d) and os.path.isdir(os.path.join(path, d))]

    def _split_pair(fastq):
        """Return ``(pair1, pair2)`` for *fastq*, or None if unrecognised."""
        for tag1, tag2 in read_tags:
            if tag1 in fastq:
                return fastq, fastq.replace(tag1, tag2)
            if tag2 in fastq:
                return fastq.replace(tag2, tag1), fastq
        return None

    sample_dir_names = [
        name for name in os.listdir(samples_data_dir)
        if os.path.isdir(os.path.join(samples_data_dir, name))]
    for sample_dir_name in sample_dir_names:
        sample_folder = os.path.join(projectFolder, sample_dir_name)
        if not os.path.exists(sample_folder):
            os.makedirs(sample_folder)
        os.chdir(sample_folder)

        # "align" is only run when a reference is available
        tools = ["trimmomatic", "fastqc", "abyss"]
        if args.reference is not None:
            tools.append("align")

        sample_YAML_name = os.path.join(
            sample_folder, "{}_{}.yaml".format(sample_dir_name, pipeline))
        lines = [
            "pipeline:\n",
            " {}\n".format(pipeline),
            "tools:\n",
            " {}\n".format(tools),
            ##TODO: output must become sampleName
            "output: {}\n".format(sample_dir_name),
            "projectName: {}\n".format(projectName),
            "kmer: 35\n",
            "threads: {}\n".format(args.threads),
            "genomeSize: \n",
            "adapters: {}\n".format(args.adapter),
        ]
        if args.reference is not None:
            lines.append("reference: {}\n".format(args.reference))
        lines.append("libraries:\n")

        sample_data_dir = os.path.join(samples_data_dir, sample_dir_name)
        # flowcells directly under the sample ...
        flowcells_dirs = _get_expected_dir(sample_data_dir, fc_pat)
        # ... and flowcells inside library-prep directories (IRMA layout)
        for prep_dir in _get_expected_dir(sample_data_dir, prep_pat):
            flowcells_dirs.extend(_get_expected_dir(prep_dir, fc_pat))

        # all the files sequenced for this sample
        sample_files = []
        for flowcell in flowcells_dirs:
            sample_files.extend(
                os.path.join(flowcell, f) for f in os.listdir(flowcell)
                if os.path.isfile(
                    os.path.realpath(os.path.join(flowcell, f)))
                and re.search(r'\.gz$', f))

        # Consume the list pair by pair; a while loop is used because both
        # mates are removed from the list on every round.
        library = 1
        while sample_files:
            fastq = sample_files[0]
            pair = _split_pair(fastq)
            if pair is None:
                sys.exit("file {} does not respect naming convention. "
                         "Exit!".format(fastq))
            pair1_file, pair2_file = pair
            for mate in pair:
                if mate not in sample_files:
                    sys.exit("file {} is missing its mate {}. "
                             "Exit!".format(fastq, mate))
            lines.extend([
                " lib{}:\n".format(library),
                "  pair1: {}\n".format(pair1_file),
                "  pair2: {}\n".format(pair2_file),
                "  orientation: {}\n".format(args.orientation),
                "  insert: {}\n".format(args.insert),
                "  std: {}\n".format(args.std),
            ])
            sample_files.remove(pair1_file)
            sample_files.remove(pair2_file)
            library += 1

        # The context manager guarantees the file is flushed and closed
        # before the job that reads it is submitted.
        with open(sample_YAML_name, 'w') as sample_YAML:
            sample_YAML.writelines(lines)

        # Run the job
        extramodules = []
        if "abyss" in tools:
            extramodules.append("module load abyss/1.3.5\n")
        if "align" in tools:
            extramodules.append("module load samtools\nmodule load bwa\n")
        jobname = "{}_{}".format(sample_dir_name, pipeline)
        submit_job(sample_YAML_name, jobname, os.getcwd(), args, extramodules)
        os.chdir(projectFolder)