def assemble_pear(self, input_f, input_r, outdir, name, processes, pearthreads, extraargstring):
        """Uses PEAR to assemble paired F/R read files in run_parallel.

        :param input_f: File path to forward Fastq Reads file or folder.
        :param input_r: File path to reverse Fastq Reads file or folder.
        :param outdir: File path to the output directory.
        :param name: File prefix for the assembled reads.
        :param processes: The maximum number of processes to use.
        :param extraargstring: Advanced program parameter string.
        :param pearthreads: The number of threads per process to use.
        """
        # "~/programs/pear-0.9.4-bin-64/pear-0.9.4-64 -f %s -r %s -o %s -j %s -m %d"
        inputs = validate_paired_fastq_reads(input_f, input_r)
        pool = init_pool(min(len(inputs), processes))
        printVerbose("\tAssembling reads with pear")
        debugPrintInputInfo(inputs, "assemble")
        run_parallel([ProgramRunner(ProgramRunnerCommands.ASSEMBLE_PEAR,
                                      [forwards, reverse, "%s/%s_%s" % ( outdir, name, getFileName(forwards)),
                                        pearthreads],
                                      {"exists": [forwards, reverse], "positive": [pearthreads]},
                                      extraargstring)
                        for forwards, reverse in inputs], pool)

        printVerbose("Done assembling sequences...")
        # Grab all the auxiliary files (everything not containing ".assembled.")
        aux_files = getInputFiles(outdir, "*", "*.assembled.*", ignore_empty_files=False)
        # make aux dir for extraneous files and move them there
        bulk_move_to_dir(aux_files, makeAuxDir(outdir))
        cleanup_pool(pool)
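Each forward/reverse pair gets its own PEAR output prefix of the form outdir/name_<forward file name>. A minimal, stdlib-only sketch of that naming step (get_file_name is a stand-in for getFileName, assumed to behave like basename-without-extension; paths are made up):

import os

def get_file_name(path):
    # assumed behaviour of getFileName: basename with the extension stripped
    return os.path.splitext(os.path.basename(path))[0]

outdir, name = "assembled_out", "run1"
forwards = "data/sampleA_R1.fastq"
prefix = "%s/%s_%s" % (outdir, name, get_file_name(forwards))
print(prefix)  # assembled_out/run1_sampleA_R1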
Example #2
    def demux_fastx(self, input_f, barcodes, outdir, processes, extraargstring):
        """Demuxes using FAST X BARCODE SPLITTER.

        :param input_f: File path to input file or folder of input files.
        :param barcodes: File path to input barcodes file.
        :param outdir: Filepath to output directory.
        :param processes: Number of processes to use to demux input fileset.
        :param extraargstring: Advanced program parameter string.
        """
        # Get input files
        files_to_split = getInputFiles(input_f)
        # Assign the files shard numbers
        file_id = range(len(files_to_split))
        file_id_pairs = zip(files_to_split, file_id)
        debugPrintInputInfo(files_to_split, "demux")
        pool = init_pool(min(len(file_id_pairs), processes))

        printVerbose("Demuxing sequences...")
        run_parallel([ProgramRunner(ProgramRunnerCommands.DEMUX_FASTX,
                                    [input_, barcodes, "%s/" % outdir, "_%d_demux.fastq" % id_],
                                    {"exists": [input_, barcodes]}, extraargstring)
                      for input_, id_ in file_id_pairs], pool)
        printVerbose("Demuxed sequences.")

        # Grab all the auxiliary files
        aux_files = getInputFiles(outdir, "unmatched_*", ignore_empty_files=False)
        # make aux dir for extraneous files and move them there
        bulk_move_to_dir(aux_files, makeAuxDir(outdir))
        cleanup_pool(pool)
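The shard bookkeeping above is just a zip of the input list with its own index; each file's id becomes part of the "_%d_demux.fastq" suffix handed to the barcode splitter. A stand-alone sketch of that pairing (file names are hypothetical):

files_to_split = ["reads_part0.fastq", "reads_part1.fastq", "reads_part2.fastq"]
file_id_pairs = list(zip(files_to_split, range(len(files_to_split))))

for input_, id_ in file_id_pairs:
    # each shard keeps a stable numeric id that ends up in its demuxed output suffix
    print("%s -> _%d_demux.fastq" % (input_, id_))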
    def demux_by_name(self, input_f, barcodes, outdir, filetype, processes, extraargstring):
        """Demuxes using SeqIO.

        :param input_f: File path to input file or folder of input files.
        :param barcodes: File path to input barcodes file.
        :param outdir: Filepath to output directory.
        :param filetype: Either 'fasta' or 'fastq'.
        :param processes: Number of processes to use to demux input fileset.
        :param extraargstring: Advanced program parameter string.
        """
        aux_dir = makeAuxDir(outdir)
        # Get input files
        files_to_split = getInputFiles(input_f)
        # Assign the files shard numbers
        file_id = range(len(files_to_split))
        file_id_pairs = zip(files_to_split, file_id)
        debugPrintInputInfo(files_to_split, "demux")
        pool = init_pool(min(len(file_id_pairs), processes))

        printVerbose("Demuxing sequences...")
        run_parallel([PythonRunner(split_on_name,
                                   [input_, barcodes, outdir, id_, filetype], {"exists": [input_]})
                        for input_, id_ in file_id_pairs], pool)


        # Grab all the auxiliary files
        aux_files = getInputFiles(outdir, "unmatched_*", ignore_empty_files=False)
        # make aux dir for extraneous files and move them there
        bulk_move_to_dir(aux_files, aux_dir)
        cleanup_pool(pool)

    def rename_chewbacca(self, input_f, outdir, filetype, clip, processes):
        """Renames sequences in a fasta/fastq file as <filename>_ID0, <filename>_ID1, <filename>_ID2, etc., where
            <filename> is the name of the fasta/fastq file without any extensions or chewbacca suffixes.

        :param input_f: Filepath to an input file or folder to rename.
        :param outdir: Filepath to the output directory.
        :param filetype: Either 'fasta' or 'fastq'.
        :param clip: If True, remove dereplication counts from sequence names before renaming.
        :param processes: The maximum number of processes to use.
        """

        # Gather input files
        inputs = getInputFiles(input_f)
        debugPrintInputInfo(inputs, "rename")
        pool = init_pool(min(len(inputs), processes))
        printVerbose("Renaming sequences...")
        # Run serialRename in parallel
        run_parallel([PythonRunner(serialRename,
                                   [input_,
                                    "%s/%s_renamed%s" % (outdir, strip_ixes(input_), os.path.splitext(input_)[1]),
                                    filetype, clip], {"exists": [input_]})
                      for input_ in inputs], pool)
        printVerbose("Done renaming sequences...")

        samples_dir = makeDirOrdie("%s_samples" % outdir)
        samples_files = getInputFiles(outdir, "*.samples", ignore_empty_files=False)
        bulk_move_to_dir(samples_files, samples_dir)

        aux_dir = makeAuxDir(outdir)
        aux_files = getInputFiles(outdir, "*.mapping", ignore_empty_files=False)
        bulk_move_to_dir(aux_files, aux_dir)

        cleanup_pool(pool)
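The renamed output keeps the original extension (<outdir>/<stripped name>_renamed<ext>), and the sequences inside are renamed <filename>_ID0, <filename>_ID1, and so on. A rough, self-contained sketch of both naming rules (strip_ixes_stub is a stand-in for the real strip_ixes helper; paths are made up):

import os

def strip_ixes_stub(path):
    # stand-in for strip_ixes: drop the extension and a trailing chewbacca suffix, if present
    base = os.path.splitext(os.path.basename(path))[0]
    for suffix in ("_demux", "_debarcoded", "_renamed"):
        if base.endswith(suffix):
            base = base[:-len(suffix)]
    return base

outdir = "renamed_out"
input_ = "demuxed/sampleA_demux.fastq"
out_file = "%s/%s_renamed%s" % (outdir, strip_ixes_stub(input_), os.path.splitext(input_)[1])
print(out_file)  # renamed_out/sampleA_renamed.fastq

# sequence ids become <filename>_ID0, <filename>_ID1, ...
print(["%s_ID%d" % (strip_ixes_stub(input_), i) for i in range(3)])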
Example #5
    def preclean_bayeshammer(self, input_f, input_r, outdir, processes,
                             bayesthreads, extraargstring):
        """Assembles reads from two (left and right) fastq files/directories.

        :param input_f: File path to file or folder of left reads to clean.
        :param input_r: File path to file or folder of right reads to clean.
        :param outdir: Filepath to output directory.
        :param bayesthreads: The number of threads per process to use.
        :param processes: The maximum number of processes to use.
        :param extraargstring: Advanced program parameter string.
        """
        # Collect input files, and validate that they match
        inputs = validate_paired_fastq_reads(input_f, input_r)
        pool = init_pool(min(len(inputs), processes))
        printVerbose("\tPrecleaning %s reads with Spades-Baye's Hammer..." %
                     len(inputs))
        debugPrintInputInfo(inputs, "preclean/fix.")

        run_parallel([
            ProgramRunner(ProgramRunnerCommands.PRECLEAN_SPADES,
                          [forwards, reverse, outdir, bayesthreads], {
                              "exists": [forwards, reverse],
                              "positive": [bayesthreads]
                          }, extraargstring) for forwards, reverse in inputs
        ], pool)
        printVerbose("Done cleaning reads.")

        # Grab all the auxiliary files (everything not containing ".assembled.")
        # aux_files = getInputFiles(outdir, "*", "*.assembled.*", ignore_empty_files=False)
        # make aux dir for extraneous files and move them there
        # bulk_move_to_dir(aux_files, makeAuxDir(outdir))

        # Select output files
        aux_files = getInputFiles(outdir, "*", ignore_empty_files=False)
        corrected_dir = "%s/corrected" % outdir
        bulk_move_to_dir(getInputFiles(corrected_dir, "*"), outdir)
        aux_files += getInputFiles(outdir,
                                   "*unpaired*",
                                   ignore_empty_files=False)
        aux_files += getInputFiles(outdir, "configs", ignore_empty_files=False)

        # Gather aux files
        aux_dir = makeAuxDir(outdir)
        bulk_move_to_dir(aux_files, aux_dir)

        # Rename output files
        output_files = getInputFiles(outdir, "*", "corrected_*")
        for out_file in output_files:
            move(out_file,
                 "%s/%s_corrected.fastq" % (outdir, strip_ixes(out_file)))

        # Move the last-minute log file to the aux dir (it may not exist)
        try:
            move("%s/corrected_corrected.fastq" % outdir,
                 "%s/corrected_corrected.fastq" % aux_dir)
        except Exception:
            pass
        cleanup_pool(pool)
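The aux-file cleanup in this step (and in assemble_pear above) relies on getInputFiles taking an include glob and an optional exclude glob, e.g. everything not containing ".assembled.". A rough fnmatch sketch of that selection with made-up directory contents; the real helper also walks directories and can skip empty files:

import fnmatch

def select(names, include, exclude=None):
    # rough stand-in for getInputFiles(dir, include_pattern, exclude_pattern)
    keep = [n for n in names if fnmatch.fnmatch(n, include)]
    if exclude is not None:
        keep = [n for n in keep if not fnmatch.fnmatch(n, exclude)]
    return keep

names = ["sampleA.assembled.fastq", "sampleA.unassembled.forward.fastq",
         "sampleA_unpaired.fastq", "spades.log"]
print(select(names, "*", "*.assembled.*"))  # everything except the assembled reads
print(select(names, "*unpaired*"))          # just the unpaired reads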
Example #6
    def query_fasta_vsearch(self, input_f, referencefasta, taxinfo, outdir, processes, simmilarity, coverage,
                            extraargstring):
        """Compare reference sequences to the fasta-formatted query sequences, using global pairwise alignment.

        :param input_f:  Filepath to a file or folder of files to identify.
        :param outdir: Filepath to the output directory.
        :param referencefasta: Filepath to a file or folder of files to use as a reference.
        :param taxinfo:  Filepath to a file containing taxonomic info correlated with the referencefasta.
        :param simmilarity: The % similarity between a query and reference sequence required for positive
                                identification.
        :param coverage: The % coverage of matching regions between a query and reference sequence required for positive
                            identification.
        :param processes: The number of processes to use in the identification process.
        :param extraargstring: Advanced program parameter string.
        """
        # vsearch --usearch_global %s seeds.pick.fasta  --db ../data/BiocodePASSED_SAP.txt --id 0.9 \
        #       --userfields query+target+id+alnlen+qcov --userout %sout --alnout %s alnout.txt

        # expecting a fasta to annotate
        query_fastas = getInputFiles(input_f)
        debugPrintInputInfo(query_fastas, "queried for identification.")
        ref_fastas = getInputFiles(referencefasta)
        debugPrintInputInfo(ref_fastas, "referenced for sequence identification.")
        tax_info_files = getInputFiles(taxinfo)
        debugPrintInputInfo(tax_info_files, "referenced for taxonomic names.")

        # make sure the number of reference fasta files is the same as the number of tax_info files
        if len(tax_info_files) != len(ref_fastas):
            print "Error: The number of reference fastas and taxonomic mapping files is not the same.  There must be \
                    one taxonomic mapping file for each reference fasta."
            return
        ref_data_pairs = zip(ref_fastas, tax_info_files)
        inputs = [x for x in product(query_fastas, ref_fastas)]
        aln_user_string = ""
        pool = init_pool(min(len(inputs), processes))

        # VSEARCH ALIGNMENT
        query_vsearch(inputs, outdir, simmilarity, processes, aln_user_string, extraargstring, pool)

        printVerbose("Parsing output...")
        # Parse the alignment results and put those that pass the criterion (97 similarity, 85 coverage) in
        # parsed_BIOCODE.out.  Parameters can be changed and this command can be rerun as many times as necessary
        #
        # parseVSearchOutputAgainstFasta(vsearch_outfile, taxInfo, output_file, min_simmilarity, min_coverage):
        inputs = [x for x in product(query_fastas, ref_data_pairs)]
        debugPrintInputInfo(inputs, "queryied against paired refereces.")
        run_parallel([PythonRunner(parseVSearchOutputAgainstFasta,
                                   ["%s/%s.out" % (outdir, strip_ixes(query)), tax_info,
                                    "%s/%s.tax" % (outdir, strip_ixes(query)), simmilarity, coverage],
                                   {"exists": [query, ref_fasta, tax_info]})
                      for query, (ref_fasta, tax_info) in inputs], pool)
        printVerbose("\nDone parsing...")

        # Gather and move auxiliary files
        aux_files = getInputFiles(outdir, "*", "*.tax", ignore_empty_files=False)
        bulk_move_to_dir(aux_files, makeAuxDir(outdir))

        cleanup_pool(pool)
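Every query fasta is aligned against every reference fasta, and then parsed against every (reference, taxonomy) pair, so both work lists are Cartesian products built with itertools.product. A stand-alone sketch of that pairing (file names are hypothetical):

from itertools import product

query_fastas = ["otus_A.fasta", "otus_B.fasta"]
ref_fastas = ["BiocodePASSED_SAP.fasta"]
tax_info_files = ["BiocodePASSED_SAP.tax"]

ref_data_pairs = list(zip(ref_fastas, tax_info_files))
alignment_jobs = [x for x in product(query_fastas, ref_fastas)]
parse_jobs = [x for x in product(query_fastas, ref_data_pairs)]

print(alignment_jobs)  # one (query, reference) tuple per alignment
for query, (ref_fasta, tax_info) in parse_jobs:
    print("%s vs %s (taxonomy from %s)" % (query, ref_fasta, tax_info))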
Example #7
    def query_fasta_db_vsearch(self, input_f, outdir, ref_fasta, ref_db,
                               simmilarity, coverage, processes,
                               extraargstring):
        """Compare reference sequences to the fasta-formatted query sequences, using global pairwise alignment.

        :param input_f:  Filepath to a file or folder of files to identify.
        :param outdir: Filepath to the output directory.
        :param ref_fasta: Filepath to the curated fasta file to use as a reference.
        :param ref_db: Filepath to the curated reference database used to annotate hits (e.g. an NCBI-derived db).
        :param simmilarity:"Minimum % simmilarity (decimal between 0 and 1) between query and reference sequences
                            required for positive identification.
        :param coverage:Minimum % coverage (decimal between 0 and 1) required query and reference sequences required
                            for positive identification.
        :param processes: The number of processes to use in the identification process.
        :param extraargstring: Advanced program parameter string.
        """
        # blast6 output format http://www.drive5.com/usearch/manual/blast6out.html
        aln_user_string = "--userfields query+target+id+alnlen+qcov"
        # coi_fasta = os.path.expanduser("~/ARMS/refs/COI.fasta")
        # ncbi_db_string = os.path.expanduser("~/ARMS/refs/ncbi.db")
        coi_fasta = ref_fasta
        ncbi_db_string = ref_db

        query_fastas = getInputFiles(input_f)
        debugPrintInputInfo(query_fastas, "queried against the DB.")
        inputs = [x for x in product(query_fastas, [coi_fasta])]
        pool = init_pool(min(len(query_fastas), processes))

        # VSEARCH ALIGNMENT
        query_vsearch(inputs, outdir, simmilarity, processes, aln_user_string,
                      extraargstring, pool)

        printVerbose("Parsing output...")
        # Parse the alignment results and put those that pass the criterion (97 similarity, 85 coverage) in
        # parsed_BIOCODE.out.  Parameters can be changed and this command can be rerun as many times as necessary
        #
        # parseVSearchOutputAgainstNCBI(vsearch_out, ncbi_db, min_coverage, min_similarity)> parsed_nt.out
        run_parallel([
            PythonRunner(parseVSearchOutputAgainstNCBI, [
                "%s/%s.out" % (outdir, strip_ixes(query)), ncbi_db_string,
                "%s/%s.tax" %
                (outdir, strip_ixes(query)), simmilarity, coverage
            ], {"exits": [query, ncbi_db_string]}) for query in query_fastas
        ], pool)
        printVerbose("Done processing.")

        # Gather and move auxiliary files
        aux_files = getInputFiles(outdir,
                                  "*",
                                  "*.tax",
                                  ignore_empty_files=False)
        bulk_move_to_dir(aux_files, makeAuxDir(outdir))

        cleanup_pool(pool)
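With --userfields query+target+id+alnlen+qcov, each line of the vsearch --userout file carries those five values tab-separated; the parse step keeps hits that clear the similarity and coverage thresholds (the comment above uses 97 and 85). A hedged, self-contained sketch of that filter; the sample line is made up, and the thresholds are written as percentages here; adjust if your parser works with 0-1 fractions:

def passes(userout_line, min_similarity=97.0, min_coverage=85.0):
    # fields follow the --userfields order: query, target, id, alnlen, qcov
    query, target, ident, alnlen, qcov = userout_line.rstrip("\n").split("\t")
    return float(ident) >= min_similarity and float(qcov) >= min_coverage

sample = "OTU_12\tKX123456.1\t98.6\t655\t91.2"
print(passes(sample))  # True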
Example #8
    def clean_trim_adapters_flexbar(self, input_f, adapters, adaptersrc,
                                    outdir, allowedns, processes,
                                    extraargstring):
        """Use flexbar to trim adapters and barcodes from sequences.  By default, Flexbar does not allow any 'N' \
            characters in SEQUENCE, and will toss any sequences that do contain 'N'.  To avoid this, use the -u or \
            --allowedns flags to specify the maximum number of 'N's to allow.

        :param input_f: Filepath to input file or folder.
        :param adapters: Filepath to a list of adapters.
        :param adaptersrc: Filepath to a list of reverse-complemented adapters.
        :param outdir: Filepath to the output directory.
        :param allowedns: Non-negative integer value indicating the maximum number of 'N's to tolerate in a sequence.
        :param processes: The maximum number of processes to use.
        :param extraargstring: Advanced program parameter string.
        """
        inputs = getInputFiles(input_f)
        pool = init_pool(min(len(inputs), processes))
        debugPrintInputInfo(inputs, "trim adapters from")
        # "flexbar":  "flexbar -r \"%s\" -t \"%s\" -ae \"%s\" -a \"%s\"",
        printVerbose("Trimming barcodes and adapters with flexbar")
        temp_file_name_template = "%s/temp_%s"
        debarcoded_file_name_template = "%s/%s_debarcoded"
        # Trim adapters from the left
        run_parallel([
            ProgramRunner(ProgramRunnerCommands.TRIM_FLEXBAR, [
                input_file, temp_file_name_template %
                (outdir, strip_ixes(input_file)), "LEFT", adapters, allowedns
            ], {"exists": [input_file, adapters]}, extraargstring)
            for input_file in inputs
        ], pool)

        temp_files = getInputFiles(outdir, "temp_*")
        debugPrintInputInfo(temp_files, "trim adapters from")

        # Trim the reverse complemented adapters from the right
        run_parallel([
            ProgramRunner(ProgramRunnerCommands.TRIM_FLEXBAR, [
                input_file, debarcoded_file_name_template %
                (outdir, strip_ixes(input_file)[5:]), "RIGHT", adaptersrc,
                allowedns
            ], {"exists": [input_file, adaptersrc]}, extraargstring)
            for input_file in temp_files
        ], pool)
        printVerbose("Done Trimming sequences.")

        # Move temp files
        aux_files = getInputFiles(outdir, "temp_*", ignore_empty_files=False)
        bulk_move_to_dir(aux_files, makeAuxDir(outdir))
        cleanup_pool(pool)
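Flexbar runs twice: a LEFT pass writes temp_<name> outputs, then a RIGHT pass reads those temp files and names its outputs by dropping the temp_ prefix (the [5:] slice, since len("temp_") == 5) and appending _debarcoded. A small sketch of that naming round-trip (strip_ixes_stub stands in for strip_ixes; paths are made up):

outdir = "trimmed_out"
temp_file_name_template = "%s/temp_%s"
debarcoded_file_name_template = "%s/%s_debarcoded"

def strip_ixes_stub(path):
    # stand-in for strip_ixes: here just the bare sample name
    return path.rsplit("/", 1)[-1].split(".")[0]

input_file = "raw/sampleA.fastq"
left_out = temp_file_name_template % (outdir, strip_ixes_stub(input_file))
print(left_out)   # trimmed_out/temp_sampleA

# second pass: the stripped temp name starts with "temp_", so [5:] removes it
right_out = debarcoded_file_name_template % (outdir, strip_ixes_stub(left_out)[5:])
print(right_out)  # trimmed_out/sampleA_debarcoded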
Example #9
    def align_clean_macse(self, input_f, ref, samplesdir, outdir, processes, extraargstring=""):
        """Removes non-nucleotide characters in MACSE aligned sequences for all fasta files in the samples directory
            (the samplesDir argument).

        :param input_f: File path to file or folder of files to clean.
        :param samplesdir: Filepath to the original, unaligned input files (the inputs to the macse aligner).
        :param ref: Filepath to the reference file used to align the input files.
        :param outdir: Filepath to the directory to write outputs to.
        :param processes: The maximum number of processes to use.
        :param extraargstring: Advanced program parameter string.
        """
        # "macse_format":     "java -jar " + programPaths["MACSE"] + "  -prog exportAlignment -align \"%s\" \
        #                           -charForRemainingFS - -gc_def 5 -out_AA \"%s\" -out_NT \"%s\" -statFile \"%s\""

        inputs = getInputFiles(input_f)
        pool = init_pool(min(len(inputs), processes))
        printVerbose("\t %s Processing MACSE alignments")
        samples_list = getInputFiles(samplesdir)
        run_parallel([ProgramRunner(ProgramRunnerCommands.MACSE_FORMAT,
                                    ["%s/%s_NT" % (input_f, getFileName(sample)),
                                     "%s/%s_AA_macse.fasta" % (outdir, getFileName(sample)),
                                     "%s/%s_NT_macse.fasta" % (outdir, getFileName(sample)),
                                     "%s/%s_macse.csv" % (outdir, getFileName(sample))],

                                    {"exists": ["%s/%s_NT" % (input_f, getFileName(sample))]}, extraargstring)
                      for sample in samples_list], pool)
        printVerbose("\tCleaning MACSE alignments")

        printVerbose("Processing %s samples..." % len(samples_list))
        nt_macse_outs = ["%s/%s_NT_macse.fasta" % (outdir, strip_ixes(sample)) for sample in samples_list]

        # Clean the alignments
        from classes.PythonRunner import PythonRunner
        run_parallel([PythonRunner(remove_refs_from_macse_out, [input_, ref,
                                   "%s/%s" % (outdir, "%s_cleaned.fasta" % strip_ixes(input_))],
                                   {"exists": [input_, ref]})
                      for input_ in nt_macse_outs], pool)

        # Cat the cleaned alignments
        cleaned_alignments = getInputFiles(outdir, "*_cleaned.fasta")
        merge_files(cleaned_alignments, "%s/MACSE_OUT_MERGED.fasta" % outdir)

        aux_dir = makeAuxDir(outdir)
        aux_files = getInputFiles(outdir, "*", "MACSE_OUT_MERGED.fasta", ignore_empty_files=False)
        bulk_move_to_dir(aux_files, aux_dir)
        cleanup_pool(pool)
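After cleaning, the per-sample *_cleaned.fasta alignments are concatenated into a single MACSE_OUT_MERGED.fasta; merge_files is assumed to be a plain in-order concatenation. A self-contained sketch using a temporary directory and made-up records:

import glob, os, tempfile

def merge_files_stub(inputs, output_path):
    # assumed behaviour of merge_files: concatenate the inputs in order
    with open(output_path, "w") as out:
        for path in inputs:
            with open(path) as handle:
                out.write(handle.read())

workdir = tempfile.mkdtemp()
for name, record in [("sampleA_cleaned.fasta", ">s1\nACGT\n"), ("sampleB_cleaned.fasta", ">s2\nGGCC\n")]:
    with open(os.path.join(workdir, name), "w") as handle:
        handle.write(record)

cleaned = sorted(glob.glob(os.path.join(workdir, "*_cleaned.fasta")))
merge_files_stub(cleaned, os.path.join(workdir, "MACSE_OUT_MERGED.fasta"))
with open(os.path.join(workdir, "MACSE_OUT_MERGED.fasta")) as merged:
    print(merged.read())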
    def clean_trim_adapters_flexbar(self, input_f, adapters, adaptersrc, outdir, allowedns, processes, extraargstring):
        """Use flexbar to trim adapters and barcodes from sequences.  By default, Flexbar does not allow any 'N' \
            characters in SEQUENCE, and will toss any sequences that do contain 'N'.  To avoid this, use the -u or \
            --allowedns flags to specify the maximum number of 'N's to allow.

        :param input_f: Filepath to input file or folder.
        :param adapters: Filepath to a list of adapters.
        :param adaptersrc: Filepath to a list of reverse-complemented adapters.
        :param outdir: Filepath to the output directory.
        :param allowedns: Non-negative integer value indicating the maximum number of 'N's to tolerate in a sequence.
        :param processes: The maximum number of processes to use.
        :param extraargstring: Advanced program parameter string.
        """
        inputs = getInputFiles(input_f)
        pool = init_pool(min(len(inputs), processes))
        debugPrintInputInfo(inputs, "trim adapters from")
        # "flexbar":  "flexbar -r \"%s\" -t \"%s\" -ae \"%s\" -a \"%s\"",
        printVerbose("Trimming barcodes and adapters with flexbar")
        temp_file_name_template = "%s/temp_%s"
        debarcoded_file_name_template = "%s/%s_debarcoded"
        # Trim adapters from the left
        run_parallel([ProgramRunner(ProgramRunnerCommands.TRIM_FLEXBAR,
                                    [input_file, temp_file_name_template % (outdir, strip_ixes(input_file)),
                                     "LEFT", adapters, allowedns],
                                    {"exists": [input_file, adapters]}, extraargstring)
                      for input_file in inputs], pool)

        temp_files = getInputFiles(outdir, "temp_*")
        debugPrintInputInfo(temp_files, "trim adapters from")

        # Trim the reverse complemented adapters from the right
        run_parallel([ProgramRunner(ProgramRunnerCommands.TRIM_FLEXBAR,
                                    [input_file, debarcoded_file_name_template % (outdir, strip_ixes(input_file)[5:]),
                                     "RIGHT", adaptersrc, allowedns],
                                    {"exists": [input_file, adaptersrc]}, extraargstring)
                      for input_file in temp_files], pool)
        printVerbose("Done Trimming sequences.")

        # Move temp files
        aux_files = getInputFiles(outdir, "temp_*", ignore_empty_files=False)
        bulk_move_to_dir(aux_files, makeAuxDir(outdir))
        cleanup_pool(pool)
Example #11
    def rename_chewbacca(self, input_f, outdir, filetype, clip, processes):
        """Renames sequences in a fasta/fastq file as <filename>_ID0, <filename>_ID1, <filename>_ID2, etc., where
            <filename> is the name of the fasta/fastq file without any extensions or chewbacca suffixes.

        :param input_f: Filepath to an input file or folder to rename.
        :param outdir: Filepath to the output directory.
        :param filetype: Either 'fasta' or 'fastq'.
        :param clip: If True, remove dereplication counts from sequence names before renaming.
        :param processes: The maximum number of processes to use.
        """

        # Gather input files
        inputs = getInputFiles(input_f)
        debugPrintInputInfo(inputs, "rename")
        pool = init_pool(min(len(inputs), processes))
        printVerbose("Renaming sequences...")
        # Run serialRename in parallel
        run_parallel([
            PythonRunner(serialRename, [
                input_,
                "%s/%s_renamed%s" %
                (outdir, strip_ixes(input_), os.path.splitext(input_)[1]),
                filetype, clip
            ], {"exists": [input_]}) for input_ in inputs
        ], pool)
        printVerbose("Done renaming sequences...")

        samples_dir = makeDirOrdie("%s_samples" % outdir)
        samples_files = getInputFiles(outdir,
                                      "*.samples",
                                      ignore_empty_files=False)
        bulk_move_to_dir(samples_files, samples_dir)

        aux_dir = makeAuxDir(outdir)
        aux_files = getInputFiles(outdir,
                                  "*.mapping",
                                  ignore_empty_files=False)
        bulk_move_to_dir(aux_files, aux_dir)

        cleanup_pool(pool)
Example #12
    def assemble_pear(self, input_f, input_r, outdir, name, processes,
                      pearthreads, extraargstring):
        """Uses PEAR to assemble paired F/R read files in run_parallel.

        :param input_f: File path to forward Fastq Reads file or folder.
        :param input_r: File path to reverse Fastq Reads file or folder.
        :param outdir: File path to the output directory.
        :param name: File prefix for the assembled reads.
        :param processes: The maximum number of processes to use.
        :param extraargstring: Advanced program parameter string.
        :param pearthreads: The number of threads per process to use.
        """
        # "~/programs/pear-0.9.4-bin-64/pear-0.9.4-64 -f %s -r %s -o %s -j %s -m %d"
        inputs = validate_paired_fastq_reads(input_f, input_r)
        pool = init_pool(min(len(inputs), processes))
        printVerbose("\tAssembling reads with pear")
        debugPrintInputInfo(inputs, "assemble")
        run_parallel([
            ProgramRunner(ProgramRunnerCommands.ASSEMBLE_PEAR, [
                forwards, reverse,
                "%s/%s_%s" % (outdir, name, getFileName(forwards)), pearthreads
            ], {
                "exists": [forwards, reverse],
                "positive": [pearthreads]
            }, extraargstring) for forwards, reverse in inputs
        ], pool)

        printVerbose("Done assembling sequences...")
        # Grab all the auxiliary files (everything not containing ".assembled.")
        aux_files = getInputFiles(outdir,
                                  "*",
                                  "*.assembled.*",
                                  ignore_empty_files=False)
        # make aux dir for extraneous files and move them there
        bulk_move_to_dir(aux_files, makeAuxDir(outdir))
        cleanup_pool(pool)
Example #13
    def cluster_crop(self, input_f, outdir, groupsfile, processes, blocksize,
                     clustpct, maxmcmc, maxsm, rare, blockcount,
                     extraargstring):
        """Clusters sequences using CROP.

        :param input_f: Filepath to the input fasta file to cluster.
        :param outdir: Filepath to the output directory.
        :param groupsfile: Filepath to the groups file to use as a reference for dereplication counting.
        :param blocksize: Size of blocks to be used for all rounds (if -b is specified, then -z will not affect the
                            first round).  For data sets with different average sequence lengths, this parameter should \
                            be tuned such that it won't take too long for each block to do pairwise alignment.  Hint \
                            for choosing z: z*L<150,000, where L is the average length of the sequences.
        :param clustpct: The minimum similarity threshold for clustering.  Either 'g' for 95% or 's' for 97%.
        :param maxmcmc: This parameter specifies the number of iterations of MCMC. Default value is 2000. Increase \
                            this value to enhance accuracy (recommended value is at least 10*block size).
        :param maxsm: This parameter specifies the maximum number of 'split and merge' process to run.  Max is 20.
        :param rare: The maximum cluster size allowed to be classified as 'rare'. Clusters are defined as either \
                            'abundant' or 'rare'. 'Abundant' clusters will be clustered first, then the 'rare' \
                            clusters are mapped to the 'abundant' clusters.  Finally, 'rare' clusters which cannot be \
                            mapped will be clustered separately. e.g. If r=5, the clusters with size <=5 will be \
                            considered 'rare' in the above procedure, and r=0 will yield the best accuracy. If you \
                            believe your data is not too diverse to be handled, then r=0 will be the best choice.
        :param blockcount: The size of blocks in the first round of clustering. Hint of choosing -b: Each block in the \
                            first round should contain about 50 sequences.  i.e. b=N/50, where N is the number of \
                            input sequences.  Default: # input sequences / z.
        :param processes: The maximum number of processes to use.
        :param extraargstring: Advanced program parameter string.
        """

        # Grab the fasta file(s) to cluster
        inputs = getInputFiles(input_f)
        debugPrintInputInfo(inputs, "clustered")
        pool = init_pool(min(len(inputs), processes))

        # RUN CLUSTERING
        # crop -i %s -o %s -z %s -c %s -e %s -m %s%s
        run_parallel([
            ProgramRunner(ProgramRunnerCommands.CLUSTER_CROP, [
                input_,
                "%s/%s" % (outdir, strip_ixes(input_)), blocksize, clustpct,
                maxmcmc, maxsm, rare, blockcount
            ], {"exists": [input_]}, extraargstring) for input_ in inputs
        ], pool)

        # CLEAN THE OUTPUT GROUPS FILE
        printVerbose("Parsing the groups file from clustering")
        clustered_groups_files = getInputFiles(outdir, "*.cluster.list")
        debugPrintInputInfo(clustered_groups_files,
                            "converted to groups files")
        run_parallel([
            PythonRunner(parseCROPoutToGroups, [
                input_,
                "%s/%s_uncount.groups" % (outdir, strip_ixes(input_))
            ], {"exists": [input_]}) for input_ in clustered_groups_files
        ], pool)
        printVerbose("Done parsing groups file.")

        # Collect the groups file from clustering with counts removed
        cleaned_clustered_groups_files = getInputFiles(
            outdir, "*_uncount.groups", ignore_empty_files=False)

        # Resolve the user specified names file if necessary
        final_groups_files = handle_groups_file_update(
            outdir, groupsfile, cleaned_clustered_groups_files)

        # GATHER AUX FILES
        input_dir = getDirName(input_f)
        aux_files = cleaned_clustered_groups_files
        aux_files += getInputFiles(input_dir,
                                   "*.unique",
                                   ignore_empty_files=False)
        aux_files += getInputFiles(input_dir,
                                   "*.unique.list",
                                   ignore_empty_files=False)
        aux_files += getInputFiles(input_dir,
                                   "*.unique.TempCenters.Rare",
                                   ignore_empty_files=False)
        aux_files += getInputFiles(outdir,
                                   "*.cluster",
                                   ignore_empty_files=False)
        aux_files += getInputFiles(outdir,
                                   "*.cluster.list",
                                   ignore_empty_files=False)
        aux_files += getInputFiles(outdir, "*.log", ignore_empty_files=False)
        aux_files += getInputFiles(".",
                                   "LikelihoodRatio.txt",
                                   ignore_empty_files=False)

        # Move the final groups file(s) to the groups dir
        groups_dir = makeDirOrdie("%s_groups_files" % outdir)
        bulk_move_to_dir(final_groups_files, groups_dir)

        # Move aux files to the aux dir
        aux_dir = makeAuxDir(outdir)
        bulk_move_to_dir(aux_files, aux_dir)

        # Cleanup the pool
        cleanup_pool(pool)
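The -z and -b hints in the docstring are simple arithmetic: choose z so that z*L stays under 150,000 (L = average sequence length) and b near N/50 (N = number of input sequences). A tiny sketch of those rules of thumb; the helper name and numbers are illustrative:

def crop_block_hints(n_sequences, avg_seq_len):
    # -z hint: z*L < 150,000  ->  z < 150000 / L
    blocksize = max(1, int(150000 // avg_seq_len))
    # -b hint: roughly 50 sequences per block in the first round
    blockcount = max(1, n_sequences // 50)
    return blocksize, blockcount

print(crop_block_hints(n_sequences=10000, avg_seq_len=313))  # (479, 200)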
Example #14
    def dereplicate_vsearch(self, input_f, outdir, groupsfile, processes,
                            stripcounts, extraargstring):
        """Dereplicates with vsearch.

        :param input_f: Filepath to the file or folder of files to dereplicate.
        :param outdir: Filepath to the output directory.
        :param groupsfile: A groups file to use as a reference for replicant counting.  If no groups file is
                            provided, input sequences are considered singletons (regardless of their name-annotated
                            dereplication count).
        :param processes: The number of processes to use to dereplicate the fileset.
        :param stripcounts: If True, strips the trailing dereplication counts from a file before dereplication.
        :param extraargstring: Advanced program parameter string.
        """
        inputs = getInputFiles(input_f)
        pool = init_pool(min(len(inputs), processes))
        # REMOVES COUNTS FROM SEQUENCE NAMES IN ORDER TO CLUSTER PROPERLY
        # strip counts if we need to.
        if stripcounts:
            printVerbose("Removing counts from sequence names...")
            debugPrintInputInfo(inputs, "renamed")
            run_parallel([
                PythonRunner(removeCountsFromFastFile, [
                    input_,
                    "%s/%s_uncount.fasta" %
                    (outdir, strip_ixes(input_)), 'fasta'
                ], {"exists": input_}) for input_ in inputs
            ], pool)
            printVerbose("Done removing counts.")

            # Grab the cleaned files as input for the next step
            inputs = getInputFiles(outdir, "*_uncount.fasta")

        # DEREPLICATE
        debugPrintInputInfo(inputs, "dereplicated")
        printVerbose("Dereplicating...")
        run_parallel([
            ProgramRunner(ProgramRunnerCommands.DEREP_VSEARCH, [
                processes, input_,
                "%s/%s_derep.fasta" % (outdir, strip_ixes(input_)),
                "%s/%s_uc.out" % (outdir, strip_ixes(input_))
            ], {
                "exists": [input_],
                "positive": [processes]
            }, extraargstring) for input_ in inputs
        ], pool)
        printVerbose("Done dereplicating")

        # LOG DEREPLICATED SEQUENCES INTO A .GROUPS FILE
        # generates a .groups file named _uc_parsed.out
        # python parseUCtoGroups.py uc.out uc_parsed.out
        input_ucs = getInputFiles(outdir, "*_uc.out")
        printVerbose("Generating a groups file from dereplication.")
        debugPrintInputInfo(inputs, "parsed (into a .groups file)")
        run_parallel([
            PythonRunner(
                parseUCtoGroups,
                [input_,
                 "%s/%s_derep.groups" %
                 (outdir, strip_ixes(input_))], {"exists": [input_]})
            for input_ in input_ucs
        ], pool)

        most_recent_groups_files = getInputFiles(outdir,
                                                 "*_derep.groups",
                                                 ignore_empty_files=False)

        # UPDATE THE MOST CURRENT GROUPS FILES WITH DEREPLICATION COUNTS
        if groupsfile is not None:
            # Grab the oldgroups file and the dereplicated groups file
            old_groups_files = getInputFiles(groupsfile)
            derep_groups_files = getInputFiles(outdir, "*_derep.groups")

            printVerbose("Updating .groups files with dereplicated data")
            printVerbose("%d Reference (old) groups files to be read:" %
                         len(old_groups_files))
            printVerbose(str(old_groups_files))
            printVerbose("%d Dereplicated (new) groups files to be read:" %
                         len(derep_groups_files))
            printVerbose(str(derep_groups_files))

            update_groups(old_groups_files, derep_groups_files, outdir,
                          "dereplicated")
            most_recent_groups_files = getInputFiles(outdir,
                                                     "dereplicated*",
                                                     ignore_empty_files=False)
            printVerbose("Done updating .groups files.")

        if len(inputs) != len(most_recent_groups_files):
            print(
                "Error: Number of input fastas (%d) is not equal to the number ofgroups files (%d)."
                % (len(inputs), len(most_recent_groups_files)))
            exit()
        fasta_groups_pairs = zip(inputs, most_recent_groups_files)
        # ADD COUNT TO SEQUENCE NAMES AND SORT BY COUNT
        # python renameWithReplicantCounts.py
        #               8_macse_out/MACSEOUT_MERGED.fasta uc_parsed.out dereplicated_renamed.fasta
        printVerbose("Adding dereplication data to unique fasta")
        run_parallel([
            PythonRunner(renameWithReplicantCounts, [
                fasta, groups,
                "%s/%s_counts.fasta" % (outdir, strip_ixes(fasta)), 'fasta'
            ], {"exists": [fasta, groups]})
            for fasta, groups in fasta_groups_pairs
        ], pool)
        printVerbose("Done adding data")

        aux_dir = makeAuxDir(outdir)
        groups_dir = makeDirOrdie("%s_groups_files" % outdir)
        bulk_move_to_dir(most_recent_groups_files, groups_dir)
        aux_files = getInputFiles(outdir,
                                  '*',
                                  "*_counts.fasta",
                                  ignore_empty_files=False)
        bulk_move_to_dir(aux_files, aux_dir)
        cleanup_pool(pool)
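Before counts are re-added, each fasta must line up one-to-one with its most recent groups file, which is why the zip is guarded by an explicit length check. A stand-alone sketch of that pairing and check (file names are hypothetical):

inputs = ["sampleA_derep.fasta", "sampleB_derep.fasta"]
most_recent_groups_files = ["sampleA_derep.groups", "sampleB_derep.groups"]

if len(inputs) != len(most_recent_groups_files):
    raise SystemExit("Number of input fastas (%d) != number of groups files (%d)"
                     % (len(inputs), len(most_recent_groups_files)))

for fasta, groups in zip(inputs, most_recent_groups_files):
    print("%s gets counts from %s" % (fasta, groups))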
Example #15
    def cluster_vsearch(self, input_f, outdir, groupsfile, processes, idpct, extraargstring):
        """Clusters sequences using SWARM.
        :param input_f: A file or folder containing fasta files to cluster.
        :param outdir: The output directory results will be written to.
        :param groupsfile: A groups file or folder containing groups files that describe the input. Note: if no groups
                            file is supplied, then entries in the fasta file are assumed to be singleton sequences.
        :param idpct: Real number in the range (0,1] that specifies the minimum similarity threshold for
                            clustering.  e.g. .95 indicates that a candidate sequence must be at least
                            95% similar to the seed sequence to be included in the cluster.
        :param processes: The maximum number of processes to use.
        :param extraargstring: Advanced program parameter string.
        """
        # Grab the fasta file(s) to cluster
        inputs = getInputFiles(input_f)
        debugPrintInputInfo(inputs, "clustered")
        pool = init_pool(min(len(inputs), processes))

        # RUN CLUSTERING
        # " --cluster_size %s -id %f --centroids %s  --uc %s",
        run_parallel([ProgramRunner(ProgramRunnerCommands.CLUSTER_VSEARCH,
                                    [input_, float(idpct), "%s/%s_seeds.fasta" % (outdir, strip_ixes(input_)),
                                     "%s/%s_clustered_uc" % (outdir, strip_ixes(input_))],
                                    {"exists": [input_]}, extraargstring) for input_ in inputs], pool)

        # PARSE UC FILE TO GROUPS FILE
        printVerbose("Parsing the clustered uc files to groups files")
        clustered_uc_files = getInputFiles(outdir, "*_clustered_uc")
        debugPrintInputInfo(clustered_uc_files, "parsed to groups")
        run_parallel([PythonRunner(parseUCtoGroups, [input_, "%s/%s.groups" % (outdir, strip_ixes(input_))],
                                   {"exists": [input_]})
                      for input_ in clustered_uc_files], pool)

        # REMOVE COUNTS FROM CLUSTERING GROUPS FILE
        printVerbose("Cleaning the .groups file from clustering")
        # Grab the current groups file and the new clustered groups file (which needs to be cleaned)
        clustered_groups_files = getInputFiles(outdir, "*_clustered.groups")
        # Remove counts from the clustering groups files
        debugPrintInputInfo(clustered_groups_files, "cleaned")
        run_parallel([PythonRunner(removeCountsFromGroupsFile,
                                   [input_, "%s/%s_uncount.groups" % (outdir, strip_ixes(input_))],
                                   {"exists": [input_]})
                      for input_ in clustered_groups_files], pool)
        printVerbose("Done cleaning groups files.")

        # Collect the groups file from clustering with counts removed
        cleaned_clustered_groups_files = getInputFiles(outdir, "*_uncount.groups", ignore_empty_files=False)

        # Resolve the user specified names file if necessary
        final_groups_files = handle_groups_file_update(outdir, groupsfile, cleaned_clustered_groups_files)

        # Move the final groups file(s) to the groups dir
        groups_dir = makeDirOrdie("%s_groups_files" % outdir)
        bulk_move_to_dir(final_groups_files, groups_dir)

        # Move aux files to the aux dir
        aux_files = getInputFiles(outdir, "*", "*_seeds.fasta", ignore_empty_files=False)
        aux_dir = makeAuxDir(outdir)
        bulk_move_to_dir(aux_files, aux_dir)

        # Cleanup the pool
        cleanup_pool(pool)
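idpct is passed to vsearch as a float and, per the docstring, must fall in (0,1]; a value like 95 would mean something very different to --id than 0.95. A small guard one might add before building the command; the helper name is hypothetical:

def validate_idpct(idpct):
    # vsearch --id expects a fraction in (0, 1], e.g. 0.95 for 95% identity
    value = float(idpct)
    if not 0.0 < value <= 1.0:
        raise ValueError("idpct must be in (0,1], got %r" % idpct)
    return value

print(validate_idpct("0.95"))  # 0.95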
    def cluster_crop(self, input_f, outdir, groupsfile, processes, blocksize, clustpct, maxmcmc, maxsm, rare,
                     blockcount, extraargstring):
        """Clusters sequences using CROP.

        :param input_f: Filepath to the input fasta file to cluster.
        :param outdir: Filepath to the output directory.
        :param groupsfile: Filepath to the groups file to use as a reference for dereplication counting.
        :param blocksize: Size of blocks to be used for all rounds (if -b is specified, then -z will not affect the
                            first round).  For data sets with different average sequence lengths, this parameter should \
                            be tuned such that it won't take too long for each block to do pairwise alignment.  Hint \
                            for choosing z: z*L<150,000, where L is the average length of the sequences.
        :param clustpct: The minimum similarity threshold for clustering.  Either 'g' for 95% or 's' for 97%.
        :param maxmcmc: This parameter specifies the number of iterations of MCMC. Default value is 2000. Increase \
                            this value to enhance accuracy (recommended value is at least 10*block size).
        :param maxsm: This parameter specifies the maximum number of 'split and merge' process to run.  Max is 20.
        :param rare: The maximum cluster size allowed to be classified as 'rare'. Clusters are defined as either \
                            'abundant' or 'rare'. 'Abundant' clusters will be clustered first, then the 'rare' \
                            clusters are mapped to the 'abundant' clusters.  Finally, 'rare' clusters which cannot be \
                            mapped will be clustered separately. e.g. If r=5, the clusters with size <=5 will be \
                            considered 'rare' in the above procedure, and r=0 will yield the best accuracy. If you \
                            believe your data is not too diverse to be handled, then r=0 will be the best choice.
        :param blockcount: The size of blocks in the first round of clustering. Hint of choosing -b: Each block in the \
                            first round should contain about 50 sequences.  i.e. b=N/50, where N is the number of \
                            input sequences.  Default: # input sequences / z.
        :param processes: The maximum number of processes to use.
        :param extraargstring: Advanced program parameter string.
        """

        # Grab the fasta file(s) to cluster
        inputs = getInputFiles(input_f)
        debugPrintInputInfo(inputs, "clustered")
        pool = init_pool(min(len(inputs), processes))

        # RUN CLUSTERING
        # crop -i %s -o %s -z %s -c %s -e %s -m %s%s
        run_parallel([ProgramRunner(ProgramRunnerCommands.CLUSTER_CROP,
                                    [input_, "%s/%s" % (outdir, strip_ixes(input_)), blocksize, clustpct,
                                        maxmcmc, maxsm, rare, blockcount],
                                    {"exists": [input_]}, extraargstring) for input_ in inputs], pool)

        # CLEAN THE OUTPUT GROUPS FILE
        printVerbose("Parsing the groups file from clustering")
        clustered_groups_files = getInputFiles(outdir, "*.cluster.list")
        debugPrintInputInfo(clustered_groups_files, "converted to groups files")
        run_parallel([PythonRunner(parseCROPoutToGroups, [input_,
                                   "%s/%s_uncount.groups" % (outdir, strip_ixes(input_))],
                                   {"exists": [input_]})
                      for input_ in clustered_groups_files], pool)
        printVerbose("Done parsing groups file.")

        # Collect the groups file from clustering with counts removed
        cleaned_clustered_groups_files = getInputFiles(outdir, "*_uncount.groups", ignore_empty_files=False)

        # Resolve the user specified names file if necessary
        final_groups_files = handle_groups_file_update(outdir, groupsfile, cleaned_clustered_groups_files)

        # GATHER AUX FILES
        input_dir = getDirName(input_f)
        aux_files = cleaned_clustered_groups_files
        aux_files += getInputFiles(input_dir, "*.unique", ignore_empty_files=False)
        aux_files += getInputFiles(input_dir, "*.unique.list", ignore_empty_files=False)
        aux_files += getInputFiles(input_dir, "*.unique.TempCenters.Rare", ignore_empty_files=False)
        aux_files += getInputFiles(outdir, "*.cluster", ignore_empty_files=False)
        aux_files += getInputFiles(outdir, "*.cluster.list", ignore_empty_files=False)
        aux_files += getInputFiles(outdir, "*.log", ignore_empty_files=False)
        aux_files += getInputFiles(".", "LikelihoodRatio.txt", ignore_empty_files=False)

        # Move the final groups file(s) to the groups dir
        groups_dir = makeDirOrdie("%s_groups_files" % outdir)
        bulk_move_to_dir(final_groups_files, groups_dir)

        # Move aux files to the aux dir
        aux_dir = makeAuxDir(outdir)
        bulk_move_to_dir(aux_files, aux_dir)

        # Cleanup the pool
        cleanup_pool(pool)
Example #17
    def cluster_swarm(self, input_f, outdir, groupsfile, processes,
                      extraargstring):
        """Clusters sequences using SWARM.
        :param input_f: A file or folder containing fasta files to cluster.
        :param outdir: The output directory results will be written to.
        :param groupsfile: A groups file or folder containing groups files that describe the input. Note: if no groups
                            file is supplied, then entries in the fasta file are assumed to be singleton sequences.
        :param processes: The maximum number of processes to use.
        :param extraargstring: Advanced program parameter string.
        """
        # Grab the fasta file(s) to cluster
        inputs = getInputFiles(input_f)
        debugPrintInputInfo(inputs, "clustered")
        pool = init_pool(min(len(inputs), processes))

        # RUN CLUSTERING
        run_parallel([
            ProgramRunner(ProgramRunnerCommands.CLUSTER_SWARM, [
                input_,
                "%s/%s_clustered" % (outdir, strip_ixes(input_)),
                "%s/%s_clustered_uc" % (outdir, strip_ixes(input_)),
                "%s/%s_clustered_seeds" % (outdir, strip_ixes(input_))
            ], {"exists": [input_]}, extraargstring) for input_ in inputs
        ], pool)

        # PARSE UC FILE TO GROUPS FILE
        printVerbose("Parsing the clustered uc files to groups files")
        clustered_uc_files = getInputFiles(outdir, "*_clustered_uc")
        debugPrintInputInfo(clustered_uc_files, "parsed to groups")
        run_parallel([
            PythonRunner(
                parseUCtoGroups,
                [input_, "%s/%s.groups" %
                 (outdir, strip_ixes(input_))], {"exists": [input_]})
            for input_ in clustered_uc_files
        ], pool)
        printVerbose("Done parsing groups files.")

        # REMOVE COUNTS FROM CLUSTERING GROUPS FILE
        printVerbose("Cleaning the .groups file from clustering")
        # Grab the current groups file and the new clustered groups file (which needs to be cleaned)
        clustered_groups_files = getInputFiles(outdir, "*_clustered.groups")
        debugPrintInputInfo(clustered_groups_files, "cleaned")
        run_parallel([
            PythonRunner(removeCountsFromGroupsFile, [
                input_,
                "%s/%s_uncount.groups" % (outdir, strip_ixes(input_))
            ], {"exists": [input_]}) for input_ in clustered_groups_files
        ], pool)
        printVerbose("Done cleaning groups files.")

        printVerbose("Capitalizing sequences")
        # Convert the seeds files to uppercase (swarm writes in lowercase)
        inputs = getInputFiles(outdir, "*_seeds")
        run_parallel([
            PythonRunner(capitalize_seqs, [input_, "%s.fasta" % input_],
                         {"exists": [input_]}) for input_ in inputs
        ], pool)
        printVerbose("Done capitalizing sequences")

        # Collect the groups file from clustering with counts removed
        cleaned_clustered_groups_files = getInputFiles(
            outdir, "*_uncount.groups", ignore_empty_files=False)

        # Resolve the user specified names file if necessary
        final_groups_files = handle_groups_file_update(
            outdir, groupsfile, cleaned_clustered_groups_files)

        # Move the final groups file(s) to the groups dir
        groups_dir = makeDirOrdie("%s_groups_files" % outdir)
        bulk_move_to_dir(final_groups_files, groups_dir)

        # Move aux files to the aux dir
        aux_files = getInputFiles(outdir,
                                  "*",
                                  "*_seeds.fasta",
                                  ignore_empty_files=False)
        aux_dir = makeAuxDir(outdir)
        bulk_move_to_dir(aux_files, aux_dir)

        # Cleanup the pool
        cleanup_pool(pool)
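Swarm writes its seed sequences in lowercase, so the seeds files are re-capitalized before later steps; capitalize_seqs is assumed to uppercase sequence lines while leaving the ">" header lines untouched. A minimal string-level sketch of that behaviour:

def capitalize_seqs_stub(fasta_text):
    # uppercase sequence lines, leave ">" header lines as-is
    out_lines = []
    for line in fasta_text.splitlines():
        out_lines.append(line if line.startswith(">") else line.upper())
    return "\n".join(out_lines) + "\n"

print(capitalize_seqs_stub(">seed_1;size=42\nacgtacgt\n>seed_2;size=7\nggccttaa\n"))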
    def dereplicate_vsearch(self, input_f, outdir, groupsfile, processes, stripcounts, extraargstring):
        """Dereplicates with vsearch.

        :param input_f: Filepath to the file or folder of files to dereplicate.
        :param outdir: Filepath to the output directory.
        :param groupsfile: A groups file to use as a reference for replicant counting.  If no groups file is
                            provided, input sequences are considered singletons (regardless of their name-annotated
                            dereplication count).
        :param processes: The number of processes to use to dereplicate the fileset.
        :param stripcounts: If True, strips the trailing dereplication counts from a file before dereplication.
        :param extraargstring: Advanced program parameter string.
        """
        inputs = getInputFiles(input_f)
        pool = init_pool(min(len(inputs), processes))
        # REMOVES COUNTS FROM SEQUENCE NAMES IN ORDER TO CLUSTER PROPERLY
        # strip counts if we need to.
        if stripcounts:
            printVerbose("Removing counts from sequence names...")
            debugPrintInputInfo(inputs, "renamed")
            run_parallel([PythonRunner(removeCountsFromFastFile,
                                       [input_, "%s/%s_uncount.fasta" % (outdir, strip_ixes(input_)), 'fasta'],
                                       {"exists": input_})
                          for input_ in inputs], pool)
            printVerbose("Done removing counts.")

            # Grab the cleaned files as input for the next step
            inputs = getInputFiles(outdir, "*_uncount.fasta")

        # DEREPLICATE
        debugPrintInputInfo(inputs, "dereplicated")
        printVerbose("Dereplicating...")
        run_parallel([ProgramRunner(ProgramRunnerCommands.DEREP_VSEARCH,
                                    [processes, input_,
                                     "%s/%s_derep.fasta" % (outdir, strip_ixes(input_)),
                                     "%s/%s_uc.out" % (outdir, strip_ixes(input_))],
                                    {"exists": [input_], "positive": [processes]},
                                    extraargstring)
                      for input_ in inputs], pool)
        printVerbose("Done dereplicating")

        # LOG DEREPLICATED SEQUENCES INTO A .GROUPS FILE
        # generates a .groups file named _uc_parsed.out
        # python parseUCtoGroups.py uc.out uc_parsed.out
        input_ucs = getInputFiles(outdir, "*_uc.out")
        printVerbose("Generating a groups file from dereplication.")
        debugPrintInputInfo(inputs, "parsed (into a .groups file)")
        run_parallel([PythonRunner(parseUCtoGroups, [input_, "%s/%s_derep.groups" % (outdir, strip_ixes(input_))],
                                   {"exists": [input_]})
                      for input_ in input_ucs], pool)

        most_recent_groups_files = getInputFiles(outdir, "*_derep.groups", ignore_empty_files=False)

        # UPDATE THE MOST CURRENT GROUPS FILES WITH DEREPLICATION COUNTS
        if groupsfile is not None:
            # Grab the oldgroups file and the dereplicated groups file
            old_groups_files = getInputFiles(groupsfile)
            derep_groups_files = getInputFiles(outdir, "*_derep.groups")

            printVerbose("Updating .groups files with dereplicated data")
            printVerbose("%d Reference (old) groups files to be read:" % len(old_groups_files))
            printVerbose(str(old_groups_files))
            printVerbose("%d Dereplicated (new) groups files to be read:" % len(derep_groups_files))
            printVerbose(str(derep_groups_files))

            update_groups(old_groups_files, derep_groups_files, outdir, "dereplicated")
            most_recent_groups_files = getInputFiles(outdir, "dereplicated*", ignore_empty_files=False)
            printVerbose("Done updating .groups files.")

        if len(inputs) != len(most_recent_groups_files):
            print ("Error: Number of input fastas (%d) is not equal to the number ofgroups files (%d)." %
                   (len(inputs), len(most_recent_groups_files)))
            exit()
        fasta_groups_pairs = zip(inputs, most_recent_groups_files)
        # ADD COUNT TO SEQUENCE NAMES AND SORT BY COUNT
        # python renameWithReplicantCounts.py
        #               8_macse_out/MACSEOUT_MERGED.fasta uc_parsed.out dereplicated_renamed.fasta
        printVerbose("Adding dereplication data to unique fasta")
        run_parallel([PythonRunner(renameWithReplicantCounts,
                                   [fasta, groups, "%s/%s_counts.fasta" % (outdir, strip_ixes(fasta)), 'fasta'],
                                   {"exists": [fasta, groups]})
                      for fasta, groups in fasta_groups_pairs], pool)
        printVerbose("Done adding data")

        aux_dir = makeAuxDir(outdir)
        groups_dir = makeDirOrdie("%s_groups_files" % outdir)
        bulk_move_to_dir(most_recent_groups_files, groups_dir)
        aux_files = getInputFiles(outdir, '*', "*_counts.fasta", ignore_empty_files=False)
        bulk_move_to_dir(aux_files, aux_dir)
        cleanup_pool(pool)