Esempio n. 1
0
def query_vsearch(inputs, outdir, simmilarity, processes, aln_user_string,
                  extraargstring, pool):
    """Runs a VSEARCH alignment on pairs of query/reference sequences.

    :param inputs: A list of pairs of (filepaths to) query_fastas and the refrence fastas to compare them to.
    :param outdir: Filepath to the directory where the alignment result should be written.
    :param simmilarity: The minimum simmilarity percentage (between reference and query sequences), \
                            as a decimal between 0 and 1), required for a positive  match.
    :param processes: The number of processes to use in the identification process.
    :param aln_user_string: An optional string of commandline parameters passed to the VSEARCH program.
    :param extraargstring: Advanced program parameter string.
    :param pool: A fully initalized multiprocessing.Pool object.
    """
    printVerbose("Aligning against reference sequences...")
    # vsearch --usearch_global %s seeds.pick.fasta --db ../data/BiocodePASSED_SAP.txt --id 0.9 \
    #   --userfields query+target+id+alnlen+qcov --userout %sout --alnout %s alnout.txt
    alignment_tasks = []
    for query_fasta, ref_fasta in inputs:
        out_prefix = "%s/%s" % (outdir, strip_ixes(query_fasta))
        alignment_tasks.append(ProgramRunner(ProgramRunnerCommands.ALIGN_VSEARCH,
                                             [processes, query_fasta, ref_fasta, simmilarity,
                                              "%s.out" % out_prefix,
                                              "%s.alnout" % out_prefix,
                                              aln_user_string],
                                             {"exists": [query_fasta, ref_fasta],
                                              "positive": [processes]},
                                             extraargstring))
    run_parallel(alignment_tasks, pool)
    printVerbose("Done aligning.")
    return
    def clean_quality_trimmomatic(self, input_f, outdir, window_size, quality, min_len, processes, extraargstring):
        """Uses a sliding window to identify and trim away areas of low quality.

        :param input_f: Filepath to input file or folder.
        :param outdir: Filepath to the output directory.
        :param window_size: Width of the sliding window (number of consecutive base-pairs to average for quality
                            analysis).
        :param quality: Minimum quality allowed.  Sections with lower average quality than this will be dropped.
        :param min_len: Minimum allowed length for TRIMMED sequences (i.e. if a sequence is too short after trimming,
                        its dropped).
        :param processes: Number of processes to use to clean the input fileset.
        :param extraargstring: Advanced program parameter string.
        """
        # "trimomatic": "java -jar ~/ARMS/programs/Trimmomatic-0.33/trimmomatic-0.33.jar SE \
        #   -%phred %input %output SLIDINGWINDOW:%windowsize:%minAvgQuality MINLEN:%minLen"
        inputs = getInputFiles(input_f)
        debugPrintInputInfo(inputs, "clean")
        pool = init_pool(min(len(inputs), processes))

        printVerbose("Cleaning sequences with Trimmomatic...")
        cleaning_tasks = []
        for fastq in inputs:
            cleaned_path = "%s/%s_cleaned.fastq" % (outdir, strip_ixes(fastq))
            cleaning_tasks.append(ProgramRunner(ProgramRunnerCommands.CLEAN_TRIMMOMATIC,
                                                [fastq, cleaned_path, window_size, quality, min_len],
                                                {"exists": [outdir, fastq],
                                                 "positive": [window_size, quality, min_len]},
                                                extraargstring))
        run_parallel(cleaning_tasks, pool)
        printVerbose("Done cleaning sequences.")
        cleanup_pool(pool)
    def assemble_pear(self, input_f, input_r, outdir, name, processes, pearthreads, extraargstring):
        """Uses PEAR to assemble paired F/R read files in run_parallel.

        :param input_f: File path to forward Fastq Reads file or folder.
        :param input_r: File path to reverse Fastq Reads file or folder.
        :param outdir: File path to the output directory.
        :param name: File prefix for the assembled reads.
        :param processes: The maximum number of processes to use.
        :param pearthreads: The number of threads per process to use.
        :param extraargstring: Advanced program parameter string.
        """
        # "~/programs/pear-0.9.4-bin-64/pear-0.9.4-64 -f %s -r %s -o %s -j %s -m %d"
        inputs = validate_paired_fastq_reads(input_f, input_r)
        pool = init_pool(min(len(inputs), processes))
        printVerbose("\tAssembling reads with pear")
        debugPrintInputInfo(inputs, "assemble")
        run_parallel([ProgramRunner(ProgramRunnerCommands.ASSEMBLE_PEAR,
                                    [fwd, rev, "%s/%s_%s" % (outdir, name, getFileName(fwd)), pearthreads],
                                    {"exists": [fwd, rev], "positive": [pearthreads]},
                                    extraargstring)
                      for fwd, rev in inputs], pool)

        printVerbose("Done assembling sequences...")
        # Everything that does not contain ".assembled." is an auxillary file;
        # move those out of the way into a dedicated aux directory.
        aux_files = getInputFiles(outdir, "*", "*.assembled.*", ignore_empty_files=False)
        bulk_move_to_dir(aux_files, makeAuxDir(outdir))
        cleanup_pool(pool)
    def demux_by_name(self, input_f, barcodes, outdir, filetype, processes, extraargstring):
        """Demuxes using SeqIO.

        :param input_f: File path to input file or folder of input files.
        :param barcodes: File path to input barcodes file.
        :param outdir: Filepath to output directory.
        :param filetype: Either 'fasta' or 'fastq'.
        :param processes: Number of processes to use to demux input fileset.
        :param extraargstring: Advanced program parameter string.
        """
        aux_dir = makeAuxDir(outdir)
        # Gather the input files and pair each one with a shard number.
        files_to_split = getInputFiles(input_f)
        file_id_pairs = zip(files_to_split, range(len(files_to_split)))
        debugPrintInputInfo(files_to_split, "demux")
        pool = init_pool(min(len(file_id_pairs), processes))

        printVerbose("Demuxing sequences...")
        run_parallel([PythonRunner(split_on_name,
                                   [file_, barcodes, outdir, shard_id, filetype],
                                   {"exists": [file_]})
                      for file_, shard_id in file_id_pairs], pool)

        # Relocate the unmatched (auxillary) files to the aux directory.
        aux_files = getInputFiles(outdir, "unmatched_*", ignore_empty_files=False)
        bulk_move_to_dir(aux_files, aux_dir)
        cleanup_pool(pool)
    def clean_quality_trimmomatic(self, input_f, outdir, window_size, quality,
                                  min_len, processes, extraargstring):
        """Uses a sliding window to identify and trim away areas of low quality.

        :param input_f: Filepath to input file or folder.
        :param outdir: Filepath to the output directory.
        :param window_size: Width of the sliding window (number of consecutive base-pairs to average for quality
                            analysis).
        :param quality: Minimum quality allowed.  Sections with lower average quality than this will be dropped.
        :param min_len: Minimum allowed length for TRIMMED sequences (i.e. if a sequence is too short after trimming,
                        its dropped).
        :param processes: Number of processes to use to clean the input fileset.
        :param extraargstring: Advanced program parameter string.
        """
        # "trimomatic": "java -jar ~/ARMS/programs/Trimmomatic-0.33/trimmomatic-0.33.jar SE \
        #   -%phred %input %output SLIDINGWINDOW:%windowsize:%minAvgQuality MINLEN:%minLen"
        fastq_files = getInputFiles(input_f)
        debugPrintInputInfo(fastq_files, "clean")
        pool = init_pool(min(len(fastq_files), processes))

        printVerbose("Cleaning sequences with Trimmomatic...")
        run_parallel([ProgramRunner(ProgramRunnerCommands.CLEAN_TRIMMOMATIC,
                                    [fastq,
                                     "%s/%s_cleaned.fastq" % (outdir, strip_ixes(fastq)),
                                     window_size, quality, min_len],
                                    {"exists": [outdir, fastq],
                                     "positive": [window_size, quality, min_len]},
                                    extraargstring)
                      for fastq in fastq_files], pool)
        printVerbose("Done cleaning sequences.")
        cleanup_pool(pool)
    def annotate_otu_chewbacca(self, input_f, outdir, annotation, processes):
        """Annotates an OTU table.

        :param input_f: Filepath to a file or folder of files to annotate.
        :param annotation: Filepath to a file or a folder of files to use as annotations.
        :param outdir: Filepath to the output directory where annotated files will be written.
        :param procs: The maximum number of procs to use.
        """
        # matrixes = getInputFiles(input_f)
        matricies = getInputFiles(input_f)
        debugPrintInputInfo(matricies, "annotated.")
        annotations = getInputFiles(annotation)

        
        # if all the annotations files are empty, just copy over files.
        if len(annotations) == 0 and len(getInputFiles(annotation, ignore_empty_files=False)) > 0:
            pool = init_pool(min(len(matricies), processes))
            print "**WARNING**: Annotation File is empty.  Skipping annotation and copying old OTU tables to output \
                    directory.\n"
            run_parallel([PythonRunner(copy_file, [matrix, outdir],
                                       {"exists": [matrix]}) for matrix in matricies], pool)
        else:
            pool = init_pool(min(len(matricies) * len(annotations), processes))
            debugPrintInputInfo(annotations, "parsed.")
            inputs = product(matricies, annotations)

            printVerbose("Annotating matrix...")

            
            run_parallel([PythonRunner(annotateOTUtable, [matrix, annotation, "%s/%s.txt" % (outdir, "matrix")],
                                       {"exists": [matrix, annotation]})
                          for matrix, annotation in inputs], pool)
            printVerbose("Done Annotating.")

        cleanup_pool(pool)
    def rename_chewbacca(self, input_f, outdir, filetype, clip, processes):
        """Renames sequences in a fasta/fastq file as <filename>_ID0, <filename>_ID1, <filename>_ID2, etc., where
            <filename> is the name of the fasta/fastq file without any extensions or chewbacca suffixes.

        :param input_f: Filepath to an input file or folder to rename.
        :param outdir: Filepath to the output directory.
        :param filetype: Either 'fasta' or 'fastq'.
        :param clip: If True, remove dereplication counts from sequence names before renaming.
        :param processes: The maximum number of processes to use.
        """
        # Gather input files
        inputs = getInputFiles(input_f)
        debugPrintInputInfo(inputs, "rename")
        pool = init_pool(min(len(inputs), processes))
        printVerbose("Renaming sequences...")
        # Rename each input file in parallel, preserving its original extension.
        rename_tasks = []
        for source in inputs:
            renamed_path = "%s/%s_renamed%s" % (outdir, strip_ixes(source), os.path.splitext(source)[1])
            rename_tasks.append(PythonRunner(serialRename,
                                             [source, renamed_path, filetype, clip],
                                             {"exists": [source]}))
        run_parallel(rename_tasks, pool)
        printVerbose("Done renaming sequences...")

        # Collect the .samples files into their own directory.
        samples_dir = makeDirOrdie("%s_samples" % outdir)
        samples_files = getInputFiles(outdir, "*.samples", ignore_empty_files=False)
        bulk_move_to_dir(samples_files, samples_dir)

        # Collect the .mapping (auxillary) files into the aux directory.
        aux_dir = makeAuxDir(outdir)
        aux_files = getInputFiles(outdir, "*.mapping", ignore_empty_files=False)
        bulk_move_to_dir(aux_files, aux_dir)

        cleanup_pool(pool)
Esempio n. 8
0
    def demux_fastx(self, input_f, barcodes, outdir, processes, extraargstring):
        """Demuxes using FAST X BARCODE SPLITTER.

        :param input_f: File path to input file or folder of input files.
        :param barcodes: File path to input barcodes file.
        :param outdir: Filepath to output directory.
        :param processes: Number of processes to use to demux input fileset.
        :param extraargstring: Advanced program parameter string.
        """
        # Gather the input files and pair each one with a shard number.
        files_to_split = getInputFiles(input_f)
        file_id_pairs = zip(files_to_split, range(len(files_to_split)))
        debugPrintInputInfo(files_to_split, "demux")
        pool = init_pool(min(len(file_id_pairs), processes))

        printVerbose("Demuxing sequences...")
        run_parallel([ProgramRunner(ProgramRunnerCommands.DEMUX_FASTX,
                                    [file_, barcodes, "%s/" % outdir, "_%d_demux.fastq" % shard_id],
                                    {"exists": [file_, barcodes]}, extraargstring)
                      for file_, shard_id in file_id_pairs], pool)
        printVerbose("Demuxed sequences.")

        # Move the unmatched (auxillary) files into a fresh aux directory.
        aux_files = getInputFiles(outdir, "unmatched_*", ignore_empty_files=False)
        bulk_move_to_dir(aux_files, makeAuxDir(outdir))
        cleanup_pool(pool)
Esempio n. 9
0
    def preclean_bayeshammer(self, input_f, input_r, outdir, processes,
                             bayesthreads, extraargstring):
        """Assembles reads from two (left and right) fastq files/directories.

        :param input_f: File path to file or folder of left reads to clean.
        :param input_r: File path to file or folder of right reads to clean.
        :param outdir: Filepath to output directory.
        :param processes: The maximum number of processes to use.
        :param bayesthreads: The number of threads per process to use.
        :param extraargstring: Advanced program parameter string.
        """
        # Collect input files, and validate that they match
        inputs = validate_paired_fastq_reads(input_f, input_r)
        pool = init_pool(min(len(inputs), processes))
        printVerbose("\tPrecleaning %s reads with Spades-Baye's Hammer..." %
                     len(inputs))
        debugPrintInputInfo(inputs, "preclean/fix.")

        run_parallel([
            ProgramRunner(ProgramRunnerCommands.PRECLEAN_SPADES,
                          [forwards, reverse, outdir, bayesthreads], {
                              "exists": [forwards, reverse],
                              "positive": [bayesthreads]
                          }, extraargstring) for forwards, reverse in inputs
        ], pool)
        printVerbose("Done cleaning reads.")

        # Select output files: everything already in outdir is auxillary, plus
        # the unpaired reads and Spades config files.
        aux_files = getInputFiles(outdir, "*", ignore_empty_files=False)
        corrected_dir = "%s/corrected" % outdir
        bulk_move_to_dir(getInputFiles(corrected_dir, "*"), outdir)
        aux_files += getInputFiles(outdir,
                                   "*unpaired*",
                                   ignore_empty_files=False)
        aux_files += getInputFiles(outdir, "configs", ignore_empty_files=False)

        # Gather aux files
        aux_dir = makeAuxDir(outdir)
        bulk_move_to_dir(aux_files, aux_dir)

        # Rename output files
        output_files = getInputFiles(outdir, "*", "corrected_*")
        for out_file in output_files:
            move(out_file,
                 "%s/%s_corrected.fastq" % (outdir, strip_ixes(out_file)))

        # Best-effort move of the last-minute log file, which may legitimately
        # not exist.  BUGFIX: the old bare `except:` also swallowed
        # KeyboardInterrupt/SystemExit; EnvironmentError (IOError/OSError)
        # covers the expected missing-file failure only.
        try:
            move("%s/corrected_corrected.fastq" % outdir,
                 "%s/corrected_corrected.fastq" % aux_dir)
        except EnvironmentError:
            pass
        cleanup_pool(pool)
Esempio n. 10
0
    def query_fasta_vsearch(self, input_f, referencefasta, taxinfo, outdir, processes, simmilarity, coverage,
                            extraargstring):
        """Compare reference sequences to the fasta-formatted query sequences, using global pairwise alignment.

        :param input_f:  Filepath to a file or folder of files to identify.
        :param referencefasta: Filepath to a file or folder of files to use as a reference.
        :param taxinfo:  Filepath to a file containing taxonomic info correlated with the referencefasta.
        :param outdir: Filepath to the output directory.
        :param processes: The number of processes to use in the identification process.
        :param simmilarity: The % simmilarity between a query and reference sequence required for positive
                                identification.
        :param coverage: The % coverage of matching regions between a query and reference sequence required for positive
                            identification.
        :param extraargstring: Advanced program parameter string.
        """
        # vsearch --usearch_global %s seeds.pick.fasta --db ../data/BiocodePASSED_SAP.txt --id 0.9 \
        #       --userfields query+target+id+alnlen+qcov --userout %sout --alnout %s alnout.txt

        # Expecting query fastas to annotate, reference fastas to match against,
        # and the taxonomic metadata that goes with each reference.
        query_fastas = getInputFiles(input_f)
        debugPrintInputInfo(query_fastas, "queried for identification.")
        ref_fastas = getInputFiles(referencefasta)
        debugPrintInputInfo(ref_fastas, "referenced for sequence identification.")
        tax_info_files = getInputFiles(taxinfo)
        debugPrintInputInfo(tax_info_files, "referenced for taxanomic names.")

        # Each reference fasta must have exactly one taxonomic mapping file.
        if len(tax_info_files) != len(ref_fastas):
            print "Error: The number of reference fastas and taxonomic mapping files is not the same.  There must be \
                    one taxonomic mapping file for each reference fasta."
            return
        ref_data_pairs = zip(ref_fastas, tax_info_files)
        inputs = list(product(query_fastas, ref_fastas))
        aln_user_string = ""
        pool = init_pool(min(len(inputs), processes))

        # VSEARCH ALIGNMENT
        query_vsearch(inputs, outdir, simmilarity, processes, aln_user_string, extraargstring, pool)

        printVerbose("Parsing output...")
        # Parse the alignment results and keep those that pass the criterion
        # (e.g. 97 similarity, 85 coverage).  Parameters can be changed and this
        # command rerun as many times as necessary.
        #
        # parseVSearchOutputAgainstFasta(vsearch_outfile, taxInfo, output_file, min_simmilarity, min_coverage):
        inputs = list(product(query_fastas, ref_data_pairs))
        debugPrintInputInfo(inputs, "queryied against paired refereces.")
        run_parallel([PythonRunner(parseVSearchOutputAgainstFasta,
                                   ["%s/%s.out" % (outdir, strip_ixes(query)), tax_info,
                                    "%s/%s.tax" % (outdir, strip_ixes(query)), simmilarity, coverage],
                                   {"exists": [query, ref_fasta, tax_info]})
                      for query, (ref_fasta, tax_info) in inputs], pool)
        printVerbose("\nDone parsing...")

        # Gather and move auxillary files (everything but the .tax outputs).
        aux_files = getInputFiles(outdir, "*", "*.tax", ignore_empty_files=False)
        bulk_move_to_dir(aux_files, makeAuxDir(outdir))

        cleanup_pool(pool)
Esempio n. 11
0
    def query_fasta_db_vsearch(self, input_f, outdir, ref_fasta, ref_db,
                               simmilarity, coverage, processes,
                               extraargstring):
        """Compare reference sequences to the fasta-formatted query sequences, using global pairwise alignment.

        :param input_f:  Filepath to a file or folder of files to identify.
        :param outdir: Filepath to the output directory.
        :param ref_fasta: Filepath to the curated fasta file to use as a reference.
        :param ref_db: Filepath to the curated database file to use as a reference.
        :param simmilarity: Minimum % simmilarity (decimal between 0 and 1) between query and reference sequences
                            required for positive identification.
        :param coverage: Minimum % coverage (decimal between 0 and 1) between query and reference sequences required
                            for positive identification.
        :param processes: The number of processes to use in the identification process.
        :param extraargstring: Advanced program parameter string.
        """
        # blast6 output format http://www.drive5.com/usearch/manual/blast6out.html
        aln_user_string = "--userfields query+target+id+alnlen+qcov"
        coi_fasta = ref_fasta
        ncbi_db_string = ref_db

        query_fastas = getInputFiles(input_f)
        debugPrintInputInfo(query_fastas, "queried against the DB.")
        inputs = [x for x in product(query_fastas, [coi_fasta])]
        pool = init_pool(min(len(query_fastas), processes))

        # VSEARCH ALIGNMENT
        query_vsearch(inputs, outdir, simmilarity, processes, aln_user_string,
                      extraargstring, pool)

        printVerbose("Parsing output...")
        # Parse the alignment results and keep those that pass the criterion
        # (e.g. 97 similarity, 85 coverage).  Parameters can be changed and this
        # command rerun as many times as necessary.
        #
        # parseVSearchOutputAgainstNCBI(vsearch_out, ncbi_db, min_coverage, min_similarity) > parsed_nt.out
        # BUGFIX: the validation key was misspelled "exits", so the existence
        # check on the query/db files was silently skipped.
        run_parallel([
            PythonRunner(parseVSearchOutputAgainstNCBI, [
                "%s/%s.out" % (outdir, strip_ixes(query)), ncbi_db_string,
                "%s/%s.tax" %
                (outdir, strip_ixes(query)), simmilarity, coverage
            ], {"exists": [query, ncbi_db_string]}) for query in query_fastas
        ], pool)
        printVerbose("Done processing.")

        # Gather and move auxillary files
        aux_files = getInputFiles(outdir,
                                  "*",
                                  "*.tax",
                                  ignore_empty_files=False)
        bulk_move_to_dir(aux_files, makeAuxDir(outdir))

        cleanup_pool(pool)
 def convert_chewbacca(self, input_f, outdir, proceses):
     """Converts fastq-formatted input files to fasta format.

     :param input_f: Filepath to a fastq file or folder of fastq files to convert.
     :param outdir: Filepath to the output directory.
     :param proceses: The maximum number of processes to use.
     """
     inputs = getInputFiles(input_f)
     debugPrintInputInfo(inputs, "convert to fasta.")
     printVerbose("Converting to fasta...")
     pool = init_pool(min(len(inputs), proceses))
     # BUGFIX: the "exists" validation value is a list at every other call site
     # in this file; a bare string here would be treated as a sequence of
     # single characters rather than one path.
     run_parallel([PythonRunner(translateFastqToFasta,
                                [input_, "%s/%s.fasta" % (outdir, getFileName(input_))],
                                {"exists": [input_]})
                                 for input_ in inputs], pool)
     printVerbose("Done converting.")
     cleanup_pool(pool)
 def convert_chewbacca(self, input_f, outdir, proceses):
     """Converts fastq-formatted input files to fasta format.

     :param input_f: Filepath to a fastq file or folder of fastq files to convert.
     :param outdir: Filepath to the output directory.
     :param proceses: The maximum number of processes to use.
     """
     inputs = getInputFiles(input_f)
     debugPrintInputInfo(inputs, "convert to fasta.")
     printVerbose("Converting to fasta...")
     pool = init_pool(min(len(inputs), proceses))
     # BUGFIX: the "exists" validation value is a list at every other call site
     # in this file; a bare string here would be treated as a sequence of
     # single characters rather than one path.
     run_parallel([
         PythonRunner(
             translateFastqToFasta,
             [input_, "%s/%s.fasta" %
              (outdir, getFileName(input_))], {"exists": [input_]})
         for input_ in inputs
     ], pool)
     printVerbose("Done converting.")
     cleanup_pool(pool)
Esempio n. 14
0
    def clean_trim_adapters_flexbar(self, input_f, adapters, adaptersrc,
                                    outdir, allowedns, processes,
                                    extraargstring):
        """Use flexbar to trim adapters and barcodes from sequences.  By default, Flexbar does not allow any 'N' \
            characters in SEQUENCE, and will toss any sequences that do contain 'N'.  To avoid this, use the -u or \
            --allowedns flags to specify the maximum number of 'N's to allow

        :param input_f: Filepath to input file or folder.
        :param adapters: Filepath to a list of adapters.
        :param adaptersrc: Filepath to a list of reverse-complemented adapters.
        :param outdir: Filepath to the output directory.
        :param allowedns: Non-negative integer value indicating the maximum number of 'N's to tolerate in a sequence.
        :param processes: The maximum number of processes to use.
        :param extraargstring: Advanced program parameter string.
        """
        inputs = getInputFiles(input_f)
        pool = init_pool(min(len(inputs), processes))
        debugPrintInputInfo(inputs, "trim adapters from")
        # "flexbar":  "flexbar -r \"%s\" -t \"%s\" -ae \"%s\" -a \"%s\"",
        printVerbose("Trimming barcodes and adapters with flexbar")
        temp_file_name_template = "%s/temp_%s"
        debarcoded_file_name_template = "%s/%s_debarcoded"

        # Pass 1: trim the forward adapters off the left ends.
        left_trim_tasks = []
        for source in inputs:
            temp_out = temp_file_name_template % (outdir, strip_ixes(source))
            left_trim_tasks.append(ProgramRunner(ProgramRunnerCommands.TRIM_FLEXBAR,
                                                 [source, temp_out, "LEFT", adapters, allowedns],
                                                 {"exists": [source, adapters]}, extraargstring))
        run_parallel(left_trim_tasks, pool)

        temp_files = getInputFiles(outdir, "temp_*")
        debugPrintInputInfo(temp_files, "trim adapters from")

        # Pass 2: trim the reverse-complemented adapters off the right ends.
        # strip_ixes(...)[5:] drops the "temp_" prefix added by pass 1.
        right_trim_tasks = []
        for source in temp_files:
            debarcoded_out = debarcoded_file_name_template % (outdir, strip_ixes(source)[5:])
            right_trim_tasks.append(ProgramRunner(ProgramRunnerCommands.TRIM_FLEXBAR,
                                                  [source, debarcoded_out, "RIGHT", adaptersrc, allowedns],
                                                  {"exists": [source, adaptersrc]}, extraargstring))
        run_parallel(right_trim_tasks, pool)
        printVerbose("Done Trimming sequences.")

        # Move the pass-1 temp files into the aux directory.
        aux_files = getInputFiles(outdir, "temp_*", ignore_empty_files=False)
        bulk_move_to_dir(aux_files, makeAuxDir(outdir))
        cleanup_pool(pool)
Esempio n. 15
0
    def align_clean_macse(self, input_f, ref, samplesdir, outdir, processes, extraargstring=""):
        """Removes non-nucleotide characters in MACSE aligned sequences for all fasta files in the samples directory
            (the samplesDir argument).

        :param input_f: File path to file or folder of files to clean.
        :param ref: Filepath to the reference file used to align the input files.
        :param samplesdir: Filepath to the original, unaligned input files (the inputs to the macse aligner).
        :param outdir: Filepath to the directory to write outputs to.
        :param processes: The maximum number of processes to use.
        :param extraargstring: Advanced program parameter string.
        """
        # "macse_format":     "java -jar " + programPaths["MACSE"] + "  -prog exportAlignment -align \"%s\" \
        #                           -charForRemainingFS - -gc_def 5 -out_AA \"%s\" -out_NT \"%s\" -statFile \"%s\""

        inputs = getInputFiles(input_f)
        pool = init_pool(min(len(inputs), processes))
        # BUGFIX: the message contained a stray "%s" placeholder with no format
        # argument, so it printed the literal text "%s".
        printVerbose("\tProcessing MACSE alignments")
        samples_list = getInputFiles(samplesdir)
        run_parallel([ProgramRunner(ProgramRunnerCommands.MACSE_FORMAT,
                                    ["%s/%s_NT" % (input_f, getFileName(sample)),
                                     "%s/%s_AA_macse.fasta" % (outdir, getFileName(sample)),
                                     "%s/%s_NT_macse.fasta" % (outdir, getFileName(sample)),
                                     "%s/%s_macse.csv" % (outdir, getFileName(sample))],
                                    {"exists": ["%s/%s_NT" % (input_f, getFileName(sample))]}, extraargstring)
                      for sample in samples_list], pool)
        printVerbose("\tCleaning MACSE alignments")

        printVerbose("Processing %s samples..." % len(samples_list))
        # NOTE(review): the outputs above are named with getFileName(sample) but
        # looked up here with strip_ixes(sample) -- confirm the two agree for
        # every sample filename.
        nt_macse_outs = ["%s/%s_NT_macse.fasta" % (outdir, strip_ixes(sample)) for sample in samples_list]

        # Clean the alignments by removing the reference sequences from the MACSE output.
        from classes.PythonRunner import PythonRunner
        run_parallel([PythonRunner(remove_refs_from_macse_out, [input_, ref,
                                   "%s/%s" % (outdir, "%s_cleaned.fasta" % strip_ixes(input_))],
                                   {"exists": [input_, ref]})
                      for input_ in nt_macse_outs], pool)

        # Cat the cleaned alignments into a single merged fasta.
        cleaned_alignments = getInputFiles(outdir, "*_cleaned.fasta")
        merge_files(cleaned_alignments, "%s/MACSE_OUT_MERGED.fasta" % outdir)

        # Everything except the merged fasta is auxillary; move it aside.
        aux_dir = makeAuxDir(outdir)
        aux_files = getInputFiles(outdir, "*", "MACSE_OUT_MERGED.fasta", ignore_empty_files=False)
        bulk_move_to_dir(aux_files, aux_dir)
        cleanup_pool(pool)
    def clean_trim_adapters_flexbar(self, input_f, adapters, adaptersrc, outdir, allowedns, processes, extraargstring):
        """Use flexbar to trim adapters and barcodes from sequences.  By default, Flexbar does not allow any 'N' \
            characters in SEQUENCE, and will toss any sequences that do contain 'N'.  To avoid this, use the -u or \
            --allowedns flags to specify the maximum number of 'N's to allow

        :param input_f: Filepath to input file or folder.
        :param adapters: Filepath to a list of adapters.
        :param adaptersrc: Filepath to a list of reverse-complemented adapters.
        :param outdir: Filepath to the output directory.
        :param allowedns: Non-negative integer value indicating the maximum number of 'N's to tolerate in a sequence.
        :param processes: The maximum number of processes to use.
        :param extraargstring: Advanced program parameter string.
        """
        fastx_files = getInputFiles(input_f)
        pool = init_pool(min(len(fastx_files), processes))
        debugPrintInputInfo(fastx_files, "trim adapters from")
        # "flexbar":  "flexbar -r \"%s\" -t \"%s\" -ae \"%s\" -a \"%s\"",
        printVerbose("Trimming barcodes and adapters with flexbar")

        # First pass: trim adapters from the left; results are "temp_" files.
        run_parallel([ProgramRunner(ProgramRunnerCommands.TRIM_FLEXBAR,
                                    [fastx, "%s/temp_%s" % (outdir, strip_ixes(fastx)),
                                     "LEFT", adapters, allowedns],
                                    {"exists": [fastx, adapters]}, extraargstring)
                      for fastx in fastx_files], pool)

        temp_files = getInputFiles(outdir, "temp_*")
        debugPrintInputInfo(temp_files, "trim adapters from")

        # Second pass: trim reverse-complemented adapters from the right.
        # strip_ixes(...)[5:] removes the "temp_" prefix from the first pass.
        run_parallel([ProgramRunner(ProgramRunnerCommands.TRIM_FLEXBAR,
                                    [temp, "%s/%s_debarcoded" % (outdir, strip_ixes(temp)[5:]),
                                     "RIGHT", adaptersrc, allowedns],
                                    {"exists": [temp, adaptersrc]}, extraargstring)
                      for temp in temp_files], pool)
        printVerbose("Done Trimming sequences.")

        # Move the first-pass temp files out of the way into the aux directory.
        aux_files = getInputFiles(outdir, "temp_*", ignore_empty_files=False)
        bulk_move_to_dir(aux_files, makeAuxDir(outdir))
        cleanup_pool(pool)
Esempio n. 17
0
    def annotate_otu_chewbacca(self, input_f, outdir, annotation, processes):
        """Annotates an OTU table.

        :param input_f: Filepath to a file or folder of files to annotate.
        :param annotation: Filepath to a file or a folder of files to use as annotations.
        :param outdir: Filepath to the output directory where annotated files will be written.
        :param processes: The maximum number of processes to use.
        """
        matricies = getInputFiles(input_f)
        debugPrintInputInfo(matricies, "annotated.")
        annotations = getInputFiles(annotation)

        # If annotation files exist on disk but are all empty, skip annotation and
        # copy the OTU tables to the output directory unchanged.
        if len(annotations) == 0 and len(
                getInputFiles(annotation, ignore_empty_files=False)) > 0:
            pool = init_pool(min(len(matricies), processes))
            print("**WARNING**: Annotation File is empty.  Skipping annotation and copying old OTU tables to output \
                    directory.\n")

            run_parallel([
                PythonRunner(copy_file, [matrix, outdir], {"exists": [matrix]})
                for matrix in matricies
            ], pool)
        else:
            pool = init_pool(min(len(matricies) * len(annotations), processes))
            debugPrintInputInfo(annotations, "parsed.")
            # Annotate every matrix with every annotation file.
            matrix_annotation_pairs = product(matricies, annotations)

            printVerbose("Annotating matrix...")

            # NOTE(review): every pair writes to the same "<outdir>/matrix.txt",
            # so multiple pairs overwrite each other -- confirm this is intended.
            run_parallel([
                PythonRunner(
                    annotateOTUtable,
                    [matrix, annotation_file,
                     "%s/%s.txt" %
                     (outdir, "matrix")], {"exists": [matrix, annotation_file]})
                for matrix, annotation_file in matrix_annotation_pairs
            ], pool)
            printVerbose("Done Annotating.")

        cleanup_pool(pool)
    def partition_chewbacca(self, input_f, outdir, processes, chunksize, filetype):
        """Partition a fasta/fastq file into chunks of user-defined size.

        :param input_f: Filepath to a file or folder of files to partition.
        :param outdir: The directory to write split files to.
        :param processes: The number of processes to use to partition the input fileset.
        :param chunksize: The number of sequences per file.
        :param filetype: Either 'fasta' or 'fastq'.
        """
        # Collect the files to split and size the worker pool to the workload.
        files_to_split = getInputFiles(input_f)
        debugPrintInputInfo(files_to_split, "partitioned")
        pool = init_pool(min(len(files_to_split), processes))
        printVerbose("Partitioning Files...")
        # Build one splitK job per input file, each writing to "<outdir>/<name>".
        split_jobs = []
        for source_file in files_to_split:
            out_prefix = "%s/%s" % (outdir, strip_ixes(source_file))
            split_jobs.append(PythonRunner(splitK,
                                           [source_file, out_prefix, chunksize, filetype],
                                           {"exists": [source_file]}))
        run_parallel(split_jobs, pool)
        printVerbose("Done partitioning files.")
        cleanup_pool(pool)
    def ungap_chewbacca(self, input_f, outdir, gapchars, file_ext, processes):
        """Ungaps a character using Bio python.

            :param input_f: Filepath to input file or folder to ungap.
            :param outdir: Filepath to the output directory where ungapped files should be written.
            :param gapchars: A string containing the gap characters to remove.
            :param file_ext: Either 'fasta' or 'fastq'.
            :param processes: The number of threads to use to ungap the input fileset.
        """
        # Only *.fasta files are gathered from the input location.
        inputs = getInputFiles(input_f, "*.fasta")
        debugPrintInputInfo(inputs, "ungap.")
        pool = init_pool(min(len(inputs), processes))
        printVerbose("Removing all '%s' from sequences..." % gapchars)
        # ungap(file_to_clean, output_file_name, gap_char, file_type):
        # NOTE(review): the output extension is hardcoded to 'fasta' while file_ext
        # is only passed to remove_gap_chars as the parse format -- confirm intended.
        run_parallel([PythonRunner(remove_gap_chars,
                                   [input_, "%s/%s_cleaned.%s" % (outdir, strip_ixes(input_), 'fasta'),
                                   gapchars, file_ext],
                                   {"exists": [input_]}) for input_ in inputs], pool)
        printVerbose("Done removing.")
        cleanup_pool(pool)
Esempio n. 20
0
    def rename_chewbacca(self, input_f, outdir, filetype, clip, processes):
        """Renames sequences in a fasta/fastq file as <filename>_ID0, <filename>_ID1, <filename>_ID2, etc., where
            <filename> is the name of the fasta/fastq file without any extensions or chewbacca suffixes.

        :param input_f: Filepath to an input file or folder to rename.
        :param outdir: Filepath to the output directory.
        :param filetype: Either 'fasta' or 'fastq'.
        :param clip: If True, remove dereplication counts from sequence names before renaming.
        :param processes: The maximum number of processes to use.
        """
        # Collect the files to rename and size the pool to the workload.
        targets = getInputFiles(input_f)
        debugPrintInputInfo(targets, "rename")
        pool = init_pool(min(len(targets), processes))
        printVerbose("Renaming sequences...")
        # Build one serialRename task per file, preserving each file's extension.
        rename_tasks = []
        for target in targets:
            extension = os.path.splitext(target)[1]
            renamed_path = "%s/%s_renamed%s" % (outdir, strip_ixes(target), extension)
            rename_tasks.append(PythonRunner(serialRename,
                                             [target, renamed_path, filetype, clip],
                                             {"exists": [target]}))
        run_parallel(rename_tasks, pool)
        printVerbose("Done renaming sequences...")

        # Relocate the generated .samples files to their own directory.
        samples_dir = makeDirOrdie("%s_samples" % outdir)
        samples_files = getInputFiles(outdir, "*.samples", ignore_empty_files=False)
        bulk_move_to_dir(samples_files, samples_dir)

        # Relocate the generated .mapping files to the aux directory.
        aux_dir = makeAuxDir(outdir)
        aux_files = getInputFiles(outdir, "*.mapping", ignore_empty_files=False)
        bulk_move_to_dir(aux_files, aux_dir)

        cleanup_pool(pool)
Esempio n. 21
0
    def partition_chewbacca(self, input_f, outdir, processes, chunksize,
                            filetype):
        """Partition a fasta/fastq file into chunks of user-defined size.

        :param input_f: Filepath to a file or folder of files to partition.
        :param outdir: The directory to write split files to.
        :param processes: The number of processes to use to partition the input fileset.
        :param chunksize: The number of sequences per file.
        :param filetype: Either 'fasta' or 'fastq'.
        """
        # Gather the input files and make one splitK job per file.
        split_targets = getInputFiles(input_f)
        debugPrintInputInfo(split_targets, "partitioned")
        pool = init_pool(min(len(split_targets), processes))
        printVerbose("Partitioning Files...")
        run_parallel([PythonRunner(splitK,
                                   [chunk_source,
                                    "%s/%s" % (outdir, strip_ixes(chunk_source)),
                                    chunksize, filetype],
                                   {"exists": [chunk_source]})
                      for chunk_source in split_targets], pool)
        printVerbose("Done partitioning files.")
        cleanup_pool(pool)
Esempio n. 22
0
    def ungap_chewbacca(self, input_f, outdir, gapchars, file_ext, processes):
        """Ungaps a character using Bio python.

            :param input_f: Filepath to input file or folder to ungap.
            :param outdir: Filepath to the output directory where ungapped files should be written.
            :param gapchars: A string containing the gap characters to remove.
            :param file_ext: Either 'fasta' or 'fastq'.
            :param processes: The number of threads to use to ungap the input fileset.
        """
        # Only *.fasta files are gathered from the input location.
        inputs = getInputFiles(input_f, "*.fasta")
        debugPrintInputInfo(inputs, "ungap.")
        pool = init_pool(min(len(inputs), processes))
        printVerbose("Removing all '%s' from sequences..." % gapchars)
        # ungap(file_to_clean, output_file_name, gap_char, file_type):
        # NOTE(review): the output extension is hardcoded to 'fasta' while file_ext
        # is only passed to remove_gap_chars as the parse format -- confirm intended.
        run_parallel([
            PythonRunner(remove_gap_chars, [
                input_,
                "%s/%s_cleaned.%s" %
                (outdir, strip_ixes(input_), 'fasta'), gapchars, file_ext
            ], {"exists": [input_]}) for input_ in inputs
        ], pool)
        printVerbose("Done removing.")
        cleanup_pool(pool)
Esempio n. 23
0
    def assemble_pear(self, input_f, input_r, outdir, name, processes,
                      pearthreads, extraargstring):
        """Uses PEAR to assemble paired F/R read files in run_parallel.

        :param input_f: File path to forward Fastq Reads file or folder.
        :param input_r: File path to reverse Fastq Reads file or folder.
        :param outdir: File path to the output directory.
        :param name: File prefix for the assembled reads.
        :param processes: The maximum number of processes to use.
        :param extraargstring: Advanced program parameter string.
        :param pearthreads: The number of threads per process to use.
        """
        # "~/programs/pear-0.9.4-bin-64/pear-0.9.4-64 -f %s -r %s -o %s -j %s -m %d"
        # Pair up forward/reverse read files and size the pool to the workload.
        read_pairs = validate_paired_fastq_reads(input_f, input_r)
        pool = init_pool(min(len(read_pairs), processes))
        printVerbose("\tAssembling reads with pear")
        debugPrintInputInfo(read_pairs, "assemble")
        # One PEAR job per F/R pair; outputs share the "<outdir>/<name>_<fwd>" prefix.
        assembly_jobs = []
        for fwd_reads, rev_reads in read_pairs:
            out_prefix = "%s/%s_%s" % (outdir, name, getFileName(fwd_reads))
            assembly_jobs.append(ProgramRunner(ProgramRunnerCommands.ASSEMBLE_PEAR,
                                               [fwd_reads, rev_reads, out_prefix, pearthreads],
                                               {"exists": [fwd_reads, rev_reads],
                                                "positive": [pearthreads]},
                                               extraargstring))
        run_parallel(assembly_jobs, pool)

        printVerbose("Done assembling sequences...")
        # Grab all the auxillary files (everything not containing ".assembled."
        aux_files = getInputFiles(outdir, "*", "*.assembled.*",
                                  ignore_empty_files=False)
        # make aux dir for extraneous files and move them there
        bulk_move_to_dir(aux_files, makeAuxDir(outdir))
        cleanup_pool(pool)
    def align_macse(self, input_f, db, outdir, processes, extraargstring):
        """Aligns sequences by iteratively adding them to a known good alignment.

        :param input_f: Filepath to an input file or folder to align.
        :param db: Filepath to a reference file or folder of reference files for alignment.
        :param outdir: Filepath to the output directory.
        :param processes: The maximum number of processes to use.
        :param extraargstring: Advanced program parameter string.
        """
        # "macse_align":      "java -jar " + programPaths["MACSE"] + " -prog enrichAlignment  -seq \"%s\" -align \
        #                                    \"%s\" -seq_lr \"%s\" -maxFS_inSeq 0  -maxSTOP_inSeq 0  -maxINS_inSeq 0 \
        #                                    -maxDEL_inSeq 3 -gc_def 5 -fs_lr -10 -stop_lr -10 -out_NT \"%s\"_NT \
        #                                    -out_AA \"%s\"_AA -seqToAdd_logFile \"%s\"_log.csv",

        inputs = getInputFiles(input_f)
        pool = init_pool(min(len(inputs), processes))
        printVerbose("Aligning reads using MACSE")
        # MACSE takes three output paths (_NT, _AA, _log.csv) that share one
        # prefix, hence the same "<outdir>/<name>" path repeated three times.
        run_parallel([ProgramRunner(ProgramRunnerCommands.MACSE_ALIGN,
                                    [db, db, input_] + ["%s/%s" % (outdir, getFileName(input_))] * 3,
                                    {"exists": [input_, db]}, extraargstring)
                      for input_ in inputs], pool)
        printVerbose("Done with MACSE alignment.")
        cleanup_pool(pool)
Esempio n. 25
0
    def cluster_vsearch(self, input_f, outdir, groupsfile, processes, idpct, extraargstring):
        """Clusters sequences using VSEARCH.

        :param input_f: A file or folder containing fasta files to cluster.
        :param outdir: The output directory results will be written to.
        :param groupsfile: A groups file or folder containing groups files that describe the input. Note: if no groups
                            file is supplied, then entries in the fasta file are assumed to be singleton sequences.
        :param idpct: Real number in the range (0,1] that specifies the minimum simmilarity threshold for
                            clustering.  e.g. .95 indicates that a candidate sequence 95% must be at least
                            95% simmilar to the seed sequence to be included in the cluster.
        :param processes: The maximum number of processes to use.
        :param extraargstring: Advanced program parameter string.
        """
        # Grab the fasta file(s) to cluster
        inputs = getInputFiles(input_f)
        debugPrintInputInfo(inputs, "clustered")
        pool = init_pool(min(len(inputs), processes))

        # RUN CLUSTERING
        # " --cluster_size %s -id %f --centroids %s  --uc %s",
        run_parallel([ProgramRunner(ProgramRunnerCommands.CLUSTER_VSEARCH,
                                    [input_, float(idpct), "%s/%s_seeds.fasta" % (outdir, strip_ixes(input_)),
                                     "%s/%s_clustered_uc" % (outdir, strip_ixes(input_))],
                                    {"exists": [input_]}, extraargstring) for input_ in inputs], pool)

        # PARSE UC FILE TO GROUPS FILE
        printVerbose("Parsing the clustered uc files to groups files")
        clustered_uc_files = getInputFiles(outdir, "*_clustered_uc")
        debugPrintInputInfo(clustered_uc_files, "parsed to groups")
        run_parallel([PythonRunner(parseUCtoGroups, [input_, "%s/%s.groups" % (outdir, strip_ixes(input_))],
                                   {"exists": [input_]})
                      for input_ in clustered_uc_files], pool)

        # REMOVE COUNTS FROM CLUSTERING GROUPS FILE
        printVerbose("Cleaning the .groups file from clustering")
        # Grab the current groups file and the new clustered groups file (which needs to be cleaned)
        clustered_groups_files = getInputFiles(outdir, "*_clustered.groups")
        # Remove counts from the clustering groups files
        debugPrintInputInfo(clustered_groups_files, "cleaned")
        run_parallel([PythonRunner(removeCountsFromGroupsFile,
                                   [input_, "%s/%s_uncount.groups" % (outdir, strip_ixes(input_))],
                                   {"exists": [input_]})
                      for input_ in clustered_groups_files], pool)
        printVerbose("Done cleaning groups files.")

        # Collect the groups file from clustering with counts removed
        cleaned_clustered_groups_files = getInputFiles(outdir, "*_uncount.groups", ignore_empty_files=False)

        # Resolve the user specified names file if necessary
        final_groups_files = handle_groups_file_update(outdir, groupsfile, cleaned_clustered_groups_files)

        # Move the final groups file(s) to the groups dir
        groups_dir = makeDirOrdie("%s_groups_files" % outdir)
        bulk_move_to_dir(final_groups_files, groups_dir)

        # Move aux files to the aux dir (everything except the seeds fastas)
        aux_files = getInputFiles(outdir, "*", "*_seeds.fasta", ignore_empty_files=False)
        aux_dir = makeAuxDir(outdir)
        bulk_move_to_dir(aux_files, aux_dir)

        # Cleanup the pool
        cleanup_pool(pool)
Esempio n. 26
0
    def dereplicate_vsearch(self, input_f, outdir, groupsfile, processes,
                            stripcounts, extraargstring):
        """Dereplicates with vsearch.

        :param input_f: Filepath to the file or folder of files to dereplicate.
        :param outdir: Filepath to the output directory.
        :param groupsfile: A groups file to use as a reference for replicant counting.  If no groups file is
                            provided, input sequences are considered singletons (regardless of their name-annotated
                            dereplication count).
        :param processes: The number of processes to use to dereplicate the fileset.
        :param stripcounts: If True, strips the trailing dereplication counts from a file before dereplication.
        :param extraargstring: Advanced program parameter string.
        """
        inputs = getInputFiles(input_f)
        pool = init_pool(min(len(inputs), processes))
        # REMOVES COUNTS FROM SEQUENCE NAMES IN ORDER TO CLUSTER PROPERLY
        # strip counts if we need to.
        if stripcounts:
            printVerbose("Removing counts from sequence names...")
            debugPrintInputInfo(inputs, "renamed")
            run_parallel([
                PythonRunner(removeCountsFromFastFile, [
                    input_,
                    "%s/%s_uncount.fasta" %
                    (outdir, strip_ixes(input_)), 'fasta'
                ], {"exists": [input_]}) for input_ in inputs
            ], pool)
            printVerbose("Done removing counts.")

            # Grab the cleaned files as input for the next step
            inputs = getInputFiles(outdir, "*_uncount.fasta")

        # DEREPLICATE
        debugPrintInputInfo(inputs, "dereplicated")
        printVerbose("Dereplicating...")
        run_parallel([
            ProgramRunner(ProgramRunnerCommands.DEREP_VSEARCH, [
                processes, input_,
                "%s/%s_derep.fasta" % (outdir, strip_ixes(input_)),
                "%s/%s_uc.out" % (outdir, strip_ixes(input_))
            ], {
                "exists": [input_],
                "positive": [processes]
            }, extraargstring) for input_ in inputs
        ], pool)
        printVerbose("Done dereplicating")

        # LOG DEREPLICATED SEQUENCES INTO A .GROUPS FILE
        # generates a .groups file named _uc_parsed.out
        # python parseUCtoGroups.py uc.out uc_parsed.out
        input_ucs = getInputFiles(outdir, "*_uc.out")
        printVerbose("Generating a groups file from dereplication.")
        debugPrintInputInfo(input_ucs, "parsed (into a .groups file)")
        run_parallel([
            PythonRunner(
                parseUCtoGroups,
                [input_,
                 "%s/%s_derep.groups" %
                 (outdir, strip_ixes(input_))], {"exists": [input_]})
            for input_ in input_ucs
        ], pool)

        most_recent_groups_files = getInputFiles(outdir,
                                                 "*_derep.groups",
                                                 ignore_empty_files=False)

        # UPDATE THE MOST CURRENT GROUPS FILES WITH DEREPLICATION COUNTS
        if groupsfile is not None:
            # Grab the old groups file and the dereplicated groups file
            old_groups_files = getInputFiles(groupsfile)
            derep_groups_files = getInputFiles(outdir, "*_derep.groups")

            printVerbose("Updating .groups files with dereplicated data")
            printVerbose("%d Reference (old) groups files to be read:" %
                         len(old_groups_files))
            printVerbose(str(old_groups_files))
            printVerbose("%d Dereplicated (new) groups files to be read:" %
                         len(derep_groups_files))
            printVerbose(str(derep_groups_files))

            update_groups(old_groups_files, derep_groups_files, outdir,
                          "dereplicated")
            most_recent_groups_files = getInputFiles(outdir,
                                                     "dereplicated*",
                                                     ignore_empty_files=False)
            printVerbose("Done updating .groups files.")

        # Each input fasta must have exactly one corresponding groups file.
        if len(inputs) != len(most_recent_groups_files):
            print(
                "Error: Number of input fastas (%d) is not equal to the number of groups files (%d)."
                % (len(inputs), len(most_recent_groups_files)))
            exit()
        fasta_groups_pairs = zip(inputs, most_recent_groups_files)
        # ADD COUNT TO SEQUENCE NAMES AND SORT BY COUNT
        # python renameWithReplicantCounts.py
        #               8_macse_out/MACSEOUT_MERGED.fasta uc_parsed.out dereplicated_renamed.fasta
        printVerbose("Adding dereplication data to unique fasta")
        run_parallel([
            PythonRunner(renameWithReplicantCounts, [
                fasta, groups,
                "%s/%s_counts.fasta" % (outdir, strip_ixes(fasta)), 'fasta'
            ], {"exists": [fasta, groups]})
            for fasta, groups in fasta_groups_pairs
        ], pool)
        printVerbose("Done adding data")

        aux_dir = makeAuxDir(outdir)
        groups_dir = makeDirOrdie("%s_groups_files" % outdir)
        bulk_move_to_dir(most_recent_groups_files, groups_dir)
        aux_files = getInputFiles(outdir,
                                  '*',
                                  "*_counts.fasta",
                                  ignore_empty_files=False)
        bulk_move_to_dir(aux_files, aux_dir)
        cleanup_pool(pool)
Esempio n. 27
0
    def cluster_swarm(self, input_f, outdir, groupsfile, processes,
                      extraargstring):
        """Clusters sequences using SWARM.

        :param input_f: A file or folder containing fasta files to cluster.
        :param outdir: The output directory results will be written to.
        :param groupsfile: A groups file or folder containing groups files that describe the input. Note: if no groups
                            file is supplied, then entries in the fasta file are assumed to be singleton sequences.
        :param processes: The maximum number of processes to use.
        :param extraargstring: Advanced program parameter string.
        """
        # Grab the fasta file(s) to cluster
        inputs = getInputFiles(input_f)
        debugPrintInputInfo(inputs, "clustered")
        pool = init_pool(min(len(inputs), processes))

        # RUN CLUSTERING
        # Each input yields three outputs sharing the "<name>_clustered" stem:
        # a cluster listing, a uc-format file, and a seeds file.
        run_parallel([
            ProgramRunner(ProgramRunnerCommands.CLUSTER_SWARM, [
                input_,
                "%s/%s_clustered" % (outdir, strip_ixes(input_)),
                "%s/%s_clustered_uc" % (outdir, strip_ixes(input_)),
                "%s/%s_clustered_seeds" % (outdir, strip_ixes(input_))
            ], {"exists": [input_]}, extraargstring) for input_ in inputs
        ], pool)

        # PARSE UC FILE TO GROUPS FILE
        printVerbose("Parsing the clustered uc files to groups files")
        clustered_uc_files = getInputFiles(outdir, "*_clustered_uc")
        debugPrintInputInfo(clustered_uc_files, "parsed to groups")
        run_parallel([
            PythonRunner(
                parseUCtoGroups,
                [input_, "%s/%s.groups" %
                 (outdir, strip_ixes(input_))], {"exists": [input_]})
            for input_ in clustered_uc_files
        ], pool)
        printVerbose("Done parsing groups files.")

        # REMOVE COUNTS FROM CLUSTERING GROUPS FILE
        printVerbose("Cleaning the .groups file from clustering")
        # Grab the current groups file and the new clustered groups file (which needs to be cleaned)
        clustered_groups_files = getInputFiles(outdir, "*_clustered.groups")
        debugPrintInputInfo(clustered_groups_files, "cleaned")
        run_parallel([
            PythonRunner(removeCountsFromGroupsFile, [
                input_,
                "%s/%s_uncount.groups" % (outdir, strip_ixes(input_))
            ], {"exists": [input_]}) for input_ in clustered_groups_files
        ], pool)
        printVerbose("Done cleaning groups files.")

        printVerbose("Capitalizing sequences")
        # Convert the seeds files to uppercase (swarm writes in lowercase)
        inputs = getInputFiles(outdir, "*_seeds")
        run_parallel([
            PythonRunner(capitalize_seqs, [input_, "%s.fasta" % input_],
                         {"exists": [input_]}) for input_ in inputs
        ], pool)
        printVerbose("Done capitalizing sequences")

        # Collect the groups file from clustering with counts removed
        cleaned_clustered_groups_files = getInputFiles(
            outdir, "*_uncount.groups", ignore_empty_files=False)

        # Resolve the user specified names file if necessary
        final_groups_files = handle_groups_file_update(
            outdir, groupsfile, cleaned_clustered_groups_files)

        # Move the final groups file(s) to the groups dir
        groups_dir = makeDirOrdie("%s_groups_files" % outdir)
        bulk_move_to_dir(final_groups_files, groups_dir)

        # Move aux files to the aux dir (everything except the seeds fastas)
        aux_files = getInputFiles(outdir,
                                  "*",
                                  "*_seeds.fasta",
                                  ignore_empty_files=False)
        aux_dir = makeAuxDir(outdir)
        bulk_move_to_dir(aux_files, aux_dir)

        # Cleanup the pool
        cleanup_pool(pool)
Esempio n. 28
0
    def cluster_crop(self, input_f, outdir, groupsfile, processes, blocksize,
                     clustpct, maxmcmc, maxsm, rare, blockcount,
                     extraargstring):
        """Clusters sequences using CROP.

        :param input_f: Filepath to the input fasta file to cluster.
        :param outdir: Filepath to the output directory.
        :param groupsfile: Filepath to the groups file to use as a reference for dereplication counting.
        :param blocksize: Size of blocks to be used for all rounds (if -b is specified, then -z will not affect the
                            first round.  For data set with different average sequence length, this parameter should \
                            be tuned such that it won't take too long for each block to do pariwise alignment.  Hint \
                            for choosing z: z*L<150,000, where L is the average length of the sequences.
        :param clustpct: The minimum similarity threshold for clustering.  Either 'g' for 95% or 's' for 97%.
        :param maxmcmc: This parameter specifies the number of iterations of MCMC. Default value is 2000. Increase \
                            this value to enhance accuracy (recommended value is at least 10*block size).
        :param maxsm: This parameter specifies the maximum number of 'split and merge' process to run.  Max is 20.
        :param rare: The maximum cluster size allowed to be classified as 'rare'. Clusters are defined as either \
                            'abundant' or 'rare'. 'Abundant' clusters will be clustered first, then the 'rare' \
                            clusters are mapped to the 'abundant' clusters.  Finally, 'rare' clusters which cannot be \
                            mapped will be clustered separately. e.g. If r=5, the clusters with size <=5 will be \
                            considered 'rare' in above procedure. and r=0 will yield the best accuracy. If you \
                            believe your data is not too diverse to be handled, then r=0 will be the best choice.
        :param blockcount: The size of blocks in the first round of clustering. Hint of choosing -b: Each block in the \
                            first round should contain about 50 sequences.  i.e. b=N/50, where N is the number of \
                            input sequences.  Default: # input sequences / z.
        :param processes: The maximum number of processes to use.
        :param extraargstring: Advanced program parameter string.
        """

        # Grab the fasta file(s) to cluster
        inputs = getInputFiles(input_f)
        debugPrintInputInfo(inputs, "clustered")
        pool = init_pool(min(len(inputs), processes))

        # RUN CLUSTERING
        # crop -i %s -o %s -z %s -c %s -e %s -m %s%s
        run_parallel([
            ProgramRunner(ProgramRunnerCommands.CLUSTER_CROP, [
                input_,
                "%s/%s" % (outdir, strip_ixes(input_)), blocksize, clustpct,
                maxmcmc, maxsm, rare, blockcount
            ], {"exists": [input_]}, extraargstring) for input_ in inputs
        ], pool)

        # CLEAN THE OUTPUT GROUPS FILE
        # CROP emits "*.cluster.list" files; convert each to chewbacca's groups format.
        printVerbose("Parsing the groups file from clustering")
        clustered_groups_files = getInputFiles(outdir, "*.cluster.list")
        debugPrintInputInfo(clustered_groups_files,
                            "converted to groups files")
        run_parallel([
            PythonRunner(parseCROPoutToGroups, [
                input_,
                "%s/%s_uncount.groups" % (outdir, strip_ixes(input_))
            ], {"exists": [input_]}) for input_ in clustered_groups_files
        ], pool)
        printVerbose("Done parsing groups file.")

        # Collect the groups file from clustering with counts removed
        cleaned_clustered_groups_files = getInputFiles(
            outdir, "*_uncount.groups", ignore_empty_files=False)

        # Resolve the user specified names file if necessary
        final_groups_files = handle_groups_file_update(
            outdir, groupsfile, cleaned_clustered_groups_files)

        # GATHER AUX FILES
        # CROP scatters intermediates next to the input and in outdir; collect them all.
        input_dir = getDirName(input_f)
        aux_files = cleaned_clustered_groups_files
        aux_files += getInputFiles(input_dir,
                                   "*.unique",
                                   ignore_empty_files=False)
        aux_files += getInputFiles(input_dir,
                                   "*.unique.list",
                                   ignore_empty_files=False)
        aux_files += getInputFiles(input_dir,
                                   "*.unique.TempCenters.Rare",
                                   ignore_empty_files=False)
        aux_files += getInputFiles(outdir,
                                   "*.cluster",
                                   ignore_empty_files=False)
        aux_files += getInputFiles(outdir,
                                   "*.cluster.list",
                                   ignore_empty_files=False)
        aux_files += getInputFiles(outdir, "*.log", ignore_empty_files=False)
        # CROP drops LikelihoodRatio.txt in the current working directory.
        aux_files += getInputFiles(".",
                                   "LikelihoodRatio.txt",
                                   ignore_empty_files=False)

        # Move the final groups file(s) to the groups dir
        groups_dir = makeDirOrdie("%s_groups_files" % outdir)
        bulk_move_to_dir(final_groups_files, groups_dir)

        # Move aux files to the aux dir
        aux_dir = makeAuxDir(outdir)
        bulk_move_to_dir(aux_files, aux_dir)

        # Cleanup the pool
        cleanup_pool(pool)
    def cluster_crop(self, input_f, outdir, groupsfile, processes, blocksize, clustpct, maxmcmc, maxsm, rare,
                     blockcount, extraargstring):
        """Clusters sequences using CROP.

        :param input_f: Filepath to the input fasta file to cluster.
        :param outdir: Filepath to the output directory.
        :param groupsfile: Filepath to the groups file to use as a reference for dereplication counting.
        :param blocksize: Size of blocks to be used for all rounds (if -b is specified, then -z will not affect the
                            first round.  For data set with different average sequence length, this parameter should \
                            be tuned such that it won't take too long for each block to do pariwise alignment.  Hint \
                            for choosing z: z*L<150,000, where L is the average length of the sequences.
        :param clustpct: The minimum similarity threshold for clustering.  Either 'g' for 95% or 's' for 97%.
        :param maxmcmc: This parameter specifies the number of iterations of MCMC. Default value is 2000. Increase \
                            this value to enhance accuracy (recommended value is at least 10*block size).
        :param maxsm: This parameter specifies the maximum number of 'split and merge' process to run.  Max is 20.
        :param rare: The maximum cluster size allowed to be classified as 'rare'. Clusters are defined as either \
                            'abundant' or 'rare'. 'Abundant' clusters will be clustered first, then the 'rare' \
                            clusters are mapped to the 'abundant' clusters.  Finally, 'rare' clusters which cannot be \
                            mapped will be clustered separately. e.g. If r=5, the clusters with size <=5 will be \
                            considered 'rare' in above procedure. and r=0 will yield the best accuracy. If you \
                            believe your data is not too diverse to be handled, then r=0 will be the best choice.
        :param blockcount: The size of blocks in the first round of clustering. Hint of choosing -b: Each block in the \
                            first round should contain about 50 sequences.  i.e. b=N/50, where N is the number of \
                            input sequences.  Default: # input sequences / z.
        :param processes: The maximum number of processes to use.
        :param extraargstring: Advanced program parameter string.
        """
        # Gather the fasta file(s) to cluster and size the worker pool accordingly.
        fasta_inputs = getInputFiles(input_f)
        debugPrintInputInfo(fasta_inputs, "clustered")
        pool = init_pool(min(len(fasta_inputs), processes))

        # RUN CLUSTERING
        # crop -i %s -o %s -z %s -c %s -e %s -m %s%s
        crop_jobs = []
        for fasta in fasta_inputs:
            crop_jobs.append(ProgramRunner(ProgramRunnerCommands.CLUSTER_CROP,
                                           [fasta, "%s/%s" % (outdir, strip_ixes(fasta)), blocksize, clustpct,
                                            maxmcmc, maxsm, rare, blockcount],
                                           {"exists": [fasta]}, extraargstring))
        run_parallel(crop_jobs, pool)

        # CLEAN THE OUTPUT GROUPS FILE
        # CROP emits *.cluster.list files; convert each into an (uncounted) .groups file.
        printVerbose("Parsing the groups file from clustering")
        raw_cluster_lists = getInputFiles(outdir, "*.cluster.list")
        debugPrintInputInfo(raw_cluster_lists, "converted to groups files")
        run_parallel([PythonRunner(parseCROPoutToGroups,
                                   [cluster_list, "%s/%s_uncount.groups" % (outdir, strip_ixes(cluster_list))],
                                   {"exists": [cluster_list]})
                      for cluster_list in raw_cluster_lists], pool)
        printVerbose("Done parsing groups file.")

        # Collect the groups file from clustering with counts removed
        uncounted_groups = getInputFiles(outdir, "*_uncount.groups", ignore_empty_files=False)

        # Resolve the user specified names file if necessary
        final_groups_files = handle_groups_file_update(outdir, groupsfile, uncounted_groups)

        # GATHER AUX FILES
        # NOTE: aux_files intentionally aliases uncounted_groups and is extended in place,
        # matching the original accumulation behavior.
        source_dir = getDirName(input_f)
        aux_files = uncounted_groups
        for pattern in ("*.unique", "*.unique.list", "*.unique.TempCenters.Rare"):
            aux_files += getInputFiles(source_dir, pattern, ignore_empty_files=False)
        for pattern in ("*.cluster", "*.cluster.list", "*.log"):
            aux_files += getInputFiles(outdir, pattern, ignore_empty_files=False)
        aux_files += getInputFiles(".", "LikelihoodRatio.txt", ignore_empty_files=False)

        # Move the final groups file(s) to the groups dir
        groups_dir = makeDirOrdie("%s_groups_files" % outdir)
        bulk_move_to_dir(final_groups_files, groups_dir)

        # Move aux files to the aux dir
        aux_dir = makeAuxDir(outdir)
        bulk_move_to_dir(aux_files, aux_dir)

        # Cleanup the pool
        cleanup_pool(pool)
    def dereplicate_vsearch(self, input_f, outdir, groupsfile, processes, stripcounts, extraargstring):
        """Dereplicates with vsearch.

        :param input_f: Filepath to the file or folder of files to dereplicate.
        :param outdir: Filepath to the output directory.
        :param groupsfile: A groups file to use as a reference for replicant counting.  If no groups file is
                            provided, input sequences are conidered singletons (regardless of their name-annotated
                            dereplication count).
        :param processes: The number of processes to use to dereplicate the fileset.
        :param stripcounts: If True, strips the trailing dereplication counts from a file before dereplication.
        :param extraargstring: Advanced program parameter string.
        """
        inputs = getInputFiles(input_f)
        pool = init_pool(min(len(inputs), processes))
        # REMOVES COUNTS FROM SEQUENCE NAMES IN ORDER TO CLUSTER PROPERLY
        # strip counts if we need to.
        if stripcounts:
            printVerbose("Removing counts from sequence names...")
            debugPrintInputInfo(inputs, "renamed")
            # BUGFIX: "exists" must be a list of paths (as in every other validation dict),
            # not a bare string.
            run_parallel([PythonRunner(removeCountsFromFastFile,
                                       [input_, "%s/%s_uncount.fasta" % (outdir, strip_ixes(input_)), 'fasta'],
                                       {"exists": [input_]})
                          for input_ in inputs], pool)
            printVerbose("Done removing counts.")

            # Grab the cleaned files as input for the next step
            inputs = getInputFiles(outdir, "*_uncount.fasta")

        # DEREPLICATE
        debugPrintInputInfo(inputs, "dereplicated")
        printVerbose("Dereplicating...")
        run_parallel([ProgramRunner(ProgramRunnerCommands.DEREP_VSEARCH,
                                    [processes, input_,
                                     "%s/%s_derep.fasta" % (outdir, strip_ixes(input_)),
                                     "%s/%s_uc.out" % (outdir, strip_ixes(input_))],
                                    {"exists": [input_], "positive": [processes]},
                                    extraargstring)
                      for input_ in inputs], pool)
        printVerbose("Done dereplicating")

        # LOG DEREPLICATED SEQUENCES INTO A .GROUPS FILE
        # generates a .groups file named _uc_parsed.out
        # python parseUCtoGroups.py uc.out uc_parsed.out
        input_ucs = getInputFiles(outdir, "*_uc.out")
        printVerbose("Generating a groups file from dereplication.")
        # BUGFIX: log the .uc files actually being parsed below, not the fasta inputs.
        debugPrintInputInfo(input_ucs, "parsed (into a .groups file)")
        run_parallel([PythonRunner(parseUCtoGroups, [input_, "%s/%s_derep.groups" % (outdir, strip_ixes(input_))],
                                   {"exists": [input_]})
                      for input_ in input_ucs], pool)

        most_recent_groups_files = getInputFiles(outdir, "*_derep.groups", ignore_empty_files=False)

        # UPDATE THE MOST CURRENT GROUPS FILES WITH DEREPLICATION COUNTS
        if groupsfile is not None:
            # Grab the oldgroups file and the dereplicated groups file
            old_groups_files = getInputFiles(groupsfile)
            derep_groups_files = getInputFiles(outdir, "*_derep.groups")

            printVerbose("Updating .groups files with dereplicated data")
            printVerbose("%d Reference (old) groups files to be read:" % len(old_groups_files))
            printVerbose(str(old_groups_files))
            printVerbose("%d Dereplicated (new) groups files to be read:" % len(derep_groups_files))
            printVerbose(str(derep_groups_files))

            update_groups(old_groups_files, derep_groups_files, outdir, "dereplicated")
            most_recent_groups_files = getInputFiles(outdir, "dereplicated*", ignore_empty_files=False)
            printVerbose("Done updating .groups files.")

        # Each input fasta must pair one-to-one with a groups file for the rename step below.
        if len(inputs) != len(most_recent_groups_files):
            # BUGFIX: error message typo ("ofgroups" -> "of groups").
            print("Error: Number of input fastas (%d) is not equal to the number of groups files (%d)." %
                  (len(inputs), len(most_recent_groups_files)))
            exit()
        fasta_groups_pairs = zip(inputs, most_recent_groups_files)
        # ADD COUNT TO SEQUENCE NAMES AND SORT BY COUNT
        # python renameWithReplicantCounts.py
        #               8_macse_out/MACSEOUT_MERGED.fasta uc_parsed.out dereplicated_renamed.fasta
        printVerbose("Adding dereplication data to unique fasta")
        run_parallel([PythonRunner(renameWithReplicantCounts,
                                   [fasta, groups, "%s/%s_counts.fasta" % (outdir, strip_ixes(fasta)), 'fasta'],
                                   {"exists": [fasta, groups]})
                      for fasta, groups in fasta_groups_pairs], pool)
        printVerbose("Done adding data")

        # Sort outputs: groups files to their own dir, everything except the
        # final *_counts.fasta to the aux dir.
        aux_dir = makeAuxDir(outdir)
        groups_dir = makeDirOrdie("%s_groups_files" % outdir)
        bulk_move_to_dir(most_recent_groups_files, groups_dir)
        aux_files = getInputFiles(outdir, '*', "*_counts.fasta", ignore_empty_files=False)
        bulk_move_to_dir(aux_files, aux_dir)
        cleanup_pool(pool)